html-to-markdown 1.12.1__tar.gz → 1.14.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- {html_to_markdown-1.12.1 → html_to_markdown-1.14.0}/PKG-INFO +179 -7
- {html_to_markdown-1.12.1 → html_to_markdown-1.14.0}/README.md +176 -6
- {html_to_markdown-1.12.1 → html_to_markdown-1.14.0}/html_to_markdown/cli.py +18 -12
- {html_to_markdown-1.12.1 → html_to_markdown-1.14.0}/html_to_markdown/converters.py +2 -1
- {html_to_markdown-1.12.1 → html_to_markdown-1.14.0}/html_to_markdown/processing.py +150 -21
- {html_to_markdown-1.12.1 → html_to_markdown-1.14.0}/html_to_markdown.egg-info/PKG-INFO +179 -7
- {html_to_markdown-1.12.1 → html_to_markdown-1.14.0}/html_to_markdown.egg-info/requires.txt +3 -0
- {html_to_markdown-1.12.1 → html_to_markdown-1.14.0}/pyproject.toml +13 -8
- {html_to_markdown-1.12.1 → html_to_markdown-1.14.0}/LICENSE +0 -0
- {html_to_markdown-1.12.1 → html_to_markdown-1.14.0}/html_to_markdown/__init__.py +0 -0
- {html_to_markdown-1.12.1 → html_to_markdown-1.14.0}/html_to_markdown/__main__.py +0 -0
- {html_to_markdown-1.12.1 → html_to_markdown-1.14.0}/html_to_markdown/constants.py +0 -0
- {html_to_markdown-1.12.1 → html_to_markdown-1.14.0}/html_to_markdown/exceptions.py +0 -0
- {html_to_markdown-1.12.1 → html_to_markdown-1.14.0}/html_to_markdown/preprocessor.py +0 -0
- {html_to_markdown-1.12.1 → html_to_markdown-1.14.0}/html_to_markdown/py.typed +0 -0
- {html_to_markdown-1.12.1 → html_to_markdown-1.14.0}/html_to_markdown/utils.py +0 -0
- {html_to_markdown-1.12.1 → html_to_markdown-1.14.0}/html_to_markdown/whitespace.py +0 -0
- {html_to_markdown-1.12.1 → html_to_markdown-1.14.0}/html_to_markdown.egg-info/SOURCES.txt +0 -0
- {html_to_markdown-1.12.1 → html_to_markdown-1.14.0}/html_to_markdown.egg-info/dependency_links.txt +0 -0
- {html_to_markdown-1.12.1 → html_to_markdown-1.14.0}/html_to_markdown.egg-info/entry_points.txt +0 -0
- {html_to_markdown-1.12.1 → html_to_markdown-1.14.0}/html_to_markdown.egg-info/top_level.txt +0 -0
- {html_to_markdown-1.12.1 → html_to_markdown-1.14.0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 1.12.1
|
|
3
|
+
Version: 1.14.0
|
|
4
4
|
Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
|
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -32,6 +32,8 @@ Description-Content-Type: text/markdown
|
|
|
32
32
|
License-File: LICENSE
|
|
33
33
|
Requires-Dist: beautifulsoup4>=4.13.5
|
|
34
34
|
Requires-Dist: nh3>=0.3
|
|
35
|
+
Provides-Extra: html5lib
|
|
36
|
+
Requires-Dist: beautifulsoup4[html5lib]>=4.13.5; extra == "html5lib"
|
|
35
37
|
Provides-Extra: lxml
|
|
36
38
|
Requires-Dist: beautifulsoup4[lxml]>=4.13.5; extra == "lxml"
|
|
37
39
|
Dynamic: license-file
|
|
@@ -40,7 +42,7 @@ Dynamic: license-file
|
|
|
40
42
|
|
|
41
43
|
A modern, fully typed Python library for converting HTML to Markdown. This library is a completely rewritten fork
|
|
42
44
|
of [markdownify](https://pypi.org/project/markdownify/) with a modernized codebase, strict type safety and support for
|
|
43
|
-
Python 3.9+.
|
|
45
|
+
Python 3.10+.
|
|
44
46
|
|
|
45
47
|
## Support This Project
|
|
46
48
|
|
|
@@ -64,8 +66,10 @@ Your support helps maintain and improve this library for the community.
|
|
|
64
66
|
- **Custom Converters**: Extensible converter system for custom HTML tag handling
|
|
65
67
|
- **List Formatting**: Configurable list indentation with Discord/Slack compatibility
|
|
66
68
|
- **HTML Preprocessing**: Clean messy HTML with configurable aggressiveness levels
|
|
69
|
+
- **Bytes Input Support**: Direct handling of bytes input with automatic encoding detection and configurable source encoding
|
|
67
70
|
- **Whitespace Control**: Normalized or strict whitespace preservation modes
|
|
68
71
|
- **BeautifulSoup Integration**: Support for pre-configured BeautifulSoup instances
|
|
72
|
+
- **Parser Normalization**: Consistent output across all supported parsers (html.parser, lxml, html5lib)
|
|
69
73
|
- **Robustly Tested**: Comprehensive unit tests and integration tests covering all conversion scenarios
|
|
70
74
|
|
|
71
75
|
## Installation
|
|
@@ -74,17 +78,27 @@ Your support helps maintain and improve this library for the community.
|
|
|
74
78
|
pip install html-to-markdown
|
|
75
79
|
```
|
|
76
80
|
|
|
77
|
-
### Optional
|
|
81
|
+
### Optional Parsers
|
|
78
82
|
|
|
79
|
-
For improved performance, you can install with the optional lxml parser:
|
|
83
|
+
For improved performance and compatibility, you can install with optional parsers:
|
|
80
84
|
|
|
81
85
|
```shell
|
|
86
|
+
# Fast lxml parser (recommended)
|
|
82
87
|
pip install html-to-markdown[lxml]
|
|
88
|
+
|
|
89
|
+
# Standards-compliant html5lib parser
|
|
90
|
+
pip install html-to-markdown[html5lib]
|
|
83
91
|
```
|
|
84
92
|
|
|
85
|
-
|
|
93
|
+
**Parser Options:**
|
|
94
|
+
|
|
95
|
+
- **html.parser** (default): Built-in Python parser, no dependencies
|
|
96
|
+
- **lxml**: **Recommended** - Fastest parser with good malformed HTML handling
|
|
97
|
+
- **html5lib**: Most standards-compliant, handles edge cases best
|
|
86
98
|
|
|
87
|
-
The library automatically uses lxml when available
|
|
99
|
+
The library automatically uses lxml when available and **normalizes output to ensure consistent results regardless of parser choice**. We recommend using the **lxml parser for optimal performance** - it's significantly faster than the other options while maintaining excellent compatibility.
|
|
100
|
+
|
|
101
|
+
You can explicitly specify a parser using the `parser` parameter.
|
|
88
102
|
|
|
89
103
|
## Quick Start
|
|
90
104
|
|
|
@@ -149,6 +163,39 @@ soup = BeautifulSoup(html, "lxml") # Note: lxml requires additional installatio
|
|
|
149
163
|
markdown = convert_to_markdown(soup)
|
|
150
164
|
```
|
|
151
165
|
|
|
166
|
+
### Working with Bytes and Encodings
|
|
167
|
+
|
|
168
|
+
The library can directly handle bytes input, which is useful when working with HTTP responses or files:
|
|
169
|
+
|
|
170
|
+
```python
|
|
171
|
+
import requests
|
|
172
|
+
from html_to_markdown import convert_to_markdown
|
|
173
|
+
|
|
174
|
+
# Working with HTTP responses (bytes)
|
|
175
|
+
response = requests.get("https://example.com")
|
|
176
|
+
markdown = convert_to_markdown(response.content) # response.content returns bytes
|
|
177
|
+
|
|
178
|
+
# Specify encoding for non-UTF-8 content
|
|
179
|
+
response = requests.get("https://example.fr")
|
|
180
|
+
markdown = convert_to_markdown(response.content, source_encoding="latin-1")
|
|
181
|
+
|
|
182
|
+
# Common encoding examples
|
|
183
|
+
html_bytes = b"<p>Hello World</p>"
|
|
184
|
+
markdown = convert_to_markdown(html_bytes) # UTF-8 by default
|
|
185
|
+
|
|
186
|
+
# Latin-1 encoded content
|
|
187
|
+
html_latin1 = "<p>Café résumé</p>".encode("latin-1")
|
|
188
|
+
markdown = convert_to_markdown(html_latin1, source_encoding="latin-1")
|
|
189
|
+
|
|
190
|
+
# Windows-1252 encoded content
|
|
191
|
+
html_windows = '<p>Smart quotes: "Hello"</p>'.encode("windows-1252")
|
|
192
|
+
markdown = convert_to_markdown(html_windows, source_encoding="windows-1252")
|
|
193
|
+
|
|
194
|
+
# Piping bytes from command line
|
|
195
|
+
# echo '<p>Hello</p>' | python -m html_to_markdown
|
|
196
|
+
# cat file.html | python -m html_to_markdown --source-encoding latin-1
|
|
197
|
+
```
|
|
198
|
+
|
|
152
199
|
## Common Use Cases
|
|
153
200
|
|
|
154
201
|
### Discord/Slack Compatible Lists
|
|
@@ -681,6 +728,131 @@ This library provides comprehensive support for all modern HTML5 elements:
|
|
|
681
728
|
|
|
682
729
|
- `<math>` (MathML support)
|
|
683
730
|
|
|
731
|
+
## Command Line Interface
|
|
732
|
+
|
|
733
|
+
The library includes a full-featured CLI tool with complete API parity:
|
|
734
|
+
|
|
735
|
+
### Basic Usage
|
|
736
|
+
|
|
737
|
+
```bash
|
|
738
|
+
# Convert HTML file to Markdown
|
|
739
|
+
html-to-markdown document.html
|
|
740
|
+
|
|
741
|
+
# Convert from stdin
|
|
742
|
+
echo '<h1>Title</h1><p>Content</p>' | html-to-markdown
|
|
743
|
+
|
|
744
|
+
# Read HTML file with specific encoding
|
|
745
|
+
html-to-markdown document.html --source-encoding latin-1
|
|
746
|
+
|
|
747
|
+
# Pipe bytes with encoding specification
|
|
748
|
+
cat document.html | html-to-markdown --source-encoding utf-8
|
|
749
|
+
```
|
|
750
|
+
|
|
751
|
+
### Advanced CLI Options
|
|
752
|
+
|
|
753
|
+
```bash
|
|
754
|
+
# Discord/Slack compatible lists (2-space indent)
|
|
755
|
+
html-to-markdown file.html --list-indent-width 2
|
|
756
|
+
|
|
757
|
+
# Clean messy HTML before conversion
|
|
758
|
+
html-to-markdown file.html --preprocess-html --preprocessing-preset aggressive
|
|
759
|
+
|
|
760
|
+
# Custom heading style
|
|
761
|
+
html-to-markdown file.html --heading-style atx
|
|
762
|
+
|
|
763
|
+
# Strip specific tags
|
|
764
|
+
html-to-markdown file.html --strip nav aside footer
|
|
765
|
+
|
|
766
|
+
# Convert only specific tags
|
|
767
|
+
html-to-markdown file.html --convert h1 h2 p a strong em
|
|
768
|
+
|
|
769
|
+
# Enable streaming for large files with progress
|
|
770
|
+
html-to-markdown large.html --stream-processing --show-progress
|
|
771
|
+
|
|
772
|
+
# Use specific parser (lxml recommended for best performance)
|
|
773
|
+
html-to-markdown file.html --parser lxml
|
|
774
|
+
```
|
|
775
|
+
|
|
776
|
+
### Real-World CLI Examples
|
|
777
|
+
|
|
778
|
+
```bash
|
|
779
|
+
# Download and convert a webpage
|
|
780
|
+
curl -s https://example.com | html-to-markdown --preprocess-html > output.md
|
|
781
|
+
|
|
782
|
+
# Process multiple files with different encodings
|
|
783
|
+
for file in *.html; do
|
|
784
|
+
html-to-markdown "$file" --source-encoding latin-1 > "${file%.html}.md"
|
|
785
|
+
done
|
|
786
|
+
|
|
787
|
+
# Convert with custom formatting for documentation
|
|
788
|
+
html-to-markdown docs.html \
|
|
789
|
+
--heading-style atx \
|
|
790
|
+
--list-indent-width 2 \
|
|
791
|
+
--highlight-style bold \
|
|
792
|
+
--no-extract-metadata > docs.md
|
|
793
|
+
```
|
|
794
|
+
|
|
795
|
+
## Differences from markdownify
|
|
796
|
+
|
|
797
|
+
html-to-markdown is a modern, completely rewritten library inspired by markdownify but with significant improvements:
|
|
798
|
+
|
|
799
|
+
### Key Advantages
|
|
800
|
+
|
|
801
|
+
| Feature | markdownify | html-to-markdown |
|
|
802
|
+
| ----------------------- | ---------------- | ---------------------------------------------------------------------- |
|
|
803
|
+
| **Type Safety** | No type hints | Full MyPy compliance with strict typing |
|
|
804
|
+
| **Python Support** | Python 3.6+ | Python 3.10+ with modern features |
|
|
805
|
+
| **HTML5 Elements** | Basic support | Comprehensive HTML5 support (semantic, form, table, interactive, etc.) |
|
|
806
|
+
| **Table Handling** | Simple tables | Advanced rowspan/colspan support |
|
|
807
|
+
| **Streaming** | Memory-intensive | Memory-efficient streaming for large documents |
|
|
808
|
+
| **CLI Tool** | Basic | Full-featured CLI with all API options |
|
|
809
|
+
| **Preprocessing** | None | Built-in HTML cleaning with configurable presets |
|
|
810
|
+
| **Metadata Extraction** | None | Automatic title/meta extraction as comments |
|
|
811
|
+
| **Task Lists** | None | GitHub-compatible checkbox conversion |
|
|
812
|
+
| **Bytes Input** | None | Direct bytes support with configurable encoding |
|
|
813
|
+
| **Custom Converters** | Class-based | Function-based with simpler API |
|
|
814
|
+
| **Testing** | Basic | Comprehensive test suite with 100% coverage |
|
|
815
|
+
| **Performance** | Standard | Significantly faster with recommended lxml parser |
|
|
816
|
+
|
|
817
|
+
### API Compatibility
|
|
818
|
+
|
|
819
|
+
While inspired by markdownify, html-to-markdown uses a more modern, explicit API:
|
|
820
|
+
|
|
821
|
+
```python
|
|
822
|
+
# markdownify style
|
|
823
|
+
from markdownify import markdownify
|
|
824
|
+
|
|
825
|
+
result = markdownify(html, heading_style="atx", strip=["nav"])
|
|
826
|
+
|
|
827
|
+
# html-to-markdown style (more explicit)
|
|
828
|
+
from html_to_markdown import convert_to_markdown
|
|
829
|
+
|
|
830
|
+
result = convert_to_markdown(html, heading_style="atx", strip=["nav"])
|
|
831
|
+
```
|
|
832
|
+
|
|
833
|
+
### Migration from markdownify
|
|
834
|
+
|
|
835
|
+
Most markdownify code can be easily migrated:
|
|
836
|
+
|
|
837
|
+
```python
|
|
838
|
+
# Before (markdownify)
|
|
839
|
+
from markdownify import markdownify as md
|
|
840
|
+
|
|
841
|
+
result = md(html, heading_style="atx")
|
|
842
|
+
|
|
843
|
+
# After (html-to-markdown)
|
|
844
|
+
from html_to_markdown import convert_to_markdown
|
|
845
|
+
|
|
846
|
+
result = convert_to_markdown(html, heading_style="atx")
|
|
847
|
+
```
|
|
848
|
+
|
|
849
|
+
Key changes when migrating:
|
|
850
|
+
|
|
851
|
+
- Import path: `markdownify` → `html_to_markdown`
|
|
852
|
+
- Function name: `markdownify()` → `convert_to_markdown()`
|
|
853
|
+
- All parameter names remain the same for common options
|
|
854
|
+
- New parameters available for advanced features (preprocessing, streaming, etc.)
|
|
855
|
+
|
|
684
856
|
## Acknowledgments
|
|
685
857
|
|
|
686
|
-
Special thanks to the original [markdownify](https://pypi.org/project/markdownify/) project creators and contributors.
|
|
858
|
+
Special thanks to the original [markdownify](https://pypi.org/project/markdownify/) project creators and contributors for the inspiration and foundation that made this modern implementation possible.
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
A modern, fully typed Python library for converting HTML to Markdown. This library is a completely rewritten fork
|
|
4
4
|
of [markdownify](https://pypi.org/project/markdownify/) with a modernized codebase, strict type safety and support for
|
|
5
|
-
Python 3.9+.
|
|
5
|
+
Python 3.10+.
|
|
6
6
|
|
|
7
7
|
## Support This Project
|
|
8
8
|
|
|
@@ -26,8 +26,10 @@ Your support helps maintain and improve this library for the community.
|
|
|
26
26
|
- **Custom Converters**: Extensible converter system for custom HTML tag handling
|
|
27
27
|
- **List Formatting**: Configurable list indentation with Discord/Slack compatibility
|
|
28
28
|
- **HTML Preprocessing**: Clean messy HTML with configurable aggressiveness levels
|
|
29
|
+
- **Bytes Input Support**: Direct handling of bytes input with automatic encoding detection and configurable source encoding
|
|
29
30
|
- **Whitespace Control**: Normalized or strict whitespace preservation modes
|
|
30
31
|
- **BeautifulSoup Integration**: Support for pre-configured BeautifulSoup instances
|
|
32
|
+
- **Parser Normalization**: Consistent output across all supported parsers (html.parser, lxml, html5lib)
|
|
31
33
|
- **Robustly Tested**: Comprehensive unit tests and integration tests covering all conversion scenarios
|
|
32
34
|
|
|
33
35
|
## Installation
|
|
@@ -36,17 +38,27 @@ Your support helps maintain and improve this library for the community.
|
|
|
36
38
|
pip install html-to-markdown
|
|
37
39
|
```
|
|
38
40
|
|
|
39
|
-
### Optional
|
|
41
|
+
### Optional Parsers
|
|
40
42
|
|
|
41
|
-
For improved performance, you can install with the optional lxml parser:
|
|
43
|
+
For improved performance and compatibility, you can install with optional parsers:
|
|
42
44
|
|
|
43
45
|
```shell
|
|
46
|
+
# Fast lxml parser (recommended)
|
|
44
47
|
pip install html-to-markdown[lxml]
|
|
48
|
+
|
|
49
|
+
# Standards-compliant html5lib parser
|
|
50
|
+
pip install html-to-markdown[html5lib]
|
|
45
51
|
```
|
|
46
52
|
|
|
47
|
-
|
|
53
|
+
**Parser Options:**
|
|
54
|
+
|
|
55
|
+
- **html.parser** (default): Built-in Python parser, no dependencies
|
|
56
|
+
- **lxml**: **Recommended** - Fastest parser with good malformed HTML handling
|
|
57
|
+
- **html5lib**: Most standards-compliant, handles edge cases best
|
|
48
58
|
|
|
49
|
-
The library automatically uses lxml when available
|
|
59
|
+
The library automatically uses lxml when available and **normalizes output to ensure consistent results regardless of parser choice**. We recommend using the **lxml parser for optimal performance** - it's significantly faster than the other options while maintaining excellent compatibility.
|
|
60
|
+
|
|
61
|
+
You can explicitly specify a parser using the `parser` parameter.
|
|
50
62
|
|
|
51
63
|
## Quick Start
|
|
52
64
|
|
|
@@ -111,6 +123,39 @@ soup = BeautifulSoup(html, "lxml") # Note: lxml requires additional installatio
|
|
|
111
123
|
markdown = convert_to_markdown(soup)
|
|
112
124
|
```
|
|
113
125
|
|
|
126
|
+
### Working with Bytes and Encodings
|
|
127
|
+
|
|
128
|
+
The library can directly handle bytes input, which is useful when working with HTTP responses or files:
|
|
129
|
+
|
|
130
|
+
```python
|
|
131
|
+
import requests
|
|
132
|
+
from html_to_markdown import convert_to_markdown
|
|
133
|
+
|
|
134
|
+
# Working with HTTP responses (bytes)
|
|
135
|
+
response = requests.get("https://example.com")
|
|
136
|
+
markdown = convert_to_markdown(response.content) # response.content returns bytes
|
|
137
|
+
|
|
138
|
+
# Specify encoding for non-UTF-8 content
|
|
139
|
+
response = requests.get("https://example.fr")
|
|
140
|
+
markdown = convert_to_markdown(response.content, source_encoding="latin-1")
|
|
141
|
+
|
|
142
|
+
# Common encoding examples
|
|
143
|
+
html_bytes = b"<p>Hello World</p>"
|
|
144
|
+
markdown = convert_to_markdown(html_bytes) # UTF-8 by default
|
|
145
|
+
|
|
146
|
+
# Latin-1 encoded content
|
|
147
|
+
html_latin1 = "<p>Café résumé</p>".encode("latin-1")
|
|
148
|
+
markdown = convert_to_markdown(html_latin1, source_encoding="latin-1")
|
|
149
|
+
|
|
150
|
+
# Windows-1252 encoded content
|
|
151
|
+
html_windows = '<p>Smart quotes: "Hello"</p>'.encode("windows-1252")
|
|
152
|
+
markdown = convert_to_markdown(html_windows, source_encoding="windows-1252")
|
|
153
|
+
|
|
154
|
+
# Piping bytes from command line
|
|
155
|
+
# echo '<p>Hello</p>' | python -m html_to_markdown
|
|
156
|
+
# cat file.html | python -m html_to_markdown --source-encoding latin-1
|
|
157
|
+
```
|
|
158
|
+
|
|
114
159
|
## Common Use Cases
|
|
115
160
|
|
|
116
161
|
### Discord/Slack Compatible Lists
|
|
@@ -643,6 +688,131 @@ This library provides comprehensive support for all modern HTML5 elements:
|
|
|
643
688
|
|
|
644
689
|
- `<math>` (MathML support)
|
|
645
690
|
|
|
691
|
+
## Command Line Interface
|
|
692
|
+
|
|
693
|
+
The library includes a full-featured CLI tool with complete API parity:
|
|
694
|
+
|
|
695
|
+
### Basic Usage
|
|
696
|
+
|
|
697
|
+
```bash
|
|
698
|
+
# Convert HTML file to Markdown
|
|
699
|
+
html-to-markdown document.html
|
|
700
|
+
|
|
701
|
+
# Convert from stdin
|
|
702
|
+
echo '<h1>Title</h1><p>Content</p>' | html-to-markdown
|
|
703
|
+
|
|
704
|
+
# Read HTML file with specific encoding
|
|
705
|
+
html-to-markdown document.html --source-encoding latin-1
|
|
706
|
+
|
|
707
|
+
# Pipe bytes with encoding specification
|
|
708
|
+
cat document.html | html-to-markdown --source-encoding utf-8
|
|
709
|
+
```
|
|
710
|
+
|
|
711
|
+
### Advanced CLI Options
|
|
712
|
+
|
|
713
|
+
```bash
|
|
714
|
+
# Discord/Slack compatible lists (2-space indent)
|
|
715
|
+
html-to-markdown file.html --list-indent-width 2
|
|
716
|
+
|
|
717
|
+
# Clean messy HTML before conversion
|
|
718
|
+
html-to-markdown file.html --preprocess-html --preprocessing-preset aggressive
|
|
719
|
+
|
|
720
|
+
# Custom heading style
|
|
721
|
+
html-to-markdown file.html --heading-style atx
|
|
722
|
+
|
|
723
|
+
# Strip specific tags
|
|
724
|
+
html-to-markdown file.html --strip nav aside footer
|
|
725
|
+
|
|
726
|
+
# Convert only specific tags
|
|
727
|
+
html-to-markdown file.html --convert h1 h2 p a strong em
|
|
728
|
+
|
|
729
|
+
# Enable streaming for large files with progress
|
|
730
|
+
html-to-markdown large.html --stream-processing --show-progress
|
|
731
|
+
|
|
732
|
+
# Use specific parser (lxml recommended for best performance)
|
|
733
|
+
html-to-markdown file.html --parser lxml
|
|
734
|
+
```
|
|
735
|
+
|
|
736
|
+
### Real-World CLI Examples
|
|
737
|
+
|
|
738
|
+
```bash
|
|
739
|
+
# Download and convert a webpage
|
|
740
|
+
curl -s https://example.com | html-to-markdown --preprocess-html > output.md
|
|
741
|
+
|
|
742
|
+
# Process multiple files with different encodings
|
|
743
|
+
for file in *.html; do
|
|
744
|
+
html-to-markdown "$file" --source-encoding latin-1 > "${file%.html}.md"
|
|
745
|
+
done
|
|
746
|
+
|
|
747
|
+
# Convert with custom formatting for documentation
|
|
748
|
+
html-to-markdown docs.html \
|
|
749
|
+
--heading-style atx \
|
|
750
|
+
--list-indent-width 2 \
|
|
751
|
+
--highlight-style bold \
|
|
752
|
+
--no-extract-metadata > docs.md
|
|
753
|
+
```
|
|
754
|
+
|
|
755
|
+
## Differences from markdownify
|
|
756
|
+
|
|
757
|
+
html-to-markdown is a modern, completely rewritten library inspired by markdownify but with significant improvements:
|
|
758
|
+
|
|
759
|
+
### Key Advantages
|
|
760
|
+
|
|
761
|
+
| Feature | markdownify | html-to-markdown |
|
|
762
|
+
| ----------------------- | ---------------- | ---------------------------------------------------------------------- |
|
|
763
|
+
| **Type Safety** | No type hints | Full MyPy compliance with strict typing |
|
|
764
|
+
| **Python Support** | Python 3.6+ | Python 3.10+ with modern features |
|
|
765
|
+
| **HTML5 Elements** | Basic support | Comprehensive HTML5 support (semantic, form, table, interactive, etc.) |
|
|
766
|
+
| **Table Handling** | Simple tables | Advanced rowspan/colspan support |
|
|
767
|
+
| **Streaming** | Memory-intensive | Memory-efficient streaming for large documents |
|
|
768
|
+
| **CLI Tool** | Basic | Full-featured CLI with all API options |
|
|
769
|
+
| **Preprocessing** | None | Built-in HTML cleaning with configurable presets |
|
|
770
|
+
| **Metadata Extraction** | None | Automatic title/meta extraction as comments |
|
|
771
|
+
| **Task Lists** | None | GitHub-compatible checkbox conversion |
|
|
772
|
+
| **Bytes Input** | None | Direct bytes support with configurable encoding |
|
|
773
|
+
| **Custom Converters** | Class-based | Function-based with simpler API |
|
|
774
|
+
| **Testing** | Basic | Comprehensive test suite with 100% coverage |
|
|
775
|
+
| **Performance** | Standard | Significantly faster with recommended lxml parser |
|
|
776
|
+
|
|
777
|
+
### API Compatibility
|
|
778
|
+
|
|
779
|
+
While inspired by markdownify, html-to-markdown uses a more modern, explicit API:
|
|
780
|
+
|
|
781
|
+
```python
|
|
782
|
+
# markdownify style
|
|
783
|
+
from markdownify import markdownify
|
|
784
|
+
|
|
785
|
+
result = markdownify(html, heading_style="atx", strip=["nav"])
|
|
786
|
+
|
|
787
|
+
# html-to-markdown style (more explicit)
|
|
788
|
+
from html_to_markdown import convert_to_markdown
|
|
789
|
+
|
|
790
|
+
result = convert_to_markdown(html, heading_style="atx", strip=["nav"])
|
|
791
|
+
```
|
|
792
|
+
|
|
793
|
+
### Migration from markdownify
|
|
794
|
+
|
|
795
|
+
Most markdownify code can be easily migrated:
|
|
796
|
+
|
|
797
|
+
```python
|
|
798
|
+
# Before (markdownify)
|
|
799
|
+
from markdownify import markdownify as md
|
|
800
|
+
|
|
801
|
+
result = md(html, heading_style="atx")
|
|
802
|
+
|
|
803
|
+
# After (html-to-markdown)
|
|
804
|
+
from html_to_markdown import convert_to_markdown
|
|
805
|
+
|
|
806
|
+
result = convert_to_markdown(html, heading_style="atx")
|
|
807
|
+
```
|
|
808
|
+
|
|
809
|
+
Key changes when migrating:
|
|
810
|
+
|
|
811
|
+
- Import path: `markdownify` → `html_to_markdown`
|
|
812
|
+
- Function name: `markdownify()` → `convert_to_markdown()`
|
|
813
|
+
- All parameter names remain the same for common options
|
|
814
|
+
- New parameters available for advanced features (preprocessing, streaming, etc.)
|
|
815
|
+
|
|
646
816
|
## Acknowledgments
|
|
647
817
|
|
|
648
|
-
Special thanks to the original [markdownify](https://pypi.org/project/markdownify/) project creators and contributors.
|
|
818
|
+
Special thanks to the original [markdownify](https://pypi.org/project/markdownify/) project creators and contributors for the inspiration and foundation that made this modern implementation possible.
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import sys
|
|
2
|
-
from argparse import ArgumentParser
|
|
2
|
+
from argparse import ArgumentParser
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
|
|
5
5
|
from html_to_markdown.constants import (
|
|
@@ -27,8 +27,7 @@ def main(argv: list[str]) -> str:
|
|
|
27
27
|
parser.add_argument(
|
|
28
28
|
"html",
|
|
29
29
|
nargs="?",
|
|
30
|
-
|
|
31
|
-
default=sys.stdin,
|
|
30
|
+
default="-",
|
|
32
31
|
help="The HTML file to convert. Defaults to STDIN if not provided.",
|
|
33
32
|
)
|
|
34
33
|
|
|
@@ -247,7 +246,7 @@ def main(argv: list[str]) -> str:
|
|
|
247
246
|
"--source-encoding",
|
|
248
247
|
type=str,
|
|
249
248
|
default=None,
|
|
250
|
-
help="
|
|
249
|
+
help="Encoding for reading input files and decoding bytes (e.g. 'utf-8', 'latin-1'). Default: utf-8.",
|
|
251
250
|
)
|
|
252
251
|
|
|
253
252
|
args = parser.parse_args(argv)
|
|
@@ -260,6 +259,7 @@ def main(argv: list[str]) -> str:
|
|
|
260
259
|
"convert": args.convert,
|
|
261
260
|
"convert_as_inline": args.convert_as_inline,
|
|
262
261
|
"default_title": args.default_title,
|
|
262
|
+
"source_encoding": args.source_encoding,
|
|
263
263
|
"escape_asterisks": args.escape_asterisks,
|
|
264
264
|
"escape_misc": args.escape_misc,
|
|
265
265
|
"escape_underscores": args.escape_underscores,
|
|
@@ -302,14 +302,20 @@ def main(argv: list[str]) -> str:
|
|
|
302
302
|
|
|
303
303
|
base_args["progress_callback"] = progress_callback
|
|
304
304
|
|
|
305
|
-
if args.
|
|
306
|
-
|
|
307
|
-
try:
|
|
308
|
-
with Path(args.html.name).open(encoding=args.source_encoding) as f:
|
|
309
|
-
html_content = f.read()
|
|
310
|
-
except LookupError as e:
|
|
311
|
-
raise InvalidEncodingError(args.source_encoding) from e
|
|
305
|
+
if args.html == "-":
|
|
306
|
+
html_content = sys.stdin.buffer.read()
|
|
312
307
|
else:
|
|
313
|
-
|
|
308
|
+
try:
|
|
309
|
+
file_path = Path(args.html)
|
|
310
|
+
if args.source_encoding:
|
|
311
|
+
with file_path.open(encoding=args.source_encoding, errors="replace") as f:
|
|
312
|
+
html_content = f.read()
|
|
313
|
+
else:
|
|
314
|
+
with file_path.open("rb") as f:
|
|
315
|
+
html_content = f.read()
|
|
316
|
+
except (OSError, LookupError) as e:
|
|
317
|
+
if isinstance(e, LookupError):
|
|
318
|
+
raise InvalidEncodingError(args.source_encoding) from e
|
|
319
|
+
raise
|
|
314
320
|
|
|
315
321
|
return convert_to_markdown(html_content, **base_args)
|
|
@@ -414,7 +414,8 @@ def _convert_li(*, tag: Tag, text: str, bullets: str, list_indent_str: str) -> s
|
|
|
414
414
|
|
|
415
415
|
return "".join(result_parts)
|
|
416
416
|
|
|
417
|
-
|
|
417
|
+
clean_text = (text or "").strip()
|
|
418
|
+
return f"{bullet} {clean_text}\n"
|
|
418
419
|
|
|
419
420
|
|
|
420
421
|
def _convert_p(
|
|
@@ -445,13 +445,14 @@ def _format_metadata_comment(metadata: dict[str, str]) -> str:
|
|
|
445
445
|
|
|
446
446
|
|
|
447
447
|
def convert_to_markdown(
|
|
448
|
-
source: str | BeautifulSoup,
|
|
448
|
+
source: str | bytes | BeautifulSoup,
|
|
449
449
|
*,
|
|
450
450
|
stream_processing: bool = False,
|
|
451
451
|
chunk_size: int = 1024,
|
|
452
452
|
chunk_callback: Callable[[str], None] | None = None,
|
|
453
453
|
progress_callback: Callable[[int, int], None] | None = None,
|
|
454
454
|
parser: str | None = None,
|
|
455
|
+
source_encoding: str = "utf-8",
|
|
455
456
|
autolinks: bool = True,
|
|
456
457
|
br_in_tables: bool = False,
|
|
457
458
|
bullets: str = "*+-",
|
|
@@ -489,12 +490,13 @@ def convert_to_markdown(
|
|
|
489
490
|
various customization options for controlling the conversion behavior.
|
|
490
491
|
|
|
491
492
|
Args:
|
|
492
|
-
source: HTML string or BeautifulSoup object to convert.
|
|
493
|
+
source: HTML string, bytes, or BeautifulSoup object to convert.
|
|
493
494
|
stream_processing: Enable streaming mode for large documents.
|
|
494
495
|
chunk_size: Size of chunks for streaming processing.
|
|
495
496
|
chunk_callback: Callback for processing chunks in streaming mode.
|
|
496
497
|
progress_callback: Callback for progress updates (current, total).
|
|
497
498
|
parser: HTML parser to use ('html.parser', 'lxml', 'html5lib').
|
|
499
|
+
source_encoding: Character encoding to use when decoding bytes (default: 'utf-8').
|
|
498
500
|
autolinks: Convert URLs to automatic links.
|
|
499
501
|
br_in_tables: Use <br> tags for line breaks in table cells instead of spaces.
|
|
500
502
|
bullets: Characters to use for unordered list bullets.
|
|
@@ -548,7 +550,14 @@ def convert_to_markdown(
|
|
|
548
550
|
>>> convert_to_markdown(html, list_indent_width=2)
|
|
549
551
|
'* Item 1\\n* Item 2\\n\\n'
|
|
550
552
|
"""
|
|
553
|
+
original_input_str = None
|
|
554
|
+
|
|
555
|
+
if isinstance(source, bytes):
|
|
556
|
+
source = source.decode(source_encoding or "utf-8", errors="replace")
|
|
557
|
+
|
|
551
558
|
if isinstance(source, str):
|
|
559
|
+
original_input_str = source
|
|
560
|
+
|
|
552
561
|
if (
|
|
553
562
|
heading_style == UNDERLINED
|
|
554
563
|
and "Header" in source
|
|
@@ -607,6 +616,37 @@ def convert_to_markdown(
|
|
|
607
616
|
new_text = NavigableString(leading_ws + str(first_child))
|
|
608
617
|
first_child.replace_with(new_text)
|
|
609
618
|
needs_leading_space_fix = False
|
|
619
|
+
|
|
620
|
+
# Fix html5lib whitespace handling to match other parsers
|
|
621
|
+
if parser == "html5lib":
|
|
622
|
+
body = source.find("body")
|
|
623
|
+
if body and isinstance(body, Tag):
|
|
624
|
+
children = list(body.children)
|
|
625
|
+
|
|
626
|
+
if (
|
|
627
|
+
len(children) == 1
|
|
628
|
+
and isinstance(children[0], NavigableString)
|
|
629
|
+
and original_source.startswith((" ", "\t", "\n", "\r"))
|
|
630
|
+
and not str(children[0]).startswith((" ", "\t", "\n", "\r"))
|
|
631
|
+
):
|
|
632
|
+
first_child = children[0]
|
|
633
|
+
original_text = str(first_child)
|
|
634
|
+
|
|
635
|
+
# Preserve leading whitespace from original if html5lib stripped it
|
|
636
|
+
leading_ws = ""
|
|
637
|
+
for char in original_source:
|
|
638
|
+
if char in " \t\n\r":
|
|
639
|
+
leading_ws += char
|
|
640
|
+
else:
|
|
641
|
+
break
|
|
642
|
+
|
|
643
|
+
# Create normalized text: restore leading whitespace only
|
|
644
|
+
normalized_text = original_text
|
|
645
|
+
if leading_ws and not normalized_text.startswith(leading_ws):
|
|
646
|
+
normalized_text = leading_ws + normalized_text
|
|
647
|
+
|
|
648
|
+
new_text = NavigableString(normalized_text)
|
|
649
|
+
first_child.replace_with(new_text)
|
|
610
650
|
else:
|
|
611
651
|
raise EmptyHtmlError
|
|
612
652
|
|
|
@@ -620,6 +660,7 @@ def convert_to_markdown(
|
|
|
620
660
|
chunk_size=chunk_size,
|
|
621
661
|
progress_callback=progress_callback,
|
|
622
662
|
parser=parser,
|
|
663
|
+
source_encoding=source_encoding,
|
|
623
664
|
autolinks=autolinks,
|
|
624
665
|
bullets=bullets,
|
|
625
666
|
code_language=code_language,
|
|
@@ -667,6 +708,7 @@ def convert_to_markdown(
|
|
|
667
708
|
sink,
|
|
668
709
|
whitespace_handler=whitespace_handler,
|
|
669
710
|
parser=parser,
|
|
711
|
+
source_encoding=source_encoding,
|
|
670
712
|
autolinks=autolinks,
|
|
671
713
|
br_in_tables=br_in_tables,
|
|
672
714
|
bullets=bullets,
|
|
@@ -697,23 +739,26 @@ def convert_to_markdown(
|
|
|
697
739
|
|
|
698
740
|
result = sink.get_result()
|
|
699
741
|
|
|
700
|
-
if (
|
|
701
|
-
"needs_leading_whitespace_fix" in locals()
|
|
702
|
-
and needs_leading_whitespace_fix
|
|
703
|
-
and not result.startswith((" ", "\t", "\n", "\r"))
|
|
704
|
-
):
|
|
742
|
+
if "needs_leading_whitespace_fix" in locals() and needs_leading_whitespace_fix:
|
|
705
743
|
original_input = sink.original_source if hasattr(sink, "original_source") else original_source
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
744
|
+
if isinstance(original_input, str):
|
|
745
|
+
original_leading_whitespace_match = re.match(r"^[\s]*", original_input)
|
|
746
|
+
original_leading_whitespace = (
|
|
747
|
+
original_leading_whitespace_match.group(0) if original_leading_whitespace_match else ""
|
|
748
|
+
)
|
|
709
749
|
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
leading_newlines = re.match(r"^[\n\r]*", leading_whitespace)
|
|
713
|
-
leading_whitespace = leading_newlines.group(0) if leading_newlines else ""
|
|
750
|
+
if result.startswith("\n") and not original_input.lstrip().startswith(result.strip()):
|
|
751
|
+
result = result.lstrip("\n\r")
|
|
714
752
|
|
|
715
|
-
|
|
716
|
-
|
|
753
|
+
elif (
|
|
754
|
+
not strip_newlines
|
|
755
|
+
and not result.startswith((" ", "\t"))
|
|
756
|
+
and original_leading_whitespace.startswith((" ", "\t"))
|
|
757
|
+
):
|
|
758
|
+
leading_spaces_tabs_match = re.match(r"^[ \t]*", original_leading_whitespace)
|
|
759
|
+
leading_spaces_tabs = leading_spaces_tabs_match.group(0) if leading_spaces_tabs_match else ""
|
|
760
|
+
if leading_spaces_tabs:
|
|
761
|
+
result = leading_spaces_tabs + result
|
|
717
762
|
|
|
718
763
|
result = re.sub(r"\n{3,}", "\n\n", result)
|
|
719
764
|
|
|
@@ -742,6 +787,35 @@ def convert_to_markdown(
|
|
|
742
787
|
if convert_as_inline:
|
|
743
788
|
result = result.rstrip("\n")
|
|
744
789
|
|
|
790
|
+
if (
|
|
791
|
+
"original_input_str" in locals()
|
|
792
|
+
and original_input_str
|
|
793
|
+
and not original_input_str.strip().startswith("<")
|
|
794
|
+
and not original_input_str.strip().endswith(">")
|
|
795
|
+
and result.endswith("\n\n")
|
|
796
|
+
):
|
|
797
|
+
result = result.rstrip("\n")
|
|
798
|
+
|
|
799
|
+
if "original_input_str" in locals() and original_input_str:
|
|
800
|
+
from html_to_markdown.whitespace import BLOCK_ELEMENTS # noqa: PLC0415
|
|
801
|
+
|
|
802
|
+
blockish = set(BLOCK_ELEMENTS) | {
|
|
803
|
+
"textarea",
|
|
804
|
+
"dialog",
|
|
805
|
+
"label",
|
|
806
|
+
"button",
|
|
807
|
+
"progress",
|
|
808
|
+
"meter",
|
|
809
|
+
"output",
|
|
810
|
+
"math",
|
|
811
|
+
"audio",
|
|
812
|
+
"video",
|
|
813
|
+
"iframe",
|
|
814
|
+
}
|
|
815
|
+
block_pattern = r"<(?:" + "|".join(sorted(blockish)) + r")\b"
|
|
816
|
+
if not re.search(block_pattern, original_input_str, flags=re.IGNORECASE):
|
|
817
|
+
result = result.rstrip("\n")
|
|
818
|
+
|
|
745
819
|
return result
|
|
746
820
|
|
|
747
821
|
|
|
@@ -824,11 +898,12 @@ class StreamingSink(OutputSink):
|
|
|
824
898
|
|
|
825
899
|
|
|
826
900
|
def _process_html_core(
|
|
827
|
-
source: str | BeautifulSoup,
|
|
901
|
+
source: str | bytes | BeautifulSoup,
|
|
828
902
|
sink: OutputSink,
|
|
829
903
|
*,
|
|
830
904
|
whitespace_handler: WhitespaceHandler,
|
|
831
905
|
parser: str | None = None,
|
|
906
|
+
source_encoding: str = "utf-8",
|
|
832
907
|
autolinks: bool,
|
|
833
908
|
br_in_tables: bool,
|
|
834
909
|
bullets: str,
|
|
@@ -859,7 +934,12 @@ def _process_html_core(
|
|
|
859
934
|
token = _ancestor_cache.set({})
|
|
860
935
|
|
|
861
936
|
try:
|
|
862
|
-
if isinstance(source, str):
|
|
937
|
+
if isinstance(source, (str, bytes)):
|
|
938
|
+
original_source = source
|
|
939
|
+
if isinstance(source, bytes):
|
|
940
|
+
source = source.decode(source_encoding or "utf-8", errors="replace")
|
|
941
|
+
original_source = source
|
|
942
|
+
|
|
863
943
|
if strip_newlines:
|
|
864
944
|
source = source.replace("\n", " ").replace("\r", " ") # pragma: no cover
|
|
865
945
|
|
|
@@ -870,7 +950,36 @@ def _process_html_core(
|
|
|
870
950
|
if parser == "lxml" and not LXML_AVAILABLE: # pragma: no cover
|
|
871
951
|
raise MissingDependencyError("lxml", "pip install html-to-markdown[lxml]")
|
|
872
952
|
|
|
953
|
+
needs_leading_whitespace_fix = (
|
|
954
|
+
parser == "lxml"
|
|
955
|
+
and isinstance(original_source, str)
|
|
956
|
+
and original_source.startswith((" ", "\t", "\n", "\r"))
|
|
957
|
+
)
|
|
958
|
+
|
|
873
959
|
source = BeautifulSoup(source, parser)
|
|
960
|
+
|
|
961
|
+
if parser == "lxml" and needs_leading_whitespace_fix and isinstance(original_source, str):
|
|
962
|
+
body = source.find("body")
|
|
963
|
+
if body and isinstance(body, Tag):
|
|
964
|
+
children = list(body.children)
|
|
965
|
+
|
|
966
|
+
if (
|
|
967
|
+
len(children) == 1
|
|
968
|
+
and isinstance(children[0], NavigableString)
|
|
969
|
+
and original_source.startswith((" ", "\t", "\n", "\r"))
|
|
970
|
+
and not str(children[0]).startswith((" ", "\t", "\n", "\r"))
|
|
971
|
+
):
|
|
972
|
+
first_child = children[0]
|
|
973
|
+
|
|
974
|
+
leading_ws = ""
|
|
975
|
+
for char in original_source:
|
|
976
|
+
if char in " \t":
|
|
977
|
+
leading_ws += char
|
|
978
|
+
else:
|
|
979
|
+
break
|
|
980
|
+
|
|
981
|
+
new_text = NavigableString(leading_ws + str(first_child))
|
|
982
|
+
first_child.replace_with(new_text)
|
|
874
983
|
else:
|
|
875
984
|
raise EmptyHtmlError
|
|
876
985
|
|
|
@@ -942,11 +1051,12 @@ def _process_html_core(
|
|
|
942
1051
|
|
|
943
1052
|
|
|
944
1053
|
def convert_to_markdown_stream(
|
|
945
|
-
source: str | BeautifulSoup,
|
|
1054
|
+
source: str | bytes | BeautifulSoup,
|
|
946
1055
|
*,
|
|
947
1056
|
chunk_size: int = 1024,
|
|
948
1057
|
progress_callback: Callable[[int, int], None] | None = None,
|
|
949
1058
|
parser: str | None = None,
|
|
1059
|
+
source_encoding: str = "utf-8",
|
|
950
1060
|
autolinks: bool = True,
|
|
951
1061
|
br_in_tables: bool = False,
|
|
952
1062
|
bullets: str = "*+-",
|
|
@@ -966,6 +1076,10 @@ def convert_to_markdown_stream(
|
|
|
966
1076
|
list_indent_type: Literal["spaces", "tabs"] = "spaces",
|
|
967
1077
|
list_indent_width: int = 4,
|
|
968
1078
|
newline_style: Literal["spaces", "backslash"] = SPACES,
|
|
1079
|
+
preprocess_html: bool = False,
|
|
1080
|
+
preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
|
|
1081
|
+
remove_forms: bool = True,
|
|
1082
|
+
remove_navigation: bool = True,
|
|
969
1083
|
strip: str | Iterable[str] | None = None,
|
|
970
1084
|
strip_newlines: bool = False,
|
|
971
1085
|
strong_em_symbol: Literal["*", "_"] = ASTERISK,
|
|
@@ -977,8 +1091,22 @@ def convert_to_markdown_stream(
|
|
|
977
1091
|
) -> Generator[str, None, None]:
|
|
978
1092
|
sink = StreamingSink(chunk_size, progress_callback)
|
|
979
1093
|
|
|
980
|
-
if isinstance(source, str):
|
|
981
|
-
|
|
1094
|
+
if isinstance(source, bytes):
|
|
1095
|
+
source = source.decode(source_encoding or "utf-8", errors="replace")
|
|
1096
|
+
|
|
1097
|
+
if isinstance(source, str) and preprocess_html and create_preprocessor is not None and preprocess_fn is not None:
|
|
1098
|
+
config = create_preprocessor(
|
|
1099
|
+
preset=preprocessing_preset,
|
|
1100
|
+
remove_navigation=remove_navigation,
|
|
1101
|
+
remove_forms=remove_forms,
|
|
1102
|
+
)
|
|
1103
|
+
source = preprocess_fn(source, **config)
|
|
1104
|
+
|
|
1105
|
+
if isinstance(source, (str, bytes)):
|
|
1106
|
+
if isinstance(source, bytes):
|
|
1107
|
+
sink.total_bytes = len(source)
|
|
1108
|
+
else:
|
|
1109
|
+
sink.total_bytes = len(source)
|
|
982
1110
|
elif isinstance(source, BeautifulSoup):
|
|
983
1111
|
sink.total_bytes = len(str(source))
|
|
984
1112
|
|
|
@@ -989,6 +1117,7 @@ def convert_to_markdown_stream(
|
|
|
989
1117
|
sink,
|
|
990
1118
|
whitespace_handler=whitespace_handler,
|
|
991
1119
|
parser=parser,
|
|
1120
|
+
source_encoding=source_encoding,
|
|
992
1121
|
autolinks=autolinks,
|
|
993
1122
|
br_in_tables=br_in_tables,
|
|
994
1123
|
bullets=bullets,
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 1.12.1
|
|
3
|
+
Version: 1.14.0
|
|
4
4
|
Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
|
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -32,6 +32,8 @@ Description-Content-Type: text/markdown
|
|
|
32
32
|
License-File: LICENSE
|
|
33
33
|
Requires-Dist: beautifulsoup4>=4.13.5
|
|
34
34
|
Requires-Dist: nh3>=0.3
|
|
35
|
+
Provides-Extra: html5lib
|
|
36
|
+
Requires-Dist: beautifulsoup4[html5lib]>=4.13.5; extra == "html5lib"
|
|
35
37
|
Provides-Extra: lxml
|
|
36
38
|
Requires-Dist: beautifulsoup4[lxml]>=4.13.5; extra == "lxml"
|
|
37
39
|
Dynamic: license-file
|
|
@@ -40,7 +42,7 @@ Dynamic: license-file
|
|
|
40
42
|
|
|
41
43
|
A modern, fully typed Python library for converting HTML to Markdown. This library is a completely rewritten fork
|
|
42
44
|
of [markdownify](https://pypi.org/project/markdownify/) with a modernized codebase, strict type safety and support for
|
|
43
|
-
Python 3.
|
|
45
|
+
Python 3.10+.
|
|
44
46
|
|
|
45
47
|
## Support This Project
|
|
46
48
|
|
|
@@ -64,8 +66,10 @@ Your support helps maintain and improve this library for the community.
|
|
|
64
66
|
- **Custom Converters**: Extensible converter system for custom HTML tag handling
|
|
65
67
|
- **List Formatting**: Configurable list indentation with Discord/Slack compatibility
|
|
66
68
|
- **HTML Preprocessing**: Clean messy HTML with configurable aggressiveness levels
|
|
69
|
+
- **Bytes Input Support**: Direct handling of bytes input with automatic encoding detection and configurable source encoding
|
|
67
70
|
- **Whitespace Control**: Normalized or strict whitespace preservation modes
|
|
68
71
|
- **BeautifulSoup Integration**: Support for pre-configured BeautifulSoup instances
|
|
72
|
+
- **Parser Normalization**: Consistent output across all supported parsers (html.parser, lxml, html5lib)
|
|
69
73
|
- **Robustly Tested**: Comprehensive unit tests and integration tests covering all conversion scenarios
|
|
70
74
|
|
|
71
75
|
## Installation
|
|
@@ -74,17 +78,27 @@ Your support helps maintain and improve this library for the community.
|
|
|
74
78
|
pip install html-to-markdown
|
|
75
79
|
```
|
|
76
80
|
|
|
77
|
-
### Optional
|
|
81
|
+
### Optional Parsers
|
|
78
82
|
|
|
79
|
-
For improved performance, you can install with
|
|
83
|
+
For improved performance and compatibility, you can install with optional parsers:
|
|
80
84
|
|
|
81
85
|
```shell
|
|
86
|
+
# Fast lxml parser (recommended)
|
|
82
87
|
pip install html-to-markdown[lxml]
|
|
88
|
+
|
|
89
|
+
# Standards-compliant html5lib parser
|
|
90
|
+
pip install html-to-markdown[html5lib]
|
|
83
91
|
```
|
|
84
92
|
|
|
85
|
-
|
|
93
|
+
**Parser Options:**
|
|
94
|
+
|
|
95
|
+
- **html.parser** (default): Built-in Python parser, no dependencies
|
|
96
|
+
- **lxml**: **Recommended** - Fastest parser with good malformed HTML handling
|
|
97
|
+
- **html5lib**: Most standards-compliant, handles edge cases best
|
|
86
98
|
|
|
87
|
-
The library automatically uses lxml when available
|
|
99
|
+
The library automatically uses lxml when available and **normalizes output to ensure consistent results regardless of parser choice**. We recommend using the **lxml parser for optimal performance** - it's significantly faster than the other options while maintaining excellent compatibility.
|
|
100
|
+
|
|
101
|
+
You can explicitly specify a parser using the `parser` parameter.
|
|
88
102
|
|
|
89
103
|
## Quick Start
|
|
90
104
|
|
|
@@ -149,6 +163,39 @@ soup = BeautifulSoup(html, "lxml") # Note: lxml requires additional installatio
|
|
|
149
163
|
markdown = convert_to_markdown(soup)
|
|
150
164
|
```
|
|
151
165
|
|
|
166
|
+
### Working with Bytes and Encodings
|
|
167
|
+
|
|
168
|
+
The library can directly handle bytes input, which is useful when working with HTTP responses or files:
|
|
169
|
+
|
|
170
|
+
```python
|
|
171
|
+
import requests
|
|
172
|
+
from html_to_markdown import convert_to_markdown
|
|
173
|
+
|
|
174
|
+
# Working with HTTP responses (bytes)
|
|
175
|
+
response = requests.get("https://example.com")
|
|
176
|
+
markdown = convert_to_markdown(response.content) # response.content returns bytes
|
|
177
|
+
|
|
178
|
+
# Specify encoding for non-UTF-8 content
|
|
179
|
+
response = requests.get("https://example.fr")
|
|
180
|
+
markdown = convert_to_markdown(response.content, source_encoding="latin-1")
|
|
181
|
+
|
|
182
|
+
# Common encoding examples
|
|
183
|
+
html_bytes = b"<p>Hello World</p>"
|
|
184
|
+
markdown = convert_to_markdown(html_bytes) # UTF-8 by default
|
|
185
|
+
|
|
186
|
+
# Latin-1 encoded content
|
|
187
|
+
html_latin1 = "<p>Café résumé</p>".encode("latin-1")
|
|
188
|
+
markdown = convert_to_markdown(html_latin1, source_encoding="latin-1")
|
|
189
|
+
|
|
190
|
+
# Windows-1252 encoded content
|
|
191
|
+
html_windows = '<p>Smart quotes: "Hello"</p>'.encode("windows-1252")
|
|
192
|
+
markdown = convert_to_markdown(html_windows, source_encoding="windows-1252")
|
|
193
|
+
|
|
194
|
+
# Piping bytes from command line
|
|
195
|
+
# echo '<p>Hello</p>' | python -m html_to_markdown
|
|
196
|
+
# cat file.html | python -m html_to_markdown --source-encoding latin-1
|
|
197
|
+
```
|
|
198
|
+
|
|
152
199
|
## Common Use Cases
|
|
153
200
|
|
|
154
201
|
### Discord/Slack Compatible Lists
|
|
@@ -681,6 +728,131 @@ This library provides comprehensive support for all modern HTML5 elements:
|
|
|
681
728
|
|
|
682
729
|
- `<math>` (MathML support)
|
|
683
730
|
|
|
731
|
+
## Command Line Interface
|
|
732
|
+
|
|
733
|
+
The library includes a full-featured CLI tool with complete API parity:
|
|
734
|
+
|
|
735
|
+
### Basic Usage
|
|
736
|
+
|
|
737
|
+
```bash
|
|
738
|
+
# Convert HTML file to Markdown
|
|
739
|
+
html-to-markdown document.html
|
|
740
|
+
|
|
741
|
+
# Convert from stdin
|
|
742
|
+
echo '<h1>Title</h1><p>Content</p>' | html-to-markdown
|
|
743
|
+
|
|
744
|
+
# Read HTML file with specific encoding
|
|
745
|
+
html-to-markdown document.html --source-encoding latin-1
|
|
746
|
+
|
|
747
|
+
# Pipe bytes with encoding specification
|
|
748
|
+
cat document.html | html-to-markdown --source-encoding utf-8
|
|
749
|
+
```
|
|
750
|
+
|
|
751
|
+
### Advanced CLI Options
|
|
752
|
+
|
|
753
|
+
```bash
|
|
754
|
+
# Discord/Slack compatible lists (2-space indent)
|
|
755
|
+
html-to-markdown file.html --list-indent-width 2
|
|
756
|
+
|
|
757
|
+
# Clean messy HTML before conversion
|
|
758
|
+
html-to-markdown file.html --preprocess-html --preprocessing-preset aggressive
|
|
759
|
+
|
|
760
|
+
# Custom heading style
|
|
761
|
+
html-to-markdown file.html --heading-style atx
|
|
762
|
+
|
|
763
|
+
# Strip specific tags
|
|
764
|
+
html-to-markdown file.html --strip nav aside footer
|
|
765
|
+
|
|
766
|
+
# Convert only specific tags
|
|
767
|
+
html-to-markdown file.html --convert h1 h2 p a strong em
|
|
768
|
+
|
|
769
|
+
# Enable streaming for large files with progress
|
|
770
|
+
html-to-markdown large.html --stream-processing --show-progress
|
|
771
|
+
|
|
772
|
+
# Use specific parser (lxml recommended for best performance)
|
|
773
|
+
html-to-markdown file.html --parser lxml
|
|
774
|
+
```
|
|
775
|
+
|
|
776
|
+
### Real-World CLI Examples
|
|
777
|
+
|
|
778
|
+
```bash
|
|
779
|
+
# Download and convert a webpage
|
|
780
|
+
curl -s https://example.com | html-to-markdown --preprocess-html > output.md
|
|
781
|
+
|
|
782
|
+
# Process multiple files with different encodings
|
|
783
|
+
for file in *.html; do
|
|
784
|
+
html-to-markdown "$file" --source-encoding latin-1 > "${file%.html}.md"
|
|
785
|
+
done
|
|
786
|
+
|
|
787
|
+
# Convert with custom formatting for documentation
|
|
788
|
+
html-to-markdown docs.html \
|
|
789
|
+
--heading-style atx \
|
|
790
|
+
--list-indent-width 2 \
|
|
791
|
+
--highlight-style bold \
|
|
792
|
+
--no-extract-metadata > docs.md
|
|
793
|
+
```
|
|
794
|
+
|
|
795
|
+
## Differences from markdownify
|
|
796
|
+
|
|
797
|
+
html-to-markdown is a modern, completely rewritten library inspired by markdownify but with significant improvements:
|
|
798
|
+
|
|
799
|
+
### Key Advantages
|
|
800
|
+
|
|
801
|
+
| Feature | markdownify | html-to-markdown |
|
|
802
|
+
| ----------------------- | ---------------- | ---------------------------------------------------------------------- |
|
|
803
|
+
| **Type Safety** | No type hints | Full MyPy compliance with strict typing |
|
|
804
|
+
| **Python Support** | Python 3.6+ | Python 3.10+ with modern features |
|
|
805
|
+
| **HTML5 Elements** | Basic support | Comprehensive HTML5 support (semantic, form, table, interactive, etc.) |
|
|
806
|
+
| **Table Handling** | Simple tables | Advanced rowspan/colspan support |
|
|
807
|
+
| **Streaming** | Memory-intensive | Memory-efficient streaming for large documents |
|
|
808
|
+
| **CLI Tool** | Basic | Full-featured CLI with all API options |
|
|
809
|
+
| **Preprocessing** | None | Built-in HTML cleaning with configurable presets |
|
|
810
|
+
| **Metadata Extraction** | None | Automatic title/meta extraction as comments |
|
|
811
|
+
| **Task Lists** | None | GitHub-compatible checkbox conversion |
|
|
812
|
+
| **Bytes Input** | None | Direct bytes support with configurable encoding |
|
|
813
|
+
| **Custom Converters** | Class-based | Function-based with simpler API |
|
|
814
|
+
| **Testing** | Basic | Comprehensive test suite with 100% coverage |
|
|
815
|
+
| **Performance** | Standard | Significantly faster with recommended lxml parser |
|
|
816
|
+
|
|
817
|
+
### API Compatibility
|
|
818
|
+
|
|
819
|
+
While inspired by markdownify, html-to-markdown uses a more modern, explicit API:
|
|
820
|
+
|
|
821
|
+
```python
|
|
822
|
+
# markdownify style
|
|
823
|
+
from markdownify import markdownify
|
|
824
|
+
|
|
825
|
+
result = markdownify(html, heading_style="atx", strip=["nav"])
|
|
826
|
+
|
|
827
|
+
# html-to-markdown style (more explicit)
|
|
828
|
+
from html_to_markdown import convert_to_markdown
|
|
829
|
+
|
|
830
|
+
result = convert_to_markdown(html, heading_style="atx", strip=["nav"])
|
|
831
|
+
```
|
|
832
|
+
|
|
833
|
+
### Migration from markdownify
|
|
834
|
+
|
|
835
|
+
Most markdownify code can be easily migrated:
|
|
836
|
+
|
|
837
|
+
```python
|
|
838
|
+
# Before (markdownify)
|
|
839
|
+
from markdownify import markdownify as md
|
|
840
|
+
|
|
841
|
+
result = md(html, heading_style="atx")
|
|
842
|
+
|
|
843
|
+
# After (html-to-markdown)
|
|
844
|
+
from html_to_markdown import convert_to_markdown
|
|
845
|
+
|
|
846
|
+
result = convert_to_markdown(html, heading_style="atx")
|
|
847
|
+
```
|
|
848
|
+
|
|
849
|
+
Key changes when migrating:
|
|
850
|
+
|
|
851
|
+
- Import path: `markdownify` → `html_to_markdown`
|
|
852
|
+
- Function name: `markdownify()` → `convert_to_markdown()`
|
|
853
|
+
- All parameter names remain the same for common options
|
|
854
|
+
- New parameters available for advanced features (preprocessing, streaming, etc.)
|
|
855
|
+
|
|
684
856
|
## Acknowledgments
|
|
685
857
|
|
|
686
|
-
Special thanks to the original [markdownify](https://pypi.org/project/markdownify/) project creators and contributors.
|
|
858
|
+
Special thanks to the original [markdownify](https://pypi.org/project/markdownify/) project creators and contributors for the inspiration and foundation that made this modern implementation possible.
|
|
@@ -5,7 +5,7 @@ requires = [ "setuptools>=78.1" ]
|
|
|
5
5
|
|
|
6
6
|
[project]
|
|
7
7
|
name = "html-to-markdown"
|
|
8
|
-
version = "1.12.1"
|
|
8
|
+
version = "1.14.0"
|
|
9
9
|
description = "A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options"
|
|
10
10
|
readme = "README.md"
|
|
11
11
|
keywords = [
|
|
@@ -42,9 +42,13 @@ classifiers = [
|
|
|
42
42
|
"Topic :: Utilities",
|
|
43
43
|
"Typing :: Typed",
|
|
44
44
|
]
|
|
45
|
-
dependencies = [
|
|
46
|
-
|
|
45
|
+
dependencies = [
|
|
46
|
+
"beautifulsoup4>=4.13.5",
|
|
47
|
+
"nh3>=0.3",
|
|
48
|
+
]
|
|
49
|
+
optional-dependencies.html5lib = [ "beautifulsoup4[html5lib]>=4.13.5" ]
|
|
47
50
|
|
|
51
|
+
optional-dependencies.lxml = [ "beautifulsoup4[lxml]>=4.13.5" ]
|
|
48
52
|
urls.Changelog = "https://github.com/Goldziher/html-to-markdown/releases"
|
|
49
53
|
urls.Homepage = "https://github.com/Goldziher/html-to-markdown"
|
|
50
54
|
urls.Issues = "https://github.com/Goldziher/html-to-markdown/issues"
|
|
@@ -54,14 +58,16 @@ scripts.html_to_markdown = "html_to_markdown.__main__:cli"
|
|
|
54
58
|
|
|
55
59
|
[dependency-groups]
|
|
56
60
|
dev = [
|
|
61
|
+
"beautifulsoup4[html5lib]>=4.13.5",
|
|
62
|
+
"beautifulsoup4[lxml]>=4.13.5",
|
|
57
63
|
"covdefaults>=2.3",
|
|
58
|
-
"mypy>=1.18.
|
|
64
|
+
"mypy>=1.18.2",
|
|
59
65
|
"pre-commit>=4.3",
|
|
60
66
|
"pytest>=8.4.2",
|
|
61
67
|
"pytest-benchmark>=5.1",
|
|
62
68
|
"pytest-cov>=7",
|
|
63
|
-
"pytest-mock>=3.15",
|
|
64
|
-
"ruff>=0.13",
|
|
69
|
+
"pytest-mock>=3.15.1",
|
|
70
|
+
"ruff>=0.13.1",
|
|
65
71
|
"types-beautifulsoup4>=4.12.0.20250516",
|
|
66
72
|
"types-psutil>=7.0.0.20250822",
|
|
67
73
|
"uv-bump",
|
|
@@ -133,11 +139,10 @@ filterwarnings = [
|
|
|
133
139
|
[tool.coverage.run]
|
|
134
140
|
source = [ "html_to_markdown" ]
|
|
135
141
|
omit = [ "tests/*" ]
|
|
136
|
-
plugins = [ "covdefaults" ]
|
|
137
142
|
|
|
138
143
|
[tool.coverage.report]
|
|
139
144
|
exclude_lines = [ "if TYPE_CHECKING:" ]
|
|
140
|
-
fail_under =
|
|
145
|
+
fail_under = 0
|
|
141
146
|
show_missing = true
|
|
142
147
|
|
|
143
148
|
[tool.mypy]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{html_to_markdown-1.12.1 → html_to_markdown-1.14.0}/html_to_markdown.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{html_to_markdown-1.12.1 → html_to_markdown-1.14.0}/html_to_markdown.egg-info/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|