html-to-markdown 1.13.0__tar.gz → 1.14.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.0}/PKG-INFO +179 -7
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.0}/README.md +176 -6
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.0}/html_to_markdown/cli.py +18 -12
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.0}/html_to_markdown/converters.py +0 -2
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.0}/html_to_markdown/processing.py +100 -27
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.0}/html_to_markdown.egg-info/PKG-INFO +179 -7
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.0}/html_to_markdown.egg-info/requires.txt +3 -0
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.0}/pyproject.toml +8 -5
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.0}/LICENSE +0 -0
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.0}/html_to_markdown/__init__.py +0 -0
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.0}/html_to_markdown/__main__.py +0 -0
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.0}/html_to_markdown/constants.py +0 -0
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.0}/html_to_markdown/exceptions.py +0 -0
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.0}/html_to_markdown/preprocessor.py +0 -0
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.0}/html_to_markdown/py.typed +0 -0
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.0}/html_to_markdown/utils.py +0 -0
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.0}/html_to_markdown/whitespace.py +0 -0
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.0}/html_to_markdown.egg-info/SOURCES.txt +0 -0
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.0}/html_to_markdown.egg-info/dependency_links.txt +0 -0
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.0}/html_to_markdown.egg-info/entry_points.txt +0 -0
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.0}/html_to_markdown.egg-info/top_level.txt +0 -0
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 1.13.0
|
|
3
|
+
Version: 1.14.0
|
|
4
4
|
Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
|
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -32,6 +32,8 @@ Description-Content-Type: text/markdown
|
|
|
32
32
|
License-File: LICENSE
|
|
33
33
|
Requires-Dist: beautifulsoup4>=4.13.5
|
|
34
34
|
Requires-Dist: nh3>=0.3
|
|
35
|
+
Provides-Extra: html5lib
|
|
36
|
+
Requires-Dist: beautifulsoup4[html5lib]>=4.13.5; extra == "html5lib"
|
|
35
37
|
Provides-Extra: lxml
|
|
36
38
|
Requires-Dist: beautifulsoup4[lxml]>=4.13.5; extra == "lxml"
|
|
37
39
|
Dynamic: license-file
|
|
@@ -40,7 +42,7 @@ Dynamic: license-file
|
|
|
40
42
|
|
|
41
43
|
A modern, fully typed Python library for converting HTML to Markdown. This library is a completely rewritten fork
|
|
42
44
|
of [markdownify](https://pypi.org/project/markdownify/) with a modernized codebase, strict type safety and support for
|
|
43
|
-
Python 3.
|
|
45
|
+
Python 3.10+.
|
|
44
46
|
|
|
45
47
|
## Support This Project
|
|
46
48
|
|
|
@@ -64,8 +66,10 @@ Your support helps maintain and improve this library for the community.
|
|
|
64
66
|
- **Custom Converters**: Extensible converter system for custom HTML tag handling
|
|
65
67
|
- **List Formatting**: Configurable list indentation with Discord/Slack compatibility
|
|
66
68
|
- **HTML Preprocessing**: Clean messy HTML with configurable aggressiveness levels
|
|
69
|
+
- **Bytes Input Support**: Direct handling of bytes input with automatic encoding detection and configurable source encoding
|
|
67
70
|
- **Whitespace Control**: Normalized or strict whitespace preservation modes
|
|
68
71
|
- **BeautifulSoup Integration**: Support for pre-configured BeautifulSoup instances
|
|
72
|
+
- **Parser Normalization**: Consistent output across all supported parsers (html.parser, lxml, html5lib)
|
|
69
73
|
- **Robustly Tested**: Comprehensive unit tests and integration tests covering all conversion scenarios
|
|
70
74
|
|
|
71
75
|
## Installation
|
|
@@ -74,17 +78,27 @@ Your support helps maintain and improve this library for the community.
|
|
|
74
78
|
pip install html-to-markdown
|
|
75
79
|
```
|
|
76
80
|
|
|
77
|
-
### Optional
|
|
81
|
+
### Optional Parsers
|
|
78
82
|
|
|
79
|
-
For improved performance, you can install with
|
|
83
|
+
For improved performance and compatibility, you can install with optional parsers:
|
|
80
84
|
|
|
81
85
|
```shell
|
|
86
|
+
# Fast lxml parser (recommended)
|
|
82
87
|
pip install html-to-markdown[lxml]
|
|
88
|
+
|
|
89
|
+
# Standards-compliant html5lib parser
|
|
90
|
+
pip install html-to-markdown[html5lib]
|
|
83
91
|
```
|
|
84
92
|
|
|
85
|
-
|
|
93
|
+
**Parser Options:**
|
|
94
|
+
|
|
95
|
+
- **html.parser** (default): Built-in Python parser, no dependencies
|
|
96
|
+
- **lxml**: **Recommended** - Fastest parser with good malformed HTML handling
|
|
97
|
+
- **html5lib**: Most standards-compliant, handles edge cases best
|
|
86
98
|
|
|
87
|
-
The library automatically uses lxml when available
|
|
99
|
+
The library automatically uses lxml when available and **normalizes output to ensure consistent results regardless of parser choice**. We recommend using the **lxml parser for optimal performance** - it's significantly faster than the other options while maintaining excellent compatibility.
|
|
100
|
+
|
|
101
|
+
You can explicitly specify a parser using the `parser` parameter.
|
|
88
102
|
|
|
89
103
|
## Quick Start
|
|
90
104
|
|
|
@@ -149,6 +163,39 @@ soup = BeautifulSoup(html, "lxml") # Note: lxml requires additional installatio
|
|
|
149
163
|
markdown = convert_to_markdown(soup)
|
|
150
164
|
```
|
|
151
165
|
|
|
166
|
+
### Working with Bytes and Encodings
|
|
167
|
+
|
|
168
|
+
The library can directly handle bytes input, which is useful when working with HTTP responses or files:
|
|
169
|
+
|
|
170
|
+
```python
|
|
171
|
+
import requests
|
|
172
|
+
from html_to_markdown import convert_to_markdown
|
|
173
|
+
|
|
174
|
+
# Working with HTTP responses (bytes)
|
|
175
|
+
response = requests.get("https://example.com")
|
|
176
|
+
markdown = convert_to_markdown(response.content) # response.content returns bytes
|
|
177
|
+
|
|
178
|
+
# Specify encoding for non-UTF-8 content
|
|
179
|
+
response = requests.get("https://example.fr")
|
|
180
|
+
markdown = convert_to_markdown(response.content, source_encoding="latin-1")
|
|
181
|
+
|
|
182
|
+
# Common encoding examples
|
|
183
|
+
html_bytes = b"<p>Hello World</p>"
|
|
184
|
+
markdown = convert_to_markdown(html_bytes) # UTF-8 by default
|
|
185
|
+
|
|
186
|
+
# Latin-1 encoded content
|
|
187
|
+
html_latin1 = "<p>Café résumé</p>".encode("latin-1")
|
|
188
|
+
markdown = convert_to_markdown(html_latin1, source_encoding="latin-1")
|
|
189
|
+
|
|
190
|
+
# Windows-1252 encoded content
|
|
191
|
+
html_windows = '<p>Smart quotes: "Hello"</p>'.encode("windows-1252")
|
|
192
|
+
markdown = convert_to_markdown(html_windows, source_encoding="windows-1252")
|
|
193
|
+
|
|
194
|
+
# Piping bytes from command line
|
|
195
|
+
# echo '<p>Hello</p>' | python -m html_to_markdown
|
|
196
|
+
# cat file.html | python -m html_to_markdown --source-encoding latin-1
|
|
197
|
+
```
|
|
198
|
+
|
|
152
199
|
## Common Use Cases
|
|
153
200
|
|
|
154
201
|
### Discord/Slack Compatible Lists
|
|
@@ -681,6 +728,131 @@ This library provides comprehensive support for all modern HTML5 elements:
|
|
|
681
728
|
|
|
682
729
|
- `<math>` (MathML support)
|
|
683
730
|
|
|
731
|
+
## Command Line Interface
|
|
732
|
+
|
|
733
|
+
The library includes a full-featured CLI tool with complete API parity:
|
|
734
|
+
|
|
735
|
+
### Basic Usage
|
|
736
|
+
|
|
737
|
+
```bash
|
|
738
|
+
# Convert HTML file to Markdown
|
|
739
|
+
html-to-markdown document.html
|
|
740
|
+
|
|
741
|
+
# Convert from stdin
|
|
742
|
+
echo '<h1>Title</h1><p>Content</p>' | html-to-markdown
|
|
743
|
+
|
|
744
|
+
# Read HTML file with specific encoding
|
|
745
|
+
html-to-markdown document.html --source-encoding latin-1
|
|
746
|
+
|
|
747
|
+
# Pipe bytes with encoding specification
|
|
748
|
+
cat document.html | html-to-markdown --source-encoding utf-8
|
|
749
|
+
```
|
|
750
|
+
|
|
751
|
+
### Advanced CLI Options
|
|
752
|
+
|
|
753
|
+
```bash
|
|
754
|
+
# Discord/Slack compatible lists (2-space indent)
|
|
755
|
+
html-to-markdown file.html --list-indent-width 2
|
|
756
|
+
|
|
757
|
+
# Clean messy HTML before conversion
|
|
758
|
+
html-to-markdown file.html --preprocess-html --preprocessing-preset aggressive
|
|
759
|
+
|
|
760
|
+
# Custom heading style
|
|
761
|
+
html-to-markdown file.html --heading-style atx
|
|
762
|
+
|
|
763
|
+
# Strip specific tags
|
|
764
|
+
html-to-markdown file.html --strip nav aside footer
|
|
765
|
+
|
|
766
|
+
# Convert only specific tags
|
|
767
|
+
html-to-markdown file.html --convert h1 h2 p a strong em
|
|
768
|
+
|
|
769
|
+
# Enable streaming for large files with progress
|
|
770
|
+
html-to-markdown large.html --stream-processing --show-progress
|
|
771
|
+
|
|
772
|
+
# Use specific parser (lxml recommended for best performance)
|
|
773
|
+
html-to-markdown file.html --parser lxml
|
|
774
|
+
```
|
|
775
|
+
|
|
776
|
+
### Real-World CLI Examples
|
|
777
|
+
|
|
778
|
+
```bash
|
|
779
|
+
# Download and convert a webpage
|
|
780
|
+
curl -s https://example.com | html-to-markdown --preprocess-html > output.md
|
|
781
|
+
|
|
782
|
+
# Process multiple files with different encodings
|
|
783
|
+
for file in *.html; do
|
|
784
|
+
html-to-markdown "$file" --source-encoding latin-1 > "${file%.html}.md"
|
|
785
|
+
done
|
|
786
|
+
|
|
787
|
+
# Convert with custom formatting for documentation
|
|
788
|
+
html-to-markdown docs.html \
|
|
789
|
+
--heading-style atx \
|
|
790
|
+
--list-indent-width 2 \
|
|
791
|
+
--highlight-style bold \
|
|
792
|
+
--no-extract-metadata > docs.md
|
|
793
|
+
```
|
|
794
|
+
|
|
795
|
+
## Differences from markdownify
|
|
796
|
+
|
|
797
|
+
html-to-markdown is a modern, completely rewritten library inspired by markdownify but with significant improvements:
|
|
798
|
+
|
|
799
|
+
### Key Advantages
|
|
800
|
+
|
|
801
|
+
| Feature | markdownify | html-to-markdown |
|
|
802
|
+
| ----------------------- | ---------------- | ---------------------------------------------------------------------- |
|
|
803
|
+
| **Type Safety** | No type hints | Full MyPy compliance with strict typing |
|
|
804
|
+
| **Python Support** | Python 3.6+ | Python 3.10+ with modern features |
|
|
805
|
+
| **HTML5 Elements** | Basic support | Comprehensive HTML5 support (semantic, form, table, interactive, etc.) |
|
|
806
|
+
| **Table Handling** | Simple tables | Advanced rowspan/colspan support |
|
|
807
|
+
| **Streaming** | Memory-intensive | Memory-efficient streaming for large documents |
|
|
808
|
+
| **CLI Tool** | Basic | Full-featured CLI with all API options |
|
|
809
|
+
| **Preprocessing** | None | Built-in HTML cleaning with configurable presets |
|
|
810
|
+
| **Metadata Extraction** | None | Automatic title/meta extraction as comments |
|
|
811
|
+
| **Task Lists** | None | GitHub-compatible checkbox conversion |
|
|
812
|
+
| **Bytes Input** | None | Direct bytes support with configurable encoding |
|
|
813
|
+
| **Custom Converters** | Class-based | Function-based with simpler API |
|
|
814
|
+
| **Testing** | Basic | Comprehensive test suite with 100% coverage |
|
|
815
|
+
| **Performance** | Standard | Significantly faster with recommended lxml parser |
|
|
816
|
+
|
|
817
|
+
### API Compatibility
|
|
818
|
+
|
|
819
|
+
While inspired by markdownify, html-to-markdown uses a more modern, explicit API:
|
|
820
|
+
|
|
821
|
+
```python
|
|
822
|
+
# markdownify style
|
|
823
|
+
from markdownify import markdownify
|
|
824
|
+
|
|
825
|
+
result = markdownify(html, heading_style="atx", strip=["nav"])
|
|
826
|
+
|
|
827
|
+
# html-to-markdown style (more explicit)
|
|
828
|
+
from html_to_markdown import convert_to_markdown
|
|
829
|
+
|
|
830
|
+
result = convert_to_markdown(html, heading_style="atx", strip=["nav"])
|
|
831
|
+
```
|
|
832
|
+
|
|
833
|
+
### Migration from markdownify
|
|
834
|
+
|
|
835
|
+
Most markdownify code can be easily migrated:
|
|
836
|
+
|
|
837
|
+
```python
|
|
838
|
+
# Before (markdownify)
|
|
839
|
+
from markdownify import markdownify as md
|
|
840
|
+
|
|
841
|
+
result = md(html, heading_style="atx")
|
|
842
|
+
|
|
843
|
+
# After (html-to-markdown)
|
|
844
|
+
from html_to_markdown import convert_to_markdown
|
|
845
|
+
|
|
846
|
+
result = convert_to_markdown(html, heading_style="atx")
|
|
847
|
+
```
|
|
848
|
+
|
|
849
|
+
Key changes when migrating:
|
|
850
|
+
|
|
851
|
+
- Import path: `markdownify` → `html_to_markdown`
|
|
852
|
+
- Function name: `markdownify()` → `convert_to_markdown()`
|
|
853
|
+
- All parameter names remain the same for common options
|
|
854
|
+
- New parameters available for advanced features (preprocessing, streaming, etc.)
|
|
855
|
+
|
|
684
856
|
## Acknowledgments
|
|
685
857
|
|
|
686
|
-
Special thanks to the original [markdownify](https://pypi.org/project/markdownify/) project creators and contributors.
|
|
858
|
+
Special thanks to the original [markdownify](https://pypi.org/project/markdownify/) project creators and contributors for the inspiration and foundation that made this modern implementation possible.
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
A modern, fully typed Python library for converting HTML to Markdown. This library is a completely rewritten fork
|
|
4
4
|
of [markdownify](https://pypi.org/project/markdownify/) with a modernized codebase, strict type safety and support for
|
|
5
|
-
Python 3.
|
|
5
|
+
Python 3.10+.
|
|
6
6
|
|
|
7
7
|
## Support This Project
|
|
8
8
|
|
|
@@ -26,8 +26,10 @@ Your support helps maintain and improve this library for the community.
|
|
|
26
26
|
- **Custom Converters**: Extensible converter system for custom HTML tag handling
|
|
27
27
|
- **List Formatting**: Configurable list indentation with Discord/Slack compatibility
|
|
28
28
|
- **HTML Preprocessing**: Clean messy HTML with configurable aggressiveness levels
|
|
29
|
+
- **Bytes Input Support**: Direct handling of bytes input with automatic encoding detection and configurable source encoding
|
|
29
30
|
- **Whitespace Control**: Normalized or strict whitespace preservation modes
|
|
30
31
|
- **BeautifulSoup Integration**: Support for pre-configured BeautifulSoup instances
|
|
32
|
+
- **Parser Normalization**: Consistent output across all supported parsers (html.parser, lxml, html5lib)
|
|
31
33
|
- **Robustly Tested**: Comprehensive unit tests and integration tests covering all conversion scenarios
|
|
32
34
|
|
|
33
35
|
## Installation
|
|
@@ -36,17 +38,27 @@ Your support helps maintain and improve this library for the community.
|
|
|
36
38
|
pip install html-to-markdown
|
|
37
39
|
```
|
|
38
40
|
|
|
39
|
-
### Optional
|
|
41
|
+
### Optional Parsers
|
|
40
42
|
|
|
41
|
-
For improved performance, you can install with
|
|
43
|
+
For improved performance and compatibility, you can install with optional parsers:
|
|
42
44
|
|
|
43
45
|
```shell
|
|
46
|
+
# Fast lxml parser (recommended)
|
|
44
47
|
pip install html-to-markdown[lxml]
|
|
48
|
+
|
|
49
|
+
# Standards-compliant html5lib parser
|
|
50
|
+
pip install html-to-markdown[html5lib]
|
|
45
51
|
```
|
|
46
52
|
|
|
47
|
-
|
|
53
|
+
**Parser Options:**
|
|
54
|
+
|
|
55
|
+
- **html.parser** (default): Built-in Python parser, no dependencies
|
|
56
|
+
- **lxml**: **Recommended** - Fastest parser with good malformed HTML handling
|
|
57
|
+
- **html5lib**: Most standards-compliant, handles edge cases best
|
|
48
58
|
|
|
49
|
-
The library automatically uses lxml when available
|
|
59
|
+
The library automatically uses lxml when available and **normalizes output to ensure consistent results regardless of parser choice**. We recommend using the **lxml parser for optimal performance** - it's significantly faster than the other options while maintaining excellent compatibility.
|
|
60
|
+
|
|
61
|
+
You can explicitly specify a parser using the `parser` parameter.
|
|
50
62
|
|
|
51
63
|
## Quick Start
|
|
52
64
|
|
|
@@ -111,6 +123,39 @@ soup = BeautifulSoup(html, "lxml") # Note: lxml requires additional installatio
|
|
|
111
123
|
markdown = convert_to_markdown(soup)
|
|
112
124
|
```
|
|
113
125
|
|
|
126
|
+
### Working with Bytes and Encodings
|
|
127
|
+
|
|
128
|
+
The library can directly handle bytes input, which is useful when working with HTTP responses or files:
|
|
129
|
+
|
|
130
|
+
```python
|
|
131
|
+
import requests
|
|
132
|
+
from html_to_markdown import convert_to_markdown
|
|
133
|
+
|
|
134
|
+
# Working with HTTP responses (bytes)
|
|
135
|
+
response = requests.get("https://example.com")
|
|
136
|
+
markdown = convert_to_markdown(response.content) # response.content returns bytes
|
|
137
|
+
|
|
138
|
+
# Specify encoding for non-UTF-8 content
|
|
139
|
+
response = requests.get("https://example.fr")
|
|
140
|
+
markdown = convert_to_markdown(response.content, source_encoding="latin-1")
|
|
141
|
+
|
|
142
|
+
# Common encoding examples
|
|
143
|
+
html_bytes = b"<p>Hello World</p>"
|
|
144
|
+
markdown = convert_to_markdown(html_bytes) # UTF-8 by default
|
|
145
|
+
|
|
146
|
+
# Latin-1 encoded content
|
|
147
|
+
html_latin1 = "<p>Café résumé</p>".encode("latin-1")
|
|
148
|
+
markdown = convert_to_markdown(html_latin1, source_encoding="latin-1")
|
|
149
|
+
|
|
150
|
+
# Windows-1252 encoded content
|
|
151
|
+
html_windows = '<p>Smart quotes: "Hello"</p>'.encode("windows-1252")
|
|
152
|
+
markdown = convert_to_markdown(html_windows, source_encoding="windows-1252")
|
|
153
|
+
|
|
154
|
+
# Piping bytes from command line
|
|
155
|
+
# echo '<p>Hello</p>' | python -m html_to_markdown
|
|
156
|
+
# cat file.html | python -m html_to_markdown --source-encoding latin-1
|
|
157
|
+
```
|
|
158
|
+
|
|
114
159
|
## Common Use Cases
|
|
115
160
|
|
|
116
161
|
### Discord/Slack Compatible Lists
|
|
@@ -643,6 +688,131 @@ This library provides comprehensive support for all modern HTML5 elements:
|
|
|
643
688
|
|
|
644
689
|
- `<math>` (MathML support)
|
|
645
690
|
|
|
691
|
+
## Command Line Interface
|
|
692
|
+
|
|
693
|
+
The library includes a full-featured CLI tool with complete API parity:
|
|
694
|
+
|
|
695
|
+
### Basic Usage
|
|
696
|
+
|
|
697
|
+
```bash
|
|
698
|
+
# Convert HTML file to Markdown
|
|
699
|
+
html-to-markdown document.html
|
|
700
|
+
|
|
701
|
+
# Convert from stdin
|
|
702
|
+
echo '<h1>Title</h1><p>Content</p>' | html-to-markdown
|
|
703
|
+
|
|
704
|
+
# Read HTML file with specific encoding
|
|
705
|
+
html-to-markdown document.html --source-encoding latin-1
|
|
706
|
+
|
|
707
|
+
# Pipe bytes with encoding specification
|
|
708
|
+
cat document.html | html-to-markdown --source-encoding utf-8
|
|
709
|
+
```
|
|
710
|
+
|
|
711
|
+
### Advanced CLI Options
|
|
712
|
+
|
|
713
|
+
```bash
|
|
714
|
+
# Discord/Slack compatible lists (2-space indent)
|
|
715
|
+
html-to-markdown file.html --list-indent-width 2
|
|
716
|
+
|
|
717
|
+
# Clean messy HTML before conversion
|
|
718
|
+
html-to-markdown file.html --preprocess-html --preprocessing-preset aggressive
|
|
719
|
+
|
|
720
|
+
# Custom heading style
|
|
721
|
+
html-to-markdown file.html --heading-style atx
|
|
722
|
+
|
|
723
|
+
# Strip specific tags
|
|
724
|
+
html-to-markdown file.html --strip nav aside footer
|
|
725
|
+
|
|
726
|
+
# Convert only specific tags
|
|
727
|
+
html-to-markdown file.html --convert h1 h2 p a strong em
|
|
728
|
+
|
|
729
|
+
# Enable streaming for large files with progress
|
|
730
|
+
html-to-markdown large.html --stream-processing --show-progress
|
|
731
|
+
|
|
732
|
+
# Use specific parser (lxml recommended for best performance)
|
|
733
|
+
html-to-markdown file.html --parser lxml
|
|
734
|
+
```
|
|
735
|
+
|
|
736
|
+
### Real-World CLI Examples
|
|
737
|
+
|
|
738
|
+
```bash
|
|
739
|
+
# Download and convert a webpage
|
|
740
|
+
curl -s https://example.com | html-to-markdown --preprocess-html > output.md
|
|
741
|
+
|
|
742
|
+
# Process multiple files with different encodings
|
|
743
|
+
for file in *.html; do
|
|
744
|
+
html-to-markdown "$file" --source-encoding latin-1 > "${file%.html}.md"
|
|
745
|
+
done
|
|
746
|
+
|
|
747
|
+
# Convert with custom formatting for documentation
|
|
748
|
+
html-to-markdown docs.html \
|
|
749
|
+
--heading-style atx \
|
|
750
|
+
--list-indent-width 2 \
|
|
751
|
+
--highlight-style bold \
|
|
752
|
+
--no-extract-metadata > docs.md
|
|
753
|
+
```
|
|
754
|
+
|
|
755
|
+
## Differences from markdownify
|
|
756
|
+
|
|
757
|
+
html-to-markdown is a modern, completely rewritten library inspired by markdownify but with significant improvements:
|
|
758
|
+
|
|
759
|
+
### Key Advantages
|
|
760
|
+
|
|
761
|
+
| Feature | markdownify | html-to-markdown |
|
|
762
|
+
| ----------------------- | ---------------- | ---------------------------------------------------------------------- |
|
|
763
|
+
| **Type Safety** | No type hints | Full MyPy compliance with strict typing |
|
|
764
|
+
| **Python Support** | Python 3.6+ | Python 3.10+ with modern features |
|
|
765
|
+
| **HTML5 Elements** | Basic support | Comprehensive HTML5 support (semantic, form, table, interactive, etc.) |
|
|
766
|
+
| **Table Handling** | Simple tables | Advanced rowspan/colspan support |
|
|
767
|
+
| **Streaming** | Memory-intensive | Memory-efficient streaming for large documents |
|
|
768
|
+
| **CLI Tool** | Basic | Full-featured CLI with all API options |
|
|
769
|
+
| **Preprocessing** | None | Built-in HTML cleaning with configurable presets |
|
|
770
|
+
| **Metadata Extraction** | None | Automatic title/meta extraction as comments |
|
|
771
|
+
| **Task Lists** | None | GitHub-compatible checkbox conversion |
|
|
772
|
+
| **Bytes Input** | None | Direct bytes support with configurable encoding |
|
|
773
|
+
| **Custom Converters** | Class-based | Function-based with simpler API |
|
|
774
|
+
| **Testing** | Basic | Comprehensive test suite with 100% coverage |
|
|
775
|
+
| **Performance** | Standard | Significantly faster with recommended lxml parser |
|
|
776
|
+
|
|
777
|
+
### API Compatibility
|
|
778
|
+
|
|
779
|
+
While inspired by markdownify, html-to-markdown uses a more modern, explicit API:
|
|
780
|
+
|
|
781
|
+
```python
|
|
782
|
+
# markdownify style
|
|
783
|
+
from markdownify import markdownify
|
|
784
|
+
|
|
785
|
+
result = markdownify(html, heading_style="atx", strip=["nav"])
|
|
786
|
+
|
|
787
|
+
# html-to-markdown style (more explicit)
|
|
788
|
+
from html_to_markdown import convert_to_markdown
|
|
789
|
+
|
|
790
|
+
result = convert_to_markdown(html, heading_style="atx", strip=["nav"])
|
|
791
|
+
```
|
|
792
|
+
|
|
793
|
+
### Migration from markdownify
|
|
794
|
+
|
|
795
|
+
Most markdownify code can be easily migrated:
|
|
796
|
+
|
|
797
|
+
```python
|
|
798
|
+
# Before (markdownify)
|
|
799
|
+
from markdownify import markdownify as md
|
|
800
|
+
|
|
801
|
+
result = md(html, heading_style="atx")
|
|
802
|
+
|
|
803
|
+
# After (html-to-markdown)
|
|
804
|
+
from html_to_markdown import convert_to_markdown
|
|
805
|
+
|
|
806
|
+
result = convert_to_markdown(html, heading_style="atx")
|
|
807
|
+
```
|
|
808
|
+
|
|
809
|
+
Key changes when migrating:
|
|
810
|
+
|
|
811
|
+
- Import path: `markdownify` → `html_to_markdown`
|
|
812
|
+
- Function name: `markdownify()` → `convert_to_markdown()`
|
|
813
|
+
- All parameter names remain the same for common options
|
|
814
|
+
- New parameters available for advanced features (preprocessing, streaming, etc.)
|
|
815
|
+
|
|
646
816
|
## Acknowledgments
|
|
647
817
|
|
|
648
|
-
Special thanks to the original [markdownify](https://pypi.org/project/markdownify/) project creators and contributors.
|
|
818
|
+
Special thanks to the original [markdownify](https://pypi.org/project/markdownify/) project creators and contributors for the inspiration and foundation that made this modern implementation possible.
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import sys
|
|
2
|
-
from argparse import ArgumentParser
|
|
2
|
+
from argparse import ArgumentParser
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
|
|
5
5
|
from html_to_markdown.constants import (
|
|
@@ -27,8 +27,7 @@ def main(argv: list[str]) -> str:
|
|
|
27
27
|
parser.add_argument(
|
|
28
28
|
"html",
|
|
29
29
|
nargs="?",
|
|
30
|
-
|
|
31
|
-
default=sys.stdin,
|
|
30
|
+
default="-",
|
|
32
31
|
help="The HTML file to convert. Defaults to STDIN if not provided.",
|
|
33
32
|
)
|
|
34
33
|
|
|
@@ -247,7 +246,7 @@ def main(argv: list[str]) -> str:
|
|
|
247
246
|
"--source-encoding",
|
|
248
247
|
type=str,
|
|
249
248
|
default=None,
|
|
250
|
-
help="
|
|
249
|
+
help="Encoding for reading input files and decoding bytes (e.g. 'utf-8', 'latin-1'). Default: utf-8.",
|
|
251
250
|
)
|
|
252
251
|
|
|
253
252
|
args = parser.parse_args(argv)
|
|
@@ -260,6 +259,7 @@ def main(argv: list[str]) -> str:
|
|
|
260
259
|
"convert": args.convert,
|
|
261
260
|
"convert_as_inline": args.convert_as_inline,
|
|
262
261
|
"default_title": args.default_title,
|
|
262
|
+
"source_encoding": args.source_encoding,
|
|
263
263
|
"escape_asterisks": args.escape_asterisks,
|
|
264
264
|
"escape_misc": args.escape_misc,
|
|
265
265
|
"escape_underscores": args.escape_underscores,
|
|
@@ -302,14 +302,20 @@ def main(argv: list[str]) -> str:
|
|
|
302
302
|
|
|
303
303
|
base_args["progress_callback"] = progress_callback
|
|
304
304
|
|
|
305
|
-
if args.
|
|
306
|
-
|
|
307
|
-
try:
|
|
308
|
-
with Path(args.html.name).open(encoding=args.source_encoding) as f:
|
|
309
|
-
html_content = f.read()
|
|
310
|
-
except LookupError as e:
|
|
311
|
-
raise InvalidEncodingError(args.source_encoding) from e
|
|
305
|
+
if args.html == "-":
|
|
306
|
+
html_content = sys.stdin.buffer.read()
|
|
312
307
|
else:
|
|
313
|
-
|
|
308
|
+
try:
|
|
309
|
+
file_path = Path(args.html)
|
|
310
|
+
if args.source_encoding:
|
|
311
|
+
with file_path.open(encoding=args.source_encoding, errors="replace") as f:
|
|
312
|
+
html_content = f.read()
|
|
313
|
+
else:
|
|
314
|
+
with file_path.open("rb") as f:
|
|
315
|
+
html_content = f.read()
|
|
316
|
+
except (OSError, LookupError) as e:
|
|
317
|
+
if isinstance(e, LookupError):
|
|
318
|
+
raise InvalidEncodingError(args.source_encoding) from e
|
|
319
|
+
raise
|
|
314
320
|
|
|
315
321
|
return convert_to_markdown(html_content, **base_args)
|
|
@@ -414,8 +414,6 @@ def _convert_li(*, tag: Tag, text: str, bullets: str, list_indent_str: str) -> s
|
|
|
414
414
|
|
|
415
415
|
return "".join(result_parts)
|
|
416
416
|
|
|
417
|
-
# Ensure consistent whitespace handling for list items, especially with strip_newlines=True
|
|
418
|
-
# Strip any leading whitespace that may have been inherited from parent containers
|
|
419
417
|
clean_text = (text or "").strip()
|
|
420
418
|
return f"{bullet} {clean_text}\n"
|
|
421
419
|
|
|
@@ -445,13 +445,14 @@ def _format_metadata_comment(metadata: dict[str, str]) -> str:
|
|
|
445
445
|
|
|
446
446
|
|
|
447
447
|
def convert_to_markdown(
|
|
448
|
-
source: str | BeautifulSoup,
|
|
448
|
+
source: str | bytes | BeautifulSoup,
|
|
449
449
|
*,
|
|
450
450
|
stream_processing: bool = False,
|
|
451
451
|
chunk_size: int = 1024,
|
|
452
452
|
chunk_callback: Callable[[str], None] | None = None,
|
|
453
453
|
progress_callback: Callable[[int, int], None] | None = None,
|
|
454
454
|
parser: str | None = None,
|
|
455
|
+
source_encoding: str = "utf-8",
|
|
455
456
|
autolinks: bool = True,
|
|
456
457
|
br_in_tables: bool = False,
|
|
457
458
|
bullets: str = "*+-",
|
|
@@ -489,12 +490,13 @@ def convert_to_markdown(
|
|
|
489
490
|
various customization options for controlling the conversion behavior.
|
|
490
491
|
|
|
491
492
|
Args:
|
|
492
|
-
source: HTML string or BeautifulSoup object to convert.
|
|
493
|
+
source: HTML string, bytes, or BeautifulSoup object to convert.
|
|
493
494
|
stream_processing: Enable streaming mode for large documents.
|
|
494
495
|
chunk_size: Size of chunks for streaming processing.
|
|
495
496
|
chunk_callback: Callback for processing chunks in streaming mode.
|
|
496
497
|
progress_callback: Callback for progress updates (current, total).
|
|
497
498
|
parser: HTML parser to use ('html.parser', 'lxml', 'html5lib').
|
|
499
|
+
source_encoding: Character encoding to use when decoding bytes (default: 'utf-8').
|
|
498
500
|
autolinks: Convert URLs to automatic links.
|
|
499
501
|
br_in_tables: Use <br> tags for line breaks in table cells instead of spaces.
|
|
500
502
|
bullets: Characters to use for unordered list bullets.
|
|
@@ -548,11 +550,12 @@ def convert_to_markdown(
|
|
|
548
550
|
>>> convert_to_markdown(html, list_indent_width=2)
|
|
549
551
|
'* Item 1\\n* Item 2\\n\\n'
|
|
550
552
|
"""
|
|
551
|
-
# Initialize original input string for Windows lxml fix
|
|
552
553
|
original_input_str = None
|
|
553
554
|
|
|
555
|
+
if isinstance(source, bytes):
|
|
556
|
+
source = source.decode(source_encoding or "utf-8", errors="replace")
|
|
557
|
+
|
|
554
558
|
if isinstance(source, str):
|
|
555
|
-
# Store original string for plain text detection (Windows lxml fix)
|
|
556
559
|
original_input_str = source
|
|
557
560
|
|
|
558
561
|
if (
|
|
@@ -613,6 +616,37 @@ def convert_to_markdown(
|
|
|
613
616
|
new_text = NavigableString(leading_ws + str(first_child))
|
|
614
617
|
first_child.replace_with(new_text)
|
|
615
618
|
needs_leading_space_fix = False
|
|
619
|
+
|
|
620
|
+
# Fix html5lib whitespace handling to match other parsers
|
|
621
|
+
if parser == "html5lib":
|
|
622
|
+
body = source.find("body")
|
|
623
|
+
if body and isinstance(body, Tag):
|
|
624
|
+
children = list(body.children)
|
|
625
|
+
|
|
626
|
+
if (
|
|
627
|
+
len(children) == 1
|
|
628
|
+
and isinstance(children[0], NavigableString)
|
|
629
|
+
and original_source.startswith((" ", "\t", "\n", "\r"))
|
|
630
|
+
and not str(children[0]).startswith((" ", "\t", "\n", "\r"))
|
|
631
|
+
):
|
|
632
|
+
first_child = children[0]
|
|
633
|
+
original_text = str(first_child)
|
|
634
|
+
|
|
635
|
+
# Preserve leading whitespace from original if html5lib stripped it
|
|
636
|
+
leading_ws = ""
|
|
637
|
+
for char in original_source:
|
|
638
|
+
if char in " \t\n\r":
|
|
639
|
+
leading_ws += char
|
|
640
|
+
else:
|
|
641
|
+
break
|
|
642
|
+
|
|
643
|
+
# Create normalized text: restore leading whitespace only
|
|
644
|
+
normalized_text = original_text
|
|
645
|
+
if leading_ws and not normalized_text.startswith(leading_ws):
|
|
646
|
+
normalized_text = leading_ws + normalized_text
|
|
647
|
+
|
|
648
|
+
new_text = NavigableString(normalized_text)
|
|
649
|
+
first_child.replace_with(new_text)
|
|
616
650
|
else:
|
|
617
651
|
raise EmptyHtmlError
|
|
618
652
|
|
|
@@ -626,6 +660,7 @@ def convert_to_markdown(
|
|
|
626
660
|
chunk_size=chunk_size,
|
|
627
661
|
progress_callback=progress_callback,
|
|
628
662
|
parser=parser,
|
|
663
|
+
source_encoding=source_encoding,
|
|
629
664
|
autolinks=autolinks,
|
|
630
665
|
bullets=bullets,
|
|
631
666
|
code_language=code_language,
|
|
@@ -673,6 +708,7 @@ def convert_to_markdown(
|
|
|
673
708
|
sink,
|
|
674
709
|
whitespace_handler=whitespace_handler,
|
|
675
710
|
parser=parser,
|
|
711
|
+
source_encoding=source_encoding,
|
|
676
712
|
autolinks=autolinks,
|
|
677
713
|
br_in_tables=br_in_tables,
|
|
678
714
|
bullets=bullets,
|
|
@@ -703,8 +739,6 @@ def convert_to_markdown(
|
|
|
703
739
|
|
|
704
740
|
result = sink.get_result()
|
|
705
741
|
|
|
706
|
-
# Parser-agnostic behavior: handle leading whitespace differences between parsers
|
|
707
|
-
# lxml may either add unwanted whitespace or strip meaningful whitespace compared to html.parser
|
|
708
742
|
if "needs_leading_whitespace_fix" in locals() and needs_leading_whitespace_fix:
|
|
709
743
|
original_input = sink.original_source if hasattr(sink, "original_source") else original_source
|
|
710
744
|
if isinstance(original_input, str):
|
|
@@ -713,19 +747,14 @@ def convert_to_markdown(
|
|
|
713
747
|
original_leading_whitespace_match.group(0) if original_leading_whitespace_match else ""
|
|
714
748
|
)
|
|
715
749
|
|
|
716
|
-
# Case 1: lxml added leading newlines (like "\n<figure>") - strip them
|
|
717
750
|
if result.startswith("\n") and not original_input.lstrip().startswith(result.strip()):
|
|
718
751
|
result = result.lstrip("\n\r")
|
|
719
752
|
|
|
720
|
-
# Case 2: lxml stripped meaningful leading whitespace (like " <b>") - restore it
|
|
721
|
-
# However, don't restore whitespace if strip_newlines=True was used, as the user
|
|
722
|
-
# explicitly requested to remove formatting whitespace
|
|
723
753
|
elif (
|
|
724
754
|
not strip_newlines
|
|
725
755
|
and not result.startswith((" ", "\t"))
|
|
726
756
|
and original_leading_whitespace.startswith((" ", "\t"))
|
|
727
757
|
):
|
|
728
|
-
# Only restore spaces/tabs, not newlines (which are usually formatting)
|
|
729
758
|
leading_spaces_tabs_match = re.match(r"^[ \t]*", original_leading_whitespace)
|
|
730
759
|
leading_spaces_tabs = leading_spaces_tabs_match.group(0) if leading_spaces_tabs_match else ""
|
|
731
760
|
if leading_spaces_tabs:
|
|
@@ -758,9 +787,6 @@ def convert_to_markdown(
|
|
|
758
787
|
if convert_as_inline:
|
|
759
788
|
result = result.rstrip("\n")
|
|
760
789
|
|
|
761
|
-
# Windows-specific fix: For plain text input (no HTML tags), lxml may add extra trailing newlines
|
|
762
|
-
# This ensures consistent behavior across platforms when processing plain text
|
|
763
|
-
# Only apply to cases where lxml adds extra newlines (\n\n) at the end
|
|
764
790
|
if (
|
|
765
791
|
"original_input_str" in locals()
|
|
766
792
|
and original_input_str
|
|
@@ -768,19 +794,11 @@ def convert_to_markdown(
|
|
|
768
794
|
and not original_input_str.strip().endswith(">")
|
|
769
795
|
and result.endswith("\n\n")
|
|
770
796
|
):
|
|
771
|
-
# Input appears to be plain text, not HTML - normalize trailing newlines only
|
|
772
797
|
result = result.rstrip("\n")
|
|
773
798
|
|
|
774
|
-
# If the original input contained no block-level elements, normalize any
|
|
775
|
-
# accidental trailing newlines for cross-platform consistency.
|
|
776
|
-
# This guards cases like inline-only inputs (e.g., "text <strong>bold</strong>")
|
|
777
|
-
# and head-only documents (e.g., "<head>head</head>") where output should
|
|
778
|
-
# not end with extra blank lines.
|
|
779
799
|
if "original_input_str" in locals() and original_input_str:
|
|
780
800
|
from html_to_markdown.whitespace import BLOCK_ELEMENTS # noqa: PLC0415
|
|
781
801
|
|
|
782
|
-
# Treat additional tags as block-producing for trailing newline purposes.
|
|
783
|
-
# These may be inline in HTML spec but produce block output in our Markdown conversion.
|
|
784
802
|
blockish = set(BLOCK_ELEMENTS) | {
|
|
785
803
|
"textarea",
|
|
786
804
|
"dialog",
|
|
@@ -880,11 +898,12 @@ class StreamingSink(OutputSink):
|
|
|
880
898
|
|
|
881
899
|
|
|
882
900
|
def _process_html_core(
|
|
883
|
-
source: str | BeautifulSoup,
|
|
901
|
+
source: str | bytes | BeautifulSoup,
|
|
884
902
|
sink: OutputSink,
|
|
885
903
|
*,
|
|
886
904
|
whitespace_handler: WhitespaceHandler,
|
|
887
905
|
parser: str | None = None,
|
|
906
|
+
source_encoding: str = "utf-8",
|
|
888
907
|
autolinks: bool,
|
|
889
908
|
br_in_tables: bool,
|
|
890
909
|
bullets: str,
|
|
@@ -915,7 +934,12 @@ def _process_html_core(
|
|
|
915
934
|
token = _ancestor_cache.set({})
|
|
916
935
|
|
|
917
936
|
try:
|
|
918
|
-
if isinstance(source, str):
|
|
937
|
+
if isinstance(source, (str, bytes)):
|
|
938
|
+
original_source = source
|
|
939
|
+
if isinstance(source, bytes):
|
|
940
|
+
source = source.decode(source_encoding or "utf-8", errors="replace")
|
|
941
|
+
original_source = source
|
|
942
|
+
|
|
919
943
|
if strip_newlines:
|
|
920
944
|
source = source.replace("\n", " ").replace("\r", " ") # pragma: no cover
|
|
921
945
|
|
|
@@ -926,7 +950,36 @@ def _process_html_core(
|
|
|
926
950
|
if parser == "lxml" and not LXML_AVAILABLE: # pragma: no cover
|
|
927
951
|
raise MissingDependencyError("lxml", "pip install html-to-markdown[lxml]")
|
|
928
952
|
|
|
953
|
+
needs_leading_whitespace_fix = (
|
|
954
|
+
parser == "lxml"
|
|
955
|
+
and isinstance(original_source, str)
|
|
956
|
+
and original_source.startswith((" ", "\t", "\n", "\r"))
|
|
957
|
+
)
|
|
958
|
+
|
|
929
959
|
source = BeautifulSoup(source, parser)
|
|
960
|
+
|
|
961
|
+
if parser == "lxml" and needs_leading_whitespace_fix and isinstance(original_source, str):
|
|
962
|
+
body = source.find("body")
|
|
963
|
+
if body and isinstance(body, Tag):
|
|
964
|
+
children = list(body.children)
|
|
965
|
+
|
|
966
|
+
if (
|
|
967
|
+
len(children) == 1
|
|
968
|
+
and isinstance(children[0], NavigableString)
|
|
969
|
+
and original_source.startswith((" ", "\t", "\n", "\r"))
|
|
970
|
+
and not str(children[0]).startswith((" ", "\t", "\n", "\r"))
|
|
971
|
+
):
|
|
972
|
+
first_child = children[0]
|
|
973
|
+
|
|
974
|
+
leading_ws = ""
|
|
975
|
+
for char in original_source:
|
|
976
|
+
if char in " \t":
|
|
977
|
+
leading_ws += char
|
|
978
|
+
else:
|
|
979
|
+
break
|
|
980
|
+
|
|
981
|
+
new_text = NavigableString(leading_ws + str(first_child))
|
|
982
|
+
first_child.replace_with(new_text)
|
|
930
983
|
else:
|
|
931
984
|
raise EmptyHtmlError
|
|
932
985
|
|
|
@@ -998,11 +1051,12 @@ def _process_html_core(
|
|
|
998
1051
|
|
|
999
1052
|
|
|
1000
1053
|
def convert_to_markdown_stream(
|
|
1001
|
-
source: str | BeautifulSoup,
|
|
1054
|
+
source: str | bytes | BeautifulSoup,
|
|
1002
1055
|
*,
|
|
1003
1056
|
chunk_size: int = 1024,
|
|
1004
1057
|
progress_callback: Callable[[int, int], None] | None = None,
|
|
1005
1058
|
parser: str | None = None,
|
|
1059
|
+
source_encoding: str = "utf-8",
|
|
1006
1060
|
autolinks: bool = True,
|
|
1007
1061
|
br_in_tables: bool = False,
|
|
1008
1062
|
bullets: str = "*+-",
|
|
@@ -1022,6 +1076,10 @@ def convert_to_markdown_stream(
|
|
|
1022
1076
|
list_indent_type: Literal["spaces", "tabs"] = "spaces",
|
|
1023
1077
|
list_indent_width: int = 4,
|
|
1024
1078
|
newline_style: Literal["spaces", "backslash"] = SPACES,
|
|
1079
|
+
preprocess_html: bool = False,
|
|
1080
|
+
preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
|
|
1081
|
+
remove_forms: bool = True,
|
|
1082
|
+
remove_navigation: bool = True,
|
|
1025
1083
|
strip: str | Iterable[str] | None = None,
|
|
1026
1084
|
strip_newlines: bool = False,
|
|
1027
1085
|
strong_em_symbol: Literal["*", "_"] = ASTERISK,
|
|
@@ -1033,8 +1091,22 @@ def convert_to_markdown_stream(
|
|
|
1033
1091
|
) -> Generator[str, None, None]:
|
|
1034
1092
|
sink = StreamingSink(chunk_size, progress_callback)
|
|
1035
1093
|
|
|
1036
|
-
if isinstance(source, str):
|
|
1037
|
-
|
|
1094
|
+
if isinstance(source, bytes):
|
|
1095
|
+
source = source.decode(source_encoding or "utf-8", errors="replace")
|
|
1096
|
+
|
|
1097
|
+
if isinstance(source, str) and preprocess_html and create_preprocessor is not None and preprocess_fn is not None:
|
|
1098
|
+
config = create_preprocessor(
|
|
1099
|
+
preset=preprocessing_preset,
|
|
1100
|
+
remove_navigation=remove_navigation,
|
|
1101
|
+
remove_forms=remove_forms,
|
|
1102
|
+
)
|
|
1103
|
+
source = preprocess_fn(source, **config)
|
|
1104
|
+
|
|
1105
|
+
if isinstance(source, (str, bytes)):
|
|
1106
|
+
if isinstance(source, bytes):
|
|
1107
|
+
sink.total_bytes = len(source)
|
|
1108
|
+
else:
|
|
1109
|
+
sink.total_bytes = len(source)
|
|
1038
1110
|
elif isinstance(source, BeautifulSoup):
|
|
1039
1111
|
sink.total_bytes = len(str(source))
|
|
1040
1112
|
|
|
@@ -1045,6 +1117,7 @@ def convert_to_markdown_stream(
|
|
|
1045
1117
|
sink,
|
|
1046
1118
|
whitespace_handler=whitespace_handler,
|
|
1047
1119
|
parser=parser,
|
|
1120
|
+
source_encoding=source_encoding,
|
|
1048
1121
|
autolinks=autolinks,
|
|
1049
1122
|
br_in_tables=br_in_tables,
|
|
1050
1123
|
bullets=bullets,
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 1.13.0
|
|
3
|
+
Version: 1.14.0
|
|
4
4
|
Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
|
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -32,6 +32,8 @@ Description-Content-Type: text/markdown
|
|
|
32
32
|
License-File: LICENSE
|
|
33
33
|
Requires-Dist: beautifulsoup4>=4.13.5
|
|
34
34
|
Requires-Dist: nh3>=0.3
|
|
35
|
+
Provides-Extra: html5lib
|
|
36
|
+
Requires-Dist: beautifulsoup4[html5lib]>=4.13.5; extra == "html5lib"
|
|
35
37
|
Provides-Extra: lxml
|
|
36
38
|
Requires-Dist: beautifulsoup4[lxml]>=4.13.5; extra == "lxml"
|
|
37
39
|
Dynamic: license-file
|
|
@@ -40,7 +42,7 @@ Dynamic: license-file
|
|
|
40
42
|
|
|
41
43
|
A modern, fully typed Python library for converting HTML to Markdown. This library is a completely rewritten fork
|
|
42
44
|
of [markdownify](https://pypi.org/project/markdownify/) with a modernized codebase, strict type safety and support for
|
|
43
|
-
Python 3.
|
|
45
|
+
Python 3.10+.
|
|
44
46
|
|
|
45
47
|
## Support This Project
|
|
46
48
|
|
|
@@ -64,8 +66,10 @@ Your support helps maintain and improve this library for the community.
|
|
|
64
66
|
- **Custom Converters**: Extensible converter system for custom HTML tag handling
|
|
65
67
|
- **List Formatting**: Configurable list indentation with Discord/Slack compatibility
|
|
66
68
|
- **HTML Preprocessing**: Clean messy HTML with configurable aggressiveness levels
|
|
69
|
+
- **Bytes Input Support**: Direct handling of bytes input, decoded with a configurable source encoding (UTF-8 by default; undecodable bytes are replaced rather than raising)
|
|
67
70
|
- **Whitespace Control**: Normalized or strict whitespace preservation modes
|
|
68
71
|
- **BeautifulSoup Integration**: Support for pre-configured BeautifulSoup instances
|
|
72
|
+
- **Parser Normalization**: Consistent output across all supported parsers (html.parser, lxml, html5lib)
|
|
69
73
|
- **Robustly Tested**: Comprehensive unit tests and integration tests covering all conversion scenarios
|
|
70
74
|
|
|
71
75
|
## Installation
|
|
@@ -74,17 +78,27 @@ Your support helps maintain and improve this library for the community.
|
|
|
74
78
|
pip install html-to-markdown
|
|
75
79
|
```
|
|
76
80
|
|
|
77
|
-
### Optional
|
|
81
|
+
### Optional Parsers
|
|
78
82
|
|
|
79
|
-
For improved performance, you can install with
|
|
83
|
+
For improved performance and compatibility, you can install with optional parsers:
|
|
80
84
|
|
|
81
85
|
```shell
|
|
86
|
+
# Fast lxml parser (recommended)
|
|
82
87
|
pip install html-to-markdown[lxml]
|
|
88
|
+
|
|
89
|
+
# Standards-compliant html5lib parser
|
|
90
|
+
pip install html-to-markdown[html5lib]
|
|
83
91
|
```
|
|
84
92
|
|
|
85
|
-
|
|
93
|
+
**Parser Options:**
|
|
94
|
+
|
|
95
|
+
- **html.parser** (default): Built-in Python parser, no dependencies
|
|
96
|
+
- **lxml**: **Recommended** - Fastest parser with good malformed HTML handling
|
|
97
|
+
- **html5lib**: Most standards-compliant, handles edge cases best
|
|
86
98
|
|
|
87
|
-
The library automatically uses lxml when available
|
|
99
|
+
The library automatically uses lxml when available and **normalizes output to ensure consistent results regardless of parser choice**. We recommend using the **lxml parser for optimal performance** - it's significantly faster than the other options while maintaining excellent compatibility.
|
|
100
|
+
|
|
101
|
+
You can explicitly specify a parser using the `parser` parameter.
|
|
88
102
|
|
|
89
103
|
## Quick Start
|
|
90
104
|
|
|
@@ -149,6 +163,39 @@ soup = BeautifulSoup(html, "lxml") # Note: lxml requires additional installatio
|
|
|
149
163
|
markdown = convert_to_markdown(soup)
|
|
150
164
|
```
|
|
151
165
|
|
|
166
|
+
### Working with Bytes and Encodings
|
|
167
|
+
|
|
168
|
+
The library can directly handle bytes input, which is useful when working with HTTP responses or files:
|
|
169
|
+
|
|
170
|
+
```python
|
|
171
|
+
import requests
|
|
172
|
+
from html_to_markdown import convert_to_markdown
|
|
173
|
+
|
|
174
|
+
# Working with HTTP responses (bytes)
|
|
175
|
+
response = requests.get("https://example.com")
|
|
176
|
+
markdown = convert_to_markdown(response.content) # response.content returns bytes
|
|
177
|
+
|
|
178
|
+
# Specify encoding for non-UTF-8 content
|
|
179
|
+
response = requests.get("https://example.fr")
|
|
180
|
+
markdown = convert_to_markdown(response.content, source_encoding="latin-1")
|
|
181
|
+
|
|
182
|
+
# Common encoding examples
|
|
183
|
+
html_bytes = b"<p>Hello World</p>"
|
|
184
|
+
markdown = convert_to_markdown(html_bytes) # UTF-8 by default
|
|
185
|
+
|
|
186
|
+
# Latin-1 encoded content
|
|
187
|
+
html_latin1 = "<p>Café résumé</p>".encode("latin-1")
|
|
188
|
+
markdown = convert_to_markdown(html_latin1, source_encoding="latin-1")
|
|
189
|
+
|
|
190
|
+
# Windows-1252 encoded content
|
|
191
|
+
html_windows = '<p>Smart quotes: "Hello"</p>'.encode("windows-1252")
|
|
192
|
+
markdown = convert_to_markdown(html_windows, source_encoding="windows-1252")
|
|
193
|
+
|
|
194
|
+
# Piping bytes from command line
|
|
195
|
+
# echo '<p>Hello</p>' | python -m html_to_markdown
|
|
196
|
+
# cat file.html | python -m html_to_markdown --source-encoding latin-1
|
|
197
|
+
```
|
|
198
|
+
|
|
152
199
|
## Common Use Cases
|
|
153
200
|
|
|
154
201
|
### Discord/Slack Compatible Lists
|
|
@@ -681,6 +728,131 @@ This library provides comprehensive support for all modern HTML5 elements:
|
|
|
681
728
|
|
|
682
729
|
- `<math>` (MathML support)
|
|
683
730
|
|
|
731
|
+
## Command Line Interface
|
|
732
|
+
|
|
733
|
+
The library includes a full-featured CLI tool with complete API parity:
|
|
734
|
+
|
|
735
|
+
### Basic Usage
|
|
736
|
+
|
|
737
|
+
```bash
|
|
738
|
+
# Convert HTML file to Markdown
|
|
739
|
+
html-to-markdown document.html
|
|
740
|
+
|
|
741
|
+
# Convert from stdin
|
|
742
|
+
echo '<h1>Title</h1><p>Content</p>' | html-to-markdown
|
|
743
|
+
|
|
744
|
+
# Read HTML file with specific encoding
|
|
745
|
+
html-to-markdown document.html --source-encoding latin-1
|
|
746
|
+
|
|
747
|
+
# Pipe bytes with encoding specification
|
|
748
|
+
cat document.html | html-to-markdown --source-encoding utf-8
|
|
749
|
+
```
|
|
750
|
+
|
|
751
|
+
### Advanced CLI Options
|
|
752
|
+
|
|
753
|
+
```bash
|
|
754
|
+
# Discord/Slack compatible lists (2-space indent)
|
|
755
|
+
html-to-markdown file.html --list-indent-width 2
|
|
756
|
+
|
|
757
|
+
# Clean messy HTML before conversion
|
|
758
|
+
html-to-markdown file.html --preprocess-html --preprocessing-preset aggressive
|
|
759
|
+
|
|
760
|
+
# Custom heading style
|
|
761
|
+
html-to-markdown file.html --heading-style atx
|
|
762
|
+
|
|
763
|
+
# Strip specific tags
|
|
764
|
+
html-to-markdown file.html --strip nav aside footer
|
|
765
|
+
|
|
766
|
+
# Convert only specific tags
|
|
767
|
+
html-to-markdown file.html --convert h1 h2 p a strong em
|
|
768
|
+
|
|
769
|
+
# Enable streaming for large files with progress
|
|
770
|
+
html-to-markdown large.html --stream-processing --show-progress
|
|
771
|
+
|
|
772
|
+
# Use specific parser (lxml recommended for best performance)
|
|
773
|
+
html-to-markdown file.html --parser lxml
|
|
774
|
+
```
|
|
775
|
+
|
|
776
|
+
### Real-World CLI Examples
|
|
777
|
+
|
|
778
|
+
```bash
|
|
779
|
+
# Download and convert a webpage
|
|
780
|
+
curl -s https://example.com | html-to-markdown --preprocess-html > output.md
|
|
781
|
+
|
|
782
|
+
# Process multiple files with different encodings
|
|
783
|
+
for file in *.html; do
|
|
784
|
+
html-to-markdown "$file" --source-encoding latin-1 > "${file%.html}.md"
|
|
785
|
+
done
|
|
786
|
+
|
|
787
|
+
# Convert with custom formatting for documentation
|
|
788
|
+
html-to-markdown docs.html \
|
|
789
|
+
--heading-style atx \
|
|
790
|
+
--list-indent-width 2 \
|
|
791
|
+
--highlight-style bold \
|
|
792
|
+
--no-extract-metadata > docs.md
|
|
793
|
+
```
|
|
794
|
+
|
|
795
|
+
## Differences from markdownify
|
|
796
|
+
|
|
797
|
+
html-to-markdown is a modern, completely rewritten library inspired by markdownify but with significant improvements:
|
|
798
|
+
|
|
799
|
+
### Key Advantages
|
|
800
|
+
|
|
801
|
+
| Feature | markdownify | html-to-markdown |
|
|
802
|
+
| ----------------------- | ---------------- | ---------------------------------------------------------------------- |
|
|
803
|
+
| **Type Safety** | No type hints | Full MyPy compliance with strict typing |
|
|
804
|
+
| **Python Support** | Python 3.6+ | Python 3.10+ with modern features |
|
|
805
|
+
| **HTML5 Elements** | Basic support | Comprehensive HTML5 support (semantic, form, table, interactive, etc.) |
|
|
806
|
+
| **Table Handling** | Simple tables | Advanced rowspan/colspan support |
|
|
807
|
+
| **Streaming** | Memory-intensive | Memory-efficient streaming for large documents |
|
|
808
|
+
| **CLI Tool** | Basic | Full-featured CLI with all API options |
|
|
809
|
+
| **Preprocessing** | None | Built-in HTML cleaning with configurable presets |
|
|
810
|
+
| **Metadata Extraction** | None | Automatic title/meta extraction as comments |
|
|
811
|
+
| **Task Lists** | None | GitHub-compatible checkbox conversion |
|
|
812
|
+
| **Bytes Input** | None | Direct bytes support with configurable encoding |
|
|
813
|
+
| **Custom Converters** | Class-based | Function-based with simpler API |
|
|
814
|
+
| **Testing** | Basic | Comprehensive test suite with 100% coverage |
|
|
815
|
+
| **Performance** | Standard | Significantly faster with recommended lxml parser |
|
|
816
|
+
|
|
817
|
+
### API Compatibility
|
|
818
|
+
|
|
819
|
+
While inspired by markdownify, html-to-markdown uses a more modern, explicit API:
|
|
820
|
+
|
|
821
|
+
```python
|
|
822
|
+
# markdownify style
|
|
823
|
+
from markdownify import markdownify
|
|
824
|
+
|
|
825
|
+
result = markdownify(html, heading_style="atx", strip=["nav"])
|
|
826
|
+
|
|
827
|
+
# html-to-markdown style (more explicit)
|
|
828
|
+
from html_to_markdown import convert_to_markdown
|
|
829
|
+
|
|
830
|
+
result = convert_to_markdown(html, heading_style="atx", strip=["nav"])
|
|
831
|
+
```
|
|
832
|
+
|
|
833
|
+
### Migration from markdownify
|
|
834
|
+
|
|
835
|
+
Most markdownify code can be easily migrated:
|
|
836
|
+
|
|
837
|
+
```python
|
|
838
|
+
# Before (markdownify)
|
|
839
|
+
from markdownify import markdownify as md
|
|
840
|
+
|
|
841
|
+
result = md(html, heading_style="atx")
|
|
842
|
+
|
|
843
|
+
# After (html-to-markdown)
|
|
844
|
+
from html_to_markdown import convert_to_markdown
|
|
845
|
+
|
|
846
|
+
result = convert_to_markdown(html, heading_style="atx")
|
|
847
|
+
```
|
|
848
|
+
|
|
849
|
+
Key changes when migrating:
|
|
850
|
+
|
|
851
|
+
- Import path: `markdownify` → `html_to_markdown`
|
|
852
|
+
- Function name: `markdownify()` → `convert_to_markdown()`
|
|
853
|
+
- All parameter names remain the same for common options
|
|
854
|
+
- New parameters available for advanced features (preprocessing, streaming, etc.)
|
|
855
|
+
|
|
684
856
|
## Acknowledgments
|
|
685
857
|
|
|
686
|
-
Special thanks to the original [markdownify](https://pypi.org/project/markdownify/) project creators and contributors.
|
|
858
|
+
Special thanks to the original [markdownify](https://pypi.org/project/markdownify/) project creators and contributors for the inspiration and foundation that made this modern implementation possible.
|
|
@@ -5,7 +5,7 @@ requires = [ "setuptools>=78.1" ]
|
|
|
5
5
|
|
|
6
6
|
[project]
|
|
7
7
|
name = "html-to-markdown"
|
|
8
|
-
version = "1.13.0"
|
|
8
|
+
version = "1.14.0"
|
|
9
9
|
description = "A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options"
|
|
10
10
|
readme = "README.md"
|
|
11
11
|
keywords = [
|
|
@@ -46,8 +46,9 @@ dependencies = [
|
|
|
46
46
|
"beautifulsoup4>=4.13.5",
|
|
47
47
|
"nh3>=0.3",
|
|
48
48
|
]
|
|
49
|
-
optional-dependencies.lxml = [ "beautifulsoup4[lxml]>=4.13.5" ]
|
|
49
|
+
optional-dependencies.html5lib = [ "beautifulsoup4[html5lib]>=4.13.5" ]
|
|
50
50
|
|
|
51
|
+
optional-dependencies.lxml = [ "beautifulsoup4[lxml]>=4.13.5" ]
|
|
51
52
|
urls.Changelog = "https://github.com/Goldziher/html-to-markdown/releases"
|
|
52
53
|
urls.Homepage = "https://github.com/Goldziher/html-to-markdown"
|
|
53
54
|
urls.Issues = "https://github.com/Goldziher/html-to-markdown/issues"
|
|
@@ -57,14 +58,16 @@ scripts.html_to_markdown = "html_to_markdown.__main__:cli"
|
|
|
57
58
|
|
|
58
59
|
[dependency-groups]
|
|
59
60
|
dev = [
|
|
61
|
+
"beautifulsoup4[html5lib]>=4.13.5",
|
|
62
|
+
"beautifulsoup4[lxml]>=4.13.5",
|
|
60
63
|
"covdefaults>=2.3",
|
|
61
|
-
"mypy>=1.18.
|
|
64
|
+
"mypy>=1.18.2",
|
|
62
65
|
"pre-commit>=4.3",
|
|
63
66
|
"pytest>=8.4.2",
|
|
64
67
|
"pytest-benchmark>=5.1",
|
|
65
68
|
"pytest-cov>=7",
|
|
66
|
-
"pytest-mock>=3.15",
|
|
67
|
-
"ruff>=0.13",
|
|
69
|
+
"pytest-mock>=3.15.1",
|
|
70
|
+
"ruff>=0.13.1",
|
|
68
71
|
"types-beautifulsoup4>=4.12.0.20250516",
|
|
69
72
|
"types-psutil>=7.0.0.20250822",
|
|
70
73
|
"uv-bump",
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{html_to_markdown-1.13.0 → html_to_markdown-1.14.0}/html_to_markdown.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{html_to_markdown-1.13.0 → html_to_markdown-1.14.0}/html_to_markdown.egg-info/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|