html-to-markdown 1.13.0__tar.gz → 1.14.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.1}/PKG-INFO +179 -7
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.1}/README.md +176 -6
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.1}/html_to_markdown/cli.py +18 -12
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.1}/html_to_markdown/converters.py +0 -2
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.1}/html_to_markdown/processing.py +100 -29
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.1}/html_to_markdown.egg-info/PKG-INFO +179 -7
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.1}/html_to_markdown.egg-info/requires.txt +3 -0
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.1}/pyproject.toml +10 -5
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.1}/LICENSE +0 -0
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.1}/html_to_markdown/__init__.py +0 -0
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.1}/html_to_markdown/__main__.py +0 -0
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.1}/html_to_markdown/constants.py +0 -0
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.1}/html_to_markdown/exceptions.py +0 -0
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.1}/html_to_markdown/preprocessor.py +0 -0
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.1}/html_to_markdown/py.typed +0 -0
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.1}/html_to_markdown/utils.py +0 -0
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.1}/html_to_markdown/whitespace.py +0 -0
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.1}/html_to_markdown.egg-info/SOURCES.txt +0 -0
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.1}/html_to_markdown.egg-info/dependency_links.txt +0 -0
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.1}/html_to_markdown.egg-info/entry_points.txt +0 -0
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.1}/html_to_markdown.egg-info/top_level.txt +0 -0
- {html_to_markdown-1.13.0 → html_to_markdown-1.14.1}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 1.13.0
|
|
3
|
+
Version: 1.14.1
|
|
4
4
|
Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
|
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -32,6 +32,8 @@ Description-Content-Type: text/markdown
|
|
|
32
32
|
License-File: LICENSE
|
|
33
33
|
Requires-Dist: beautifulsoup4>=4.13.5
|
|
34
34
|
Requires-Dist: nh3>=0.3
|
|
35
|
+
Provides-Extra: html5lib
|
|
36
|
+
Requires-Dist: beautifulsoup4[html5lib]>=4.13.5; extra == "html5lib"
|
|
35
37
|
Provides-Extra: lxml
|
|
36
38
|
Requires-Dist: beautifulsoup4[lxml]>=4.13.5; extra == "lxml"
|
|
37
39
|
Dynamic: license-file
|
|
@@ -40,7 +42,7 @@ Dynamic: license-file
|
|
|
40
42
|
|
|
41
43
|
A modern, fully typed Python library for converting HTML to Markdown. This library is a completely rewritten fork
|
|
42
44
|
of [markdownify](https://pypi.org/project/markdownify/) with a modernized codebase, strict type safety and support for
|
|
43
|
-
Python 3.
|
|
45
|
+
Python 3.10+.
|
|
44
46
|
|
|
45
47
|
## Support This Project
|
|
46
48
|
|
|
@@ -64,8 +66,10 @@ Your support helps maintain and improve this library for the community.
|
|
|
64
66
|
- **Custom Converters**: Extensible converter system for custom HTML tag handling
|
|
65
67
|
- **List Formatting**: Configurable list indentation with Discord/Slack compatibility
|
|
66
68
|
- **HTML Preprocessing**: Clean messy HTML with configurable aggressiveness levels
|
|
69
|
+
- **Bytes Input Support**: Direct handling of bytes input with automatic encoding detection and configurable source encoding
|
|
67
70
|
- **Whitespace Control**: Normalized or strict whitespace preservation modes
|
|
68
71
|
- **BeautifulSoup Integration**: Support for pre-configured BeautifulSoup instances
|
|
72
|
+
- **Parser Normalization**: Consistent output across all supported parsers (html.parser, lxml, html5lib)
|
|
69
73
|
- **Robustly Tested**: Comprehensive unit tests and integration tests covering all conversion scenarios
|
|
70
74
|
|
|
71
75
|
## Installation
|
|
@@ -74,17 +78,27 @@ Your support helps maintain and improve this library for the community.
|
|
|
74
78
|
pip install html-to-markdown
|
|
75
79
|
```
|
|
76
80
|
|
|
77
|
-
### Optional
|
|
81
|
+
### Optional Parsers
|
|
78
82
|
|
|
79
|
-
For improved performance, you can install with
|
|
83
|
+
For improved performance and compatibility, you can install with optional parsers:
|
|
80
84
|
|
|
81
85
|
```shell
|
|
86
|
+
# Fast lxml parser (recommended)
|
|
82
87
|
pip install html-to-markdown[lxml]
|
|
88
|
+
|
|
89
|
+
# Standards-compliant html5lib parser
|
|
90
|
+
pip install html-to-markdown[html5lib]
|
|
83
91
|
```
|
|
84
92
|
|
|
85
|
-
|
|
93
|
+
**Parser Options:**
|
|
94
|
+
|
|
95
|
+
- **html.parser** (default): Built-in Python parser, no dependencies
|
|
96
|
+
- **lxml**: **Recommended** - Fastest parser with good malformed HTML handling
|
|
97
|
+
- **html5lib**: Most standards-compliant, handles edge cases best
|
|
86
98
|
|
|
87
|
-
The library automatically uses lxml when available
|
|
99
|
+
The library automatically uses lxml when available and **normalizes output to ensure consistent results regardless of parser choice**. We recommend using the **lxml parser for optimal performance** - it's significantly faster than the other options while maintaining excellent compatibility.
|
|
100
|
+
|
|
101
|
+
You can explicitly specify a parser using the `parser` parameter.
|
|
88
102
|
|
|
89
103
|
## Quick Start
|
|
90
104
|
|
|
@@ -149,6 +163,39 @@ soup = BeautifulSoup(html, "lxml") # Note: lxml requires additional installatio
|
|
|
149
163
|
markdown = convert_to_markdown(soup)
|
|
150
164
|
```
|
|
151
165
|
|
|
166
|
+
### Working with Bytes and Encodings
|
|
167
|
+
|
|
168
|
+
The library can directly handle bytes input, which is useful when working with HTTP responses or files:
|
|
169
|
+
|
|
170
|
+
```python
|
|
171
|
+
import requests
|
|
172
|
+
from html_to_markdown import convert_to_markdown
|
|
173
|
+
|
|
174
|
+
# Working with HTTP responses (bytes)
|
|
175
|
+
response = requests.get("https://example.com")
|
|
176
|
+
markdown = convert_to_markdown(response.content) # response.content returns bytes
|
|
177
|
+
|
|
178
|
+
# Specify encoding for non-UTF-8 content
|
|
179
|
+
response = requests.get("https://example.fr")
|
|
180
|
+
markdown = convert_to_markdown(response.content, source_encoding="latin-1")
|
|
181
|
+
|
|
182
|
+
# Common encoding examples
|
|
183
|
+
html_bytes = b"<p>Hello World</p>"
|
|
184
|
+
markdown = convert_to_markdown(html_bytes) # UTF-8 by default
|
|
185
|
+
|
|
186
|
+
# Latin-1 encoded content
|
|
187
|
+
html_latin1 = "<p>Café résumé</p>".encode("latin-1")
|
|
188
|
+
markdown = convert_to_markdown(html_latin1, source_encoding="latin-1")
|
|
189
|
+
|
|
190
|
+
# Windows-1252 encoded content
|
|
191
|
+
html_windows = '<p>Smart quotes: "Hello"</p>'.encode("windows-1252")
|
|
192
|
+
markdown = convert_to_markdown(html_windows, source_encoding="windows-1252")
|
|
193
|
+
|
|
194
|
+
# Piping bytes from command line
|
|
195
|
+
# echo '<p>Hello</p>' | python -m html_to_markdown
|
|
196
|
+
# cat file.html | python -m html_to_markdown --source-encoding latin-1
|
|
197
|
+
```
|
|
198
|
+
|
|
152
199
|
## Common Use Cases
|
|
153
200
|
|
|
154
201
|
### Discord/Slack Compatible Lists
|
|
@@ -681,6 +728,131 @@ This library provides comprehensive support for all modern HTML5 elements:
|
|
|
681
728
|
|
|
682
729
|
- `<math>` (MathML support)
|
|
683
730
|
|
|
731
|
+
## Command Line Interface
|
|
732
|
+
|
|
733
|
+
The library includes a full-featured CLI tool with complete API parity:
|
|
734
|
+
|
|
735
|
+
### Basic Usage
|
|
736
|
+
|
|
737
|
+
```bash
|
|
738
|
+
# Convert HTML file to Markdown
|
|
739
|
+
html-to-markdown document.html
|
|
740
|
+
|
|
741
|
+
# Convert from stdin
|
|
742
|
+
echo '<h1>Title</h1><p>Content</p>' | html-to-markdown
|
|
743
|
+
|
|
744
|
+
# Read HTML file with specific encoding
|
|
745
|
+
html-to-markdown document.html --source-encoding latin-1
|
|
746
|
+
|
|
747
|
+
# Pipe bytes with encoding specification
|
|
748
|
+
cat document.html | html-to-markdown --source-encoding utf-8
|
|
749
|
+
```
|
|
750
|
+
|
|
751
|
+
### Advanced CLI Options
|
|
752
|
+
|
|
753
|
+
```bash
|
|
754
|
+
# Discord/Slack compatible lists (2-space indent)
|
|
755
|
+
html-to-markdown file.html --list-indent-width 2
|
|
756
|
+
|
|
757
|
+
# Clean messy HTML before conversion
|
|
758
|
+
html-to-markdown file.html --preprocess-html --preprocessing-preset aggressive
|
|
759
|
+
|
|
760
|
+
# Custom heading style
|
|
761
|
+
html-to-markdown file.html --heading-style atx
|
|
762
|
+
|
|
763
|
+
# Strip specific tags
|
|
764
|
+
html-to-markdown file.html --strip nav aside footer
|
|
765
|
+
|
|
766
|
+
# Convert only specific tags
|
|
767
|
+
html-to-markdown file.html --convert h1 h2 p a strong em
|
|
768
|
+
|
|
769
|
+
# Enable streaming for large files with progress
|
|
770
|
+
html-to-markdown large.html --stream-processing --show-progress
|
|
771
|
+
|
|
772
|
+
# Use specific parser (lxml recommended for best performance)
|
|
773
|
+
html-to-markdown file.html --parser lxml
|
|
774
|
+
```
|
|
775
|
+
|
|
776
|
+
### Real-World CLI Examples
|
|
777
|
+
|
|
778
|
+
```bash
|
|
779
|
+
# Download and convert a webpage
|
|
780
|
+
curl -s https://example.com | html-to-markdown --preprocess-html > output.md
|
|
781
|
+
|
|
782
|
+
# Process multiple files with different encodings
|
|
783
|
+
for file in *.html; do
|
|
784
|
+
html-to-markdown "$file" --source-encoding latin-1 > "${file%.html}.md"
|
|
785
|
+
done
|
|
786
|
+
|
|
787
|
+
# Convert with custom formatting for documentation
|
|
788
|
+
html-to-markdown docs.html \
|
|
789
|
+
--heading-style atx \
|
|
790
|
+
--list-indent-width 2 \
|
|
791
|
+
--highlight-style bold \
|
|
792
|
+
--no-extract-metadata > docs.md
|
|
793
|
+
```
|
|
794
|
+
|
|
795
|
+
## Differences from markdownify
|
|
796
|
+
|
|
797
|
+
html-to-markdown is a modern, completely rewritten library inspired by markdownify but with significant improvements:
|
|
798
|
+
|
|
799
|
+
### Key Advantages
|
|
800
|
+
|
|
801
|
+
| Feature | markdownify | html-to-markdown |
|
|
802
|
+
| ----------------------- | ---------------- | ---------------------------------------------------------------------- |
|
|
803
|
+
| **Type Safety** | No type hints | Full MyPy compliance with strict typing |
|
|
804
|
+
| **Python Support** | Python 3.6+ | Python 3.10+ with modern features |
|
|
805
|
+
| **HTML5 Elements** | Basic support | Comprehensive HTML5 support (semantic, form, table, interactive, etc.) |
|
|
806
|
+
| **Table Handling** | Simple tables | Advanced rowspan/colspan support |
|
|
807
|
+
| **Streaming** | Memory-intensive | Memory-efficient streaming for large documents |
|
|
808
|
+
| **CLI Tool** | Basic | Full-featured CLI with all API options |
|
|
809
|
+
| **Preprocessing** | None | Built-in HTML cleaning with configurable presets |
|
|
810
|
+
| **Metadata Extraction** | None | Automatic title/meta extraction as comments |
|
|
811
|
+
| **Task Lists** | None | GitHub-compatible checkbox conversion |
|
|
812
|
+
| **Bytes Input** | None | Direct bytes support with configurable encoding |
|
|
813
|
+
| **Custom Converters** | Class-based | Function-based with simpler API |
|
|
814
|
+
| **Testing** | Basic | Comprehensive test suite with 100% coverage |
|
|
815
|
+
| **Performance** | Standard | Significantly faster with recommended lxml parser |
|
|
816
|
+
|
|
817
|
+
### API Compatibility
|
|
818
|
+
|
|
819
|
+
While inspired by markdownify, html-to-markdown uses a more modern, explicit API:
|
|
820
|
+
|
|
821
|
+
```python
|
|
822
|
+
# markdownify style
|
|
823
|
+
from markdownify import markdownify
|
|
824
|
+
|
|
825
|
+
result = markdownify(html, heading_style="atx", strip=["nav"])
|
|
826
|
+
|
|
827
|
+
# html-to-markdown style (more explicit)
|
|
828
|
+
from html_to_markdown import convert_to_markdown
|
|
829
|
+
|
|
830
|
+
result = convert_to_markdown(html, heading_style="atx", strip=["nav"])
|
|
831
|
+
```
|
|
832
|
+
|
|
833
|
+
### Migration from markdownify
|
|
834
|
+
|
|
835
|
+
Most markdownify code can be easily migrated:
|
|
836
|
+
|
|
837
|
+
```python
|
|
838
|
+
# Before (markdownify)
|
|
839
|
+
from markdownify import markdownify as md
|
|
840
|
+
|
|
841
|
+
result = md(html, heading_style="atx")
|
|
842
|
+
|
|
843
|
+
# After (html-to-markdown)
|
|
844
|
+
from html_to_markdown import convert_to_markdown
|
|
845
|
+
|
|
846
|
+
result = convert_to_markdown(html, heading_style="atx")
|
|
847
|
+
```
|
|
848
|
+
|
|
849
|
+
Key changes when migrating:
|
|
850
|
+
|
|
851
|
+
- Import path: `markdownify` → `html_to_markdown`
|
|
852
|
+
- Function name: `markdownify()` → `convert_to_markdown()`
|
|
853
|
+
- All parameter names remain the same for common options
|
|
854
|
+
- New parameters available for advanced features (preprocessing, streaming, etc.)
|
|
855
|
+
|
|
684
856
|
## Acknowledgments
|
|
685
857
|
|
|
686
|
-
Special thanks to the original [markdownify](https://pypi.org/project/markdownify/) project creators and contributors.
|
|
858
|
+
Special thanks to the original [markdownify](https://pypi.org/project/markdownify/) project creators and contributors for the inspiration and foundation that made this modern implementation possible.
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
A modern, fully typed Python library for converting HTML to Markdown. This library is a completely rewritten fork
|
|
4
4
|
of [markdownify](https://pypi.org/project/markdownify/) with a modernized codebase, strict type safety and support for
|
|
5
|
-
Python 3.
|
|
5
|
+
Python 3.10+.
|
|
6
6
|
|
|
7
7
|
## Support This Project
|
|
8
8
|
|
|
@@ -26,8 +26,10 @@ Your support helps maintain and improve this library for the community.
|
|
|
26
26
|
- **Custom Converters**: Extensible converter system for custom HTML tag handling
|
|
27
27
|
- **List Formatting**: Configurable list indentation with Discord/Slack compatibility
|
|
28
28
|
- **HTML Preprocessing**: Clean messy HTML with configurable aggressiveness levels
|
|
29
|
+
- **Bytes Input Support**: Direct handling of bytes input with automatic encoding detection and configurable source encoding
|
|
29
30
|
- **Whitespace Control**: Normalized or strict whitespace preservation modes
|
|
30
31
|
- **BeautifulSoup Integration**: Support for pre-configured BeautifulSoup instances
|
|
32
|
+
- **Parser Normalization**: Consistent output across all supported parsers (html.parser, lxml, html5lib)
|
|
31
33
|
- **Robustly Tested**: Comprehensive unit tests and integration tests covering all conversion scenarios
|
|
32
34
|
|
|
33
35
|
## Installation
|
|
@@ -36,17 +38,27 @@ Your support helps maintain and improve this library for the community.
|
|
|
36
38
|
pip install html-to-markdown
|
|
37
39
|
```
|
|
38
40
|
|
|
39
|
-
### Optional
|
|
41
|
+
### Optional Parsers
|
|
40
42
|
|
|
41
|
-
For improved performance, you can install with
|
|
43
|
+
For improved performance and compatibility, you can install with optional parsers:
|
|
42
44
|
|
|
43
45
|
```shell
|
|
46
|
+
# Fast lxml parser (recommended)
|
|
44
47
|
pip install html-to-markdown[lxml]
|
|
48
|
+
|
|
49
|
+
# Standards-compliant html5lib parser
|
|
50
|
+
pip install html-to-markdown[html5lib]
|
|
45
51
|
```
|
|
46
52
|
|
|
47
|
-
|
|
53
|
+
**Parser Options:**
|
|
54
|
+
|
|
55
|
+
- **html.parser** (default): Built-in Python parser, no dependencies
|
|
56
|
+
- **lxml**: **Recommended** - Fastest parser with good malformed HTML handling
|
|
57
|
+
- **html5lib**: Most standards-compliant, handles edge cases best
|
|
48
58
|
|
|
49
|
-
The library automatically uses lxml when available
|
|
59
|
+
The library automatically uses lxml when available and **normalizes output to ensure consistent results regardless of parser choice**. We recommend using the **lxml parser for optimal performance** - it's significantly faster than the other options while maintaining excellent compatibility.
|
|
60
|
+
|
|
61
|
+
You can explicitly specify a parser using the `parser` parameter.
|
|
50
62
|
|
|
51
63
|
## Quick Start
|
|
52
64
|
|
|
@@ -111,6 +123,39 @@ soup = BeautifulSoup(html, "lxml") # Note: lxml requires additional installatio
|
|
|
111
123
|
markdown = convert_to_markdown(soup)
|
|
112
124
|
```
|
|
113
125
|
|
|
126
|
+
### Working with Bytes and Encodings
|
|
127
|
+
|
|
128
|
+
The library can directly handle bytes input, which is useful when working with HTTP responses or files:
|
|
129
|
+
|
|
130
|
+
```python
|
|
131
|
+
import requests
|
|
132
|
+
from html_to_markdown import convert_to_markdown
|
|
133
|
+
|
|
134
|
+
# Working with HTTP responses (bytes)
|
|
135
|
+
response = requests.get("https://example.com")
|
|
136
|
+
markdown = convert_to_markdown(response.content) # response.content returns bytes
|
|
137
|
+
|
|
138
|
+
# Specify encoding for non-UTF-8 content
|
|
139
|
+
response = requests.get("https://example.fr")
|
|
140
|
+
markdown = convert_to_markdown(response.content, source_encoding="latin-1")
|
|
141
|
+
|
|
142
|
+
# Common encoding examples
|
|
143
|
+
html_bytes = b"<p>Hello World</p>"
|
|
144
|
+
markdown = convert_to_markdown(html_bytes) # UTF-8 by default
|
|
145
|
+
|
|
146
|
+
# Latin-1 encoded content
|
|
147
|
+
html_latin1 = "<p>Café résumé</p>".encode("latin-1")
|
|
148
|
+
markdown = convert_to_markdown(html_latin1, source_encoding="latin-1")
|
|
149
|
+
|
|
150
|
+
# Windows-1252 encoded content
|
|
151
|
+
html_windows = '<p>Smart quotes: "Hello"</p>'.encode("windows-1252")
|
|
152
|
+
markdown = convert_to_markdown(html_windows, source_encoding="windows-1252")
|
|
153
|
+
|
|
154
|
+
# Piping bytes from command line
|
|
155
|
+
# echo '<p>Hello</p>' | python -m html_to_markdown
|
|
156
|
+
# cat file.html | python -m html_to_markdown --source-encoding latin-1
|
|
157
|
+
```
|
|
158
|
+
|
|
114
159
|
## Common Use Cases
|
|
115
160
|
|
|
116
161
|
### Discord/Slack Compatible Lists
|
|
@@ -643,6 +688,131 @@ This library provides comprehensive support for all modern HTML5 elements:
|
|
|
643
688
|
|
|
644
689
|
- `<math>` (MathML support)
|
|
645
690
|
|
|
691
|
+
## Command Line Interface
|
|
692
|
+
|
|
693
|
+
The library includes a full-featured CLI tool with complete API parity:
|
|
694
|
+
|
|
695
|
+
### Basic Usage
|
|
696
|
+
|
|
697
|
+
```bash
|
|
698
|
+
# Convert HTML file to Markdown
|
|
699
|
+
html-to-markdown document.html
|
|
700
|
+
|
|
701
|
+
# Convert from stdin
|
|
702
|
+
echo '<h1>Title</h1><p>Content</p>' | html-to-markdown
|
|
703
|
+
|
|
704
|
+
# Read HTML file with specific encoding
|
|
705
|
+
html-to-markdown document.html --source-encoding latin-1
|
|
706
|
+
|
|
707
|
+
# Pipe bytes with encoding specification
|
|
708
|
+
cat document.html | html-to-markdown --source-encoding utf-8
|
|
709
|
+
```
|
|
710
|
+
|
|
711
|
+
### Advanced CLI Options
|
|
712
|
+
|
|
713
|
+
```bash
|
|
714
|
+
# Discord/Slack compatible lists (2-space indent)
|
|
715
|
+
html-to-markdown file.html --list-indent-width 2
|
|
716
|
+
|
|
717
|
+
# Clean messy HTML before conversion
|
|
718
|
+
html-to-markdown file.html --preprocess-html --preprocessing-preset aggressive
|
|
719
|
+
|
|
720
|
+
# Custom heading style
|
|
721
|
+
html-to-markdown file.html --heading-style atx
|
|
722
|
+
|
|
723
|
+
# Strip specific tags
|
|
724
|
+
html-to-markdown file.html --strip nav aside footer
|
|
725
|
+
|
|
726
|
+
# Convert only specific tags
|
|
727
|
+
html-to-markdown file.html --convert h1 h2 p a strong em
|
|
728
|
+
|
|
729
|
+
# Enable streaming for large files with progress
|
|
730
|
+
html-to-markdown large.html --stream-processing --show-progress
|
|
731
|
+
|
|
732
|
+
# Use specific parser (lxml recommended for best performance)
|
|
733
|
+
html-to-markdown file.html --parser lxml
|
|
734
|
+
```
|
|
735
|
+
|
|
736
|
+
### Real-World CLI Examples
|
|
737
|
+
|
|
738
|
+
```bash
|
|
739
|
+
# Download and convert a webpage
|
|
740
|
+
curl -s https://example.com | html-to-markdown --preprocess-html > output.md
|
|
741
|
+
|
|
742
|
+
# Process multiple files with different encodings
|
|
743
|
+
for file in *.html; do
|
|
744
|
+
html-to-markdown "$file" --source-encoding latin-1 > "${file%.html}.md"
|
|
745
|
+
done
|
|
746
|
+
|
|
747
|
+
# Convert with custom formatting for documentation
|
|
748
|
+
html-to-markdown docs.html \
|
|
749
|
+
--heading-style atx \
|
|
750
|
+
--list-indent-width 2 \
|
|
751
|
+
--highlight-style bold \
|
|
752
|
+
--no-extract-metadata > docs.md
|
|
753
|
+
```
|
|
754
|
+
|
|
755
|
+
## Differences from markdownify
|
|
756
|
+
|
|
757
|
+
html-to-markdown is a modern, completely rewritten library inspired by markdownify but with significant improvements:
|
|
758
|
+
|
|
759
|
+
### Key Advantages
|
|
760
|
+
|
|
761
|
+
| Feature | markdownify | html-to-markdown |
|
|
762
|
+
| ----------------------- | ---------------- | ---------------------------------------------------------------------- |
|
|
763
|
+
| **Type Safety** | No type hints | Full MyPy compliance with strict typing |
|
|
764
|
+
| **Python Support** | Python 3.6+ | Python 3.10+ with modern features |
|
|
765
|
+
| **HTML5 Elements** | Basic support | Comprehensive HTML5 support (semantic, form, table, interactive, etc.) |
|
|
766
|
+
| **Table Handling** | Simple tables | Advanced rowspan/colspan support |
|
|
767
|
+
| **Streaming** | Memory-intensive | Memory-efficient streaming for large documents |
|
|
768
|
+
| **CLI Tool** | Basic | Full-featured CLI with all API options |
|
|
769
|
+
| **Preprocessing** | None | Built-in HTML cleaning with configurable presets |
|
|
770
|
+
| **Metadata Extraction** | None | Automatic title/meta extraction as comments |
|
|
771
|
+
| **Task Lists** | None | GitHub-compatible checkbox conversion |
|
|
772
|
+
| **Bytes Input** | None | Direct bytes support with configurable encoding |
|
|
773
|
+
| **Custom Converters** | Class-based | Function-based with simpler API |
|
|
774
|
+
| **Testing** | Basic | Comprehensive test suite with 100% coverage |
|
|
775
|
+
| **Performance** | Standard | Significantly faster with recommended lxml parser |
|
|
776
|
+
|
|
777
|
+
### API Compatibility
|
|
778
|
+
|
|
779
|
+
While inspired by markdownify, html-to-markdown uses a more modern, explicit API:
|
|
780
|
+
|
|
781
|
+
```python
|
|
782
|
+
# markdownify style
|
|
783
|
+
from markdownify import markdownify
|
|
784
|
+
|
|
785
|
+
result = markdownify(html, heading_style="atx", strip=["nav"])
|
|
786
|
+
|
|
787
|
+
# html-to-markdown style (more explicit)
|
|
788
|
+
from html_to_markdown import convert_to_markdown
|
|
789
|
+
|
|
790
|
+
result = convert_to_markdown(html, heading_style="atx", strip=["nav"])
|
|
791
|
+
```
|
|
792
|
+
|
|
793
|
+
### Migration from markdownify
|
|
794
|
+
|
|
795
|
+
Most markdownify code can be easily migrated:
|
|
796
|
+
|
|
797
|
+
```python
|
|
798
|
+
# Before (markdownify)
|
|
799
|
+
from markdownify import markdownify as md
|
|
800
|
+
|
|
801
|
+
result = md(html, heading_style="atx")
|
|
802
|
+
|
|
803
|
+
# After (html-to-markdown)
|
|
804
|
+
from html_to_markdown import convert_to_markdown
|
|
805
|
+
|
|
806
|
+
result = convert_to_markdown(html, heading_style="atx")
|
|
807
|
+
```
|
|
808
|
+
|
|
809
|
+
Key changes when migrating:
|
|
810
|
+
|
|
811
|
+
- Import path: `markdownify` → `html_to_markdown`
|
|
812
|
+
- Function name: `markdownify()` → `convert_to_markdown()`
|
|
813
|
+
- All parameter names remain the same for common options
|
|
814
|
+
- New parameters available for advanced features (preprocessing, streaming, etc.)
|
|
815
|
+
|
|
646
816
|
## Acknowledgments
|
|
647
817
|
|
|
648
|
-
Special thanks to the original [markdownify](https://pypi.org/project/markdownify/) project creators and contributors.
|
|
818
|
+
Special thanks to the original [markdownify](https://pypi.org/project/markdownify/) project creators and contributors for the inspiration and foundation that made this modern implementation possible.
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import sys
|
|
2
|
-
from argparse import ArgumentParser
|
|
2
|
+
from argparse import ArgumentParser
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
|
|
5
5
|
from html_to_markdown.constants import (
|
|
@@ -27,8 +27,7 @@ def main(argv: list[str]) -> str:
|
|
|
27
27
|
parser.add_argument(
|
|
28
28
|
"html",
|
|
29
29
|
nargs="?",
|
|
30
|
-
|
|
31
|
-
default=sys.stdin,
|
|
30
|
+
default="-",
|
|
32
31
|
help="The HTML file to convert. Defaults to STDIN if not provided.",
|
|
33
32
|
)
|
|
34
33
|
|
|
@@ -247,7 +246,7 @@ def main(argv: list[str]) -> str:
|
|
|
247
246
|
"--source-encoding",
|
|
248
247
|
type=str,
|
|
249
248
|
default=None,
|
|
250
|
-
help="
|
|
249
|
+
help="Encoding for reading input files and decoding bytes (e.g. 'utf-8', 'latin-1'). Default: utf-8.",
|
|
251
250
|
)
|
|
252
251
|
|
|
253
252
|
args = parser.parse_args(argv)
|
|
@@ -260,6 +259,7 @@ def main(argv: list[str]) -> str:
|
|
|
260
259
|
"convert": args.convert,
|
|
261
260
|
"convert_as_inline": args.convert_as_inline,
|
|
262
261
|
"default_title": args.default_title,
|
|
262
|
+
"source_encoding": args.source_encoding,
|
|
263
263
|
"escape_asterisks": args.escape_asterisks,
|
|
264
264
|
"escape_misc": args.escape_misc,
|
|
265
265
|
"escape_underscores": args.escape_underscores,
|
|
@@ -302,14 +302,20 @@ def main(argv: list[str]) -> str:
|
|
|
302
302
|
|
|
303
303
|
base_args["progress_callback"] = progress_callback
|
|
304
304
|
|
|
305
|
-
if args.
|
|
306
|
-
|
|
307
|
-
try:
|
|
308
|
-
with Path(args.html.name).open(encoding=args.source_encoding) as f:
|
|
309
|
-
html_content = f.read()
|
|
310
|
-
except LookupError as e:
|
|
311
|
-
raise InvalidEncodingError(args.source_encoding) from e
|
|
305
|
+
if args.html == "-":
|
|
306
|
+
html_content = sys.stdin.buffer.read()
|
|
312
307
|
else:
|
|
313
|
-
|
|
308
|
+
try:
|
|
309
|
+
file_path = Path(args.html)
|
|
310
|
+
if args.source_encoding:
|
|
311
|
+
with file_path.open(encoding=args.source_encoding, errors="replace") as f:
|
|
312
|
+
html_content = f.read()
|
|
313
|
+
else:
|
|
314
|
+
with file_path.open("rb") as f:
|
|
315
|
+
html_content = f.read()
|
|
316
|
+
except (OSError, LookupError) as e:
|
|
317
|
+
if isinstance(e, LookupError):
|
|
318
|
+
raise InvalidEncodingError(args.source_encoding) from e
|
|
319
|
+
raise
|
|
314
320
|
|
|
315
321
|
return convert_to_markdown(html_content, **base_args)
|
|
@@ -414,8 +414,6 @@ def _convert_li(*, tag: Tag, text: str, bullets: str, list_indent_str: str) -> s
|
|
|
414
414
|
|
|
415
415
|
return "".join(result_parts)
|
|
416
416
|
|
|
417
|
-
# Ensure consistent whitespace handling for list items, especially with strip_newlines=True
|
|
418
|
-
# Strip any leading whitespace that may have been inherited from parent containers
|
|
419
417
|
clean_text = (text or "").strip()
|
|
420
418
|
return f"{bullet} {clean_text}\n"
|
|
421
419
|
|
|
@@ -314,11 +314,12 @@ def _process_text(
|
|
|
314
314
|
if len(ancestor_names) > 10:
|
|
315
315
|
break
|
|
316
316
|
|
|
317
|
-
in_pre = bool(ancestor_names.intersection({"pre"}))
|
|
317
|
+
in_pre = bool(ancestor_names.intersection({"pre"})) or parent_name == "pre"
|
|
318
318
|
|
|
319
319
|
text = whitespace_handler.process_text_whitespace(text, el, in_pre=in_pre)
|
|
320
320
|
|
|
321
|
-
|
|
321
|
+
code_like_tags = {"pre", "code", "kbd", "samp"}
|
|
322
|
+
if not (ancestor_names.intersection(code_like_tags) or parent_name in code_like_tags):
|
|
322
323
|
text = escape(
|
|
323
324
|
text=text,
|
|
324
325
|
escape_misc=escape_misc,
|
|
@@ -445,13 +446,14 @@ def _format_metadata_comment(metadata: dict[str, str]) -> str:
|
|
|
445
446
|
|
|
446
447
|
|
|
447
448
|
def convert_to_markdown(
|
|
448
|
-
source: str | BeautifulSoup,
|
|
449
|
+
source: str | bytes | BeautifulSoup,
|
|
449
450
|
*,
|
|
450
451
|
stream_processing: bool = False,
|
|
451
452
|
chunk_size: int = 1024,
|
|
452
453
|
chunk_callback: Callable[[str], None] | None = None,
|
|
453
454
|
progress_callback: Callable[[int, int], None] | None = None,
|
|
454
455
|
parser: str | None = None,
|
|
456
|
+
source_encoding: str = "utf-8",
|
|
455
457
|
autolinks: bool = True,
|
|
456
458
|
br_in_tables: bool = False,
|
|
457
459
|
bullets: str = "*+-",
|
|
@@ -489,12 +491,13 @@ def convert_to_markdown(
|
|
|
489
491
|
various customization options for controlling the conversion behavior.
|
|
490
492
|
|
|
491
493
|
Args:
|
|
492
|
-
source: HTML string or BeautifulSoup object to convert.
|
|
494
|
+
source: HTML string, bytes, or BeautifulSoup object to convert.
|
|
493
495
|
stream_processing: Enable streaming mode for large documents.
|
|
494
496
|
chunk_size: Size of chunks for streaming processing.
|
|
495
497
|
chunk_callback: Callback for processing chunks in streaming mode.
|
|
496
498
|
progress_callback: Callback for progress updates (current, total).
|
|
497
499
|
parser: HTML parser to use ('html.parser', 'lxml', 'html5lib').
|
|
500
|
+
source_encoding: Character encoding to use when decoding bytes (default: 'utf-8').
|
|
498
501
|
autolinks: Convert URLs to automatic links.
|
|
499
502
|
br_in_tables: Use <br> tags for line breaks in table cells instead of spaces.
|
|
500
503
|
bullets: Characters to use for unordered list bullets.
|
|
@@ -548,11 +551,12 @@ def convert_to_markdown(
|
|
|
548
551
|
>>> convert_to_markdown(html, list_indent_width=2)
|
|
549
552
|
'* Item 1\\n* Item 2\\n\\n'
|
|
550
553
|
"""
|
|
551
|
-
# Initialize original input string for Windows lxml fix
|
|
552
554
|
original_input_str = None
|
|
553
555
|
|
|
556
|
+
if isinstance(source, bytes):
|
|
557
|
+
source = source.decode(source_encoding or "utf-8", errors="replace")
|
|
558
|
+
|
|
554
559
|
if isinstance(source, str):
|
|
555
|
-
# Store original string for plain text detection (Windows lxml fix)
|
|
556
560
|
original_input_str = source
|
|
557
561
|
|
|
558
562
|
if (
|
|
@@ -613,6 +617,34 @@ def convert_to_markdown(
|
|
|
613
617
|
new_text = NavigableString(leading_ws + str(first_child))
|
|
614
618
|
first_child.replace_with(new_text)
|
|
615
619
|
needs_leading_space_fix = False
|
|
620
|
+
|
|
621
|
+
if parser == "html5lib":
|
|
622
|
+
body = source.find("body")
|
|
623
|
+
if body and isinstance(body, Tag):
|
|
624
|
+
children = list(body.children)
|
|
625
|
+
|
|
626
|
+
if (
|
|
627
|
+
len(children) == 1
|
|
628
|
+
and isinstance(children[0], NavigableString)
|
|
629
|
+
and original_source.startswith((" ", "\t", "\n", "\r"))
|
|
630
|
+
and not str(children[0]).startswith((" ", "\t", "\n", "\r"))
|
|
631
|
+
):
|
|
632
|
+
first_child = children[0]
|
|
633
|
+
original_text = str(first_child)
|
|
634
|
+
|
|
635
|
+
leading_ws = ""
|
|
636
|
+
for char in original_source:
|
|
637
|
+
if char in " \t\n\r":
|
|
638
|
+
leading_ws += char
|
|
639
|
+
else:
|
|
640
|
+
break
|
|
641
|
+
|
|
642
|
+
normalized_text = original_text
|
|
643
|
+
if leading_ws and not normalized_text.startswith(leading_ws):
|
|
644
|
+
normalized_text = leading_ws + normalized_text
|
|
645
|
+
|
|
646
|
+
new_text = NavigableString(normalized_text)
|
|
647
|
+
first_child.replace_with(new_text)
|
|
616
648
|
else:
|
|
617
649
|
raise EmptyHtmlError
|
|
618
650
|
|
|
@@ -626,6 +658,7 @@ def convert_to_markdown(
|
|
|
626
658
|
chunk_size=chunk_size,
|
|
627
659
|
progress_callback=progress_callback,
|
|
628
660
|
parser=parser,
|
|
661
|
+
source_encoding=source_encoding,
|
|
629
662
|
autolinks=autolinks,
|
|
630
663
|
bullets=bullets,
|
|
631
664
|
code_language=code_language,
|
|
@@ -673,6 +706,7 @@ def convert_to_markdown(
|
|
|
673
706
|
sink,
|
|
674
707
|
whitespace_handler=whitespace_handler,
|
|
675
708
|
parser=parser,
|
|
709
|
+
source_encoding=source_encoding,
|
|
676
710
|
autolinks=autolinks,
|
|
677
711
|
br_in_tables=br_in_tables,
|
|
678
712
|
bullets=bullets,
|
|
@@ -703,8 +737,6 @@ def convert_to_markdown(
|
|
|
703
737
|
|
|
704
738
|
result = sink.get_result()
|
|
705
739
|
|
|
706
|
-
# Parser-agnostic behavior: handle leading whitespace differences between parsers
|
|
707
|
-
# lxml may either add unwanted whitespace or strip meaningful whitespace compared to html.parser
|
|
708
740
|
if "needs_leading_whitespace_fix" in locals() and needs_leading_whitespace_fix:
|
|
709
741
|
original_input = sink.original_source if hasattr(sink, "original_source") else original_source
|
|
710
742
|
if isinstance(original_input, str):
|
|
@@ -713,19 +745,14 @@ def convert_to_markdown(
|
|
|
713
745
|
original_leading_whitespace_match.group(0) if original_leading_whitespace_match else ""
|
|
714
746
|
)
|
|
715
747
|
|
|
716
|
-
# Case 1: lxml added leading newlines (like "\n<figure>") - strip them
|
|
717
748
|
if result.startswith("\n") and not original_input.lstrip().startswith(result.strip()):
|
|
718
749
|
result = result.lstrip("\n\r")
|
|
719
750
|
|
|
720
|
-
# Case 2: lxml stripped meaningful leading whitespace (like " <b>") - restore it
|
|
721
|
-
# However, don't restore whitespace if strip_newlines=True was used, as the user
|
|
722
|
-
# explicitly requested to remove formatting whitespace
|
|
723
751
|
elif (
|
|
724
752
|
not strip_newlines
|
|
725
753
|
and not result.startswith((" ", "\t"))
|
|
726
754
|
and original_leading_whitespace.startswith((" ", "\t"))
|
|
727
755
|
):
|
|
728
|
-
# Only restore spaces/tabs, not newlines (which are usually formatting)
|
|
729
756
|
leading_spaces_tabs_match = re.match(r"^[ \t]*", original_leading_whitespace)
|
|
730
757
|
leading_spaces_tabs = leading_spaces_tabs_match.group(0) if leading_spaces_tabs_match else ""
|
|
731
758
|
if leading_spaces_tabs:
|
|
@@ -758,9 +785,6 @@ def convert_to_markdown(
|
|
|
758
785
|
if convert_as_inline:
|
|
759
786
|
result = result.rstrip("\n")
|
|
760
787
|
|
|
761
|
-
# Windows-specific fix: For plain text input (no HTML tags), lxml may add extra trailing newlines
|
|
762
|
-
# This ensures consistent behavior across platforms when processing plain text
|
|
763
|
-
# Only apply to cases where lxml adds extra newlines (\n\n) at the end
|
|
764
788
|
if (
|
|
765
789
|
"original_input_str" in locals()
|
|
766
790
|
and original_input_str
|
|
@@ -768,19 +792,11 @@ def convert_to_markdown(
|
|
|
768
792
|
and not original_input_str.strip().endswith(">")
|
|
769
793
|
and result.endswith("\n\n")
|
|
770
794
|
):
|
|
771
|
-
# Input appears to be plain text, not HTML - normalize trailing newlines only
|
|
772
795
|
result = result.rstrip("\n")
|
|
773
796
|
|
|
774
|
-
# If the original input contained no block-level elements, normalize any
|
|
775
|
-
# accidental trailing newlines for cross-platform consistency.
|
|
776
|
-
# This guards cases like inline-only inputs (e.g., "text <strong>bold</strong>")
|
|
777
|
-
# and head-only documents (e.g., "<head>head</head>") where output should
|
|
778
|
-
# not end with extra blank lines.
|
|
779
797
|
if "original_input_str" in locals() and original_input_str:
|
|
780
798
|
from html_to_markdown.whitespace import BLOCK_ELEMENTS # noqa: PLC0415
|
|
781
799
|
|
|
782
|
-
# Treat additional tags as block-producing for trailing newline purposes.
|
|
783
|
-
# These may be inline in HTML spec but produce block output in our Markdown conversion.
|
|
784
800
|
blockish = set(BLOCK_ELEMENTS) | {
|
|
785
801
|
"textarea",
|
|
786
802
|
"dialog",
|
|
@@ -880,11 +896,12 @@ class StreamingSink(OutputSink):
|
|
|
880
896
|
|
|
881
897
|
|
|
882
898
|
def _process_html_core(
|
|
883
|
-
source: str | BeautifulSoup,
|
|
899
|
+
source: str | bytes | BeautifulSoup,
|
|
884
900
|
sink: OutputSink,
|
|
885
901
|
*,
|
|
886
902
|
whitespace_handler: WhitespaceHandler,
|
|
887
903
|
parser: str | None = None,
|
|
904
|
+
source_encoding: str = "utf-8",
|
|
888
905
|
autolinks: bool,
|
|
889
906
|
br_in_tables: bool,
|
|
890
907
|
bullets: str,
|
|
@@ -915,7 +932,12 @@ def _process_html_core(
|
|
|
915
932
|
token = _ancestor_cache.set({})
|
|
916
933
|
|
|
917
934
|
try:
|
|
918
|
-
if isinstance(source, str):
|
|
935
|
+
if isinstance(source, (str, bytes)):
|
|
936
|
+
original_source = source
|
|
937
|
+
if isinstance(source, bytes):
|
|
938
|
+
source = source.decode(source_encoding or "utf-8", errors="replace")
|
|
939
|
+
original_source = source
|
|
940
|
+
|
|
919
941
|
if strip_newlines:
|
|
920
942
|
source = source.replace("\n", " ").replace("\r", " ") # pragma: no cover
|
|
921
943
|
|
|
@@ -926,7 +948,36 @@ def _process_html_core(
|
|
|
926
948
|
if parser == "lxml" and not LXML_AVAILABLE: # pragma: no cover
|
|
927
949
|
raise MissingDependencyError("lxml", "pip install html-to-markdown[lxml]")
|
|
928
950
|
|
|
951
|
+
needs_leading_whitespace_fix = (
|
|
952
|
+
parser == "lxml"
|
|
953
|
+
and isinstance(original_source, str)
|
|
954
|
+
and original_source.startswith((" ", "\t", "\n", "\r"))
|
|
955
|
+
)
|
|
956
|
+
|
|
929
957
|
source = BeautifulSoup(source, parser)
|
|
958
|
+
|
|
959
|
+
if parser == "lxml" and needs_leading_whitespace_fix and isinstance(original_source, str):
|
|
960
|
+
body = source.find("body")
|
|
961
|
+
if body and isinstance(body, Tag):
|
|
962
|
+
children = list(body.children)
|
|
963
|
+
|
|
964
|
+
if (
|
|
965
|
+
len(children) == 1
|
|
966
|
+
and isinstance(children[0], NavigableString)
|
|
967
|
+
and original_source.startswith((" ", "\t", "\n", "\r"))
|
|
968
|
+
and not str(children[0]).startswith((" ", "\t", "\n", "\r"))
|
|
969
|
+
):
|
|
970
|
+
first_child = children[0]
|
|
971
|
+
|
|
972
|
+
leading_ws = ""
|
|
973
|
+
for char in original_source:
|
|
974
|
+
if char in " \t":
|
|
975
|
+
leading_ws += char
|
|
976
|
+
else:
|
|
977
|
+
break
|
|
978
|
+
|
|
979
|
+
new_text = NavigableString(leading_ws + str(first_child))
|
|
980
|
+
first_child.replace_with(new_text)
|
|
930
981
|
else:
|
|
931
982
|
raise EmptyHtmlError
|
|
932
983
|
|
|
@@ -998,11 +1049,12 @@ def _process_html_core(
|
|
|
998
1049
|
|
|
999
1050
|
|
|
1000
1051
|
def convert_to_markdown_stream(
|
|
1001
|
-
source: str | BeautifulSoup,
|
|
1052
|
+
source: str | bytes | BeautifulSoup,
|
|
1002
1053
|
*,
|
|
1003
1054
|
chunk_size: int = 1024,
|
|
1004
1055
|
progress_callback: Callable[[int, int], None] | None = None,
|
|
1005
1056
|
parser: str | None = None,
|
|
1057
|
+
source_encoding: str = "utf-8",
|
|
1006
1058
|
autolinks: bool = True,
|
|
1007
1059
|
br_in_tables: bool = False,
|
|
1008
1060
|
bullets: str = "*+-",
|
|
@@ -1022,6 +1074,10 @@ def convert_to_markdown_stream(
|
|
|
1022
1074
|
list_indent_type: Literal["spaces", "tabs"] = "spaces",
|
|
1023
1075
|
list_indent_width: int = 4,
|
|
1024
1076
|
newline_style: Literal["spaces", "backslash"] = SPACES,
|
|
1077
|
+
preprocess_html: bool = False,
|
|
1078
|
+
preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
|
|
1079
|
+
remove_forms: bool = True,
|
|
1080
|
+
remove_navigation: bool = True,
|
|
1025
1081
|
strip: str | Iterable[str] | None = None,
|
|
1026
1082
|
strip_newlines: bool = False,
|
|
1027
1083
|
strong_em_symbol: Literal["*", "_"] = ASTERISK,
|
|
@@ -1033,8 +1089,22 @@ def convert_to_markdown_stream(
|
|
|
1033
1089
|
) -> Generator[str, None, None]:
|
|
1034
1090
|
sink = StreamingSink(chunk_size, progress_callback)
|
|
1035
1091
|
|
|
1036
|
-
if isinstance(source, str):
|
|
1037
|
-
|
|
1092
|
+
if isinstance(source, bytes):
|
|
1093
|
+
source = source.decode(source_encoding or "utf-8", errors="replace")
|
|
1094
|
+
|
|
1095
|
+
if isinstance(source, str) and preprocess_html and create_preprocessor is not None and preprocess_fn is not None:
|
|
1096
|
+
config = create_preprocessor(
|
|
1097
|
+
preset=preprocessing_preset,
|
|
1098
|
+
remove_navigation=remove_navigation,
|
|
1099
|
+
remove_forms=remove_forms,
|
|
1100
|
+
)
|
|
1101
|
+
source = preprocess_fn(source, **config)
|
|
1102
|
+
|
|
1103
|
+
if isinstance(source, (str, bytes)):
|
|
1104
|
+
if isinstance(source, bytes):
|
|
1105
|
+
sink.total_bytes = len(source)
|
|
1106
|
+
else:
|
|
1107
|
+
sink.total_bytes = len(source)
|
|
1038
1108
|
elif isinstance(source, BeautifulSoup):
|
|
1039
1109
|
sink.total_bytes = len(str(source))
|
|
1040
1110
|
|
|
@@ -1045,6 +1115,7 @@ def convert_to_markdown_stream(
|
|
|
1045
1115
|
sink,
|
|
1046
1116
|
whitespace_handler=whitespace_handler,
|
|
1047
1117
|
parser=parser,
|
|
1118
|
+
source_encoding=source_encoding,
|
|
1048
1119
|
autolinks=autolinks,
|
|
1049
1120
|
br_in_tables=br_in_tables,
|
|
1050
1121
|
bullets=bullets,
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 1.13.0
|
|
3
|
+
Version: 1.14.1
|
|
4
4
|
Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
|
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -32,6 +32,8 @@ Description-Content-Type: text/markdown
|
|
|
32
32
|
License-File: LICENSE
|
|
33
33
|
Requires-Dist: beautifulsoup4>=4.13.5
|
|
34
34
|
Requires-Dist: nh3>=0.3
|
|
35
|
+
Provides-Extra: html5lib
|
|
36
|
+
Requires-Dist: beautifulsoup4[html5lib]>=4.13.5; extra == "html5lib"
|
|
35
37
|
Provides-Extra: lxml
|
|
36
38
|
Requires-Dist: beautifulsoup4[lxml]>=4.13.5; extra == "lxml"
|
|
37
39
|
Dynamic: license-file
|
|
@@ -40,7 +42,7 @@ Dynamic: license-file
|
|
|
40
42
|
|
|
41
43
|
A modern, fully typed Python library for converting HTML to Markdown. This library is a completely rewritten fork
|
|
42
44
|
of [markdownify](https://pypi.org/project/markdownify/) with a modernized codebase, strict type safety and support for
|
|
43
|
-
Python 3.9+.
|
|
45
|
+
Python 3.10+.
|
|
44
46
|
|
|
45
47
|
## Support This Project
|
|
46
48
|
|
|
@@ -64,8 +66,10 @@ Your support helps maintain and improve this library for the community.
|
|
|
64
66
|
- **Custom Converters**: Extensible converter system for custom HTML tag handling
|
|
65
67
|
- **List Formatting**: Configurable list indentation with Discord/Slack compatibility
|
|
66
68
|
- **HTML Preprocessing**: Clean messy HTML with configurable aggressiveness levels
|
|
69
|
+
- **Bytes Input Support**: Direct handling of bytes input with automatic encoding detection and configurable source encoding
|
|
67
70
|
- **Whitespace Control**: Normalized or strict whitespace preservation modes
|
|
68
71
|
- **BeautifulSoup Integration**: Support for pre-configured BeautifulSoup instances
|
|
72
|
+
- **Parser Normalization**: Consistent output across all supported parsers (html.parser, lxml, html5lib)
|
|
69
73
|
- **Robustly Tested**: Comprehensive unit tests and integration tests covering all conversion scenarios
|
|
70
74
|
|
|
71
75
|
## Installation
|
|
@@ -74,17 +78,27 @@ Your support helps maintain and improve this library for the community.
|
|
|
74
78
|
pip install html-to-markdown
|
|
75
79
|
```
|
|
76
80
|
|
|
77
|
-
### Optional
|
|
81
|
+
### Optional Parsers
|
|
78
82
|
|
|
79
|
-
For improved performance, you can install with
|
|
83
|
+
For improved performance and compatibility, you can install with optional parsers:
|
|
80
84
|
|
|
81
85
|
```shell
|
|
86
|
+
# Fast lxml parser (recommended)
|
|
82
87
|
pip install html-to-markdown[lxml]
|
|
88
|
+
|
|
89
|
+
# Standards-compliant html5lib parser
|
|
90
|
+
pip install html-to-markdown[html5lib]
|
|
83
91
|
```
|
|
84
92
|
|
|
85
|
-
|
|
93
|
+
**Parser Options:**
|
|
94
|
+
|
|
95
|
+
- **html.parser** (default): Built-in Python parser, no dependencies
|
|
96
|
+
- **lxml**: **Recommended** - Fastest parser with good malformed HTML handling
|
|
97
|
+
- **html5lib**: Most standards-compliant, handles edge cases best
|
|
86
98
|
|
|
87
|
-
The library automatically uses lxml when available
|
|
99
|
+
The library automatically uses lxml when available and **normalizes output to ensure consistent results regardless of parser choice**. We recommend using the **lxml parser for optimal performance** - it's significantly faster than the other options while maintaining excellent compatibility.
|
|
100
|
+
|
|
101
|
+
You can explicitly specify a parser using the `parser` parameter.
|
|
88
102
|
|
|
89
103
|
## Quick Start
|
|
90
104
|
|
|
@@ -149,6 +163,39 @@ soup = BeautifulSoup(html, "lxml") # Note: lxml requires additional installatio
|
|
|
149
163
|
markdown = convert_to_markdown(soup)
|
|
150
164
|
```
|
|
151
165
|
|
|
166
|
+
### Working with Bytes and Encodings
|
|
167
|
+
|
|
168
|
+
The library can directly handle bytes input, which is useful when working with HTTP responses or files:
|
|
169
|
+
|
|
170
|
+
```python
|
|
171
|
+
import requests
|
|
172
|
+
from html_to_markdown import convert_to_markdown
|
|
173
|
+
|
|
174
|
+
# Working with HTTP responses (bytes)
|
|
175
|
+
response = requests.get("https://example.com")
|
|
176
|
+
markdown = convert_to_markdown(response.content) # response.content returns bytes
|
|
177
|
+
|
|
178
|
+
# Specify encoding for non-UTF-8 content
|
|
179
|
+
response = requests.get("https://example.fr")
|
|
180
|
+
markdown = convert_to_markdown(response.content, source_encoding="latin-1")
|
|
181
|
+
|
|
182
|
+
# Common encoding examples
|
|
183
|
+
html_bytes = b"<p>Hello World</p>"
|
|
184
|
+
markdown = convert_to_markdown(html_bytes) # UTF-8 by default
|
|
185
|
+
|
|
186
|
+
# Latin-1 encoded content
|
|
187
|
+
html_latin1 = "<p>Café résumé</p>".encode("latin-1")
|
|
188
|
+
markdown = convert_to_markdown(html_latin1, source_encoding="latin-1")
|
|
189
|
+
|
|
190
|
+
# Windows-1252 encoded content
|
|
191
|
+
html_windows = '<p>Smart quotes: "Hello"</p>'.encode("windows-1252")
|
|
192
|
+
markdown = convert_to_markdown(html_windows, source_encoding="windows-1252")
|
|
193
|
+
|
|
194
|
+
# Piping bytes from command line
|
|
195
|
+
# echo '<p>Hello</p>' | python -m html_to_markdown
|
|
196
|
+
# cat file.html | python -m html_to_markdown --source-encoding latin-1
|
|
197
|
+
```
|
|
198
|
+
|
|
152
199
|
## Common Use Cases
|
|
153
200
|
|
|
154
201
|
### Discord/Slack Compatible Lists
|
|
@@ -681,6 +728,131 @@ This library provides comprehensive support for all modern HTML5 elements:
|
|
|
681
728
|
|
|
682
729
|
- `<math>` (MathML support)
|
|
683
730
|
|
|
731
|
+
## Command Line Interface
|
|
732
|
+
|
|
733
|
+
The library includes a full-featured CLI tool with complete API parity:
|
|
734
|
+
|
|
735
|
+
### Basic Usage
|
|
736
|
+
|
|
737
|
+
```bash
|
|
738
|
+
# Convert HTML file to Markdown
|
|
739
|
+
html-to-markdown document.html
|
|
740
|
+
|
|
741
|
+
# Convert from stdin
|
|
742
|
+
echo '<h1>Title</h1><p>Content</p>' | html-to-markdown
|
|
743
|
+
|
|
744
|
+
# Read HTML file with specific encoding
|
|
745
|
+
html-to-markdown document.html --source-encoding latin-1
|
|
746
|
+
|
|
747
|
+
# Pipe bytes with encoding specification
|
|
748
|
+
cat document.html | html-to-markdown --source-encoding utf-8
|
|
749
|
+
```
|
|
750
|
+
|
|
751
|
+
### Advanced CLI Options
|
|
752
|
+
|
|
753
|
+
```bash
|
|
754
|
+
# Discord/Slack compatible lists (2-space indent)
|
|
755
|
+
html-to-markdown file.html --list-indent-width 2
|
|
756
|
+
|
|
757
|
+
# Clean messy HTML before conversion
|
|
758
|
+
html-to-markdown file.html --preprocess-html --preprocessing-preset aggressive
|
|
759
|
+
|
|
760
|
+
# Custom heading style
|
|
761
|
+
html-to-markdown file.html --heading-style atx
|
|
762
|
+
|
|
763
|
+
# Strip specific tags
|
|
764
|
+
html-to-markdown file.html --strip nav aside footer
|
|
765
|
+
|
|
766
|
+
# Convert only specific tags
|
|
767
|
+
html-to-markdown file.html --convert h1 h2 p a strong em
|
|
768
|
+
|
|
769
|
+
# Enable streaming for large files with progress
|
|
770
|
+
html-to-markdown large.html --stream-processing --show-progress
|
|
771
|
+
|
|
772
|
+
# Use specific parser (lxml recommended for best performance)
|
|
773
|
+
html-to-markdown file.html --parser lxml
|
|
774
|
+
```
|
|
775
|
+
|
|
776
|
+
### Real-World CLI Examples
|
|
777
|
+
|
|
778
|
+
```bash
|
|
779
|
+
# Download and convert a webpage
|
|
780
|
+
curl -s https://example.com | html-to-markdown --preprocess-html > output.md
|
|
781
|
+
|
|
782
|
+
# Process multiple files with different encodings
|
|
783
|
+
for file in *.html; do
|
|
784
|
+
html-to-markdown "$file" --source-encoding latin-1 > "${file%.html}.md"
|
|
785
|
+
done
|
|
786
|
+
|
|
787
|
+
# Convert with custom formatting for documentation
|
|
788
|
+
html-to-markdown docs.html \
|
|
789
|
+
--heading-style atx \
|
|
790
|
+
--list-indent-width 2 \
|
|
791
|
+
--highlight-style bold \
|
|
792
|
+
--no-extract-metadata > docs.md
|
|
793
|
+
```
|
|
794
|
+
|
|
795
|
+
## Differences from markdownify
|
|
796
|
+
|
|
797
|
+
html-to-markdown is a modern, completely rewritten library inspired by markdownify but with significant improvements:
|
|
798
|
+
|
|
799
|
+
### Key Advantages
|
|
800
|
+
|
|
801
|
+
| Feature | markdownify | html-to-markdown |
|
|
802
|
+
| ----------------------- | ---------------- | ---------------------------------------------------------------------- |
|
|
803
|
+
| **Type Safety** | No type hints | Full MyPy compliance with strict typing |
|
|
804
|
+
| **Python Support** | Python 3.6+ | Python 3.10+ with modern features |
|
|
805
|
+
| **HTML5 Elements** | Basic support | Comprehensive HTML5 support (semantic, form, table, interactive, etc.) |
|
|
806
|
+
| **Table Handling** | Simple tables | Advanced rowspan/colspan support |
|
|
807
|
+
| **Streaming** | Memory-intensive | Memory-efficient streaming for large documents |
|
|
808
|
+
| **CLI Tool** | Basic | Full-featured CLI with all API options |
|
|
809
|
+
| **Preprocessing** | None | Built-in HTML cleaning with configurable presets |
|
|
810
|
+
| **Metadata Extraction** | None | Automatic title/meta extraction as comments |
|
|
811
|
+
| **Task Lists** | None | GitHub-compatible checkbox conversion |
|
|
812
|
+
| **Bytes Input** | None | Direct bytes support with configurable encoding |
|
|
813
|
+
| **Custom Converters** | Class-based | Function-based with simpler API |
|
|
814
|
+
| **Testing** | Basic | Comprehensive test suite with 100% coverage |
|
|
815
|
+
| **Performance** | Standard | Significantly faster with recommended lxml parser |
|
|
816
|
+
|
|
817
|
+
### API Compatibility
|
|
818
|
+
|
|
819
|
+
While inspired by markdownify, html-to-markdown uses a more modern, explicit API:
|
|
820
|
+
|
|
821
|
+
```python
|
|
822
|
+
# markdownify style
|
|
823
|
+
from markdownify import markdownify
|
|
824
|
+
|
|
825
|
+
result = markdownify(html, heading_style="atx", strip=["nav"])
|
|
826
|
+
|
|
827
|
+
# html-to-markdown style (more explicit)
|
|
828
|
+
from html_to_markdown import convert_to_markdown
|
|
829
|
+
|
|
830
|
+
result = convert_to_markdown(html, heading_style="atx", strip=["nav"])
|
|
831
|
+
```
|
|
832
|
+
|
|
833
|
+
### Migration from markdownify
|
|
834
|
+
|
|
835
|
+
Most markdownify code can be easily migrated:
|
|
836
|
+
|
|
837
|
+
```python
|
|
838
|
+
# Before (markdownify)
|
|
839
|
+
from markdownify import markdownify as md
|
|
840
|
+
|
|
841
|
+
result = md(html, heading_style="atx")
|
|
842
|
+
|
|
843
|
+
# After (html-to-markdown)
|
|
844
|
+
from html_to_markdown import convert_to_markdown
|
|
845
|
+
|
|
846
|
+
result = convert_to_markdown(html, heading_style="atx")
|
|
847
|
+
```
|
|
848
|
+
|
|
849
|
+
Key changes when migrating:
|
|
850
|
+
|
|
851
|
+
- Import path: `markdownify` → `html_to_markdown`
|
|
852
|
+
- Function name: `markdownify()` → `convert_to_markdown()`
|
|
853
|
+
- All parameter names remain the same for common options
|
|
854
|
+
- New parameters available for advanced features (preprocessing, streaming, etc.)
|
|
855
|
+
|
|
684
856
|
## Acknowledgments
|
|
685
857
|
|
|
686
|
-
Special thanks to the original [markdownify](https://pypi.org/project/markdownify/) project creators and contributors.
|
|
858
|
+
Special thanks to the original [markdownify](https://pypi.org/project/markdownify/) project creators and contributors for the inspiration and foundation that made this modern implementation possible.
|
|
@@ -5,7 +5,7 @@ requires = [ "setuptools>=78.1" ]
|
|
|
5
5
|
|
|
6
6
|
[project]
|
|
7
7
|
name = "html-to-markdown"
|
|
8
|
-
version = "1.13.0"
|
|
8
|
+
version = "1.14.1"
|
|
9
9
|
description = "A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options"
|
|
10
10
|
readme = "README.md"
|
|
11
11
|
keywords = [
|
|
@@ -46,8 +46,9 @@ dependencies = [
|
|
|
46
46
|
"beautifulsoup4>=4.13.5",
|
|
47
47
|
"nh3>=0.3",
|
|
48
48
|
]
|
|
49
|
-
optional-dependencies.lxml = [ "beautifulsoup4[lxml]>=4.13.5" ]
|
|
49
|
+
optional-dependencies.html5lib = [ "beautifulsoup4[html5lib]>=4.13.5" ]
|
|
50
50
|
|
|
51
|
+
optional-dependencies.lxml = [ "beautifulsoup4[lxml]>=4.13.5" ]
|
|
51
52
|
urls.Changelog = "https://github.com/Goldziher/html-to-markdown/releases"
|
|
52
53
|
urls.Homepage = "https://github.com/Goldziher/html-to-markdown"
|
|
53
54
|
urls.Issues = "https://github.com/Goldziher/html-to-markdown/issues"
|
|
@@ -57,14 +58,18 @@ scripts.html_to_markdown = "html_to_markdown.__main__:cli"
|
|
|
57
58
|
|
|
58
59
|
[dependency-groups]
|
|
59
60
|
dev = [
|
|
61
|
+
"beautifulsoup4[html5lib]>=4.13.5",
|
|
62
|
+
"beautifulsoup4[lxml]>=4.13.5",
|
|
60
63
|
"covdefaults>=2.3",
|
|
61
|
-
"
|
|
64
|
+
"memray>=1.18; sys_platform!='win32'",
|
|
65
|
+
"mypy>=1.18.2",
|
|
62
66
|
"pre-commit>=4.3",
|
|
67
|
+
"psutil>=7.1; sys_platform!='win32'",
|
|
63
68
|
"pytest>=8.4.2",
|
|
64
69
|
"pytest-benchmark>=5.1",
|
|
65
70
|
"pytest-cov>=7",
|
|
66
|
-
"pytest-mock>=3.15",
|
|
67
|
-
"ruff>=0.13",
|
|
71
|
+
"pytest-mock>=3.15.1",
|
|
72
|
+
"ruff>=0.13.1",
|
|
68
73
|
"types-beautifulsoup4>=4.12.0.20250516",
|
|
69
74
|
"types-psutil>=7.0.0.20250822",
|
|
70
75
|
"uv-bump",
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{html_to_markdown-1.13.0 → html_to_markdown-1.14.1}/html_to_markdown.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{html_to_markdown-1.13.0 → html_to_markdown-1.14.1}/html_to_markdown.egg-info/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|