html-to-markdown 1.2.0__tar.gz → 1.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- {html_to_markdown-1.2.0 → html_to_markdown-1.3.0}/LICENSE +1 -1
- html_to_markdown-1.3.0/PKG-INFO +242 -0
- html_to_markdown-1.3.0/README.md +213 -0
- html_to_markdown-1.3.0/html_to_markdown/__init__.py +5 -0
- {html_to_markdown-1.2.0 → html_to_markdown-1.3.0}/html_to_markdown/converters.py +10 -11
- {html_to_markdown-1.2.0 → html_to_markdown-1.3.0}/html_to_markdown/processing.py +14 -9
- html_to_markdown-1.3.0/html_to_markdown.egg-info/PKG-INFO +242 -0
- html_to_markdown-1.3.0/html_to_markdown.egg-info/SOURCES.txt +16 -0
- html_to_markdown-1.3.0/html_to_markdown.egg-info/dependency_links.txt +1 -0
- html_to_markdown-1.3.0/html_to_markdown.egg-info/requires.txt +1 -0
- html_to_markdown-1.3.0/html_to_markdown.egg-info/top_level.txt +1 -0
- {html_to_markdown-1.2.0 → html_to_markdown-1.3.0}/pyproject.toml +18 -10
- html_to_markdown-1.3.0/setup.cfg +4 -0
- html_to_markdown-1.2.0/.gitignore +0 -21
- html_to_markdown-1.2.0/PKG-INFO +0 -102
- html_to_markdown-1.2.0/README.md +0 -75
- html_to_markdown-1.2.0/html_to_markdown/__init__.py +0 -5
- html_to_markdown-1.2.0/html_to_markdown/legacy.py +0 -89
- {html_to_markdown-1.2.0 → html_to_markdown-1.3.0}/html_to_markdown/__main__.py +0 -0
- {html_to_markdown-1.2.0 → html_to_markdown-1.3.0}/html_to_markdown/cli.py +0 -0
- {html_to_markdown-1.2.0 → html_to_markdown-1.3.0}/html_to_markdown/constants.py +0 -0
- {html_to_markdown-1.2.0 → html_to_markdown-1.3.0}/html_to_markdown/py.typed +0 -0
- {html_to_markdown-1.2.0 → html_to_markdown-1.3.0}/html_to_markdown/utils.py +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
The MIT License (MIT)
|
|
2
2
|
|
|
3
3
|
Copyright 2012-2018 Matthew Tretter
|
|
4
|
-
Copyright 2024 Na'aman Hirschfeld
|
|
4
|
+
Copyright 2024-2025 Na'aman Hirschfeld
|
|
5
5
|
|
|
6
6
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
7
7
|
of this software and associated documentation files (the "Software"), to deal
|
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: html-to-markdown
|
|
3
|
+
Version: 1.3.0
|
|
4
|
+
Summary: Convert HTML to markdown
|
|
5
|
+
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: homepage, https://github.com/Goldziher/html-to-markdown
|
|
8
|
+
Keywords: converter,html,markdown,text-extraction,text-processing
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Topic :: Text Processing
|
|
19
|
+
Classifier: Topic :: Text Processing :: Markup
|
|
20
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
21
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
22
|
+
Classifier: Topic :: Utilities
|
|
23
|
+
Classifier: Typing :: Typed
|
|
24
|
+
Requires-Python: >=3.9
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
License-File: LICENSE
|
|
27
|
+
Requires-Dist: beautifulsoup4>=4.12.3
|
|
28
|
+
Dynamic: license-file
|
|
29
|
+
|
|
30
|
+
# html-to-markdown
|
|
31
|
+
|
|
32
|
+
A modern, fully typed Python library for converting HTML to Markdown. This library is a completely rewritten fork
|
|
33
|
+
of [markdownify](https://pypi.org/project/markdownify/) with a modernized codebase, strict type safety and support for
|
|
34
|
+
Python 3.9+.
|
|
35
|
+
|
|
36
|
+
## Features
|
|
37
|
+
|
|
38
|
+
- Full type safety with strict MyPy adherence
|
|
39
|
+
- Functional API design
|
|
40
|
+
- Extensive test coverage
|
|
41
|
+
- Configurable conversion options
|
|
42
|
+
- CLI tool for easy conversions
|
|
43
|
+
- Support for pre-configured BeautifulSoup instances
|
|
44
|
+
- Strict semver versioning
|
|
45
|
+
|
|
46
|
+
## Installation
|
|
47
|
+
|
|
48
|
+
```shell
|
|
49
|
+
pip install html-to-markdown
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Quick Start
|
|
53
|
+
|
|
54
|
+
Convert HTML to Markdown with a single function call:
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
from html_to_markdown import convert_to_markdown
|
|
58
|
+
|
|
59
|
+
html = """
|
|
60
|
+
<article>
|
|
61
|
+
<h1>Welcome</h1>
|
|
62
|
+
<p>This is a <strong>sample</strong> with a <a href="https://example.com">link</a>.</p>
|
|
63
|
+
<ul>
|
|
64
|
+
<li>Item 1</li>
|
|
65
|
+
<li>Item 2</li>
|
|
66
|
+
</ul>
|
|
67
|
+
</article>
|
|
68
|
+
"""
|
|
69
|
+
|
|
70
|
+
markdown = convert_to_markdown(html)
|
|
71
|
+
print(markdown)
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Output:
|
|
75
|
+
|
|
76
|
+
```markdown
|
|
77
|
+
# Welcome
|
|
78
|
+
|
|
79
|
+
This is a **sample** with a [link](https://example.com).
|
|
80
|
+
|
|
81
|
+
* Item 1
|
|
82
|
+
* Item 2
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### Working with BeautifulSoup
|
|
86
|
+
|
|
87
|
+
If you need more control over HTML parsing, you can pass a pre-configured BeautifulSoup instance:
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
from bs4 import BeautifulSoup
|
|
91
|
+
from html_to_markdown import convert_to_markdown
|
|
92
|
+
|
|
93
|
+
# Configure BeautifulSoup with your preferred parser
|
|
94
|
+
soup = BeautifulSoup(html, "lxml") # Note: lxml requires additional installation
|
|
95
|
+
markdown = convert_to_markdown(soup)
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## Advanced Usage
|
|
99
|
+
|
|
100
|
+
### Customizing Conversion Options
|
|
101
|
+
|
|
102
|
+
The library offers extensive customization through various options:
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
from html_to_markdown import convert_to_markdown
|
|
106
|
+
|
|
107
|
+
html = "<div>Your content here...</div>"
|
|
108
|
+
markdown = convert_to_markdown(
|
|
109
|
+
html,
|
|
110
|
+
heading_style="atx", # Use # style headers
|
|
111
|
+
strong_em_symbol="*", # Use * for bold/italic
|
|
112
|
+
bullets="*+-", # Define bullet point characters
|
|
113
|
+
wrap=True, # Enable text wrapping
|
|
114
|
+
wrap_width=100, # Set wrap width
|
|
115
|
+
escape_asterisks=True, # Escape * characters
|
|
116
|
+
code_language="python", # Default code block language
|
|
117
|
+
)
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
### Custom Converters
|
|
121
|
+
|
|
122
|
+
You can provide your own conversion functions for specific HTML tags:
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
from bs4.element import Tag
|
|
126
|
+
from html_to_markdown import convert_to_markdown
|
|
127
|
+
|
|
128
|
+
# Define a custom converter for the <b> tag
|
|
129
|
+
def custom_bold_converter(*, tag: Tag, text: str, **kwargs) -> str:
|
|
130
|
+
return f"IMPORTANT: {text}"
|
|
131
|
+
|
|
132
|
+
html = "<p>This is a <b>bold statement</b>.</p>"
|
|
133
|
+
markdown = convert_to_markdown(html, custom_converters={"b": custom_bold_converter})
|
|
134
|
+
print(markdown)
|
|
135
|
+
# Output: This is a IMPORTANT: bold statement.
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
Custom converters take precedence over the built-in converters and can be used alongside other configuration options.
|
|
139
|
+
|
|
140
|
+
### Configuration Options
|
|
141
|
+
|
|
142
|
+
| Option | Type | Default | Description |
|
|
143
|
+
| -------------------- | ---- | -------------- | ------------------------------------------------------ |
|
|
144
|
+
| `autolinks` | bool | `True` | Auto-convert URLs to Markdown links |
|
|
145
|
+
| `bullets` | str | `'*+-'` | Characters to use for bullet points |
|
|
146
|
+
| `code_language` | str | `''` | Default language for code blocks |
|
|
147
|
+
| `heading_style` | str | `'underlined'` | Header style (`'underlined'`, `'atx'`, `'atx_closed'`) |
|
|
148
|
+
| `escape_asterisks` | bool | `True` | Escape * characters |
|
|
149
|
+
| `escape_underscores` | bool | `True` | Escape _ characters |
|
|
150
|
+
| `wrap` | bool | `False` | Enable text wrapping |
|
|
151
|
+
| `wrap_width` | int | `80` | Text wrap width |
|
|
152
|
+
|
|
153
|
+
For a complete list of options, see the [Configuration](#configuration) section below.
|
|
154
|
+
|
|
155
|
+
## CLI Usage
|
|
156
|
+
|
|
157
|
+
Convert HTML files directly from the command line:
|
|
158
|
+
|
|
159
|
+
```shell
|
|
160
|
+
# Convert a file
|
|
161
|
+
html_to_markdown input.html > output.md
|
|
162
|
+
|
|
163
|
+
# Process stdin
|
|
164
|
+
cat input.html | html_to_markdown > output.md
|
|
165
|
+
|
|
166
|
+
# Use custom options
|
|
167
|
+
html_to_markdown --heading-style atx --wrap --wrap-width 100 input.html > output.md
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
View all available options:
|
|
171
|
+
|
|
172
|
+
```shell
|
|
173
|
+
html_to_markdown --help
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
## Migration from Markdownify
|
|
177
|
+
|
|
178
|
+
For existing projects using Markdownify, a compatibility layer is provided:
|
|
179
|
+
|
|
180
|
+
```python
|
|
181
|
+
# Old code
|
|
182
|
+
from markdownify import markdownify as md
|
|
183
|
+
|
|
184
|
+
# New code - works the same way
|
|
185
|
+
from html_to_markdown import markdownify as md
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
The `markdownify` function is an alias for `convert_to_markdown` and provides identical functionality.
|
|
189
|
+
|
|
190
|
+
## Configuration
|
|
191
|
+
|
|
192
|
+
Full list of configuration options:
|
|
193
|
+
|
|
194
|
+
- `autolinks`: Convert valid URLs to Markdown links automatically
|
|
195
|
+
- `bullets`: Characters to use for bullet points in lists
|
|
196
|
+
- `code_language`: Default language for fenced code blocks
|
|
197
|
+
- `code_language_callback`: Function to determine code block language
|
|
198
|
+
- `convert`: List of HTML tags to convert (None = all supported tags)
|
|
199
|
+
- `default_title`: Use default titles for elements like links
|
|
200
|
+
- `escape_asterisks`: Escape * characters
|
|
201
|
+
- `escape_misc`: Escape miscellaneous Markdown characters
|
|
202
|
+
- `escape_underscores`: Escape _ characters
|
|
203
|
+
- `heading_style`: Header style (underlined/atx/atx_closed)
|
|
204
|
+
- `keep_inline_images_in`: Tags where inline images should be kept
|
|
205
|
+
- `newline_style`: Style for handling newlines (spaces/backslash)
|
|
206
|
+
- `strip`: Tags to remove from output
|
|
207
|
+
- `strong_em_symbol`: Symbol for strong/emphasized text (\* or \_)
|
|
208
|
+
- `sub_symbol`: Symbol for subscript text
|
|
209
|
+
- `sup_symbol`: Symbol for superscript text
|
|
210
|
+
- `wrap`: Enable text wrapping
|
|
211
|
+
- `wrap_width`: Width for text wrapping
|
|
212
|
+
- `convert_as_inline`: Treat content as inline elements
|
|
213
|
+
- `custom_converters`: A mapping of HTML tag names to custom converter functions
|
|
214
|
+
|
|
215
|
+
## Contribution
|
|
216
|
+
|
|
217
|
+
This library is open to contribution. Feel free to open issues or submit PRs. It's better to discuss issues before
|
|
218
|
+
submitting PRs to avoid disappointment.
|
|
219
|
+
|
|
220
|
+
### Local Development
|
|
221
|
+
|
|
222
|
+
1. Clone the repo
|
|
223
|
+
|
|
224
|
+
1. Install the system dependencies
|
|
225
|
+
|
|
226
|
+
1. Install the full dependencies with `uv sync`
|
|
227
|
+
|
|
228
|
+
1. Install the pre-commit hooks with:
|
|
229
|
+
|
|
230
|
+
```shell
|
|
231
|
+
pre-commit install && pre-commit install --hook-type commit-msg
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
1. Make your changes and submit a PR
|
|
235
|
+
|
|
236
|
+
## License
|
|
237
|
+
|
|
238
|
+
This library uses the MIT license.
|
|
239
|
+
|
|
240
|
+
## Acknowledgments
|
|
241
|
+
|
|
242
|
+
Special thanks to the original [markdownify](https://pypi.org/project/markdownify/) project creators and contributors.
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
# html-to-markdown
|
|
2
|
+
|
|
3
|
+
A modern, fully typed Python library for converting HTML to Markdown. This library is a completely rewritten fork
|
|
4
|
+
of [markdownify](https://pypi.org/project/markdownify/) with a modernized codebase, strict type safety and support for
|
|
5
|
+
Python 3.9+.
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- Full type safety with strict MyPy adherence
|
|
10
|
+
- Functional API design
|
|
11
|
+
- Extensive test coverage
|
|
12
|
+
- Configurable conversion options
|
|
13
|
+
- CLI tool for easy conversions
|
|
14
|
+
- Support for pre-configured BeautifulSoup instances
|
|
15
|
+
- Strict semver versioning
|
|
16
|
+
|
|
17
|
+
## Installation
|
|
18
|
+
|
|
19
|
+
```shell
|
|
20
|
+
pip install html-to-markdown
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Quick Start
|
|
24
|
+
|
|
25
|
+
Convert HTML to Markdown with a single function call:
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
from html_to_markdown import convert_to_markdown
|
|
29
|
+
|
|
30
|
+
html = """
|
|
31
|
+
<article>
|
|
32
|
+
<h1>Welcome</h1>
|
|
33
|
+
<p>This is a <strong>sample</strong> with a <a href="https://example.com">link</a>.</p>
|
|
34
|
+
<ul>
|
|
35
|
+
<li>Item 1</li>
|
|
36
|
+
<li>Item 2</li>
|
|
37
|
+
</ul>
|
|
38
|
+
</article>
|
|
39
|
+
"""
|
|
40
|
+
|
|
41
|
+
markdown = convert_to_markdown(html)
|
|
42
|
+
print(markdown)
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Output:
|
|
46
|
+
|
|
47
|
+
```markdown
|
|
48
|
+
# Welcome
|
|
49
|
+
|
|
50
|
+
This is a **sample** with a [link](https://example.com).
|
|
51
|
+
|
|
52
|
+
* Item 1
|
|
53
|
+
* Item 2
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### Working with BeautifulSoup
|
|
57
|
+
|
|
58
|
+
If you need more control over HTML parsing, you can pass a pre-configured BeautifulSoup instance:
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
from bs4 import BeautifulSoup
|
|
62
|
+
from html_to_markdown import convert_to_markdown
|
|
63
|
+
|
|
64
|
+
# Configure BeautifulSoup with your preferred parser
|
|
65
|
+
soup = BeautifulSoup(html, "lxml") # Note: lxml requires additional installation
|
|
66
|
+
markdown = convert_to_markdown(soup)
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Advanced Usage
|
|
70
|
+
|
|
71
|
+
### Customizing Conversion Options
|
|
72
|
+
|
|
73
|
+
The library offers extensive customization through various options:
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
from html_to_markdown import convert_to_markdown
|
|
77
|
+
|
|
78
|
+
html = "<div>Your content here...</div>"
|
|
79
|
+
markdown = convert_to_markdown(
|
|
80
|
+
html,
|
|
81
|
+
heading_style="atx", # Use # style headers
|
|
82
|
+
strong_em_symbol="*", # Use * for bold/italic
|
|
83
|
+
bullets="*+-", # Define bullet point characters
|
|
84
|
+
wrap=True, # Enable text wrapping
|
|
85
|
+
wrap_width=100, # Set wrap width
|
|
86
|
+
escape_asterisks=True, # Escape * characters
|
|
87
|
+
code_language="python", # Default code block language
|
|
88
|
+
)
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### Custom Converters
|
|
92
|
+
|
|
93
|
+
You can provide your own conversion functions for specific HTML tags:
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
from bs4.element import Tag
|
|
97
|
+
from html_to_markdown import convert_to_markdown
|
|
98
|
+
|
|
99
|
+
# Define a custom converter for the <b> tag
|
|
100
|
+
def custom_bold_converter(*, tag: Tag, text: str, **kwargs) -> str:
|
|
101
|
+
return f"IMPORTANT: {text}"
|
|
102
|
+
|
|
103
|
+
html = "<p>This is a <b>bold statement</b>.</p>"
|
|
104
|
+
markdown = convert_to_markdown(html, custom_converters={"b": custom_bold_converter})
|
|
105
|
+
print(markdown)
|
|
106
|
+
# Output: This is a IMPORTANT: bold statement.
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
Custom converters take precedence over the built-in converters and can be used alongside other configuration options.
|
|
110
|
+
|
|
111
|
+
### Configuration Options
|
|
112
|
+
|
|
113
|
+
| Option | Type | Default | Description |
|
|
114
|
+
| -------------------- | ---- | -------------- | ------------------------------------------------------ |
|
|
115
|
+
| `autolinks` | bool | `True` | Auto-convert URLs to Markdown links |
|
|
116
|
+
| `bullets` | str | `'*+-'` | Characters to use for bullet points |
|
|
117
|
+
| `code_language` | str | `''` | Default language for code blocks |
|
|
118
|
+
| `heading_style` | str | `'underlined'` | Header style (`'underlined'`, `'atx'`, `'atx_closed'`) |
|
|
119
|
+
| `escape_asterisks` | bool | `True` | Escape * characters |
|
|
120
|
+
| `escape_underscores` | bool | `True` | Escape _ characters |
|
|
121
|
+
| `wrap` | bool | `False` | Enable text wrapping |
|
|
122
|
+
| `wrap_width` | int | `80` | Text wrap width |
|
|
123
|
+
|
|
124
|
+
For a complete list of options, see the [Configuration](#configuration) section below.
|
|
125
|
+
|
|
126
|
+
## CLI Usage
|
|
127
|
+
|
|
128
|
+
Convert HTML files directly from the command line:
|
|
129
|
+
|
|
130
|
+
```shell
|
|
131
|
+
# Convert a file
|
|
132
|
+
html_to_markdown input.html > output.md
|
|
133
|
+
|
|
134
|
+
# Process stdin
|
|
135
|
+
cat input.html | html_to_markdown > output.md
|
|
136
|
+
|
|
137
|
+
# Use custom options
|
|
138
|
+
html_to_markdown --heading-style atx --wrap --wrap-width 100 input.html > output.md
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
View all available options:
|
|
142
|
+
|
|
143
|
+
```shell
|
|
144
|
+
html_to_markdown --help
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
## Migration from Markdownify
|
|
148
|
+
|
|
149
|
+
For existing projects using Markdownify, a compatibility layer is provided:
|
|
150
|
+
|
|
151
|
+
```python
|
|
152
|
+
# Old code
|
|
153
|
+
from markdownify import markdownify as md
|
|
154
|
+
|
|
155
|
+
# New code - works the same way
|
|
156
|
+
from html_to_markdown import markdownify as md
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
The `markdownify` function is an alias for `convert_to_markdown` and provides identical functionality.
|
|
160
|
+
|
|
161
|
+
## Configuration
|
|
162
|
+
|
|
163
|
+
Full list of configuration options:
|
|
164
|
+
|
|
165
|
+
- `autolinks`: Convert valid URLs to Markdown links automatically
|
|
166
|
+
- `bullets`: Characters to use for bullet points in lists
|
|
167
|
+
- `code_language`: Default language for fenced code blocks
|
|
168
|
+
- `code_language_callback`: Function to determine code block language
|
|
169
|
+
- `convert`: List of HTML tags to convert (None = all supported tags)
|
|
170
|
+
- `default_title`: Use default titles for elements like links
|
|
171
|
+
- `escape_asterisks`: Escape * characters
|
|
172
|
+
- `escape_misc`: Escape miscellaneous Markdown characters
|
|
173
|
+
- `escape_underscores`: Escape _ characters
|
|
174
|
+
- `heading_style`: Header style (underlined/atx/atx_closed)
|
|
175
|
+
- `keep_inline_images_in`: Tags where inline images should be kept
|
|
176
|
+
- `newline_style`: Style for handling newlines (spaces/backslash)
|
|
177
|
+
- `strip`: Tags to remove from output
|
|
178
|
+
- `strong_em_symbol`: Symbol for strong/emphasized text (\* or \_)
|
|
179
|
+
- `sub_symbol`: Symbol for subscript text
|
|
180
|
+
- `sup_symbol`: Symbol for superscript text
|
|
181
|
+
- `wrap`: Enable text wrapping
|
|
182
|
+
- `wrap_width`: Width for text wrapping
|
|
183
|
+
- `convert_as_inline`: Treat content as inline elements
|
|
184
|
+
- `custom_converters`: A mapping of HTML tag names to custom converter functions
|
|
185
|
+
|
|
186
|
+
## Contribution
|
|
187
|
+
|
|
188
|
+
This library is open to contribution. Feel free to open issues or submit PRs. It's better to discuss issues before
|
|
189
|
+
submitting PRs to avoid disappointment.
|
|
190
|
+
|
|
191
|
+
### Local Development
|
|
192
|
+
|
|
193
|
+
1. Clone the repo
|
|
194
|
+
|
|
195
|
+
1. Install the system dependencies
|
|
196
|
+
|
|
197
|
+
1. Install the full dependencies with `uv sync`
|
|
198
|
+
|
|
199
|
+
1. Install the pre-commit hooks with:
|
|
200
|
+
|
|
201
|
+
```shell
|
|
202
|
+
pre-commit install && pre-commit install --hook-type commit-msg
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
1. Make your changes and submit a PR
|
|
206
|
+
|
|
207
|
+
## License
|
|
208
|
+
|
|
209
|
+
This library uses the MIT license.
|
|
210
|
+
|
|
211
|
+
## Acknowledgments
|
|
212
|
+
|
|
213
|
+
Special thanks to the original [markdownify](https://pypi.org/project/markdownify/) project creators and contributors.
|
|
@@ -1,6 +1,9 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
5
|
+
if TYPE_CHECKING:
|
|
6
|
+
from collections.abc import Iterable
|
|
4
7
|
from functools import partial
|
|
5
8
|
from inspect import getfullargspec
|
|
6
9
|
from textwrap import fill
|
|
@@ -55,7 +58,8 @@ SupportedElements = Literal[
|
|
|
55
58
|
"kbd",
|
|
56
59
|
]
|
|
57
60
|
|
|
58
|
-
|
|
61
|
+
Converter = Callable[[str, Tag], str]
|
|
62
|
+
ConvertersMap = dict[SupportedElements, Converter]
|
|
59
63
|
|
|
60
64
|
T = TypeVar("T")
|
|
61
65
|
|
|
@@ -85,7 +89,7 @@ def _create_inline_converter(markup_prefix: str) -> Callable[[Tag, str], str]:
|
|
|
85
89
|
|
|
86
90
|
return f"{prefix}{markup_prefix}{text}{markup_suffix}{suffix}"
|
|
87
91
|
|
|
88
|
-
return cast(Callable[[Tag, str], str], implementation)
|
|
92
|
+
return cast("Callable[[Tag, str], str]", implementation)
|
|
89
93
|
|
|
90
94
|
|
|
91
95
|
def _get_colspan(tag: Tag) -> int:
|
|
@@ -187,7 +191,7 @@ def _convert_li(*, tag: Tag, text: str, bullets: str) -> str:
|
|
|
187
191
|
parent = tag.parent
|
|
188
192
|
if parent is not None and parent.name == "ol":
|
|
189
193
|
start = (
|
|
190
|
-
int(cast(str, parent["start"]))
|
|
194
|
+
int(cast("str", parent["start"]))
|
|
191
195
|
if isinstance(parent.get("start"), str) and str(parent.get("start")).isnumeric()
|
|
192
196
|
else 1
|
|
193
197
|
)
|
|
@@ -263,7 +267,6 @@ def _convert_tr(*, tag: Tag, text: str) -> str:
|
|
|
263
267
|
overline = ""
|
|
264
268
|
underline = ""
|
|
265
269
|
if is_headrow and not tag.previous_sibling:
|
|
266
|
-
# first row and is headline: print headline underline
|
|
267
270
|
full_colspan = 0
|
|
268
271
|
for cell in cells:
|
|
269
272
|
if "colspan" in cell.attrs and cell["colspan"].isdigit():
|
|
@@ -272,12 +275,8 @@ def _convert_tr(*, tag: Tag, text: str) -> str:
|
|
|
272
275
|
full_colspan += 1
|
|
273
276
|
underline += "| " + " | ".join(["---"] * full_colspan) + " |" + "\n"
|
|
274
277
|
elif not tag.previous_sibling and (
|
|
275
|
-
parent_name == "table" or (parent_name == "tbody" and not cast(Tag, tag.parent).previous_sibling)
|
|
278
|
+
parent_name == "table" or (parent_name == "tbody" and not cast("Tag", tag.parent).previous_sibling)
|
|
276
279
|
):
|
|
277
|
-
# first row, not headline, and:
|
|
278
|
-
# - the parent is table or
|
|
279
|
-
# - the parent is tbody at the beginning of a table.
|
|
280
|
-
# print empty headline above this row
|
|
281
280
|
overline += "| " + " | ".join([""] * len(cells)) + " |" + "\n"
|
|
282
281
|
overline += "| " + " | ".join(["---"] * len(cells)) + " |" + "\n"
|
|
283
282
|
return overline + "|" + text + "\n" + underline
|
|
@@ -334,7 +333,7 @@ def create_converters_map(
|
|
|
334
333
|
return func(**kwargs)
|
|
335
334
|
return func(text)
|
|
336
335
|
|
|
337
|
-
return cast(Callable[[str, Tag], T], _inner)
|
|
336
|
+
return cast("Callable[[str, Tag], T]", _inner)
|
|
338
337
|
|
|
339
338
|
return {
|
|
340
339
|
"a": _wrapper(partial(_convert_a, autolinks=autolinks, default_title=default_title)),
|
|
@@ -1,5 +1,9 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
5
|
+
if TYPE_CHECKING:
|
|
6
|
+
from collections.abc import Mapping
|
|
3
7
|
from itertools import chain
|
|
4
8
|
from typing import TYPE_CHECKING, Any, Callable, Literal, cast
|
|
5
9
|
|
|
@@ -12,7 +16,7 @@ from html_to_markdown.constants import (
|
|
|
12
16
|
html_heading_re,
|
|
13
17
|
whitespace_re,
|
|
14
18
|
)
|
|
15
|
-
from html_to_markdown.converters import ConvertersMap, create_converters_map
|
|
19
|
+
from html_to_markdown.converters import Converter, ConvertersMap, SupportedElements, create_converters_map
|
|
16
20
|
from html_to_markdown.utils import escape
|
|
17
21
|
|
|
18
22
|
if TYPE_CHECKING:
|
|
@@ -87,7 +91,9 @@ def _process_tag(
|
|
|
87
91
|
strip: set[str] | None,
|
|
88
92
|
) -> str:
|
|
89
93
|
should_convert_tag = _should_convert_tag(tag_name=tag.name, strip=strip, convert=convert)
|
|
90
|
-
tag_name: SupportedTag | None =
|
|
94
|
+
tag_name: SupportedTag | None = (
|
|
95
|
+
cast("SupportedTag", tag.name.lower()) if tag.name.lower() in converters_map else None
|
|
96
|
+
)
|
|
91
97
|
text = ""
|
|
92
98
|
|
|
93
99
|
is_heading = html_heading_re.match(tag.name) is not None
|
|
@@ -142,11 +148,9 @@ def _process_text(
|
|
|
142
148
|
) -> str:
|
|
143
149
|
text = str(el) or ""
|
|
144
150
|
|
|
145
|
-
# normalize whitespace if we're not inside a preformatted element
|
|
146
151
|
if not el.find_parent("pre"):
|
|
147
152
|
text = whitespace_re.sub(" ", text)
|
|
148
153
|
|
|
149
|
-
# escape special characters if we're not inside a preformatted or code element
|
|
150
154
|
if not el.find_parent(["pre", "code", "kbd", "samp"]):
|
|
151
155
|
text = escape(
|
|
152
156
|
text=text,
|
|
@@ -155,9 +159,6 @@ def _process_text(
|
|
|
155
159
|
escape_underscores=escape_underscores,
|
|
156
160
|
)
|
|
157
161
|
|
|
158
|
-
# remove trailing whitespaces if any of the following condition is true:
|
|
159
|
-
# - current text node is the last node in li
|
|
160
|
-
# - current text node is followed by an embedded list
|
|
161
162
|
if (
|
|
162
163
|
el.parent
|
|
163
164
|
and el.parent.name == "li"
|
|
@@ -192,6 +193,8 @@ def convert_to_markdown(
|
|
|
192
193
|
code_language: str = "",
|
|
193
194
|
code_language_callback: Callable[[Any], str] | None = None,
|
|
194
195
|
convert: str | Iterable[str] | None = None,
|
|
196
|
+
convert_as_inline: bool = False,
|
|
197
|
+
custom_converters: Mapping[SupportedElements, Converter] | None = None,
|
|
195
198
|
default_title: bool = False,
|
|
196
199
|
escape_asterisks: bool = True,
|
|
197
200
|
escape_misc: bool = True,
|
|
@@ -205,7 +208,6 @@ def convert_to_markdown(
|
|
|
205
208
|
sup_symbol: str = "",
|
|
206
209
|
wrap: bool = False,
|
|
207
210
|
wrap_width: int = 80,
|
|
208
|
-
convert_as_inline: bool = False,
|
|
209
211
|
) -> str:
|
|
210
212
|
"""Convert HTML to Markdown.
|
|
211
213
|
|
|
@@ -216,6 +218,8 @@ def convert_to_markdown(
|
|
|
216
218
|
code_language: Default language identifier for fenced code blocks. Defaults to an empty string.
|
|
217
219
|
code_language_callback: Function to dynamically determine the language for code blocks.
|
|
218
220
|
convert: A list of tag names to convert to Markdown. If None, all supported tags are converted.
|
|
221
|
+
convert_as_inline: Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
|
|
222
|
+
custom_converters: A mapping of custom converters for specific HTML tags. Defaults to None.
|
|
219
223
|
default_title: Use the default title when converting certain elements (e.g., links). Defaults to False.
|
|
220
224
|
escape_asterisks: Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
|
|
221
225
|
escape_misc: Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
|
|
@@ -229,7 +233,6 @@ def convert_to_markdown(
|
|
|
229
233
|
sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
|
|
230
234
|
wrap: Wrap text to the specified width. Defaults to False.
|
|
231
235
|
wrap_width: The number of characters at which to wrap text. Defaults to 80.
|
|
232
|
-
convert_as_inline: Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
|
|
233
236
|
|
|
234
237
|
Raises:
|
|
235
238
|
ValueError: If both 'strip' and 'convert' are specified, or when the input HTML is empty.
|
|
@@ -263,6 +266,8 @@ def convert_to_markdown(
|
|
|
263
266
|
wrap=wrap,
|
|
264
267
|
wrap_width=wrap_width,
|
|
265
268
|
)
|
|
269
|
+
if custom_converters:
|
|
270
|
+
converters_map.update(cast("ConvertersMap", custom_converters))
|
|
266
271
|
|
|
267
272
|
return _process_tag(
|
|
268
273
|
source,
|
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: html-to-markdown
|
|
3
|
+
Version: 1.3.0
|
|
4
|
+
Summary: Convert HTML to markdown
|
|
5
|
+
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: homepage, https://github.com/Goldziher/html-to-markdown
|
|
8
|
+
Keywords: converter,html,markdown,text-extraction,text-processing
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Topic :: Text Processing
|
|
19
|
+
Classifier: Topic :: Text Processing :: Markup
|
|
20
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
21
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
22
|
+
Classifier: Topic :: Utilities
|
|
23
|
+
Classifier: Typing :: Typed
|
|
24
|
+
Requires-Python: >=3.9
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
License-File: LICENSE
|
|
27
|
+
Requires-Dist: beautifulsoup4>=4.12.3
|
|
28
|
+
Dynamic: license-file
|
|
29
|
+
|
|
30
|
+
# html-to-markdown
|
|
31
|
+
|
|
32
|
+
A modern, fully typed Python library for converting HTML to Markdown. This library is a completely rewritten fork
|
|
33
|
+
of [markdownify](https://pypi.org/project/markdownify/) with a modernized codebase, strict type safety and support for
|
|
34
|
+
Python 3.9+.
|
|
35
|
+
|
|
36
|
+
## Features
|
|
37
|
+
|
|
38
|
+
- Full type safety with strict MyPy adherence
|
|
39
|
+
- Functional API design
|
|
40
|
+
- Extensive test coverage
|
|
41
|
+
- Configurable conversion options
|
|
42
|
+
- CLI tool for easy conversions
|
|
43
|
+
- Support for pre-configured BeautifulSoup instances
|
|
44
|
+
- Strict semver versioning
|
|
45
|
+
|
|
46
|
+
## Installation
|
|
47
|
+
|
|
48
|
+
```shell
|
|
49
|
+
pip install html-to-markdown
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Quick Start
|
|
53
|
+
|
|
54
|
+
Convert HTML to Markdown with a single function call:
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
from html_to_markdown import convert_to_markdown
|
|
58
|
+
|
|
59
|
+
html = """
|
|
60
|
+
<article>
|
|
61
|
+
<h1>Welcome</h1>
|
|
62
|
+
<p>This is a <strong>sample</strong> with a <a href="https://example.com">link</a>.</p>
|
|
63
|
+
<ul>
|
|
64
|
+
<li>Item 1</li>
|
|
65
|
+
<li>Item 2</li>
|
|
66
|
+
</ul>
|
|
67
|
+
</article>
|
|
68
|
+
"""
|
|
69
|
+
|
|
70
|
+
markdown = convert_to_markdown(html)
|
|
71
|
+
print(markdown)
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Output:
|
|
75
|
+
|
|
76
|
+
```markdown
|
|
77
|
+
# Welcome
|
|
78
|
+
|
|
79
|
+
This is a **sample** with a [link](https://example.com).
|
|
80
|
+
|
|
81
|
+
* Item 1
|
|
82
|
+
* Item 2
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### Working with BeautifulSoup
|
|
86
|
+
|
|
87
|
+
If you need more control over HTML parsing, you can pass a pre-configured BeautifulSoup instance:
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
from bs4 import BeautifulSoup
|
|
91
|
+
from html_to_markdown import convert_to_markdown
|
|
92
|
+
|
|
93
|
+
# Configure BeautifulSoup with your preferred parser
|
|
94
|
+
soup = BeautifulSoup(html, "lxml") # Note: lxml requires additional installation
|
|
95
|
+
markdown = convert_to_markdown(soup)
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## Advanced Usage
|
|
99
|
+
|
|
100
|
+
### Customizing Conversion Options
|
|
101
|
+
|
|
102
|
+
The library offers extensive customization through various options:
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
from html_to_markdown import convert_to_markdown
|
|
106
|
+
|
|
107
|
+
html = "<div>Your content here...</div>"
|
|
108
|
+
markdown = convert_to_markdown(
|
|
109
|
+
html,
|
|
110
|
+
heading_style="atx", # Use # style headers
|
|
111
|
+
strong_em_symbol="*", # Use * for bold/italic
|
|
112
|
+
bullets="*+-", # Define bullet point characters
|
|
113
|
+
wrap=True, # Enable text wrapping
|
|
114
|
+
wrap_width=100, # Set wrap width
|
|
115
|
+
escape_asterisks=True, # Escape * characters
|
|
116
|
+
code_language="python", # Default code block language
|
|
117
|
+
)
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
### Custom Converters
|
|
121
|
+
|
|
122
|
+
You can provide your own conversion functions for specific HTML tags:
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
from bs4.element import Tag
|
|
126
|
+
from html_to_markdown import convert_to_markdown
|
|
127
|
+
|
|
128
|
+
# Define a custom converter for the <b> tag
|
|
129
|
+
def custom_bold_converter(*, tag: Tag, text: str, **kwargs) -> str:
|
|
130
|
+
return f"IMPORTANT: {text}"
|
|
131
|
+
|
|
132
|
+
html = "<p>This is a <b>bold statement</b>.</p>"
|
|
133
|
+
markdown = convert_to_markdown(html, custom_converters={"b": custom_bold_converter})
|
|
134
|
+
print(markdown)
|
|
135
|
+
# Output: This is a IMPORTANT: bold statement.
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
Custom converters take precedence over the built-in converters and can be used alongside other configuration options.
|
|
139
|
+
|
|
140
|
+
### Configuration Options
|
|
141
|
+
|
|
142
|
+
| Option | Type | Default | Description |
|
|
143
|
+
| -------------------- | ---- | -------------- | ------------------------------------------------------ |
|
|
144
|
+
| `autolinks` | bool | `True` | Auto-convert URLs to Markdown links |
|
|
145
|
+
| `bullets` | str | `'*+-'` | Characters to use for bullet points |
|
|
146
|
+
| `code_language` | str | `''` | Default language for code blocks |
|
|
147
|
+
| `heading_style` | str | `'underlined'` | Header style (`'underlined'`, `'atx'`, `'atx_closed'`) |
|
|
148
|
+
| `escape_asterisks` | bool | `True` | Escape * characters |
|
|
149
|
+
| `escape_underscores` | bool | `True` | Escape _ characters |
|
|
150
|
+
| `wrap` | bool | `False` | Enable text wrapping |
|
|
151
|
+
| `wrap_width` | int | `80` | Text wrap width |
|
|
152
|
+
|
|
153
|
+
For a complete list of options, see the [Configuration](#configuration) section below.
|
|
154
|
+
|
|
155
|
+
## CLI Usage
|
|
156
|
+
|
|
157
|
+
Convert HTML files directly from the command line:
|
|
158
|
+
|
|
159
|
+
```shell
|
|
160
|
+
# Convert a file
|
|
161
|
+
html_to_markdown input.html > output.md
|
|
162
|
+
|
|
163
|
+
# Process stdin
|
|
164
|
+
cat input.html | html_to_markdown > output.md
|
|
165
|
+
|
|
166
|
+
# Use custom options
|
|
167
|
+
html_to_markdown --heading-style atx --wrap --wrap-width 100 input.html > output.md
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
View all available options:
|
|
171
|
+
|
|
172
|
+
```shell
|
|
173
|
+
html_to_markdown --help
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
## Migration from Markdownify
|
|
177
|
+
|
|
178
|
+
For existing projects using Markdownify, a compatibility layer is provided:
|
|
179
|
+
|
|
180
|
+
```python
|
|
181
|
+
# Old code
|
|
182
|
+
from markdownify import markdownify as md
|
|
183
|
+
|
|
184
|
+
# New code - works the same way
|
|
185
|
+
from html_to_markdown import markdownify as md
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
The `markdownify` function is an alias for `convert_to_markdown` and provides identical functionality.
|
|
189
|
+
|
|
190
|
+
## Configuration
|
|
191
|
+
|
|
192
|
+
Full list of configuration options:
|
|
193
|
+
|
|
194
|
+
- `autolinks`: Convert valid URLs to Markdown links automatically
|
|
195
|
+
- `bullets`: Characters to use for bullet points in lists
|
|
196
|
+
- `code_language`: Default language for fenced code blocks
|
|
197
|
+
- `code_language_callback`: Function to determine code block language
|
|
198
|
+
- `convert`: List of HTML tags to convert (None = all supported tags)
|
|
199
|
+
- `default_title`: Use default titles for elements like links
|
|
200
|
+
- `escape_asterisks`: Escape * characters
|
|
201
|
+
- `escape_misc`: Escape miscellaneous Markdown characters
|
|
202
|
+
- `escape_underscores`: Escape _ characters
|
|
203
|
+
- `heading_style`: Header style (underlined/atx/atx_closed)
|
|
204
|
+
- `keep_inline_images_in`: Tags where inline images should be kept
|
|
205
|
+
- `newline_style`: Style for handling newlines (spaces/backslash)
|
|
206
|
+
- `strip`: Tags to remove from output
|
|
207
|
+
- `strong_em_symbol`: Symbol for strong/emphasized text (\* or \_)
|
|
208
|
+
- `sub_symbol`: Symbol for subscript text
|
|
209
|
+
- `sup_symbol`: Symbol for superscript text
|
|
210
|
+
- `wrap`: Enable text wrapping
|
|
211
|
+
- `wrap_width`: Width for text wrapping
|
|
212
|
+
- `convert_as_inline`: Treat content as inline elements
|
|
213
|
+
- `custom_converters`: A mapping of HTML tag names to custom converter functions
|
|
214
|
+
|
|
215
|
+
## Contribution
|
|
216
|
+
|
|
217
|
+
This library is open to contribution. Feel free to open issues or submit PRs. It's better to discuss issues before
|
|
218
|
+
submitting PRs to avoid disappointment.
|
|
219
|
+
|
|
220
|
+
### Local Development
|
|
221
|
+
|
|
222
|
+
1. Clone the repo
|
|
223
|
+
|
|
224
|
+
1. Install the system dependencies
|
|
225
|
+
|
|
226
|
+
1. Install the full dependencies with `uv sync`
|
|
227
|
+
|
|
228
|
+
1. Install the pre-commit hooks with:
|
|
229
|
+
|
|
230
|
+
```shell
|
|
231
|
+
pre-commit install && pre-commit install --hook-type commit-msg
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
1. Make your changes and submit a PR
|
|
235
|
+
|
|
236
|
+
## License
|
|
237
|
+
|
|
238
|
+
This library uses the MIT license.
|
|
239
|
+
|
|
240
|
+
## Acknowledgments
|
|
241
|
+
|
|
242
|
+
Special thanks to the original [markdownify](https://pypi.org/project/markdownify/) project creators and contributors.
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
html_to_markdown/__init__.py
|
|
5
|
+
html_to_markdown/__main__.py
|
|
6
|
+
html_to_markdown/cli.py
|
|
7
|
+
html_to_markdown/constants.py
|
|
8
|
+
html_to_markdown/converters.py
|
|
9
|
+
html_to_markdown/processing.py
|
|
10
|
+
html_to_markdown/py.typed
|
|
11
|
+
html_to_markdown/utils.py
|
|
12
|
+
html_to_markdown.egg-info/PKG-INFO
|
|
13
|
+
html_to_markdown.egg-info/SOURCES.txt
|
|
14
|
+
html_to_markdown.egg-info/dependency_links.txt
|
|
15
|
+
html_to_markdown.egg-info/requires.txt
|
|
16
|
+
html_to_markdown.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
beautifulsoup4>=4.12.3
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
html_to_markdown
|
|
@@ -1,15 +1,14 @@
|
|
|
1
1
|
[build-system]
|
|
2
|
-
build-backend = "
|
|
2
|
+
build-backend = "setuptools.build_meta"
|
|
3
3
|
|
|
4
|
-
requires = [ "
|
|
4
|
+
requires = [ "setuptools>=78.1" ]
|
|
5
5
|
|
|
6
6
|
[project]
|
|
7
7
|
name = "html-to-markdown"
|
|
8
|
-
version = "1.2.0"
|
|
8
|
+
version = "1.3.0"
|
|
9
9
|
description = "Convert HTML to markdown"
|
|
10
10
|
readme = "README.md"
|
|
11
11
|
keywords = [ "converter", "html", "markdown", "text-extraction", "text-processing" ]
|
|
12
|
-
|
|
13
12
|
license = { text = "MIT" }
|
|
14
13
|
authors = [ { name = "Na'aman Hirschfeld", email = "nhirschfeld@gmail.com" } ]
|
|
15
14
|
requires-python = ">=3.9"
|
|
@@ -30,30 +29,36 @@ classifiers = [
|
|
|
30
29
|
"Topic :: Utilities",
|
|
31
30
|
"Typing :: Typed",
|
|
32
31
|
]
|
|
32
|
+
|
|
33
33
|
dependencies = [
|
|
34
34
|
"beautifulsoup4>=4.12.3",
|
|
35
35
|
]
|
|
36
36
|
|
|
37
|
+
urls.homepage = "https://github.com/Goldziher/html-to-markdown"
|
|
38
|
+
|
|
37
39
|
[dependency-groups]
|
|
38
40
|
dev = [
|
|
39
41
|
"covdefaults>=2.3",
|
|
40
42
|
"mypy>=1.14.1",
|
|
41
43
|
"pre-commit>=4.1",
|
|
42
44
|
"pytest>=8.3.4",
|
|
43
|
-
"pytest-cov>=6",
|
|
45
|
+
"pytest-cov>=6.1",
|
|
44
46
|
"pytest-mock>=3.14",
|
|
45
47
|
"ruff>=0.9.3",
|
|
46
48
|
"types-beautifulsoup4>=4.12.0.20241020",
|
|
49
|
+
"uv-bump",
|
|
47
50
|
]
|
|
48
51
|
|
|
52
|
+
[tool.setuptools.packages.find]
|
|
53
|
+
include = [ "html_to_markdown" ]
|
|
54
|
+
|
|
55
|
+
[tool.setuptools.package-data]
|
|
56
|
+
html_to_markdown = [ "py.typed" ]
|
|
57
|
+
|
|
49
58
|
[tool.hatch.build]
|
|
50
59
|
skip-excluded-dirs = true
|
|
51
60
|
|
|
52
|
-
|
|
53
|
-
only-include = [ "html_to_markdown" ]
|
|
54
|
-
|
|
55
|
-
[tool.hatch.build.targets.wheel]
|
|
56
|
-
only-include = [ "html_to_markdown" ]
|
|
61
|
+
scripts.html_to_markdown = "html_to_markdown.__main__:cli"
|
|
57
62
|
|
|
58
63
|
[tool.ruff]
|
|
59
64
|
target-version = "py39"
|
|
@@ -111,3 +116,6 @@ disallow_untyped_decorators = false
|
|
|
111
116
|
|
|
112
117
|
[tool.uv]
|
|
113
118
|
default-groups = [ "dev" ]
|
|
119
|
+
|
|
120
|
+
[tool.uv.sources]
|
|
121
|
+
uv-bump = { git = "https://github.com/Goldziher/uv-bump" }
|
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
*$py.class
|
|
2
|
-
*.iml
|
|
3
|
-
*.log
|
|
4
|
-
*.py[cod]
|
|
5
|
-
.coverage
|
|
6
|
-
.env
|
|
7
|
-
.idea/
|
|
8
|
-
.mypy_cache/
|
|
9
|
-
.pdm-build/
|
|
10
|
-
.pdm-python
|
|
11
|
-
.pdm.toml
|
|
12
|
-
.pytest_cache/
|
|
13
|
-
.python-version
|
|
14
|
-
.ruff_cache/
|
|
15
|
-
.tox/
|
|
16
|
-
.venv/
|
|
17
|
-
.vscode/
|
|
18
|
-
__pycache__/
|
|
19
|
-
__pypackages__/
|
|
20
|
-
coverage.xml
|
|
21
|
-
dist/
|
html_to_markdown-1.2.0/PKG-INFO
DELETED
|
@@ -1,102 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: html-to-markdown
|
|
3
|
-
Version: 1.2.0
|
|
4
|
-
Summary: Convert HTML to markdown
|
|
5
|
-
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
|
-
License: MIT
|
|
7
|
-
License-File: LICENSE
|
|
8
|
-
Keywords: converter,html,markdown,text-extraction,text-processing
|
|
9
|
-
Classifier: Intended Audience :: Developers
|
|
10
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
-
Classifier: Operating System :: OS Independent
|
|
12
|
-
Classifier: Programming Language :: Python :: 3 :: Only
|
|
13
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
-
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
-
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
-
Classifier: Topic :: Text Processing
|
|
19
|
-
Classifier: Topic :: Text Processing :: Markup
|
|
20
|
-
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
21
|
-
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
22
|
-
Classifier: Topic :: Utilities
|
|
23
|
-
Classifier: Typing :: Typed
|
|
24
|
-
Requires-Python: >=3.9
|
|
25
|
-
Requires-Dist: beautifulsoup4>=4.12.3
|
|
26
|
-
Description-Content-Type: text/markdown
|
|
27
|
-
|
|
28
|
-
# html_to_markdown
|
|
29
|
-
|
|
30
|
-
This library is a refactored and modernized fork of [markdownify](https://pypi.org/project/markdownify/), supporting
|
|
31
|
-
Python 3.9 and above.
|
|
32
|
-
|
|
33
|
-
### Differences from Markdownify
|
|
34
|
-
|
|
35
|
-
- The refactored codebase uses a strict functional approach - no classes are involved.
|
|
36
|
-
- There is full typing with strict MyPy adherence and a py.typed file included.
|
|
37
|
-
- The `convert_to_markdown` function allows passing a pre-configured instance of `BeautifulSoup` instead of html.
|
|
38
|
-
- This library's releases follow standard semver. Its version v1.0.0 was branched from markdownify's v0.13.1, at which
|
|
39
|
-
point versioning is no longer aligned.
|
|
40
|
-
|
|
41
|
-
## Installation
|
|
42
|
-
|
|
43
|
-
```shell
|
|
44
|
-
pip install html_to_markdown
|
|
45
|
-
```
|
|
46
|
-
|
|
47
|
-
## Usage
|
|
48
|
-
|
|
49
|
-
Convert an HTML string to Markdown:
|
|
50
|
-
|
|
51
|
-
```python
|
|
52
|
-
from html_to_markdown import convert_to_markdown
|
|
53
|
-
|
|
54
|
-
convert_to_markdown('<b>Yay</b> <a href="http://github.com">GitHub</a>') # > '**Yay** [GitHub](http://github.com)'
|
|
55
|
-
```
|
|
56
|
-
|
|
57
|
-
Or pass a pre-configured instance of `BeautifulSoup`:
|
|
58
|
-
|
|
59
|
-
```python
|
|
60
|
-
from bs4 import BeautifulSoup
|
|
61
|
-
from html_to_markdown import convert_to_markdown
|
|
62
|
-
|
|
63
|
-
soup = BeautifulSoup('<b>Yay</b> <a href="http://github.com">GitHub</a>', 'lxml') # lxml requires an extra dependency.
|
|
64
|
-
|
|
65
|
-
convert_to_markdown(soup) # > '**Yay** [GitHub](http://github.com)'
|
|
66
|
-
```
|
|
67
|
-
|
|
68
|
-
### Options
|
|
69
|
-
|
|
70
|
-
The `convert_to_markdown` function accepts the following kwargs:
|
|
71
|
-
|
|
72
|
-
- autolinks (bool): Automatically convert valid URLs into Markdown links. Defaults to True.
|
|
73
|
-
- bullets (str): A string of characters to use for bullet points in lists. Defaults to '\*+-'.
|
|
74
|
-
- code_language (str): Default language identifier for fenced code blocks. Defaults to an empty string.
|
|
75
|
-
- code_language_callback (Callable[[Any], str] | None): Function to dynamically determine the language for code blocks.
|
|
76
|
-
- convert (Iterable[str] | None): A list of tag names to convert to Markdown. If None, all supported tags are converted.
|
|
77
|
-
- default_title (bool): Use the default title when converting certain elements (e.g., links). Defaults to False.
|
|
78
|
-
- escape_asterisks (bool): Escape asterisks (\*) to prevent unintended Markdown formatting. Defaults to True.
|
|
79
|
-
- escape_misc (bool): Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
|
|
80
|
-
- escape_underscores (bool): Escape underscores (\_) to prevent unintended italic formatting. Defaults to True.
|
|
81
|
-
- heading_style (Literal["underlined", "atx", "atx_closed"]): The style to use for Markdown headings. Defaults to "
|
|
82
|
-
underlined".
|
|
83
|
-
- keep_inline_images_in (Iterable[str] | None): Tags in which inline images should be preserved. Defaults to None.
|
|
84
|
-
- newline_style (Literal["spaces", "backslash"]): Style for handling newlines in text content. Defaults to "spaces".
|
|
85
|
-
- strip (Iterable[str] | None): Tags to strip from the output. Defaults to None.
|
|
86
|
-
- strong_em_symbol (Literal["\*", "\_"]): Symbol to use for strong/emphasized text. Defaults to "\*".
|
|
87
|
-
- sub_symbol (str): Custom symbol for subscript text. Defaults to an empty string.
|
|
88
|
-
- sup_symbol (str): Custom symbol for superscript text. Defaults to an empty string.
|
|
89
|
-
- wrap (bool): Wrap text to the specified width. Defaults to False.
|
|
90
|
-
- wrap_width (int): The number of characters at which to wrap text. Defaults to 80.
|
|
91
|
-
- convert_as_inline (bool): Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
|
|
92
|
-
|
|
93
|
-
## CLI
|
|
94
|
-
|
|
95
|
-
For compatibility with the original markdownify, a CLI is provided. Use `html_to_markdown example.html > example.md` or
|
|
96
|
-
pipe input from stdin:
|
|
97
|
-
|
|
98
|
-
```shell
|
|
99
|
-
cat example.html | html_to_markdown > example.md
|
|
100
|
-
```
|
|
101
|
-
|
|
102
|
-
Use `html_to_markdown -h` to see all available options. They are the same as listed above and take the same arguments.
|
html_to_markdown-1.2.0/README.md
DELETED
|
@@ -1,75 +0,0 @@
|
|
|
1
|
-
# html_to_markdown
|
|
2
|
-
|
|
3
|
-
This library is a refactored and modernized fork of [markdownify](https://pypi.org/project/markdownify/), supporting
|
|
4
|
-
Python 3.9 and above.
|
|
5
|
-
|
|
6
|
-
### Differences from Markdownify
|
|
7
|
-
|
|
8
|
-
- The refactored codebase uses a strict functional approach - no classes are involved.
|
|
9
|
-
- There is full typing with strict MyPy adherence and a py.typed file included.
|
|
10
|
-
- The `convert_to_markdown` function allows passing a pre-configured instance of `BeautifulSoup` instead of html.
|
|
11
|
-
- This library's releases follow standard semver. Its version v1.0.0 was branched from markdownify's v0.13.1, at which
|
|
12
|
-
point versioning is no longer aligned.
|
|
13
|
-
|
|
14
|
-
## Installation
|
|
15
|
-
|
|
16
|
-
```shell
|
|
17
|
-
pip install html_to_markdown
|
|
18
|
-
```
|
|
19
|
-
|
|
20
|
-
## Usage
|
|
21
|
-
|
|
22
|
-
Convert an HTML string to Markdown:
|
|
23
|
-
|
|
24
|
-
```python
|
|
25
|
-
from html_to_markdown import convert_to_markdown
|
|
26
|
-
|
|
27
|
-
convert_to_markdown('<b>Yay</b> <a href="http://github.com">GitHub</a>') # > '**Yay** [GitHub](http://github.com)'
|
|
28
|
-
```
|
|
29
|
-
|
|
30
|
-
Or pass a pre-configured instance of `BeautifulSoup`:
|
|
31
|
-
|
|
32
|
-
```python
|
|
33
|
-
from bs4 import BeautifulSoup
|
|
34
|
-
from html_to_markdown import convert_to_markdown
|
|
35
|
-
|
|
36
|
-
soup = BeautifulSoup('<b>Yay</b> <a href="http://github.com">GitHub</a>', 'lxml') # lxml requires an extra dependency.
|
|
37
|
-
|
|
38
|
-
convert_to_markdown(soup) # > '**Yay** [GitHub](http://github.com)'
|
|
39
|
-
```
|
|
40
|
-
|
|
41
|
-
### Options
|
|
42
|
-
|
|
43
|
-
The `convert_to_markdown` function accepts the following kwargs:
|
|
44
|
-
|
|
45
|
-
- autolinks (bool): Automatically convert valid URLs into Markdown links. Defaults to True.
|
|
46
|
-
- bullets (str): A string of characters to use for bullet points in lists. Defaults to '\*+-'.
|
|
47
|
-
- code_language (str): Default language identifier for fenced code blocks. Defaults to an empty string.
|
|
48
|
-
- code_language_callback (Callable[[Any], str] | None): Function to dynamically determine the language for code blocks.
|
|
49
|
-
- convert (Iterable[str] | None): A list of tag names to convert to Markdown. If None, all supported tags are converted.
|
|
50
|
-
- default_title (bool): Use the default title when converting certain elements (e.g., links). Defaults to False.
|
|
51
|
-
- escape_asterisks (bool): Escape asterisks (\*) to prevent unintended Markdown formatting. Defaults to True.
|
|
52
|
-
- escape_misc (bool): Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
|
|
53
|
-
- escape_underscores (bool): Escape underscores (\_) to prevent unintended italic formatting. Defaults to True.
|
|
54
|
-
- heading_style (Literal["underlined", "atx", "atx_closed"]): The style to use for Markdown headings. Defaults to "
|
|
55
|
-
underlined".
|
|
56
|
-
- keep_inline_images_in (Iterable[str] | None): Tags in which inline images should be preserved. Defaults to None.
|
|
57
|
-
- newline_style (Literal["spaces", "backslash"]): Style for handling newlines in text content. Defaults to "spaces".
|
|
58
|
-
- strip (Iterable[str] | None): Tags to strip from the output. Defaults to None.
|
|
59
|
-
- strong_em_symbol (Literal["\*", "\_"]): Symbol to use for strong/emphasized text. Defaults to "\*".
|
|
60
|
-
- sub_symbol (str): Custom symbol for subscript text. Defaults to an empty string.
|
|
61
|
-
- sup_symbol (str): Custom symbol for superscript text. Defaults to an empty string.
|
|
62
|
-
- wrap (bool): Wrap text to the specified width. Defaults to False.
|
|
63
|
-
- wrap_width (int): The number of characters at which to wrap text. Defaults to 80.
|
|
64
|
-
- convert_as_inline (bool): Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
|
|
65
|
-
|
|
66
|
-
## CLI
|
|
67
|
-
|
|
68
|
-
For compatibility with the original markdownify, a CLI is provided. Use `html_to_markdown example.html > example.md` or
|
|
69
|
-
pipe input from stdin:
|
|
70
|
-
|
|
71
|
-
```shell
|
|
72
|
-
cat example.html | html_to_markdown > example.md
|
|
73
|
-
```
|
|
74
|
-
|
|
75
|
-
Use `html_to_markdown -h` to see all available options. They are the same as listed above and take the same arguments.
|
|
@@ -1,89 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from typing import TYPE_CHECKING, Literal
|
|
4
|
-
|
|
5
|
-
from html_to_markdown.constants import ASTERISK, SPACES, UNDERLINED
|
|
6
|
-
from html_to_markdown.converters import create_converters_map
|
|
7
|
-
|
|
8
|
-
if TYPE_CHECKING:
|
|
9
|
-
from collections.abc import Callable, Iterable
|
|
10
|
-
|
|
11
|
-
from bs4 import Tag
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
def _create_legacy_class(
|
|
15
|
-
autolinks: bool,
|
|
16
|
-
bullets: str,
|
|
17
|
-
code_language: str,
|
|
18
|
-
code_language_callback: Callable[[Tag], str] | None,
|
|
19
|
-
default_title: bool,
|
|
20
|
-
heading_style: Literal["atx", "atx_closed", "underlined"],
|
|
21
|
-
keep_inline_images_in: Iterable[str] | None,
|
|
22
|
-
newline_style: str,
|
|
23
|
-
strong_em_symbol: str,
|
|
24
|
-
sub_symbol: str,
|
|
25
|
-
sup_symbol: str,
|
|
26
|
-
wrap: bool,
|
|
27
|
-
wrap_width: int,
|
|
28
|
-
) -> type:
|
|
29
|
-
"""Create a legacy class for Markdownify.
|
|
30
|
-
|
|
31
|
-
Deprecated: Use the new hooks api instead.
|
|
32
|
-
|
|
33
|
-
Args:
|
|
34
|
-
autolinks: Whether to convert URLs into links.
|
|
35
|
-
bullets: The bullet characters to use for unordered lists.
|
|
36
|
-
code_language: The default code language to use.
|
|
37
|
-
code_language_callback: A callback to get the code language.
|
|
38
|
-
default_title: Whether to use the URL as the title for links.
|
|
39
|
-
heading_style: The style of headings.
|
|
40
|
-
keep_inline_images_in: The tags to keep inline images in.
|
|
41
|
-
newline_style: The style of newlines.
|
|
42
|
-
strong_em_symbol: The symbol to use for strong and emphasis text.
|
|
43
|
-
sub_symbol: The symbol to use for subscript text.
|
|
44
|
-
sup_symbol: The symbol to use for superscript text.
|
|
45
|
-
wrap: Whether to wrap text.
|
|
46
|
-
wrap_width: The width to wrap text at.
|
|
47
|
-
|
|
48
|
-
Returns:
|
|
49
|
-
A class that can be used to convert HTML to Markdown.
|
|
50
|
-
"""
|
|
51
|
-
return type(
|
|
52
|
-
"Markdownify",
|
|
53
|
-
(),
|
|
54
|
-
{
|
|
55
|
-
k.removeprefix("_"): v
|
|
56
|
-
for k, v in create_converters_map(
|
|
57
|
-
autolinks=autolinks,
|
|
58
|
-
bullets=bullets,
|
|
59
|
-
code_language=code_language,
|
|
60
|
-
code_language_callback=code_language_callback,
|
|
61
|
-
default_title=default_title,
|
|
62
|
-
heading_style=heading_style,
|
|
63
|
-
keep_inline_images_in=keep_inline_images_in,
|
|
64
|
-
newline_style=newline_style,
|
|
65
|
-
strong_em_symbol=strong_em_symbol,
|
|
66
|
-
sub_symbol=sub_symbol,
|
|
67
|
-
sup_symbol=sup_symbol,
|
|
68
|
-
wrap=wrap,
|
|
69
|
-
wrap_width=wrap_width,
|
|
70
|
-
).items()
|
|
71
|
-
},
|
|
72
|
-
)
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
Markdownify = _create_legacy_class(
|
|
76
|
-
autolinks=True,
|
|
77
|
-
bullets="*+-",
|
|
78
|
-
code_language="",
|
|
79
|
-
code_language_callback=None,
|
|
80
|
-
default_title=False,
|
|
81
|
-
heading_style=UNDERLINED,
|
|
82
|
-
keep_inline_images_in=None,
|
|
83
|
-
newline_style=SPACES,
|
|
84
|
-
strong_em_symbol=ASTERISK,
|
|
85
|
-
sub_symbol="",
|
|
86
|
-
sup_symbol="",
|
|
87
|
-
wrap=False,
|
|
88
|
-
wrap_width=80,
|
|
89
|
-
)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|