html-to-markdown 1.6.0__tar.gz → 1.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- {html_to_markdown-1.6.0 → html_to_markdown-1.9.0}/PKG-INFO +87 -14
- {html_to_markdown-1.6.0 → html_to_markdown-1.9.0}/README.md +83 -10
- {html_to_markdown-1.6.0 → html_to_markdown-1.9.0}/html_to_markdown/__init__.py +3 -1
- {html_to_markdown-1.6.0 → html_to_markdown-1.9.0}/html_to_markdown/cli.py +1 -4
- {html_to_markdown-1.6.0 → html_to_markdown-1.9.0}/html_to_markdown/converters.py +375 -645
- html_to_markdown-1.9.0/html_to_markdown/preprocessor.py +407 -0
- {html_to_markdown-1.6.0 → html_to_markdown-1.9.0}/html_to_markdown/processing.py +227 -87
- {html_to_markdown-1.6.0 → html_to_markdown-1.9.0}/html_to_markdown/utils.py +12 -5
- {html_to_markdown-1.6.0 → html_to_markdown-1.9.0}/html_to_markdown.egg-info/PKG-INFO +87 -14
- {html_to_markdown-1.6.0 → html_to_markdown-1.9.0}/html_to_markdown.egg-info/SOURCES.txt +1 -0
- {html_to_markdown-1.6.0 → html_to_markdown-1.9.0}/html_to_markdown.egg-info/requires.txt +2 -1
- {html_to_markdown-1.6.0 → html_to_markdown-1.9.0}/pyproject.toml +8 -9
- {html_to_markdown-1.6.0 → html_to_markdown-1.9.0}/LICENSE +0 -0
- {html_to_markdown-1.6.0 → html_to_markdown-1.9.0}/html_to_markdown/__main__.py +0 -0
- {html_to_markdown-1.6.0 → html_to_markdown-1.9.0}/html_to_markdown/constants.py +0 -0
- {html_to_markdown-1.6.0 → html_to_markdown-1.9.0}/html_to_markdown/exceptions.py +0 -0
- {html_to_markdown-1.6.0 → html_to_markdown-1.9.0}/html_to_markdown/py.typed +0 -0
- {html_to_markdown-1.6.0 → html_to_markdown-1.9.0}/html_to_markdown.egg-info/dependency_links.txt +0 -0
- {html_to_markdown-1.6.0 → html_to_markdown-1.9.0}/html_to_markdown.egg-info/entry_points.txt +0 -0
- {html_to_markdown-1.6.0 → html_to_markdown-1.9.0}/html_to_markdown.egg-info/top_level.txt +0 -0
- {html_to_markdown-1.6.0 → html_to_markdown-1.9.0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 1.6.0
|
|
3
|
+
Version: 1.9.0
|
|
4
4
|
Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
|
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -15,7 +15,6 @@ Classifier: Intended Audience :: Developers
|
|
|
15
15
|
Classifier: License :: OSI Approved :: MIT License
|
|
16
16
|
Classifier: Operating System :: OS Independent
|
|
17
17
|
Classifier: Programming Language :: Python :: 3 :: Only
|
|
18
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
19
18
|
Classifier: Programming Language :: Python :: 3.10
|
|
20
19
|
Classifier: Programming Language :: Python :: 3.11
|
|
21
20
|
Classifier: Programming Language :: Python :: 3.12
|
|
@@ -28,12 +27,13 @@ Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
|
28
27
|
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
29
28
|
Classifier: Topic :: Utilities
|
|
30
29
|
Classifier: Typing :: Typed
|
|
31
|
-
Requires-Python: >=3.9
|
|
30
|
+
Requires-Python: >=3.10
|
|
32
31
|
Description-Content-Type: text/markdown
|
|
33
32
|
License-File: LICENSE
|
|
34
33
|
Requires-Dist: beautifulsoup4>=4.13.4
|
|
34
|
+
Requires-Dist: nh3>=0.3
|
|
35
35
|
Provides-Extra: lxml
|
|
36
|
-
Requires-Dist: lxml>=
|
|
36
|
+
Requires-Dist: lxml>=6; extra == "lxml"
|
|
37
37
|
Dynamic: license-file
|
|
38
38
|
|
|
39
39
|
# html-to-markdown
|
|
@@ -45,6 +45,7 @@ Python 3.9+.
|
|
|
45
45
|
## Features
|
|
46
46
|
|
|
47
47
|
- **Full HTML5 Support**: Comprehensive support for all modern HTML5 elements including semantic, form, table, ruby, interactive, structural, SVG, and math elements
|
|
48
|
+
- **Enhanced Table Support**: Advanced handling of merged cells with rowspan/colspan support for better table representation
|
|
48
49
|
- **Type Safety**: Strict MyPy adherence with comprehensive type hints
|
|
49
50
|
- **Metadata Extraction**: Automatic extraction of document metadata (title, meta tags) as comment headers
|
|
50
51
|
- **Streaming Support**: Memory-efficient processing for large documents with progress callbacks
|
|
@@ -54,7 +55,7 @@ Python 3.9+.
|
|
|
54
55
|
- **CLI Tool**: Full-featured command-line interface with all API options exposed
|
|
55
56
|
- **Custom Converters**: Extensible converter system for custom HTML tag handling
|
|
56
57
|
- **BeautifulSoup Integration**: Support for pre-configured BeautifulSoup instances
|
|
57
|
-
- **
|
|
58
|
+
- **Comprehensive Test Coverage**: 91%+ test coverage with 623+ comprehensive tests
|
|
58
59
|
|
|
59
60
|
## Installation
|
|
60
61
|
|
|
@@ -202,6 +203,51 @@ print(markdown)
|
|
|
202
203
|
|
|
203
204
|
Custom converters take precedence over the built-in converters and can be used alongside other configuration options.
|
|
204
205
|
|
|
206
|
+
### Enhanced Table Support
|
|
207
|
+
|
|
208
|
+
The library now provides better handling of complex tables with merged cells:
|
|
209
|
+
|
|
210
|
+
```python
|
|
211
|
+
from html_to_markdown import convert_to_markdown
|
|
212
|
+
|
|
213
|
+
# HTML table with merged cells
|
|
214
|
+
html = """
|
|
215
|
+
<table>
|
|
216
|
+
<tr>
|
|
217
|
+
<th rowspan="2">Category</th>
|
|
218
|
+
<th colspan="2">Sales Data</th>
|
|
219
|
+
</tr>
|
|
220
|
+
<tr>
|
|
221
|
+
<th>Q1</th>
|
|
222
|
+
<th>Q2</th>
|
|
223
|
+
</tr>
|
|
224
|
+
<tr>
|
|
225
|
+
<td>Product A</td>
|
|
226
|
+
<td>$100K</td>
|
|
227
|
+
<td>$150K</td>
|
|
228
|
+
</tr>
|
|
229
|
+
</table>
|
|
230
|
+
"""
|
|
231
|
+
|
|
232
|
+
markdown = convert_to_markdown(html)
|
|
233
|
+
print(markdown)
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
Output:
|
|
237
|
+
|
|
238
|
+
```markdown
|
|
239
|
+
| Category | Sales Data | |
|
|
240
|
+
| --- | --- | --- |
|
|
241
|
+
| | Q1 | Q2 |
|
|
242
|
+
| Product A | $100K | $150K |
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
The library handles:
|
|
246
|
+
|
|
247
|
+
- **Rowspan**: Inserts empty cells in subsequent rows
|
|
248
|
+
- **Colspan**: Properly manages column spanning
|
|
249
|
+
- **Clean output**: Removes `<colgroup>` and `<col>` elements that have no Markdown equivalent
|
|
250
|
+
|
|
205
251
|
### Key Configuration Options
|
|
206
252
|
|
|
207
253
|
| Option | Type | Default | Description |
|
|
@@ -437,7 +483,9 @@ This library provides comprehensive support for all modern HTML5 elements:
|
|
|
437
483
|
|
|
438
484
|
### Table Elements
|
|
439
485
|
|
|
440
|
-
- `<table>`, `<thead>`, `<tbody>`, `<tfoot>`, `<tr>`, `<th>`, `<td>`, `<caption
|
|
486
|
+
- `<table>`, `<thead>`, `<tbody>`, `<tfoot>`, `<tr>`, `<th>`, `<td>`, `<caption>`
|
|
487
|
+
- **Merged cell support**: Handles `rowspan` and `colspan` attributes for complex table layouts
|
|
488
|
+
- **Smart cleanup**: Automatically handles table styling elements for clean Markdown output
|
|
441
489
|
|
|
442
490
|
### Interactive Elements
|
|
443
491
|
|
|
@@ -456,16 +504,41 @@ This library provides comprehensive support for all modern HTML5 elements:
|
|
|
456
504
|
|
|
457
505
|
- `<math>` (MathML support)
|
|
458
506
|
|
|
459
|
-
##
|
|
507
|
+
## Advanced Table Support
|
|
508
|
+
|
|
509
|
+
The library provides sophisticated handling of complex HTML tables, including merged cells and proper structure conversion:
|
|
510
|
+
|
|
511
|
+
```python
|
|
512
|
+
from html_to_markdown import convert_to_markdown
|
|
513
|
+
|
|
514
|
+
# Complex table with merged cells
|
|
515
|
+
html = """
|
|
516
|
+
<table>
|
|
517
|
+
<caption>Sales Report</caption>
|
|
518
|
+
<tr>
|
|
519
|
+
<th rowspan="2">Product</th>
|
|
520
|
+
<th colspan="2">Quarterly Sales</th>
|
|
521
|
+
</tr>
|
|
522
|
+
<tr>
|
|
523
|
+
<th>Q1</th>
|
|
524
|
+
<th>Q2</th>
|
|
525
|
+
</tr>
|
|
526
|
+
<tr>
|
|
527
|
+
<td>Widget A</td>
|
|
528
|
+
<td>$50K</td>
|
|
529
|
+
<td>$75K</td>
|
|
530
|
+
</tr>
|
|
531
|
+
</table>
|
|
532
|
+
"""
|
|
533
|
+
|
|
534
|
+
result = convert_to_markdown(html)
|
|
535
|
+
```
|
|
460
536
|
|
|
461
|
-
|
|
537
|
+
**Features:**
|
|
462
538
|
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
1. **Streaming API**: New streaming parameters for large document processing
|
|
467
|
-
1. **Task List Support**: Automatic conversion of HTML checkboxes to GitHub-compatible task lists
|
|
468
|
-
1. **Highlight Styles**: New `highlight_style` parameter with multiple options for `<mark>` elements
|
|
539
|
+
- **Merged cell support**: Handles `rowspan` and `colspan` attributes intelligently
|
|
540
|
+
- **Clean output**: Automatically removes table styling elements that don't translate to Markdown
|
|
541
|
+
- **Structure preservation**: Maintains table hierarchy and relationships
|
|
469
542
|
|
|
470
543
|
## Acknowledgments
|
|
471
544
|
|
|
@@ -7,6 +7,7 @@ Python 3.9+.
|
|
|
7
7
|
## Features
|
|
8
8
|
|
|
9
9
|
- **Full HTML5 Support**: Comprehensive support for all modern HTML5 elements including semantic, form, table, ruby, interactive, structural, SVG, and math elements
|
|
10
|
+
- **Enhanced Table Support**: Advanced handling of merged cells with rowspan/colspan support for better table representation
|
|
10
11
|
- **Type Safety**: Strict MyPy adherence with comprehensive type hints
|
|
11
12
|
- **Metadata Extraction**: Automatic extraction of document metadata (title, meta tags) as comment headers
|
|
12
13
|
- **Streaming Support**: Memory-efficient processing for large documents with progress callbacks
|
|
@@ -16,7 +17,7 @@ Python 3.9+.
|
|
|
16
17
|
- **CLI Tool**: Full-featured command-line interface with all API options exposed
|
|
17
18
|
- **Custom Converters**: Extensible converter system for custom HTML tag handling
|
|
18
19
|
- **BeautifulSoup Integration**: Support for pre-configured BeautifulSoup instances
|
|
19
|
-
- **
|
|
20
|
+
- **Comprehensive Test Coverage**: 91%+ test coverage with 623+ comprehensive tests
|
|
20
21
|
|
|
21
22
|
## Installation
|
|
22
23
|
|
|
@@ -164,6 +165,51 @@ print(markdown)
|
|
|
164
165
|
|
|
165
166
|
Custom converters take precedence over the built-in converters and can be used alongside other configuration options.
|
|
166
167
|
|
|
168
|
+
### Enhanced Table Support
|
|
169
|
+
|
|
170
|
+
The library now provides better handling of complex tables with merged cells:
|
|
171
|
+
|
|
172
|
+
```python
|
|
173
|
+
from html_to_markdown import convert_to_markdown
|
|
174
|
+
|
|
175
|
+
# HTML table with merged cells
|
|
176
|
+
html = """
|
|
177
|
+
<table>
|
|
178
|
+
<tr>
|
|
179
|
+
<th rowspan="2">Category</th>
|
|
180
|
+
<th colspan="2">Sales Data</th>
|
|
181
|
+
</tr>
|
|
182
|
+
<tr>
|
|
183
|
+
<th>Q1</th>
|
|
184
|
+
<th>Q2</th>
|
|
185
|
+
</tr>
|
|
186
|
+
<tr>
|
|
187
|
+
<td>Product A</td>
|
|
188
|
+
<td>$100K</td>
|
|
189
|
+
<td>$150K</td>
|
|
190
|
+
</tr>
|
|
191
|
+
</table>
|
|
192
|
+
"""
|
|
193
|
+
|
|
194
|
+
markdown = convert_to_markdown(html)
|
|
195
|
+
print(markdown)
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
Output:
|
|
199
|
+
|
|
200
|
+
```markdown
|
|
201
|
+
| Category | Sales Data | |
|
|
202
|
+
| --- | --- | --- |
|
|
203
|
+
| | Q1 | Q2 |
|
|
204
|
+
| Product A | $100K | $150K |
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
The library handles:
|
|
208
|
+
|
|
209
|
+
- **Rowspan**: Inserts empty cells in subsequent rows
|
|
210
|
+
- **Colspan**: Properly manages column spanning
|
|
211
|
+
- **Clean output**: Removes `<colgroup>` and `<col>` elements that have no Markdown equivalent
|
|
212
|
+
|
|
167
213
|
### Key Configuration Options
|
|
168
214
|
|
|
169
215
|
| Option | Type | Default | Description |
|
|
@@ -399,7 +445,9 @@ This library provides comprehensive support for all modern HTML5 elements:
|
|
|
399
445
|
|
|
400
446
|
### Table Elements
|
|
401
447
|
|
|
402
|
-
- `<table>`, `<thead>`, `<tbody>`, `<tfoot>`, `<tr>`, `<th>`, `<td>`, `<caption
|
|
448
|
+
- `<table>`, `<thead>`, `<tbody>`, `<tfoot>`, `<tr>`, `<th>`, `<td>`, `<caption>`
|
|
449
|
+
- **Merged cell support**: Handles `rowspan` and `colspan` attributes for complex table layouts
|
|
450
|
+
- **Smart cleanup**: Automatically handles table styling elements for clean Markdown output
|
|
403
451
|
|
|
404
452
|
### Interactive Elements
|
|
405
453
|
|
|
@@ -418,16 +466,41 @@ This library provides comprehensive support for all modern HTML5 elements:
|
|
|
418
466
|
|
|
419
467
|
- `<math>` (MathML support)
|
|
420
468
|
|
|
421
|
-
##
|
|
469
|
+
## Advanced Table Support
|
|
470
|
+
|
|
471
|
+
The library provides sophisticated handling of complex HTML tables, including merged cells and proper structure conversion:
|
|
472
|
+
|
|
473
|
+
```python
|
|
474
|
+
from html_to_markdown import convert_to_markdown
|
|
475
|
+
|
|
476
|
+
# Complex table with merged cells
|
|
477
|
+
html = """
|
|
478
|
+
<table>
|
|
479
|
+
<caption>Sales Report</caption>
|
|
480
|
+
<tr>
|
|
481
|
+
<th rowspan="2">Product</th>
|
|
482
|
+
<th colspan="2">Quarterly Sales</th>
|
|
483
|
+
</tr>
|
|
484
|
+
<tr>
|
|
485
|
+
<th>Q1</th>
|
|
486
|
+
<th>Q2</th>
|
|
487
|
+
</tr>
|
|
488
|
+
<tr>
|
|
489
|
+
<td>Widget A</td>
|
|
490
|
+
<td>$50K</td>
|
|
491
|
+
<td>$75K</td>
|
|
492
|
+
</tr>
|
|
493
|
+
</table>
|
|
494
|
+
"""
|
|
495
|
+
|
|
496
|
+
result = convert_to_markdown(html)
|
|
497
|
+
```
|
|
422
498
|
|
|
423
|
-
|
|
499
|
+
**Features:**
|
|
424
500
|
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
1. **Streaming API**: New streaming parameters for large document processing
|
|
429
|
-
1. **Task List Support**: Automatic conversion of HTML checkboxes to GitHub-compatible task lists
|
|
430
|
-
1. **Highlight Styles**: New `highlight_style` parameter with multiple options for `<mark>` elements
|
|
501
|
+
- **Merged cell support**: Handles `rowspan` and `colspan` attributes intelligently
|
|
502
|
+
- **Clean output**: Automatically removes table styling elements that don't translate to Markdown
|
|
503
|
+
- **Structure preservation**: Maintains table hierarchy and relationships
|
|
431
504
|
|
|
432
505
|
## Acknowledgments
|
|
433
506
|
|
|
@@ -5,9 +5,9 @@ from html_to_markdown.exceptions import (
|
|
|
5
5
|
InvalidParserError,
|
|
6
6
|
MissingDependencyError,
|
|
7
7
|
)
|
|
8
|
+
from html_to_markdown.preprocessor import create_preprocessor, preprocess_html
|
|
8
9
|
from html_to_markdown.processing import convert_to_markdown, convert_to_markdown_stream
|
|
9
10
|
|
|
10
|
-
# For backward compatibility and to maintain the existing API
|
|
11
11
|
markdownify = convert_to_markdown
|
|
12
12
|
|
|
13
13
|
__all__ = [
|
|
@@ -18,5 +18,7 @@ __all__ = [
|
|
|
18
18
|
"MissingDependencyError",
|
|
19
19
|
"convert_to_markdown",
|
|
20
20
|
"convert_to_markdown_stream",
|
|
21
|
+
"create_preprocessor",
|
|
21
22
|
"markdownify",
|
|
23
|
+
"preprocess_html",
|
|
22
24
|
]
|
|
@@ -191,7 +191,6 @@ def main(argv: list[str]) -> str:
|
|
|
191
191
|
|
|
192
192
|
args = parser.parse_args(argv)
|
|
193
193
|
|
|
194
|
-
# Prepare base arguments
|
|
195
194
|
base_args = {
|
|
196
195
|
"strip": args.strip,
|
|
197
196
|
"convert": args.convert,
|
|
@@ -216,18 +215,16 @@ def main(argv: list[str]) -> str:
|
|
|
216
215
|
"highlight_style": args.highlight_style,
|
|
217
216
|
}
|
|
218
217
|
|
|
219
|
-
# Add streaming parameters only if streaming is enabled
|
|
220
218
|
if args.stream_processing:
|
|
221
219
|
base_args["stream_processing"] = True
|
|
222
220
|
base_args["chunk_size"] = args.chunk_size
|
|
223
221
|
|
|
224
|
-
# Progress callback for CLI
|
|
225
222
|
if args.show_progress:
|
|
226
223
|
|
|
227
224
|
def progress_callback(processed: int, total: int) -> None:
|
|
228
225
|
if total > 0:
|
|
229
226
|
percent = (processed / total) * 100
|
|
230
|
-
|
|
227
|
+
|
|
231
228
|
sys.stderr.write(f"\rProgress: {percent:.1f}% ({processed}/{total} bytes)")
|
|
232
229
|
sys.stderr.flush()
|
|
233
230
|
|