html-to-markdown 1.11.0__tar.gz → 1.12.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/PKG-INFO +143 -2
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/README.md +142 -1
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/html_to_markdown/cli.py +28 -2
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/html_to_markdown/converters.py +208 -130
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/html_to_markdown/exceptions.py +5 -0
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/html_to_markdown/preprocessor.py +96 -86
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/html_to_markdown/processing.py +63 -48
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/html_to_markdown/utils.py +1 -3
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/html_to_markdown/whitespace.py +23 -33
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/html_to_markdown.egg-info/PKG-INFO +143 -2
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/pyproject.toml +11 -1
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/LICENSE +0 -0
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/html_to_markdown/__init__.py +0 -0
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/html_to_markdown/__main__.py +0 -0
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/html_to_markdown/constants.py +0 -0
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/html_to_markdown/py.typed +0 -0
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/html_to_markdown.egg-info/SOURCES.txt +0 -0
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/html_to_markdown.egg-info/dependency_links.txt +0 -0
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/html_to_markdown.egg-info/entry_points.txt +0 -0
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/html_to_markdown.egg-info/requires.txt +0 -0
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/html_to_markdown.egg-info/top_level.txt +0 -0
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 1.11.0
|
|
3
|
+
Version: 1.12.1
|
|
4
4
|
Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
|
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -320,6 +320,132 @@ def converter(*, tag: Tag, text: str, **kwargs) -> str:
|
|
|
320
320
|
|
|
321
321
|
Custom converters take precedence over built-in converters and can be used alongside other configuration options.
|
|
322
322
|
|
|
323
|
+
### Streaming API
|
|
324
|
+
|
|
325
|
+
For processing large documents with memory constraints, use the streaming API:
|
|
326
|
+
|
|
327
|
+
```python
|
|
328
|
+
from html_to_markdown import convert_to_markdown_stream
|
|
329
|
+
|
|
330
|
+
# Process large HTML in chunks
|
|
331
|
+
with open("large_document.html", "r") as f:
|
|
332
|
+
html_content = f.read()
|
|
333
|
+
|
|
334
|
+
# Returns a generator that yields markdown chunks
|
|
335
|
+
for chunk in convert_to_markdown_stream(html_content, chunk_size=2048):
|
|
336
|
+
print(chunk, end="")
|
|
337
|
+
```
|
|
338
|
+
|
|
339
|
+
With progress tracking:
|
|
340
|
+
|
|
341
|
+
```python
|
|
342
|
+
def show_progress(processed: int, total: int):
|
|
343
|
+
if total > 0:
|
|
344
|
+
percent = (processed / total) * 100
|
|
345
|
+
print(f"\rProgress: {percent:.1f}%", end="")
|
|
346
|
+
|
|
347
|
+
# Stream with progress callback
|
|
348
|
+
markdown = convert_to_markdown(html_content, stream_processing=True, chunk_size=4096, progress_callback=show_progress)
|
|
349
|
+
```
|
|
350
|
+
|
|
351
|
+
#### When to Use Streaming vs Regular Processing
|
|
352
|
+
|
|
353
|
+
Based on comprehensive performance analysis, here are our recommendations:
|
|
354
|
+
|
|
355
|
+
**📄 Use Regular Processing When:**
|
|
356
|
+
|
|
357
|
+
- Files < 100KB (simplicity preferred)
|
|
358
|
+
- Simple scripts and one-off conversions
|
|
359
|
+
- Memory is not a concern
|
|
360
|
+
- You want the simplest API
|
|
361
|
+
|
|
362
|
+
**🌊 Use Streaming Processing When:**
|
|
363
|
+
|
|
364
|
+
- Files > 100KB (memory efficiency)
|
|
365
|
+
- Processing many files in batch
|
|
366
|
+
- Memory is constrained
|
|
367
|
+
- You need progress reporting
|
|
368
|
+
- You want to process results incrementally
|
|
369
|
+
- Running in production environments
|
|
370
|
+
|
|
371
|
+
**📋 Specific Recommendations by File Size:**
|
|
372
|
+
|
|
373
|
+
| File Size | Recommendation | Reason |
|
|
374
|
+
| ---------- | ----------------------------------------------- | -------------------------------------- |
|
|
375
|
+
| < 50KB | Regular (simplicity) or Streaming (3-5% faster) | Either works well |
|
|
376
|
+
| 50KB-100KB | Either (streaming slightly preferred) | Minimal difference |
|
|
377
|
+
| 100KB-1MB | Streaming preferred | Better performance + memory efficiency |
|
|
378
|
+
| > 1MB | Streaming strongly recommended | Significant memory advantages |
|
|
379
|
+
|
|
380
|
+
**🔧 Configuration Recommendations:**
|
|
381
|
+
|
|
382
|
+
- **Default chunk_size: 2048 bytes** (optimal performance balance)
|
|
383
|
+
- **For very large files (>10MB)**: Consider `chunk_size=4096`
|
|
384
|
+
- **For memory-constrained environments**: Use smaller chunks `chunk_size=1024`
|
|
385
|
+
|
|
386
|
+
**📈 Performance Benefits:**
|
|
387
|
+
|
|
388
|
+
Streaming provides a consistent **3-5% performance improvement** across all file sizes:
|
|
389
|
+
|
|
390
|
+
- **Streaming throughput**: ~0.47-0.48 MB/s
|
|
391
|
+
- **Regular throughput**: ~0.44-0.47 MB/s
|
|
392
|
+
- **Memory usage**: Streaming uses less peak memory for large files
|
|
393
|
+
- **Latency**: Streaming allows processing results before completion
|
|
394
|
+
|
|
395
|
+
### Preprocessing API
|
|
396
|
+
|
|
397
|
+
The library provides functions for preprocessing HTML before conversion, useful for cleaning messy or complex HTML:
|
|
398
|
+
|
|
399
|
+
```python
|
|
400
|
+
from html_to_markdown import preprocess_html, create_preprocessor
|
|
401
|
+
|
|
402
|
+
# Direct preprocessing with custom options
|
|
403
|
+
cleaned_html = preprocess_html(
|
|
404
|
+
raw_html,
|
|
405
|
+
remove_navigation=True,
|
|
406
|
+
remove_forms=True,
|
|
407
|
+
remove_scripts=True,
|
|
408
|
+
remove_styles=True,
|
|
409
|
+
remove_comments=True,
|
|
410
|
+
preserve_semantic_structure=True,
|
|
411
|
+
preserve_tables=True,
|
|
412
|
+
preserve_media=True,
|
|
413
|
+
)
|
|
414
|
+
markdown = convert_to_markdown(cleaned_html)
|
|
415
|
+
|
|
416
|
+
# Create a preprocessor configuration from presets
|
|
417
|
+
config = create_preprocessor(preset="aggressive", preserve_tables=False)  # preset may also be "minimal" or "standard"; extra keyword arguments override the preset's settings
|
|
418
|
+
markdown = convert_to_markdown(html, **config)
|
|
419
|
+
```
|
|
420
|
+
|
|
421
|
+
### Exception Handling
|
|
422
|
+
|
|
423
|
+
The library provides specific exception classes for better error handling:
|
|
424
|
+
|
|
425
|
+
````python
|
|
426
|
+
from html_to_markdown import (
|
|
427
|
+
convert_to_markdown,
|
|
428
|
+
HtmlToMarkdownError,
|
|
429
|
+
EmptyHtmlError,
|
|
430
|
+
InvalidParserError,
|
|
431
|
+
ConflictingOptionsError,
|
|
432
|
+
MissingDependencyError
|
|
433
|
+
)
|
|
434
|
+
|
|
435
|
+
try:
|
|
436
|
+
markdown = convert_to_markdown(html, parser='lxml')
|
|
437
|
+
except MissingDependencyError:
|
|
438
|
+
# lxml not installed
|
|
439
|
+
markdown = convert_to_markdown(html, parser='html.parser')
|
|
440
|
+
except EmptyHtmlError:
|
|
441
|
+
print("No HTML content to convert")
|
|
442
|
+
except InvalidParserError as e:
|
|
443
|
+
print(f"Parser error: {e}")
|
|
444
|
+
except ConflictingOptionsError as e:
|
|
445
|
+
print(f"Conflicting options: {e}")
|
|
446
|
+
except HtmlToMarkdownError as e:
|
|
447
|
+
print(f"Conversion error: {e}")
|
|
448
|
+
|
|
323
449
|
## CLI Usage
|
|
324
450
|
|
|
325
451
|
Convert HTML files directly from the command line with full access to all API options:
|
|
@@ -340,7 +466,7 @@ html_to_markdown \
|
|
|
340
466
|
--preprocess-html \
|
|
341
467
|
--preprocessing-preset aggressive \
|
|
342
468
|
input.html > output.md
|
|
343
|
-
|
|
469
|
+
````
|
|
344
470
|
|
|
345
471
|
### Key CLI Options
|
|
346
472
|
|
|
@@ -353,6 +479,20 @@ html_to_markdown \
|
|
|
353
479
|
--whitespace-mode {normalized,strict} # Whitespace handling (default: normalized)
|
|
354
480
|
--heading-style {atx,atx_closed,underlined} # Header style
|
|
355
481
|
--no-extract-metadata # Disable metadata extraction
|
|
482
|
+
--br-in-tables # Use <br> tags for line breaks in table cells
|
|
483
|
+
--source-encoding ENCODING # Override auto-detected encoding (rarely needed)
|
|
484
|
+
```
|
|
485
|
+
|
|
486
|
+
**File Encoding:**
|
|
487
|
+
|
|
488
|
+
The CLI automatically detects file encoding in most cases. Use `--source-encoding` only when automatic detection fails (typically on some Windows systems or with unusual encodings):
|
|
489
|
+
|
|
490
|
+
```shell
|
|
491
|
+
# Override auto-detection for Latin-1 encoded file
|
|
492
|
+
html_to_markdown --source-encoding latin-1 input.html > output.md
|
|
493
|
+
|
|
494
|
+
# Force UTF-16 encoding when auto-detection fails
|
|
495
|
+
html_to_markdown --source-encoding utf-16 input.html > output.md
|
|
356
496
|
```
|
|
357
497
|
|
|
358
498
|
**All Available Options:**
|
|
@@ -393,6 +533,7 @@ The `markdownify` function is an alias for `convert_to_markdown` and provides id
|
|
|
393
533
|
- `newline_style` (str, default: `'spaces'`): Style for handling newlines (`'spaces'` or `'backslash'`)
|
|
394
534
|
- `sub_symbol` (str, default: `''`): Custom symbol for subscript text
|
|
395
535
|
- `sup_symbol` (str, default: `''`): Custom symbol for superscript text
|
|
536
|
+
- `br_in_tables` (bool, default: `False`): Use `<br>` tags for line breaks in table cells instead of spaces
|
|
396
537
|
|
|
397
538
|
### Parser Options
|
|
398
539
|
|
|
@@ -282,6 +282,132 @@ def converter(*, tag: Tag, text: str, **kwargs) -> str:
|
|
|
282
282
|
|
|
283
283
|
Custom converters take precedence over built-in converters and can be used alongside other configuration options.
|
|
284
284
|
|
|
285
|
+
### Streaming API
|
|
286
|
+
|
|
287
|
+
For processing large documents with memory constraints, use the streaming API:
|
|
288
|
+
|
|
289
|
+
```python
|
|
290
|
+
from html_to_markdown import convert_to_markdown_stream
|
|
291
|
+
|
|
292
|
+
# Process large HTML in chunks
|
|
293
|
+
with open("large_document.html", "r") as f:
|
|
294
|
+
html_content = f.read()
|
|
295
|
+
|
|
296
|
+
# Returns a generator that yields markdown chunks
|
|
297
|
+
for chunk in convert_to_markdown_stream(html_content, chunk_size=2048):
|
|
298
|
+
print(chunk, end="")
|
|
299
|
+
```
|
|
300
|
+
|
|
301
|
+
With progress tracking:
|
|
302
|
+
|
|
303
|
+
```python
|
|
304
|
+
def show_progress(processed: int, total: int):
|
|
305
|
+
if total > 0:
|
|
306
|
+
percent = (processed / total) * 100
|
|
307
|
+
print(f"\rProgress: {percent:.1f}%", end="")
|
|
308
|
+
|
|
309
|
+
# Stream with progress callback
|
|
310
|
+
markdown = convert_to_markdown(html_content, stream_processing=True, chunk_size=4096, progress_callback=show_progress)
|
|
311
|
+
```
|
|
312
|
+
|
|
313
|
+
#### When to Use Streaming vs Regular Processing
|
|
314
|
+
|
|
315
|
+
Based on comprehensive performance analysis, here are our recommendations:
|
|
316
|
+
|
|
317
|
+
**📄 Use Regular Processing When:**
|
|
318
|
+
|
|
319
|
+
- Files < 100KB (simplicity preferred)
|
|
320
|
+
- Simple scripts and one-off conversions
|
|
321
|
+
- Memory is not a concern
|
|
322
|
+
- You want the simplest API
|
|
323
|
+
|
|
324
|
+
**🌊 Use Streaming Processing When:**
|
|
325
|
+
|
|
326
|
+
- Files > 100KB (memory efficiency)
|
|
327
|
+
- Processing many files in batch
|
|
328
|
+
- Memory is constrained
|
|
329
|
+
- You need progress reporting
|
|
330
|
+
- You want to process results incrementally
|
|
331
|
+
- Running in production environments
|
|
332
|
+
|
|
333
|
+
**📋 Specific Recommendations by File Size:**
|
|
334
|
+
|
|
335
|
+
| File Size | Recommendation | Reason |
|
|
336
|
+
| ---------- | ----------------------------------------------- | -------------------------------------- |
|
|
337
|
+
| < 50KB | Regular (simplicity) or Streaming (3-5% faster) | Either works well |
|
|
338
|
+
| 50KB-100KB | Either (streaming slightly preferred) | Minimal difference |
|
|
339
|
+
| 100KB-1MB | Streaming preferred | Better performance + memory efficiency |
|
|
340
|
+
| > 1MB | Streaming strongly recommended | Significant memory advantages |
|
|
341
|
+
|
|
342
|
+
**🔧 Configuration Recommendations:**
|
|
343
|
+
|
|
344
|
+
- **Default chunk_size: 2048 bytes** (optimal performance balance)
|
|
345
|
+
- **For very large files (>10MB)**: Consider `chunk_size=4096`
|
|
346
|
+
- **For memory-constrained environments**: Use smaller chunks `chunk_size=1024`
|
|
347
|
+
|
|
348
|
+
**📈 Performance Benefits:**
|
|
349
|
+
|
|
350
|
+
Streaming provides a consistent **3-5% performance improvement** across all file sizes:
|
|
351
|
+
|
|
352
|
+
- **Streaming throughput**: ~0.47-0.48 MB/s
|
|
353
|
+
- **Regular throughput**: ~0.44-0.47 MB/s
|
|
354
|
+
- **Memory usage**: Streaming uses less peak memory for large files
|
|
355
|
+
- **Latency**: Streaming allows processing results before completion
|
|
356
|
+
|
|
357
|
+
### Preprocessing API
|
|
358
|
+
|
|
359
|
+
The library provides functions for preprocessing HTML before conversion, useful for cleaning messy or complex HTML:
|
|
360
|
+
|
|
361
|
+
```python
|
|
362
|
+
from html_to_markdown import preprocess_html, create_preprocessor
|
|
363
|
+
|
|
364
|
+
# Direct preprocessing with custom options
|
|
365
|
+
cleaned_html = preprocess_html(
|
|
366
|
+
raw_html,
|
|
367
|
+
remove_navigation=True,
|
|
368
|
+
remove_forms=True,
|
|
369
|
+
remove_scripts=True,
|
|
370
|
+
remove_styles=True,
|
|
371
|
+
remove_comments=True,
|
|
372
|
+
preserve_semantic_structure=True,
|
|
373
|
+
preserve_tables=True,
|
|
374
|
+
preserve_media=True,
|
|
375
|
+
)
|
|
376
|
+
markdown = convert_to_markdown(cleaned_html)
|
|
377
|
+
|
|
378
|
+
# Create a preprocessor configuration from presets
|
|
379
|
+
config = create_preprocessor(preset="aggressive", preserve_tables=False)  # preset may also be "minimal" or "standard"; extra keyword arguments override the preset's settings
|
|
380
|
+
markdown = convert_to_markdown(html, **config)
|
|
381
|
+
```
|
|
382
|
+
|
|
383
|
+
### Exception Handling
|
|
384
|
+
|
|
385
|
+
The library provides specific exception classes for better error handling:
|
|
386
|
+
|
|
387
|
+
````python
|
|
388
|
+
from html_to_markdown import (
|
|
389
|
+
convert_to_markdown,
|
|
390
|
+
HtmlToMarkdownError,
|
|
391
|
+
EmptyHtmlError,
|
|
392
|
+
InvalidParserError,
|
|
393
|
+
ConflictingOptionsError,
|
|
394
|
+
MissingDependencyError
|
|
395
|
+
)
|
|
396
|
+
|
|
397
|
+
try:
|
|
398
|
+
markdown = convert_to_markdown(html, parser='lxml')
|
|
399
|
+
except MissingDependencyError:
|
|
400
|
+
# lxml not installed
|
|
401
|
+
markdown = convert_to_markdown(html, parser='html.parser')
|
|
402
|
+
except EmptyHtmlError:
|
|
403
|
+
print("No HTML content to convert")
|
|
404
|
+
except InvalidParserError as e:
|
|
405
|
+
print(f"Parser error: {e}")
|
|
406
|
+
except ConflictingOptionsError as e:
|
|
407
|
+
print(f"Conflicting options: {e}")
|
|
408
|
+
except HtmlToMarkdownError as e:
|
|
409
|
+
print(f"Conversion error: {e}")
|
|
410
|
+
|
|
285
411
|
## CLI Usage
|
|
286
412
|
|
|
287
413
|
Convert HTML files directly from the command line with full access to all API options:
|
|
@@ -302,7 +428,7 @@ html_to_markdown \
|
|
|
302
428
|
--preprocess-html \
|
|
303
429
|
--preprocessing-preset aggressive \
|
|
304
430
|
input.html > output.md
|
|
305
|
-
|
|
431
|
+
````
|
|
306
432
|
|
|
307
433
|
### Key CLI Options
|
|
308
434
|
|
|
@@ -315,6 +441,20 @@ html_to_markdown \
|
|
|
315
441
|
--whitespace-mode {normalized,strict} # Whitespace handling (default: normalized)
|
|
316
442
|
--heading-style {atx,atx_closed,underlined} # Header style
|
|
317
443
|
--no-extract-metadata # Disable metadata extraction
|
|
444
|
+
--br-in-tables # Use <br> tags for line breaks in table cells
|
|
445
|
+
--source-encoding ENCODING # Override auto-detected encoding (rarely needed)
|
|
446
|
+
```
|
|
447
|
+
|
|
448
|
+
**File Encoding:**
|
|
449
|
+
|
|
450
|
+
The CLI automatically detects file encoding in most cases. Use `--source-encoding` only when automatic detection fails (typically on some Windows systems or with unusual encodings):
|
|
451
|
+
|
|
452
|
+
```shell
|
|
453
|
+
# Override auto-detection for Latin-1 encoded file
|
|
454
|
+
html_to_markdown --source-encoding latin-1 input.html > output.md
|
|
455
|
+
|
|
456
|
+
# Force UTF-16 encoding when auto-detection fails
|
|
457
|
+
html_to_markdown --source-encoding utf-16 input.html > output.md
|
|
318
458
|
```
|
|
319
459
|
|
|
320
460
|
**All Available Options:**
|
|
@@ -355,6 +495,7 @@ The `markdownify` function is an alias for `convert_to_markdown` and provides id
|
|
|
355
495
|
- `newline_style` (str, default: `'spaces'`): Style for handling newlines (`'spaces'` or `'backslash'`)
|
|
356
496
|
- `sub_symbol` (str, default: `''`): Custom symbol for subscript text
|
|
357
497
|
- `sup_symbol` (str, default: `''`): Custom symbol for superscript text
|
|
498
|
+
- `br_in_tables` (bool, default: `False`): Use `<br>` tags for line breaks in table cells instead of spaces
|
|
358
499
|
|
|
359
500
|
### Parser Options
|
|
360
501
|
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import sys
|
|
2
2
|
from argparse import ArgumentParser, FileType
|
|
3
|
+
from pathlib import Path
|
|
3
4
|
|
|
4
5
|
from html_to_markdown.constants import (
|
|
5
6
|
ASTERISK,
|
|
@@ -13,6 +14,7 @@ from html_to_markdown.constants import (
|
|
|
13
14
|
WHITESPACE_NORMALIZED,
|
|
14
15
|
WHITESPACE_STRICT,
|
|
15
16
|
)
|
|
17
|
+
from html_to_markdown.exceptions import InvalidEncodingError
|
|
16
18
|
from html_to_markdown.processing import convert_to_markdown
|
|
17
19
|
|
|
18
20
|
|
|
@@ -131,6 +133,12 @@ def main(argv: list[str]) -> str:
|
|
|
131
133
|
help="Parent tags where images remain inline (not converted to alt-text).",
|
|
132
134
|
)
|
|
133
135
|
|
|
136
|
+
parser.add_argument(
|
|
137
|
+
"--br-in-tables",
|
|
138
|
+
action="store_true",
|
|
139
|
+
help="Use <br> tags for line breaks in table cells instead of spaces.",
|
|
140
|
+
)
|
|
141
|
+
|
|
134
142
|
parser.add_argument("-w", "--wrap", action="store_true", help="Enable text wrapping at --wrap-width characters.")
|
|
135
143
|
|
|
136
144
|
parser.add_argument(
|
|
@@ -235,10 +243,18 @@ def main(argv: list[str]) -> str:
|
|
|
235
243
|
help="Keep navigation elements when preprocessing (normally removed).",
|
|
236
244
|
)
|
|
237
245
|
|
|
246
|
+
parser.add_argument(
|
|
247
|
+
"--source-encoding",
|
|
248
|
+
type=str,
|
|
249
|
+
default=None,
|
|
250
|
+
help="Source file encoding (e.g. 'utf-8', 'latin-1'). Defaults to system default.",
|
|
251
|
+
)
|
|
252
|
+
|
|
238
253
|
args = parser.parse_args(argv)
|
|
239
254
|
|
|
240
255
|
base_args = {
|
|
241
256
|
"autolinks": args.autolinks,
|
|
257
|
+
"br_in_tables": args.br_in_tables,
|
|
242
258
|
"bullets": args.bullets,
|
|
243
259
|
"code_language": args.code_language,
|
|
244
260
|
"convert": args.convert,
|
|
@@ -278,7 +294,7 @@ def main(argv: list[str]) -> str:
|
|
|
278
294
|
if args.show_progress:
|
|
279
295
|
|
|
280
296
|
def progress_callback(processed: int, total: int) -> None:
|
|
281
|
-
if total > 0:
|
|
297
|
+
if total > 0: # pragma: no cover
|
|
282
298
|
percent = (processed / total) * 100
|
|
283
299
|
|
|
284
300
|
sys.stderr.write(f"\rProgress: {percent:.1f}% ({processed}/{total} bytes)")
|
|
@@ -286,4 +302,14 @@ def main(argv: list[str]) -> str:
|
|
|
286
302
|
|
|
287
303
|
base_args["progress_callback"] = progress_callback
|
|
288
304
|
|
|
289
|
-
|
|
305
|
+
if args.source_encoding and args.html.name != "<stdin>":
|
|
306
|
+
args.html.close()
|
|
307
|
+
try:
|
|
308
|
+
with Path(args.html.name).open(encoding=args.source_encoding) as f:
|
|
309
|
+
html_content = f.read()
|
|
310
|
+
except LookupError as e:
|
|
311
|
+
raise InvalidEncodingError(args.source_encoding) from e
|
|
312
|
+
else:
|
|
313
|
+
html_content = args.html.read()
|
|
314
|
+
|
|
315
|
+
return convert_to_markdown(html_content, **base_args)
|