html-to-markdown 1.11.0__tar.gz → 1.12.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/PKG-INFO +99 -2
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/README.md +98 -1
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/html_to_markdown/cli.py +28 -2
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/html_to_markdown/converters.py +214 -127
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/html_to_markdown/exceptions.py +5 -0
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/html_to_markdown/preprocessor.py +96 -86
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/html_to_markdown/processing.py +36 -34
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/html_to_markdown/utils.py +1 -3
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/html_to_markdown/whitespace.py +7 -31
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/html_to_markdown.egg-info/PKG-INFO +99 -2
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/pyproject.toml +1 -1
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/LICENSE +0 -0
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/html_to_markdown/__init__.py +0 -0
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/html_to_markdown/__main__.py +0 -0
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/html_to_markdown/constants.py +0 -0
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/html_to_markdown/py.typed +0 -0
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/html_to_markdown.egg-info/SOURCES.txt +0 -0
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/html_to_markdown.egg-info/dependency_links.txt +0 -0
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/html_to_markdown.egg-info/entry_points.txt +0 -0
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/html_to_markdown.egg-info/requires.txt +0 -0
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/html_to_markdown.egg-info/top_level.txt +0 -0
- {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 1.11.0
|
|
3
|
+
Version: 1.12.0
|
|
4
4
|
Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
|
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -320,6 +320,88 @@ def converter(*, tag: Tag, text: str, **kwargs) -> str:
|
|
|
320
320
|
|
|
321
321
|
Custom converters take precedence over built-in converters and can be used alongside other configuration options.
|
|
322
322
|
|
|
323
|
+
### Streaming API
|
|
324
|
+
|
|
325
|
+
For processing large documents with memory constraints, use the streaming API:
|
|
326
|
+
|
|
327
|
+
```python
|
|
328
|
+
from html_to_markdown import convert_to_markdown_stream
|
|
329
|
+
|
|
330
|
+
# Process large HTML in chunks
|
|
331
|
+
with open("large_document.html", "r") as f:
|
|
332
|
+
html_content = f.read()
|
|
333
|
+
|
|
334
|
+
# Returns a generator that yields markdown chunks
|
|
335
|
+
for chunk in convert_to_markdown_stream(html_content, chunk_size=2048):
|
|
336
|
+
print(chunk, end="")
|
|
337
|
+
```
|
|
338
|
+
|
|
339
|
+
With progress tracking:
|
|
340
|
+
|
|
341
|
+
```python
|
|
342
|
+
def show_progress(processed: int, total: int):
|
|
343
|
+
if total > 0:
|
|
344
|
+
percent = (processed / total) * 100
|
|
345
|
+
print(f"\rProgress: {percent:.1f}%", end="")
|
|
346
|
+
|
|
347
|
+
# Stream with progress callback
|
|
348
|
+
markdown = convert_to_markdown(html_content, stream_processing=True, chunk_size=4096, progress_callback=show_progress)
|
|
349
|
+
```
|
|
350
|
+
|
|
351
|
+
### Preprocessing API
|
|
352
|
+
|
|
353
|
+
The library provides functions for preprocessing HTML before conversion, useful for cleaning messy or complex HTML:
|
|
354
|
+
|
|
355
|
+
```python
|
|
356
|
+
from html_to_markdown import preprocess_html, create_preprocessor
|
|
357
|
+
|
|
358
|
+
# Direct preprocessing with custom options
|
|
359
|
+
cleaned_html = preprocess_html(
|
|
360
|
+
raw_html,
|
|
361
|
+
remove_navigation=True,
|
|
362
|
+
remove_forms=True,
|
|
363
|
+
remove_scripts=True,
|
|
364
|
+
remove_styles=True,
|
|
365
|
+
remove_comments=True,
|
|
366
|
+
preserve_semantic_structure=True,
|
|
367
|
+
preserve_tables=True,
|
|
368
|
+
preserve_media=True,
|
|
369
|
+
)
|
|
370
|
+
markdown = convert_to_markdown(cleaned_html)
|
|
371
|
+
|
|
372
|
+
# Create a preprocessor configuration from presets
|
|
373
|
+
config = create_preprocessor(preset="aggressive", preserve_tables=False)  # preset may also be "minimal" or "standard"; extra keyword arguments override the preset's settings
|
|
374
|
+
markdown = convert_to_markdown(html, **config)
|
|
375
|
+
```
|
|
376
|
+
|
|
377
|
+
### Exception Handling
|
|
378
|
+
|
|
379
|
+
The library provides specific exception classes for better error handling:
|
|
380
|
+
|
|
381
|
+
````python
|
|
382
|
+
from html_to_markdown import (
|
|
383
|
+
convert_to_markdown,
|
|
384
|
+
HtmlToMarkdownError,
|
|
385
|
+
EmptyHtmlError,
|
|
386
|
+
InvalidParserError,
|
|
387
|
+
ConflictingOptionsError,
|
|
388
|
+
MissingDependencyError
|
|
389
|
+
)
|
|
390
|
+
|
|
391
|
+
try:
|
|
392
|
+
markdown = convert_to_markdown(html, parser='lxml')
|
|
393
|
+
except MissingDependencyError:
|
|
394
|
+
# lxml not installed
|
|
395
|
+
markdown = convert_to_markdown(html, parser='html.parser')
|
|
396
|
+
except EmptyHtmlError:
|
|
397
|
+
print("No HTML content to convert")
|
|
398
|
+
except InvalidParserError as e:
|
|
399
|
+
print(f"Parser error: {e}")
|
|
400
|
+
except ConflictingOptionsError as e:
|
|
401
|
+
print(f"Conflicting options: {e}")
|
|
402
|
+
except HtmlToMarkdownError as e:
|
|
403
|
+
print(f"Conversion error: {e}")
|
|
404
|
+
|
|
323
405
|
## CLI Usage
|
|
324
406
|
|
|
325
407
|
Convert HTML files directly from the command line with full access to all API options:
|
|
@@ -340,7 +422,7 @@ html_to_markdown \
|
|
|
340
422
|
--preprocess-html \
|
|
341
423
|
--preprocessing-preset aggressive \
|
|
342
424
|
input.html > output.md
|
|
343
|
-
|
|
425
|
+
````
|
|
344
426
|
|
|
345
427
|
### Key CLI Options
|
|
346
428
|
|
|
@@ -353,6 +435,20 @@ html_to_markdown \
|
|
|
353
435
|
--whitespace-mode {normalized,strict} # Whitespace handling (default: normalized)
|
|
354
436
|
--heading-style {atx,atx_closed,underlined} # Header style
|
|
355
437
|
--no-extract-metadata # Disable metadata extraction
|
|
438
|
+
--br-in-tables # Use <br> tags for line breaks in table cells
|
|
439
|
+
--source-encoding ENCODING # Override auto-detected encoding (rarely needed)
|
|
440
|
+
```
|
|
441
|
+
|
|
442
|
+
**File Encoding:**
|
|
443
|
+
|
|
444
|
+
The CLI automatically detects file encoding in most cases. Use `--source-encoding` only when automatic detection fails (typically on some Windows systems or with unusual encodings):
|
|
445
|
+
|
|
446
|
+
```shell
|
|
447
|
+
# Override auto-detection for Latin-1 encoded file
|
|
448
|
+
html_to_markdown --source-encoding latin-1 input.html > output.md
|
|
449
|
+
|
|
450
|
+
# Force UTF-16 encoding when auto-detection fails
|
|
451
|
+
html_to_markdown --source-encoding utf-16 input.html > output.md
|
|
356
452
|
```
|
|
357
453
|
|
|
358
454
|
**All Available Options:**
|
|
@@ -393,6 +489,7 @@ The `markdownify` function is an alias for `convert_to_markdown` and provides id
|
|
|
393
489
|
- `newline_style` (str, default: `'spaces'`): Style for handling newlines (`'spaces'` or `'backslash'`)
|
|
394
490
|
- `sub_symbol` (str, default: `''`): Custom symbol for subscript text
|
|
395
491
|
- `sup_symbol` (str, default: `''`): Custom symbol for superscript text
|
|
492
|
+
- `br_in_tables` (bool, default: `False`): Use `<br>` tags for line breaks in table cells instead of spaces
|
|
396
493
|
|
|
397
494
|
### Parser Options
|
|
398
495
|
|
|
@@ -282,6 +282,88 @@ def converter(*, tag: Tag, text: str, **kwargs) -> str:
|
|
|
282
282
|
|
|
283
283
|
Custom converters take precedence over built-in converters and can be used alongside other configuration options.
|
|
284
284
|
|
|
285
|
+
### Streaming API
|
|
286
|
+
|
|
287
|
+
For processing large documents with memory constraints, use the streaming API:
|
|
288
|
+
|
|
289
|
+
```python
|
|
290
|
+
from html_to_markdown import convert_to_markdown_stream
|
|
291
|
+
|
|
292
|
+
# Process large HTML in chunks
|
|
293
|
+
with open("large_document.html", "r") as f:
|
|
294
|
+
html_content = f.read()
|
|
295
|
+
|
|
296
|
+
# Returns a generator that yields markdown chunks
|
|
297
|
+
for chunk in convert_to_markdown_stream(html_content, chunk_size=2048):
|
|
298
|
+
print(chunk, end="")
|
|
299
|
+
```
|
|
300
|
+
|
|
301
|
+
With progress tracking:
|
|
302
|
+
|
|
303
|
+
```python
|
|
304
|
+
def show_progress(processed: int, total: int):
|
|
305
|
+
if total > 0:
|
|
306
|
+
percent = (processed / total) * 100
|
|
307
|
+
print(f"\rProgress: {percent:.1f}%", end="")
|
|
308
|
+
|
|
309
|
+
# Stream with progress callback
|
|
310
|
+
markdown = convert_to_markdown(html_content, stream_processing=True, chunk_size=4096, progress_callback=show_progress)
|
|
311
|
+
```
|
|
312
|
+
|
|
313
|
+
### Preprocessing API
|
|
314
|
+
|
|
315
|
+
The library provides functions for preprocessing HTML before conversion, useful for cleaning messy or complex HTML:
|
|
316
|
+
|
|
317
|
+
```python
|
|
318
|
+
from html_to_markdown import preprocess_html, create_preprocessor
|
|
319
|
+
|
|
320
|
+
# Direct preprocessing with custom options
|
|
321
|
+
cleaned_html = preprocess_html(
|
|
322
|
+
raw_html,
|
|
323
|
+
remove_navigation=True,
|
|
324
|
+
remove_forms=True,
|
|
325
|
+
remove_scripts=True,
|
|
326
|
+
remove_styles=True,
|
|
327
|
+
remove_comments=True,
|
|
328
|
+
preserve_semantic_structure=True,
|
|
329
|
+
preserve_tables=True,
|
|
330
|
+
preserve_media=True,
|
|
331
|
+
)
|
|
332
|
+
markdown = convert_to_markdown(cleaned_html)
|
|
333
|
+
|
|
334
|
+
# Create a preprocessor configuration from presets
|
|
335
|
+
config = create_preprocessor(preset="aggressive", preserve_tables=False)  # preset may also be "minimal" or "standard"; extra keyword arguments override the preset's settings
|
|
336
|
+
markdown = convert_to_markdown(html, **config)
|
|
337
|
+
```
|
|
338
|
+
|
|
339
|
+
### Exception Handling
|
|
340
|
+
|
|
341
|
+
The library provides specific exception classes for better error handling:
|
|
342
|
+
|
|
343
|
+
````python
|
|
344
|
+
from html_to_markdown import (
|
|
345
|
+
convert_to_markdown,
|
|
346
|
+
HtmlToMarkdownError,
|
|
347
|
+
EmptyHtmlError,
|
|
348
|
+
InvalidParserError,
|
|
349
|
+
ConflictingOptionsError,
|
|
350
|
+
MissingDependencyError
|
|
351
|
+
)
|
|
352
|
+
|
|
353
|
+
try:
|
|
354
|
+
markdown = convert_to_markdown(html, parser='lxml')
|
|
355
|
+
except MissingDependencyError:
|
|
356
|
+
# lxml not installed
|
|
357
|
+
markdown = convert_to_markdown(html, parser='html.parser')
|
|
358
|
+
except EmptyHtmlError:
|
|
359
|
+
print("No HTML content to convert")
|
|
360
|
+
except InvalidParserError as e:
|
|
361
|
+
print(f"Parser error: {e}")
|
|
362
|
+
except ConflictingOptionsError as e:
|
|
363
|
+
print(f"Conflicting options: {e}")
|
|
364
|
+
except HtmlToMarkdownError as e:
|
|
365
|
+
print(f"Conversion error: {e}")
|
|
366
|
+
|
|
285
367
|
## CLI Usage
|
|
286
368
|
|
|
287
369
|
Convert HTML files directly from the command line with full access to all API options:
|
|
@@ -302,7 +384,7 @@ html_to_markdown \
|
|
|
302
384
|
--preprocess-html \
|
|
303
385
|
--preprocessing-preset aggressive \
|
|
304
386
|
input.html > output.md
|
|
305
|
-
|
|
387
|
+
````
|
|
306
388
|
|
|
307
389
|
### Key CLI Options
|
|
308
390
|
|
|
@@ -315,6 +397,20 @@ html_to_markdown \
|
|
|
315
397
|
--whitespace-mode {normalized,strict} # Whitespace handling (default: normalized)
|
|
316
398
|
--heading-style {atx,atx_closed,underlined} # Header style
|
|
317
399
|
--no-extract-metadata # Disable metadata extraction
|
|
400
|
+
--br-in-tables # Use <br> tags for line breaks in table cells
|
|
401
|
+
--source-encoding ENCODING # Override auto-detected encoding (rarely needed)
|
|
402
|
+
```
|
|
403
|
+
|
|
404
|
+
**File Encoding:**
|
|
405
|
+
|
|
406
|
+
The CLI automatically detects file encoding in most cases. Use `--source-encoding` only when automatic detection fails (typically on some Windows systems or with unusual encodings):
|
|
407
|
+
|
|
408
|
+
```shell
|
|
409
|
+
# Override auto-detection for Latin-1 encoded file
|
|
410
|
+
html_to_markdown --source-encoding latin-1 input.html > output.md
|
|
411
|
+
|
|
412
|
+
# Force UTF-16 encoding when auto-detection fails
|
|
413
|
+
html_to_markdown --source-encoding utf-16 input.html > output.md
|
|
318
414
|
```
|
|
319
415
|
|
|
320
416
|
**All Available Options:**
|
|
@@ -355,6 +451,7 @@ The `markdownify` function is an alias for `convert_to_markdown` and provides id
|
|
|
355
451
|
- `newline_style` (str, default: `'spaces'`): Style for handling newlines (`'spaces'` or `'backslash'`)
|
|
356
452
|
- `sub_symbol` (str, default: `''`): Custom symbol for subscript text
|
|
357
453
|
- `sup_symbol` (str, default: `''`): Custom symbol for superscript text
|
|
454
|
+
- `br_in_tables` (bool, default: `False`): Use `<br>` tags for line breaks in table cells instead of spaces
|
|
358
455
|
|
|
359
456
|
### Parser Options
|
|
360
457
|
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import sys
|
|
2
2
|
from argparse import ArgumentParser, FileType
|
|
3
|
+
from pathlib import Path
|
|
3
4
|
|
|
4
5
|
from html_to_markdown.constants import (
|
|
5
6
|
ASTERISK,
|
|
@@ -13,6 +14,7 @@ from html_to_markdown.constants import (
|
|
|
13
14
|
WHITESPACE_NORMALIZED,
|
|
14
15
|
WHITESPACE_STRICT,
|
|
15
16
|
)
|
|
17
|
+
from html_to_markdown.exceptions import InvalidEncodingError
|
|
16
18
|
from html_to_markdown.processing import convert_to_markdown
|
|
17
19
|
|
|
18
20
|
|
|
@@ -131,6 +133,12 @@ def main(argv: list[str]) -> str:
|
|
|
131
133
|
help="Parent tags where images remain inline (not converted to alt-text).",
|
|
132
134
|
)
|
|
133
135
|
|
|
136
|
+
parser.add_argument(
|
|
137
|
+
"--br-in-tables",
|
|
138
|
+
action="store_true",
|
|
139
|
+
help="Use <br> tags for line breaks in table cells instead of spaces.",
|
|
140
|
+
)
|
|
141
|
+
|
|
134
142
|
parser.add_argument("-w", "--wrap", action="store_true", help="Enable text wrapping at --wrap-width characters.")
|
|
135
143
|
|
|
136
144
|
parser.add_argument(
|
|
@@ -235,10 +243,18 @@ def main(argv: list[str]) -> str:
|
|
|
235
243
|
help="Keep navigation elements when preprocessing (normally removed).",
|
|
236
244
|
)
|
|
237
245
|
|
|
246
|
+
parser.add_argument(
|
|
247
|
+
"--source-encoding",
|
|
248
|
+
type=str,
|
|
249
|
+
default=None,
|
|
250
|
+
help="Source file encoding (e.g. 'utf-8', 'latin-1'). Defaults to system default.",
|
|
251
|
+
)
|
|
252
|
+
|
|
238
253
|
args = parser.parse_args(argv)
|
|
239
254
|
|
|
240
255
|
base_args = {
|
|
241
256
|
"autolinks": args.autolinks,
|
|
257
|
+
"br_in_tables": args.br_in_tables,
|
|
242
258
|
"bullets": args.bullets,
|
|
243
259
|
"code_language": args.code_language,
|
|
244
260
|
"convert": args.convert,
|
|
@@ -278,7 +294,7 @@ def main(argv: list[str]) -> str:
|
|
|
278
294
|
if args.show_progress:
|
|
279
295
|
|
|
280
296
|
def progress_callback(processed: int, total: int) -> None:
|
|
281
|
-
if total > 0:
|
|
297
|
+
if total > 0: # pragma: no cover
|
|
282
298
|
percent = (processed / total) * 100
|
|
283
299
|
|
|
284
300
|
sys.stderr.write(f"\rProgress: {percent:.1f}% ({processed}/{total} bytes)")
|
|
@@ -286,4 +302,14 @@ def main(argv: list[str]) -> str:
|
|
|
286
302
|
|
|
287
303
|
base_args["progress_callback"] = progress_callback
|
|
288
304
|
|
|
289
|
-
|
|
305
|
+
if args.source_encoding and args.html.name != "<stdin>":
|
|
306
|
+
args.html.close()
|
|
307
|
+
try:
|
|
308
|
+
with Path(args.html.name).open(encoding=args.source_encoding) as f:
|
|
309
|
+
html_content = f.read()
|
|
310
|
+
except LookupError as e:
|
|
311
|
+
raise InvalidEncodingError(args.source_encoding) from e
|
|
312
|
+
else:
|
|
313
|
+
html_content = args.html.read()
|
|
314
|
+
|
|
315
|
+
return convert_to_markdown(html_content, **base_args)
|