html-to-markdown 1.11.0__tar.gz → 1.12.1__tar.gz

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

Files changed (22)
  1. {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/PKG-INFO +143 -2
  2. {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/README.md +142 -1
  3. {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/html_to_markdown/cli.py +28 -2
  4. {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/html_to_markdown/converters.py +208 -130
  5. {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/html_to_markdown/exceptions.py +5 -0
  6. {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/html_to_markdown/preprocessor.py +96 -86
  7. {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/html_to_markdown/processing.py +63 -48
  8. {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/html_to_markdown/utils.py +1 -3
  9. {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/html_to_markdown/whitespace.py +23 -33
  10. {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/html_to_markdown.egg-info/PKG-INFO +143 -2
  11. {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/pyproject.toml +11 -1
  12. {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/LICENSE +0 -0
  13. {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/html_to_markdown/__init__.py +0 -0
  14. {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/html_to_markdown/__main__.py +0 -0
  15. {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/html_to_markdown/constants.py +0 -0
  16. {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/html_to_markdown/py.typed +0 -0
  17. {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/html_to_markdown.egg-info/SOURCES.txt +0 -0
  18. {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/html_to_markdown.egg-info/dependency_links.txt +0 -0
  19. {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/html_to_markdown.egg-info/entry_points.txt +0 -0
  20. {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/html_to_markdown.egg-info/requires.txt +0 -0
  21. {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/html_to_markdown.egg-info/top_level.txt +0 -0
  22. {html_to_markdown-1.11.0 → html_to_markdown-1.12.1}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: html-to-markdown
3
- Version: 1.11.0
3
+ Version: 1.12.1
4
4
  Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
5
5
  Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
6
6
  License: MIT
@@ -320,6 +320,132 @@ def converter(*, tag: Tag, text: str, **kwargs) -> str:
320
320
 
321
321
  Custom converters take precedence over built-in converters and can be used alongside other configuration options.
322
322
 
323
+ ### Streaming API
324
+
325
+ For processing large documents with memory constraints, use the streaming API:
326
+
327
+ ```python
328
+ from html_to_markdown import convert_to_markdown_stream
329
+
330
+ # Process large HTML in chunks
331
+ with open("large_document.html", "r") as f:
332
+ html_content = f.read()
333
+
334
+ # Returns a generator that yields markdown chunks
335
+ for chunk in convert_to_markdown_stream(html_content, chunk_size=2048):
336
+ print(chunk, end="")
337
+ ```
338
+
339
+ With progress tracking:
340
+
341
+ ```python
342
+ def show_progress(processed: int, total: int):
343
+ if total > 0:
344
+ percent = (processed / total) * 100
345
+ print(f"\rProgress: {percent:.1f}%", end="")
346
+
347
+ # Stream with progress callback
348
+ markdown = convert_to_markdown(html_content, stream_processing=True, chunk_size=4096, progress_callback=show_progress)
349
+ ```
350
+
351
+ #### When to Use Streaming vs Regular Processing
352
+
353
+ Based on comprehensive performance analysis, here are our recommendations:
354
+
355
+ **📄 Use Regular Processing When:**
356
+
357
+ - Files < 100KB (simplicity preferred)
358
+ - Simple scripts and one-off conversions
359
+ - Memory is not a concern
360
+ - You want the simplest API
361
+
362
+ **🌊 Use Streaming Processing When:**
363
+
364
+ - Files > 100KB (memory efficiency)
365
+ - Processing many files in batch
366
+ - Memory is constrained
367
+ - You need progress reporting
368
+ - You want to process results incrementally
369
+ - Running in production environments
370
+
371
+ **📋 Specific Recommendations by File Size:**
372
+
373
+ | File Size | Recommendation | Reason |
374
+ | ---------- | ----------------------------------------------- | -------------------------------------- |
375
+ | < 50KB | Regular (simplicity) or Streaming (3-5% faster) | Either works well |
376
+ | 50KB-100KB | Either (streaming slightly preferred) | Minimal difference |
377
+ | 100KB-1MB | Streaming preferred | Better performance + memory efficiency |
378
+ | > 1MB | Streaming strongly recommended | Significant memory advantages |
379
+
380
+ **🔧 Configuration Recommendations:**
381
+
382
+ - **Default chunk_size: 2048 bytes** (optimal performance balance)
383
+ - **For very large files (>10MB)**: Consider `chunk_size=4096`
384
+ - **For memory-constrained environments**: Use smaller chunks `chunk_size=1024`
385
+
386
+ **📈 Performance Benefits:**
387
+
388
+ Streaming provides consistent **3-5% performance improvement** across all file sizes:
389
+
390
+ - **Streaming throughput**: ~0.47-0.48 MB/s
391
+ - **Regular throughput**: ~0.44-0.47 MB/s
392
+ - **Memory usage**: Streaming uses less peak memory for large files
393
+ - **Latency**: Streaming allows processing results before completion
394
+
395
+ ### Preprocessing API
396
+
397
+ The library provides functions for preprocessing HTML before conversion, useful for cleaning messy or complex HTML:
398
+
399
+ ```python
400
+ from html_to_markdown import preprocess_html, create_preprocessor
401
+
402
+ # Direct preprocessing with custom options
403
+ cleaned_html = preprocess_html(
404
+ raw_html,
405
+ remove_navigation=True,
406
+ remove_forms=True,
407
+ remove_scripts=True,
408
+ remove_styles=True,
409
+ remove_comments=True,
410
+ preserve_semantic_structure=True,
411
+ preserve_tables=True,
412
+ preserve_media=True,
413
+ )
414
+ markdown = convert_to_markdown(cleaned_html)
415
+
416
+ # Create a preprocessor configuration from presets
417
+ config = create_preprocessor(preset="aggressive", preserve_tables=False)  # preset may also be "minimal" or "standard"; extra keyword arguments override preset settings
418
+ markdown = convert_to_markdown(html, **config)
419
+ ```
420
+
421
+ ### Exception Handling
422
+
423
+ The library provides specific exception classes for better error handling:
424
+
425
+ ````python
426
+ from html_to_markdown import (
427
+ convert_to_markdown,
428
+ HtmlToMarkdownError,
429
+ EmptyHtmlError,
430
+ InvalidParserError,
431
+ ConflictingOptionsError,
432
+ MissingDependencyError
433
+ )
434
+
435
+ try:
436
+ markdown = convert_to_markdown(html, parser='lxml')
437
+ except MissingDependencyError:
438
+ # lxml not installed
439
+ markdown = convert_to_markdown(html, parser='html.parser')
440
+ except EmptyHtmlError:
441
+ print("No HTML content to convert")
442
+ except InvalidParserError as e:
443
+ print(f"Parser error: {e}")
444
+ except ConflictingOptionsError as e:
445
+ print(f"Conflicting options: {e}")
446
+ except HtmlToMarkdownError as e:
447
+ print(f"Conversion error: {e}")
448
+
323
449
  ## CLI Usage
324
450
 
325
451
  Convert HTML files directly from the command line with full access to all API options:
@@ -340,7 +466,7 @@ html_to_markdown \
340
466
  --preprocess-html \
341
467
  --preprocessing-preset aggressive \
342
468
  input.html > output.md
343
- ```
469
+ ````
344
470
 
345
471
  ### Key CLI Options
346
472
 
@@ -353,6 +479,20 @@ html_to_markdown \
353
479
  --whitespace-mode {normalized,strict} # Whitespace handling (default: normalized)
354
480
  --heading-style {atx,atx_closed,underlined} # Header style
355
481
  --no-extract-metadata # Disable metadata extraction
482
+ --br-in-tables # Use <br> tags for line breaks in table cells
483
+ --source-encoding ENCODING # Override auto-detected encoding (rarely needed)
484
+ ```
485
+
486
+ **File Encoding:**
487
+
488
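+ Unless `--source-encoding` is given, the CLI reads the input with the system default encoding, which works in most cases. Use `--source-encoding` when that default does not match the file (typically on some Windows systems or with unusual encodings). The option applies only to file inputs; it is ignored when reading from standard input: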
489
+
490
+ ```shell
491
+ # Override auto-detection for Latin-1 encoded file
492
+ html_to_markdown --source-encoding latin-1 input.html > output.md
493
+
494
+ # Force UTF-16 encoding when auto-detection fails
495
+ html_to_markdown --source-encoding utf-16 input.html > output.md
356
496
  ```
357
497
 
358
498
  **All Available Options:**
@@ -393,6 +533,7 @@ The `markdownify` function is an alias for `convert_to_markdown` and provides id
393
533
  - `newline_style` (str, default: `'spaces'`): Style for handling newlines (`'spaces'` or `'backslash'`)
394
534
  - `sub_symbol` (str, default: `''`): Custom symbol for subscript text
395
535
  - `sup_symbol` (str, default: `''`): Custom symbol for superscript text
536
+ - `br_in_tables` (bool, default: `False`): Use `<br>` tags for line breaks in table cells instead of spaces
396
537
 
397
538
  ### Parser Options
398
539
 
@@ -282,6 +282,132 @@ def converter(*, tag: Tag, text: str, **kwargs) -> str:
282
282
 
283
283
  Custom converters take precedence over built-in converters and can be used alongside other configuration options.
284
284
 
285
+ ### Streaming API
286
+
287
+ For processing large documents with memory constraints, use the streaming API:
288
+
289
+ ```python
290
+ from html_to_markdown import convert_to_markdown_stream
291
+
292
+ # Process large HTML in chunks
293
+ with open("large_document.html", "r") as f:
294
+ html_content = f.read()
295
+
296
+ # Returns a generator that yields markdown chunks
297
+ for chunk in convert_to_markdown_stream(html_content, chunk_size=2048):
298
+ print(chunk, end="")
299
+ ```
300
+
301
+ With progress tracking:
302
+
303
+ ```python
304
+ def show_progress(processed: int, total: int):
305
+ if total > 0:
306
+ percent = (processed / total) * 100
307
+ print(f"\rProgress: {percent:.1f}%", end="")
308
+
309
+ # Stream with progress callback
310
+ markdown = convert_to_markdown(html_content, stream_processing=True, chunk_size=4096, progress_callback=show_progress)
311
+ ```
312
+
313
+ #### When to Use Streaming vs Regular Processing
314
+
315
+ Based on comprehensive performance analysis, here are our recommendations:
316
+
317
+ **📄 Use Regular Processing When:**
318
+
319
+ - Files < 100KB (simplicity preferred)
320
+ - Simple scripts and one-off conversions
321
+ - Memory is not a concern
322
+ - You want the simplest API
323
+
324
+ **🌊 Use Streaming Processing When:**
325
+
326
+ - Files > 100KB (memory efficiency)
327
+ - Processing many files in batch
328
+ - Memory is constrained
329
+ - You need progress reporting
330
+ - You want to process results incrementally
331
+ - Running in production environments
332
+
333
+ **📋 Specific Recommendations by File Size:**
334
+
335
+ | File Size | Recommendation | Reason |
336
+ | ---------- | ----------------------------------------------- | -------------------------------------- |
337
+ | < 50KB | Regular (simplicity) or Streaming (3-5% faster) | Either works well |
338
+ | 50KB-100KB | Either (streaming slightly preferred) | Minimal difference |
339
+ | 100KB-1MB | Streaming preferred | Better performance + memory efficiency |
340
+ | > 1MB | Streaming strongly recommended | Significant memory advantages |
341
+
342
+ **🔧 Configuration Recommendations:**
343
+
344
+ - **Default chunk_size: 2048 bytes** (optimal performance balance)
345
+ - **For very large files (>10MB)**: Consider `chunk_size=4096`
346
+ - **For memory-constrained environments**: Use smaller chunks `chunk_size=1024`
347
+
348
+ **📈 Performance Benefits:**
349
+
350
+ Streaming provides consistent **3-5% performance improvement** across all file sizes:
351
+
352
+ - **Streaming throughput**: ~0.47-0.48 MB/s
353
+ - **Regular throughput**: ~0.44-0.47 MB/s
354
+ - **Memory usage**: Streaming uses less peak memory for large files
355
+ - **Latency**: Streaming allows processing results before completion
356
+
357
+ ### Preprocessing API
358
+
359
+ The library provides functions for preprocessing HTML before conversion, useful for cleaning messy or complex HTML:
360
+
361
+ ```python
362
+ from html_to_markdown import preprocess_html, create_preprocessor
363
+
364
+ # Direct preprocessing with custom options
365
+ cleaned_html = preprocess_html(
366
+ raw_html,
367
+ remove_navigation=True,
368
+ remove_forms=True,
369
+ remove_scripts=True,
370
+ remove_styles=True,
371
+ remove_comments=True,
372
+ preserve_semantic_structure=True,
373
+ preserve_tables=True,
374
+ preserve_media=True,
375
+ )
376
+ markdown = convert_to_markdown(cleaned_html)
377
+
378
+ # Create a preprocessor configuration from presets
379
+ config = create_preprocessor(preset="aggressive", preserve_tables=False)  # preset may also be "minimal" or "standard"; extra keyword arguments override preset settings
380
+ markdown = convert_to_markdown(html, **config)
381
+ ```
382
+
383
+ ### Exception Handling
384
+
385
+ The library provides specific exception classes for better error handling:
386
+
387
+ ````python
388
+ from html_to_markdown import (
389
+ convert_to_markdown,
390
+ HtmlToMarkdownError,
391
+ EmptyHtmlError,
392
+ InvalidParserError,
393
+ ConflictingOptionsError,
394
+ MissingDependencyError
395
+ )
396
+
397
+ try:
398
+ markdown = convert_to_markdown(html, parser='lxml')
399
+ except MissingDependencyError:
400
+ # lxml not installed
401
+ markdown = convert_to_markdown(html, parser='html.parser')
402
+ except EmptyHtmlError:
403
+ print("No HTML content to convert")
404
+ except InvalidParserError as e:
405
+ print(f"Parser error: {e}")
406
+ except ConflictingOptionsError as e:
407
+ print(f"Conflicting options: {e}")
408
+ except HtmlToMarkdownError as e:
409
+ print(f"Conversion error: {e}")
410
+
285
411
  ## CLI Usage
286
412
 
287
413
  Convert HTML files directly from the command line with full access to all API options:
@@ -302,7 +428,7 @@ html_to_markdown \
302
428
  --preprocess-html \
303
429
  --preprocessing-preset aggressive \
304
430
  input.html > output.md
305
- ```
431
+ ````
306
432
 
307
433
  ### Key CLI Options
308
434
 
@@ -315,6 +441,20 @@ html_to_markdown \
315
441
  --whitespace-mode {normalized,strict} # Whitespace handling (default: normalized)
316
442
  --heading-style {atx,atx_closed,underlined} # Header style
317
443
  --no-extract-metadata # Disable metadata extraction
444
+ --br-in-tables # Use <br> tags for line breaks in table cells
445
+ --source-encoding ENCODING # Override auto-detected encoding (rarely needed)
446
+ ```
447
+
448
+ **File Encoding:**
449
+
450
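+ Unless `--source-encoding` is given, the CLI reads the input with the system default encoding, which works in most cases. Use `--source-encoding` when that default does not match the file (typically on some Windows systems or with unusual encodings). The option applies only to file inputs; it is ignored when reading from standard input: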
451
+
452
+ ```shell
453
+ # Override auto-detection for Latin-1 encoded file
454
+ html_to_markdown --source-encoding latin-1 input.html > output.md
455
+
456
+ # Force UTF-16 encoding when auto-detection fails
457
+ html_to_markdown --source-encoding utf-16 input.html > output.md
318
458
  ```
319
459
 
320
460
  **All Available Options:**
@@ -355,6 +495,7 @@ The `markdownify` function is an alias for `convert_to_markdown` and provides id
355
495
  - `newline_style` (str, default: `'spaces'`): Style for handling newlines (`'spaces'` or `'backslash'`)
356
496
  - `sub_symbol` (str, default: `''`): Custom symbol for subscript text
357
497
  - `sup_symbol` (str, default: `''`): Custom symbol for superscript text
498
+ - `br_in_tables` (bool, default: `False`): Use `<br>` tags for line breaks in table cells instead of spaces
358
499
 
359
500
  ### Parser Options
360
501
 
@@ -1,5 +1,6 @@
1
1
  import sys
2
2
  from argparse import ArgumentParser, FileType
3
+ from pathlib import Path
3
4
 
4
5
  from html_to_markdown.constants import (
5
6
  ASTERISK,
@@ -13,6 +14,7 @@ from html_to_markdown.constants import (
13
14
  WHITESPACE_NORMALIZED,
14
15
  WHITESPACE_STRICT,
15
16
  )
17
+ from html_to_markdown.exceptions import InvalidEncodingError
16
18
  from html_to_markdown.processing import convert_to_markdown
17
19
 
18
20
 
@@ -131,6 +133,12 @@ def main(argv: list[str]) -> str:
131
133
  help="Parent tags where images remain inline (not converted to alt-text).",
132
134
  )
133
135
 
136
+ parser.add_argument(
137
+ "--br-in-tables",
138
+ action="store_true",
139
+ help="Use <br> tags for line breaks in table cells instead of spaces.",
140
+ )
141
+
134
142
  parser.add_argument("-w", "--wrap", action="store_true", help="Enable text wrapping at --wrap-width characters.")
135
143
 
136
144
  parser.add_argument(
@@ -235,10 +243,18 @@ def main(argv: list[str]) -> str:
235
243
  help="Keep navigation elements when preprocessing (normally removed).",
236
244
  )
237
245
 
246
+ parser.add_argument(
247
+ "--source-encoding",
248
+ type=str,
249
+ default=None,
250
+ help="Source file encoding (e.g. 'utf-8', 'latin-1'). Defaults to system default.",
251
+ )
252
+
238
253
  args = parser.parse_args(argv)
239
254
 
240
255
  base_args = {
241
256
  "autolinks": args.autolinks,
257
+ "br_in_tables": args.br_in_tables,
242
258
  "bullets": args.bullets,
243
259
  "code_language": args.code_language,
244
260
  "convert": args.convert,
@@ -278,7 +294,7 @@ def main(argv: list[str]) -> str:
278
294
  if args.show_progress:
279
295
 
280
296
  def progress_callback(processed: int, total: int) -> None:
281
- if total > 0:
297
+ if total > 0: # pragma: no cover
282
298
  percent = (processed / total) * 100
283
299
 
284
300
  sys.stderr.write(f"\rProgress: {percent:.1f}% ({processed}/{total} bytes)")
@@ -286,4 +302,14 @@ def main(argv: list[str]) -> str:
286
302
 
287
303
  base_args["progress_callback"] = progress_callback
288
304
 
289
- return convert_to_markdown(args.html.read(), **base_args)
305
+ if args.source_encoding and args.html.name != "<stdin>":
306
+ args.html.close()
307
+ try:
308
+ with Path(args.html.name).open(encoding=args.source_encoding) as f:
309
+ html_content = f.read()
310
+ except LookupError as e:
311
+ raise InvalidEncodingError(args.source_encoding) from e
312
+ else:
313
+ html_content = args.html.read()
314
+
315
+ return convert_to_markdown(html_content, **base_args)