html-to-markdown 1.11.0__tar.gz → 1.12.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

Files changed (22)
  1. {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/PKG-INFO +99 -2
  2. {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/README.md +98 -1
  3. {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/html_to_markdown/cli.py +28 -2
  4. {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/html_to_markdown/converters.py +214 -127
  5. {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/html_to_markdown/exceptions.py +5 -0
  6. {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/html_to_markdown/preprocessor.py +96 -86
  7. {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/html_to_markdown/processing.py +36 -34
  8. {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/html_to_markdown/utils.py +1 -3
  9. {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/html_to_markdown/whitespace.py +7 -31
  10. {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/html_to_markdown.egg-info/PKG-INFO +99 -2
  11. {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/pyproject.toml +1 -1
  12. {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/LICENSE +0 -0
  13. {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/html_to_markdown/__init__.py +0 -0
  14. {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/html_to_markdown/__main__.py +0 -0
  15. {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/html_to_markdown/constants.py +0 -0
  16. {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/html_to_markdown/py.typed +0 -0
  17. {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/html_to_markdown.egg-info/SOURCES.txt +0 -0
  18. {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/html_to_markdown.egg-info/dependency_links.txt +0 -0
  19. {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/html_to_markdown.egg-info/entry_points.txt +0 -0
  20. {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/html_to_markdown.egg-info/requires.txt +0 -0
  21. {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/html_to_markdown.egg-info/top_level.txt +0 -0
  22. {html_to_markdown-1.11.0 → html_to_markdown-1.12.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: html-to-markdown
3
- Version: 1.11.0
3
+ Version: 1.12.0
4
4
  Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
5
5
  Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
6
6
  License: MIT
@@ -320,6 +320,88 @@ def converter(*, tag: Tag, text: str, **kwargs) -> str:
320
320
 
321
321
  Custom converters take precedence over built-in converters and can be used alongside other configuration options.
322
322
 
323
+ ### Streaming API
324
+
325
+ For processing large documents with memory constraints, use the streaming API:
326
+
327
+ ```python
328
+ from html_to_markdown import convert_to_markdown_stream
329
+
330
+ # Process large HTML in chunks
331
+ with open("large_document.html", "r") as f:
332
+ html_content = f.read()
333
+
334
+ # Returns a generator that yields markdown chunks
335
+ for chunk in convert_to_markdown_stream(html_content, chunk_size=2048):
336
+ print(chunk, end="")
337
+ ```
338
+
339
+ With progress tracking:
340
+
341
+ ```python
342
+ def show_progress(processed: int, total: int):
343
+ if total > 0:
344
+ percent = (processed / total) * 100
345
+ print(f"\rProgress: {percent:.1f}%", end="")
346
+
347
+ # Stream with progress callback
348
+ markdown = convert_to_markdown(html_content, stream_processing=True, chunk_size=4096, progress_callback=show_progress)
349
+ ```
350
+
351
+ ### Preprocessing API
352
+
353
+ The library provides functions for preprocessing HTML before conversion, useful for cleaning messy or complex HTML:
354
+
355
+ ```python
356
+ from html_to_markdown import preprocess_html, create_preprocessor
357
+
358
+ # Direct preprocessing with custom options
359
+ cleaned_html = preprocess_html(
360
+ raw_html,
361
+ remove_navigation=True,
362
+ remove_forms=True,
363
+ remove_scripts=True,
364
+ remove_styles=True,
365
+ remove_comments=True,
366
+ preserve_semantic_structure=True,
367
+ preserve_tables=True,
368
+ preserve_media=True,
369
+ )
370
+ markdown = convert_to_markdown(cleaned_html)
371
+
372
+ # Create a preprocessor configuration from presets
373
+ config = create_preprocessor(preset="aggressive", preserve_tables=False)  # preset may also be "minimal" or "standard"; extra keyword arguments override the preset's settings
374
+ markdown = convert_to_markdown(html, **config)
375
+ ```
376
+
377
+ ### Exception Handling
378
+
379
+ The library provides specific exception classes for better error handling:
380
+
381
+ ````python
382
+ from html_to_markdown import (
383
+ convert_to_markdown,
384
+ HtmlToMarkdownError,
385
+ EmptyHtmlError,
386
+ InvalidParserError,
387
+ ConflictingOptionsError,
388
+ MissingDependencyError
389
+ )
390
+
391
+ try:
392
+ markdown = convert_to_markdown(html, parser='lxml')
393
+ except MissingDependencyError:
394
+ # lxml not installed
395
+ markdown = convert_to_markdown(html, parser='html.parser')
396
+ except EmptyHtmlError:
397
+ print("No HTML content to convert")
398
+ except InvalidParserError as e:
399
+ print(f"Parser error: {e}")
400
+ except ConflictingOptionsError as e:
401
+ print(f"Conflicting options: {e}")
402
+ except HtmlToMarkdownError as e:
403
+ print(f"Conversion error: {e}")
404
+ ````
323
405
  ## CLI Usage
324
406
 
325
407
  Convert HTML files directly from the command line with full access to all API options:
@@ -340,7 +422,7 @@ html_to_markdown \
340
422
  --preprocess-html \
341
423
  --preprocessing-preset aggressive \
342
424
  input.html > output.md
343
- ```
425
+ ````
344
426
 
345
427
  ### Key CLI Options
346
428
 
@@ -353,6 +435,20 @@ html_to_markdown \
353
435
  --whitespace-mode {normalized,strict} # Whitespace handling (default: normalized)
354
436
  --heading-style {atx,atx_closed,underlined} # Header style
355
437
  --no-extract-metadata # Disable metadata extraction
438
+ --br-in-tables # Use <br> tags for line breaks in table cells
439
+ --source-encoding ENCODING # Override auto-detected encoding (rarely needed)
440
+ ```
441
+
442
+ **File Encoding:**
443
+
444
+ The CLI automatically detects file encoding in most cases. Use `--source-encoding` only when automatic detection fails (typically on some Windows systems or with unusual encodings):
445
+
446
+ ```shell
447
+ # Override auto-detection for Latin-1 encoded file
448
+ html_to_markdown --source-encoding latin-1 input.html > output.md
449
+
450
+ # Force UTF-16 encoding when auto-detection fails
451
+ html_to_markdown --source-encoding utf-16 input.html > output.md
356
452
  ```
357
453
 
358
454
  **All Available Options:**
@@ -393,6 +489,7 @@ The `markdownify` function is an alias for `convert_to_markdown` and provides id
393
489
  - `newline_style` (str, default: `'spaces'`): Style for handling newlines (`'spaces'` or `'backslash'`)
394
490
  - `sub_symbol` (str, default: `''`): Custom symbol for subscript text
395
491
  - `sup_symbol` (str, default: `''`): Custom symbol for superscript text
492
+ - `br_in_tables` (bool, default: `False`): Use `<br>` tags for line breaks in table cells instead of spaces
396
493
 
397
494
  ### Parser Options
398
495
 
@@ -282,6 +282,88 @@ def converter(*, tag: Tag, text: str, **kwargs) -> str:
282
282
 
283
283
  Custom converters take precedence over built-in converters and can be used alongside other configuration options.
284
284
 
285
+ ### Streaming API
286
+
287
+ For processing large documents with memory constraints, use the streaming API:
288
+
289
+ ```python
290
+ from html_to_markdown import convert_to_markdown_stream
291
+
292
+ # Process large HTML in chunks
293
+ with open("large_document.html", "r") as f:
294
+ html_content = f.read()
295
+
296
+ # Returns a generator that yields markdown chunks
297
+ for chunk in convert_to_markdown_stream(html_content, chunk_size=2048):
298
+ print(chunk, end="")
299
+ ```
300
+
301
+ With progress tracking:
302
+
303
+ ```python
304
+ def show_progress(processed: int, total: int):
305
+ if total > 0:
306
+ percent = (processed / total) * 100
307
+ print(f"\rProgress: {percent:.1f}%", end="")
308
+
309
+ # Stream with progress callback
310
+ markdown = convert_to_markdown(html_content, stream_processing=True, chunk_size=4096, progress_callback=show_progress)
311
+ ```
312
+
313
+ ### Preprocessing API
314
+
315
+ The library provides functions for preprocessing HTML before conversion, useful for cleaning messy or complex HTML:
316
+
317
+ ```python
318
+ from html_to_markdown import preprocess_html, create_preprocessor
319
+
320
+ # Direct preprocessing with custom options
321
+ cleaned_html = preprocess_html(
322
+ raw_html,
323
+ remove_navigation=True,
324
+ remove_forms=True,
325
+ remove_scripts=True,
326
+ remove_styles=True,
327
+ remove_comments=True,
328
+ preserve_semantic_structure=True,
329
+ preserve_tables=True,
330
+ preserve_media=True,
331
+ )
332
+ markdown = convert_to_markdown(cleaned_html)
333
+
334
+ # Create a preprocessor configuration from presets
335
+ config = create_preprocessor(preset="aggressive", preserve_tables=False)  # preset may also be "minimal" or "standard"; extra keyword arguments override the preset's settings
336
+ markdown = convert_to_markdown(html, **config)
337
+ ```
338
+
339
+ ### Exception Handling
340
+
341
+ The library provides specific exception classes for better error handling:
342
+
343
+ ````python
344
+ from html_to_markdown import (
345
+ convert_to_markdown,
346
+ HtmlToMarkdownError,
347
+ EmptyHtmlError,
348
+ InvalidParserError,
349
+ ConflictingOptionsError,
350
+ MissingDependencyError
351
+ )
352
+
353
+ try:
354
+ markdown = convert_to_markdown(html, parser='lxml')
355
+ except MissingDependencyError:
356
+ # lxml not installed
357
+ markdown = convert_to_markdown(html, parser='html.parser')
358
+ except EmptyHtmlError:
359
+ print("No HTML content to convert")
360
+ except InvalidParserError as e:
361
+ print(f"Parser error: {e}")
362
+ except ConflictingOptionsError as e:
363
+ print(f"Conflicting options: {e}")
364
+ except HtmlToMarkdownError as e:
365
+ print(f"Conversion error: {e}")
366
+ ````
285
367
  ## CLI Usage
286
368
 
287
369
  Convert HTML files directly from the command line with full access to all API options:
@@ -302,7 +384,7 @@ html_to_markdown \
302
384
  --preprocess-html \
303
385
  --preprocessing-preset aggressive \
304
386
  input.html > output.md
305
- ```
387
+ ````
306
388
 
307
389
  ### Key CLI Options
308
390
 
@@ -315,6 +397,20 @@ html_to_markdown \
315
397
  --whitespace-mode {normalized,strict} # Whitespace handling (default: normalized)
316
398
  --heading-style {atx,atx_closed,underlined} # Header style
317
399
  --no-extract-metadata # Disable metadata extraction
400
+ --br-in-tables # Use <br> tags for line breaks in table cells
401
+ --source-encoding ENCODING # Override auto-detected encoding (rarely needed)
402
+ ```
403
+
404
+ **File Encoding:**
405
+
406
+ The CLI automatically detects file encoding in most cases. Use `--source-encoding` only when automatic detection fails (typically on some Windows systems or with unusual encodings):
407
+
408
+ ```shell
409
+ # Override auto-detection for Latin-1 encoded file
410
+ html_to_markdown --source-encoding latin-1 input.html > output.md
411
+
412
+ # Force UTF-16 encoding when auto-detection fails
413
+ html_to_markdown --source-encoding utf-16 input.html > output.md
318
414
  ```
319
415
 
320
416
  **All Available Options:**
@@ -355,6 +451,7 @@ The `markdownify` function is an alias for `convert_to_markdown` and provides id
355
451
  - `newline_style` (str, default: `'spaces'`): Style for handling newlines (`'spaces'` or `'backslash'`)
356
452
  - `sub_symbol` (str, default: `''`): Custom symbol for subscript text
357
453
  - `sup_symbol` (str, default: `''`): Custom symbol for superscript text
454
+ - `br_in_tables` (bool, default: `False`): Use `<br>` tags for line breaks in table cells instead of spaces
358
455
 
359
456
  ### Parser Options
360
457
 
@@ -1,5 +1,6 @@
1
1
  import sys
2
2
  from argparse import ArgumentParser, FileType
3
+ from pathlib import Path
3
4
 
4
5
  from html_to_markdown.constants import (
5
6
  ASTERISK,
@@ -13,6 +14,7 @@ from html_to_markdown.constants import (
13
14
  WHITESPACE_NORMALIZED,
14
15
  WHITESPACE_STRICT,
15
16
  )
17
+ from html_to_markdown.exceptions import InvalidEncodingError
16
18
  from html_to_markdown.processing import convert_to_markdown
17
19
 
18
20
 
@@ -131,6 +133,12 @@ def main(argv: list[str]) -> str:
131
133
  help="Parent tags where images remain inline (not converted to alt-text).",
132
134
  )
133
135
 
136
+ parser.add_argument(
137
+ "--br-in-tables",
138
+ action="store_true",
139
+ help="Use <br> tags for line breaks in table cells instead of spaces.",
140
+ )
141
+
134
142
  parser.add_argument("-w", "--wrap", action="store_true", help="Enable text wrapping at --wrap-width characters.")
135
143
 
136
144
  parser.add_argument(
@@ -235,10 +243,18 @@ def main(argv: list[str]) -> str:
235
243
  help="Keep navigation elements when preprocessing (normally removed).",
236
244
  )
237
245
 
246
+ parser.add_argument(
247
+ "--source-encoding",
248
+ type=str,
249
+ default=None,
250
+ help="Source file encoding (e.g. 'utf-8', 'latin-1'). Defaults to system default.",
251
+ )
252
+
238
253
  args = parser.parse_args(argv)
239
254
 
240
255
  base_args = {
241
256
  "autolinks": args.autolinks,
257
+ "br_in_tables": args.br_in_tables,
242
258
  "bullets": args.bullets,
243
259
  "code_language": args.code_language,
244
260
  "convert": args.convert,
@@ -278,7 +294,7 @@ def main(argv: list[str]) -> str:
278
294
  if args.show_progress:
279
295
 
280
296
  def progress_callback(processed: int, total: int) -> None:
281
- if total > 0:
297
+ if total > 0: # pragma: no cover
282
298
  percent = (processed / total) * 100
283
299
 
284
300
  sys.stderr.write(f"\rProgress: {percent:.1f}% ({processed}/{total} bytes)")
@@ -286,4 +302,14 @@ def main(argv: list[str]) -> str:
286
302
 
287
303
  base_args["progress_callback"] = progress_callback
288
304
 
289
- return convert_to_markdown(args.html.read(), **base_args)
305
+ if args.source_encoding and args.html.name != "<stdin>":
306
+ args.html.close()
307
+ try:
308
+ with Path(args.html.name).open(encoding=args.source_encoding) as f:
309
+ html_content = f.read()
310
+ except LookupError as e:
311
+ raise InvalidEncodingError(args.source_encoding) from e
312
+ else:
313
+ html_content = args.html.read()
314
+
315
+ return convert_to_markdown(html_content, **base_args)