html-to-markdown 1.13.0__py3-none-any.whl → 1.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

html_to_markdown/cli.py CHANGED
@@ -1,5 +1,5 @@
1
1
  import sys
2
- from argparse import ArgumentParser, FileType
2
+ from argparse import ArgumentParser
3
3
  from pathlib import Path
4
4
 
5
5
  from html_to_markdown.constants import (
@@ -27,8 +27,7 @@ def main(argv: list[str]) -> str:
27
27
  parser.add_argument(
28
28
  "html",
29
29
  nargs="?",
30
- type=FileType("r"),
31
- default=sys.stdin,
30
+ default="-",
32
31
  help="The HTML file to convert. Defaults to STDIN if not provided.",
33
32
  )
34
33
 
@@ -247,7 +246,7 @@ def main(argv: list[str]) -> str:
247
246
  "--source-encoding",
248
247
  type=str,
249
248
  default=None,
250
- help="Source file encoding (e.g. 'utf-8', 'latin-1'). Defaults to system default.",
249
+ help="Encoding for reading input files and decoding bytes (e.g. 'utf-8', 'latin-1'). Default: utf-8.",
251
250
  )
252
251
 
253
252
  args = parser.parse_args(argv)
@@ -260,6 +259,7 @@ def main(argv: list[str]) -> str:
260
259
  "convert": args.convert,
261
260
  "convert_as_inline": args.convert_as_inline,
262
261
  "default_title": args.default_title,
262
+ "source_encoding": args.source_encoding,
263
263
  "escape_asterisks": args.escape_asterisks,
264
264
  "escape_misc": args.escape_misc,
265
265
  "escape_underscores": args.escape_underscores,
@@ -302,14 +302,20 @@ def main(argv: list[str]) -> str:
302
302
 
303
303
  base_args["progress_callback"] = progress_callback
304
304
 
305
- if args.source_encoding and args.html.name != "<stdin>":
306
- args.html.close()
307
- try:
308
- with Path(args.html.name).open(encoding=args.source_encoding) as f:
309
- html_content = f.read()
310
- except LookupError as e:
311
- raise InvalidEncodingError(args.source_encoding) from e
305
+ if args.html == "-":
306
+ html_content = sys.stdin.buffer.read()
312
307
  else:
313
- html_content = args.html.read()
308
+ try:
309
+ file_path = Path(args.html)
310
+ if args.source_encoding:
311
+ with file_path.open(encoding=args.source_encoding, errors="replace") as f:
312
+ html_content = f.read()
313
+ else:
314
+ with file_path.open("rb") as f:
315
+ html_content = f.read()
316
+ except (OSError, LookupError) as e:
317
+ if isinstance(e, LookupError):
318
+ raise InvalidEncodingError(args.source_encoding) from e
319
+ raise
314
320
 
315
321
  return convert_to_markdown(html_content, **base_args)
@@ -414,8 +414,6 @@ def _convert_li(*, tag: Tag, text: str, bullets: str, list_indent_str: str) -> s
414
414
 
415
415
  return "".join(result_parts)
416
416
 
417
- # Ensure consistent whitespace handling for list items, especially with strip_newlines=True
418
- # Strip any leading whitespace that may have been inherited from parent containers
419
417
  clean_text = (text or "").strip()
420
418
  return f"{bullet} {clean_text}\n"
421
419
 
@@ -445,13 +445,14 @@ def _format_metadata_comment(metadata: dict[str, str]) -> str:
445
445
 
446
446
 
447
447
  def convert_to_markdown(
448
- source: str | BeautifulSoup,
448
+ source: str | bytes | BeautifulSoup,
449
449
  *,
450
450
  stream_processing: bool = False,
451
451
  chunk_size: int = 1024,
452
452
  chunk_callback: Callable[[str], None] | None = None,
453
453
  progress_callback: Callable[[int, int], None] | None = None,
454
454
  parser: str | None = None,
455
+ source_encoding: str = "utf-8",
455
456
  autolinks: bool = True,
456
457
  br_in_tables: bool = False,
457
458
  bullets: str = "*+-",
@@ -489,12 +490,13 @@ def convert_to_markdown(
489
490
  various customization options for controlling the conversion behavior.
490
491
 
491
492
  Args:
492
- source: HTML string or BeautifulSoup object to convert.
493
+ source: HTML string, bytes, or BeautifulSoup object to convert.
493
494
  stream_processing: Enable streaming mode for large documents.
494
495
  chunk_size: Size of chunks for streaming processing.
495
496
  chunk_callback: Callback for processing chunks in streaming mode.
496
497
  progress_callback: Callback for progress updates (current, total).
497
498
  parser: HTML parser to use ('html.parser', 'lxml', 'html5lib').
499
+ source_encoding: Character encoding to use when decoding bytes (default: 'utf-8').
498
500
  autolinks: Convert URLs to automatic links.
499
501
  br_in_tables: Use <br> tags for line breaks in table cells instead of spaces.
500
502
  bullets: Characters to use for unordered list bullets.
@@ -548,11 +550,12 @@ def convert_to_markdown(
548
550
  >>> convert_to_markdown(html, list_indent_width=2)
549
551
  '* Item 1\\n* Item 2\\n\\n'
550
552
  """
551
- # Initialize original input string for Windows lxml fix
552
553
  original_input_str = None
553
554
 
555
+ if isinstance(source, bytes):
556
+ source = source.decode(source_encoding or "utf-8", errors="replace")
557
+
554
558
  if isinstance(source, str):
555
- # Store original string for plain text detection (Windows lxml fix)
556
559
  original_input_str = source
557
560
 
558
561
  if (
@@ -613,6 +616,37 @@ def convert_to_markdown(
613
616
  new_text = NavigableString(leading_ws + str(first_child))
614
617
  first_child.replace_with(new_text)
615
618
  needs_leading_space_fix = False
619
+
620
+ # Fix html5lib whitespace handling to match other parsers
621
+ if parser == "html5lib":
622
+ body = source.find("body")
623
+ if body and isinstance(body, Tag):
624
+ children = list(body.children)
625
+
626
+ if (
627
+ len(children) == 1
628
+ and isinstance(children[0], NavigableString)
629
+ and original_source.startswith((" ", "\t", "\n", "\r"))
630
+ and not str(children[0]).startswith((" ", "\t", "\n", "\r"))
631
+ ):
632
+ first_child = children[0]
633
+ original_text = str(first_child)
634
+
635
+ # Preserve leading whitespace from original if html5lib stripped it
636
+ leading_ws = ""
637
+ for char in original_source:
638
+ if char in " \t\n\r":
639
+ leading_ws += char
640
+ else:
641
+ break
642
+
643
+ # Create normalized text: restore leading whitespace only
644
+ normalized_text = original_text
645
+ if leading_ws and not normalized_text.startswith(leading_ws):
646
+ normalized_text = leading_ws + normalized_text
647
+
648
+ new_text = NavigableString(normalized_text)
649
+ first_child.replace_with(new_text)
616
650
  else:
617
651
  raise EmptyHtmlError
618
652
 
@@ -626,6 +660,7 @@ def convert_to_markdown(
626
660
  chunk_size=chunk_size,
627
661
  progress_callback=progress_callback,
628
662
  parser=parser,
663
+ source_encoding=source_encoding,
629
664
  autolinks=autolinks,
630
665
  bullets=bullets,
631
666
  code_language=code_language,
@@ -673,6 +708,7 @@ def convert_to_markdown(
673
708
  sink,
674
709
  whitespace_handler=whitespace_handler,
675
710
  parser=parser,
711
+ source_encoding=source_encoding,
676
712
  autolinks=autolinks,
677
713
  br_in_tables=br_in_tables,
678
714
  bullets=bullets,
@@ -703,8 +739,6 @@ def convert_to_markdown(
703
739
 
704
740
  result = sink.get_result()
705
741
 
706
- # Parser-agnostic behavior: handle leading whitespace differences between parsers
707
- # lxml may either add unwanted whitespace or strip meaningful whitespace compared to html.parser
708
742
  if "needs_leading_whitespace_fix" in locals() and needs_leading_whitespace_fix:
709
743
  original_input = sink.original_source if hasattr(sink, "original_source") else original_source
710
744
  if isinstance(original_input, str):
@@ -713,19 +747,14 @@ def convert_to_markdown(
713
747
  original_leading_whitespace_match.group(0) if original_leading_whitespace_match else ""
714
748
  )
715
749
 
716
- # Case 1: lxml added leading newlines (like "\n<figure>") - strip them
717
750
  if result.startswith("\n") and not original_input.lstrip().startswith(result.strip()):
718
751
  result = result.lstrip("\n\r")
719
752
 
720
- # Case 2: lxml stripped meaningful leading whitespace (like " <b>") - restore it
721
- # However, don't restore whitespace if strip_newlines=True was used, as the user
722
- # explicitly requested to remove formatting whitespace
723
753
  elif (
724
754
  not strip_newlines
725
755
  and not result.startswith((" ", "\t"))
726
756
  and original_leading_whitespace.startswith((" ", "\t"))
727
757
  ):
728
- # Only restore spaces/tabs, not newlines (which are usually formatting)
729
758
  leading_spaces_tabs_match = re.match(r"^[ \t]*", original_leading_whitespace)
730
759
  leading_spaces_tabs = leading_spaces_tabs_match.group(0) if leading_spaces_tabs_match else ""
731
760
  if leading_spaces_tabs:
@@ -758,9 +787,6 @@ def convert_to_markdown(
758
787
  if convert_as_inline:
759
788
  result = result.rstrip("\n")
760
789
 
761
- # Windows-specific fix: For plain text input (no HTML tags), lxml may add extra trailing newlines
762
- # This ensures consistent behavior across platforms when processing plain text
763
- # Only apply to cases where lxml adds extra newlines (\n\n) at the end
764
790
  if (
765
791
  "original_input_str" in locals()
766
792
  and original_input_str
@@ -768,19 +794,11 @@ def convert_to_markdown(
768
794
  and not original_input_str.strip().endswith(">")
769
795
  and result.endswith("\n\n")
770
796
  ):
771
- # Input appears to be plain text, not HTML - normalize trailing newlines only
772
797
  result = result.rstrip("\n")
773
798
 
774
- # If the original input contained no block-level elements, normalize any
775
- # accidental trailing newlines for cross-platform consistency.
776
- # This guards cases like inline-only inputs (e.g., "text <strong>bold</strong>")
777
- # and head-only documents (e.g., "<head>head</head>") where output should
778
- # not end with extra blank lines.
779
799
  if "original_input_str" in locals() and original_input_str:
780
800
  from html_to_markdown.whitespace import BLOCK_ELEMENTS # noqa: PLC0415
781
801
 
782
- # Treat additional tags as block-producing for trailing newline purposes.
783
- # These may be inline in HTML spec but produce block output in our Markdown conversion.
784
802
  blockish = set(BLOCK_ELEMENTS) | {
785
803
  "textarea",
786
804
  "dialog",
@@ -880,11 +898,12 @@ class StreamingSink(OutputSink):
880
898
 
881
899
 
882
900
  def _process_html_core(
883
- source: str | BeautifulSoup,
901
+ source: str | bytes | BeautifulSoup,
884
902
  sink: OutputSink,
885
903
  *,
886
904
  whitespace_handler: WhitespaceHandler,
887
905
  parser: str | None = None,
906
+ source_encoding: str = "utf-8",
888
907
  autolinks: bool,
889
908
  br_in_tables: bool,
890
909
  bullets: str,
@@ -915,7 +934,12 @@ def _process_html_core(
915
934
  token = _ancestor_cache.set({})
916
935
 
917
936
  try:
918
- if isinstance(source, str):
937
+ if isinstance(source, (str, bytes)):
938
+ original_source = source
939
+ if isinstance(source, bytes):
940
+ source = source.decode(source_encoding or "utf-8", errors="replace")
941
+ original_source = source
942
+
919
943
  if strip_newlines:
920
944
  source = source.replace("\n", " ").replace("\r", " ") # pragma: no cover
921
945
 
@@ -926,7 +950,36 @@ def _process_html_core(
926
950
  if parser == "lxml" and not LXML_AVAILABLE: # pragma: no cover
927
951
  raise MissingDependencyError("lxml", "pip install html-to-markdown[lxml]")
928
952
 
953
+ needs_leading_whitespace_fix = (
954
+ parser == "lxml"
955
+ and isinstance(original_source, str)
956
+ and original_source.startswith((" ", "\t", "\n", "\r"))
957
+ )
958
+
929
959
  source = BeautifulSoup(source, parser)
960
+
961
+ if parser == "lxml" and needs_leading_whitespace_fix and isinstance(original_source, str):
962
+ body = source.find("body")
963
+ if body and isinstance(body, Tag):
964
+ children = list(body.children)
965
+
966
+ if (
967
+ len(children) == 1
968
+ and isinstance(children[0], NavigableString)
969
+ and original_source.startswith((" ", "\t", "\n", "\r"))
970
+ and not str(children[0]).startswith((" ", "\t", "\n", "\r"))
971
+ ):
972
+ first_child = children[0]
973
+
974
+ leading_ws = ""
975
+ for char in original_source:
976
+ if char in " \t":
977
+ leading_ws += char
978
+ else:
979
+ break
980
+
981
+ new_text = NavigableString(leading_ws + str(first_child))
982
+ first_child.replace_with(new_text)
930
983
  else:
931
984
  raise EmptyHtmlError
932
985
 
@@ -998,11 +1051,12 @@ def _process_html_core(
998
1051
 
999
1052
 
1000
1053
  def convert_to_markdown_stream(
1001
- source: str | BeautifulSoup,
1054
+ source: str | bytes | BeautifulSoup,
1002
1055
  *,
1003
1056
  chunk_size: int = 1024,
1004
1057
  progress_callback: Callable[[int, int], None] | None = None,
1005
1058
  parser: str | None = None,
1059
+ source_encoding: str = "utf-8",
1006
1060
  autolinks: bool = True,
1007
1061
  br_in_tables: bool = False,
1008
1062
  bullets: str = "*+-",
@@ -1022,6 +1076,10 @@ def convert_to_markdown_stream(
1022
1076
  list_indent_type: Literal["spaces", "tabs"] = "spaces",
1023
1077
  list_indent_width: int = 4,
1024
1078
  newline_style: Literal["spaces", "backslash"] = SPACES,
1079
+ preprocess_html: bool = False,
1080
+ preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
1081
+ remove_forms: bool = True,
1082
+ remove_navigation: bool = True,
1025
1083
  strip: str | Iterable[str] | None = None,
1026
1084
  strip_newlines: bool = False,
1027
1085
  strong_em_symbol: Literal["*", "_"] = ASTERISK,
@@ -1033,8 +1091,22 @@ def convert_to_markdown_stream(
1033
1091
  ) -> Generator[str, None, None]:
1034
1092
  sink = StreamingSink(chunk_size, progress_callback)
1035
1093
 
1036
- if isinstance(source, str):
1037
- sink.total_bytes = len(source)
1094
+ if isinstance(source, bytes):
1095
+ source = source.decode(source_encoding or "utf-8", errors="replace")
1096
+
1097
+ if isinstance(source, str) and preprocess_html and create_preprocessor is not None and preprocess_fn is not None:
1098
+ config = create_preprocessor(
1099
+ preset=preprocessing_preset,
1100
+ remove_navigation=remove_navigation,
1101
+ remove_forms=remove_forms,
1102
+ )
1103
+ source = preprocess_fn(source, **config)
1104
+
1105
+ if isinstance(source, (str, bytes)):
1106
+ if isinstance(source, bytes):
1107
+ sink.total_bytes = len(source)
1108
+ else:
1109
+ sink.total_bytes = len(source)
1038
1110
  elif isinstance(source, BeautifulSoup):
1039
1111
  sink.total_bytes = len(str(source))
1040
1112
 
@@ -1045,6 +1117,7 @@ def convert_to_markdown_stream(
1045
1117
  sink,
1046
1118
  whitespace_handler=whitespace_handler,
1047
1119
  parser=parser,
1120
+ source_encoding=source_encoding,
1048
1121
  autolinks=autolinks,
1049
1122
  br_in_tables=br_in_tables,
1050
1123
  bullets=bullets,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: html-to-markdown
3
- Version: 1.13.0
3
+ Version: 1.14.0
4
4
  Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
5
5
  Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
6
6
  License: MIT
@@ -32,6 +32,8 @@ Description-Content-Type: text/markdown
32
32
  License-File: LICENSE
33
33
  Requires-Dist: beautifulsoup4>=4.13.5
34
34
  Requires-Dist: nh3>=0.3
35
+ Provides-Extra: html5lib
36
+ Requires-Dist: beautifulsoup4[html5lib]>=4.13.5; extra == "html5lib"
35
37
  Provides-Extra: lxml
36
38
  Requires-Dist: beautifulsoup4[lxml]>=4.13.5; extra == "lxml"
37
39
  Dynamic: license-file
@@ -40,7 +42,7 @@ Dynamic: license-file
40
42
 
41
43
  A modern, fully typed Python library for converting HTML to Markdown. This library is a completely rewritten fork
42
44
  of [markdownify](https://pypi.org/project/markdownify/) with a modernized codebase, strict type safety and support for
43
- Python 3.9+.
45
+ Python 3.10+.
44
46
 
45
47
  ## Support This Project
46
48
 
@@ -64,8 +66,10 @@ Your support helps maintain and improve this library for the community.
64
66
  - **Custom Converters**: Extensible converter system for custom HTML tag handling
65
67
  - **List Formatting**: Configurable list indentation with Discord/Slack compatibility
66
68
  - **HTML Preprocessing**: Clean messy HTML with configurable aggressiveness levels
69
+ - **Bytes Input Support**: Direct handling of bytes input with automatic encoding detection and configurable source encoding
67
70
  - **Whitespace Control**: Normalized or strict whitespace preservation modes
68
71
  - **BeautifulSoup Integration**: Support for pre-configured BeautifulSoup instances
72
+ - **Parser Normalization**: Consistent output across all supported parsers (html.parser, lxml, html5lib)
69
73
  - **Robustly Tested**: Comprehensive unit tests and integration tests covering all conversion scenarios
70
74
 
71
75
  ## Installation
@@ -74,17 +78,27 @@ Your support helps maintain and improve this library for the community.
74
78
  pip install html-to-markdown
75
79
  ```
76
80
 
77
- ### Optional lxml Parser
81
+ ### Optional Parsers
78
82
 
79
- For improved performance, you can install with the optional lxml parser:
83
+ For improved performance and compatibility, you can install with optional parsers:
80
84
 
81
85
  ```shell
86
+ # Fast lxml parser (recommended)
82
87
  pip install html-to-markdown[lxml]
88
+
89
+ # Standards-compliant html5lib parser
90
+ pip install html-to-markdown[html5lib]
83
91
  ```
84
92
 
85
- The lxml parser offers faster HTML parsing and better handling of malformed HTML compared to the default html.parser.
93
+ **Parser Options:**
94
+
95
+ - **html.parser** (default): Built-in Python parser, no dependencies
96
+ - **lxml**: **Recommended** - Fastest parser with good malformed HTML handling
97
+ - **html5lib**: Most standards-compliant, handles edge cases best
86
98
 
87
- The library automatically uses lxml when available. You can explicitly specify a parser using the `parser` parameter.
99
+ The library automatically uses lxml when available and **normalizes output to ensure consistent results regardless of parser choice**. We recommend using the **lxml parser for optimal performance** - it's significantly faster than the other options while maintaining excellent compatibility.
100
+
101
+ You can explicitly specify a parser using the `parser` parameter.
88
102
 
89
103
  ## Quick Start
90
104
 
@@ -149,6 +163,39 @@ soup = BeautifulSoup(html, "lxml") # Note: lxml requires additional installatio
149
163
  markdown = convert_to_markdown(soup)
150
164
  ```
151
165
 
166
+ ### Working with Bytes and Encodings
167
+
168
+ The library can directly handle bytes input, which is useful when working with HTTP responses or files:
169
+
170
+ ```python
171
+ import requests
172
+ from html_to_markdown import convert_to_markdown
173
+
174
+ # Working with HTTP responses (bytes)
175
+ response = requests.get("https://example.com")
176
+ markdown = convert_to_markdown(response.content) # response.content returns bytes
177
+
178
+ # Specify encoding for non-UTF-8 content
179
+ response = requests.get("https://example.fr")
180
+ markdown = convert_to_markdown(response.content, source_encoding="latin-1")
181
+
182
+ # Common encoding examples
183
+ html_bytes = b"<p>Hello World</p>"
184
+ markdown = convert_to_markdown(html_bytes) # UTF-8 by default
185
+
186
+ # Latin-1 encoded content
187
+ html_latin1 = "<p>Café résumé</p>".encode("latin-1")
188
+ markdown = convert_to_markdown(html_latin1, source_encoding="latin-1")
189
+
190
+ # Windows-1252 encoded content
191
+ html_windows = '<p>Smart quotes: “Hello”</p>'.encode("windows-1252")
192
+ markdown = convert_to_markdown(html_windows, source_encoding="windows-1252")
193
+
194
+ # Piping bytes from command line
195
+ # echo '<p>Hello</p>' | python -m html_to_markdown
196
+ # cat file.html | python -m html_to_markdown --source-encoding latin-1
197
+ ```
198
+
152
199
  ## Common Use Cases
153
200
 
154
201
  ### Discord/Slack Compatible Lists
@@ -681,6 +728,131 @@ This library provides comprehensive support for all modern HTML5 elements:
681
728
 
682
729
  - `<math>` (MathML support)
683
730
 
731
+ ## Command Line Interface
732
+
733
+ The library includes a full-featured CLI tool with complete API parity:
734
+
735
+ ### Basic Usage
736
+
737
+ ```bash
738
+ # Convert HTML file to Markdown
739
+ html-to-markdown document.html
740
+
741
+ # Convert from stdin
742
+ echo '<h1>Title</h1><p>Content</p>' | html-to-markdown
743
+
744
+ # Read HTML file with specific encoding
745
+ html-to-markdown document.html --source-encoding latin-1
746
+
747
+ # Pipe bytes with encoding specification
748
+ cat document.html | html-to-markdown --source-encoding utf-8
749
+ ```
750
+
751
+ ### Advanced CLI Options
752
+
753
+ ```bash
754
+ # Discord/Slack compatible lists (2-space indent)
755
+ html-to-markdown file.html --list-indent-width 2
756
+
757
+ # Clean messy HTML before conversion
758
+ html-to-markdown file.html --preprocess-html --preprocessing-preset aggressive
759
+
760
+ # Custom heading style
761
+ html-to-markdown file.html --heading-style atx
762
+
763
+ # Strip specific tags
764
+ html-to-markdown file.html --strip nav aside footer
765
+
766
+ # Convert only specific tags
767
+ html-to-markdown file.html --convert h1 h2 p a strong em
768
+
769
+ # Enable streaming for large files with progress
770
+ html-to-markdown large.html --stream-processing --show-progress
771
+
772
+ # Use specific parser (lxml recommended for best performance)
773
+ html-to-markdown file.html --parser lxml
774
+ ```
775
+
776
+ ### Real-World CLI Examples
777
+
778
+ ```bash
779
+ # Download and convert a webpage
780
+ curl -s https://example.com | html-to-markdown --preprocess-html > output.md
781
+
782
+ # Process multiple files with different encodings
783
+ for file in *.html; do
784
+ html-to-markdown "$file" --source-encoding latin-1 > "${file%.html}.md"
785
+ done
786
+
787
+ # Convert with custom formatting for documentation
788
+ html-to-markdown docs.html \
789
+ --heading-style atx \
790
+ --list-indent-width 2 \
791
+ --highlight-style bold \
792
+ --no-extract-metadata > docs.md
793
+ ```
794
+
795
+ ## Differences from markdownify
796
+
797
+ html-to-markdown is a modern, completely rewritten library inspired by markdownify but with significant improvements:
798
+
799
+ ### Key Advantages
800
+
801
+ | Feature | markdownify | html-to-markdown |
802
+ | ----------------------- | ---------------- | ---------------------------------------------------------------------- |
803
+ | **Type Safety** | No type hints | Full MyPy compliance with strict typing |
804
+ | **Python Support** | Python 3.6+ | Python 3.10+ with modern features |
805
+ | **HTML5 Elements** | Basic support | Comprehensive HTML5 support (semantic, form, table, interactive, etc.) |
806
+ | **Table Handling** | Simple tables | Advanced rowspan/colspan support |
807
+ | **Streaming** | Memory-intensive | Memory-efficient streaming for large documents |
808
+ | **CLI Tool** | Basic | Full-featured CLI with all API options |
809
+ | **Preprocessing** | None | Built-in HTML cleaning with configurable presets |
810
+ | **Metadata Extraction** | None | Automatic title/meta extraction as comments |
811
+ | **Task Lists** | None | GitHub-compatible checkbox conversion |
812
+ | **Bytes Input** | None | Direct bytes support with configurable encoding |
813
+ | **Custom Converters** | Class-based | Function-based with simpler API |
814
+ | **Testing** | Basic | Comprehensive test suite with 100% coverage |
815
+ | **Performance** | Standard | Significantly faster with recommended lxml parser |
816
+
817
+ ### API Compatibility
818
+
819
+ While inspired by markdownify, html-to-markdown uses a more modern, explicit API:
820
+
821
+ ```python
822
+ # markdownify style
823
+ from markdownify import markdownify
824
+
825
+ result = markdownify(html, heading_style="atx", strip=["nav"])
826
+
827
+ # html-to-markdown style (more explicit)
828
+ from html_to_markdown import convert_to_markdown
829
+
830
+ result = convert_to_markdown(html, heading_style="atx", strip=["nav"])
831
+ ```
832
+
833
+ ### Migration from markdownify
834
+
835
+ Most markdownify code can be easily migrated:
836
+
837
+ ```python
838
+ # Before (markdownify)
839
+ from markdownify import markdownify as md
840
+
841
+ result = md(html, heading_style="atx")
842
+
843
+ # After (html-to-markdown)
844
+ from html_to_markdown import convert_to_markdown
845
+
846
+ result = convert_to_markdown(html, heading_style="atx")
847
+ ```
848
+
849
+ Key changes when migrating:
850
+
851
+ - Import path: `markdownify` → `html_to_markdown`
852
+ - Function name: `markdownify()` → `convert_to_markdown()`
853
+ - All parameter names remain the same for common options
854
+ - New parameters available for advanced features (preprocessing, streaming, etc.)
855
+
684
856
  ## Acknowledgments
685
857
 
686
- Special thanks to the original [markdownify](https://pypi.org/project/markdownify/) project creators and contributors.
858
+ Special thanks to the original [markdownify](https://pypi.org/project/markdownify/) project creators and contributors for the inspiration and foundation that made this modern implementation possible.
@@ -1,17 +1,17 @@
1
1
  html_to_markdown/__init__.py,sha256=TzZzhZDJHeXW_3B9zceYehz2zlttqdLsDr5un8stZLM,653
2
2
  html_to_markdown/__main__.py,sha256=E9d62nVceR_5TUWgVu5L5CnSZxKcnT_7a6ScWZUGE-s,292
3
- html_to_markdown/cli.py,sha256=qB8-1jqJPW-YrOmlyOdJnLM6DpKSUIA3iyn1SJaJgKg,9418
3
+ html_to_markdown/cli.py,sha256=-rq1L64Ze-zxSdn0cta8HvUCJDGmWHDcZe2RlVZJFjI,9665
4
4
  html_to_markdown/constants.py,sha256=CKFVHjUZKgi8-lgU6AHPic7X5ChlTkbZt4Jv6VaVjjs,665
5
- html_to_markdown/converters.py,sha256=l4ZtIhfOdemvaApRjH7qmzHrWNF3PDlBzsT1LRw3n0Y,36022
5
+ html_to_markdown/converters.py,sha256=REuvFnP-D97VlG2kuCVTbb3exoZ87NQn9hUuiP5ISOU,35839
6
6
  html_to_markdown/exceptions.py,sha256=ytUOIL0D8r0Jd59RzUPqzmk73i-Mg63zDQYo6S6DBg4,1389
7
7
  html_to_markdown/preprocessor.py,sha256=otnTOhoivJkxaip1Lb9xNMl8q-x9aGFXSYkSrxsTW8g,9591
8
- html_to_markdown/processing.py,sha256=SjVStbriaOb24ZwCcRp8eqOJ1p5bIVxpCXSMW3vQojs,38059
8
+ html_to_markdown/processing.py,sha256=Nw68rKNRMV7BbDxGW5sDhebeyhO7SC_Tv4lMJF4TJfc,40697
9
9
  html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  html_to_markdown/utils.py,sha256=s3A4ET_XyKC-WxzJtH4W0S7cIBGF5fTYIf4JJrqTX8Q,1069
11
11
  html_to_markdown/whitespace.py,sha256=rl3eEwqfMpNWx4FBmbkZ1RxO_Od45p3EZ_7UgKcDAtg,7710
12
- html_to_markdown-1.13.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
13
- html_to_markdown-1.13.0.dist-info/METADATA,sha256=CIfFx5C69D3lFg3wgajZnMRmQV-7C78ga2zbXKcxcsc,22694
14
- html_to_markdown-1.13.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
- html_to_markdown-1.13.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
16
- html_to_markdown-1.13.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
17
- html_to_markdown-1.13.0.dist-info/RECORD,,
12
+ html_to_markdown-1.14.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
13
+ html_to_markdown-1.14.0.dist-info/METADATA,sha256=vJeFvECsy8HFT8Ezd_ddc4__dHFxKgSH4wFHH8bDQtE,29421
14
+ html_to_markdown-1.14.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
+ html_to_markdown-1.14.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
16
+ html_to_markdown-1.14.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
17
+ html_to_markdown-1.14.0.dist-info/RECORD,,