html-to-markdown 1.13.0__py3-none-any.whl → 1.14.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

html_to_markdown/cli.py CHANGED
@@ -1,5 +1,5 @@
1
1
  import sys
2
- from argparse import ArgumentParser, FileType
2
+ from argparse import ArgumentParser
3
3
  from pathlib import Path
4
4
 
5
5
  from html_to_markdown.constants import (
@@ -27,8 +27,7 @@ def main(argv: list[str]) -> str:
27
27
  parser.add_argument(
28
28
  "html",
29
29
  nargs="?",
30
- type=FileType("r"),
31
- default=sys.stdin,
30
+ default="-",
32
31
  help="The HTML file to convert. Defaults to STDIN if not provided.",
33
32
  )
34
33
 
@@ -247,7 +246,7 @@ def main(argv: list[str]) -> str:
247
246
  "--source-encoding",
248
247
  type=str,
249
248
  default=None,
250
- help="Source file encoding (e.g. 'utf-8', 'latin-1'). Defaults to system default.",
249
+ help="Encoding for reading input files and decoding bytes (e.g. 'utf-8', 'latin-1'). Default: utf-8.",
251
250
  )
252
251
 
253
252
  args = parser.parse_args(argv)
@@ -260,6 +259,7 @@ def main(argv: list[str]) -> str:
260
259
  "convert": args.convert,
261
260
  "convert_as_inline": args.convert_as_inline,
262
261
  "default_title": args.default_title,
262
+ "source_encoding": args.source_encoding,
263
263
  "escape_asterisks": args.escape_asterisks,
264
264
  "escape_misc": args.escape_misc,
265
265
  "escape_underscores": args.escape_underscores,
@@ -302,14 +302,20 @@ def main(argv: list[str]) -> str:
302
302
 
303
303
  base_args["progress_callback"] = progress_callback
304
304
 
305
- if args.source_encoding and args.html.name != "<stdin>":
306
- args.html.close()
307
- try:
308
- with Path(args.html.name).open(encoding=args.source_encoding) as f:
309
- html_content = f.read()
310
- except LookupError as e:
311
- raise InvalidEncodingError(args.source_encoding) from e
305
+ if args.html == "-":
306
+ html_content = sys.stdin.buffer.read()
312
307
  else:
313
- html_content = args.html.read()
308
+ try:
309
+ file_path = Path(args.html)
310
+ if args.source_encoding:
311
+ with file_path.open(encoding=args.source_encoding, errors="replace") as f:
312
+ html_content = f.read()
313
+ else:
314
+ with file_path.open("rb") as f:
315
+ html_content = f.read()
316
+ except (OSError, LookupError) as e:
317
+ if isinstance(e, LookupError):
318
+ raise InvalidEncodingError(args.source_encoding) from e
319
+ raise
314
320
 
315
321
  return convert_to_markdown(html_content, **base_args)
@@ -414,8 +414,6 @@ def _convert_li(*, tag: Tag, text: str, bullets: str, list_indent_str: str) -> s
414
414
 
415
415
  return "".join(result_parts)
416
416
 
417
- # Ensure consistent whitespace handling for list items, especially with strip_newlines=True
418
- # Strip any leading whitespace that may have been inherited from parent containers
419
417
  clean_text = (text or "").strip()
420
418
  return f"{bullet} {clean_text}\n"
421
419
 
@@ -314,11 +314,12 @@ def _process_text(
314
314
  if len(ancestor_names) > 10:
315
315
  break
316
316
 
317
- in_pre = bool(ancestor_names.intersection({"pre"}))
317
+ in_pre = bool(ancestor_names.intersection({"pre"})) or parent_name == "pre"
318
318
 
319
319
  text = whitespace_handler.process_text_whitespace(text, el, in_pre=in_pre)
320
320
 
321
- if not ancestor_names.intersection({"pre", "code", "kbd", "samp"}):
321
+ code_like_tags = {"pre", "code", "kbd", "samp"}
322
+ if not (ancestor_names.intersection(code_like_tags) or parent_name in code_like_tags):
322
323
  text = escape(
323
324
  text=text,
324
325
  escape_misc=escape_misc,
@@ -445,13 +446,14 @@ def _format_metadata_comment(metadata: dict[str, str]) -> str:
445
446
 
446
447
 
447
448
  def convert_to_markdown(
448
- source: str | BeautifulSoup,
449
+ source: str | bytes | BeautifulSoup,
449
450
  *,
450
451
  stream_processing: bool = False,
451
452
  chunk_size: int = 1024,
452
453
  chunk_callback: Callable[[str], None] | None = None,
453
454
  progress_callback: Callable[[int, int], None] | None = None,
454
455
  parser: str | None = None,
456
+ source_encoding: str = "utf-8",
455
457
  autolinks: bool = True,
456
458
  br_in_tables: bool = False,
457
459
  bullets: str = "*+-",
@@ -489,12 +491,13 @@ def convert_to_markdown(
489
491
  various customization options for controlling the conversion behavior.
490
492
 
491
493
  Args:
492
- source: HTML string or BeautifulSoup object to convert.
494
+ source: HTML string, bytes, or BeautifulSoup object to convert.
493
495
  stream_processing: Enable streaming mode for large documents.
494
496
  chunk_size: Size of chunks for streaming processing.
495
497
  chunk_callback: Callback for processing chunks in streaming mode.
496
498
  progress_callback: Callback for progress updates (current, total).
497
499
  parser: HTML parser to use ('html.parser', 'lxml', 'html5lib').
500
+ source_encoding: Character encoding to use when decoding bytes (default: 'utf-8').
498
501
  autolinks: Convert URLs to automatic links.
499
502
  br_in_tables: Use <br> tags for line breaks in table cells instead of spaces.
500
503
  bullets: Characters to use for unordered list bullets.
@@ -548,11 +551,12 @@ def convert_to_markdown(
548
551
  >>> convert_to_markdown(html, list_indent_width=2)
549
552
  '* Item 1\\n* Item 2\\n\\n'
550
553
  """
551
- # Initialize original input string for Windows lxml fix
552
554
  original_input_str = None
553
555
 
556
+ if isinstance(source, bytes):
557
+ source = source.decode(source_encoding or "utf-8", errors="replace")
558
+
554
559
  if isinstance(source, str):
555
- # Store original string for plain text detection (Windows lxml fix)
556
560
  original_input_str = source
557
561
 
558
562
  if (
@@ -613,6 +617,34 @@ def convert_to_markdown(
613
617
  new_text = NavigableString(leading_ws + str(first_child))
614
618
  first_child.replace_with(new_text)
615
619
  needs_leading_space_fix = False
620
+
621
+ if parser == "html5lib":
622
+ body = source.find("body")
623
+ if body and isinstance(body, Tag):
624
+ children = list(body.children)
625
+
626
+ if (
627
+ len(children) == 1
628
+ and isinstance(children[0], NavigableString)
629
+ and original_source.startswith((" ", "\t", "\n", "\r"))
630
+ and not str(children[0]).startswith((" ", "\t", "\n", "\r"))
631
+ ):
632
+ first_child = children[0]
633
+ original_text = str(first_child)
634
+
635
+ leading_ws = ""
636
+ for char in original_source:
637
+ if char in " \t\n\r":
638
+ leading_ws += char
639
+ else:
640
+ break
641
+
642
+ normalized_text = original_text
643
+ if leading_ws and not normalized_text.startswith(leading_ws):
644
+ normalized_text = leading_ws + normalized_text
645
+
646
+ new_text = NavigableString(normalized_text)
647
+ first_child.replace_with(new_text)
616
648
  else:
617
649
  raise EmptyHtmlError
618
650
 
@@ -626,6 +658,7 @@ def convert_to_markdown(
626
658
  chunk_size=chunk_size,
627
659
  progress_callback=progress_callback,
628
660
  parser=parser,
661
+ source_encoding=source_encoding,
629
662
  autolinks=autolinks,
630
663
  bullets=bullets,
631
664
  code_language=code_language,
@@ -673,6 +706,7 @@ def convert_to_markdown(
673
706
  sink,
674
707
  whitespace_handler=whitespace_handler,
675
708
  parser=parser,
709
+ source_encoding=source_encoding,
676
710
  autolinks=autolinks,
677
711
  br_in_tables=br_in_tables,
678
712
  bullets=bullets,
@@ -703,8 +737,6 @@ def convert_to_markdown(
703
737
 
704
738
  result = sink.get_result()
705
739
 
706
- # Parser-agnostic behavior: handle leading whitespace differences between parsers
707
- # lxml may either add unwanted whitespace or strip meaningful whitespace compared to html.parser
708
740
  if "needs_leading_whitespace_fix" in locals() and needs_leading_whitespace_fix:
709
741
  original_input = sink.original_source if hasattr(sink, "original_source") else original_source
710
742
  if isinstance(original_input, str):
@@ -713,19 +745,14 @@ def convert_to_markdown(
713
745
  original_leading_whitespace_match.group(0) if original_leading_whitespace_match else ""
714
746
  )
715
747
 
716
- # Case 1: lxml added leading newlines (like "\n<figure>") - strip them
717
748
  if result.startswith("\n") and not original_input.lstrip().startswith(result.strip()):
718
749
  result = result.lstrip("\n\r")
719
750
 
720
- # Case 2: lxml stripped meaningful leading whitespace (like " <b>") - restore it
721
- # However, don't restore whitespace if strip_newlines=True was used, as the user
722
- # explicitly requested to remove formatting whitespace
723
751
  elif (
724
752
  not strip_newlines
725
753
  and not result.startswith((" ", "\t"))
726
754
  and original_leading_whitespace.startswith((" ", "\t"))
727
755
  ):
728
- # Only restore spaces/tabs, not newlines (which are usually formatting)
729
756
  leading_spaces_tabs_match = re.match(r"^[ \t]*", original_leading_whitespace)
730
757
  leading_spaces_tabs = leading_spaces_tabs_match.group(0) if leading_spaces_tabs_match else ""
731
758
  if leading_spaces_tabs:
@@ -758,9 +785,6 @@ def convert_to_markdown(
758
785
  if convert_as_inline:
759
786
  result = result.rstrip("\n")
760
787
 
761
- # Windows-specific fix: For plain text input (no HTML tags), lxml may add extra trailing newlines
762
- # This ensures consistent behavior across platforms when processing plain text
763
- # Only apply to cases where lxml adds extra newlines (\n\n) at the end
764
788
  if (
765
789
  "original_input_str" in locals()
766
790
  and original_input_str
@@ -768,19 +792,11 @@ def convert_to_markdown(
768
792
  and not original_input_str.strip().endswith(">")
769
793
  and result.endswith("\n\n")
770
794
  ):
771
- # Input appears to be plain text, not HTML - normalize trailing newlines only
772
795
  result = result.rstrip("\n")
773
796
 
774
- # If the original input contained no block-level elements, normalize any
775
- # accidental trailing newlines for cross-platform consistency.
776
- # This guards cases like inline-only inputs (e.g., "text <strong>bold</strong>")
777
- # and head-only documents (e.g., "<head>head</head>") where output should
778
- # not end with extra blank lines.
779
797
  if "original_input_str" in locals() and original_input_str:
780
798
  from html_to_markdown.whitespace import BLOCK_ELEMENTS # noqa: PLC0415
781
799
 
782
- # Treat additional tags as block-producing for trailing newline purposes.
783
- # These may be inline in HTML spec but produce block output in our Markdown conversion.
784
800
  blockish = set(BLOCK_ELEMENTS) | {
785
801
  "textarea",
786
802
  "dialog",
@@ -880,11 +896,12 @@ class StreamingSink(OutputSink):
880
896
 
881
897
 
882
898
  def _process_html_core(
883
- source: str | BeautifulSoup,
899
+ source: str | bytes | BeautifulSoup,
884
900
  sink: OutputSink,
885
901
  *,
886
902
  whitespace_handler: WhitespaceHandler,
887
903
  parser: str | None = None,
904
+ source_encoding: str = "utf-8",
888
905
  autolinks: bool,
889
906
  br_in_tables: bool,
890
907
  bullets: str,
@@ -915,7 +932,12 @@ def _process_html_core(
915
932
  token = _ancestor_cache.set({})
916
933
 
917
934
  try:
918
- if isinstance(source, str):
935
+ if isinstance(source, (str, bytes)):
936
+ original_source = source
937
+ if isinstance(source, bytes):
938
+ source = source.decode(source_encoding or "utf-8", errors="replace")
939
+ original_source = source
940
+
919
941
  if strip_newlines:
920
942
  source = source.replace("\n", " ").replace("\r", " ") # pragma: no cover
921
943
 
@@ -926,7 +948,36 @@ def _process_html_core(
926
948
  if parser == "lxml" and not LXML_AVAILABLE: # pragma: no cover
927
949
  raise MissingDependencyError("lxml", "pip install html-to-markdown[lxml]")
928
950
 
951
+ needs_leading_whitespace_fix = (
952
+ parser == "lxml"
953
+ and isinstance(original_source, str)
954
+ and original_source.startswith((" ", "\t", "\n", "\r"))
955
+ )
956
+
929
957
  source = BeautifulSoup(source, parser)
958
+
959
+ if parser == "lxml" and needs_leading_whitespace_fix and isinstance(original_source, str):
960
+ body = source.find("body")
961
+ if body and isinstance(body, Tag):
962
+ children = list(body.children)
963
+
964
+ if (
965
+ len(children) == 1
966
+ and isinstance(children[0], NavigableString)
967
+ and original_source.startswith((" ", "\t", "\n", "\r"))
968
+ and not str(children[0]).startswith((" ", "\t", "\n", "\r"))
969
+ ):
970
+ first_child = children[0]
971
+
972
+ leading_ws = ""
973
+ for char in original_source:
974
+ if char in " \t":
975
+ leading_ws += char
976
+ else:
977
+ break
978
+
979
+ new_text = NavigableString(leading_ws + str(first_child))
980
+ first_child.replace_with(new_text)
930
981
  else:
931
982
  raise EmptyHtmlError
932
983
 
@@ -998,11 +1049,12 @@ def _process_html_core(
998
1049
 
999
1050
 
1000
1051
  def convert_to_markdown_stream(
1001
- source: str | BeautifulSoup,
1052
+ source: str | bytes | BeautifulSoup,
1002
1053
  *,
1003
1054
  chunk_size: int = 1024,
1004
1055
  progress_callback: Callable[[int, int], None] | None = None,
1005
1056
  parser: str | None = None,
1057
+ source_encoding: str = "utf-8",
1006
1058
  autolinks: bool = True,
1007
1059
  br_in_tables: bool = False,
1008
1060
  bullets: str = "*+-",
@@ -1022,6 +1074,10 @@ def convert_to_markdown_stream(
1022
1074
  list_indent_type: Literal["spaces", "tabs"] = "spaces",
1023
1075
  list_indent_width: int = 4,
1024
1076
  newline_style: Literal["spaces", "backslash"] = SPACES,
1077
+ preprocess_html: bool = False,
1078
+ preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
1079
+ remove_forms: bool = True,
1080
+ remove_navigation: bool = True,
1025
1081
  strip: str | Iterable[str] | None = None,
1026
1082
  strip_newlines: bool = False,
1027
1083
  strong_em_symbol: Literal["*", "_"] = ASTERISK,
@@ -1033,8 +1089,22 @@ def convert_to_markdown_stream(
1033
1089
  ) -> Generator[str, None, None]:
1034
1090
  sink = StreamingSink(chunk_size, progress_callback)
1035
1091
 
1036
- if isinstance(source, str):
1037
- sink.total_bytes = len(source)
1092
+ if isinstance(source, bytes):
1093
+ source = source.decode(source_encoding or "utf-8", errors="replace")
1094
+
1095
+ if isinstance(source, str) and preprocess_html and create_preprocessor is not None and preprocess_fn is not None:
1096
+ config = create_preprocessor(
1097
+ preset=preprocessing_preset,
1098
+ remove_navigation=remove_navigation,
1099
+ remove_forms=remove_forms,
1100
+ )
1101
+ source = preprocess_fn(source, **config)
1102
+
1103
+ if isinstance(source, (str, bytes)):
1104
+ if isinstance(source, bytes):
1105
+ sink.total_bytes = len(source)
1106
+ else:
1107
+ sink.total_bytes = len(source)
1038
1108
  elif isinstance(source, BeautifulSoup):
1039
1109
  sink.total_bytes = len(str(source))
1040
1110
 
@@ -1045,6 +1115,7 @@ def convert_to_markdown_stream(
1045
1115
  sink,
1046
1116
  whitespace_handler=whitespace_handler,
1047
1117
  parser=parser,
1118
+ source_encoding=source_encoding,
1048
1119
  autolinks=autolinks,
1049
1120
  br_in_tables=br_in_tables,
1050
1121
  bullets=bullets,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: html-to-markdown
3
- Version: 1.13.0
3
+ Version: 1.14.1
4
4
  Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
5
5
  Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
6
6
  License: MIT
@@ -32,6 +32,8 @@ Description-Content-Type: text/markdown
32
32
  License-File: LICENSE
33
33
  Requires-Dist: beautifulsoup4>=4.13.5
34
34
  Requires-Dist: nh3>=0.3
35
+ Provides-Extra: html5lib
36
+ Requires-Dist: beautifulsoup4[html5lib]>=4.13.5; extra == "html5lib"
35
37
  Provides-Extra: lxml
36
38
  Requires-Dist: beautifulsoup4[lxml]>=4.13.5; extra == "lxml"
37
39
  Dynamic: license-file
@@ -40,7 +42,7 @@ Dynamic: license-file
40
42
 
41
43
  A modern, fully typed Python library for converting HTML to Markdown. This library is a completely rewritten fork
42
44
  of [markdownify](https://pypi.org/project/markdownify/) with a modernized codebase, strict type safety and support for
43
- Python 3.9+.
45
+ Python 3.10+.
44
46
 
45
47
  ## Support This Project
46
48
 
@@ -64,8 +66,10 @@ Your support helps maintain and improve this library for the community.
64
66
  - **Custom Converters**: Extensible converter system for custom HTML tag handling
65
67
  - **List Formatting**: Configurable list indentation with Discord/Slack compatibility
66
68
  - **HTML Preprocessing**: Clean messy HTML with configurable aggressiveness levels
69
+ - **Bytes Input Support**: Direct handling of bytes input, decoded with a configurable source encoding (UTF-8 by default; undecodable bytes are replaced)
67
70
  - **Whitespace Control**: Normalized or strict whitespace preservation modes
68
71
  - **BeautifulSoup Integration**: Support for pre-configured BeautifulSoup instances
72
+ - **Parser Normalization**: Consistent output across all supported parsers (html.parser, lxml, html5lib)
69
73
  - **Robustly Tested**: Comprehensive unit tests and integration tests covering all conversion scenarios
70
74
 
71
75
  ## Installation
@@ -74,17 +78,27 @@ Your support helps maintain and improve this library for the community.
74
78
  pip install html-to-markdown
75
79
  ```
76
80
 
77
- ### Optional lxml Parser
81
+ ### Optional Parsers
78
82
 
79
- For improved performance, you can install with the optional lxml parser:
83
+ For improved performance and compatibility, you can install with optional parsers:
80
84
 
81
85
  ```shell
86
+ # Fast lxml parser (recommended)
82
87
  pip install html-to-markdown[lxml]
88
+
89
+ # Standards-compliant html5lib parser
90
+ pip install html-to-markdown[html5lib]
83
91
  ```
84
92
 
85
- The lxml parser offers faster HTML parsing and better handling of malformed HTML compared to the default html.parser.
93
+ **Parser Options:**
94
+
95
+ - **html.parser** (default): Built-in Python parser, no dependencies
96
+ - **lxml**: **Recommended** - Fastest parser with good malformed HTML handling
97
+ - **html5lib**: Most standards-compliant, handles edge cases best
86
98
 
87
- The library automatically uses lxml when available. You can explicitly specify a parser using the `parser` parameter.
99
+ The library automatically uses lxml when available and **normalizes output to ensure consistent results regardless of parser choice**. We recommend using the **lxml parser for optimal performance** - it's significantly faster than the other options while maintaining excellent compatibility.
100
+
101
+ You can explicitly specify a parser using the `parser` parameter.
88
102
 
89
103
  ## Quick Start
90
104
 
@@ -149,6 +163,39 @@ soup = BeautifulSoup(html, "lxml") # Note: lxml requires additional installatio
149
163
  markdown = convert_to_markdown(soup)
150
164
  ```
151
165
 
166
+ ### Working with Bytes and Encodings
167
+
168
+ The library can directly handle bytes input, which is useful when working with HTTP responses or files:
169
+
170
+ ```python
171
+ import requests
172
+ from html_to_markdown import convert_to_markdown
173
+
174
+ # Working with HTTP responses (bytes)
175
+ response = requests.get("https://example.com")
176
+ markdown = convert_to_markdown(response.content) # response.content returns bytes
177
+
178
+ # Specify encoding for non-UTF-8 content
179
+ response = requests.get("https://example.fr")
180
+ markdown = convert_to_markdown(response.content, source_encoding="latin-1")
181
+
182
+ # Common encoding examples
183
+ html_bytes = b"<p>Hello World</p>"
184
+ markdown = convert_to_markdown(html_bytes) # UTF-8 by default
185
+
186
+ # Latin-1 encoded content
187
+ html_latin1 = "<p>Café résumé</p>".encode("latin-1")
188
+ markdown = convert_to_markdown(html_latin1, source_encoding="latin-1")
189
+
190
+ # Windows-1252 encoded content
191
+ html_windows = '<p>Smart quotes: "Hello"</p>'.encode("windows-1252")
192
+ markdown = convert_to_markdown(html_windows, source_encoding="windows-1252")
193
+
194
+ # Piping bytes from command line
195
+ # echo '<p>Hello</p>' | python -m html_to_markdown
196
+ # cat file.html | python -m html_to_markdown --source-encoding latin-1
197
+ ```
198
+
152
199
  ## Common Use Cases
153
200
 
154
201
  ### Discord/Slack Compatible Lists
@@ -681,6 +728,131 @@ This library provides comprehensive support for all modern HTML5 elements:
681
728
 
682
729
  - `<math>` (MathML support)
683
730
 
731
+ ## Command Line Interface
732
+
733
+ The library includes a full-featured CLI tool with complete API parity:
734
+
735
+ ### Basic Usage
736
+
737
+ ```bash
738
+ # Convert HTML file to Markdown
739
+ html-to-markdown document.html
740
+
741
+ # Convert from stdin
742
+ echo '<h1>Title</h1><p>Content</p>' | html-to-markdown
743
+
744
+ # Read HTML file with specific encoding
745
+ html-to-markdown document.html --source-encoding latin-1
746
+
747
+ # Pipe bytes with encoding specification
748
+ cat document.html | html-to-markdown --source-encoding utf-8
749
+ ```
750
+
751
+ ### Advanced CLI Options
752
+
753
+ ```bash
754
+ # Discord/Slack compatible lists (2-space indent)
755
+ html-to-markdown file.html --list-indent-width 2
756
+
757
+ # Clean messy HTML before conversion
758
+ html-to-markdown file.html --preprocess-html --preprocessing-preset aggressive
759
+
760
+ # Custom heading style
761
+ html-to-markdown file.html --heading-style atx
762
+
763
+ # Strip specific tags
764
+ html-to-markdown file.html --strip nav aside footer
765
+
766
+ # Convert only specific tags
767
+ html-to-markdown file.html --convert h1 h2 p a strong em
768
+
769
+ # Enable streaming for large files with progress
770
+ html-to-markdown large.html --stream-processing --show-progress
771
+
772
+ # Use specific parser (lxml recommended for best performance)
773
+ html-to-markdown file.html --parser lxml
774
+ ```
775
+
776
+ ### Real-World CLI Examples
777
+
778
+ ```bash
779
+ # Download and convert a webpage
780
+ curl -s https://example.com | html-to-markdown --preprocess-html > output.md
781
+
782
+ # Process multiple files with different encodings
783
+ for file in *.html; do
784
+ html-to-markdown "$file" --source-encoding latin-1 > "${file%.html}.md"
785
+ done
786
+
787
+ # Convert with custom formatting for documentation
788
+ html-to-markdown docs.html \
789
+ --heading-style atx \
790
+ --list-indent-width 2 \
791
+ --highlight-style bold \
792
+ --no-extract-metadata > docs.md
793
+ ```
794
+
795
+ ## Differences from markdownify
796
+
797
+ html-to-markdown is a modern, completely rewritten library inspired by markdownify but with significant improvements:
798
+
799
+ ### Key Advantages
800
+
801
+ | Feature | markdownify | html-to-markdown |
802
+ | ----------------------- | ---------------- | ---------------------------------------------------------------------- |
803
+ | **Type Safety** | No type hints | Full MyPy compliance with strict typing |
804
+ | **Python Support** | Python 3.6+ | Python 3.10+ with modern features |
805
+ | **HTML5 Elements** | Basic support | Comprehensive HTML5 support (semantic, form, table, interactive, etc.) |
806
+ | **Table Handling** | Simple tables | Advanced rowspan/colspan support |
807
+ | **Streaming** | Memory-intensive | Memory-efficient streaming for large documents |
808
+ | **CLI Tool** | Basic | Full-featured CLI with all API options |
809
+ | **Preprocessing** | None | Built-in HTML cleaning with configurable presets |
810
+ | **Metadata Extraction** | None | Automatic title/meta extraction as comments |
811
+ | **Task Lists** | None | GitHub-compatible checkbox conversion |
812
+ | **Bytes Input** | None | Direct bytes support with configurable encoding |
813
+ | **Custom Converters** | Class-based | Function-based with simpler API |
814
+ | **Testing** | Basic | Comprehensive test suite with 100% coverage |
815
+ | **Performance** | Standard | Significantly faster with recommended lxml parser |
816
+
817
+ ### API Compatibility
818
+
819
+ While inspired by markdownify, html-to-markdown uses a more modern, explicit API:
820
+
821
+ ```python
822
+ # markdownify style
823
+ from markdownify import markdownify
824
+
825
+ result = markdownify(html, heading_style="atx", strip=["nav"])
826
+
827
+ # html-to-markdown style (more explicit)
828
+ from html_to_markdown import convert_to_markdown
829
+
830
+ result = convert_to_markdown(html, heading_style="atx", strip=["nav"])
831
+ ```
832
+
833
+ ### Migration from markdownify
834
+
835
+ Most markdownify code can be easily migrated:
836
+
837
+ ```python
838
+ # Before (markdownify)
839
+ from markdownify import markdownify as md
840
+
841
+ result = md(html, heading_style="atx")
842
+
843
+ # After (html-to-markdown)
844
+ from html_to_markdown import convert_to_markdown
845
+
846
+ result = convert_to_markdown(html, heading_style="atx")
847
+ ```
848
+
849
+ Key changes when migrating:
850
+
851
+ - Import path: `markdownify` → `html_to_markdown`
852
+ - Function name: `markdownify()` → `convert_to_markdown()`
853
+ - All parameter names remain the same for common options
854
+ - New parameters available for advanced features (preprocessing, streaming, etc.)
855
+
684
856
  ## Acknowledgments
685
857
 
686
- Special thanks to the original [markdownify](https://pypi.org/project/markdownify/) project creators and contributors.
858
+ Special thanks to the original [markdownify](https://pypi.org/project/markdownify/) project creators and contributors for the inspiration and foundation that made this modern implementation possible.
@@ -1,17 +1,17 @@
1
1
  html_to_markdown/__init__.py,sha256=TzZzhZDJHeXW_3B9zceYehz2zlttqdLsDr5un8stZLM,653
2
2
  html_to_markdown/__main__.py,sha256=E9d62nVceR_5TUWgVu5L5CnSZxKcnT_7a6ScWZUGE-s,292
3
- html_to_markdown/cli.py,sha256=qB8-1jqJPW-YrOmlyOdJnLM6DpKSUIA3iyn1SJaJgKg,9418
3
+ html_to_markdown/cli.py,sha256=-rq1L64Ze-zxSdn0cta8HvUCJDGmWHDcZe2RlVZJFjI,9665
4
4
  html_to_markdown/constants.py,sha256=CKFVHjUZKgi8-lgU6AHPic7X5ChlTkbZt4Jv6VaVjjs,665
5
- html_to_markdown/converters.py,sha256=l4ZtIhfOdemvaApRjH7qmzHrWNF3PDlBzsT1LRw3n0Y,36022
5
+ html_to_markdown/converters.py,sha256=REuvFnP-D97VlG2kuCVTbb3exoZ87NQn9hUuiP5ISOU,35839
6
6
  html_to_markdown/exceptions.py,sha256=ytUOIL0D8r0Jd59RzUPqzmk73i-Mg63zDQYo6S6DBg4,1389
7
7
  html_to_markdown/preprocessor.py,sha256=otnTOhoivJkxaip1Lb9xNMl8q-x9aGFXSYkSrxsTW8g,9591
8
- html_to_markdown/processing.py,sha256=SjVStbriaOb24ZwCcRp8eqOJ1p5bIVxpCXSMW3vQojs,38059
8
+ html_to_markdown/processing.py,sha256=WFXwHNOK_wdNtiRjubt_MC19Q3FScR0j5eohWmBRSmU,40548
9
9
  html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  html_to_markdown/utils.py,sha256=s3A4ET_XyKC-WxzJtH4W0S7cIBGF5fTYIf4JJrqTX8Q,1069
11
11
  html_to_markdown/whitespace.py,sha256=rl3eEwqfMpNWx4FBmbkZ1RxO_Od45p3EZ_7UgKcDAtg,7710
12
- html_to_markdown-1.13.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
13
- html_to_markdown-1.13.0.dist-info/METADATA,sha256=CIfFx5C69D3lFg3wgajZnMRmQV-7C78ga2zbXKcxcsc,22694
14
- html_to_markdown-1.13.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
- html_to_markdown-1.13.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
16
- html_to_markdown-1.13.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
17
- html_to_markdown-1.13.0.dist-info/RECORD,,
12
+ html_to_markdown-1.14.1.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
13
+ html_to_markdown-1.14.1.dist-info/METADATA,sha256=m9fQ28oyBQXrKoXB6Sd-tH7-NhB0RsaikBH2wBvn1LA,29421
14
+ html_to_markdown-1.14.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
+ html_to_markdown-1.14.1.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
16
+ html_to_markdown-1.14.1.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
17
+ html_to_markdown-1.14.1.dist-info/RECORD,,