html-to-markdown 2.0.1__cp310-abi3-win_amd64.whl → 2.1.2__cp310-abi3-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

@@ -23,12 +23,8 @@ from html_to_markdown.exceptions import (
23
23
  InvalidParserError,
24
24
  MissingDependencyError,
25
25
  )
26
- from html_to_markdown.options import (
27
- ConversionOptions,
28
- ParsingOptions,
29
- PreprocessingOptions,
30
- )
31
- from html_to_markdown.v1_compat import convert_to_markdown, convert_to_markdown_stream, markdownify
26
+ from html_to_markdown.options import ConversionOptions, PreprocessingOptions
27
+ from html_to_markdown.v1_compat import convert_to_markdown, markdownify
32
28
 
33
29
  __all__ = [
34
30
  "ConflictingOptionsError",
@@ -37,12 +33,10 @@ __all__ = [
37
33
  "HtmlToMarkdownError",
38
34
  "InvalidParserError",
39
35
  "MissingDependencyError",
40
- "ParsingOptions",
41
36
  "PreprocessingOptions",
42
37
  "convert",
43
38
  "convert_to_markdown",
44
- "convert_to_markdown_stream",
45
39
  "markdownify",
46
40
  ]
47
41
 
48
- __version__ = "2.0.0"
42
+ __version__ = "2.1.1"
Binary file
@@ -21,8 +21,9 @@ class ConversionOptions:
21
21
  sub_symbol: str
22
22
  sup_symbol: str
23
23
  newline_style: str
24
+ keep_inline_images_in: list[str]
24
25
  preprocessing: PreprocessingOptions
25
- parsing: ParsingOptions
26
+ encoding: str
26
27
 
27
28
  def __init__(
28
29
  self,
@@ -48,8 +49,9 @@ class ConversionOptions:
48
49
  sub_symbol: str = "",
49
50
  sup_symbol: str = "",
50
51
  newline_style: str = "spaces",
52
+ keep_inline_images_in: list[str] | None = None,
51
53
  preprocessing: PreprocessingOptions | None = None,
52
- parsing: ParsingOptions | None = None,
54
+ encoding: str = "utf-8",
53
55
  ) -> None: ...
54
56
 
55
57
  class PreprocessingOptions:
@@ -66,14 +68,4 @@ class PreprocessingOptions:
66
68
  remove_forms: bool = True,
67
69
  ) -> None: ...
68
70
 
69
- class ParsingOptions:
70
- encoding: str
71
- parser: str | None
72
-
73
- def __init__(
74
- self,
75
- encoding: str = "utf-8",
76
- parser: str | None = None,
77
- ) -> None: ...
78
-
79
71
  def convert(html: str, options: ConversionOptions | None = None) -> str: ...
html_to_markdown/api.py CHANGED
@@ -7,47 +7,28 @@ using the Rust backend for conversion.
7
7
  from __future__ import annotations
8
8
 
9
9
  import html_to_markdown._html_to_markdown as _rust # type: ignore[import-not-found]
10
- from html_to_markdown.options import (
11
- ConversionOptions,
12
- ParsingOptions,
13
- PreprocessingOptions,
14
- )
10
+ from html_to_markdown.options import ConversionOptions, PreprocessingOptions
15
11
 
16
12
 
17
13
  def convert(
18
14
  html: str,
19
15
  options: ConversionOptions | None = None,
20
16
  preprocessing: PreprocessingOptions | None = None,
21
- parsing: ParsingOptions | None = None,
22
17
  ) -> str:
23
- """Convert HTML to Markdown using Rust backend.
24
-
25
- This is the main entry point for the v2 API, using dataclass-based configuration
26
- and Rust implementation for high-performance conversion.
18
+ """Convert HTML to Markdown using the Rust backend.
27
19
 
28
20
  Args:
29
- html: HTML string to convert
30
- options: Conversion options (uses defaults if None)
31
- preprocessing: HTML preprocessing options (uses defaults if None)
32
- parsing: HTML parsing options (uses defaults if None)
21
+ html: HTML string to convert.
22
+ options: Conversion configuration options (defaults to ConversionOptions()).
23
+ preprocessing: HTML preprocessing options (defaults to PreprocessingOptions()).
33
24
 
34
25
  Returns:
35
- Markdown string
36
-
37
- Example:
38
- >>> from html_to_markdown import convert, ConversionOptions
39
- >>> options = ConversionOptions(heading_style="atx", list_indent_width=2)
40
- >>> markdown = convert("<h1>Title</h1>", options)
41
- >>> print(markdown)
42
- # Title
43
- <BLANKLINE>
26
+ Converted Markdown string.
44
27
  """
45
28
  if options is None:
46
29
  options = ConversionOptions()
47
30
  if preprocessing is None:
48
31
  preprocessing = PreprocessingOptions()
49
- if parsing is None:
50
- parsing = ParsingOptions()
51
32
 
52
33
  rust_preprocessing = _rust.PreprocessingOptions(
53
34
  enabled=preprocessing.enabled,
@@ -56,11 +37,6 @@ def convert(
56
37
  remove_forms=preprocessing.remove_forms,
57
38
  )
58
39
 
59
- rust_parsing = _rust.ParsingOptions(
60
- encoding=parsing.encoding,
61
- parser=parsing.parser,
62
- )
63
-
64
40
  rust_options = _rust.ConversionOptions(
65
41
  heading_style=options.heading_style,
66
42
  list_indent_type=options.list_indent_type,
@@ -75,9 +51,6 @@ def convert(
75
51
  autolinks=options.autolinks,
76
52
  default_title=options.default_title,
77
53
  br_in_tables=options.br_in_tables,
78
- hocr_extract_tables=options.hocr_extract_tables,
79
- hocr_table_column_threshold=options.hocr_table_column_threshold,
80
- hocr_table_row_threshold_ratio=options.hocr_table_row_threshold_ratio,
81
54
  highlight_style=options.highlight_style,
82
55
  extract_metadata=options.extract_metadata,
83
56
  whitespace_mode=options.whitespace_mode,
@@ -91,7 +64,7 @@ def convert(
91
64
  code_block_style=options.code_block_style,
92
65
  keep_inline_images_in=list(options.keep_inline_images_in) if options.keep_inline_images_in else [],
93
66
  preprocessing=rust_preprocessing,
94
- parsing=rust_parsing,
67
+ encoding=options.encoding,
95
68
  debug=options.debug,
96
69
  strip_tags=list(options.strip_tags) if options.strip_tags else [],
97
70
  )
Binary file
html_to_markdown/cli.py CHANGED
@@ -1,9 +1,3 @@
1
- """CLI wrapper that proxies to Rust CLI binary.
2
-
3
- This module provides backwards compatibility for code that imports
4
- from html_to_markdown.cli. The actual CLI implementation is in Rust.
5
- """
6
-
7
1
  from html_to_markdown.cli_proxy import main
8
2
 
9
3
  __all__ = ["main"]
@@ -1,25 +1,19 @@
1
- """CLI proxy that calls the Rust CLI binary.
2
-
3
- This module provides a Python wrapper around the Rust CLI binary,
4
- allowing the Python package to use the high-performance Rust implementation
5
- for command-line operations. It also provides v1 -> v2 CLI argument translation.
6
- """
7
-
8
1
  import subprocess
9
2
  import sys
3
+ import warnings
10
4
  from pathlib import Path
11
5
 
12
6
  from html_to_markdown.exceptions import RedundantV1FlagError, RemovedV1FlagError
13
7
 
14
8
 
15
9
  def find_cli_binary() -> Path:
16
- """Find the html-to-markdown CLI binary.
10
+ """Find the html-to-markdown CLI binary in expected locations.
17
11
 
18
12
  Returns:
19
- Path to the CLI binary
13
+ Path to the CLI binary.
20
14
 
21
15
  Raises:
22
- FileNotFoundError: If the binary cannot be found
16
+ FileNotFoundError: If the binary cannot be found.
23
17
  """
24
18
  binary_name = "html-to-markdown.exe" if sys.platform == "win32" else "html-to-markdown"
25
19
 
@@ -38,28 +32,22 @@ def find_cli_binary() -> Path:
38
32
 
39
33
 
40
34
  def translate_v1_args_to_v2(argv: list[str]) -> list[str]:
41
- """Translate v1 CLI arguments to v2 Rust CLI arguments.
42
-
43
- This handles differences between the v1 Python CLI and v2 Rust CLI:
44
- - Boolean flags: v1 used --flag/--no-flag, v2 uses presence/absence
45
- - Flag name changes: --preprocess-html -> --preprocess
46
- - Unsupported flags: --strip, --convert (raise errors)
35
+ """Translate v1 CLI arguments to v2 format.
47
36
 
48
37
  Args:
49
- argv: v1 CLI arguments
38
+ argv: List of command-line arguments.
50
39
 
51
40
  Returns:
52
- Translated v2 CLI arguments
41
+ Translated list of arguments compatible with v2.
53
42
 
54
43
  Raises:
55
- RemovedV1FlagError: If a v1 flag has been removed in v2
44
+ RemovedV1FlagError: If a v1 flag has been removed in v2.
56
45
  """
57
46
  translated = []
58
47
  i = 0
59
48
  while i < len(argv):
60
49
  arg = argv[i]
61
50
 
62
- # Error on removed/unsupported v1 features
63
51
  if arg in ("--strip", "--convert"):
64
52
  raise RemovedV1FlagError(
65
53
  flag=arg,
@@ -67,8 +55,6 @@ def translate_v1_args_to_v2(argv: list[str]) -> list[str]:
67
55
  migration="Remove this flag from your command. The feature is no longer available.",
68
56
  )
69
57
 
70
- # These flags are redundant (match v2 defaults) but we accept them for v1 compatibility
71
- # Silently skip - Rust CLI defaults match these flags
72
58
  if arg in (
73
59
  "--no-escape-asterisks",
74
60
  "--no-escape-underscores",
@@ -77,14 +63,21 @@ def translate_v1_args_to_v2(argv: list[str]) -> list[str]:
77
63
  "--no-autolinks",
78
64
  "--no-extract-metadata",
79
65
  ):
80
- # Skip this flag - matches Rust CLI defaults
81
- pass
66
+ warnings.warn(
67
+ f"'{arg}' is deprecated and redundant in v2. "
68
+ f"These options are now disabled by default. Remove this flag.",
69
+ DeprecationWarning,
70
+ stacklevel=2,
71
+ )
82
72
 
83
- # Flag name translations
84
73
  elif arg == "--preprocess-html":
74
+ warnings.warn(
75
+ "'--preprocess-html' is deprecated. Use '--preprocess' instead.",
76
+ DeprecationWarning,
77
+ stacklevel=2,
78
+ )
85
79
  translated.append("--preprocess")
86
80
 
87
- # Positive flags that should be passed through
88
81
  elif arg in (
89
82
  "--escape-asterisks",
90
83
  "--escape-underscores",
@@ -95,7 +88,6 @@ def translate_v1_args_to_v2(argv: list[str]) -> list[str]:
95
88
  ):
96
89
  translated.append(arg)
97
90
 
98
- # All other args pass through unchanged
99
91
  else:
100
92
  translated.append(arg)
101
93
 
@@ -105,23 +97,21 @@ def translate_v1_args_to_v2(argv: list[str]) -> list[str]:
105
97
 
106
98
 
107
99
  def main(argv: list[str]) -> str:
108
- """Run the Rust CLI with the given arguments.
100
+ """Main entry point for the CLI proxy.
109
101
 
110
- Translates v1 CLI arguments to v2 format if needed.
111
- Exits with non-zero status on errors (FileNotFoundError, UnsupportedV1FeatureError, CLI errors).
102
+ Translates v1 arguments to v2 and invokes the native Rust CLI binary.
112
103
 
113
104
  Args:
114
- argv: Command line arguments (without program name)
105
+ argv: Command-line arguments.
115
106
 
116
107
  Returns:
117
- Output from the CLI
108
+ Stdout from the CLI binary.
118
109
  """
119
110
  cli_binary = find_cli_binary()
120
111
 
121
112
  try:
122
113
  translated_args = translate_v1_args_to_v2(argv)
123
114
  except (RemovedV1FlagError, RedundantV1FlagError) as e:
124
- # Format the error nicely for CLI users
125
115
  sys.stderr.write(f"\n❌ Error: {e.flag}\n\n")
126
116
  sys.stderr.write(f" {e.reason}\n\n")
127
117
  sys.stderr.write(f" 💡 {e.migration}\n\n")
@@ -1,10 +1,8 @@
1
- """Exception classes for html-to-markdown."""
2
-
3
1
  from __future__ import annotations
4
2
 
5
3
 
6
4
  class HtmlToMarkdownError(Exception):
7
- """Base exception for html-to-markdown errors."""
5
+ """Base exception for all html-to-markdown errors."""
8
6
 
9
7
 
10
8
  class MissingDependencyError(HtmlToMarkdownError):
@@ -22,7 +20,7 @@ class MissingDependencyError(HtmlToMarkdownError):
22
20
 
23
21
 
24
22
  class InvalidParserError(HtmlToMarkdownError):
25
- """Raised when an invalid HTML parser is specified."""
23
+ """Raised when an invalid parser is specified."""
26
24
 
27
25
  def __init__(self, parser: str, available_parsers: list[str]) -> None:
28
26
  self.parser = parser
@@ -33,14 +31,14 @@ class InvalidParserError(HtmlToMarkdownError):
33
31
 
34
32
 
35
33
  class EmptyHtmlError(HtmlToMarkdownError):
36
- """Raised when the input HTML is empty."""
34
+ """Raised when input HTML is empty."""
37
35
 
38
36
  def __init__(self) -> None:
39
37
  super().__init__("The input HTML is empty.")
40
38
 
41
39
 
42
40
  class ConflictingOptionsError(HtmlToMarkdownError):
43
- """Raised when conflicting options are specified."""
41
+ """Raised when conflicting configuration options are specified."""
44
42
 
45
43
  def __init__(self, option1: str, option2: str) -> None:
46
44
  self.option1 = option1
@@ -50,20 +48,14 @@ class ConflictingOptionsError(HtmlToMarkdownError):
50
48
 
51
49
 
52
50
  class InvalidEncodingError(HtmlToMarkdownError):
53
- """Raised when an invalid encoding is specified."""
51
+ """Raised when an invalid character encoding is specified."""
54
52
 
55
53
  def __init__(self, encoding: str) -> None:
56
54
  super().__init__(f"The specified encoding ({encoding}) is not valid.")
57
55
 
58
56
 
59
57
  class UnsupportedV1FeatureError(HtmlToMarkdownError):
60
- """Raised when a v1 feature is not supported in v2.
61
-
62
- Args:
63
- flag: The CLI flag or feature that is not supported
64
- reason: Why the feature is not supported
65
- migration: How to migrate away from this feature
66
- """
58
+ """Raised when a v1 feature is not supported in v2."""
67
59
 
68
60
  def __init__(self, flag: str, reason: str, migration: str) -> None:
69
61
  self.flag = flag
@@ -74,8 +66,8 @@ class UnsupportedV1FeatureError(HtmlToMarkdownError):
74
66
 
75
67
 
76
68
  class RemovedV1FlagError(UnsupportedV1FeatureError):
77
- """Raised when a CLI flag has been completely removed in v2."""
69
+ """Raised when a v1 flag has been removed in v2."""
78
70
 
79
71
 
80
72
  class RedundantV1FlagError(UnsupportedV1FeatureError):
81
- """Raised when a v1 flag is redundant in v2 because it's the default behavior."""
73
+ """Raised when a v1 flag is redundant in v2."""
@@ -6,38 +6,7 @@ This module provides dataclass-based configuration for the v2 API.
6
6
  from __future__ import annotations
7
7
 
8
8
  from dataclasses import dataclass
9
- from typing import TYPE_CHECKING, Any, Literal, Protocol
10
-
11
- if TYPE_CHECKING:
12
- from collections.abc import Callable
13
-
14
- from bs4 import Tag
15
-
16
-
17
- class ConverterFunction(Protocol):
18
- """Protocol for custom converter functions.
19
-
20
- Converter functions receive keyword-only arguments including the HTML tag,
21
- processed text content, and any conversion options needed.
22
-
23
- Example:
24
- >>> def custom_link_converter(*, tag: Tag, text: str, autolinks: bool, **kwargs: Any) -> str:
25
- ... href = tag.get("href", "")
26
- ... return f"[{text}]({href})"
27
- """
28
-
29
- def __call__(self, *, tag: Tag, text: str, **kwargs: Any) -> str:
30
- """Convert an HTML element to Markdown.
31
-
32
- Args:
33
- tag: BeautifulSoup Tag object representing the HTML element
34
- text: Processed text content of the element's children
35
- **kwargs: Additional conversion options (varies by converter)
36
-
37
- Returns:
38
- Markdown string representation of the element
39
- """
40
- ...
9
+ from typing import Literal
41
10
 
42
11
 
43
12
  @dataclass
@@ -87,8 +56,8 @@ class ConversionOptions:
87
56
  code_language: str = ""
88
57
  """Default language for code blocks."""
89
58
 
90
- code_language_callback: Callable[[Tag], str] | None = None
91
- """Callback to determine code language from element."""
59
+ encoding: str = "utf-8"
60
+ """Character encoding expected for the HTML input."""
92
61
 
93
62
  autolinks: bool = True
94
63
  """Convert bare URLs to automatic links."""
@@ -102,15 +71,6 @@ class ConversionOptions:
102
71
  br_in_tables: bool = False
103
72
  """Use <br> tags for line breaks in table cells instead of spaces."""
104
73
 
105
- hocr_extract_tables: bool = True
106
- """Enable table extraction from hOCR (HTML-based OCR) documents."""
107
-
108
- hocr_table_column_threshold: int = 50
109
- """Pixel threshold for detecting column boundaries in hOCR tables."""
110
-
111
- hocr_table_row_threshold_ratio: float = 0.5
112
- """Row height ratio threshold for detecting row boundaries in hOCR tables."""
113
-
114
74
  highlight_style: Literal["double-equal", "html", "bold"] = "double-equal"
115
75
  """Style for highlighting <mark> elements."""
116
76
 
@@ -129,9 +89,6 @@ class ConversionOptions:
129
89
  wrap_width: int = 80
130
90
  """Column width for text wrapping."""
131
91
 
132
- convert: set[str] | None = None
133
- """HTML tags to convert to Markdown (None = all supported tags). v1 compatibility only."""
134
-
135
92
  strip_tags: set[str] | None = None
136
93
  """HTML tags to strip from output (output only text content, no markdown conversion)."""
137
94
 
@@ -150,9 +107,6 @@ class ConversionOptions:
150
107
  code_block_style: Literal["indented", "backticks", "tildes"] = "backticks"
151
108
  """Style for code blocks: 'backticks' (```, better whitespace preservation), 'indented' (4 spaces), or 'tildes' (~~~). All are CommonMark compliant."""
152
109
 
153
- custom_converters: dict[str, Callable[..., str]] | None = None
154
- """Custom converter functions for specific HTML elements."""
155
-
156
110
  debug: bool = False
157
111
  """Enable debug mode with diagnostic warnings about unhandled elements and hOCR processing."""
158
112
 
@@ -182,30 +136,3 @@ class PreprocessingOptions:
182
136
 
183
137
  remove_forms: bool = True
184
138
  """Remove form elements during preprocessing."""
185
-
186
- excluded_navigation_classes: set[str] | None = None
187
- """Navigation class fragments to keep even when removing navigation."""
188
-
189
- extra_navigation_classes: set[str] | None = None
190
- """Additional navigation class fragments to strip beyond defaults."""
191
-
192
-
193
- @dataclass
194
- class ParsingOptions:
195
- """HTML parsing configuration.
196
-
197
- Example:
198
- >>> options = ParsingOptions(
199
- ... encoding="utf-8",
200
- ... detect_encoding=True,
201
- ... )
202
- """
203
-
204
- encoding: str = "utf-8"
205
- """Character encoding for decoding bytes input."""
206
-
207
- detect_encoding: bool = False
208
- """Attempt to detect encoding from HTML (not yet implemented)."""
209
-
210
- parser: str | None = None
211
- """HTML parser to use: 'html.parser', 'lxml', or 'html5lib' (None = auto)."""
@@ -1,21 +1,18 @@
1
1
  """V1 API compatibility layer.
2
2
 
3
3
  Provides backward compatibility for the v1 convert_to_markdown API
4
- by translating v1 kwargs to v2 ConversionOptions/PreprocessingOptions/ParsingOptions.
4
+ by translating v1 kwargs to v2 ConversionOptions and PreprocessingOptions.
5
5
  """
6
6
 
7
7
  from __future__ import annotations
8
8
 
9
- from typing import TYPE_CHECKING
9
+ import warnings
10
10
 
11
- if TYPE_CHECKING:
12
- from collections.abc import Iterator
13
-
14
- from html_to_markdown import ConversionOptions, ParsingOptions, PreprocessingOptions
11
+ from html_to_markdown import ConversionOptions, PreprocessingOptions
15
12
  from html_to_markdown import convert as convert_v2
16
13
 
17
14
 
18
- def convert_to_markdown( # noqa: D417
15
+ def convert_to_markdown(
19
16
  html: str,
20
17
  *,
21
18
  heading_style: str = "underlined",
@@ -48,32 +45,72 @@ def convert_to_markdown( # noqa: D417
48
45
  preprocessing_preset: str = "standard",
49
46
  remove_navigation: bool = True,
50
47
  remove_forms: bool = True,
51
- parser: str = "html.parser",
52
48
  source_encoding: str = "utf-8",
53
49
  code_language_callback: object | None = None,
54
50
  strip: list[str] | None = None,
55
51
  convert: list[str] | None = None,
56
52
  custom_converters: dict[str, object] | None = None,
57
53
  ) -> str:
58
- """Convert HTML to Markdown (v1 API compatibility).
59
-
60
- This function provides backward compatibility with the v1 API by accepting
61
- the same kwargs and translating them to v2 ConversionOptions.
54
+ """Convert HTML to Markdown (v1 compatibility API).
62
55
 
63
- Note: Some v1 options are not supported in v2:
64
- - code_language_callback: Removed in v2
65
- - convert: Removed in v2
66
- - custom_converters: Not yet implemented in v2
56
+ This function provides backward compatibility with the v1 API by translating
57
+ v1-style keyword arguments to v2 ConversionOptions and PreprocessingOptions.
67
58
 
68
59
  Args:
69
- html: HTML string to convert
60
+ html: HTML string to convert.
61
+ heading_style: Style for headings (default: "underlined" for v1 compatibility).
62
+ list_indent_type: Type of indentation for lists.
63
+ list_indent_width: Number of spaces for list indentation (v1 default: 4).
64
+ bullets: Characters to use for unordered list bullets.
65
+ strong_em_symbol: Symbol for strong/emphasis formatting.
66
+ escape_asterisks: Escape asterisk characters (v1 default: True).
67
+ escape_underscores: Escape underscore characters (v1 default: True).
68
+ escape_misc: Escape miscellaneous Markdown characters (v1 default: True).
69
+ code_language: Default language for code blocks.
70
+ autolinks: Convert bare URLs to automatic links.
71
+ default_title: Add a default title if none exists.
72
+ br_in_tables: Use <br> tags for line breaks in table cells.
73
+ hocr_extract_tables: Deprecated - always True in v2.
74
+ hocr_table_column_threshold: Deprecated - uses built-in heuristics in v2.
75
+ hocr_table_row_threshold_ratio: Deprecated - uses built-in heuristics in v2.
76
+ highlight_style: Style for highlighting <mark> elements.
77
+ extract_metadata: Extract metadata from HTML head.
78
+ whitespace_mode: How to handle whitespace.
79
+ strip_newlines: Remove newlines from HTML before processing.
80
+ wrap: Enable text wrapping.
81
+ wrap_width: Column width for text wrapping.
82
+ convert_as_inline: Treat block elements as inline.
83
+ sub_symbol: Symbol for subscript text.
84
+ sup_symbol: Symbol for superscript text.
85
+ newline_style: Style for newlines.
86
+ keep_inline_images_in: Parent tag names where images should remain inline.
87
+ preprocess: Enable HTML preprocessing.
88
+ preprocessing_preset: Preprocessing aggressiveness level.
89
+ remove_navigation: Remove navigation elements during preprocessing.
90
+ remove_forms: Remove form elements during preprocessing.
91
+ source_encoding: Character encoding expected for the HTML input.
92
+ code_language_callback: Deprecated - not supported in v2.
93
+ strip: HTML tags to strip from output.
94
+ convert: Deprecated - not supported in v2.
95
+ custom_converters: Deprecated - not yet implemented in v2.
70
96
 
71
97
  Returns:
72
- Markdown string
98
+ Converted Markdown string.
73
99
 
74
100
  Raises:
75
- NotImplementedError: If unsupported v1 options are provided
101
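+ NotImplementedError: If a v1 option that was removed in v2 is used (code_language_callback, convert, custom_converters, or non-default hOCR table settings).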
102
+
103
+ .. deprecated:: 2.0
104
+ Use :func:`html_to_markdown.convert` with :class:`ConversionOptions` instead.
105
+ The v1 API is provided for backward compatibility only.
76
106
  """
107
+ warnings.warn(
108
+ "convert_to_markdown() is deprecated and will be removed in v3.0. "
109
+ "Use html_to_markdown.convert() with ConversionOptions instead.",
110
+ DeprecationWarning,
111
+ stacklevel=2,
112
+ )
113
+
77
114
  if code_language_callback is not None:
78
115
  raise NotImplementedError(
79
116
  "code_language_callback was removed in v2. Use the code_language option to set a default language."
@@ -82,9 +119,17 @@ def convert_to_markdown( # noqa: D417
82
119
  raise NotImplementedError("convert option was removed in v2. All supported tags are converted by default.")
83
120
  if custom_converters is not None:
84
121
  raise NotImplementedError("custom_converters is not yet implemented in v2")
122
+ if not hocr_extract_tables:
123
+ raise NotImplementedError(
124
+ "hocr_extract_tables toggle was removed in v2. hOCR tables are always reconstructed when detected."
125
+ )
126
+ if hocr_table_column_threshold != 50 or hocr_table_row_threshold_ratio != 0.5:
127
+ raise NotImplementedError(
128
+ "hOCR table threshold overrides were removed in v2. Table reconstruction now uses built-in heuristics."
129
+ )
85
130
 
86
- # V1 behavior: if code_language is set, use fenced code blocks (backticks)
87
- # V2 default is indented code blocks, so we need to override
131
+ # ~keep: v1 used indented code blocks by default, but switched to backticks when a language was set
132
+ # This maintains v1 behavior for backward compatibility
88
133
  code_block_style = "backticks" if code_language else "indented"
89
134
 
90
135
  options = ConversionOptions(
@@ -101,9 +146,6 @@ def convert_to_markdown( # noqa: D417
101
146
  autolinks=autolinks,
102
147
  default_title=default_title,
103
148
  br_in_tables=br_in_tables,
104
- hocr_extract_tables=hocr_extract_tables,
105
- hocr_table_column_threshold=hocr_table_column_threshold,
106
- hocr_table_row_threshold_ratio=hocr_table_row_threshold_ratio,
107
149
  highlight_style=highlight_style, # type: ignore[arg-type]
108
150
  extract_metadata=extract_metadata,
109
151
  whitespace_mode=whitespace_mode, # type: ignore[arg-type]
@@ -125,37 +167,23 @@ def convert_to_markdown( # noqa: D417
125
167
  remove_forms=remove_forms,
126
168
  )
127
169
 
128
- parsing = ParsingOptions(
129
- encoding=source_encoding,
130
- parser=parser,
131
- )
132
-
133
- return convert_v2(html, options, preprocessing, parsing)
134
-
170
+ options.encoding = source_encoding
171
+ return convert_v2(html, options, preprocessing)
135
172
 
136
- def convert_to_markdown_stream( # noqa: D417
137
- html: str,
138
- *,
139
- chunk_size: int = 4096,
140
- **kwargs: object,
141
- ) -> Iterator[str]:
142
- """Stream HTML to Markdown conversion (v1 API).
143
173
 
144
- Note: Streaming was removed in v2.
174
+ def markdownify(*args: object, **kwargs: object) -> str:
175
+ """Alias for convert_to_markdown (deprecated).
145
176
 
146
- Args:
147
- html: HTML string to convert
148
- chunk_size: Size of chunks to yield (not used in v2)
149
-
150
- Raises:
151
- NotImplementedError: Streaming was removed in v2
177
+ .. deprecated:: 2.0
178
+ Use html_to_markdown.convert() instead.
152
179
  """
153
- raise NotImplementedError(
154
- "Streaming API (convert_to_markdown_stream) was removed in v2 (html5ever does not support streaming). "
155
- "Use convert_to_markdown() instead."
180
+ warnings.warn(
181
+ "markdownify() is deprecated and will be removed in v3.0. "
182
+ "Use html_to_markdown.convert() with ConversionOptions instead.",
183
+ DeprecationWarning,
184
+ stacklevel=2,
156
185
  )
186
+ return convert_to_markdown(*args, **kwargs) # type: ignore[arg-type]
157
187
 
158
188
 
159
- markdownify = convert_to_markdown
160
-
161
- __all__ = ["convert_to_markdown", "convert_to_markdown_stream", "markdownify"]
189
+ __all__ = ["convert_to_markdown", "markdownify"]
@@ -0,0 +1,196 @@
1
+ Metadata-Version: 2.4
2
+ Name: html-to-markdown
3
+ Version: 2.1.2
4
+ Classifier: Development Status :: 5 - Production/Stable
5
+ Classifier: Environment :: Console
6
+ Classifier: Intended Audience :: Developers
7
+ Classifier: License :: OSI Approved :: MIT License
8
+ Classifier: Operating System :: OS Independent
9
+ Classifier: Programming Language :: Python :: 3 :: Only
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Programming Language :: Python :: 3.13
14
+ Classifier: Programming Language :: Python :: 3.14
15
+ Classifier: Programming Language :: Rust
16
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
17
+ Classifier: Topic :: Text Processing
18
+ Classifier: Topic :: Text Processing :: Markup
19
+ Classifier: Topic :: Text Processing :: Markup :: HTML
20
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
21
+ Classifier: Typing :: Typed
22
+ License-File: LICENSE
23
+ Summary: High-performance HTML to Markdown converter powered by Rust with a clean Python API
24
+ Keywords: cli-tool,converter,html,html2markdown,html5,markdown,markup,parser,rust,text-processing
25
+ Home-Page: https://github.com/Goldziher/html-to-markdown
26
+ Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
27
+ Requires-Python: >=3.10
28
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
29
+ Project-URL: Changelog, https://github.com/Goldziher/html-to-markdown/releases
30
+ Project-URL: Homepage, https://github.com/Goldziher/html-to-markdown
31
+ Project-URL: Issues, https://github.com/Goldziher/html-to-markdown/issues
32
+ Project-URL: Repository, https://github.com/Goldziher/html-to-markdown.git
33
+
34
+ # html-to-markdown
35
+
36
+ High-performance HTML to Markdown converter with a clean Python API (powered by a Rust core). Wheels are published for Linux, macOS, and Windows.
37
+
38
+ [![PyPI version](https://badge.fury.io/py/html-to-markdown.svg)](https://github.com/Goldziher/html-to-markdown)
39
+ [![Rust crate](https://img.shields.io/crates/v/html-to-markdown-rs.svg)](https://github.com/Goldziher/html-to-markdown)
40
+ [![Python Versions](https://img.shields.io/pypi/pyversions/html-to-markdown.svg)](https://github.com/Goldziher/html-to-markdown)
41
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/Goldziher/html-to-markdown/blob/main/LICENSE)
42
+
43
+ ## Installation
44
+
45
+ ```bash
46
+ pip install html-to-markdown
47
+ ```
48
+
49
+ ## Performance Snapshot
50
+
51
+ Apple M4 • Real Wikipedia documents • `convert()` (Python)
52
+
53
+ | Document | Size | Latency | Throughput | Docs/sec |
54
+ | ------------------- | ----- | ------- | ---------- | -------- |
55
+ | Lists (Timeline) | 129KB | 0.62ms | 208 MB/s | 1,613 |
56
+ | Tables (Countries) | 360KB | 2.02ms | 178 MB/s | 495 |
57
+ | Mixed (Python wiki) | 656KB | 4.56ms | 144 MB/s | 219 |
58
+
59
+ > V1 averaged ~2.5 MB/s (Python/BeautifulSoup). V2’s Rust engine delivers 60–80× higher throughput.
60
+
61
+ ## Quick Start
62
+
63
+ ```python
64
+ from html_to_markdown import convert
65
+
66
+ html = """
67
+ <h1>Welcome</h1>
68
+ <p>This is <strong>fast</strong> Rust-powered conversion!</p>
69
+ <ul>
70
+ <li>Blazing fast</li>
71
+ <li>Type safe</li>
72
+ <li>Easy to use</li>
73
+ </ul>
74
+ """
75
+
76
+ markdown = convert(html)
77
+ print(markdown)
78
+ ```
79
+
80
+ ## Configuration (v2 API)
81
+
82
+ ```python
83
+ from html_to_markdown import ConversionOptions, convert
84
+
85
+ options = ConversionOptions(
86
+ heading_style="atx",
87
+ list_indent_width=2,
88
+ bullets="*+-",
89
+ )
90
+ options.escape_asterisks = True
91
+ options.code_language = "python"
92
+ options.extract_metadata = True
93
+
94
+ markdown = convert(html, options)
95
+ ```
96
+
97
+ ### HTML Preprocessing
98
+
99
+ ```python
100
+ from html_to_markdown import ConversionOptions, PreprocessingOptions, convert
101
+
102
+ options = ConversionOptions(
103
+ preprocessing=PreprocessingOptions(enabled=True, preset="aggressive"),
104
+ )
105
+
106
+ markdown = convert(scraped_html, options)
107
+ ```
108
+
109
+ ### Inline Image Extraction
110
+
111
+ ```python
112
+ from html_to_markdown import InlineImageConfig, convert_with_inline_images
113
+
114
+ markdown, inline_images, warnings = convert_with_inline_images(
115
+ '<p><img src="data:image/png;base64,...==" alt="Pixel" width="1" height="1"></p>',
116
+ image_config=InlineImageConfig(max_decoded_size_bytes=1024, infer_dimensions=True),
117
+ )
118
+
119
+ if inline_images:
120
+ first = inline_images[0]
121
+ print(first["format"], first["dimensions"], first["attributes"]) # e.g. "png", (1, 1), {"width": "1"}
122
+ ```
123
+
124
+ Each inline image is returned as a typed dictionary (`bytes` payload, metadata, and relevant HTML attributes). Warnings are human-readable skip reasons.
125
+
126
+ ### hOCR (HTML OCR) Support
127
+
128
+ ```python
129
+ from html_to_markdown import ConversionOptions, convert
130
+
131
+ # Default: emit structured Markdown directly
132
+ markdown = convert(hocr_html)
133
+
134
+ # hOCR documents are detected automatically; tables are reconstructed without extra configuration.
135
+ markdown = convert(hocr_html)
136
+ ```
137
+
138
+ ## CLI (same engine)
139
+
140
+ ```bash
141
+ pipx install html-to-markdown # or: pip install html-to-markdown
142
+
143
+ html-to-markdown page.html > page.md
144
+ cat page.html | html-to-markdown --heading-style atx > page.md
145
+ ```
146
+
147
+ ## API Surface
148
+
149
+ ### `ConversionOptions`
150
+
151
+ Key fields (see docstring for full matrix):
152
+
153
+ - `heading_style`: `"underlined" | "atx" | "atx_closed"`
154
+ - `list_indent_width`: spaces per indent level (default 2)
155
+ - `bullets`: cycle of bullet characters (`"*+-"`)
156
+ - `strong_em_symbol`: `"*"` or `"_"`
157
+ - `code_language`: default fenced code block language
158
+ - `wrap`, `wrap_width`: wrap Markdown output
159
+ - `strip_tags`: remove specific HTML tags
160
+ - `preprocessing`: `PreprocessingOptions`
161
+ - `encoding`: input character encoding (informational)
162
+
163
+ ### `PreprocessingOptions`
164
+
165
+ - `enabled`: enable HTML sanitisation
166
+ - `preset`: `"minimal" | "standard" | "aggressive"`
167
+ - `remove_navigation`, `remove_forms`
168
+
169
+ ### `InlineImageConfig`
170
+
171
+ - `max_decoded_size_bytes`: reject larger payloads
172
+ - `filename_prefix`: generated name prefix (`embedded_image` default)
173
+ - `capture_svg`: collect inline `<svg>` (default `True`)
174
+ - `infer_dimensions`: decode raster images to obtain dimensions (default `False`)
175
+
176
+ ## v1 Compatibility
177
+
178
+ - **Performance**: V1 averaged ~2.5 MB/s; V2 sustains 150–210 MB/s with identical Markdown output.
179
+ - **Compat shim**: `html_to_markdown.v1_compat` exposes `convert_to_markdown` and `markdownify` to ease migration. Keyword mappings are listed in the [changelog](CHANGELOG.md#v200).
180
+ - **CLI**: The Rust CLI replaces the Python script. New flags are documented via `html-to-markdown --help`.
181
+ - **Removed options**: `code_language_callback`, `strip`, and streaming APIs were removed; use `ConversionOptions`, `PreprocessingOptions`, and the inline-image helpers instead.
182
+
183
+ ## Links
184
+
185
+ - GitHub: [https://github.com/Goldziher/html-to-markdown](https://github.com/Goldziher/html-to-markdown)
186
+ - Discord: [https://discord.gg/pXxagNK2zN](https://discord.gg/pXxagNK2zN)
187
+ - Kreuzberg ecosystem: [https://kreuzberg.dev](https://kreuzberg.dev)
188
+
189
+ ## License
190
+
191
+ MIT License – see [LICENSE](https://github.com/Goldziher/html-to-markdown/blob/main/LICENSE).
192
+
193
+ ## Support
194
+
195
+ If you find this library useful, consider [sponsoring the project](https://github.com/sponsors/Goldziher).
196
+
@@ -0,0 +1,17 @@
1
+ html_to_markdown-2.1.2.data/scripts/html-to-markdown.exe,sha256=SJCBlZp0uTo6_wzS7KEgXariZtABUVc64o5TVxOwVZo,4414976
2
+ html_to_markdown-2.1.2.dist-info/METADATA,sha256=TgFH9djK4HzJF_vDFVZCm7EDXYscA4v9t31DuXCujIE,7233
3
+ html_to_markdown-2.1.2.dist-info/WHEEL,sha256=4EDp_7DiFfWl1yYv5M4wSosAn5L_xgD1dyrQxQxfCx8,95
4
+ html_to_markdown-2.1.2.dist-info/licenses/LICENSE,sha256=QhKFMkQLa4mSUlOsyG9VElzC7GYbAKtiS_EwOCyH-b4,1107
5
+ html_to_markdown/__init__.py,sha256=3_Egcf46oNcEam7rc7zAHx8lfOj1eVNO1p0kErVf_fs,1191
6
+ html_to_markdown/__main__.py,sha256=5objj9lB7hhpSpZsDok5tv9o9yztVR63Ccww-pXsAyY,343
7
+ html_to_markdown/_html_to_markdown.pyd,sha256=ES7QEe9lTb2ZK3yvC2-vNHng__U7HB3CY5p2wJ0IuNQ,4159488
8
+ html_to_markdown/_rust.pyi,sha256=SHrrT8opJd5kcRYycooR4AS9is5tr1beSGtpoUWqzNc,2097
9
+ html_to_markdown/api.py,sha256=YQQuJoO1OQnXpuOLk8TbdQDTARcKYFbf_zSA44BeHCM,2800
10
+ html_to_markdown/bin/html-to-markdown.exe,sha256=SJCBlZp0uTo6_wzS7KEgXariZtABUVc64o5TVxOwVZo,4414976
11
+ html_to_markdown/cli.py,sha256=z59l8sF8wIRRzJtUd-tXgqiC0WTqkTjzl-df8Ey_oQ0,67
12
+ html_to_markdown/cli_proxy.py,sha256=J2Qk9MnnkFKIroxc0wn79nzI0dXqXDDNEAF9o9hth9Y,3829
13
+ html_to_markdown/exceptions.py,sha256=31VqpPi4JLGv7lI2481Z4f2s5ejYmq97c3s-WFFkXVU,2443
14
+ html_to_markdown/options.py,sha256=ijjRBTwrbESbwmYTOXV_ZO1A1GAmOzzILiFoPeC-jZk,4940
15
+ html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
+ html_to_markdown/v1_compat.py,sha256=Lb3pppLfVH9EyAYGbOfpcO3vYkof4SIYDMI-CBEbh-A,8045
17
+ html_to_markdown-2.1.2.dist-info/RECORD,,
@@ -1,243 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: html-to-markdown
3
- Version: 2.0.1
4
- Classifier: Development Status :: 5 - Production/Stable
5
- Classifier: Environment :: Console
6
- Classifier: Intended Audience :: Developers
7
- Classifier: License :: OSI Approved :: MIT License
8
- Classifier: Operating System :: OS Independent
9
- Classifier: Programming Language :: Python :: 3 :: Only
10
- Classifier: Programming Language :: Python :: 3.10
11
- Classifier: Programming Language :: Python :: 3.11
12
- Classifier: Programming Language :: Python :: 3.12
13
- Classifier: Programming Language :: Python :: 3.13
14
- Classifier: Programming Language :: Rust
15
- Classifier: Topic :: Software Development :: Libraries :: Python Modules
16
- Classifier: Topic :: Text Processing
17
- Classifier: Topic :: Text Processing :: Markup
18
- Classifier: Topic :: Text Processing :: Markup :: HTML
19
- Classifier: Topic :: Text Processing :: Markup :: Markdown
20
- Classifier: Typing :: Typed
21
- License-File: LICENSE
22
- Summary: High-performance HTML to Markdown converter powered by Rust with a clean Python API
23
- Keywords: cli-tool,converter,html,html2markdown,html5,markdown,markup,parser,rust,text-processing
24
- Home-Page: https://github.com/Goldziher/html-to-markdown
25
- Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
26
- Requires-Python: >=3.10
27
- Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
28
- Project-URL: Changelog, https://github.com/Goldziher/html-to-markdown/releases
29
- Project-URL: Homepage, https://github.com/Goldziher/html-to-markdown
30
- Project-URL: Issues, https://github.com/Goldziher/html-to-markdown/issues
31
- Project-URL: Repository, https://github.com/Goldziher/html-to-markdown.git
32
-
33
- # html-to-markdown
34
-
35
- High-performance HTML to Markdown converter powered by Rust with a clean Python API. Available via PyPI with pre-built wheels for all major platforms.
36
-
37
- [![PyPI version](https://badge.fury.io/py/html-to-markdown.svg)](https://pypi.org/project/html-to-markdown/)
38
- [![Crates.io](https://img.shields.io/crates/v/html-to-markdown-rs.svg)](https://crates.io/crates/html-to-markdown-rs)
39
- [![Python Versions](https://img.shields.io/pypi/pyversions/html-to-markdown.svg)](https://pypi.org/project/html-to-markdown/)
40
- [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
41
- [![Discord](https://img.shields.io/badge/Discord-Join%20our%20community-7289da)](https://discord.gg/pXxagNK2zN)
42
-
43
- Part of the [Kreuzberg](https://kreuzberg.dev) ecosystem for document intelligence.
44
-
45
- ## Installation
46
-
47
- ```bash
48
- pip install html-to-markdown
49
- ```
50
-
51
- Pre-built wheels available for:
52
-
53
- - **Linux**: x86_64, aarch64
54
- - **macOS**: x86_64 (Intel), arm64 (Apple Silicon)
55
- - **Windows**: x86_64
56
-
57
- ## ⚡ Performance
58
-
59
- Real Wikipedia documents on Apple M4:
60
-
61
- | Document | Size | Latency | Throughput | Docs/sec |
62
- | ------------------- | ----- | ------- | ---------- | -------- |
63
- | Lists (Timeline) | 129KB | 0.62ms | 208 MB/s | 1,613 |
64
- | Tables (Countries) | 360KB | 2.02ms | 178 MB/s | 495 |
65
- | Mixed (Python wiki) | 656KB | 4.56ms | 144 MB/s | 219 |
66
-
67
- **19-30x faster** than pure Python implementations.
68
-
69
- ## Quick Start
70
-
71
- ```python
72
- from html_to_markdown import convert_to_markdown
73
-
74
- html = """
75
- <h1>Welcome</h1>
76
- <p>This is <strong>fast</strong> Rust-powered conversion!</p>
77
- <ul>
78
- <li>Blazing fast</li>
79
- <li>Type safe</li>
80
- <li>Easy to use</li>
81
- </ul>
82
- """
83
-
84
- markdown = convert_to_markdown(html)
85
- print(markdown)
86
- ```
87
-
88
- Output:
89
-
90
- ```markdown
91
- # Welcome
92
-
93
- This is **fast** Rust-powered conversion!
94
-
95
- - Blazing fast
96
- - Type safe
97
- - Easy to use
98
- ```
99
-
100
- ## Configuration
101
-
102
- ```python
103
- from html_to_markdown import convert_to_markdown
104
-
105
- markdown = convert_to_markdown(
106
- html,
107
- heading_style="atx", # "atx", "atx_closed", "underlined"
108
- list_indent_width=2, # Discord/Slack: use 2
109
- bullets="*+-", # Bullet characters
110
- strong_em_symbol="*", # "*" or "_"
111
- escape_asterisks=True, # Escape * in text
112
- code_language="python", # Default code block language
113
- extract_metadata=True, # Extract HTML metadata
114
- )
115
- ```
116
-
117
- ### HTML Preprocessing
118
-
119
- Clean web-scraped HTML before conversion:
120
-
121
- ```python
122
- from html_to_markdown import convert_to_markdown
123
-
124
- markdown = convert_to_markdown(
125
- scraped_html,
126
- preprocess=True,
127
- preprocessing_preset="aggressive", # "minimal", "standard", "aggressive"
128
- )
129
- ```
130
-
131
- ## Features
132
-
133
- - **🚀 Blazing Fast**: Pure Rust core with ultra-fast `tl` HTML parser
134
- - **🐍 Type Safe**: Full type hints and `.pyi` stubs for excellent IDE support
135
- - **📊 hOCR 1.2 Compliant**: Full support for all 40+ elements and 20+ properties
136
- - **📝 CommonMark Compliant**: Follows CommonMark specification for list formatting
137
- - **🌍 Cross-Platform**: Pre-built wheels for Linux, macOS, and Windows
138
- - **✅ Well-Tested**: 900+ tests with dual Python + Rust coverage
139
- - **🔧 Zero Dependencies**: No BeautifulSoup or lxml required
140
-
141
- ## hOCR 1.2 Support
142
-
143
- Complete hOCR 1.2 specification compliance with support for all elements, properties, and metadata:
144
-
145
- ```python
146
- from html_to_markdown import convert_to_markdown
147
-
148
- # Option 1: Document structure extraction (NEW in v2)
149
- # Extracts all hOCR elements and converts to structured markdown
150
- markdown = convert_to_markdown(hocr_html)
151
-
152
- # Option 2: Legacy table extraction (spatial reconstruction)
153
- # Reconstructs tables from word bounding boxes
154
- markdown = convert_to_markdown(
155
- hocr_html,
156
- hocr_extract_tables=True,
157
- hocr_table_column_threshold=50,
158
- hocr_table_row_threshold_ratio=0.5,
159
- )
160
- ```
161
-
162
- **Full hOCR 1.2 Spec Coverage:**
163
-
164
- - ✅ **All 40 Element Types** - Logical structure, typesetting, floats, inline, engine-specific
165
- - ✅ **All 20+ Properties** - bbox, baseline, textangle, poly, x_wconf, x_font, x_fsize, and more
166
- - ✅ **All 5 Metadata Fields** - ocr-system, ocr-capabilities, ocr-number-of-pages, ocr-langs, ocr-scripts
167
-
168
- ## Configuration Reference
169
-
170
- ### ConversionOptions
171
-
172
- | Option | Type | Default | Description |
173
- | -------------------------------- | ----- | ------------- | ----------------------------------------------------------------------- |
174
- | `heading_style` | str | `"atx"` | Heading format: `"atx"` (#), `"atx_closed"` (# #), `"underlined"` (===) |
175
- | `list_indent_width` | int | `2` | Spaces per list indent level (CommonMark: 2) |
176
- | `list_indent_type` | str | `"spaces"` | `"spaces"` or `"tabs"` |
177
- | `bullets` | str | `"*+-"` | Bullet chars for unordered lists (cycles through levels) |
178
- | `strong_em_symbol` | str | `"*"` | Symbol for bold/italic: `"*"` or `"_"` |
179
- | `escape_asterisks` | bool | `True` | Escape `*` in text |
180
- | `escape_underscores` | bool | `True` | Escape `_` in text |
181
- | `code_language` | str | `""` | Default language for code blocks |
182
- | `code_block_style` | str | `"backticks"` | `"indented"` (4 spaces), `"backticks"` (\`\`\`), `"tildes"` (\~~~) |
183
- | `extract_metadata` | bool | `True` | Extract HTML metadata as comment |
184
- | `hocr_extract_tables` | bool | `True` | Enable hOCR table extraction |
185
- | `hocr_table_column_threshold` | int | `50` | Column detection threshold (pixels) |
186
- | `hocr_table_row_threshold_ratio` | float | `0.5` | Row grouping threshold ratio |
187
-
188
- ### Preprocessing Options
189
-
190
- | Option | Type | Default | Description |
191
- | ---------------------- | ---- | ------------ | ----------------------------------------- |
192
- | `preprocess` | bool | `False` | Enable HTML preprocessing |
193
- | `preprocessing_preset` | str | `"standard"` | `"minimal"`, `"standard"`, `"aggressive"` |
194
-
195
- ## CLI Tool
196
-
197
- A native Rust CLI binary is also available:
198
-
199
- ```bash
200
- # Install via pipx (recommended for CLI tools)
201
- pipx install html-to-markdown
202
-
203
- # Or install with pip
204
- pip install html-to-markdown
205
-
206
- # Use the CLI
207
- html-to-markdown input.html > output.md
208
- echo "<h1>Test</h1>" | html-to-markdown
209
- ```
210
-
211
- **For Rust library usage and comprehensive documentation**, see the [GitHub repository](https://github.com/Goldziher/html-to-markdown).
212
-
213
- ## Upgrading from v1.x
214
-
215
- Most v1 code works without changes (see **Removed Features** below for the exceptions). v2 is a complete Rust rewrite with **19-30x performance improvements**:
216
-
217
- **What Changed:**
218
-
219
- - Complete Rust rewrite using `tl` HTML parser
220
- - CommonMark-compliant defaults (2-space indents, minimal escaping, ATX headings)
221
- - No BeautifulSoup or lxml dependencies
222
-
223
- **Removed Features:**
224
-
225
- - `code_language_callback` - use `code_language` for default language
226
- - `strip` / `convert` options - use preprocessing instead
227
- - `convert_to_markdown_stream()` - not supported in v2
228
-
229
- ## Links
230
-
231
- - **GitHub Repository**: [https://github.com/Goldziher/html-to-markdown](https://github.com/Goldziher/html-to-markdown)
232
- - **Rust Crate**: [https://crates.io/crates/html-to-markdown-rs](https://crates.io/crates/html-to-markdown-rs)
233
- - **Discord Community**: [https://discord.gg/pXxagNK2zN](https://discord.gg/pXxagNK2zN)
234
- - **Kreuzberg Ecosystem**: [https://kreuzberg.dev](https://kreuzberg.dev)
235
-
236
- ## License
237
-
238
- MIT License - see [LICENSE](https://github.com/Goldziher/html-to-markdown/blob/main/LICENSE) for details.
239
-
240
- ## Support
241
-
242
- If you find this library useful, consider [sponsoring the project](https://github.com/sponsors/Goldziher).
243
-
@@ -1,17 +0,0 @@
1
- html_to_markdown-2.0.1.data/scripts/html-to-markdown.exe,sha256=4IXz5ITYrahD_3Qze4ldQ1R9AdrxiEO9xWF56VnaMJE,4380672
2
- html_to_markdown-2.0.1.dist-info/METADATA,sha256=altO23YUZgJVPuW2sbzObGYEQAWS9MDDkAPMIZjlHWg,10057
3
- html_to_markdown-2.0.1.dist-info/WHEEL,sha256=4EDp_7DiFfWl1yYv5M4wSosAn5L_xgD1dyrQxQxfCx8,95
4
- html_to_markdown-2.0.1.dist-info/licenses/LICENSE,sha256=QhKFMkQLa4mSUlOsyG9VElzC7GYbAKtiS_EwOCyH-b4,1107
5
- html_to_markdown/__init__.py,sha256=c8bbkGkjR0bQs9ZZzhNevTULM4vzQIDXv3aaHSfVtmQ,1314
6
- html_to_markdown/__main__.py,sha256=5objj9lB7hhpSpZsDok5tv9o9yztVR63Ccww-pXsAyY,343
7
- html_to_markdown/_html_to_markdown.pyd,sha256=djqk4pMy4X6ioYreksFwKsmAtGuZzhBNhg2Mtw4AcPM,3401216
8
- html_to_markdown/_rust.pyi,sha256=bA1lfF_pRWpMFXApXVl7VXUcV1q8-T4-jVrEqLDwJ1Y,2220
9
- html_to_markdown/api.py,sha256=HfxYYebne2bZU62a-VALhurWfqDy5PEfUYNVnU6suac,3720
10
- html_to_markdown/bin/html-to-markdown.exe,sha256=4IXz5ITYrahD_3Qze4ldQ1R9AdrxiEO9xWF56VnaMJE,4380672
11
- html_to_markdown/cli.py,sha256=4bQ44HhOKtQJY9t-TjSul-J7pqSHEpg9NKtZ0AeAqZA,263
12
- html_to_markdown/cli_proxy.py,sha256=mq-1BTjnr380N-mCU4cx8DyTIbY6ycq-_2DA10sLlfk,4442
13
- html_to_markdown/exceptions.py,sha256=L2YmpDWsvxTu2b-M_oT3mQdyyNhDM1jsvdWr18ZvD2Q,2707
14
- html_to_markdown/options.py,sha256=cenv2-vwduteQdIt6IXqDD0JyRFfa05oqhWgxKVdlB4,7452
15
- html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
- html_to_markdown/v1_compat.py,sha256=Qts9BqmWMcZt5CZ8P20_vf8oc43tMgZxkj3-W0YI2VE,5682
17
- html_to_markdown-2.0.1.dist-info/RECORD,,