html-to-markdown 2.0.0__cp310-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

@@ -0,0 +1,48 @@
1
+ """html-to-markdown: Convert HTML to Markdown using Rust backend.
2
+
3
+ This package provides high-performance HTML to Markdown conversion
4
+ powered by Rust with a clean Python API.
5
+
6
+ V2 API (current):
7
+ from html_to_markdown import convert, ConversionOptions
8
+
9
+ options = ConversionOptions(heading_style="atx")
10
+ markdown = convert(html, options)
11
+
12
+ V1 API (backward compatibility):
13
+ from html_to_markdown import convert_to_markdown
14
+
15
+ markdown = convert_to_markdown(html, heading_style="atx")
16
+ """
17
+
18
+ from html_to_markdown.api import convert
19
+ from html_to_markdown.exceptions import (
20
+ ConflictingOptionsError,
21
+ EmptyHtmlError,
22
+ HtmlToMarkdownError,
23
+ InvalidParserError,
24
+ MissingDependencyError,
25
+ )
26
+ from html_to_markdown.options import (
27
+ ConversionOptions,
28
+ ParsingOptions,
29
+ PreprocessingOptions,
30
+ )
31
+ from html_to_markdown.v1_compat import convert_to_markdown, convert_to_markdown_stream, markdownify
32
+
33
+ __all__ = [
34
+ "ConflictingOptionsError",
35
+ "ConversionOptions",
36
+ "EmptyHtmlError",
37
+ "HtmlToMarkdownError",
38
+ "InvalidParserError",
39
+ "MissingDependencyError",
40
+ "ParsingOptions",
41
+ "PreprocessingOptions",
42
+ "convert",
43
+ "convert_to_markdown",
44
+ "convert_to_markdown_stream",
45
+ "markdownify",
46
+ ]
47
+
48
+ __version__ = "2.0.0"
@@ -0,0 +1,16 @@
1
+ import sys
2
+
3
+ from html_to_markdown.cli_proxy import main
4
+
5
+
6
def cli() -> None:
    """Console entry point: run the CLI proxy and print its output.

    Passes every command-line argument after the program name to ``main`` and
    writes the returned text to stdout without adding a trailing newline.
    A ``ValueError`` or ``FileNotFoundError`` is reported on stderr and the
    process exits with status 1.
    """
    try:
        output = main(sys.argv[1:])
        print(output, end="")  # noqa: T201
    except (ValueError, FileNotFoundError) as error:
        print(error, file=sys.stderr)  # noqa: T201
        sys.exit(1)
13
+
14
+
15
+ if __name__ == "__main__":
16
+ cli()
@@ -0,0 +1,79 @@
1
class ConversionOptions:
    # Type stub for a conversion-options class whose constructor body is elided.
    # NOTE(review): this appears to describe the native options object that
    # html_to_markdown.api.convert builds via `_rust.ConversionOptions(...)`;
    # confirm which module this stub belongs to.
    heading_style: str
    list_indent_type: str
    list_indent_width: int
    bullets: str
    strong_em_symbol: str
    escape_asterisks: bool
    escape_underscores: bool
    escape_misc: bool
    code_language: str
    autolinks: bool
    default_title: bool
    br_in_tables: bool
    highlight_style: str
    extract_metadata: bool
    whitespace_mode: str
    strip_newlines: bool
    wrap: bool
    wrap_width: int
    convert_as_inline: bool
    sub_symbol: str
    sup_symbol: str
    newline_style: str
    preprocessing: PreprocessingOptions
    parsing: ParsingOptions
    # The attributes below mirror keyword arguments that api.convert passes to
    # the native constructor but that were missing from this stub.
    escape_ascii: bool
    hocr_extract_tables: bool
    hocr_table_column_threshold: int
    hocr_table_row_threshold_ratio: float
    code_block_style: str
    keep_inline_images_in: list[str]
    debug: bool
    strip_tags: list[str]

    def __init__(
        self,
        heading_style: str = "underlined",
        list_indent_type: str = "spaces",
        list_indent_width: int = 4,
        bullets: str = "*+-",
        strong_em_symbol: str = "*",
        escape_asterisks: bool = True,
        escape_underscores: bool = True,
        escape_misc: bool = True,
        code_language: str = "",
        autolinks: bool = True,
        default_title: bool = False,
        br_in_tables: bool = False,
        highlight_style: str = "double-equal",
        extract_metadata: bool = True,
        whitespace_mode: str = "normalized",
        strip_newlines: bool = False,
        wrap: bool = False,
        wrap_width: int = 80,
        convert_as_inline: bool = False,
        sub_symbol: str = "",
        sup_symbol: str = "",
        newline_style: str = "spaces",
        preprocessing: PreprocessingOptions | None = None,
        parsing: ParsingOptions | None = None,
        # Appended after the existing parameters so positional callers keep
        # working. The native defaults are not visible here, hence `...`.
        escape_ascii: bool = ...,
        hocr_extract_tables: bool = ...,
        hocr_table_column_threshold: int = ...,
        hocr_table_row_threshold_ratio: float = ...,
        code_block_style: str = ...,
        keep_inline_images_in: list[str] = ...,
        debug: bool = ...,
        strip_tags: list[str] = ...,
    ) -> None: ...
54
+
55
class PreprocessingOptions:
    # Type stub for the HTML preprocessing settings; the constructor body is
    # elided ("...") and only the signature is declared here.
    enabled: bool
    preset: str
    remove_navigation: bool
    remove_forms: bool

    def __init__(
        self,
        enabled: bool = False,
        preset: str = "standard",
        remove_navigation: bool = True,
        remove_forms: bool = True,
    ) -> None: ...
68
+
69
class ParsingOptions:
    # Type stub for the HTML parsing settings; the constructor body is elided.
    encoding: str
    # None is accepted; the meaning of None is not visible in this stub.
    parser: str | None

    def __init__(
        self,
        encoding: str = "utf-8",
        parser: str | None = None,
    ) -> None: ...
78
+
79
# Stub signature: takes an HTML string plus an optional ConversionOptions and
# returns a string. The implementation is not part of this stub.
def convert(html: str, options: ConversionOptions | None = None) -> str: ...
@@ -0,0 +1,100 @@
1
+ """New v2 functional API for HTML to Markdown conversion.
2
+
3
+ This module provides the new functional API with dataclass-based options,
4
+ using the Rust backend for conversion.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import html_to_markdown._html_to_markdown as _rust # type: ignore[import-not-found]
10
+ from html_to_markdown.options import (
11
+ ConversionOptions,
12
+ ParsingOptions,
13
+ PreprocessingOptions,
14
+ )
15
+
16
+
17
def convert(
    html: str,
    options: ConversionOptions | None = None,
    preprocessing: PreprocessingOptions | None = None,
    parsing: ParsingOptions | None = None,
) -> str:
    """Convert HTML to Markdown by delegating to the Rust extension.

    Each options object falls back to a default-constructed instance when it is
    ``None``. The Python option objects are then copied field by field into
    their ``_rust`` counterparts and handed to ``_rust.convert``.

    Args:
        html: HTML string to convert
        options: Conversion options (uses defaults if None)
        preprocessing: HTML preprocessing options (uses defaults if None)
        parsing: HTML parsing options (uses defaults if None)

    Returns:
        Markdown string

    Example:
        >>> from html_to_markdown import convert, ConversionOptions
        >>> options = ConversionOptions(heading_style="atx", list_indent_width=2)
        >>> markdown = convert("<h1>Title</h1>", options)
        >>> print(markdown)
        # Title
        <BLANKLINE>
    """
    opts = ConversionOptions() if options is None else options
    pre = PreprocessingOptions() if preprocessing is None else preprocessing
    parse = ParsingOptions() if parsing is None else parsing

    native_preprocessing = _rust.PreprocessingOptions(
        enabled=pre.enabled,
        preset=pre.preset,
        remove_navigation=pre.remove_navigation,
        remove_forms=pre.remove_forms,
    )
    native_parsing = _rust.ParsingOptions(encoding=parse.encoding, parser=parse.parser)

    native_options = _rust.ConversionOptions(
        heading_style=opts.heading_style,
        list_indent_type=opts.list_indent_type,
        list_indent_width=opts.list_indent_width,
        bullets=opts.bullets,
        strong_em_symbol=opts.strong_em_symbol,
        escape_asterisks=opts.escape_asterisks,
        escape_underscores=opts.escape_underscores,
        escape_misc=opts.escape_misc,
        escape_ascii=opts.escape_ascii,
        code_language=opts.code_language,
        autolinks=opts.autolinks,
        default_title=opts.default_title,
        br_in_tables=opts.br_in_tables,
        hocr_extract_tables=opts.hocr_extract_tables,
        hocr_table_column_threshold=opts.hocr_table_column_threshold,
        hocr_table_row_threshold_ratio=opts.hocr_table_row_threshold_ratio,
        highlight_style=opts.highlight_style,
        extract_metadata=opts.extract_metadata,
        whitespace_mode=opts.whitespace_mode,
        strip_newlines=opts.strip_newlines,
        wrap=opts.wrap,
        wrap_width=opts.wrap_width,
        convert_as_inline=opts.convert_as_inline,
        sub_symbol=opts.sub_symbol,
        sup_symbol=opts.sup_symbol,
        newline_style=opts.newline_style,
        code_block_style=opts.code_block_style,
        # The native side receives lists; a missing or empty set becomes [].
        keep_inline_images_in=list(opts.keep_inline_images_in or ()),
        preprocessing=native_preprocessing,
        parsing=native_parsing,
        debug=opts.debug,
        strip_tags=list(opts.strip_tags or ()),
    )

    markdown: str = _rust.convert(html, native_options)
    return markdown
Binary file
@@ -0,0 +1,9 @@
1
+ """CLI wrapper that proxies to Rust CLI binary.
2
+
3
+ This module provides backwards compatibility for code that imports
4
+ from html_to_markdown.cli. The actual CLI implementation is in Rust.
5
+ """
6
+
7
+ from html_to_markdown.cli_proxy import main
8
+
9
+ __all__ = ["main"]
@@ -0,0 +1,144 @@
1
+ """CLI proxy that calls the Rust CLI binary.
2
+
3
+ This module provides a Python wrapper around the Rust CLI binary,
4
+ allowing the Python package to use the high-performance Rust implementation
5
+ for command-line operations. It also provides v1 -> v2 CLI argument translation.
6
+ """
7
+
8
+ import subprocess
9
+ import sys
10
+ from pathlib import Path
11
+
12
+ from html_to_markdown.exceptions import RedundantV1FlagError, RemovedV1FlagError
13
+
14
+
15
def find_cli_binary() -> Path:
    """Locate the html-to-markdown CLI executable relative to this module.

    Candidates are checked in order: ``<package parent>/target/release``, the
    package's ``bin`` directory, then the package directory itself. The
    executable name gets an ``.exe`` suffix on Windows.

    Returns:
        Path to the first candidate that is an existing regular file

    Raises:
        FileNotFoundError: If none of the candidates exists
    """
    package_dir = Path(__file__).parent
    executable = "html-to-markdown" + (".exe" if sys.platform == "win32" else "")

    candidates = (
        package_dir.parent / "target" / "release" / executable,
        package_dir / "bin" / executable,
        package_dir / executable,
    )

    # is_file() is already False for paths that do not exist.
    found = next((candidate for candidate in candidates if candidate.is_file()), None)
    if found is None:
        msg = "html-to-markdown CLI binary not found. Please install or build the package."
        raise FileNotFoundError(msg)
    return found
38
+
39
+
40
def translate_v1_args_to_v2(argv: list[str]) -> list[str]:
    """Rewrite a v1 CLI argument list into the form the v2 Rust CLI accepts.

    Each argument is handled independently:
    - ``--strip`` and ``--convert`` were removed in v2 and raise an error
    - the ``--no-*`` flags listed below are dropped
    - ``--preprocess-html`` is renamed to ``--preprocess``
    - everything else, including the positive boolean flags, passes through

    Args:
        argv: v1 CLI arguments

    Returns:
        Translated v2 CLI arguments

    Raises:
        RemovedV1FlagError: If a v1 flag has been removed in v2
    """
    removed_flags = ("--strip", "--convert")
    # Dropped because they are treated as matching the Rust CLI defaults.
    # NOTE(review): ConversionOptions defaults autolinks and extract_metadata
    # to True; confirm that dropping --no-autolinks / --no-extract-metadata
    # really is a no-op for the Rust CLI.
    dropped_flags = (
        "--no-escape-asterisks",
        "--no-escape-underscores",
        "--no-escape-misc",
        "--no-wrap",
        "--no-autolinks",
        "--no-extract-metadata",
    )

    translated = []
    for arg in argv:
        if arg in removed_flags:
            raise RemovedV1FlagError(
                flag=arg,
                reason=f"{arg} option has been removed in v2.",
                migration="Remove this flag from your command. The feature is no longer available.",
            )
        if arg in dropped_flags:
            continue
        translated.append("--preprocess" if arg == "--preprocess-html" else arg)

    return translated
105
+
106
+
107
def main(argv: list[str]) -> str:
    """Run the Rust CLI with the given arguments and return its stdout.

    The arguments are first translated from v1 to v2 form. The binary is then
    run with stdout and stderr captured and decoded as UTF-8.

    The process exits with status 1 if argument translation fails, and with
    the CLI's own return code (after echoing its stderr) if the CLI fails.

    Args:
        argv: Command line arguments (without program name)

    Returns:
        Output from the CLI

    Raises:
        FileNotFoundError: If the CLI binary cannot be located; this is raised
            by ``find_cli_binary`` and is not handled here
    """
    cli_binary = find_cli_binary()

    try:
        translated_args = translate_v1_args_to_v2(argv)
    except (RemovedV1FlagError, RedundantV1FlagError) as e:
        # Format the error nicely for CLI users
        sys.stderr.write(f"\n❌ Error: {e.flag}\n\n")
        sys.stderr.write(f" {e.reason}\n\n")
        sys.stderr.write(f" 💡 {e.migration}\n\n")
        sys.exit(1)
    except ValueError as e:
        sys.stderr.write(f"Error: {e}\n")
        sys.exit(1)

    result = subprocess.run(  # noqa: S603
        [str(cli_binary), *translated_args],
        capture_output=True,
        text=True,
        # Decode explicitly as UTF-8: text=True alone uses the locale encoding,
        # which mis-decodes non-ASCII Markdown on non-UTF-8 locales (e.g. Windows).
        # Assumes the Rust CLI writes UTF-8.
        encoding="utf-8",
        check=False,
    )

    if result.returncode != 0:
        sys.stderr.write(result.stderr)
        sys.exit(result.returncode)

    return result.stdout
@@ -0,0 +1,81 @@
1
+ """Exception classes for html-to-markdown."""
2
+
3
+ from __future__ import annotations
4
+
5
+
6
class HtmlToMarkdownError(Exception):
    """Base exception for html-to-markdown errors.

    The other exception classes in this module derive from it, so catching
    this type catches all of them.
    """
8
+
9
+
10
class MissingDependencyError(HtmlToMarkdownError):
    """Raised when a required dependency is not installed.

    The dependency name and the optional install command are kept as
    attributes; the install hint is appended to the message only when given.
    """

    def __init__(self, dependency: str, install_command: str | None = None) -> None:
        self.dependency = dependency
        self.install_command = install_command

        hint = f" Install with: {install_command}" if install_command else ""
        super().__init__(f"{dependency} is not installed.{hint}")
22
+
23
+
24
class InvalidParserError(HtmlToMarkdownError):
    """Raised when an invalid HTML parser is specified.

    Stores the rejected parser name and the list of valid choices; the message
    lists the choices separated by commas.
    """

    def __init__(self, parser: str, available_parsers: list[str]) -> None:
        self.parser = parser
        self.available_parsers = available_parsers

        choices = ", ".join(available_parsers)
        super().__init__(f"Invalid parser '{parser}'. Available parsers: {choices}")
33
+
34
+
35
class EmptyHtmlError(HtmlToMarkdownError):
    """Raised when the input HTML is empty; takes no arguments."""

    def __init__(self) -> None:
        message = "The input HTML is empty."
        super().__init__(message)
40
+
41
+
42
class ConflictingOptionsError(HtmlToMarkdownError):
    """Raised when conflicting options are specified.

    Both option names are kept as attributes and quoted in the message.
    """

    def __init__(self, option1: str, option2: str) -> None:
        self.option1, self.option2 = option1, option2

        message = f"Only one of '{option1}' and '{option2}' can be specified."
        super().__init__(message)
50
+
51
+
52
class InvalidEncodingError(HtmlToMarkdownError):
    """Raised when an invalid encoding is specified.

    Args:
        encoding: The encoding name that was rejected
    """

    def __init__(self, encoding: str) -> None:
        # Keep the offending value on the instance, as the sibling exceptions
        # do, so handlers can inspect it without parsing the message.
        self.encoding = encoding

        super().__init__(f"The specified encoding ({encoding}) is not valid.")
57
+
58
+
59
class UnsupportedV1FeatureError(HtmlToMarkdownError):
    """Raised when a v1 feature is not supported in v2.

    The three arguments are stored as attributes and combined into a message
    of three paragraphs separated by blank lines.

    Args:
        flag: The CLI flag or feature that is not supported
        reason: Why the feature is not supported
        migration: How to migrate away from this feature
    """

    def __init__(self, flag: str, reason: str, migration: str) -> None:
        self.flag = flag
        self.reason = reason
        self.migration = migration

        paragraphs = (
            f"'{flag}' is not supported in v2.",
            f"Reason: {reason}",
            f"Migration: {migration}",
        )
        super().__init__("\n\n".join(paragraphs))
74
+
75
+
76
class RemovedV1FlagError(UnsupportedV1FeatureError):
    """Raised when a CLI flag has been completely removed in v2.

    Adds no behavior of its own; the constructor and the ``flag``, ``reason``
    and ``migration`` attributes come from ``UnsupportedV1FeatureError``.
    """
78
+
79
+
80
class RedundantV1FlagError(UnsupportedV1FeatureError):
    """Raised when a v1 flag is redundant in v2 because it's the default behavior.

    Adds no behavior of its own; the constructor and attributes come from
    ``UnsupportedV1FeatureError``.
    """
@@ -0,0 +1,211 @@
1
+ """Configuration options for HTML to Markdown conversion.
2
+
3
+ This module provides dataclass-based configuration for the v2 API.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from dataclasses import dataclass
9
+ from typing import TYPE_CHECKING, Any, Literal, Protocol
10
+
11
+ if TYPE_CHECKING:
12
+ from collections.abc import Callable
13
+
14
+ from bs4 import Tag
15
+
16
+
17
class ConverterFunction(Protocol):
    """Protocol for custom converter functions.

    Converter functions receive keyword-only arguments including the HTML tag,
    processed text content, and any conversion options needed. Any callable
    with a compatible keyword-only signature satisfies this protocol
    structurally.

    Example:
        >>> def custom_link_converter(*, tag: Tag, text: str, autolinks: bool, **kwargs: Any) -> str:
        ...     href = tag.get("href", "")
        ...     return f"[{text}]({href})"
    """

    def __call__(self, *, tag: Tag, text: str, **kwargs: Any) -> str:
        """Convert an HTML element to Markdown.

        Args:
            tag: BeautifulSoup Tag object representing the HTML element
            text: Processed text content of the element's children
            **kwargs: Additional conversion options (varies by converter)

        Returns:
            Markdown string representation of the element
        """
        ...
41
+
42
+
43
@dataclass
class ConversionOptions:
    """Main conversion configuration.

    This class groups all conversion-related options together, replacing
    the large number of keyword arguments in the v1 API. Each field is
    documented by the string literal that follows it.

    Example:
        >>> options = ConversionOptions(
        ...     heading_style="atx",
        ...     list_indent_width=2,
        ...     escape_asterisks=True,
        ... )
        >>> from html_to_markdown import convert
        >>> markdown = convert("<h1>Title</h1>", options)
    """

    heading_style: Literal["underlined", "atx", "atx_closed"] = "atx"
    """Style for headings: 'atx' (#) is CommonMark default, 'underlined' (===), or 'atx_closed' (# #)."""

    list_indent_type: Literal["spaces", "tabs"] = "spaces"
    """Type of indentation for lists."""

    list_indent_width: int = 2
    """Number of spaces for list indentation (CommonMark uses 2 spaces, ignored if list_indent_type='tabs')."""

    bullets: str = "-*+"
    """Characters to use for unordered list bullets (cycles through -, *, + for nested levels). CommonMark compliant."""

    strong_em_symbol: Literal["*", "_"] = "*"
    """Symbol for strong/emphasis formatting."""

    escape_asterisks: bool = False
    """Escape asterisk characters in text to prevent accidental formatting. Default False for minimal escaping (CommonMark)."""

    escape_underscores: bool = False
    """Escape underscore characters in text to prevent accidental formatting. Default False for minimal escaping (CommonMark)."""

    escape_misc: bool = False
    """Escape miscellaneous Markdown characters. Default False for minimal escaping (CommonMark)."""

    escape_ascii: bool = False
    """Escape all ASCII punctuation (for CommonMark spec compliance tests). Disabled by default for minimal escaping."""

    code_language: str = ""
    """Default language for code blocks."""

    # NOTE: html_to_markdown.api.convert does not pass this field to the Rust
    # options object, and the v1 wrapper rejects any non-None value.
    code_language_callback: Callable[[Tag], str] | None = None
    """Callback to determine code language from element."""

    autolinks: bool = True
    """Convert bare URLs to automatic links."""

    default_title: bool = False
    """Add a default title if none exists."""

    # Converted to a list (empty when None/empty) before reaching the Rust side.
    keep_inline_images_in: set[str] | None = None
    """Parent tag names where images should remain inline."""

    br_in_tables: bool = False
    """Use <br> tags for line breaks in table cells instead of spaces."""

    hocr_extract_tables: bool = True
    """Enable table extraction from hOCR (HTML-based OCR) documents."""

    hocr_table_column_threshold: int = 50
    """Pixel threshold for detecting column boundaries in hOCR tables."""

    hocr_table_row_threshold_ratio: float = 0.5
    """Row height ratio threshold for detecting row boundaries in hOCR tables."""

    highlight_style: Literal["double-equal", "html", "bold"] = "double-equal"
    """Style for highlighting <mark> elements."""

    extract_metadata: bool = True
    """Extract metadata from HTML head and include as comment."""

    whitespace_mode: Literal["normalized", "strict"] = "normalized"
    """How to handle whitespace: 'normalized' or 'strict'."""

    strip_newlines: bool = False
    """Remove newlines from HTML before processing."""

    wrap: bool = False
    """Enable text wrapping."""

    wrap_width: int = 80
    """Column width for text wrapping."""

    # NOTE: html_to_markdown.api.convert does not pass this field to the Rust
    # options object.
    convert: set[str] | None = None
    """HTML tags to convert to Markdown (None = all supported tags). v1 compatibility only."""

    # Converted to a list (empty when None/empty) before reaching the Rust side.
    strip_tags: set[str] | None = None
    """HTML tags to strip from output (output only text content, no markdown conversion)."""

    convert_as_inline: bool = False
    """Treat block elements as inline during conversion."""

    sub_symbol: str = ""
    """Symbol for subscript text."""

    sup_symbol: str = ""
    """Symbol for superscript text."""

    newline_style: Literal["spaces", "backslash"] = "spaces"
    """Style for newlines: 'spaces' (two trailing spaces, CommonMark default) or 'backslash' (\\). Both are equally CommonMark compliant."""

    code_block_style: Literal["indented", "backticks", "tildes"] = "backticks"
    """Style for code blocks: 'backticks' (```, better whitespace preservation), 'indented' (4 spaces), or 'tildes' (~~~). All are CommonMark compliant."""

    # NOTE: html_to_markdown.api.convert does not pass this field to the Rust
    # options object, and the v1 wrapper rejects any non-None value.
    custom_converters: dict[str, Callable[..., str]] | None = None
    """Custom converter functions for specific HTML elements."""

    debug: bool = False
    """Enable debug mode with diagnostic warnings about unhandled elements and hOCR processing."""
158
+
159
+
160
@dataclass
class PreprocessingOptions:
    """HTML preprocessing configuration.

    Controls how HTML is cleaned and preprocessed before conversion.
    Preprocessing is off unless ``enabled`` is set.

    Example:
        >>> options = PreprocessingOptions(
        ...     enabled=True,
        ...     preset="aggressive",
        ...     remove_navigation=True,
        ... )
    """

    enabled: bool = False
    """Whether to enable HTML preprocessing (disabled by default for minimal transformation)."""

    preset: Literal["minimal", "standard", "aggressive"] = "standard"
    """Preprocessing aggressiveness level."""

    remove_navigation: bool = True
    """Remove navigation elements during preprocessing."""

    remove_forms: bool = True
    """Remove form elements during preprocessing."""

    # NOTE: html_to_markdown.api.convert does not pass this field to the Rust
    # preprocessing options.
    excluded_navigation_classes: set[str] | None = None
    """Navigation class fragments to keep even when removing navigation."""

    # NOTE: html_to_markdown.api.convert does not pass this field to the Rust
    # preprocessing options.
    extra_navigation_classes: set[str] | None = None
    """Additional navigation class fragments to strip beyond defaults."""
191
+
192
+
193
@dataclass
class ParsingOptions:
    """HTML parsing configuration.

    Example:
        >>> options = ParsingOptions(
        ...     encoding="utf-8",
        ...     detect_encoding=True,
        ... )
    """

    encoding: str = "utf-8"
    """Character encoding for decoding bytes input."""

    # NOTE: html_to_markdown.api.convert forwards only `encoding` and `parser`;
    # this flag is not passed to the Rust parsing options.
    detect_encoding: bool = False
    """Attempt to detect encoding from HTML (not yet implemented)."""

    parser: str | None = None
    """HTML parser to use: 'html.parser', 'lxml', or 'html5lib' (None = auto)."""
File without changes
@@ -0,0 +1,161 @@
1
+ """V1 API compatibility layer.
2
+
3
+ Provides backward compatibility for the v1 convert_to_markdown API
4
+ by translating v1 kwargs to v2 ConversionOptions/PreprocessingOptions/ParsingOptions.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from typing import TYPE_CHECKING
10
+
11
+ if TYPE_CHECKING:
12
+ from collections.abc import Iterator
13
+
14
+ from html_to_markdown import ConversionOptions, ParsingOptions, PreprocessingOptions
15
+ from html_to_markdown import convert as convert_v2
16
+
17
+
18
def convert_to_markdown(  # noqa: D417
    html: str,
    *,
    heading_style: str = "underlined",
    list_indent_type: str = "spaces",
    list_indent_width: int = 4,
    bullets: str = "*+-",
    strong_em_symbol: str = "*",
    escape_asterisks: bool = True,
    escape_underscores: bool = True,
    escape_misc: bool = True,
    code_language: str = "",
    autolinks: bool = True,
    default_title: bool = False,
    br_in_tables: bool = False,
    hocr_extract_tables: bool = True,
    hocr_table_column_threshold: int = 50,
    hocr_table_row_threshold_ratio: float = 0.5,
    highlight_style: str = "double-equal",
    extract_metadata: bool = True,
    whitespace_mode: str = "normalized",
    strip_newlines: bool = False,
    wrap: bool = False,
    wrap_width: int = 80,
    convert_as_inline: bool = False,
    sub_symbol: str = "",
    sup_symbol: str = "",
    newline_style: str = "spaces",
    keep_inline_images_in: set[str] | None = None,
    preprocess: bool = False,
    preprocessing_preset: str = "standard",
    remove_navigation: bool = True,
    remove_forms: bool = True,
    parser: str = "html.parser",
    source_encoding: str = "utf-8",
    code_language_callback: object | None = None,
    strip: list[str] | None = None,
    convert: list[str] | None = None,
    custom_converters: dict[str, object] | None = None,
) -> str:
    """Convert HTML to Markdown through the v1 keyword-argument API.

    The v1 keyword arguments (with their v1 defaults) are repackaged into
    ConversionOptions, PreprocessingOptions and ParsingOptions and passed to
    the v2 ``convert`` function.

    Note: Some v1 options are not supported in v2 and must be left as None:
        - code_language_callback: Removed in v2
        - convert: Removed in v2
        - custom_converters: Not yet implemented in v2

    Args:
        html: HTML string to convert

    Returns:
        Markdown string

    Raises:
        NotImplementedError: If unsupported v1 options are provided
    """
    # Reject unsupported v1 options first, in this fixed order.
    unsupported = (
        (
            code_language_callback,
            "code_language_callback was removed in v2. Use the code_language option to set a default language.",
        ),
        (convert, "convert option was removed in v2. All supported tags are converted by default."),
        (custom_converters, "custom_converters is not yet implemented in v2"),
    )
    for value, message in unsupported:
        if value is not None:
            raise NotImplementedError(message)

    # V1 behavior: fenced (backtick) code blocks only when code_language is
    # set, indented code blocks otherwise. ConversionOptions defaults to
    # "backticks", so the style is always passed explicitly.
    if code_language:
        code_block_style = "backticks"
    else:
        code_block_style = "indented"

    conversion = ConversionOptions(
        heading_style=heading_style,  # type: ignore[arg-type]
        list_indent_type=list_indent_type,  # type: ignore[arg-type]
        list_indent_width=list_indent_width,
        bullets=bullets,
        strong_em_symbol=strong_em_symbol,  # type: ignore[arg-type]
        escape_asterisks=escape_asterisks,
        escape_underscores=escape_underscores,
        escape_misc=escape_misc,
        code_block_style=code_block_style,  # type: ignore[arg-type]
        code_language=code_language,
        autolinks=autolinks,
        default_title=default_title,
        br_in_tables=br_in_tables,
        hocr_extract_tables=hocr_extract_tables,
        hocr_table_column_threshold=hocr_table_column_threshold,
        hocr_table_row_threshold_ratio=hocr_table_row_threshold_ratio,
        highlight_style=highlight_style,  # type: ignore[arg-type]
        extract_metadata=extract_metadata,
        whitespace_mode=whitespace_mode,  # type: ignore[arg-type]
        strip_newlines=strip_newlines,
        wrap=wrap,
        wrap_width=wrap_width,
        convert_as_inline=convert_as_inline,
        sub_symbol=sub_symbol,
        sup_symbol=sup_symbol,
        newline_style=newline_style,  # type: ignore[arg-type]
        keep_inline_images_in=keep_inline_images_in,
        # v1 "strip" maps onto v2 "strip_tags"; an empty list becomes None.
        strip_tags=set(strip) if strip else None,
    )
    preprocessing_options = PreprocessingOptions(
        enabled=preprocess,
        preset=preprocessing_preset,  # type: ignore[arg-type]
        remove_navigation=remove_navigation,
        remove_forms=remove_forms,
    )
    parsing_options = ParsingOptions(encoding=source_encoding, parser=parser)

    return convert_v2(html, conversion, preprocessing_options, parsing_options)
134
+
135
+
136
def convert_to_markdown_stream(  # noqa: D417
    html: str,
    *,
    chunk_size: int = 4096,
    **kwargs: object,
) -> Iterator[str]:
    """Placeholder for the removed v1 streaming API; always raises.

    The signature is kept so v1 callers fail with a clear message. No
    argument is inspected.

    Args:
        html: HTML string to convert
        chunk_size: Size of chunks to yield (not used in v2)

    Raises:
        NotImplementedError: Streaming was removed in v2
    """
    message = (
        "Streaming API (convert_to_markdown_stream) was removed in v2 (html5ever does not support streaming). "
        "Use convert_to_markdown() instead."
    )
    raise NotImplementedError(message)
157
+
158
+
159
+ markdownify = convert_to_markdown
160
+
161
+ __all__ = ["convert_to_markdown", "convert_to_markdown_stream", "markdownify"]
@@ -0,0 +1,422 @@
1
+ Metadata-Version: 2.4
2
+ Name: html-to-markdown
3
+ Version: 2.0.0
4
+ Classifier: Development Status :: 5 - Production/Stable
5
+ Classifier: Environment :: Console
6
+ Classifier: Intended Audience :: Developers
7
+ Classifier: License :: OSI Approved :: MIT License
8
+ Classifier: Operating System :: OS Independent
9
+ Classifier: Programming Language :: Python :: 3 :: Only
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Programming Language :: Python :: 3.13
14
+ Classifier: Programming Language :: Rust
15
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
16
+ Classifier: Topic :: Text Processing
17
+ Classifier: Topic :: Text Processing :: Markup
18
+ Classifier: Topic :: Text Processing :: Markup :: HTML
19
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
20
+ Classifier: Typing :: Typed
21
+ License-File: LICENSE
22
+ Summary: High-performance HTML to Markdown converter powered by Rust with a clean Python API
23
+ Keywords: cli-tool,converter,html,html2markdown,html5,markdown,markup,parser,rust,text-processing
24
+ Home-Page: https://github.com/Goldziher/html-to-markdown
25
+ Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
26
+ License: MIT
27
+ Requires-Python: >=3.10
28
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
29
+ Project-URL: Changelog, https://github.com/Goldziher/html-to-markdown/releases
30
+ Project-URL: Homepage, https://github.com/Goldziher/html-to-markdown
31
+ Project-URL: Issues, https://github.com/Goldziher/html-to-markdown/issues
32
+ Project-URL: Repository, https://github.com/Goldziher/html-to-markdown.git
33
+
34
+ # html-to-markdown
35
+
36
+ High-performance HTML to Markdown converter: a Rust crate and CLI, with Python bindings that also ship the CLI. Available via PyPI, Homebrew, and Cargo. Cross-platform support for Linux, macOS, and Windows.
37
+
38
+ [![PyPI version](https://badge.fury.io/py/html-to-markdown.svg)](https://pypi.org/project/html-to-markdown/)
39
+ [![Crates.io](https://img.shields.io/crates/v/html-to-markdown-rs.svg)](https://crates.io/crates/html-to-markdown-rs)
40
+ [![Python Versions](https://img.shields.io/pypi/pyversions/html-to-markdown.svg)](https://pypi.org/project/html-to-markdown/)
41
+ [![Documentation](https://img.shields.io/badge/docs-github-blue)](https://github.com/Goldziher/html-to-markdown)
42
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
43
+ [![Discord](https://img.shields.io/badge/Discord-Join%20our%20community-7289da)](https://discord.gg/pXxagNK2zN)
44
+
45
+ Part of the [Kreuzberg](https://kreuzberg.dev) ecosystem for document intelligence.
46
+
47
+ **📚 [Full V2 Documentation](crates/html-to-markdown/README.md)** - Comprehensive guide for Rust, Python, and CLI usage.
48
+
49
+ ## ⚡ Benchmarks
50
+
51
+ ### Throughput (Python API)
52
+
53
+ Real Wikipedia documents on Apple M1 Pro:
54
+
55
+ | Document | Size | Latency | Throughput | Docs/sec |
56
+ | ------------------- | ----- | ------- | ---------- | -------- |
57
+ | Lists (Timeline) | 129KB | 0.62ms | 208 MB/s | 1,613 |
58
+ | Tables (Countries) | 360KB | 2.02ms | 178 MB/s | 495 |
59
+ | Mixed (Python wiki) | 656KB | 4.56ms | 144 MB/s | 219 |
60
+
61
+ **Throughput scales linearly** from 144-208 MB/s across all document sizes.
62
+
63
+ ### Memory Usage
64
+
65
+ | Document Size | Memory Delta | Peak RSS | Leak Detection |
66
+ | ------------- | ------------ | -------- | -------------- |
67
+ | 10KB | < 2 MB | < 20 MB | ✅ None |
68
+ | 50KB | < 8 MB | < 35 MB | ✅ None |
69
+ | 500KB | < 40 MB | < 80 MB | ✅ None |
70
+
71
+ Memory usage is linear and stable across 50+ repeated conversions.
72
+
73
+ **V2 is 19-30x faster** than v1 Python/BeautifulSoup implementation.
74
+
75
+ 📊 **[Benchmark Results](BENCHMARK_RESULTS.md)** - Detailed Python API comparison
76
+ 📈 **[Performance Analysis](PERFORMANCE.md)** - Rust core benchmarks and profiling
77
+ 🔧 **[Benchmarking Guide](BENCHMARKS.md)** - How to run benchmarks
78
+ ✅ **[CommonMark Compliance](COMMONMARK_COMPLIANCE.md)** - CommonMark specification compliance
79
+
80
+ ## Features
81
+
82
+ - **🚀 Blazing Fast**: Pure Rust core with ultra-fast `tl` HTML parser
83
+ - **🐍 Python Bindings**: Clean Python API via PyO3 with full type hints
84
+ - **🦀 Native CLI**: Rust CLI binary with comprehensive options
85
+ - **📊 hOCR 1.2 Compliant**: Full support for all 40+ elements and 20+ properties
86
+ - **📝 CommonMark Compliant**: Follows CommonMark specification for list formatting
87
+ - **🎯 Type Safe**: Full type hints and `.pyi` stubs for excellent IDE support
88
+ - **🌍 Cross-Platform**: Wheels for Linux, macOS, Windows (x86_64 + ARM64)
89
+ - **✅ Well-Tested**: 900+ tests with dual Python + Rust coverage
90
+
91
+ ## Installation
92
+
93
+ > **📦 Package Names**: Due to a naming conflict on crates.io, the Rust crate is published as `html-to-markdown-rs`, while the Python package remains `html-to-markdown` on PyPI. The CLI binary name is `html-to-markdown` for both.
94
+
95
+ ### Python Package
96
+
97
+ ```bash
98
+ pip install html-to-markdown
99
+ ```
100
+
101
+ ### Rust Library
102
+
103
+ ```bash
104
+ cargo add html-to-markdown-rs
105
+ ```
106
+
107
+ ### CLI Binary
108
+
109
+ #### via Homebrew (macOS/Linux)
110
+
111
+ ```bash
112
+ brew tap goldziher/tap
113
+ brew install html-to-markdown
114
+ ```
115
+
116
+ #### via Cargo
117
+
118
+ ```bash
119
+ cargo install html-to-markdown-cli
120
+ ```
121
+
122
+ #### Direct Download
123
+
124
+ Download pre-built binaries from [GitHub Releases](https://github.com/Goldziher/html-to-markdown/releases).
125
+
126
+ ## Quick Start
127
+
128
+ ### Python API
129
+
130
+ Clean, type-safe configuration with dataclasses:
131
+
132
+ ```python
133
+ from html_to_markdown import convert, ConversionOptions
134
+
135
+ html = """
136
+ <h1>Welcome</h1>
137
+ <p>This is <strong>fast</strong> Rust-powered conversion!</p>
138
+ <ul>
139
+ <li>Blazing fast</li>
140
+ <li>Type safe</li>
141
+ <li>Easy to use</li>
142
+ </ul>
143
+ """
144
+
145
+ options = ConversionOptions(
146
+ heading_style="atx",
147
+ strong_em_symbol="*",
148
+ bullets="*+-",
149
+ )
150
+
151
+ markdown = convert(html, options)
152
+ print(markdown)
153
+ ```
154
+
155
+ Output:
156
+
157
+ ```markdown
158
+ # Welcome
159
+
160
+ This is **fast** Rust-powered conversion!
161
+
162
+ * Blazing fast
163
+ + Type safe
164
+ - Easy to use
165
+ ```
166
+
167
+ ### Rust API
168
+
169
+ ```rust
170
+ use html_to_markdown_rs::{convert, ConversionOptions, HeadingStyle};
171
+
172
+ fn main() {
173
+ let html = r#"
174
+ <h1>Welcome</h1>
175
+ <p>This is <strong>fast</strong> conversion!</p>
176
+ "#;
177
+
178
+ let options = ConversionOptions {
179
+ heading_style: HeadingStyle::Atx,
180
+ ..Default::default()
181
+ };
182
+
183
+ let markdown = convert(html, Some(options)).unwrap();
184
+ println!("{}", markdown);
185
+ }
186
+ ```
187
+
188
+ ### CLI Usage
189
+
190
+ ```bash
191
+ # Convert file
192
+ html-to-markdown input.html > output.md
193
+
194
+ # From stdin
195
+ cat input.html | html-to-markdown > output.md
196
+
197
+ # With options
198
+ html-to-markdown --heading-style atx --list-indent-width 2 input.html
199
+
200
+ # Clean web-scraped content
201
+ html-to-markdown \
202
+ --preprocess \
203
+ --preset aggressive \
204
+ --no-extract-metadata \
205
+ scraped.html > clean.md
206
+ ```
207
+
208
+ ## Configuration
209
+
210
+ ### Python: Dataclass Configuration
211
+
212
+ ```python
213
+ from html_to_markdown import (
214
+ convert,
215
+ ConversionOptions,
216
+ PreprocessingOptions,
217
+ )
218
+
219
+ # Conversion settings
220
+ options = ConversionOptions(
221
+ heading_style="atx", # "atx", "atx_closed", "underlined"
222
+ list_indent_width=2, # Discord/Slack: use 2
223
+ bullets="*+-", # Bullet characters
224
+ strong_em_symbol="*", # "*" or "_"
225
+ escape_asterisks=True, # Escape * in text
226
+ code_language="python", # Default code block language
227
+ extract_metadata=True, # Extract HTML metadata
228
+ highlight_style="double-equal", # "double-equal", "html", "bold"
229
+ )
230
+
231
+ # HTML preprocessing
232
+ preprocessing = PreprocessingOptions(
233
+ enabled=True,
234
+ preset="standard", # "minimal", "standard", "aggressive"
235
+ remove_navigation=True,
236
+ remove_forms=True,
237
+ )
238
+
239
+ markdown = convert(html, options, preprocessing)
240
+ ```
241
+
242
+ ### Python: Legacy API (v1 compatibility)
243
+
244
+ For backward compatibility with existing v1 code:
245
+
246
+ ```python
247
+ from html_to_markdown import convert_to_markdown
248
+
249
+ markdown = convert_to_markdown(
250
+ html,
251
+ heading_style="atx",
252
+ list_indent_width=2,
253
+ preprocess=True,
254
+ preprocessing_preset="standard",
255
+ )
256
+ ```
257
+
258
+ ## Common Use Cases
259
+
260
+ ### Discord/Slack Compatible Lists
261
+
262
+ ```python
263
+ from html_to_markdown import convert, ConversionOptions
264
+
265
+ options = ConversionOptions(list_indent_width=2)
266
+ markdown = convert(html, options)
267
+ ```
268
+
269
+ ### Clean Web-Scraped HTML
270
+
271
+ ```python
272
+ from html_to_markdown import convert, PreprocessingOptions
273
+
274
+ preprocessing = PreprocessingOptions(
275
+ enabled=True,
276
+ preset="aggressive", # Heavy cleaning
277
+ remove_navigation=True,
278
+ remove_forms=True,
279
+ )
280
+
281
+ markdown = convert(html, preprocessing=preprocessing)
282
+ ```
283
+
284
+ ### hOCR 1.2 Support
285
+
286
+ **Complete hOCR 1.2 specification compliance** with support for all elements, properties, and metadata:
287
+
288
+ ```python
289
+ from html_to_markdown import convert, ConversionOptions
290
+
291
+ # Option 1: Document structure extraction (NEW in v2)
292
+ # Extracts all hOCR elements and converts to structured markdown
293
+ # Supports: paragraphs, sections, chapters, headers/footers, images, math, etc.
294
+ markdown = convert(hocr_html)
295
+
296
+ # Option 2: Legacy table extraction (spatial reconstruction)
297
+ # Reconstructs tables from word bounding boxes
298
+ options = ConversionOptions(
299
+ hocr_extract_tables=True,
300
+ hocr_table_column_threshold=50,
301
+ hocr_table_row_threshold_ratio=0.5,
302
+ )
303
+ markdown = convert(hocr_html, options)
304
+ ```
305
+
306
+ **Full hOCR 1.2 Spec Coverage:**
307
+
308
+ - ✅ **All 40 Element Types** - Logical structure (12), typesetting (6), float (13), inline (6), engine-specific (3)
309
+ - ✅ **All 20+ Properties** - bbox, baseline, textangle, poly, x_wconf, x_confs, x_font, x_fsize, order, cflow, cuts, x_bboxes, image, ppageno, lpageno, scan_res, and more
310
+ - ✅ **All 5 Metadata Fields** - ocr-system, ocr-capabilities, ocr-number-of-pages, ocr-langs, ocr-scripts
311
+ - ✅ **37 Tests** - Complete coverage of all elements and properties
312
+
313
+ **Semantic Markdown Conversion:**
314
+
315
+ | Element Category | Examples | Markdown Output |
316
+ | ---------------- | ------------------------------- | ----------------------------------------- |
317
+ | Headings | `ocr_title`, `ocr_chapter` | `# Heading` |
318
+ | Sections | `ocr_section`, `ocr_subsection` | `##`, `###` |
319
+ | Structure | `ocr_par`, `ocr_blockquote` | Paragraphs, `> quotes` |
320
+ | Metadata | `ocr_abstract`, `ocr_author` | `**Abstract**`, `*Author*` |
321
+ | Floats | `ocr_header`, `ocr_footer` | `*Header*`, `*Footer*` |
322
+ | Images | `ocr_image`, `ocr_photo` | `![alt](path)` with image property |
323
+ | Math | `ocr_math`, `ocr_display` | `` `formula` ``, ```` ```equation``` ```` |
324
+ | Layout | `ocr_separator` | `---` horizontal rule |
325
+ | Inline | `ocrx_word`, `ocr_dropcap` | Text, `**Letter**` |
326
+
327
+ **HTML Entity Handling:** Automatically decodes `&quot;`, `&apos;`, `&lt;`, `&gt;`, `&amp;` in title attributes for proper property parsing.
328
+
329
+ ## Configuration Reference
330
+
331
+ **V2 Defaults (CommonMark-compliant):**
332
+
333
+ - `list_indent_width`: 2 (CommonMark standard)
334
+ - `bullets`: "\*+-" (cycles through `*`, `+`, `-` for nested levels)
335
+ - `escape_asterisks`: false (minimal escaping)
336
+ - `escape_underscores`: false (minimal escaping)
337
+ - `escape_misc`: false (minimal escaping)
338
+ - `newline_style`: "spaces" (CommonMark: two trailing spaces)
339
+ - `code_block_style`: "backticks" (fenced code blocks with \`\`\`, better whitespace preservation)
340
+ - `heading_style`: "atx" (CommonMark: `#`)
341
+ - `preprocessing.enabled`: false (no preprocessing by default)
342
+
343
+ For complete configuration reference, see **[Full Documentation](crates/html-to-markdown/README.md#configuration-reference)**.
344
+
345
+ ## Upgrading from v1.x
346
+
347
+ ### Backward Compatibility
348
+
349
+ Most existing v1 code works without changes (see "Removed Features" below for the exceptions):
350
+
351
+ ```python
352
+ from html_to_markdown import convert_to_markdown
353
+
354
+ markdown = convert_to_markdown(html, heading_style="atx") # Still works!
355
+ ```
356
+
357
+ ### Modern API (Recommended)
358
+
359
+ For new projects, use the dataclass-based API:
360
+
361
+ ```python
362
+ from html_to_markdown import convert, ConversionOptions
363
+
364
+ options = ConversionOptions(heading_style="atx", list_indent_width=2)
365
+ markdown = convert(html, options)
366
+ ```
367
+
368
+ ### What Changed in v2
369
+
370
+ **Core Rewrite:**
371
+
372
+ - Complete Rust rewrite using `tl` HTML parser
373
+ - 19-30x performance improvement over v1
374
+ - CommonMark-compliant defaults (2-space indents, minimal escaping, ATX headings)
375
+ - No BeautifulSoup or lxml dependencies
376
+
377
+ **Removed Features:**
378
+
379
+ - `code_language_callback` - use `code_language` for default language
380
+ - `strip` / `convert` options - use `strip_tags` or preprocessing
381
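+ - `convert_to_markdown_stream()` - not supported in v2; the function is still importable but raises `NotImplementedError`, so use `convert_to_markdown()` instead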
382
+
383
+ **Planned:**
384
+
385
+ - `custom_converters` - planned for future release
386
+
387
+ See **[CHANGELOG.md](CHANGELOG.md)** for complete v1 vs v2 comparison and migration guide.
388
+
389
+ ## Kreuzberg Ecosystem
390
+
391
+ html-to-markdown is part of the [Kreuzberg](https://kreuzberg.dev) ecosystem, a comprehensive framework for document intelligence and processing. While html-to-markdown focuses on converting HTML to Markdown with maximum performance, Kreuzberg provides a complete solution for:
392
+
393
+ - **Document Extraction**: Extract text, images, and metadata from 50+ document formats
394
+ - **OCR Processing**: Multiple OCR backends (Tesseract, EasyOCR, PaddleOCR)
395
+ - **Table Extraction**: Vision-based and OCR-based table detection
396
+ - **Document Classification**: Automatic detection of contracts, forms, invoices, etc.
397
+ - **RAG Pipelines**: Integration with retrieval-augmented generation workflows
398
+
399
+ Learn more at [kreuzberg.dev](https://kreuzberg.dev) or join our [Discord community](https://discord.gg/pXxagNK2zN).
400
+
401
+ ## Contributing
402
+
403
+ See [CONTRIBUTING.md](CONTRIBUTING.md) for development setup, testing, and contribution guidelines.
404
+
405
+ ## License
406
+
407
+ MIT License - see [LICENSE](LICENSE) for details.
408
+
409
+ ## Acknowledgments
410
+
411
+ Version 1 started as a fork of [markdownify](https://pypi.org/project/markdownify/) that was rewritten, extended, and enhanced with better typing and features. Version 2 is a complete Rust rewrite for high performance.
412
+
413
+ ## Support
414
+
415
+ If you find this library useful, consider sponsoring its development:
416
+
417
+ <a href="https://github.com/sponsors/Goldziher">
418
+ <img src="https://img.shields.io/badge/Sponsor-%E2%9D%A4-pink?logo=github-sponsors" alt="Sponsor" height="32">
419
+ </a>
420
+
421
+ Your support helps maintain and improve this library!
422
+
@@ -0,0 +1,17 @@
1
+ html_to_markdown-2.0.0.data/scripts/html-to-markdown,sha256=brVDlJvTJykaMtqOu_ls037UVWdX2UgNIXKcQhmnPTE,3734448
2
+ html_to_markdown-2.0.0.dist-info/RECORD,,
3
+ html_to_markdown-2.0.0.dist-info/WHEEL,sha256=HtAbUhtjhH1WdiDuIy2CapdoAiKCwe6bij_Tlxr1lEg,131
4
+ html_to_markdown-2.0.0.dist-info/METADATA,sha256=UbI9PqMoGe0EC3j_1Y6NeTgCU7b3fJ9jW_MOkDAM7yM,14285
5
+ html_to_markdown-2.0.0.dist-info/licenses/LICENSE,sha256=oQvPC-0UWvfg0WaeUBe11OJMtX60An-TW1ev_oaAA0k,1086
6
+ html_to_markdown/options.py,sha256=LXOUDqWwuvC-ryE118LttnATDO6-rlogYbbEGVfynhM,7241
7
+ html_to_markdown/_html_to_markdown.abi3.so,sha256=Ff4tPbv2sLfflQxy1P07Yl9put-HpiHKKbNGRujaBug,2989792
8
+ html_to_markdown/__init__.py,sha256=0r7a2ruI_9xqj0Ko-5O4yCGrQ4Nga89qSUY4lTSyiDE,1266
9
+ html_to_markdown/api.py,sha256=0KgVWCDX-pWxrADxxxnqzk5_IhYc4fDxRytgeHttCKQ,3620
10
+ html_to_markdown/_rust.pyi,sha256=6GZ5fXfQ7VqglKB-kSZ395cysOdLIdQidDq6yoAHICA,2141
11
+ html_to_markdown/v1_compat.py,sha256=ThGk8g5rsZ_2gO1pA4_VThiLKuNhu4injClyv2pQmg4,5521
12
+ html_to_markdown/cli.py,sha256=OW6GZAR7adSOfqSaRGx5YqNU3xChAkwG98WHcRhL5ss,254
13
+ html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
+ html_to_markdown/exceptions.py,sha256=0Yrzndw1kSqN-HMnE34TjZzo21iihiD1TZG1k2dmpdI,2626
15
+ html_to_markdown/cli_proxy.py,sha256=nuBMky_q_ArDUKGgWW6Vrxf2JwOa_RgmUPH8qYBIcRQ,4298
16
+ html_to_markdown/__main__.py,sha256=3Ic_EbOt2h6W88q084pkz5IKU6iY5z_woBygH6u9aw0,327
17
+ html_to_markdown/bin/html-to-markdown,sha256=brVDlJvTJykaMtqOu_ls037UVWdX2UgNIXKcQhmnPTE,3734448
@@ -0,0 +1,6 @@
1
+ Wheel-Version: 1.0
2
+ Generator: maturin (1.9.6)
3
+ Root-Is-Purelib: false
4
+ Tag: cp310-abi3-macosx_11_0_arm64
5
+ Generator: delocate 0.13.0
6
+
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright 2024-2025 Na'aman Hirschfeld
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.