html-to-markdown 2.7.0__cp310-abi3-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- html_to_markdown/__init__.py +58 -0
- html_to_markdown/__main__.py +16 -0
- html_to_markdown/_html_to_markdown.pyd +0 -0
- html_to_markdown/_html_to_markdown.pyi +22 -0
- html_to_markdown/api.py +151 -0
- html_to_markdown/bin/html-to-markdown.exe +0 -0
- html_to_markdown/cli.py +3 -0
- html_to_markdown/cli_proxy.py +142 -0
- html_to_markdown/exceptions.py +73 -0
- html_to_markdown/options.py +144 -0
- html_to_markdown/py.typed +0 -0
- html_to_markdown/v1_compat.py +192 -0
- html_to_markdown-2.7.0.data/scripts/html-to-markdown.exe +0 -0
- html_to_markdown-2.7.0.dist-info/METADATA +254 -0
- html_to_markdown-2.7.0.dist-info/RECORD +17 -0
- html_to_markdown-2.7.0.dist-info/WHEEL +4 -0
- html_to_markdown-2.7.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""html-to-markdown: Convert HTML to Markdown using Rust backend.
|
|
2
|
+
|
|
3
|
+
This package provides high-performance HTML to Markdown conversion
|
|
4
|
+
powered by Rust with a clean Python API.
|
|
5
|
+
|
|
6
|
+
V2 API (current):
|
|
7
|
+
from html_to_markdown import convert, ConversionOptions
|
|
8
|
+
|
|
9
|
+
options = ConversionOptions(heading_style="atx")
|
|
10
|
+
markdown = convert(html, options)
|
|
11
|
+
|
|
12
|
+
V1 API (backward compatibility):
|
|
13
|
+
from html_to_markdown import convert_to_markdown
|
|
14
|
+
|
|
15
|
+
markdown = convert_to_markdown(html, heading_style="atx")
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from html_to_markdown.api import (
|
|
19
|
+
InlineImage,
|
|
20
|
+
InlineImageConfig,
|
|
21
|
+
InlineImageWarning,
|
|
22
|
+
OptionsHandle,
|
|
23
|
+
convert,
|
|
24
|
+
convert_with_handle,
|
|
25
|
+
convert_with_inline_images,
|
|
26
|
+
create_options_handle,
|
|
27
|
+
)
|
|
28
|
+
from html_to_markdown.exceptions import (
|
|
29
|
+
ConflictingOptionsError,
|
|
30
|
+
EmptyHtmlError,
|
|
31
|
+
HtmlToMarkdownError,
|
|
32
|
+
InvalidParserError,
|
|
33
|
+
MissingDependencyError,
|
|
34
|
+
)
|
|
35
|
+
from html_to_markdown.options import ConversionOptions, PreprocessingOptions
|
|
36
|
+
from html_to_markdown.v1_compat import convert_to_markdown, markdownify
|
|
37
|
+
|
|
38
|
+
__all__ = [
|
|
39
|
+
"ConflictingOptionsError",
|
|
40
|
+
"ConversionOptions",
|
|
41
|
+
"EmptyHtmlError",
|
|
42
|
+
"HtmlToMarkdownError",
|
|
43
|
+
"InlineImage",
|
|
44
|
+
"InlineImageConfig",
|
|
45
|
+
"InlineImageWarning",
|
|
46
|
+
"InvalidParserError",
|
|
47
|
+
"MissingDependencyError",
|
|
48
|
+
"OptionsHandle",
|
|
49
|
+
"PreprocessingOptions",
|
|
50
|
+
"convert",
|
|
51
|
+
"convert_to_markdown",
|
|
52
|
+
"convert_with_handle",
|
|
53
|
+
"convert_with_inline_images",
|
|
54
|
+
"create_options_handle",
|
|
55
|
+
"markdownify",
|
|
56
|
+
]
|
|
57
|
+
|
|
58
|
+
__version__ = "2.7.0"
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
|
|
3
|
+
from html_to_markdown.cli_proxy import main
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def cli() -> None:
    """Run the CLI proxy with the process arguments and print what it returns.

    The string returned by ``main`` is printed to stdout without an extra
    trailing newline. If a ``ValueError`` or ``FileNotFoundError`` is raised,
    its message is printed to stderr and the process exits with status 1.
    """
    cli_args = sys.argv[1:]
    try:
        print(main(cli_args), end="")  # noqa: T201
    except (ValueError, FileNotFoundError) as exc:
        print(exc, file=sys.stderr)  # noqa: T201
        sys.exit(1)


if __name__ == "__main__":
    cli()
|
|
Binary file
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
# Type stubs for the compiled extension module ``html_to_markdown._html_to_markdown``.
# Constructor signatures of the option classes are left untyped (``*args``/``**kwargs``).

class PreprocessingOptions:
    def __init__(self, *args: Any, **kwargs: Any) -> None: ...

class ConversionOptions:
    def __init__(self, *args: Any, **kwargs: Any) -> None: ...

class InlineImageConfig:
    def __init__(self, *args: Any, **kwargs: Any) -> None: ...

class ConversionOptionsHandle:
    # Built from an optional ConversionOptions; accepted by convert_with_options_handle().
    def __init__(self, options: ConversionOptions | None = None) -> None: ...

# Convert HTML to a Markdown string; ``options`` may be omitted.
def convert(html: str, options: ConversionOptions | None = None) -> str: ...

# Returns a 3-tuple: the Markdown string followed by two lists (element types unspecified here).
def convert_with_inline_images(
    html: str,
    options: ConversionOptions | None = None,
    image_config: InlineImageConfig | None = None,
) -> tuple[str, list[Any], list[Any]]: ...

# Create a handle from optional ConversionOptions.
def create_options_handle(options: ConversionOptions | None = None) -> ConversionOptionsHandle: ...

# Convert HTML using a previously created handle.
def convert_with_options_handle(html: str, handle: ConversionOptionsHandle) -> str: ...
|
html_to_markdown/api.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
"""High-level Python API backed by the Rust core."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Literal, TypedDict, cast
|
|
6
|
+
|
|
7
|
+
import html_to_markdown._html_to_markdown as _rust
|
|
8
|
+
from html_to_markdown._html_to_markdown import (
|
|
9
|
+
ConversionOptionsHandle as OptionsHandle,
|
|
10
|
+
)
|
|
11
|
+
from html_to_markdown._html_to_markdown import (
|
|
12
|
+
InlineImageConfig,
|
|
13
|
+
)
|
|
14
|
+
from html_to_markdown.options import ConversionOptions, PreprocessingOptions
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class InlineImage(TypedDict):
    """Inline image extracted during conversion.

    Describes the dictionaries returned in the second element of the
    ``convert_with_inline_images`` result tuple.
    """

    # Image payload as raw bytes.
    data: bytes
    # NOTE(review): presumably an image format identifier - confirm against the Rust core.
    format: str
    # Optional file name; may be None.
    filename: str | None
    # Optional description; may be None.
    description: str | None
    # NOTE(review): presumably (width, height) - confirm; may be None.
    dimensions: tuple[int, int] | None
    # Where the image came from: one of the two literal tags below.
    source: Literal["img_data_uri", "svg_element"]
    # String-to-string attribute mapping (presumably the source element's attributes - confirm).
    attributes: dict[str, str]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class InlineImageWarning(TypedDict):
    """Warning produced during inline image extraction.

    Describes the dictionaries returned in the third element of the
    ``convert_with_inline_images`` result tuple.
    """

    # NOTE(review): presumably the position of the affected image - confirm against the Rust core.
    index: int
    # Human-readable warning text.
    message: str
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _to_rust_preprocessing(options: PreprocessingOptions) -> _rust.PreprocessingOptions:
    """Build the Rust-side preprocessing options from the Python dataclass.

    Args:
        options: Python preprocessing configuration.

    Returns:
        A ``_rust.PreprocessingOptions`` carrying the same four field values.
    """
    field_values = {
        "enabled": options.enabled,
        "preset": options.preset,
        "remove_navigation": options.remove_navigation,
        "remove_forms": options.remove_forms,
    }
    return _rust.PreprocessingOptions(**field_values)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _to_rust_options(
    options: ConversionOptions,
    preprocessing: PreprocessingOptions,
) -> _rust.ConversionOptions:
    """Build the Rust-side conversion options from the Python dataclasses.

    Args:
        options: Python conversion configuration.
        preprocessing: Python preprocessing configuration; converted with
            ``_to_rust_preprocessing`` and nested into the result.

    Returns:
        A ``_rust.ConversionOptions`` populated field by field from ``options``.
    """
    # The optional tag collections are handed over as lists; None or an empty
    # collection becomes an empty list.
    inline_image_parents = list(options.keep_inline_images_in) if options.keep_inline_images_in else []
    stripped_tags = list(options.strip_tags) if options.strip_tags else []
    preserved_tags = list(options.preserve_tags) if options.preserve_tags else []
    rust_preprocessing = _to_rust_preprocessing(preprocessing)

    return _rust.ConversionOptions(
        heading_style=options.heading_style,
        list_indent_type=options.list_indent_type,
        list_indent_width=options.list_indent_width,
        bullets=options.bullets,
        strong_em_symbol=options.strong_em_symbol,
        escape_asterisks=options.escape_asterisks,
        escape_underscores=options.escape_underscores,
        escape_misc=options.escape_misc,
        escape_ascii=options.escape_ascii,
        code_language=options.code_language,
        autolinks=options.autolinks,
        default_title=options.default_title,
        br_in_tables=options.br_in_tables,
        hocr_spatial_tables=options.hocr_spatial_tables,
        highlight_style=options.highlight_style,
        extract_metadata=options.extract_metadata,
        whitespace_mode=options.whitespace_mode,
        strip_newlines=options.strip_newlines,
        wrap=options.wrap,
        wrap_width=options.wrap_width,
        convert_as_inline=options.convert_as_inline,
        sub_symbol=options.sub_symbol,
        sup_symbol=options.sup_symbol,
        newline_style=options.newline_style,
        code_block_style=options.code_block_style,
        keep_inline_images_in=inline_image_parents,
        preprocessing=rust_preprocessing,
        encoding=options.encoding,
        debug=options.debug,
        strip_tags=stripped_tags,
        preserve_tags=preserved_tags,
    )
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def convert(
    html: str,
    options: ConversionOptions | None = None,
    preprocessing: PreprocessingOptions | None = None,
) -> str:
    """Convert HTML to Markdown using the Rust backend.

    Args:
        html: HTML string to convert.
        options: Conversion configuration; a default ``ConversionOptions()``
            is used when only ``preprocessing`` is supplied.
        preprocessing: Preprocessing configuration; a default
            ``PreprocessingOptions()`` is used when only ``options`` is supplied.

    Returns:
        The Markdown string produced by the Rust core.
    """
    # Nothing was customised: pass None so the Rust side picks the options itself.
    if options is None and preprocessing is None:
        return _rust.convert(html, None)

    effective_options = ConversionOptions() if options is None else options
    effective_preprocessing = PreprocessingOptions() if preprocessing is None else preprocessing
    return _rust.convert(html, _to_rust_options(effective_options, effective_preprocessing))
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def convert_with_inline_images(
    html: str,
    options: ConversionOptions | None = None,
    preprocessing: PreprocessingOptions | None = None,
    image_config: InlineImageConfig | None = None,
) -> tuple[str, list[InlineImage], list[InlineImageWarning]]:
    """Convert HTML and extract inline images.

    Args:
        html: HTML string to convert.
        options: Conversion configuration; defaults to ``ConversionOptions()``.
        preprocessing: Preprocessing configuration; defaults to ``PreprocessingOptions()``.
        image_config: Inline image configuration; defaults to ``InlineImageConfig()``.

    Returns:
        A tuple of the Markdown string, the extracted images and the
        extraction warnings (the latter two copied into fresh lists).
    """
    effective_options = ConversionOptions() if options is None else options
    effective_preprocessing = PreprocessingOptions() if preprocessing is None else preprocessing
    effective_image_config = InlineImageConfig() if image_config is None else image_config

    raw_result = _rust.convert_with_inline_images(
        html,
        _to_rust_options(effective_options, effective_preprocessing),
        effective_image_config,
    )
    # The extension returns untyped values; the cast only informs the type checker.
    markdown, images, warnings = cast(
        "tuple[str, list[InlineImage], list[InlineImageWarning]]",
        raw_result,
    )
    return markdown, list(images), list(warnings)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def create_options_handle(
    options: ConversionOptions | None = None,
    preprocessing: PreprocessingOptions | None = None,
) -> OptionsHandle:
    """Create a reusable ConversionOptions handle backed by Rust.

    Args:
        options: Conversion configuration; defaults to ``ConversionOptions()``.
        preprocessing: Preprocessing configuration; defaults to ``PreprocessingOptions()``.

    Returns:
        The handle returned by the Rust core, suitable for ``convert_with_handle``.
    """
    effective_options = ConversionOptions() if options is None else options
    effective_preprocessing = PreprocessingOptions() if preprocessing is None else preprocessing
    return _rust.create_options_handle(_to_rust_options(effective_options, effective_preprocessing))
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def convert_with_handle(html: str, handle: OptionsHandle) -> str:
    """Convert HTML using a pre-parsed ConversionOptions handle.

    Args:
        html: HTML string to convert.
        handle: Handle obtained from ``create_options_handle``.

    Returns:
        The Markdown string produced by the Rust core.
    """
    markdown = _rust.convert_with_options_handle(html, handle)
    return markdown
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
__all__ = [
|
|
143
|
+
"InlineImage",
|
|
144
|
+
"InlineImageConfig",
|
|
145
|
+
"InlineImageWarning",
|
|
146
|
+
"OptionsHandle",
|
|
147
|
+
"convert",
|
|
148
|
+
"convert_with_handle",
|
|
149
|
+
"convert_with_inline_images",
|
|
150
|
+
"create_options_handle",
|
|
151
|
+
]
|
|
Binary file
|
html_to_markdown/cli_proxy.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
import sys
|
|
3
|
+
import warnings
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from html_to_markdown.exceptions import RedundantV1FlagError, RemovedV1FlagError
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def find_cli_binary() -> Path:
    """Find the html-to-markdown CLI binary in expected locations.

    Candidates are checked in this order and the first regular file wins:

    1. ``<ancestor>/target/release/<binary>`` for every ancestor directory of
       this module's directory, nearest ancestor first.
    2. ``<module dir>/bin/<binary>``.
    3. ``<module dir>/<binary>``.

    The binary name is ``html-to-markdown.exe`` on Windows and
    ``html-to-markdown`` elsewhere.

    Returns:
        Path to the CLI binary.

    Raises:
        FileNotFoundError: If the binary cannot be found.
    """
    binary_name = "html-to-markdown.exe" if sys.platform == "win32" else "html-to-markdown"
    module_dir = Path(__file__).resolve().parent

    possible_locations = [
        *(ancestor / "target" / "release" / binary_name for ancestor in module_dir.parents),
        module_dir / "bin" / binary_name,
        module_dir / binary_name,
    ]

    for location in possible_locations:
        # is_file() is already False for a missing path, so no separate exists() check is needed.
        if location.is_file():
            return location

    msg = "html-to-markdown CLI binary not found. Please install or build the package."
    raise FileNotFoundError(msg)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def translate_v1_args_to_v2(argv: list[str]) -> list[str]:
    """Translate v1 CLI arguments to v2 format.

    Each argument is handled independently:

    * ``--strip`` / ``--convert`` abort translation with ``RemovedV1FlagError``.
    * The ``--no-*`` flags listed below are dropped with a ``DeprecationWarning``.
    * ``--preprocess-html`` is replaced by ``--preprocess`` with a ``DeprecationWarning``.
    * Everything else is passed through unchanged.

    Args:
        argv: List of command-line arguments.

    Returns:
        Translated list of arguments compatible with v2.

    Raises:
        RemovedV1FlagError: If a v1 flag has been removed in v2.
    """
    removed_flags = ("--strip", "--convert")
    redundant_flags = (
        "--no-escape-asterisks",
        "--no-escape-underscores",
        "--no-escape-misc",
        "--no-wrap",
        "--no-autolinks",
        "--no-extract-metadata",
    )

    translated: list[str] = []
    for arg in argv:
        if arg in removed_flags:
            raise RemovedV1FlagError(
                flag=arg,
                reason=f"{arg} option has been removed in v2.",
                migration="Remove this flag from your command. The feature is no longer available.",
            )

        if arg in redundant_flags:
            # Redundant flag: warn and drop it from the translated command line.
            warnings.warn(
                f"'{arg}' is deprecated and redundant in v2. "
                f"These options are now disabled by default. Remove this flag.",
                DeprecationWarning,
                stacklevel=2,
            )
        elif arg == "--preprocess-html":
            warnings.warn(
                "'--preprocess-html' is deprecated. Use '--preprocess' instead.",
                DeprecationWarning,
                stacklevel=2,
            )
            translated.append("--preprocess")
        else:
            translated.append(arg)

    return translated
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def main(argv: list[str]) -> str:
    """Execute the CLI proxy.

    Translates v1 arguments to v2 and invokes the native Rust CLI binary.

    Args:
        argv: Command-line arguments.

    Returns:
        Stdout from the CLI binary.

    Raises:
        FileNotFoundError: If the CLI binary cannot be located.
        SystemExit: With status 1 if argument translation fails, or with the
            binary's own exit status if it returns non-zero (its stderr is
            forwarded first).
    """
    cli_binary = find_cli_binary()

    try:
        translated_args = translate_v1_args_to_v2(argv)
    except (RemovedV1FlagError, RedundantV1FlagError) as e:
        sys.stderr.write(f"\n❌ Error: {e.flag}\n\n")
        sys.stderr.write(f" {e.reason}\n\n")
        sys.stderr.write(f" 💡 {e.migration}\n\n")
        sys.exit(1)
    except ValueError as e:
        sys.stderr.write(f"Error: {e}\n")
        sys.exit(1)

    result = subprocess.run(  # noqa: S603
        [str(cli_binary), *translated_args],
        capture_output=True,
        text=True,
        # Decode the captured pipes as UTF-8 explicitly; without this, text mode
        # uses the locale encoding (e.g. a legacy code page on Windows), which
        # corrupts or fails on non-ASCII Markdown.
        encoding="utf-8",
        check=False,
    )

    if result.returncode != 0:
        sys.stderr.write(result.stderr)
        sys.exit(result.returncode)

    return result.stdout
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class HtmlToMarkdownError(Exception):
    """Base exception for all html-to-markdown errors."""


class MissingDependencyError(HtmlToMarkdownError):
    """Raised when a required dependency is not installed.

    Attributes:
        dependency: Name of the missing dependency.
        install_command: Optional command that installs it.
    """

    def __init__(self, dependency: str, install_command: str | None = None) -> None:
        self.dependency = dependency
        self.install_command = install_command

        message = f"{dependency} is not installed."
        if install_command:
            message += f" Install with: {install_command}"

        super().__init__(message)

    def __reduce__(self) -> tuple[object, ...]:
        # Rebuild from the constructor arguments: the default exception pickling
        # would call __init__ with the formatted message instead.
        return (type(self), (self.dependency, self.install_command))


class InvalidParserError(HtmlToMarkdownError):
    """Raised when an invalid parser is specified.

    Attributes:
        parser: The rejected parser name.
        available_parsers: Parser names that would have been accepted.
    """

    def __init__(self, parser: str, available_parsers: list[str]) -> None:
        self.parser = parser
        self.available_parsers = available_parsers

        message = f"Invalid parser '{parser}'. Available parsers: {', '.join(available_parsers)}"
        super().__init__(message)

    def __reduce__(self) -> tuple[object, ...]:
        # See MissingDependencyError.__reduce__: keeps pickle/copy round-trips working.
        return (type(self), (self.parser, self.available_parsers))


class EmptyHtmlError(HtmlToMarkdownError):
    """Raised when input HTML is empty."""

    def __init__(self) -> None:
        super().__init__("The input HTML is empty.")

    def __reduce__(self) -> tuple[object, ...]:
        # __init__ takes no arguments, so the stored message must not be replayed into it.
        return (type(self), ())


class ConflictingOptionsError(HtmlToMarkdownError):
    """Raised when conflicting configuration options are specified.

    Attributes:
        option1: First of the two mutually exclusive options.
        option2: Second of the two mutually exclusive options.
    """

    def __init__(self, option1: str, option2: str) -> None:
        self.option1 = option1
        self.option2 = option2

        super().__init__(f"Only one of '{option1}' and '{option2}' can be specified.")

    def __reduce__(self) -> tuple[object, ...]:
        return (type(self), (self.option1, self.option2))


class InvalidEncodingError(HtmlToMarkdownError):
    """Raised when an invalid character encoding is specified.

    Attributes:
        encoding: The rejected encoding name.
    """

    def __init__(self, encoding: str) -> None:
        # Stored for programmatic access, like the other exceptions in this module.
        self.encoding = encoding
        super().__init__(f"The specified encoding ({encoding}) is not valid.")

    def __reduce__(self) -> tuple[object, ...]:
        return (type(self), (self.encoding,))


class UnsupportedV1FeatureError(HtmlToMarkdownError):
    """Raised when a v1 feature is not supported in v2.

    Attributes:
        flag: The unsupported v1 flag or feature name.
        reason: Why it is not supported.
        migration: What to do instead.
    """

    def __init__(self, flag: str, reason: str, migration: str) -> None:
        self.flag = flag
        self.reason = reason
        self.migration = migration
        message = f"'{flag}' is not supported in v2.\n\nReason: {reason}\n\nMigration: {migration}"
        super().__init__(message)

    def __reduce__(self) -> tuple[object, ...]:
        # type(self) keeps subclasses (RemovedV1FlagError, RedundantV1FlagError) intact.
        return (type(self), (self.flag, self.reason, self.migration))


class RemovedV1FlagError(UnsupportedV1FeatureError):
    """Raised when a v1 flag has been removed in v2."""


class RedundantV1FlagError(UnsupportedV1FeatureError):
    """Raised when a v1 flag is redundant in v2."""
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
"""Configuration options for HTML to Markdown conversion.
|
|
2
|
+
|
|
3
|
+
This module provides dataclass-based configuration for the v2 API.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from typing import Literal
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class ConversionOptions:
    """Main conversion configuration.

    This class groups all conversion-related options together, replacing
    the large number of keyword arguments in the v1 API.

    Every field has a default, so ``ConversionOptions()`` is a valid
    configuration on its own.

    Example:
        >>> options = ConversionOptions(
        ...     heading_style="atx",
        ...     list_indent_width=2,
        ...     escape_asterisks=True,
        ... )
        >>> from html_to_markdown import convert
        >>> markdown = convert("<h1>Title</h1>", options)
    """

    heading_style: Literal["underlined", "atx", "atx_closed"] = "atx"
    """Style for headings: 'atx' (#) is CommonMark default, 'underlined' (===), or 'atx_closed' (# #)."""

    list_indent_type: Literal["spaces", "tabs"] = "spaces"
    """Type of indentation for lists."""

    list_indent_width: int = 2
    """Number of spaces for list indentation (CommonMark uses 2 spaces, ignored if list_indent_type='tabs')."""

    bullets: str = "-*+"
    """Characters to use for unordered list bullets (cycles through -, *, + for nested levels). CommonMark compliant."""

    strong_em_symbol: Literal["*", "_"] = "*"
    """Symbol for strong/emphasis formatting."""

    escape_asterisks: bool = False
    """Escape asterisk characters in text to prevent accidental formatting. Default False for minimal escaping (CommonMark)."""

    escape_underscores: bool = False
    """Escape underscore characters in text to prevent accidental formatting. Default False for minimal escaping (CommonMark)."""

    escape_misc: bool = False
    """Escape miscellaneous Markdown characters. Default False for minimal escaping (CommonMark)."""

    escape_ascii: bool = False
    """Escape all ASCII punctuation (for CommonMark spec compliance tests). Disabled by default for minimal escaping."""

    code_language: str = ""
    """Default language for code blocks."""

    encoding: str = "utf-8"
    """Character encoding expected for the HTML input."""

    autolinks: bool = True
    """Convert bare URLs to automatic links."""

    default_title: bool = False
    """Add a default title if none exists."""

    keep_inline_images_in: set[str] | None = None
    """Parent tag names where images should remain inline."""

    br_in_tables: bool = False
    """Use <br> tags for line breaks in table cells instead of spaces."""

    hocr_spatial_tables: bool = True
    """Reconstruct tables in hOCR documents using spatial heuristics."""

    highlight_style: Literal["double-equal", "html", "bold"] = "double-equal"
    """Style for highlighting <mark> elements."""

    extract_metadata: bool = True
    """Extract metadata from HTML head and include as comment."""

    whitespace_mode: Literal["normalized", "strict"] = "normalized"
    """How to handle whitespace: 'normalized' or 'strict'."""

    strip_newlines: bool = False
    """Remove newlines from HTML before processing."""

    wrap: bool = False
    """Enable text wrapping."""

    wrap_width: int = 80
    """Column width for text wrapping."""

    strip_tags: set[str] | None = None
    """HTML tags to strip from output (output only text content, no markdown conversion)."""

    preserve_tags: set[str] | None = None
    """HTML tags to preserve as-is in the output (keep original HTML). Useful for complex elements like tables."""

    convert_as_inline: bool = False
    """Treat block elements as inline during conversion."""

    sub_symbol: str = ""
    """Symbol for subscript text."""

    sup_symbol: str = ""
    """Symbol for superscript text."""

    newline_style: Literal["spaces", "backslash"] = "spaces"
    """Style for newlines: 'spaces' (two trailing spaces, CommonMark default) or 'backslash' (\\). Both are equally CommonMark compliant."""

    code_block_style: Literal["indented", "backticks", "tildes"] = "backticks"
    """Style for code blocks: 'backticks' (```, better whitespace preservation), 'indented' (4 spaces), or 'tildes' (~~~). All are CommonMark compliant."""

    debug: bool = False
    """Enable debug mode with diagnostic warnings about unhandled elements and hOCR processing."""
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
@dataclass
class PreprocessingOptions:
    """HTML preprocessing configuration.

    Controls how HTML is cleaned and preprocessed before conversion.

    Every field has a default, so ``PreprocessingOptions()`` is a valid
    configuration on its own.

    Example:
        >>> options = PreprocessingOptions(
        ...     enabled=True,
        ...     preset="aggressive",
        ...     remove_navigation=True,
        ... )
    """

    enabled: bool = True
    """Whether to enable HTML preprocessing (enabled by default for robust handling of malformed HTML)."""

    preset: Literal["minimal", "standard", "aggressive"] = "standard"
    """Preprocessing aggressiveness level."""

    remove_navigation: bool = True
    """Remove navigation elements during preprocessing."""

    remove_forms: bool = True
    """Remove form elements during preprocessing."""
|
|
File without changes
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
"""V1 API compatibility layer.
|
|
2
|
+
|
|
3
|
+
Provides backward compatibility for the v1 convert_to_markdown API
|
|
4
|
+
by translating v1 kwargs to v2 ConversionOptions and PreprocessingOptions.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import warnings
|
|
10
|
+
|
|
11
|
+
from html_to_markdown import ConversionOptions, PreprocessingOptions
|
|
12
|
+
from html_to_markdown import convert as convert_v2
|
|
13
|
+
|
|
14
|
+
# Shared tail appended to every v1 deprecation warning.
DEPRECATION_MESSAGE = (
    "The v1 compatibility layer is deprecated and will be removed in v3.0. "
    "Use html_to_markdown.convert() with ConversionOptions instead."
)


def _warn_deprecated(api_name: str, *, stacklevel: int = 2) -> None:
    """Emit a ``DeprecationWarning`` for a v1 API entry point.

    Args:
        api_name: Name of the deprecated API as it should appear in the message.
        stacklevel: Frame to attribute the warning to, counted from the
            function that calls this helper (as if that function had called
            ``warnings.warn`` itself). The default of 2 therefore points at
            the caller of the deprecated API.
    """
    # +1 skips this helper's own frame; otherwise the warning is attributed to
    # the library function instead of the user's call site.
    warnings.warn(f"{api_name} is deprecated. {DEPRECATION_MESSAGE}", DeprecationWarning, stacklevel=stacklevel + 1)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def convert_to_markdown(
    html: str,
    *,
    heading_style: str = "underlined",
    list_indent_type: str = "spaces",
    list_indent_width: int = 4,
    bullets: str = "*+-",
    strong_em_symbol: str = "*",
    escape_asterisks: bool = True,
    escape_underscores: bool = True,
    escape_misc: bool = True,
    code_language: str = "",
    autolinks: bool = True,
    default_title: bool = False,
    br_in_tables: bool = False,
    hocr_extract_tables: bool = True,
    hocr_table_column_threshold: int = 50,
    hocr_table_row_threshold_ratio: float = 0.5,
    highlight_style: str = "double-equal",
    extract_metadata: bool = True,
    whitespace_mode: str = "normalized",
    strip_newlines: bool = False,
    wrap: bool = False,
    wrap_width: int = 80,
    convert_as_inline: bool = False,
    sub_symbol: str = "",
    sup_symbol: str = "",
    newline_style: str = "spaces",
    keep_inline_images_in: set[str] | None = None,
    preprocess: bool = False,
    preprocessing_preset: str = "standard",
    remove_navigation: bool = True,
    remove_forms: bool = True,
    source_encoding: str = "utf-8",
    code_language_callback: object | None = None,
    strip: list[str] | None = None,
    convert: list[str] | None = None,
    custom_converters: dict[str, object] | None = None,
) -> str:
    """Convert HTML to Markdown (v1 compatibility API).

    Maps the v1 keyword arguments onto v2 ``ConversionOptions`` and
    ``PreprocessingOptions`` and delegates to the v2 ``convert`` function.
    A ``DeprecationWarning`` is emitted on every call.

    Args:
        html: HTML string to convert.
        heading_style: Style for headings (v1 default: "underlined").
        list_indent_type: Type of indentation for lists.
        list_indent_width: Number of spaces for list indentation (v1 default: 4).
        bullets: Characters to use for unordered list bullets.
        strong_em_symbol: Symbol for strong/emphasis formatting.
        escape_asterisks: Escape asterisk characters (v1 default: True).
        escape_underscores: Escape underscore characters (v1 default: True).
        escape_misc: Escape miscellaneous Markdown characters (v1 default: True).
        code_language: Default language for code blocks. When non-empty, code
            blocks use backticks; otherwise they are indented (v1 behaviour).
        autolinks: Convert bare URLs to automatic links.
        default_title: Add a default title if none exists.
        br_in_tables: Use <br> tags for line breaks in table cells.
        hocr_extract_tables: Deprecated. Forwarded as ``hocr_spatial_tables``;
            passing False additionally emits a ``DeprecationWarning``.
        hocr_table_column_threshold: Removed in v2; any value other than 50 raises.
        hocr_table_row_threshold_ratio: Removed in v2; any value other than 0.5 raises.
        highlight_style: Style for highlighting <mark> elements.
        extract_metadata: Extract metadata from HTML head.
        whitespace_mode: How to handle whitespace.
        strip_newlines: Remove newlines from HTML before processing.
        wrap: Enable text wrapping.
        wrap_width: Column width for text wrapping.
        convert_as_inline: Treat block elements as inline.
        sub_symbol: Symbol for subscript text.
        sup_symbol: Symbol for superscript text.
        newline_style: Style for newlines.
        keep_inline_images_in: Parent tag names where images should remain inline.
        preprocess: Enable HTML preprocessing (v1 default: False).
        preprocessing_preset: Preprocessing aggressiveness level.
        remove_navigation: Remove navigation elements during preprocessing.
        remove_forms: Remove form elements during preprocessing.
        source_encoding: Character encoding expected for the HTML input.
        code_language_callback: Removed in v2; any non-None value raises.
        strip: HTML tags to strip from output (forwarded as ``strip_tags``).
        convert: Removed in v2; any non-None value raises.
        custom_converters: Not yet implemented in v2; any non-None value raises.

    Returns:
        Converted Markdown string.

    Raises:
        NotImplementedError: If a removed or unimplemented v1 feature is used.

    .. deprecated:: 2.0
        Use :func:`html_to_markdown.convert` with :class:`ConversionOptions` instead.
        The v1 API is provided for backward compatibility only.
    """
    _warn_deprecated("convert_to_markdown()", stacklevel=2)

    # v1 features with no v2 equivalent: reject the first one that was supplied.
    unsupported_arguments = (
        (
            code_language_callback,
            "code_language_callback was removed in v2. Use the code_language option to set a default language.",
        ),
        (convert, "convert option was removed in v2. All supported tags are converted by default."),
        (custom_converters, "custom_converters is not yet implemented in v2"),
    )
    for supplied_value, error_message in unsupported_arguments:
        if supplied_value is not None:
            raise NotImplementedError(error_message)

    if not hocr_extract_tables:
        warnings.warn(
            "hocr_extract_tables is deprecated and will be removed in a future release. "
            "Use ConversionOptions(hocr_spatial_tables=False) to disable spatial table reconstruction.",
            DeprecationWarning,
            stacklevel=2,
        )

    if hocr_table_column_threshold != 50 or hocr_table_row_threshold_ratio != 0.5:
        raise NotImplementedError(
            "hOCR table threshold overrides were removed in v2. Table reconstruction now uses built-in heuristics."
        )

    # ~keep: v1 used indented code blocks by default, but switched to backticks when a language was set
    if code_language:
        code_block_style = "backticks"
    else:
        code_block_style = "indented"

    v2_options = ConversionOptions(
        heading_style=heading_style,  # type: ignore[arg-type]
        list_indent_type=list_indent_type,  # type: ignore[arg-type]
        list_indent_width=list_indent_width,
        bullets=bullets,
        strong_em_symbol=strong_em_symbol,  # type: ignore[arg-type]
        escape_asterisks=escape_asterisks,
        escape_underscores=escape_underscores,
        escape_misc=escape_misc,
        code_block_style=code_block_style,  # type: ignore[arg-type]
        code_language=code_language,
        encoding=source_encoding,
        autolinks=autolinks,
        default_title=default_title,
        br_in_tables=br_in_tables,
        hocr_spatial_tables=hocr_extract_tables,
        highlight_style=highlight_style,  # type: ignore[arg-type]
        extract_metadata=extract_metadata,
        whitespace_mode=whitespace_mode,  # type: ignore[arg-type]
        strip_newlines=strip_newlines,
        wrap=wrap,
        wrap_width=wrap_width,
        convert_as_inline=convert_as_inline,
        sub_symbol=sub_symbol,
        sup_symbol=sup_symbol,
        newline_style=newline_style,  # type: ignore[arg-type]
        keep_inline_images_in=keep_inline_images_in,
        strip_tags=set(strip) if strip else None,
    )
    v2_preprocessing = PreprocessingOptions(
        enabled=preprocess,
        preset=preprocessing_preset,  # type: ignore[arg-type]
        remove_navigation=remove_navigation,
        remove_forms=remove_forms,
    )

    return convert_v2(html, v2_options, v2_preprocessing)
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def markdownify(*args: object, **kwargs: object) -> str:
    """Deprecated v1 name that forwards to ``convert_to_markdown``.

    The call is first reported through ``_warn_deprecated`` under the label
    ``"markdownify()"``; every positional and keyword argument is then passed
    through unchanged and the resulting Markdown string is returned.

    .. deprecated:: 2.0
        Use html_to_markdown.convert() instead.
    """
    # stacklevel=2 is forwarded to the warning helper as-is.
    _warn_deprecated("markdownify()", stacklevel=2)
    markdown = convert_to_markdown(*args, **kwargs)  # type: ignore[arg-type]
    return markdown
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
# Public API of the v1 compatibility module: the names a wildcard import exposes.
__all__ = ["convert_to_markdown", "markdownify"]
|
|
Binary file
|
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: html-to-markdown
|
|
3
|
+
Version: 2.7.0
|
|
4
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
5
|
+
Classifier: Environment :: Console
|
|
6
|
+
Classifier: Intended Audience :: Developers
|
|
7
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
8
|
+
Classifier: Operating System :: OS Independent
|
|
9
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
15
|
+
Classifier: Programming Language :: Rust
|
|
16
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
17
|
+
Classifier: Topic :: Text Processing
|
|
18
|
+
Classifier: Topic :: Text Processing :: Markup
|
|
19
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
20
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
21
|
+
Classifier: Typing :: Typed
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Summary: High-performance HTML to Markdown converter powered by Rust with a clean Python API
|
|
24
|
+
Keywords: cli-tool,converter,html,html2markdown,html5,markdown,markup,parser,rust,text-processing
|
|
25
|
+
Home-Page: https://github.com/Goldziher/html-to-markdown
|
|
26
|
+
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
27
|
+
Requires-Python: >=3.10
|
|
28
|
+
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
29
|
+
Project-URL: Changelog, https://github.com/Goldziher/html-to-markdown/releases
|
|
30
|
+
Project-URL: Homepage, https://github.com/Goldziher/html-to-markdown
|
|
31
|
+
Project-URL: Issues, https://github.com/Goldziher/html-to-markdown/issues
|
|
32
|
+
Project-URL: Repository, https://github.com/Goldziher/html-to-markdown.git
|
|
33
|
+
|
|
34
|
+
# html-to-markdown
|
|
35
|
+
|
|
36
|
+
High-performance HTML to Markdown converter with a clean Python API (powered by a Rust core). The same engine also drives the Node.js, Ruby, PHP, and WebAssembly bindings, so rendered Markdown stays identical across runtimes. Wheels are published for Linux, macOS, and Windows.
|
|
37
|
+
|
|
38
|
+
[](https://crates.io/crates/html-to-markdown-rs)
|
|
39
|
+
[](https://www.npmjs.com/package/html-to-markdown-node)
|
|
40
|
+
[](https://www.npmjs.com/package/html-to-markdown-wasm)
|
|
41
|
+
[](https://pypi.org/project/html-to-markdown/)
|
|
42
|
+
[](https://packagist.org/packages/goldziher/html-to-markdown)
|
|
43
|
+
[](https://rubygems.org/gems/html-to-markdown)
|
|
44
|
+
[](https://github.com/Goldziher/html-to-markdown/blob/main/LICENSE)
|
|
45
|
+
|
|
46
|
+
## Installation
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
pip install html-to-markdown
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Performance Snapshot
|
|
53
|
+
|
|
54
|
+
Apple M4 • Real Wikipedia documents • `convert()` (Python)
|
|
55
|
+
|
|
56
|
+
| Document | Size | Latency | Throughput | Docs/sec |
|
|
57
|
+
| ------------------- | ----- | ------- | ---------- | -------- |
|
|
58
|
+
| Lists (Timeline) | 129KB | 0.62ms | 208 MB/s | 1,613 |
|
|
59
|
+
| Tables (Countries) | 360KB | 2.02ms | 178 MB/s | 495 |
|
|
60
|
+
| Mixed (Python wiki) | 656KB | 4.56ms | 144 MB/s | 219 |
|
|
61
|
+
|
|
62
|
+
> V1 averaged ~2.5 MB/s (Python/BeautifulSoup). V2's Rust engine delivers 60–80× higher throughput.
|
|
63
|
+
|
|
64
|
+
## Quick Start
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
from html_to_markdown import convert
|
|
68
|
+
|
|
69
|
+
html = """
|
|
70
|
+
<h1>Welcome</h1>
|
|
71
|
+
<p>This is <strong>fast</strong> Rust-powered conversion!</p>
|
|
72
|
+
<ul>
|
|
73
|
+
<li>Blazing fast</li>
|
|
74
|
+
<li>Type safe</li>
|
|
75
|
+
<li>Easy to use</li>
|
|
76
|
+
</ul>
|
|
77
|
+
"""
|
|
78
|
+
|
|
79
|
+
markdown = convert(html)
|
|
80
|
+
print(markdown)
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Configuration (v2 API)
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
from html_to_markdown import ConversionOptions, convert
|
|
87
|
+
|
|
88
|
+
options = ConversionOptions(
|
|
89
|
+
heading_style="atx",
|
|
90
|
+
list_indent_width=2,
|
|
91
|
+
bullets="*+-",
|
|
92
|
+
)
|
|
93
|
+
options.escape_asterisks = True
|
|
94
|
+
options.code_language = "python"
|
|
95
|
+
options.extract_metadata = True
|
|
96
|
+
|
|
97
|
+
markdown = convert(html, options)
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### Reusing Parsed Options
|
|
101
|
+
|
|
102
|
+
Avoid re-parsing the same option dictionaries inside hot loops by building a reusable handle:
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
from html_to_markdown import ConversionOptions, convert_with_handle, create_options_handle
|
|
106
|
+
|
|
107
|
+
handle = create_options_handle(ConversionOptions(hocr_spatial_tables=False))
|
|
108
|
+
|
|
109
|
+
for html in documents:
|
|
110
|
+
markdown = convert_with_handle(html, handle)
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
### HTML Preprocessing
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
from html_to_markdown import ConversionOptions, PreprocessingOptions, convert
|
|
117
|
+
|
|
118
|
+
options = ConversionOptions(
|
|
119
|
+
preprocessing=PreprocessingOptions(enabled=True, preset="aggressive"),
|
|
120
|
+
)
|
|
121
|
+
|
|
122
|
+
markdown = convert(scraped_html, options)
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
### Inline Image Extraction
|
|
126
|
+
|
|
127
|
+
```python
|
|
128
|
+
from html_to_markdown import InlineImageConfig, convert_with_inline_images
|
|
129
|
+
|
|
130
|
+
markdown, inline_images, warnings = convert_with_inline_images(
|
|
131
|
+
'<p><img src="data:image/png;base64,...==" alt="Pixel" width="1" height="1"></p>',
|
|
132
|
+
image_config=InlineImageConfig(max_decoded_size_bytes=1024, infer_dimensions=True),
|
|
133
|
+
)
|
|
134
|
+
|
|
135
|
+
if inline_images:
|
|
136
|
+
first = inline_images[0]
|
|
137
|
+
print(first["format"], first["dimensions"], first["attributes"]) # e.g. "png", (1, 1), {"width": "1"}
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
Each inline image is returned as a typed dictionary (`bytes` payload, metadata, and relevant HTML attributes). Warnings are human-readable skip reasons.
|
|
141
|
+
|
|
142
|
+
### hOCR (HTML OCR) Support
|
|
143
|
+
|
|
144
|
+
```python
|
|
145
|
+
from html_to_markdown import ConversionOptions, convert
|
|
146
|
+
|
|
147
|
+
# Default: emit structured Markdown directly
|
|
148
|
+
markdown = convert(hocr_html)
|
|
149
|
+
|
|
150
|
+
# hOCR documents are detected automatically; tables are reconstructed without extra configuration.
|
|
151
|
+
markdown = convert(hocr_html)
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
## CLI (same engine)
|
|
155
|
+
|
|
156
|
+
```bash
|
|
157
|
+
pipx install html-to-markdown # or: pip install html-to-markdown
|
|
158
|
+
|
|
159
|
+
html-to-markdown page.html > page.md
|
|
160
|
+
cat page.html | html-to-markdown --heading-style atx > page.md
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
## API Surface
|
|
164
|
+
|
|
165
|
+
### `ConversionOptions`
|
|
166
|
+
|
|
167
|
+
Key fields (see docstring for full matrix):
|
|
168
|
+
|
|
169
|
+
- `heading_style`: `"underlined" | "atx" | "atx_closed"`
|
|
170
|
+
- `list_indent_width`: spaces per indent level (default 2)
|
|
171
|
+
- `bullets`: cycle of bullet characters (`"*+-"`)
|
|
172
|
+
- `strong_em_symbol`: `"*"` or `"_"`
|
|
173
|
+
- `code_language`: default fenced code block language
|
|
174
|
+
- `wrap`, `wrap_width`: wrap Markdown output
|
|
175
|
+
- `strip_tags`: remove specific HTML tags
|
|
176
|
+
- `preprocessing`: `PreprocessingOptions`
|
|
177
|
+
- `encoding`: input character encoding (informational)
|
|
178
|
+
|
|
179
|
+
### `PreprocessingOptions`
|
|
180
|
+
|
|
181
|
+
- `enabled`: enable HTML sanitisation (default: `True` since v2.4.2 for robust malformed HTML handling)
|
|
182
|
+
- `preset`: `"minimal" | "standard" | "aggressive"` (default: `"standard"`)
|
|
183
|
+
- `remove_navigation`: remove navigation elements (default: `True`)
|
|
184
|
+
- `remove_forms`: remove form elements (default: `True`)
|
|
185
|
+
|
|
186
|
+
**Note:** As of v2.4.2, preprocessing is enabled by default to ensure robust handling of malformed HTML (e.g., bare angle brackets like `1<2` in content). Set `enabled=False` to turn preprocessing off entirely.
|
|
187
|
+
|
|
188
|
+
### `InlineImageConfig`
|
|
189
|
+
|
|
190
|
+
- `max_decoded_size_bytes`: reject larger payloads
|
|
191
|
+
- `filename_prefix`: generated name prefix (`embedded_image` default)
|
|
192
|
+
- `capture_svg`: collect inline `<svg>` (default `True`)
|
|
193
|
+
- `infer_dimensions`: decode raster images to obtain dimensions (default `False`)
|
|
194
|
+
|
|
195
|
+
## Performance: V2 vs V1 Compatibility Layer
|
|
196
|
+
|
|
197
|
+
### ⚠️ Important: Always Use V2 API
|
|
198
|
+
|
|
199
|
+
The v2 API (`convert()`) is **strongly recommended** for all code. The v1 compatibility layer adds significant overhead and should only be used for gradual migration:
|
|
200
|
+
|
|
201
|
+
```python
|
|
202
|
+
# ✅ RECOMMENDED - V2 Direct API (Fast)
|
|
203
|
+
from html_to_markdown import convert, ConversionOptions
|
|
204
|
+
|
|
205
|
+
markdown = convert(html) # Simple conversion - FAST
|
|
206
|
+
markdown = convert(html, ConversionOptions(heading_style="atx")) # With options - FAST
|
|
207
|
+
|
|
208
|
+
# ❌ AVOID - V1 Compatibility Layer (Slow)
|
|
209
|
+
from html_to_markdown import convert_to_markdown
|
|
210
|
+
|
|
211
|
+
markdown = convert_to_markdown(html, heading_style="atx") # Adds 77% overhead
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
### Performance Comparison
|
|
215
|
+
|
|
216
|
+
Benchmarked on Apple M4 with 25-paragraph HTML document:
|
|
217
|
+
|
|
218
|
+
| API | ops/sec | Relative Performance | Recommendation |
|
|
219
|
+
| ------------------------ | ---------------- | -------------------- | ------------------- |
|
|
220
|
+
| **V2 API** (`convert()`) | **129,822** | baseline | ✅ **Use this** |
|
|
221
|
+
| **V1 Compat Layer** | **67,673** | **77% slower** | ⚠️ Migration only |
|
|
222
|
+
| **CLI** | **150-210 MB/s** | Fastest | ✅ Batch processing |
|
|
223
|
+
|
|
224
|
+
The v1 compatibility layer creates extra Python objects and performs additional conversions, significantly impacting performance.
|
|
225
|
+
|
|
226
|
+
### When to Use Each
|
|
227
|
+
|
|
228
|
+
- **V2 API (`convert()`)**: All new code, production systems, performance-critical applications ← **Use this**
|
|
229
|
+
- **V1 Compat (`convert_to_markdown()`)**: Only for gradual migration from legacy codebases
|
|
230
|
+
- **CLI (`html-to-markdown`)**: Batch processing, shell scripts, maximum throughput
|
|
231
|
+
|
|
232
|
+
## v1 Compatibility
|
|
233
|
+
|
|
234
|
+
A compatibility layer is provided to ease migration from v1.x:
|
|
235
|
+
|
|
236
|
+
- **Compat shim**: `html_to_markdown.v1_compat` exposes `convert_to_markdown` and `markdownify`. Keyword mappings are listed in the [changelog](CHANGELOG.md#v200).
|
|
237
|
+
- **⚠️ Performance warning**: These compatibility functions add 77% overhead. Migrate to v2 API as soon as possible.
|
|
238
|
+
- **CLI**: The Rust CLI replaces the old Python script. New flags are documented via `html-to-markdown --help`.
|
|
239
|
+
- **Removed options**: `code_language_callback`, `strip` (superseded by `strip_tags`), and the streaming APIs are gone; use `ConversionOptions`, `PreprocessingOptions`, and the inline-image helpers instead.
|
|
240
|
+
|
|
241
|
+
## Links
|
|
242
|
+
|
|
243
|
+
- GitHub: [https://github.com/Goldziher/html-to-markdown](https://github.com/Goldziher/html-to-markdown)
|
|
244
|
+
- Discord: [https://discord.gg/pXxagNK2zN](https://discord.gg/pXxagNK2zN)
|
|
245
|
+
- Kreuzberg ecosystem: [https://kreuzberg.dev](https://kreuzberg.dev)
|
|
246
|
+
|
|
247
|
+
## License
|
|
248
|
+
|
|
249
|
+
MIT License – see [LICENSE](https://github.com/Goldziher/html-to-markdown/blob/main/LICENSE).
|
|
250
|
+
|
|
251
|
+
## Support
|
|
252
|
+
|
|
253
|
+
If you find this library useful, consider [sponsoring the project](https://github.com/sponsors/Goldziher).
|
|
254
|
+
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
html_to_markdown-2.7.0.data/scripts/html-to-markdown.exe,sha256=5w9ag00jGlkN_VLmzchWmU9oA_8bL3r1Xv8gkwWUBcs,3363840
|
|
2
|
+
html_to_markdown-2.7.0.dist-info/METADATA,sha256=BIjjqRcWPQMBRaaSwqU5Hf-x2uZ_3xMlhZwccbeYBLI,10277
|
|
3
|
+
html_to_markdown-2.7.0.dist-info/WHEEL,sha256=G3JyZRtw6x7sQDM5feqT5IDYMcac7O2Ec3LW6k1bFXE,96
|
|
4
|
+
html_to_markdown-2.7.0.dist-info/licenses/LICENSE,sha256=QhKFMkQLa4mSUlOsyG9VElzC7GYbAKtiS_EwOCyH-b4,1107
|
|
5
|
+
html_to_markdown/__init__.py,sha256=G1MsrpXqG94nUABxuB69lOYdlKpmcu-XQWGn7ImxxZE,1564
|
|
6
|
+
html_to_markdown/__main__.py,sha256=5objj9lB7hhpSpZsDok5tv9o9yztVR63Ccww-pXsAyY,343
|
|
7
|
+
html_to_markdown/_html_to_markdown.pyd,sha256=DWWcvIEWn6SufG_UcF6Q6I2xHDmrN3t0fIe4TkzkkSI,3084288
|
|
8
|
+
html_to_markdown/_html_to_markdown.pyi,sha256=lh2hj6GyGx71fJzZPD5giZbO6XQYYBIlfQUJq4MwVPQ,878
|
|
9
|
+
html_to_markdown/api.py,sha256=xxdVbIZjuSewhsgntdfY5DFJaYIEZITz2TBieqUCR3A,5241
|
|
10
|
+
html_to_markdown/bin/html-to-markdown.exe,sha256=5w9ag00jGlkN_VLmzchWmU9oA_8bL3r1Xv8gkwWUBcs,3363840
|
|
11
|
+
html_to_markdown/cli.py,sha256=z59l8sF8wIRRzJtUd-tXgqiC0WTqkTjzl-df8Ey_oQ0,67
|
|
12
|
+
html_to_markdown/cli_proxy.py,sha256=Y0Z98U0EMDqIRtdEkcHa1dVntWkw69maczeksr-Cq28,4000
|
|
13
|
+
html_to_markdown/exceptions.py,sha256=31VqpPi4JLGv7lI2481Z4f2s5ejYmq97c3s-WFFkXVU,2443
|
|
14
|
+
html_to_markdown/options.py,sha256=iDEIfxxZlSHDM3V-Sr-XVxYLC1mzvuic56jSycYvQvY,5224
|
|
15
|
+
html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
16
|
+
html_to_markdown/v1_compat.py,sha256=qBfWRsXxox4I4Mm2kzvxEvqEKZ8DwYMQK-bbLHTUk-A,8253
|
|
17
|
+
html_to_markdown-2.7.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Copyright 2024-2025 Na'aman Hirschfeld
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|