html-to-markdown 2.0.1__cp310-abi3-macosx_11_0_arm64.whl → 2.1.2__cp310-abi3-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- html_to_markdown/__init__.py +3 -9
- html_to_markdown/_html_to_markdown.abi3.so +0 -0
- html_to_markdown/_rust.pyi +4 -12
- html_to_markdown/api.py +7 -34
- html_to_markdown/bin/html-to-markdown +0 -0
- html_to_markdown/cli.py +0 -6
- html_to_markdown/cli_proxy.py +23 -33
- html_to_markdown/exceptions.py +8 -16
- html_to_markdown/options.py +3 -76
- html_to_markdown/v1_compat.py +79 -51
- {html_to_markdown-2.0.1.data → html_to_markdown-2.1.2.data}/scripts/html-to-markdown +0 -0
- html_to_markdown-2.1.2.dist-info/METADATA +196 -0
- html_to_markdown-2.1.2.dist-info/RECORD +17 -0
- html_to_markdown-2.0.1.dist-info/METADATA +0 -243
- html_to_markdown-2.0.1.dist-info/RECORD +0 -17
- {html_to_markdown-2.0.1.dist-info → html_to_markdown-2.1.2.dist-info}/WHEEL +0 -0
- {html_to_markdown-2.0.1.dist-info → html_to_markdown-2.1.2.dist-info}/licenses/LICENSE +0 -0
html_to_markdown/__init__.py
CHANGED
|
@@ -23,12 +23,8 @@ from html_to_markdown.exceptions import (
|
|
|
23
23
|
InvalidParserError,
|
|
24
24
|
MissingDependencyError,
|
|
25
25
|
)
|
|
26
|
-
from html_to_markdown.options import
|
|
27
|
-
|
|
28
|
-
ParsingOptions,
|
|
29
|
-
PreprocessingOptions,
|
|
30
|
-
)
|
|
31
|
-
from html_to_markdown.v1_compat import convert_to_markdown, convert_to_markdown_stream, markdownify
|
|
26
|
+
from html_to_markdown.options import ConversionOptions, PreprocessingOptions
|
|
27
|
+
from html_to_markdown.v1_compat import convert_to_markdown, markdownify
|
|
32
28
|
|
|
33
29
|
__all__ = [
|
|
34
30
|
"ConflictingOptionsError",
|
|
@@ -37,12 +33,10 @@ __all__ = [
|
|
|
37
33
|
"HtmlToMarkdownError",
|
|
38
34
|
"InvalidParserError",
|
|
39
35
|
"MissingDependencyError",
|
|
40
|
-
"ParsingOptions",
|
|
41
36
|
"PreprocessingOptions",
|
|
42
37
|
"convert",
|
|
43
38
|
"convert_to_markdown",
|
|
44
|
-
"convert_to_markdown_stream",
|
|
45
39
|
"markdownify",
|
|
46
40
|
]
|
|
47
41
|
|
|
48
|
-
__version__ = "2.
|
|
42
|
+
__version__ = "2.1.1"
|
|
Binary file
|
html_to_markdown/_rust.pyi
CHANGED
|
@@ -21,8 +21,9 @@ class ConversionOptions:
|
|
|
21
21
|
sub_symbol: str
|
|
22
22
|
sup_symbol: str
|
|
23
23
|
newline_style: str
|
|
24
|
+
keep_inline_images_in: list[str]
|
|
24
25
|
preprocessing: PreprocessingOptions
|
|
25
|
-
|
|
26
|
+
encoding: str
|
|
26
27
|
|
|
27
28
|
def __init__(
|
|
28
29
|
self,
|
|
@@ -48,8 +49,9 @@ class ConversionOptions:
|
|
|
48
49
|
sub_symbol: str = "",
|
|
49
50
|
sup_symbol: str = "",
|
|
50
51
|
newline_style: str = "spaces",
|
|
52
|
+
keep_inline_images_in: list[str] | None = None,
|
|
51
53
|
preprocessing: PreprocessingOptions | None = None,
|
|
52
|
-
|
|
54
|
+
encoding: str = "utf-8",
|
|
53
55
|
) -> None: ...
|
|
54
56
|
|
|
55
57
|
class PreprocessingOptions:
|
|
@@ -66,14 +68,4 @@ class PreprocessingOptions:
|
|
|
66
68
|
remove_forms: bool = True,
|
|
67
69
|
) -> None: ...
|
|
68
70
|
|
|
69
|
-
class ParsingOptions:
|
|
70
|
-
encoding: str
|
|
71
|
-
parser: str | None
|
|
72
|
-
|
|
73
|
-
def __init__(
|
|
74
|
-
self,
|
|
75
|
-
encoding: str = "utf-8",
|
|
76
|
-
parser: str | None = None,
|
|
77
|
-
) -> None: ...
|
|
78
|
-
|
|
79
71
|
def convert(html: str, options: ConversionOptions | None = None) -> str: ...
|
html_to_markdown/api.py
CHANGED
|
@@ -7,47 +7,28 @@ using the Rust backend for conversion.
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
|
|
9
9
|
import html_to_markdown._html_to_markdown as _rust # type: ignore[import-not-found]
|
|
10
|
-
from html_to_markdown.options import
|
|
11
|
-
ConversionOptions,
|
|
12
|
-
ParsingOptions,
|
|
13
|
-
PreprocessingOptions,
|
|
14
|
-
)
|
|
10
|
+
from html_to_markdown.options import ConversionOptions, PreprocessingOptions
|
|
15
11
|
|
|
16
12
|
|
|
17
13
|
def convert(
|
|
18
14
|
html: str,
|
|
19
15
|
options: ConversionOptions | None = None,
|
|
20
16
|
preprocessing: PreprocessingOptions | None = None,
|
|
21
|
-
parsing: ParsingOptions | None = None,
|
|
22
17
|
) -> str:
|
|
23
|
-
"""Convert HTML to Markdown using Rust backend.
|
|
24
|
-
|
|
25
|
-
This is the main entry point for the v2 API, using dataclass-based configuration
|
|
26
|
-
and Rust implementation for high-performance conversion.
|
|
18
|
+
"""Convert HTML to Markdown using the Rust backend.
|
|
27
19
|
|
|
28
20
|
Args:
|
|
29
|
-
html: HTML string to convert
|
|
30
|
-
options: Conversion options (
|
|
31
|
-
preprocessing: HTML preprocessing options (
|
|
32
|
-
parsing: HTML parsing options (uses defaults if None)
|
|
21
|
+
html: HTML string to convert.
|
|
22
|
+
options: Conversion configuration options (defaults to ConversionOptions()).
|
|
23
|
+
preprocessing: HTML preprocessing options (defaults to PreprocessingOptions()).
|
|
33
24
|
|
|
34
25
|
Returns:
|
|
35
|
-
Markdown string
|
|
36
|
-
|
|
37
|
-
Example:
|
|
38
|
-
>>> from html_to_markdown import convert, ConversionOptions
|
|
39
|
-
>>> options = ConversionOptions(heading_style="atx", list_indent_width=2)
|
|
40
|
-
>>> markdown = convert("<h1>Title</h1>", options)
|
|
41
|
-
>>> print(markdown)
|
|
42
|
-
# Title
|
|
43
|
-
<BLANKLINE>
|
|
26
|
+
Converted Markdown string.
|
|
44
27
|
"""
|
|
45
28
|
if options is None:
|
|
46
29
|
options = ConversionOptions()
|
|
47
30
|
if preprocessing is None:
|
|
48
31
|
preprocessing = PreprocessingOptions()
|
|
49
|
-
if parsing is None:
|
|
50
|
-
parsing = ParsingOptions()
|
|
51
32
|
|
|
52
33
|
rust_preprocessing = _rust.PreprocessingOptions(
|
|
53
34
|
enabled=preprocessing.enabled,
|
|
@@ -56,11 +37,6 @@ def convert(
|
|
|
56
37
|
remove_forms=preprocessing.remove_forms,
|
|
57
38
|
)
|
|
58
39
|
|
|
59
|
-
rust_parsing = _rust.ParsingOptions(
|
|
60
|
-
encoding=parsing.encoding,
|
|
61
|
-
parser=parsing.parser,
|
|
62
|
-
)
|
|
63
|
-
|
|
64
40
|
rust_options = _rust.ConversionOptions(
|
|
65
41
|
heading_style=options.heading_style,
|
|
66
42
|
list_indent_type=options.list_indent_type,
|
|
@@ -75,9 +51,6 @@ def convert(
|
|
|
75
51
|
autolinks=options.autolinks,
|
|
76
52
|
default_title=options.default_title,
|
|
77
53
|
br_in_tables=options.br_in_tables,
|
|
78
|
-
hocr_extract_tables=options.hocr_extract_tables,
|
|
79
|
-
hocr_table_column_threshold=options.hocr_table_column_threshold,
|
|
80
|
-
hocr_table_row_threshold_ratio=options.hocr_table_row_threshold_ratio,
|
|
81
54
|
highlight_style=options.highlight_style,
|
|
82
55
|
extract_metadata=options.extract_metadata,
|
|
83
56
|
whitespace_mode=options.whitespace_mode,
|
|
@@ -91,7 +64,7 @@ def convert(
|
|
|
91
64
|
code_block_style=options.code_block_style,
|
|
92
65
|
keep_inline_images_in=list(options.keep_inline_images_in) if options.keep_inline_images_in else [],
|
|
93
66
|
preprocessing=rust_preprocessing,
|
|
94
|
-
|
|
67
|
+
encoding=options.encoding,
|
|
95
68
|
debug=options.debug,
|
|
96
69
|
strip_tags=list(options.strip_tags) if options.strip_tags else [],
|
|
97
70
|
)
|
|
Binary file
|
html_to_markdown/cli.py
CHANGED
|
@@ -1,9 +1,3 @@
|
|
|
1
|
-
"""CLI wrapper that proxies to Rust CLI binary.
|
|
2
|
-
|
|
3
|
-
This module provides backwards compatibility for code that imports
|
|
4
|
-
from html_to_markdown.cli. The actual CLI implementation is in Rust.
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
1
|
from html_to_markdown.cli_proxy import main
|
|
8
2
|
|
|
9
3
|
__all__ = ["main"]
|
html_to_markdown/cli_proxy.py
CHANGED
|
@@ -1,25 +1,19 @@
|
|
|
1
|
-
"""CLI proxy that calls the Rust CLI binary.
|
|
2
|
-
|
|
3
|
-
This module provides a Python wrapper around the Rust CLI binary,
|
|
4
|
-
allowing the Python package to use the high-performance Rust implementation
|
|
5
|
-
for command-line operations. It also provides v1 -> v2 CLI argument translation.
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
1
|
import subprocess
|
|
9
2
|
import sys
|
|
3
|
+
import warnings
|
|
10
4
|
from pathlib import Path
|
|
11
5
|
|
|
12
6
|
from html_to_markdown.exceptions import RedundantV1FlagError, RemovedV1FlagError
|
|
13
7
|
|
|
14
8
|
|
|
15
9
|
def find_cli_binary() -> Path:
|
|
16
|
-
"""Find the html-to-markdown CLI binary.
|
|
10
|
+
"""Find the html-to-markdown CLI binary in expected locations.
|
|
17
11
|
|
|
18
12
|
Returns:
|
|
19
|
-
Path to the CLI binary
|
|
13
|
+
Path to the CLI binary.
|
|
20
14
|
|
|
21
15
|
Raises:
|
|
22
|
-
FileNotFoundError: If the binary cannot be found
|
|
16
|
+
FileNotFoundError: If the binary cannot be found.
|
|
23
17
|
"""
|
|
24
18
|
binary_name = "html-to-markdown.exe" if sys.platform == "win32" else "html-to-markdown"
|
|
25
19
|
|
|
@@ -38,28 +32,22 @@ def find_cli_binary() -> Path:
|
|
|
38
32
|
|
|
39
33
|
|
|
40
34
|
def translate_v1_args_to_v2(argv: list[str]) -> list[str]:
|
|
41
|
-
"""Translate v1 CLI arguments to v2
|
|
42
|
-
|
|
43
|
-
This handles differences between the v1 Python CLI and v2 Rust CLI:
|
|
44
|
-
- Boolean flags: v1 used --flag/--no-flag, v2 uses presence/absence
|
|
45
|
-
- Flag name changes: --preprocess-html -> --preprocess
|
|
46
|
-
- Unsupported flags: --strip, --convert (raise errors)
|
|
35
|
+
"""Translate v1 CLI arguments to v2 format.
|
|
47
36
|
|
|
48
37
|
Args:
|
|
49
|
-
argv:
|
|
38
|
+
argv: List of command-line arguments.
|
|
50
39
|
|
|
51
40
|
Returns:
|
|
52
|
-
Translated
|
|
41
|
+
Translated list of arguments compatible with v2.
|
|
53
42
|
|
|
54
43
|
Raises:
|
|
55
|
-
RemovedV1FlagError: If a v1 flag has been removed in v2
|
|
44
|
+
RemovedV1FlagError: If a v1 flag has been removed in v2.
|
|
56
45
|
"""
|
|
57
46
|
translated = []
|
|
58
47
|
i = 0
|
|
59
48
|
while i < len(argv):
|
|
60
49
|
arg = argv[i]
|
|
61
50
|
|
|
62
|
-
# Error on removed/unsupported v1 features
|
|
63
51
|
if arg in ("--strip", "--convert"):
|
|
64
52
|
raise RemovedV1FlagError(
|
|
65
53
|
flag=arg,
|
|
@@ -67,8 +55,6 @@ def translate_v1_args_to_v2(argv: list[str]) -> list[str]:
|
|
|
67
55
|
migration="Remove this flag from your command. The feature is no longer available.",
|
|
68
56
|
)
|
|
69
57
|
|
|
70
|
-
# These flags are redundant (match v2 defaults) but we accept them for v1 compatibility
|
|
71
|
-
# Silently skip - Rust CLI defaults match these flags
|
|
72
58
|
if arg in (
|
|
73
59
|
"--no-escape-asterisks",
|
|
74
60
|
"--no-escape-underscores",
|
|
@@ -77,14 +63,21 @@ def translate_v1_args_to_v2(argv: list[str]) -> list[str]:
|
|
|
77
63
|
"--no-autolinks",
|
|
78
64
|
"--no-extract-metadata",
|
|
79
65
|
):
|
|
80
|
-
|
|
81
|
-
|
|
66
|
+
warnings.warn(
|
|
67
|
+
f"'{arg}' is deprecated and redundant in v2. "
|
|
68
|
+
f"These options are now disabled by default. Remove this flag.",
|
|
69
|
+
DeprecationWarning,
|
|
70
|
+
stacklevel=2,
|
|
71
|
+
)
|
|
82
72
|
|
|
83
|
-
# Flag name translations
|
|
84
73
|
elif arg == "--preprocess-html":
|
|
74
|
+
warnings.warn(
|
|
75
|
+
"'--preprocess-html' is deprecated. Use '--preprocess' instead.",
|
|
76
|
+
DeprecationWarning,
|
|
77
|
+
stacklevel=2,
|
|
78
|
+
)
|
|
85
79
|
translated.append("--preprocess")
|
|
86
80
|
|
|
87
|
-
# Positive flags that should be passed through
|
|
88
81
|
elif arg in (
|
|
89
82
|
"--escape-asterisks",
|
|
90
83
|
"--escape-underscores",
|
|
@@ -95,7 +88,6 @@ def translate_v1_args_to_v2(argv: list[str]) -> list[str]:
|
|
|
95
88
|
):
|
|
96
89
|
translated.append(arg)
|
|
97
90
|
|
|
98
|
-
# All other args pass through unchanged
|
|
99
91
|
else:
|
|
100
92
|
translated.append(arg)
|
|
101
93
|
|
|
@@ -105,23 +97,21 @@ def translate_v1_args_to_v2(argv: list[str]) -> list[str]:
|
|
|
105
97
|
|
|
106
98
|
|
|
107
99
|
def main(argv: list[str]) -> str:
|
|
108
|
-
"""
|
|
100
|
+
"""Main entry point for the CLI proxy.
|
|
109
101
|
|
|
110
|
-
Translates v1
|
|
111
|
-
Exits with non-zero status on errors (FileNotFoundError, UnsupportedV1FeatureError, CLI errors).
|
|
102
|
+
Translates v1 arguments to v2 and invokes the native Rust CLI binary.
|
|
112
103
|
|
|
113
104
|
Args:
|
|
114
|
-
argv: Command
|
|
105
|
+
argv: Command-line arguments.
|
|
115
106
|
|
|
116
107
|
Returns:
|
|
117
|
-
|
|
108
|
+
Stdout from the CLI binary.
|
|
118
109
|
"""
|
|
119
110
|
cli_binary = find_cli_binary()
|
|
120
111
|
|
|
121
112
|
try:
|
|
122
113
|
translated_args = translate_v1_args_to_v2(argv)
|
|
123
114
|
except (RemovedV1FlagError, RedundantV1FlagError) as e:
|
|
124
|
-
# Format the error nicely for CLI users
|
|
125
115
|
sys.stderr.write(f"\n❌ Error: {e.flag}\n\n")
|
|
126
116
|
sys.stderr.write(f" {e.reason}\n\n")
|
|
127
117
|
sys.stderr.write(f" 💡 {e.migration}\n\n")
|
html_to_markdown/exceptions.py
CHANGED
|
@@ -1,10 +1,8 @@
|
|
|
1
|
-
"""Exception classes for html-to-markdown."""
|
|
2
|
-
|
|
3
1
|
from __future__ import annotations
|
|
4
2
|
|
|
5
3
|
|
|
6
4
|
class HtmlToMarkdownError(Exception):
|
|
7
|
-
"""Base exception for html-to-markdown errors."""
|
|
5
|
+
"""Base exception for all html-to-markdown errors."""
|
|
8
6
|
|
|
9
7
|
|
|
10
8
|
class MissingDependencyError(HtmlToMarkdownError):
|
|
@@ -22,7 +20,7 @@ class MissingDependencyError(HtmlToMarkdownError):
|
|
|
22
20
|
|
|
23
21
|
|
|
24
22
|
class InvalidParserError(HtmlToMarkdownError):
|
|
25
|
-
"""Raised when an invalid
|
|
23
|
+
"""Raised when an invalid parser is specified."""
|
|
26
24
|
|
|
27
25
|
def __init__(self, parser: str, available_parsers: list[str]) -> None:
|
|
28
26
|
self.parser = parser
|
|
@@ -33,14 +31,14 @@ class InvalidParserError(HtmlToMarkdownError):
|
|
|
33
31
|
|
|
34
32
|
|
|
35
33
|
class EmptyHtmlError(HtmlToMarkdownError):
|
|
36
|
-
"""Raised when
|
|
34
|
+
"""Raised when input HTML is empty."""
|
|
37
35
|
|
|
38
36
|
def __init__(self) -> None:
|
|
39
37
|
super().__init__("The input HTML is empty.")
|
|
40
38
|
|
|
41
39
|
|
|
42
40
|
class ConflictingOptionsError(HtmlToMarkdownError):
|
|
43
|
-
"""Raised when conflicting options are specified."""
|
|
41
|
+
"""Raised when conflicting configuration options are specified."""
|
|
44
42
|
|
|
45
43
|
def __init__(self, option1: str, option2: str) -> None:
|
|
46
44
|
self.option1 = option1
|
|
@@ -50,20 +48,14 @@ class ConflictingOptionsError(HtmlToMarkdownError):
|
|
|
50
48
|
|
|
51
49
|
|
|
52
50
|
class InvalidEncodingError(HtmlToMarkdownError):
|
|
53
|
-
"""Raised when an invalid encoding is specified."""
|
|
51
|
+
"""Raised when an invalid character encoding is specified."""
|
|
54
52
|
|
|
55
53
|
def __init__(self, encoding: str) -> None:
|
|
56
54
|
super().__init__(f"The specified encoding ({encoding}) is not valid.")
|
|
57
55
|
|
|
58
56
|
|
|
59
57
|
class UnsupportedV1FeatureError(HtmlToMarkdownError):
|
|
60
|
-
"""Raised when a v1 feature is not supported in v2.
|
|
61
|
-
|
|
62
|
-
Args:
|
|
63
|
-
flag: The CLI flag or feature that is not supported
|
|
64
|
-
reason: Why the feature is not supported
|
|
65
|
-
migration: How to migrate away from this feature
|
|
66
|
-
"""
|
|
58
|
+
"""Raised when a v1 feature is not supported in v2."""
|
|
67
59
|
|
|
68
60
|
def __init__(self, flag: str, reason: str, migration: str) -> None:
|
|
69
61
|
self.flag = flag
|
|
@@ -74,8 +66,8 @@ class UnsupportedV1FeatureError(HtmlToMarkdownError):
|
|
|
74
66
|
|
|
75
67
|
|
|
76
68
|
class RemovedV1FlagError(UnsupportedV1FeatureError):
|
|
77
|
-
"""Raised when a
|
|
69
|
+
"""Raised when a v1 flag has been removed in v2."""
|
|
78
70
|
|
|
79
71
|
|
|
80
72
|
class RedundantV1FlagError(UnsupportedV1FeatureError):
|
|
81
|
-
"""Raised when a v1 flag is redundant in v2
|
|
73
|
+
"""Raised when a v1 flag is redundant in v2."""
|
html_to_markdown/options.py
CHANGED
|
@@ -6,38 +6,7 @@ This module provides dataclass-based configuration for the v2 API.
|
|
|
6
6
|
from __future__ import annotations
|
|
7
7
|
|
|
8
8
|
from dataclasses import dataclass
|
|
9
|
-
from typing import
|
|
10
|
-
|
|
11
|
-
if TYPE_CHECKING:
|
|
12
|
-
from collections.abc import Callable
|
|
13
|
-
|
|
14
|
-
from bs4 import Tag
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
class ConverterFunction(Protocol):
|
|
18
|
-
"""Protocol for custom converter functions.
|
|
19
|
-
|
|
20
|
-
Converter functions receive keyword-only arguments including the HTML tag,
|
|
21
|
-
processed text content, and any conversion options needed.
|
|
22
|
-
|
|
23
|
-
Example:
|
|
24
|
-
>>> def custom_link_converter(*, tag: Tag, text: str, autolinks: bool, **kwargs: Any) -> str:
|
|
25
|
-
... href = tag.get("href", "")
|
|
26
|
-
... return f"[{text}]({href})"
|
|
27
|
-
"""
|
|
28
|
-
|
|
29
|
-
def __call__(self, *, tag: Tag, text: str, **kwargs: Any) -> str:
|
|
30
|
-
"""Convert an HTML element to Markdown.
|
|
31
|
-
|
|
32
|
-
Args:
|
|
33
|
-
tag: BeautifulSoup Tag object representing the HTML element
|
|
34
|
-
text: Processed text content of the element's children
|
|
35
|
-
**kwargs: Additional conversion options (varies by converter)
|
|
36
|
-
|
|
37
|
-
Returns:
|
|
38
|
-
Markdown string representation of the element
|
|
39
|
-
"""
|
|
40
|
-
...
|
|
9
|
+
from typing import Literal
|
|
41
10
|
|
|
42
11
|
|
|
43
12
|
@dataclass
|
|
@@ -87,8 +56,8 @@ class ConversionOptions:
|
|
|
87
56
|
code_language: str = ""
|
|
88
57
|
"""Default language for code blocks."""
|
|
89
58
|
|
|
90
|
-
|
|
91
|
-
"""
|
|
59
|
+
encoding: str = "utf-8"
|
|
60
|
+
"""Character encoding expected for the HTML input."""
|
|
92
61
|
|
|
93
62
|
autolinks: bool = True
|
|
94
63
|
"""Convert bare URLs to automatic links."""
|
|
@@ -102,15 +71,6 @@ class ConversionOptions:
|
|
|
102
71
|
br_in_tables: bool = False
|
|
103
72
|
"""Use <br> tags for line breaks in table cells instead of spaces."""
|
|
104
73
|
|
|
105
|
-
hocr_extract_tables: bool = True
|
|
106
|
-
"""Enable table extraction from hOCR (HTML-based OCR) documents."""
|
|
107
|
-
|
|
108
|
-
hocr_table_column_threshold: int = 50
|
|
109
|
-
"""Pixel threshold for detecting column boundaries in hOCR tables."""
|
|
110
|
-
|
|
111
|
-
hocr_table_row_threshold_ratio: float = 0.5
|
|
112
|
-
"""Row height ratio threshold for detecting row boundaries in hOCR tables."""
|
|
113
|
-
|
|
114
74
|
highlight_style: Literal["double-equal", "html", "bold"] = "double-equal"
|
|
115
75
|
"""Style for highlighting <mark> elements."""
|
|
116
76
|
|
|
@@ -129,9 +89,6 @@ class ConversionOptions:
|
|
|
129
89
|
wrap_width: int = 80
|
|
130
90
|
"""Column width for text wrapping."""
|
|
131
91
|
|
|
132
|
-
convert: set[str] | None = None
|
|
133
|
-
"""HTML tags to convert to Markdown (None = all supported tags). v1 compatibility only."""
|
|
134
|
-
|
|
135
92
|
strip_tags: set[str] | None = None
|
|
136
93
|
"""HTML tags to strip from output (output only text content, no markdown conversion)."""
|
|
137
94
|
|
|
@@ -150,9 +107,6 @@ class ConversionOptions:
|
|
|
150
107
|
code_block_style: Literal["indented", "backticks", "tildes"] = "backticks"
|
|
151
108
|
"""Style for code blocks: 'backticks' (```, better whitespace preservation), 'indented' (4 spaces), or 'tildes' (~~~). All are CommonMark compliant."""
|
|
152
109
|
|
|
153
|
-
custom_converters: dict[str, Callable[..., str]] | None = None
|
|
154
|
-
"""Custom converter functions for specific HTML elements."""
|
|
155
|
-
|
|
156
110
|
debug: bool = False
|
|
157
111
|
"""Enable debug mode with diagnostic warnings about unhandled elements and hOCR processing."""
|
|
158
112
|
|
|
@@ -182,30 +136,3 @@ class PreprocessingOptions:
|
|
|
182
136
|
|
|
183
137
|
remove_forms: bool = True
|
|
184
138
|
"""Remove form elements during preprocessing."""
|
|
185
|
-
|
|
186
|
-
excluded_navigation_classes: set[str] | None = None
|
|
187
|
-
"""Navigation class fragments to keep even when removing navigation."""
|
|
188
|
-
|
|
189
|
-
extra_navigation_classes: set[str] | None = None
|
|
190
|
-
"""Additional navigation class fragments to strip beyond defaults."""
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
@dataclass
|
|
194
|
-
class ParsingOptions:
|
|
195
|
-
"""HTML parsing configuration.
|
|
196
|
-
|
|
197
|
-
Example:
|
|
198
|
-
>>> options = ParsingOptions(
|
|
199
|
-
... encoding="utf-8",
|
|
200
|
-
... detect_encoding=True,
|
|
201
|
-
... )
|
|
202
|
-
"""
|
|
203
|
-
|
|
204
|
-
encoding: str = "utf-8"
|
|
205
|
-
"""Character encoding for decoding bytes input."""
|
|
206
|
-
|
|
207
|
-
detect_encoding: bool = False
|
|
208
|
-
"""Attempt to detect encoding from HTML (not yet implemented)."""
|
|
209
|
-
|
|
210
|
-
parser: str | None = None
|
|
211
|
-
"""HTML parser to use: 'html.parser', 'lxml', or 'html5lib' (None = auto)."""
|
html_to_markdown/v1_compat.py
CHANGED
|
@@ -1,21 +1,18 @@
|
|
|
1
1
|
"""V1 API compatibility layer.
|
|
2
2
|
|
|
3
3
|
Provides backward compatibility for the v1 convert_to_markdown API
|
|
4
|
-
by translating v1 kwargs to v2 ConversionOptions
|
|
4
|
+
by translating v1 kwargs to v2 ConversionOptions and PreprocessingOptions.
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
|
|
9
|
-
|
|
9
|
+
import warnings
|
|
10
10
|
|
|
11
|
-
|
|
12
|
-
from collections.abc import Iterator
|
|
13
|
-
|
|
14
|
-
from html_to_markdown import ConversionOptions, ParsingOptions, PreprocessingOptions
|
|
11
|
+
from html_to_markdown import ConversionOptions, PreprocessingOptions
|
|
15
12
|
from html_to_markdown import convert as convert_v2
|
|
16
13
|
|
|
17
14
|
|
|
18
|
-
def convert_to_markdown(
|
|
15
|
+
def convert_to_markdown(
|
|
19
16
|
html: str,
|
|
20
17
|
*,
|
|
21
18
|
heading_style: str = "underlined",
|
|
@@ -48,32 +45,72 @@ def convert_to_markdown( # noqa: D417
|
|
|
48
45
|
preprocessing_preset: str = "standard",
|
|
49
46
|
remove_navigation: bool = True,
|
|
50
47
|
remove_forms: bool = True,
|
|
51
|
-
parser: str = "html.parser",
|
|
52
48
|
source_encoding: str = "utf-8",
|
|
53
49
|
code_language_callback: object | None = None,
|
|
54
50
|
strip: list[str] | None = None,
|
|
55
51
|
convert: list[str] | None = None,
|
|
56
52
|
custom_converters: dict[str, object] | None = None,
|
|
57
53
|
) -> str:
|
|
58
|
-
"""Convert HTML to Markdown (v1 API
|
|
59
|
-
|
|
60
|
-
This function provides backward compatibility with the v1 API by accepting
|
|
61
|
-
the same kwargs and translating them to v2 ConversionOptions.
|
|
54
|
+
"""Convert HTML to Markdown (v1 compatibility API).
|
|
62
55
|
|
|
63
|
-
|
|
64
|
-
-
|
|
65
|
-
- convert: Removed in v2
|
|
66
|
-
- custom_converters: Not yet implemented in v2
|
|
56
|
+
This function provides backward compatibility with the v1 API by translating
|
|
57
|
+
v1-style keyword arguments to v2 ConversionOptions and PreprocessingOptions.
|
|
67
58
|
|
|
68
59
|
Args:
|
|
69
|
-
html: HTML string to convert
|
|
60
|
+
html: HTML string to convert.
|
|
61
|
+
heading_style: Style for headings (default: "underlined" for v1 compatibility).
|
|
62
|
+
list_indent_type: Type of indentation for lists.
|
|
63
|
+
list_indent_width: Number of spaces for list indentation (v1 default: 4).
|
|
64
|
+
bullets: Characters to use for unordered list bullets.
|
|
65
|
+
strong_em_symbol: Symbol for strong/emphasis formatting.
|
|
66
|
+
escape_asterisks: Escape asterisk characters (v1 default: True).
|
|
67
|
+
escape_underscores: Escape underscore characters (v1 default: True).
|
|
68
|
+
escape_misc: Escape miscellaneous Markdown characters (v1 default: True).
|
|
69
|
+
code_language: Default language for code blocks.
|
|
70
|
+
autolinks: Convert bare URLs to automatic links.
|
|
71
|
+
default_title: Add a default title if none exists.
|
|
72
|
+
br_in_tables: Use <br> tags for line breaks in table cells.
|
|
73
|
+
hocr_extract_tables: Deprecated - always True in v2.
|
|
74
|
+
hocr_table_column_threshold: Deprecated - uses built-in heuristics in v2.
|
|
75
|
+
hocr_table_row_threshold_ratio: Deprecated - uses built-in heuristics in v2.
|
|
76
|
+
highlight_style: Style for highlighting <mark> elements.
|
|
77
|
+
extract_metadata: Extract metadata from HTML head.
|
|
78
|
+
whitespace_mode: How to handle whitespace.
|
|
79
|
+
strip_newlines: Remove newlines from HTML before processing.
|
|
80
|
+
wrap: Enable text wrapping.
|
|
81
|
+
wrap_width: Column width for text wrapping.
|
|
82
|
+
convert_as_inline: Treat block elements as inline.
|
|
83
|
+
sub_symbol: Symbol for subscript text.
|
|
84
|
+
sup_symbol: Symbol for superscript text.
|
|
85
|
+
newline_style: Style for newlines.
|
|
86
|
+
keep_inline_images_in: Parent tag names where images should remain inline.
|
|
87
|
+
preprocess: Enable HTML preprocessing.
|
|
88
|
+
preprocessing_preset: Preprocessing aggressiveness level.
|
|
89
|
+
remove_navigation: Remove navigation elements during preprocessing.
|
|
90
|
+
remove_forms: Remove form elements during preprocessing.
|
|
91
|
+
source_encoding: Character encoding expected for the HTML input.
|
|
92
|
+
code_language_callback: Deprecated - not supported in v2.
|
|
93
|
+
strip: HTML tags to strip from output.
|
|
94
|
+
convert: Deprecated - not supported in v2.
|
|
95
|
+
custom_converters: Deprecated - not yet implemented in v2.
|
|
70
96
|
|
|
71
97
|
Returns:
|
|
72
|
-
Markdown string
|
|
98
|
+
Converted Markdown string.
|
|
73
99
|
|
|
74
100
|
Raises:
|
|
75
|
-
NotImplementedError: If
|
|
101
|
+
NotImplementedError: If deprecated v1 features are used.
|
|
102
|
+
|
|
103
|
+
.. deprecated:: 2.0
|
|
104
|
+
Use :func:`html_to_markdown.convert` with :class:`ConversionOptions` instead.
|
|
105
|
+
The v1 API is provided for backward compatibility only.
|
|
76
106
|
"""
|
|
107
|
+
warnings.warn(
|
|
108
|
+
"convert_to_markdown() is deprecated and will be removed in v3.0. "
|
|
109
|
+
"Use html_to_markdown.convert() with ConversionOptions instead.",
|
|
110
|
+
DeprecationWarning,
|
|
111
|
+
stacklevel=2,
|
|
112
|
+
)
|
|
113
|
+
|
|
77
114
|
if code_language_callback is not None:
|
|
78
115
|
raise NotImplementedError(
|
|
79
116
|
"code_language_callback was removed in v2. Use the code_language option to set a default language."
|
|
@@ -82,9 +119,17 @@ def convert_to_markdown( # noqa: D417
|
|
|
82
119
|
raise NotImplementedError("convert option was removed in v2. All supported tags are converted by default.")
|
|
83
120
|
if custom_converters is not None:
|
|
84
121
|
raise NotImplementedError("custom_converters is not yet implemented in v2")
|
|
122
|
+
if not hocr_extract_tables:
|
|
123
|
+
raise NotImplementedError(
|
|
124
|
+
"hocr_extract_tables toggle was removed in v2. hOCR tables are always reconstructed when detected."
|
|
125
|
+
)
|
|
126
|
+
if hocr_table_column_threshold != 50 or hocr_table_row_threshold_ratio != 0.5:
|
|
127
|
+
raise NotImplementedError(
|
|
128
|
+
"hOCR table threshold overrides were removed in v2. Table reconstruction now uses built-in heuristics."
|
|
129
|
+
)
|
|
85
130
|
|
|
86
|
-
#
|
|
87
|
-
#
|
|
131
|
+
# ~keep: v1 used indented code blocks by default, but switched to backticks when a language was set
|
|
132
|
+
# This maintains v1 behavior for backward compatibility
|
|
88
133
|
code_block_style = "backticks" if code_language else "indented"
|
|
89
134
|
|
|
90
135
|
options = ConversionOptions(
|
|
@@ -101,9 +146,6 @@ def convert_to_markdown( # noqa: D417
|
|
|
101
146
|
autolinks=autolinks,
|
|
102
147
|
default_title=default_title,
|
|
103
148
|
br_in_tables=br_in_tables,
|
|
104
|
-
hocr_extract_tables=hocr_extract_tables,
|
|
105
|
-
hocr_table_column_threshold=hocr_table_column_threshold,
|
|
106
|
-
hocr_table_row_threshold_ratio=hocr_table_row_threshold_ratio,
|
|
107
149
|
highlight_style=highlight_style, # type: ignore[arg-type]
|
|
108
150
|
extract_metadata=extract_metadata,
|
|
109
151
|
whitespace_mode=whitespace_mode, # type: ignore[arg-type]
|
|
@@ -125,37 +167,23 @@ def convert_to_markdown( # noqa: D417
|
|
|
125
167
|
remove_forms=remove_forms,
|
|
126
168
|
)
|
|
127
169
|
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
parser=parser,
|
|
131
|
-
)
|
|
132
|
-
|
|
133
|
-
return convert_v2(html, options, preprocessing, parsing)
|
|
134
|
-
|
|
170
|
+
options.encoding = source_encoding
|
|
171
|
+
return convert_v2(html, options, preprocessing)
|
|
135
172
|
|
|
136
|
-
def convert_to_markdown_stream( # noqa: D417
|
|
137
|
-
html: str,
|
|
138
|
-
*,
|
|
139
|
-
chunk_size: int = 4096,
|
|
140
|
-
**kwargs: object,
|
|
141
|
-
) -> Iterator[str]:
|
|
142
|
-
"""Stream HTML to Markdown conversion (v1 API).
|
|
143
173
|
|
|
144
|
-
|
|
174
|
+
def markdownify(*args: object, **kwargs: object) -> str:
|
|
175
|
+
"""Alias for convert_to_markdown (deprecated).
|
|
145
176
|
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
chunk_size: Size of chunks to yield (not used in v2)
|
|
149
|
-
|
|
150
|
-
Raises:
|
|
151
|
-
NotImplementedError: Streaming was removed in v2
|
|
177
|
+
.. deprecated:: 2.0
|
|
178
|
+
Use html_to_markdown.convert() instead.
|
|
152
179
|
"""
|
|
153
|
-
|
|
154
|
-
"
|
|
155
|
-
"Use
|
|
180
|
+
warnings.warn(
|
|
181
|
+
"markdownify() is deprecated and will be removed in v3.0. "
|
|
182
|
+
"Use html_to_markdown.convert() with ConversionOptions instead.",
|
|
183
|
+
DeprecationWarning,
|
|
184
|
+
stacklevel=2,
|
|
156
185
|
)
|
|
186
|
+
return convert_to_markdown(*args, **kwargs) # type: ignore[arg-type]
|
|
157
187
|
|
|
158
188
|
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
__all__ = ["convert_to_markdown", "convert_to_markdown_stream", "markdownify"]
|
|
189
|
+
__all__ = ["convert_to_markdown", "markdownify"]
|
|
Binary file
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: html-to-markdown
|
|
3
|
+
Version: 2.1.2
|
|
4
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
5
|
+
Classifier: Environment :: Console
|
|
6
|
+
Classifier: Intended Audience :: Developers
|
|
7
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
8
|
+
Classifier: Operating System :: OS Independent
|
|
9
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
15
|
+
Classifier: Programming Language :: Rust
|
|
16
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
17
|
+
Classifier: Topic :: Text Processing
|
|
18
|
+
Classifier: Topic :: Text Processing :: Markup
|
|
19
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
20
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
21
|
+
Classifier: Typing :: Typed
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Summary: High-performance HTML to Markdown converter powered by Rust with a clean Python API
|
|
24
|
+
Keywords: cli-tool,converter,html,html2markdown,html5,markdown,markup,parser,rust,text-processing
|
|
25
|
+
Home-Page: https://github.com/Goldziher/html-to-markdown
|
|
26
|
+
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
27
|
+
Requires-Python: >=3.10
|
|
28
|
+
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
29
|
+
Project-URL: Changelog, https://github.com/Goldziher/html-to-markdown/releases
|
|
30
|
+
Project-URL: Homepage, https://github.com/Goldziher/html-to-markdown
|
|
31
|
+
Project-URL: Issues, https://github.com/Goldziher/html-to-markdown/issues
|
|
32
|
+
Project-URL: Repository, https://github.com/Goldziher/html-to-markdown.git
|
|
33
|
+
|
|
34
|
+
# html-to-markdown
|
|
35
|
+
|
|
36
|
+
High-performance HTML to Markdown converter with a clean Python API (powered by a Rust core). Wheels are published for Linux, macOS, and Windows.
|
|
37
|
+
|
|
38
|
+
[](https://github.com/Goldziher/html-to-markdown)
|
|
39
|
+
[](https://github.com/Goldziher/html-to-markdown)
|
|
40
|
+
[](https://github.com/Goldziher/html-to-markdown)
|
|
41
|
+
[](https://github.com/Goldziher/html-to-markdown/blob/main/LICENSE)
|
|
42
|
+
|
|
43
|
+
## Installation
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
pip install html-to-markdown
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Performance Snapshot
|
|
50
|
+
|
|
51
|
+
Apple M4 • Real Wikipedia documents • `convert()` (Python)
|
|
52
|
+
|
|
53
|
+
| Document | Size | Latency | Throughput | Docs/sec |
|
|
54
|
+
| ------------------- | ----- | ------- | ---------- | -------- |
|
|
55
|
+
| Lists (Timeline) | 129KB | 0.62ms | 208 MB/s | 1,613 |
|
|
56
|
+
| Tables (Countries) | 360KB | 2.02ms | 178 MB/s | 495 |
|
|
57
|
+
| Mixed (Python wiki) | 656KB | 4.56ms | 144 MB/s | 219 |
|
|
58
|
+
|
|
59
|
+
> V1 averaged ~2.5 MB/s (Python/BeautifulSoup). V2’s Rust engine delivers 60–80× higher throughput.
|
|
60
|
+
|
|
61
|
+
## Quick Start
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
from html_to_markdown import convert
|
|
65
|
+
|
|
66
|
+
html = """
|
|
67
|
+
<h1>Welcome</h1>
|
|
68
|
+
<p>This is <strong>fast</strong> Rust-powered conversion!</p>
|
|
69
|
+
<ul>
|
|
70
|
+
<li>Blazing fast</li>
|
|
71
|
+
<li>Type safe</li>
|
|
72
|
+
<li>Easy to use</li>
|
|
73
|
+
</ul>
|
|
74
|
+
"""
|
|
75
|
+
|
|
76
|
+
markdown = convert(html)
|
|
77
|
+
print(markdown)
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Configuration (v2 API)
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
from html_to_markdown import ConversionOptions, convert
|
|
84
|
+
|
|
85
|
+
options = ConversionOptions(
|
|
86
|
+
heading_style="atx",
|
|
87
|
+
list_indent_width=2,
|
|
88
|
+
bullets="*+-",
|
|
89
|
+
)
|
|
90
|
+
options.escape_asterisks = True
|
|
91
|
+
options.code_language = "python"
|
|
92
|
+
options.extract_metadata = True
|
|
93
|
+
|
|
94
|
+
markdown = convert(html, options)
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### HTML Preprocessing
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
from html_to_markdown import ConversionOptions, PreprocessingOptions, convert
|
|
101
|
+
|
|
102
|
+
options = ConversionOptions(
|
|
103
|
+
preprocessing=PreprocessingOptions(enabled=True, preset="aggressive"),
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
markdown = convert(scraped_html, options)
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### Inline Image Extraction
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
from html_to_markdown import InlineImageConfig, convert_with_inline_images
|
|
113
|
+
|
|
114
|
+
markdown, inline_images, warnings = convert_with_inline_images(
|
|
115
|
+
'<p><img src="data:image/png;base64,...==" alt="Pixel" width="1" height="1"></p>',
|
|
116
|
+
image_config=InlineImageConfig(max_decoded_size_bytes=1024, infer_dimensions=True),
|
|
117
|
+
)
|
|
118
|
+
|
|
119
|
+
if inline_images:
|
|
120
|
+
first = inline_images[0]
|
|
121
|
+
print(first["format"], first["dimensions"], first["attributes"]) # e.g. "png", (1, 1), {"width": "1"}
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
Each inline image is returned as a typed dictionary (`bytes` payload, metadata, and relevant HTML attributes). Warnings are human-readable skip reasons.
|
|
125
|
+
|
|
126
|
+
### hOCR (HTML OCR) Support
|
|
127
|
+
|
|
128
|
+
```python
|
|
129
|
+
from html_to_markdown import ConversionOptions, convert
|
|
130
|
+
|
|
131
|
+
# Default: emit structured Markdown directly
|
|
132
|
+
markdown = convert(hocr_html)
|
|
133
|
+
|
|
134
|
+
# hOCR documents are detected automatically; tables are reconstructed without extra configuration.
|
|
135
|
+
markdown = convert(hocr_html)
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
## CLI (same engine)
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
pipx install html-to-markdown # or: pip install html-to-markdown
|
|
142
|
+
|
|
143
|
+
html-to-markdown page.html > page.md
|
|
144
|
+
cat page.html | html-to-markdown --heading-style atx > page.md
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
## API Surface
|
|
148
|
+
|
|
149
|
+
### `ConversionOptions`
|
|
150
|
+
|
|
151
|
+
Key fields (see docstring for full matrix):
|
|
152
|
+
|
|
153
|
+
- `heading_style`: `"underlined" | "atx" | "atx_closed"`
|
|
154
|
+
- `list_indent_width`: spaces per indent level (default 2)
|
|
155
|
+
- `bullets`: cycle of bullet characters (`"*+-"`)
|
|
156
|
+
- `strong_em_symbol`: `"*"` or `"_"`
|
|
157
|
+
- `code_language`: default fenced code block language
|
|
158
|
+
- `wrap`, `wrap_width`: wrap Markdown output
|
|
159
|
+
- `strip_tags`: remove specific HTML tags
|
|
160
|
+
- `preprocessing`: `PreprocessingOptions`
|
|
161
|
+
- `encoding`: input character encoding (informational)
|
|
162
|
+
|
|
163
|
+
### `PreprocessingOptions`
|
|
164
|
+
|
|
165
|
+
- `enabled`: enable HTML sanitisation
|
|
166
|
+
- `preset`: `"minimal" | "standard" | "aggressive"`
|
|
167
|
+
- `remove_navigation`, `remove_forms`
|
|
168
|
+
|
|
169
|
+
### `InlineImageConfig`
|
|
170
|
+
|
|
171
|
+
- `max_decoded_size_bytes`: reject larger payloads
|
|
172
|
+
- `filename_prefix`: generated name prefix (`embedded_image` default)
|
|
173
|
+
- `capture_svg`: collect inline `<svg>` (default `True`)
|
|
174
|
+
- `infer_dimensions`: decode raster images to obtain dimensions (default `False`)
|
|
175
|
+
|
|
176
|
+
## v1 Compatibility
|
|
177
|
+
|
|
178
|
+
- **Performance**: V1 averaged ~2.5 MB/s; V2 sustains 150–210 MB/s with identical Markdown output.
|
|
179
|
+
- **Compat shim**: `html_to_markdown.v1_compat` exposes `convert_to_markdown` and `markdownify` to ease migration; `convert_to_markdown_stream` is no longer exported from the package root because the streaming API was removed in v2. Keyword mappings are listed in the [changelog](CHANGELOG.md#v200).
|
|
180
|
+
- **CLI**: The Rust CLI replaces the Python script. New flags are documented via `html-to-markdown --help`.
|
|
181
|
+
- **Removed options**: `code_language_callback`, `strip`, and streaming APIs were removed; use `ConversionOptions`, `PreprocessingOptions`, and the inline-image helpers instead.
|
|
182
|
+
|
|
183
|
+
## Links
|
|
184
|
+
|
|
185
|
+
- GitHub: [https://github.com/Goldziher/html-to-markdown](https://github.com/Goldziher/html-to-markdown)
|
|
186
|
+
- Discord: [https://discord.gg/pXxagNK2zN](https://discord.gg/pXxagNK2zN)
|
|
187
|
+
- Kreuzberg ecosystem: [https://kreuzberg.dev](https://kreuzberg.dev)
|
|
188
|
+
|
|
189
|
+
## License
|
|
190
|
+
|
|
191
|
+
MIT License – see [LICENSE](https://github.com/Goldziher/html-to-markdown/blob/main/LICENSE).
|
|
192
|
+
|
|
193
|
+
## Support
|
|
194
|
+
|
|
195
|
+
If you find this library useful, consider [sponsoring the project](https://github.com/sponsors/Goldziher).
|
|
196
|
+
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
html_to_markdown-2.1.2.data/scripts/html-to-markdown,sha256=7PFfHn91sQQL-AWpzh5gBRz0xVbizJBLYQs4izr24yc,3784640
|
|
2
|
+
html_to_markdown-2.1.2.dist-info/RECORD,,
|
|
3
|
+
html_to_markdown-2.1.2.dist-info/WHEEL,sha256=HtAbUhtjhH1WdiDuIy2CapdoAiKCwe6bij_Tlxr1lEg,131
|
|
4
|
+
html_to_markdown-2.1.2.dist-info/METADATA,sha256=W-yMCoN32dNA4ggXeZLY1RazbepXMmuE2yNVxkmUizQ,7071
|
|
5
|
+
html_to_markdown-2.1.2.dist-info/licenses/LICENSE,sha256=oQvPC-0UWvfg0WaeUBe11OJMtX60An-TW1ev_oaAA0k,1086
|
|
6
|
+
html_to_markdown/options.py,sha256=oV-_GFEKuL3RBu27RD1AhVruEh-bNuY3c8ATRbzcos0,4802
|
|
7
|
+
html_to_markdown/_html_to_markdown.abi3.so,sha256=igb9iD5dR4jf2qpiaLG0IYvNz_gzQ7d0yx5ofdNK6Sg,3618704
|
|
8
|
+
html_to_markdown/__init__.py,sha256=-HSsEKPPjp08ksh9aZi3xwdTE9-kNvplMlG2npMPVuI,1149
|
|
9
|
+
html_to_markdown/api.py,sha256=HuM6RZg064VxrTvwcY-OmraS-hsGM9Bt1tIaM0_w7F8,2727
|
|
10
|
+
html_to_markdown/_rust.pyi,sha256=An3Wlvedlr_2XgzqmXulLi5AzMx3HTqOJWH11M5cgcY,2026
|
|
11
|
+
html_to_markdown/v1_compat.py,sha256=VQq1wv8OedkESpCFaUpaSUBh6vJNkByylVUbY6EPIZ8,7856
|
|
12
|
+
html_to_markdown/cli.py,sha256=Rn-s3FZPea1jgCJtDzH_TFvOEiA_uZFVfgjhr6xyL_g,64
|
|
13
|
+
html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
14
|
+
html_to_markdown/exceptions.py,sha256=aTASOzbywgfqOYjlw18ZkOWSxKff4EbUbmMua_73TGA,2370
|
|
15
|
+
html_to_markdown/cli_proxy.py,sha256=MbDRZdmQMCDI9cruy1vifc__FsjNPRdvBXKFU9GaAZE,3695
|
|
16
|
+
html_to_markdown/__main__.py,sha256=3Ic_EbOt2h6W88q084pkz5IKU6iY5z_woBygH6u9aw0,327
|
|
17
|
+
html_to_markdown/bin/html-to-markdown,sha256=7PFfHn91sQQL-AWpzh5gBRz0xVbizJBLYQs4izr24yc,3784640
|
|
@@ -1,243 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: html-to-markdown
|
|
3
|
-
Version: 2.0.1
|
|
4
|
-
Classifier: Development Status :: 5 - Production/Stable
|
|
5
|
-
Classifier: Environment :: Console
|
|
6
|
-
Classifier: Intended Audience :: Developers
|
|
7
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
8
|
-
Classifier: Operating System :: OS Independent
|
|
9
|
-
Classifier: Programming Language :: Python :: 3 :: Only
|
|
10
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
11
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
-
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
-
Classifier: Programming Language :: Python :: 3.13
|
|
14
|
-
Classifier: Programming Language :: Rust
|
|
15
|
-
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
16
|
-
Classifier: Topic :: Text Processing
|
|
17
|
-
Classifier: Topic :: Text Processing :: Markup
|
|
18
|
-
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
19
|
-
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
20
|
-
Classifier: Typing :: Typed
|
|
21
|
-
License-File: LICENSE
|
|
22
|
-
Summary: High-performance HTML to Markdown converter powered by Rust with a clean Python API
|
|
23
|
-
Keywords: cli-tool,converter,html,html2markdown,html5,markdown,markup,parser,rust,text-processing
|
|
24
|
-
Home-Page: https://github.com/Goldziher/html-to-markdown
|
|
25
|
-
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
26
|
-
Requires-Python: >=3.10
|
|
27
|
-
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
28
|
-
Project-URL: Changelog, https://github.com/Goldziher/html-to-markdown/releases
|
|
29
|
-
Project-URL: Homepage, https://github.com/Goldziher/html-to-markdown
|
|
30
|
-
Project-URL: Issues, https://github.com/Goldziher/html-to-markdown/issues
|
|
31
|
-
Project-URL: Repository, https://github.com/Goldziher/html-to-markdown.git
|
|
32
|
-
|
|
33
|
-
# html-to-markdown
|
|
34
|
-
|
|
35
|
-
High-performance HTML to Markdown converter powered by Rust with a clean Python API. Available via PyPI with pre-built wheels for all major platforms.
|
|
36
|
-
|
|
37
|
-
[](https://pypi.org/project/html-to-markdown/)
|
|
38
|
-
[](https://crates.io/crates/html-to-markdown-rs)
|
|
39
|
-
[](https://pypi.org/project/html-to-markdown/)
|
|
40
|
-
[License: MIT](https://opensource.org/licenses/MIT)
|
|
41
|
-
[](https://discord.gg/pXxagNK2zN)
|
|
42
|
-
|
|
43
|
-
Part of the [Kreuzberg](https://kreuzberg.dev) ecosystem for document intelligence.
|
|
44
|
-
|
|
45
|
-
## Installation
|
|
46
|
-
|
|
47
|
-
```bash
|
|
48
|
-
pip install html-to-markdown
|
|
49
|
-
```
|
|
50
|
-
|
|
51
|
-
Pre-built wheels available for:
|
|
52
|
-
|
|
53
|
-
- **Linux**: x86_64, aarch64
|
|
54
|
-
- **macOS**: x86_64 (Intel), arm64 (Apple Silicon)
|
|
55
|
-
- **Windows**: x86_64
|
|
56
|
-
|
|
57
|
-
## ⚡ Performance
|
|
58
|
-
|
|
59
|
-
Real Wikipedia documents on Apple M4:
|
|
60
|
-
|
|
61
|
-
| Document | Size | Latency | Throughput | Docs/sec |
|
|
62
|
-
| ------------------- | ----- | ------- | ---------- | -------- |
|
|
63
|
-
| Lists (Timeline) | 129KB | 0.62ms | 208 MB/s | 1,613 |
|
|
64
|
-
| Tables (Countries) | 360KB | 2.02ms | 178 MB/s | 495 |
|
|
65
|
-
| Mixed (Python wiki) | 656KB | 4.56ms | 144 MB/s | 219 |
|
|
66
|
-
|
|
67
|
-
**19-30x faster** than pure Python implementations.
|
|
68
|
-
|
|
69
|
-
## Quick Start
|
|
70
|
-
|
|
71
|
-
```python
|
|
72
|
-
from html_to_markdown import convert_to_markdown
|
|
73
|
-
|
|
74
|
-
html = """
|
|
75
|
-
<h1>Welcome</h1>
|
|
76
|
-
<p>This is <strong>fast</strong> Rust-powered conversion!</p>
|
|
77
|
-
<ul>
|
|
78
|
-
<li>Blazing fast</li>
|
|
79
|
-
<li>Type safe</li>
|
|
80
|
-
<li>Easy to use</li>
|
|
81
|
-
</ul>
|
|
82
|
-
"""
|
|
83
|
-
|
|
84
|
-
markdown = convert_to_markdown(html)
|
|
85
|
-
print(markdown)
|
|
86
|
-
```
|
|
87
|
-
|
|
88
|
-
Output:
|
|
89
|
-
|
|
90
|
-
```markdown
|
|
91
|
-
# Welcome
|
|
92
|
-
|
|
93
|
-
This is **fast** Rust-powered conversion!
|
|
94
|
-
|
|
95
|
-
- Blazing fast
|
|
96
|
-
- Type safe
|
|
97
|
-
- Easy to use
|
|
98
|
-
```
|
|
99
|
-
|
|
100
|
-
## Configuration
|
|
101
|
-
|
|
102
|
-
```python
|
|
103
|
-
from html_to_markdown import convert_to_markdown
|
|
104
|
-
|
|
105
|
-
markdown = convert_to_markdown(
|
|
106
|
-
html,
|
|
107
|
-
heading_style="atx", # "atx", "atx_closed", "underlined"
|
|
108
|
-
list_indent_width=2, # Discord/Slack: use 2
|
|
109
|
-
bullets="*+-", # Bullet characters
|
|
110
|
-
strong_em_symbol="*", # "*" or "_"
|
|
111
|
-
escape_asterisks=True, # Escape * in text
|
|
112
|
-
code_language="python", # Default code block language
|
|
113
|
-
extract_metadata=True, # Extract HTML metadata
|
|
114
|
-
)
|
|
115
|
-
```
|
|
116
|
-
|
|
117
|
-
### HTML Preprocessing
|
|
118
|
-
|
|
119
|
-
Clean web-scraped HTML before conversion:
|
|
120
|
-
|
|
121
|
-
```python
|
|
122
|
-
from html_to_markdown import convert_to_markdown
|
|
123
|
-
|
|
124
|
-
markdown = convert_to_markdown(
|
|
125
|
-
scraped_html,
|
|
126
|
-
preprocess=True,
|
|
127
|
-
preprocessing_preset="aggressive", # "minimal", "standard", "aggressive"
|
|
128
|
-
)
|
|
129
|
-
```
|
|
130
|
-
|
|
131
|
-
## Features
|
|
132
|
-
|
|
133
|
-
- **🚀 Blazing Fast**: Pure Rust core with ultra-fast `tl` HTML parser
|
|
134
|
-
- **🐍 Type Safe**: Full type hints and `.pyi` stubs for excellent IDE support
|
|
135
|
-
- **📊 hOCR 1.2 Compliant**: Full support for all 40+ elements and 20+ properties
|
|
136
|
-
- **📝 CommonMark Compliant**: Follows CommonMark specification for list formatting
|
|
137
|
-
- **🌍 Cross-Platform**: Pre-built wheels for Linux, macOS, and Windows
|
|
138
|
-
- **✅ Well-Tested**: 900+ tests with dual Python + Rust coverage
|
|
139
|
-
- **🔧 Zero Dependencies**: No BeautifulSoup or lxml required
|
|
140
|
-
|
|
141
|
-
## hOCR 1.2 Support
|
|
142
|
-
|
|
143
|
-
Complete hOCR 1.2 specification compliance with support for all elements, properties, and metadata:
|
|
144
|
-
|
|
145
|
-
```python
|
|
146
|
-
from html_to_markdown import convert_to_markdown
|
|
147
|
-
|
|
148
|
-
# Option 1: Document structure extraction (NEW in v2)
|
|
149
|
-
# Extracts all hOCR elements and converts to structured markdown
|
|
150
|
-
markdown = convert_to_markdown(hocr_html)
|
|
151
|
-
|
|
152
|
-
# Option 2: Legacy table extraction (spatial reconstruction)
|
|
153
|
-
# Reconstructs tables from word bounding boxes
|
|
154
|
-
markdown = convert_to_markdown(
|
|
155
|
-
hocr_html,
|
|
156
|
-
hocr_extract_tables=True,
|
|
157
|
-
hocr_table_column_threshold=50,
|
|
158
|
-
hocr_table_row_threshold_ratio=0.5,
|
|
159
|
-
)
|
|
160
|
-
```
|
|
161
|
-
|
|
162
|
-
**Full hOCR 1.2 Spec Coverage:**
|
|
163
|
-
|
|
164
|
-
- ✅ **All 40 Element Types** - Logical structure, typesetting, floats, inline, engine-specific
|
|
165
|
-
- ✅ **All 20+ Properties** - bbox, baseline, textangle, poly, x_wconf, x_font, x_fsize, and more
|
|
166
|
-
- ✅ **All 5 Metadata Fields** - ocr-system, ocr-capabilities, ocr-number-of-pages, ocr-langs, ocr-scripts
|
|
167
|
-
|
|
168
|
-
## Configuration Reference
|
|
169
|
-
|
|
170
|
-
### ConversionOptions
|
|
171
|
-
|
|
172
|
-
| Option | Type | Default | Description |
|
|
173
|
-
| -------------------------------- | ----- | ------------- | ----------------------------------------------------------------------- |
|
|
174
|
-
| `heading_style` | str | `"atx"` | Heading format: `"atx"` (#), `"atx_closed"` (# #), `"underlined"` (===) |
|
|
175
|
-
| `list_indent_width` | int | `2` | Spaces per list indent level (CommonMark: 2) |
|
|
176
|
-
| `list_indent_type` | str | `"spaces"` | `"spaces"` or `"tabs"` |
|
|
177
|
-
| `bullets` | str | `"*+-"` | Bullet chars for unordered lists (cycles through levels) |
|
|
178
|
-
| `strong_em_symbol` | str | `"*"` | Symbol for bold/italic: `"*"` or `"_"` |
|
|
179
|
-
| `escape_asterisks` | bool | `True` | Escape `*` in text |
|
|
180
|
-
| `escape_underscores` | bool | `True` | Escape `_` in text |
|
|
181
|
-
| `code_language` | str | `""` | Default language for code blocks |
|
|
182
|
-
| `code_block_style` | str | `"backticks"` | `"indented"` (4 spaces), `"backticks"` (\`\`\`), `"tildes"` (\~~~) |
|
|
183
|
-
| `extract_metadata` | bool | `True` | Extract HTML metadata as comment |
|
|
184
|
-
| `hocr_extract_tables` | bool | `True` | Enable hOCR table extraction |
|
|
185
|
-
| `hocr_table_column_threshold` | int | `50` | Column detection threshold (pixels) |
|
|
186
|
-
| `hocr_table_row_threshold_ratio` | float | `0.5` | Row grouping threshold ratio |
|
|
187
|
-
|
|
188
|
-
### Preprocessing Options
|
|
189
|
-
|
|
190
|
-
| Option | Type | Default | Description |
|
|
191
|
-
| ---------------------- | ---- | ------------ | ----------------------------------------- |
|
|
192
|
-
| `preprocess` | bool | `False` | Enable HTML preprocessing |
|
|
193
|
-
| `preprocessing_preset` | str | `"standard"` | `"minimal"`, `"standard"`, `"aggressive"` |
|
|
194
|
-
|
|
195
|
-
## CLI Tool
|
|
196
|
-
|
|
197
|
-
A native Rust CLI binary is also available:
|
|
198
|
-
|
|
199
|
-
```bash
|
|
200
|
-
# Install via pipx (recommended for CLI tools)
|
|
201
|
-
pipx install html-to-markdown
|
|
202
|
-
|
|
203
|
-
# Or install with pip
|
|
204
|
-
pip install html-to-markdown
|
|
205
|
-
|
|
206
|
-
# Use the CLI
|
|
207
|
-
html-to-markdown input.html > output.md
|
|
208
|
-
echo "<h1>Test</h1>" | html-to-markdown
|
|
209
|
-
```
|
|
210
|
-
|
|
211
|
-
**For Rust library usage and comprehensive documentation**, see the [GitHub repository](https://github.com/Goldziher/html-to-markdown).
|
|
212
|
-
|
|
213
|
-
## Upgrading from v1.x
|
|
214
|
-
|
|
215
|
-
All v1 code works without changes. v2 is a complete Rust rewrite with **19-30x performance improvements**:
|
|
216
|
-
|
|
217
|
-
**What Changed:**
|
|
218
|
-
|
|
219
|
-
- Complete Rust rewrite using `tl` HTML parser
|
|
220
|
-
- CommonMark-compliant defaults (2-space indents, minimal escaping, ATX headings)
|
|
221
|
-
- No BeautifulSoup or lxml dependencies
|
|
222
|
-
|
|
223
|
-
**Removed Features:**
|
|
224
|
-
|
|
225
|
-
- `code_language_callback` - use `code_language` for default language
|
|
226
|
-
- `strip` / `convert` options - use preprocessing instead
|
|
227
|
-
- `convert_to_markdown_stream()` - not supported in v2
|
|
228
|
-
|
|
229
|
-
## Links
|
|
230
|
-
|
|
231
|
-
- **GitHub Repository**: [https://github.com/Goldziher/html-to-markdown](https://github.com/Goldziher/html-to-markdown)
|
|
232
|
-
- **Rust Crate**: [https://crates.io/crates/html-to-markdown-rs](https://crates.io/crates/html-to-markdown-rs)
|
|
233
|
-
- **Discord Community**: [https://discord.gg/pXxagNK2zN](https://discord.gg/pXxagNK2zN)
|
|
234
|
-
- **Kreuzberg Ecosystem**: [https://kreuzberg.dev](https://kreuzberg.dev)
|
|
235
|
-
|
|
236
|
-
## License
|
|
237
|
-
|
|
238
|
-
MIT License - see [LICENSE](https://github.com/Goldziher/html-to-markdown/blob/main/LICENSE) for details.
|
|
239
|
-
|
|
240
|
-
## Support
|
|
241
|
-
|
|
242
|
-
If you find this library useful, consider [sponsoring the project](https://github.com/sponsors/Goldziher).
|
|
243
|
-
|
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
html_to_markdown-2.0.1.data/scripts/html-to-markdown,sha256=jhtwKs5YAVs1Vkx4e1hHuSfH3rLNRP0JyJC7DCsGr_I,3734448
|
|
2
|
-
html_to_markdown-2.0.1.dist-info/RECORD,,
|
|
3
|
-
html_to_markdown-2.0.1.dist-info/WHEEL,sha256=HtAbUhtjhH1WdiDuIy2CapdoAiKCwe6bij_Tlxr1lEg,131
|
|
4
|
-
html_to_markdown-2.0.1.dist-info/METADATA,sha256=n-MONvlBJmSDWmejjTWLwzIV-PzlbWh1hWDLywyv9oc,9847
|
|
5
|
-
html_to_markdown-2.0.1.dist-info/licenses/LICENSE,sha256=oQvPC-0UWvfg0WaeUBe11OJMtX60An-TW1ev_oaAA0k,1086
|
|
6
|
-
html_to_markdown/options.py,sha256=LXOUDqWwuvC-ryE118LttnATDO6-rlogYbbEGVfynhM,7241
|
|
7
|
-
html_to_markdown/_html_to_markdown.abi3.so,sha256=jvvpeF0sSDJ5bBeiR5INT91cIPIwGjc6CxRgT5dDp2g,2989792
|
|
8
|
-
html_to_markdown/__init__.py,sha256=0r7a2ruI_9xqj0Ko-5O4yCGrQ4Nga89qSUY4lTSyiDE,1266
|
|
9
|
-
html_to_markdown/api.py,sha256=0KgVWCDX-pWxrADxxxnqzk5_IhYc4fDxRytgeHttCKQ,3620
|
|
10
|
-
html_to_markdown/_rust.pyi,sha256=6GZ5fXfQ7VqglKB-kSZ395cysOdLIdQidDq6yoAHICA,2141
|
|
11
|
-
html_to_markdown/v1_compat.py,sha256=ThGk8g5rsZ_2gO1pA4_VThiLKuNhu4injClyv2pQmg4,5521
|
|
12
|
-
html_to_markdown/cli.py,sha256=OW6GZAR7adSOfqSaRGx5YqNU3xChAkwG98WHcRhL5ss,254
|
|
13
|
-
html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
14
|
-
html_to_markdown/exceptions.py,sha256=0Yrzndw1kSqN-HMnE34TjZzo21iihiD1TZG1k2dmpdI,2626
|
|
15
|
-
html_to_markdown/cli_proxy.py,sha256=nuBMky_q_ArDUKGgWW6Vrxf2JwOa_RgmUPH8qYBIcRQ,4298
|
|
16
|
-
html_to_markdown/__main__.py,sha256=3Ic_EbOt2h6W88q084pkz5IKU6iY5z_woBygH6u9aw0,327
|
|
17
|
-
html_to_markdown/bin/html-to-markdown,sha256=jhtwKs5YAVs1Vkx4e1hHuSfH3rLNRP0JyJC7DCsGr_I,3734448
|
|
File without changes
|
|
File without changes
|