html-to-markdown 2.14.4__cp310-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- html_to_markdown/__init__.py +62 -0
- html_to_markdown/__main__.py +16 -0
- html_to_markdown/_html_to_markdown.abi3.so +0 -0
- html_to_markdown/_html_to_markdown.pyi +196 -0
- html_to_markdown/api.py +195 -0
- html_to_markdown/bin/html-to-markdown +0 -0
- html_to_markdown/cli.py +3 -0
- html_to_markdown/cli_proxy.py +142 -0
- html_to_markdown/exceptions.py +73 -0
- html_to_markdown/options.py +144 -0
- html_to_markdown/py.typed +0 -0
- html_to_markdown/v1_compat.py +191 -0
- html_to_markdown-2.14.4.data/scripts/html-to-markdown +0 -0
- html_to_markdown-2.14.4.dist-info/METADATA +634 -0
- html_to_markdown-2.14.4.dist-info/RECORD +17 -0
- html_to_markdown-2.14.4.dist-info/WHEEL +6 -0
- html_to_markdown-2.14.4.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
"""Configuration options for HTML to Markdown conversion.
|
|
2
|
+
|
|
3
|
+
This module provides dataclass-based configuration for the v2 API.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from typing import Literal
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class ConversionOptions:
    """Main conversion configuration.

    This class groups all conversion-related options together, replacing
    the large number of keyword arguments in the v1 API.

    Every field has a default, so ``ConversionOptions()`` is a valid
    configuration. Each field is described by the string literal that
    immediately follows it.

    Example:
        >>> options = ConversionOptions(
        ...     heading_style="atx",
        ...     list_indent_width=2,
        ...     escape_asterisks=True,
        ... )
        >>> from html_to_markdown import convert
        >>> markdown = convert("<h1>Title</h1>", options)
    """

    # NOTE(review): the Literal[...] annotations below are not checked by the
    # dataclass itself; presumably invalid values are rejected by the converter
    # that consumes these options — confirm.

    # --- Headings, lists and emphasis -------------------------------------
    heading_style: Literal["underlined", "atx", "atx_closed"] = "atx"
    """Style for headings: 'atx' (#) is CommonMark default, 'underlined' (===), or 'atx_closed' (# #)."""

    list_indent_type: Literal["spaces", "tabs"] = "spaces"
    """Type of indentation for lists."""

    list_indent_width: int = 2
    """Number of spaces for list indentation (CommonMark uses 2 spaces, ignored if list_indent_type='tabs')."""

    bullets: str = "-*+"
    """Characters to use for unordered list bullets (cycles through -, *, + for nested levels). CommonMark compliant."""

    strong_em_symbol: Literal["*", "_"] = "*"
    """Symbol for strong/emphasis formatting."""

    # --- Escaping: every escape_* flag is opt-in (defaults to False) -------
    escape_asterisks: bool = False
    """Escape asterisk characters in text to prevent accidental formatting. Default False for minimal escaping (CommonMark)."""

    escape_underscores: bool = False
    """Escape underscore characters in text to prevent accidental formatting. Default False for minimal escaping (CommonMark)."""

    escape_misc: bool = False
    """Escape miscellaneous Markdown characters. Default False for minimal escaping (CommonMark)."""

    escape_ascii: bool = False
    """Escape all ASCII punctuation (for CommonMark spec compliance tests). Disabled by default for minimal escaping."""

    # --- Input and content handling ----------------------------------------
    code_language: str = ""
    """Default language for code blocks."""

    encoding: str = "utf-8"
    """Character encoding expected for the HTML input."""

    autolinks: bool = True
    """Convert bare URLs to automatic links."""

    default_title: bool = False
    """Add a default title if none exists."""

    # The optional tag-set fields (this one, strip_tags, preserve_tags) default
    # to None rather than to an empty set.
    keep_inline_images_in: set[str] | None = None
    """Parent tag names where images should remain inline."""

    br_in_tables: bool = False
    """Use <br> tags for line breaks in table cells instead of spaces."""

    hocr_spatial_tables: bool = True
    """Reconstruct tables in hOCR documents using spatial heuristics."""

    highlight_style: Literal["double-equal", "html", "bold"] = "double-equal"
    """Style for highlighting <mark> elements."""

    extract_metadata: bool = True
    """Extract metadata from HTML head and include as comment."""

    # --- Whitespace and wrapping -------------------------------------------
    whitespace_mode: Literal["normalized", "strict"] = "normalized"
    """How to handle whitespace: 'normalized' or 'strict'."""

    strip_newlines: bool = False
    """Remove newlines from HTML before processing."""

    wrap: bool = False
    """Enable text wrapping."""

    # NOTE(review): presumably only consulted when wrap is True — confirm.
    wrap_width: int = 80
    """Column width for text wrapping."""

    # --- Tag-level overrides -----------------------------------------------
    strip_tags: set[str] | None = None
    """HTML tags to strip from output (output only text content, no markdown conversion)."""

    preserve_tags: set[str] | None = None
    """HTML tags to preserve as-is in the output (keep original HTML). Useful for complex elements like tables."""

    convert_as_inline: bool = False
    """Treat block elements as inline during conversion."""

    # --- Output symbols and block styles -----------------------------------
    sub_symbol: str = ""
    """Symbol for subscript text."""

    sup_symbol: str = ""
    """Symbol for superscript text."""

    newline_style: Literal["spaces", "backslash"] = "spaces"
    """Style for newlines: 'spaces' (two trailing spaces, CommonMark default) or 'backslash' (\\). Both are equally CommonMark compliant."""

    code_block_style: Literal["indented", "backticks", "tildes"] = "backticks"
    """Style for code blocks: 'backticks' (```, better whitespace preservation), 'indented' (4 spaces), or 'tildes' (~~~). All are CommonMark compliant."""

    debug: bool = False
    """Enable debug mode with diagnostic warnings about unhandled elements and hOCR processing."""
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
@dataclass
class PreprocessingOptions:
    """HTML preprocessing configuration.

    Controls how HTML is cleaned and preprocessed before conversion.

    Every field has a default, so ``PreprocessingOptions()`` describes
    preprocessing that is enabled with the "standard" preset and removes
    both navigation and form elements.

    Example:
        >>> options = PreprocessingOptions(
        ...     enabled=True,
        ...     preset="aggressive",
        ...     remove_navigation=True,
        ... )
    """

    enabled: bool = True
    """Whether to enable HTML preprocessing (enabled by default for robust handling of malformed HTML)."""

    # NOTE(review): the Literal annotation is not checked by the dataclass;
    # how the preset interacts with the two remove_* flags below is not
    # visible here — confirm against the converter.
    preset: Literal["minimal", "standard", "aggressive"] = "standard"
    """Preprocessing aggressiveness level."""

    remove_navigation: bool = True
    """Remove navigation elements during preprocessing."""

    remove_forms: bool = True
    """Remove form elements during preprocessing."""
|
|
File without changes
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
"""V1 API compatibility layer.
|
|
2
|
+
|
|
3
|
+
Provides backward compatibility for the v1 convert_to_markdown API
|
|
4
|
+
by translating v1 kwargs to v2 ConversionOptions and PreprocessingOptions.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import warnings
|
|
10
|
+
|
|
11
|
+
from html_to_markdown import ConversionOptions, PreprocessingOptions
|
|
12
|
+
from html_to_markdown import convert as convert_v2
|
|
13
|
+
|
|
14
|
+
DEPRECATION_MESSAGE = (
    "The v1 compatibility layer is deprecated and will be removed in v3.0. "
    "Use html_to_markdown.convert() with ConversionOptions instead."
)


def _warn_deprecated(api_name: str, *, stacklevel: int = 2) -> None:
    """Emit a ``DeprecationWarning`` for a v1 compatibility entry point.

    Args:
        api_name: Display name of the deprecated API, e.g. ``"markdownify()"``.
        stacklevel: Stack level exactly as the *calling function* would pass it
            to ``warnings.warn`` itself: ``1`` attributes the warning to the
            function that called this helper, ``2`` (the default) attributes
            it to that function's caller, i.e. the user's call site.
    """
    # This helper sits one frame between the deprecated function and
    # warnings.warn, so the requested level is shifted by one. Without the
    # shift, stacklevel=2 points at the line inside this module, and the default
    # filter (which only shows DeprecationWarning triggered from __main__)
    # hides the warning from the people who need to see it.
    warnings.warn(
        f"{api_name} is deprecated. {DEPRECATION_MESSAGE}",
        DeprecationWarning,
        stacklevel=stacklevel + 1,
    )
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def convert_to_markdown(
    html: str,
    *,
    heading_style: str = "underlined",
    list_indent_type: str = "spaces",
    list_indent_width: int = 4,
    bullets: str = "*+-",
    strong_em_symbol: str = "*",
    escape_asterisks: bool = True,
    escape_underscores: bool = True,
    escape_misc: bool = True,
    code_language: str = "",
    autolinks: bool = True,
    default_title: bool = False,
    br_in_tables: bool = False,
    hocr_extract_tables: bool = True,
    hocr_table_column_threshold: int = 50,
    hocr_table_row_threshold_ratio: float = 0.5,
    highlight_style: str = "double-equal",
    extract_metadata: bool = True,
    whitespace_mode: str = "normalized",
    strip_newlines: bool = False,
    wrap: bool = False,
    wrap_width: int = 80,
    convert_as_inline: bool = False,
    sub_symbol: str = "",
    sup_symbol: str = "",
    newline_style: str = "spaces",
    keep_inline_images_in: set[str] | None = None,
    preprocess: bool = False,
    preprocessing_preset: str = "standard",
    remove_navigation: bool = True,
    remove_forms: bool = True,
    source_encoding: str = "utf-8",
    code_language_callback: object | None = None,
    strip: list[str] | None = None,
    convert: list[str] | None = None,
    custom_converters: dict[str, object] | None = None,
) -> str:
    """Convert HTML to Markdown through the deprecated v1 keyword interface.

    The v1-style keyword arguments are repackaged into a v2
    ``ConversionOptions`` and ``PreprocessingOptions`` pair and handed to the
    v2 ``convert`` function. Defaults follow v1, not v2.

    Args:
        html: HTML string to convert.
        heading_style: Heading style (v1 default: "underlined").
        list_indent_type: Indentation type for lists.
        list_indent_width: Spaces per list indentation level (v1 default: 4).
        bullets: Characters used for unordered list bullets.
        strong_em_symbol: Symbol for strong/emphasis formatting.
        escape_asterisks: Escape asterisk characters (v1 default: True).
        escape_underscores: Escape underscore characters (v1 default: True).
        escape_misc: Escape miscellaneous Markdown characters (v1 default: True).
        code_language: Default language for code blocks. A non-empty value
            also selects "backticks" code blocks instead of "indented".
        autolinks: Convert bare URLs to automatic links.
        default_title: Add a default title if none exists.
        br_in_tables: Use <br> tags for line breaks in table cells.
        hocr_extract_tables: Deprecated. Forwarded as ``hocr_spatial_tables``;
            a falsy value additionally emits a ``DeprecationWarning``.
        hocr_table_column_threshold: Removed in v2. Any value other than the
            default 50 raises ``NotImplementedError``.
        hocr_table_row_threshold_ratio: Removed in v2. Any value other than
            the default 0.5 raises ``NotImplementedError``.
        highlight_style: Style for highlighting <mark> elements.
        extract_metadata: Extract metadata from HTML head.
        whitespace_mode: How to handle whitespace.
        strip_newlines: Remove newlines from HTML before processing.
        wrap: Enable text wrapping.
        wrap_width: Column width for text wrapping.
        convert_as_inline: Treat block elements as inline.
        sub_symbol: Symbol for subscript text.
        sup_symbol: Symbol for superscript text.
        newline_style: Style for newlines.
        keep_inline_images_in: Parent tag names where images should remain inline.
        preprocess: Enable HTML preprocessing (v1 default: False).
        preprocessing_preset: Preprocessing aggressiveness level.
        remove_navigation: Remove navigation elements during preprocessing.
        remove_forms: Remove form elements during preprocessing.
        source_encoding: Character encoding expected for the HTML input.
        code_language_callback: Removed in v2. Must be left as ``None``.
        strip: HTML tags to strip from output. An empty or ``None`` value
            means no tags are stripped.
        convert: Removed in v2. Must be left as ``None``.
        custom_converters: Not yet implemented in v2. Must be left as ``None``.

    Returns:
        Converted Markdown string.

    Raises:
        NotImplementedError: If ``code_language_callback``, ``convert`` or
            ``custom_converters`` is supplied, or if either hOCR table
            threshold differs from its default.

    .. deprecated:: 2.0
        Use :func:`html_to_markdown.convert` with :class:`ConversionOptions` instead.
        The v1 API is provided for backward compatibility only.
    """
    _warn_deprecated("convert_to_markdown()", stacklevel=2)

    # v1 options with no v2 counterpart, checked in this order; the first one
    # that was explicitly supplied aborts the call.
    unsupported = (
        (
            code_language_callback,
            "code_language_callback was removed in v2. Use the code_language option to set a default language.",
        ),
        (convert, "convert option was removed in v2. All supported tags are converted by default."),
        (custom_converters, "custom_converters is not yet implemented in v2"),
    )
    for supplied_value, error_text in unsupported:
        if supplied_value is not None:
            raise NotImplementedError(error_text)

    if not hocr_extract_tables:
        # Still honoured (forwarded below), but callers are steered to the v2 option.
        warnings.warn(
            "hocr_extract_tables is deprecated and will be removed in a future release. "
            "Use ConversionOptions(hocr_spatial_tables=False) to disable spatial table reconstruction.",
            DeprecationWarning,
            stacklevel=2,
        )

    thresholds_overridden = hocr_table_column_threshold != 50 or hocr_table_row_threshold_ratio != 0.5
    if thresholds_overridden:
        raise NotImplementedError(
            "hOCR table threshold overrides were removed in v2. Table reconstruction now uses built-in heuristics."
        )

    # ~keep: v1 used indented code blocks by default, but switched to backticks when a language was set
    if code_language:
        code_block_style = "backticks"
    else:
        code_block_style = "indented"

    # An empty list behaves like None: no tags are stripped.
    tags_to_strip = set(strip) if strip else None

    options = ConversionOptions(
        heading_style=heading_style,  # type: ignore[arg-type]
        list_indent_type=list_indent_type,  # type: ignore[arg-type]
        list_indent_width=list_indent_width,
        bullets=bullets,
        strong_em_symbol=strong_em_symbol,  # type: ignore[arg-type]
        escape_asterisks=escape_asterisks,
        escape_underscores=escape_underscores,
        escape_misc=escape_misc,
        code_block_style=code_block_style,  # type: ignore[arg-type]
        code_language=code_language,
        autolinks=autolinks,
        default_title=default_title,
        br_in_tables=br_in_tables,
        hocr_spatial_tables=hocr_extract_tables,
        highlight_style=highlight_style,  # type: ignore[arg-type]
        extract_metadata=extract_metadata,
        whitespace_mode=whitespace_mode,  # type: ignore[arg-type]
        strip_newlines=strip_newlines,
        wrap=wrap,
        wrap_width=wrap_width,
        convert_as_inline=convert_as_inline,
        sub_symbol=sub_symbol,
        sup_symbol=sup_symbol,
        newline_style=newline_style,  # type: ignore[arg-type]
        keep_inline_images_in=keep_inline_images_in,
        strip_tags=tags_to_strip,
    )

    preprocessing = PreprocessingOptions(
        enabled=preprocess,
        preset=preprocessing_preset,  # type: ignore[arg-type]
        remove_navigation=remove_navigation,
        remove_forms=remove_forms,
    )

    # The input encoding is applied to the options object after construction.
    options.encoding = source_encoding
    return convert_v2(html, options, preprocessing)
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def markdownify(*args: object, **kwargs: object) -> str:
    """Deprecated alias that forwards all arguments to ``convert_to_markdown``.

    A deprecation warning naming ``markdownify()`` is emitted before the call
    is delegated; ``convert_to_markdown`` then emits its own warning as well.

    Args:
        *args: Positional arguments passed through unchanged.
        **kwargs: Keyword arguments passed through unchanged.

    Returns:
        The Markdown string produced by ``convert_to_markdown``.

    .. deprecated:: 2.0
        Use html_to_markdown.convert() instead.
    """
    _warn_deprecated("markdownify()", stacklevel=2)
    markdown = convert_to_markdown(*args, **kwargs)  # type: ignore[arg-type]
    return markdown
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
# Names exported by a star-import of this module: the two deprecated v1 entry points.
__all__ = ["convert_to_markdown", "markdownify"]
|
|
Binary file
|