html-to-markdown 2.14.4__cp310-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

@@ -0,0 +1,144 @@
1
+ """Configuration options for HTML to Markdown conversion.
2
+
3
+ This module provides dataclass-based configuration for the v2 API.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from dataclasses import dataclass
9
+ from typing import Literal
10
+
11
+
12
+ @dataclass
13
+ class ConversionOptions:
14
+ """Main conversion configuration.
15
+
16
+ This class groups all conversion-related options together, replacing
17
+ the large number of keyword arguments in the v1 API.
18
+
19
+ Example:
20
+ >>> options = ConversionOptions(
21
+ ... heading_style="atx",
22
+ ... list_indent_width=2,
23
+ ... escape_asterisks=True,
24
+ ... )
25
+ >>> from html_to_markdown import convert
26
+ >>> markdown = convert("<h1>Title</h1>", options)
27
+ """
28
+
29
+ heading_style: Literal["underlined", "atx", "atx_closed"] = "atx"
30
+ """Style for headings: 'atx' (#) is CommonMark default, 'underlined' (===), or 'atx_closed' (# #)."""
31
+
32
+ list_indent_type: Literal["spaces", "tabs"] = "spaces"
33
+ """Type of indentation for lists."""
34
+
35
+ list_indent_width: int = 2
36
+ """Number of spaces for list indentation (CommonMark uses 2 spaces, ignored if list_indent_type='tabs')."""
37
+
38
+ bullets: str = "-*+"
39
+ """Characters to use for unordered list bullets (cycles through -, *, + for nested levels). CommonMark compliant."""
40
+
41
+ strong_em_symbol: Literal["*", "_"] = "*"
42
+ """Symbol for strong/emphasis formatting."""
43
+
44
+ escape_asterisks: bool = False
45
+ """Escape asterisk characters in text to prevent accidental formatting. Default False for minimal escaping (CommonMark)."""
46
+
47
+ escape_underscores: bool = False
48
+ """Escape underscore characters in text to prevent accidental formatting. Default False for minimal escaping (CommonMark)."""
49
+
50
+ escape_misc: bool = False
51
+ """Escape miscellaneous Markdown characters. Default False for minimal escaping (CommonMark)."""
52
+
53
+ escape_ascii: bool = False
54
+ """Escape all ASCII punctuation (for CommonMark spec compliance tests). Disabled by default for minimal escaping."""
55
+
56
+ code_language: str = ""
57
+ """Default language for code blocks."""
58
+
59
+ encoding: str = "utf-8"
60
+ """Character encoding expected for the HTML input."""
61
+
62
+ autolinks: bool = True
63
+ """Convert bare URLs to automatic links."""
64
+
65
+ default_title: bool = False
66
+ """Add a default title if none exists."""
67
+
68
+ keep_inline_images_in: set[str] | None = None
69
+ """Parent tag names where images should remain inline."""
70
+
71
+ br_in_tables: bool = False
72
+ """Use <br> tags for line breaks in table cells instead of spaces."""
73
+
74
+ hocr_spatial_tables: bool = True
75
+ """Reconstruct tables in hOCR documents using spatial heuristics."""
76
+
77
+ highlight_style: Literal["double-equal", "html", "bold"] = "double-equal"
78
+ """Style for highlighting <mark> elements."""
79
+
80
+ extract_metadata: bool = True
81
+ """Extract metadata from HTML head and include as comment."""
82
+
83
+ whitespace_mode: Literal["normalized", "strict"] = "normalized"
84
+ """How to handle whitespace: 'normalized' or 'strict'."""
85
+
86
+ strip_newlines: bool = False
87
+ """Remove newlines from HTML before processing."""
88
+
89
+ wrap: bool = False
90
+ """Enable text wrapping."""
91
+
92
+ wrap_width: int = 80
93
+ """Column width for text wrapping."""
94
+
95
+ strip_tags: set[str] | None = None
96
+ """HTML tags to strip from output (output only text content, no markdown conversion)."""
97
+
98
+ preserve_tags: set[str] | None = None
99
+ """HTML tags to preserve as-is in the output (keep original HTML). Useful for complex elements like tables."""
100
+
101
+ convert_as_inline: bool = False
102
+ """Treat block elements as inline during conversion."""
103
+
104
+ sub_symbol: str = ""
105
+ """Symbol for subscript text."""
106
+
107
+ sup_symbol: str = ""
108
+ """Symbol for superscript text."""
109
+
110
+ newline_style: Literal["spaces", "backslash"] = "spaces"
111
+ """Style for newlines: 'spaces' (two trailing spaces, CommonMark default) or 'backslash' (\\). Both are equally CommonMark compliant."""
112
+
113
+ code_block_style: Literal["indented", "backticks", "tildes"] = "backticks"
114
+ """Style for code blocks: 'backticks' (```, better whitespace preservation), 'indented' (4 spaces), or 'tildes' (~~~). All are CommonMark compliant."""
115
+
116
+ debug: bool = False
117
+ """Enable debug mode with diagnostic warnings about unhandled elements and hOCR processing."""
118
+
119
+
120
@dataclass
class PreprocessingOptions:
    """Settings for the HTML clean-up pass that runs before conversion.

    Example:
        >>> options = PreprocessingOptions(
        ...     enabled=True,
        ...     preset="aggressive",
        ...     remove_navigation=True,
        ... )
    """

    # Master switch and overall aggressiveness.

    enabled: bool = True
    """Turn HTML preprocessing on or off (on by default for robust handling of malformed HTML)."""

    preset: Literal["minimal", "standard", "aggressive"] = "standard"
    """How aggressive the preprocessing is."""

    # Element categories removed during preprocessing.

    remove_navigation: bool = True
    """Drop navigation elements during preprocessing."""

    remove_forms: bool = True
    """Drop form elements during preprocessing."""
File without changes
@@ -0,0 +1,191 @@
1
+ """V1 API compatibility layer.
2
+
3
+ Provides backward compatibility for the v1 convert_to_markdown API
4
+ by translating v1 kwargs to v2 ConversionOptions and PreprocessingOptions.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import warnings
10
+
11
+ from html_to_markdown import ConversionOptions, PreprocessingOptions
12
+ from html_to_markdown import convert as convert_v2
13
+
14
# Shared tail appended to every deprecation warning raised by this module.
DEPRECATION_MESSAGE = (
    "The v1 compatibility layer is deprecated and will be removed in v3.0. "
    "Use html_to_markdown.convert() with ConversionOptions instead."
)


def _warn_deprecated(api_name: str, *, stacklevel: int = 2) -> None:
    """Emit a ``DeprecationWarning`` for a deprecated v1 entry point.

    Args:
        api_name: Display name of the deprecated API, e.g. ``"markdownify()"``.
        stacklevel: Frame to attribute the warning to, counted from the
            function that calls this helper: ``1`` is that function itself,
            ``2`` (the default) is the code that called it.
    """
    # This helper adds one frame between the deprecated API and warnings.warn.
    # Without the +1, stacklevel=2 would blame the deprecated function's own
    # body instead of the user's call site, and Python's default filters would
    # then hide the warning because it is not attributed to __main__.
    warnings.warn(
        f"{api_name} is deprecated. {DEPRECATION_MESSAGE}",
        DeprecationWarning,
        stacklevel=stacklevel + 1,
    )
22
+
23
+
24
def convert_to_markdown(
    html: str,
    *,
    heading_style: str = "underlined",
    list_indent_type: str = "spaces",
    list_indent_width: int = 4,
    bullets: str = "*+-",
    strong_em_symbol: str = "*",
    escape_asterisks: bool = True,
    escape_underscores: bool = True,
    escape_misc: bool = True,
    code_language: str = "",
    autolinks: bool = True,
    default_title: bool = False,
    br_in_tables: bool = False,
    hocr_extract_tables: bool = True,
    hocr_table_column_threshold: int = 50,
    hocr_table_row_threshold_ratio: float = 0.5,
    highlight_style: str = "double-equal",
    extract_metadata: bool = True,
    whitespace_mode: str = "normalized",
    strip_newlines: bool = False,
    wrap: bool = False,
    wrap_width: int = 80,
    convert_as_inline: bool = False,
    sub_symbol: str = "",
    sup_symbol: str = "",
    newline_style: str = "spaces",
    keep_inline_images_in: set[str] | None = None,
    preprocess: bool = False,
    preprocessing_preset: str = "standard",
    remove_navigation: bool = True,
    remove_forms: bool = True,
    source_encoding: str = "utf-8",
    code_language_callback: object | None = None,
    strip: list[str] | None = None,
    convert: list[str] | None = None,
    custom_converters: dict[str, object] | None = None,
) -> str:
    """Run a v1-style conversion call through the v2 API.

    The v1 keyword arguments are repackaged into a ``ConversionOptions`` and a
    ``PreprocessingOptions`` instance, which are then handed to the v2
    ``convert`` function. Every call emits a ``DeprecationWarning``.

    Args:
        html: HTML string to convert.
        heading_style: Heading syntax (v1 default: "underlined").
        list_indent_type: Indentation type for lists.
        list_indent_width: Spaces per list indentation level (v1 default: 4).
        bullets: Bullet characters for unordered lists.
        strong_em_symbol: Marker for strong/emphasis formatting.
        escape_asterisks: Escape asterisk characters (v1 default: True).
        escape_underscores: Escape underscore characters (v1 default: True).
        escape_misc: Escape miscellaneous Markdown characters (v1 default: True).
        code_language: Default language for code blocks. A non-empty value
            also selects "backticks" code blocks instead of "indented" ones.
        autolinks: Convert bare URLs to automatic links.
        default_title: Add a default title if none exists.
        br_in_tables: Use <br> tags for line breaks in table cells.
        hocr_extract_tables: Deprecated. Forwarded as ``hocr_spatial_tables``;
            a falsy value additionally emits a ``DeprecationWarning``.
        hocr_table_column_threshold: Deprecated. Must be left at 50.
        hocr_table_row_threshold_ratio: Deprecated. Must be left at 0.5.
        highlight_style: How <mark> elements are rendered.
        extract_metadata: Extract metadata from the HTML head.
        whitespace_mode: How whitespace is handled.
        strip_newlines: Remove newlines from the HTML before processing.
        wrap: Enable text wrapping.
        wrap_width: Column width for text wrapping.
        convert_as_inline: Treat block elements as inline.
        sub_symbol: Symbol for subscript text.
        sup_symbol: Symbol for superscript text.
        newline_style: Line-break syntax.
        keep_inline_images_in: Parent tag names where images stay inline.
        preprocess: Enable HTML preprocessing (off by default here).
        preprocessing_preset: Preprocessing aggressiveness level.
        remove_navigation: Remove navigation elements during preprocessing.
        remove_forms: Remove form elements during preprocessing.
        source_encoding: Character encoding expected for the HTML input.
        code_language_callback: Removed in v2. Must be None.
        strip: HTML tags to strip from output; passed on as a set, or as
            None when empty or omitted.
        convert: Removed in v2. Must be None.
        custom_converters: Not yet implemented in v2. Must be None.

    Returns:
        Converted Markdown string.

    Raises:
        NotImplementedError: If ``code_language_callback``, ``convert`` or
            ``custom_converters`` is supplied, or if either hOCR table
            threshold differs from its default.

    .. deprecated:: 2.0
        Use :func:`html_to_markdown.convert` with :class:`ConversionOptions` instead.
        The v1 API is provided for backward compatibility only.
    """
    _warn_deprecated("convert_to_markdown()", stacklevel=2)

    # v1 features with no v2 counterpart: reject them, in this order, instead
    # of silently ignoring what the caller asked for.
    removed_features = (
        (
            code_language_callback,
            "code_language_callback was removed in v2. Use the code_language option to set a default language.",
        ),
        (convert, "convert option was removed in v2. All supported tags are converted by default."),
        (custom_converters, "custom_converters is not yet implemented in v2"),
    )
    for supplied_value, reason in removed_features:
        if supplied_value is not None:
            raise NotImplementedError(reason)

    if not hocr_extract_tables:
        warnings.warn(
            "hocr_extract_tables is deprecated and will be removed in a future release. "
            "Use ConversionOptions(hocr_spatial_tables=False) to disable spatial table reconstruction.",
            DeprecationWarning,
            stacklevel=2,
        )

    thresholds_at_defaults = hocr_table_column_threshold == 50 and hocr_table_row_threshold_ratio == 0.5
    if not thresholds_at_defaults:
        raise NotImplementedError(
            "hOCR table threshold overrides were removed in v2. Table reconstruction now uses built-in heuristics."
        )

    # ~keep: v1 used indented code blocks by default, but switched to backticks when a language was set
    if code_language:
        code_block_style = "backticks"
    else:
        code_block_style = "indented"

    # An empty or missing strip list means "strip nothing", which v2 spells as None.
    tags_to_strip = None
    if strip:
        tags_to_strip = set(strip)

    options = ConversionOptions(
        heading_style=heading_style,  # type: ignore[arg-type]
        list_indent_type=list_indent_type,  # type: ignore[arg-type]
        list_indent_width=list_indent_width,
        bullets=bullets,
        strong_em_symbol=strong_em_symbol,  # type: ignore[arg-type]
        escape_asterisks=escape_asterisks,
        escape_underscores=escape_underscores,
        escape_misc=escape_misc,
        code_language=code_language,
        code_block_style=code_block_style,  # type: ignore[arg-type]
        encoding=source_encoding,
        autolinks=autolinks,
        default_title=default_title,
        keep_inline_images_in=keep_inline_images_in,
        br_in_tables=br_in_tables,
        hocr_spatial_tables=hocr_extract_tables,
        highlight_style=highlight_style,  # type: ignore[arg-type]
        extract_metadata=extract_metadata,
        whitespace_mode=whitespace_mode,  # type: ignore[arg-type]
        strip_newlines=strip_newlines,
        wrap=wrap,
        wrap_width=wrap_width,
        strip_tags=tags_to_strip,
        convert_as_inline=convert_as_inline,
        sub_symbol=sub_symbol,
        sup_symbol=sup_symbol,
        newline_style=newline_style,  # type: ignore[arg-type]
    )

    preprocessing = PreprocessingOptions(
        enabled=preprocess,
        preset=preprocessing_preset,  # type: ignore[arg-type]
        remove_navigation=remove_navigation,
        remove_forms=remove_forms,
    )

    return convert_v2(html, options, preprocessing)
179
+
180
+
181
def markdownify(*args: object, **kwargs: object) -> str:
    """Deprecated alias that forwards every argument to convert_to_markdown.

    A deprecation warning naming ``markdownify()`` is emitted before the
    call is delegated.

    .. deprecated:: 2.0
        Use html_to_markdown.convert() instead.
    """
    _warn_deprecated("markdownify()", stacklevel=2)
    markdown = convert_to_markdown(*args, **kwargs)  # type: ignore[arg-type]
    return markdown
189
+
190
+
191
+ __all__ = ["convert_to_markdown", "markdownify"]