html-to-markdown 1.4.0__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- html_to_markdown/__init__.py +19 -2
- html_to_markdown/cli.py +103 -25
- html_to_markdown/constants.py +1 -0
- html_to_markdown/converters.py +1646 -104
- html_to_markdown/exceptions.py +49 -0
- html_to_markdown/processing.py +720 -47
- html_to_markdown-1.6.0.dist-info/METADATA +472 -0
- html_to_markdown-1.6.0.dist-info/RECORD +15 -0
- html_to_markdown-1.4.0.dist-info/METADATA +0 -249
- html_to_markdown-1.4.0.dist-info/RECORD +0 -14
- {html_to_markdown-1.4.0.dist-info → html_to_markdown-1.6.0.dist-info}/WHEEL +0 -0
- {html_to_markdown-1.4.0.dist-info → html_to_markdown-1.6.0.dist-info}/entry_points.txt +0 -0
- {html_to_markdown-1.4.0.dist-info → html_to_markdown-1.6.0.dist-info}/licenses/LICENSE +0 -0
- {html_to_markdown-1.4.0.dist-info → html_to_markdown-1.6.0.dist-info}/top_level.txt +0 -0
html_to_markdown/__init__.py
CHANGED
|
@@ -1,5 +1,22 @@
|
|
|
1
|
-
from html_to_markdown.processing import convert_to_markdown
|
|
1
|
+
from html_to_markdown.exceptions import (
|
|
2
|
+
ConflictingOptionsError,
|
|
3
|
+
EmptyHtmlError,
|
|
4
|
+
HtmlToMarkdownError,
|
|
5
|
+
InvalidParserError,
|
|
6
|
+
MissingDependencyError,
|
|
7
|
+
)
|
|
8
|
+
from html_to_markdown.processing import convert_to_markdown, convert_to_markdown_stream
|
|
2
9
|
|
|
10
|
+
# For backward compatibility and to maintain the existing API
|
|
3
11
|
markdownify = convert_to_markdown
|
|
4
12
|
|
|
5
|
-
__all__ = ["convert_to_markdown", "markdownify"]
|
|
13
|
+
__all__ = [
|
|
14
|
+
"ConflictingOptionsError",
|
|
15
|
+
"EmptyHtmlError",
|
|
16
|
+
"HtmlToMarkdownError",
|
|
17
|
+
"InvalidParserError",
|
|
18
|
+
"MissingDependencyError",
|
|
19
|
+
"convert_to_markdown",
|
|
20
|
+
"convert_to_markdown_stream",
|
|
21
|
+
"markdownify",
|
|
22
|
+
]
|
html_to_markdown/cli.py
CHANGED
|
@@ -1,7 +1,16 @@
|
|
|
1
|
+
import sys
|
|
1
2
|
from argparse import ArgumentParser, FileType
|
|
2
|
-
from sys import stdin
|
|
3
3
|
|
|
4
|
-
from html_to_markdown.constants import
|
|
4
|
+
from html_to_markdown.constants import (
|
|
5
|
+
ASTERISK,
|
|
6
|
+
ATX,
|
|
7
|
+
ATX_CLOSED,
|
|
8
|
+
BACKSLASH,
|
|
9
|
+
DOUBLE_EQUAL,
|
|
10
|
+
SPACES,
|
|
11
|
+
UNDERLINED,
|
|
12
|
+
UNDERSCORE,
|
|
13
|
+
)
|
|
5
14
|
from html_to_markdown.processing import convert_to_markdown
|
|
6
15
|
|
|
7
16
|
|
|
@@ -16,7 +25,7 @@ def main(argv: list[str]) -> str:
|
|
|
16
25
|
"html",
|
|
17
26
|
nargs="?",
|
|
18
27
|
type=FileType("r"),
|
|
19
|
-
default=stdin,
|
|
28
|
+
default=sys.stdin,
|
|
20
29
|
help="The HTML file to convert. Defaults to STDIN if not provided.",
|
|
21
30
|
)
|
|
22
31
|
|
|
@@ -43,8 +52,8 @@ def main(argv: list[str]) -> str:
|
|
|
43
52
|
|
|
44
53
|
parser.add_argument(
|
|
45
54
|
"--default-title",
|
|
46
|
-
action="
|
|
47
|
-
help="
|
|
55
|
+
action="store_true",
|
|
56
|
+
help="Set the link title to its href when no title is provided.",
|
|
48
57
|
)
|
|
49
58
|
|
|
50
59
|
parser.add_argument(
|
|
@@ -107,6 +116,13 @@ def main(argv: list[str]) -> str:
|
|
|
107
116
|
help="Disable escaping of '_' characters in text to '\\_'.",
|
|
108
117
|
)
|
|
109
118
|
|
|
119
|
+
parser.add_argument(
|
|
120
|
+
"--no-escape-misc",
|
|
121
|
+
dest="escape_misc",
|
|
122
|
+
action="store_false",
|
|
123
|
+
help="Disable escaping of miscellaneous characters to prevent conflicts in Markdown.",
|
|
124
|
+
)
|
|
125
|
+
|
|
110
126
|
parser.add_argument(
|
|
111
127
|
"-i",
|
|
112
128
|
"--keep-inline-images-in",
|
|
@@ -134,25 +150,87 @@ def main(argv: list[str]) -> str:
|
|
|
134
150
|
help="Remove newlines from HTML input before processing. This helps flatten janky output from HTML with unnecessary line breaks.",
|
|
135
151
|
)
|
|
136
152
|
|
|
137
|
-
|
|
153
|
+
parser.add_argument(
|
|
154
|
+
"--convert-as-inline",
|
|
155
|
+
action="store_true",
|
|
156
|
+
help="Treat the content as inline elements (no block elements like paragraphs).",
|
|
157
|
+
)
|
|
158
|
+
|
|
159
|
+
parser.add_argument(
|
|
160
|
+
"--no-extract-metadata",
|
|
161
|
+
dest="extract_metadata",
|
|
162
|
+
action="store_false",
|
|
163
|
+
help="Disable extraction of document metadata (title, meta tags) as a comment header.",
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
parser.add_argument(
|
|
167
|
+
"--highlight-style",
|
|
168
|
+
default=DOUBLE_EQUAL,
|
|
169
|
+
choices=("double-equal", "html", "bold"),
|
|
170
|
+
help="Style to use for highlighted text (mark elements). Defaults to 'double-equal'.",
|
|
171
|
+
)
|
|
172
|
+
|
|
173
|
+
parser.add_argument(
|
|
174
|
+
"--stream-processing",
|
|
175
|
+
action="store_true",
|
|
176
|
+
help="Use streaming processing for large documents to reduce memory usage.",
|
|
177
|
+
)
|
|
138
178
|
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
default_title=args.default_title,
|
|
145
|
-
heading_style=args.heading_style,
|
|
146
|
-
bullets=args.bullets,
|
|
147
|
-
strong_em_symbol=args.strong_em_symbol,
|
|
148
|
-
sub_symbol=args.sub_symbol,
|
|
149
|
-
sup_symbol=args.sup_symbol,
|
|
150
|
-
newline_style=args.newline_style,
|
|
151
|
-
code_language=args.code_language,
|
|
152
|
-
escape_asterisks=args.escape_asterisks,
|
|
153
|
-
escape_underscores=args.escape_underscores,
|
|
154
|
-
keep_inline_images_in=args.keep_inline_images_in,
|
|
155
|
-
wrap=args.wrap,
|
|
156
|
-
wrap_width=args.wrap_width,
|
|
157
|
-
strip_newlines=args.strip_newlines,
|
|
179
|
+
parser.add_argument(
|
|
180
|
+
"--chunk-size",
|
|
181
|
+
type=int,
|
|
182
|
+
default=1024,
|
|
183
|
+
help="Size of chunks when using streaming processing. Defaults to 1024 characters.",
|
|
158
184
|
)
|
|
185
|
+
|
|
186
|
+
parser.add_argument(
|
|
187
|
+
"--show-progress",
|
|
188
|
+
action="store_true",
|
|
189
|
+
help="Show progress information when processing large documents.",
|
|
190
|
+
)
|
|
191
|
+
|
|
192
|
+
args = parser.parse_args(argv)
|
|
193
|
+
|
|
194
|
+
# Prepare base arguments
|
|
195
|
+
base_args = {
|
|
196
|
+
"strip": args.strip,
|
|
197
|
+
"convert": args.convert,
|
|
198
|
+
"autolinks": args.autolinks,
|
|
199
|
+
"default_title": args.default_title,
|
|
200
|
+
"heading_style": args.heading_style,
|
|
201
|
+
"bullets": args.bullets,
|
|
202
|
+
"strong_em_symbol": args.strong_em_symbol,
|
|
203
|
+
"sub_symbol": args.sub_symbol,
|
|
204
|
+
"sup_symbol": args.sup_symbol,
|
|
205
|
+
"newline_style": args.newline_style,
|
|
206
|
+
"code_language": args.code_language,
|
|
207
|
+
"escape_asterisks": args.escape_asterisks,
|
|
208
|
+
"escape_underscores": args.escape_underscores,
|
|
209
|
+
"escape_misc": args.escape_misc,
|
|
210
|
+
"keep_inline_images_in": args.keep_inline_images_in,
|
|
211
|
+
"wrap": args.wrap,
|
|
212
|
+
"wrap_width": args.wrap_width,
|
|
213
|
+
"strip_newlines": args.strip_newlines,
|
|
214
|
+
"convert_as_inline": args.convert_as_inline,
|
|
215
|
+
"extract_metadata": args.extract_metadata,
|
|
216
|
+
"highlight_style": args.highlight_style,
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
# Add streaming parameters only if streaming is enabled
|
|
220
|
+
if args.stream_processing:
|
|
221
|
+
base_args["stream_processing"] = True
|
|
222
|
+
base_args["chunk_size"] = args.chunk_size
|
|
223
|
+
|
|
224
|
+
# Progress callback for CLI
|
|
225
|
+
if args.show_progress:
|
|
226
|
+
|
|
227
|
+
def progress_callback(processed: int, total: int) -> None:
|
|
228
|
+
if total > 0:
|
|
229
|
+
percent = (processed / total) * 100
|
|
230
|
+
# Use sys.stderr to avoid ruff T201 error for progress output
|
|
231
|
+
sys.stderr.write(f"\rProgress: {percent:.1f}% ({processed}/{total} bytes)")
|
|
232
|
+
sys.stderr.flush()
|
|
233
|
+
|
|
234
|
+
base_args["progress_callback"] = progress_callback
|
|
235
|
+
|
|
236
|
+
return convert_to_markdown(args.html.read(), **base_args)
|
html_to_markdown/constants.py
CHANGED