html-to-markdown 1.3.3__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- html_to_markdown/__init__.py +3 -2
- html_to_markdown/__main__.py +5 -2
- html_to_markdown/cli.py +114 -28
- html_to_markdown/constants.py +1 -0
- html_to_markdown/converters.py +1646 -105
- html_to_markdown/processing.py +499 -13
- html_to_markdown-1.5.0.dist-info/METADATA +436 -0
- html_to_markdown-1.5.0.dist-info/RECORD +14 -0
- {html_to_markdown-1.3.3.dist-info → html_to_markdown-1.5.0.dist-info}/entry_points.txt +1 -0
- html_to_markdown-1.3.3.dist-info/METADATA +0 -242
- html_to_markdown-1.3.3.dist-info/RECORD +0 -14
- {html_to_markdown-1.3.3.dist-info → html_to_markdown-1.5.0.dist-info}/WHEEL +0 -0
- {html_to_markdown-1.3.3.dist-info → html_to_markdown-1.5.0.dist-info}/licenses/LICENSE +0 -0
- {html_to_markdown-1.3.3.dist-info → html_to_markdown-1.5.0.dist-info}/top_level.txt +0 -0
html_to_markdown/__init__.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
|
-
from html_to_markdown.processing import convert_to_markdown
|
|
1
|
+
from html_to_markdown.processing import convert_to_markdown, convert_to_markdown_stream
|
|
2
2
|
|
|
3
|
+
# For backward compatibility and to maintain the existing API
|
|
3
4
|
markdownify = convert_to_markdown
|
|
4
5
|
|
|
5
|
-
__all__ = ["convert_to_markdown", "markdownify"]
|
|
6
|
+
__all__ = ["convert_to_markdown", "convert_to_markdown_stream", "markdownify"]
|
html_to_markdown/__main__.py
CHANGED
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
import sys
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
from html_to_markdown.cli import main
|
|
3
|
+
from html_to_markdown.cli import main
|
|
5
4
|
|
|
5
|
+
|
|
6
|
+
def cli() -> None:
|
|
7
|
+
"""Main CLI entrypoint."""
|
|
6
8
|
try:
|
|
7
9
|
result = main(sys.argv[1:])
|
|
8
10
|
print(result) # noqa: T201
|
|
@@ -10,5 +12,6 @@ def cli():
|
|
|
10
12
|
print(str(e), file=sys.stderr) # noqa: T201
|
|
11
13
|
sys.exit(1)
|
|
12
14
|
|
|
15
|
+
|
|
13
16
|
if __name__ == "__main__":
|
|
14
17
|
cli()
|
html_to_markdown/cli.py
CHANGED
|
@@ -1,11 +1,21 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
1
|
+
import sys
|
|
2
|
+
from argparse import ArgumentParser, FileType
|
|
3
|
+
|
|
4
|
+
from html_to_markdown.constants import (
|
|
5
|
+
ASTERISK,
|
|
6
|
+
ATX,
|
|
7
|
+
ATX_CLOSED,
|
|
8
|
+
BACKSLASH,
|
|
9
|
+
DOUBLE_EQUAL,
|
|
10
|
+
SPACES,
|
|
11
|
+
UNDERLINED,
|
|
12
|
+
UNDERSCORE,
|
|
13
|
+
)
|
|
14
|
+
from html_to_markdown.processing import convert_to_markdown
|
|
5
15
|
|
|
6
|
-
from html_to_markdown.constants import ASTERISK, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERLINED, UNDERSCORE
|
|
7
|
-
from html_to_markdown.processing import convert_to_markdown
|
|
8
16
|
|
|
17
|
+
def main(argv: list[str]) -> str:
|
|
18
|
+
"""Command-line entry point."""
|
|
9
19
|
parser = ArgumentParser(
|
|
10
20
|
prog="html_to_markdown",
|
|
11
21
|
description="Converts HTML to Markdown.",
|
|
@@ -15,7 +25,7 @@ def main(argv: list[str]) -> str:
|
|
|
15
25
|
"html",
|
|
16
26
|
nargs="?",
|
|
17
27
|
type=FileType("r"),
|
|
18
|
-
default=stdin,
|
|
28
|
+
default=sys.stdin,
|
|
19
29
|
help="The HTML file to convert. Defaults to STDIN if not provided.",
|
|
20
30
|
)
|
|
21
31
|
|
|
@@ -42,8 +52,8 @@ def main(argv: list[str]) -> str:
|
|
|
42
52
|
|
|
43
53
|
parser.add_argument(
|
|
44
54
|
"--default-title",
|
|
45
|
-
action="
|
|
46
|
-
help="
|
|
55
|
+
action="store_true",
|
|
56
|
+
help="Set the link title to its href when no title is provided.",
|
|
47
57
|
)
|
|
48
58
|
|
|
49
59
|
parser.add_argument(
|
|
@@ -106,6 +116,13 @@ def main(argv: list[str]) -> str:
|
|
|
106
116
|
help="Disable escaping of '_' characters in text to '\\_'.",
|
|
107
117
|
)
|
|
108
118
|
|
|
119
|
+
parser.add_argument(
|
|
120
|
+
"--no-escape-misc",
|
|
121
|
+
dest="escape_misc",
|
|
122
|
+
action="store_false",
|
|
123
|
+
help="Disable escaping of miscellaneous characters to prevent conflicts in Markdown.",
|
|
124
|
+
)
|
|
125
|
+
|
|
109
126
|
parser.add_argument(
|
|
110
127
|
"-i",
|
|
111
128
|
"--keep-inline-images-in",
|
|
@@ -127,24 +144,93 @@ def main(argv: list[str]) -> str:
|
|
|
127
144
|
help="The number of characters at which text paragraphs should wrap. Defaults to 80.",
|
|
128
145
|
)
|
|
129
146
|
|
|
130
|
-
|
|
147
|
+
parser.add_argument(
|
|
148
|
+
"--strip-newlines",
|
|
149
|
+
action="store_true",
|
|
150
|
+
help="Remove newlines from HTML input before processing. This helps flatten janky output from HTML with unnecessary line breaks.",
|
|
151
|
+
)
|
|
152
|
+
|
|
153
|
+
parser.add_argument(
|
|
154
|
+
"--convert-as-inline",
|
|
155
|
+
action="store_true",
|
|
156
|
+
help="Treat the content as inline elements (no block elements like paragraphs).",
|
|
157
|
+
)
|
|
158
|
+
|
|
159
|
+
parser.add_argument(
|
|
160
|
+
"--no-extract-metadata",
|
|
161
|
+
dest="extract_metadata",
|
|
162
|
+
action="store_false",
|
|
163
|
+
help="Disable extraction of document metadata (title, meta tags) as a comment header.",
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
parser.add_argument(
|
|
167
|
+
"--highlight-style",
|
|
168
|
+
default=DOUBLE_EQUAL,
|
|
169
|
+
choices=("double-equal", "html", "bold"),
|
|
170
|
+
help="Style to use for highlighted text (mark elements). Defaults to 'double-equal'.",
|
|
171
|
+
)
|
|
131
172
|
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
autolinks=args.autolinks,
|
|
137
|
-
default_title=args.default_title,
|
|
138
|
-
heading_style=args.heading_style,
|
|
139
|
-
bullets=args.bullets,
|
|
140
|
-
strong_em_symbol=args.strong_em_symbol,
|
|
141
|
-
sub_symbol=args.sub_symbol,
|
|
142
|
-
sup_symbol=args.sup_symbol,
|
|
143
|
-
newline_style=args.newline_style,
|
|
144
|
-
code_language=args.code_language,
|
|
145
|
-
escape_asterisks=args.escape_asterisks,
|
|
146
|
-
escape_underscores=args.escape_underscores,
|
|
147
|
-
keep_inline_images_in=args.keep_inline_images_in,
|
|
148
|
-
wrap=args.wrap,
|
|
149
|
-
wrap_width=args.wrap_width,
|
|
173
|
+
parser.add_argument(
|
|
174
|
+
"--stream-processing",
|
|
175
|
+
action="store_true",
|
|
176
|
+
help="Use streaming processing for large documents to reduce memory usage.",
|
|
150
177
|
)
|
|
178
|
+
|
|
179
|
+
parser.add_argument(
|
|
180
|
+
"--chunk-size",
|
|
181
|
+
type=int,
|
|
182
|
+
default=1024,
|
|
183
|
+
help="Size of chunks when using streaming processing. Defaults to 1024 characters.",
|
|
184
|
+
)
|
|
185
|
+
|
|
186
|
+
parser.add_argument(
|
|
187
|
+
"--show-progress",
|
|
188
|
+
action="store_true",
|
|
189
|
+
help="Show progress information when processing large documents.",
|
|
190
|
+
)
|
|
191
|
+
|
|
192
|
+
args = parser.parse_args(argv)
|
|
193
|
+
|
|
194
|
+
# Prepare base arguments
|
|
195
|
+
base_args = {
|
|
196
|
+
"strip": args.strip,
|
|
197
|
+
"convert": args.convert,
|
|
198
|
+
"autolinks": args.autolinks,
|
|
199
|
+
"default_title": args.default_title,
|
|
200
|
+
"heading_style": args.heading_style,
|
|
201
|
+
"bullets": args.bullets,
|
|
202
|
+
"strong_em_symbol": args.strong_em_symbol,
|
|
203
|
+
"sub_symbol": args.sub_symbol,
|
|
204
|
+
"sup_symbol": args.sup_symbol,
|
|
205
|
+
"newline_style": args.newline_style,
|
|
206
|
+
"code_language": args.code_language,
|
|
207
|
+
"escape_asterisks": args.escape_asterisks,
|
|
208
|
+
"escape_underscores": args.escape_underscores,
|
|
209
|
+
"escape_misc": args.escape_misc,
|
|
210
|
+
"keep_inline_images_in": args.keep_inline_images_in,
|
|
211
|
+
"wrap": args.wrap,
|
|
212
|
+
"wrap_width": args.wrap_width,
|
|
213
|
+
"strip_newlines": args.strip_newlines,
|
|
214
|
+
"convert_as_inline": args.convert_as_inline,
|
|
215
|
+
"extract_metadata": args.extract_metadata,
|
|
216
|
+
"highlight_style": args.highlight_style,
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
# Add streaming parameters only if streaming is enabled
|
|
220
|
+
if args.stream_processing:
|
|
221
|
+
base_args["stream_processing"] = True
|
|
222
|
+
base_args["chunk_size"] = args.chunk_size
|
|
223
|
+
|
|
224
|
+
# Progress callback for CLI
|
|
225
|
+
if args.show_progress:
|
|
226
|
+
|
|
227
|
+
def progress_callback(processed: int, total: int) -> None:
|
|
228
|
+
if total > 0:
|
|
229
|
+
percent = (processed / total) * 100
|
|
230
|
+
# Use sys.stderr to avoid ruff T201 error for progress output
|
|
231
|
+
sys.stderr.write(f"\rProgress: {percent:.1f}% ({processed}/{total} bytes)")
|
|
232
|
+
sys.stderr.flush()
|
|
233
|
+
|
|
234
|
+
base_args["progress_callback"] = progress_callback
|
|
235
|
+
|
|
236
|
+
return convert_to_markdown(args.html.read(), **base_args)
|
html_to_markdown/constants.py
CHANGED