html-to-markdown 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- html_to_markdown/__main__.py +0 -1
- html_to_markdown/cli.py +101 -45
- html_to_markdown/constants.py +3 -0
- html_to_markdown/converters.py +52 -573
- html_to_markdown/exceptions.py +1 -11
- html_to_markdown/preprocessor.py +0 -37
- html_to_markdown/processing.py +104 -202
- html_to_markdown/utils.py +2 -42
- html_to_markdown/whitespace.py +292 -0
- {html_to_markdown-1.9.0.dist-info → html_to_markdown-1.10.0.dist-info}/METADATA +204 -204
- html_to_markdown-1.10.0.dist-info/RECORD +17 -0
- html_to_markdown-1.9.0.dist-info/RECORD +0 -16
- {html_to_markdown-1.9.0.dist-info → html_to_markdown-1.10.0.dist-info}/WHEEL +0 -0
- {html_to_markdown-1.9.0.dist-info → html_to_markdown-1.10.0.dist-info}/entry_points.txt +0 -0
- {html_to_markdown-1.9.0.dist-info → html_to_markdown-1.10.0.dist-info}/licenses/LICENSE +0 -0
- {html_to_markdown-1.9.0.dist-info → html_to_markdown-1.10.0.dist-info}/top_level.txt +0 -0
html_to_markdown/__main__.py
CHANGED
html_to_markdown/cli.py
CHANGED
|
@@ -10,15 +10,16 @@ from html_to_markdown.constants import (
|
|
|
10
10
|
SPACES,
|
|
11
11
|
UNDERLINED,
|
|
12
12
|
UNDERSCORE,
|
|
13
|
+
WHITESPACE_NORMALIZED,
|
|
14
|
+
WHITESPACE_STRICT,
|
|
13
15
|
)
|
|
14
16
|
from html_to_markdown.processing import convert_to_markdown
|
|
15
17
|
|
|
16
18
|
|
|
17
19
|
def main(argv: list[str]) -> str:
|
|
18
|
-
"""Command-line entry point."""
|
|
19
20
|
parser = ArgumentParser(
|
|
20
21
|
prog="html_to_markdown",
|
|
21
|
-
description="
|
|
22
|
+
description="Convert HTML to Markdown with comprehensive customization options.",
|
|
22
23
|
)
|
|
23
24
|
|
|
24
25
|
parser.add_argument(
|
|
@@ -33,188 +34,243 @@ def main(argv: list[str]) -> str:
|
|
|
33
34
|
"-s",
|
|
34
35
|
"--strip",
|
|
35
36
|
nargs="*",
|
|
36
|
-
help="
|
|
37
|
+
help="HTML tags to remove from output. Incompatible with --convert.",
|
|
37
38
|
)
|
|
38
39
|
|
|
39
40
|
parser.add_argument(
|
|
40
41
|
"-c",
|
|
41
42
|
"--convert",
|
|
42
43
|
nargs="*",
|
|
43
|
-
help="
|
|
44
|
+
help="HTML tags to convert (only these will be processed). Incompatible with --strip.",
|
|
44
45
|
)
|
|
45
46
|
|
|
46
47
|
parser.add_argument(
|
|
47
48
|
"-a",
|
|
48
49
|
"--autolinks",
|
|
49
50
|
action="store_true",
|
|
50
|
-
help="
|
|
51
|
+
help="Convert URLs to automatic links when text matches href.",
|
|
51
52
|
)
|
|
52
53
|
|
|
53
54
|
parser.add_argument(
|
|
54
55
|
"--default-title",
|
|
55
56
|
action="store_true",
|
|
56
|
-
help="
|
|
57
|
+
help="Use href as link title when no title is provided.",
|
|
57
58
|
)
|
|
58
59
|
|
|
59
60
|
parser.add_argument(
|
|
60
61
|
"--heading-style",
|
|
61
62
|
default=UNDERLINED,
|
|
62
63
|
choices=(ATX, ATX_CLOSED, UNDERLINED),
|
|
63
|
-
help="
|
|
64
|
+
help="Header style: 'atx' (#), 'atx_closed' (# #), or 'underlined' (===). Default: underlined.",
|
|
64
65
|
)
|
|
65
66
|
|
|
66
67
|
parser.add_argument(
|
|
67
68
|
"-b",
|
|
68
69
|
"--bullets",
|
|
69
70
|
default="*+-",
|
|
70
|
-
help="
|
|
71
|
+
help="Characters for bullet points, alternates by nesting level. Default: '*+-'.",
|
|
71
72
|
)
|
|
72
73
|
|
|
73
74
|
parser.add_argument(
|
|
74
75
|
"--strong-em-symbol",
|
|
75
76
|
default=ASTERISK,
|
|
76
77
|
choices=(ASTERISK, UNDERSCORE),
|
|
77
|
-
help="
|
|
78
|
+
help="Symbol for bold/italic text: '*' or '_'. Default: '*'.",
|
|
78
79
|
)
|
|
79
80
|
|
|
80
81
|
parser.add_argument(
|
|
81
82
|
"--sub-symbol",
|
|
82
83
|
default="",
|
|
83
|
-
help="
|
|
84
|
+
help="Characters to surround subscript text. Default: none.",
|
|
84
85
|
)
|
|
85
86
|
|
|
86
87
|
parser.add_argument(
|
|
87
88
|
"--sup-symbol",
|
|
88
89
|
default="",
|
|
89
|
-
help="
|
|
90
|
+
help="Characters to surround superscript text. Default: none.",
|
|
90
91
|
)
|
|
91
92
|
|
|
92
93
|
parser.add_argument(
|
|
93
94
|
"--newline-style",
|
|
94
95
|
default=SPACES,
|
|
95
96
|
choices=(SPACES, BACKSLASH),
|
|
96
|
-
help="
|
|
97
|
+
help="Line break style: 'spaces' (two spaces) or 'backslash' (\\). Default: spaces.",
|
|
97
98
|
)
|
|
98
99
|
|
|
99
100
|
parser.add_argument(
|
|
100
101
|
"--code-language",
|
|
101
102
|
default="",
|
|
102
|
-
help="
|
|
103
|
+
help="Default language for code blocks. Default: none.",
|
|
103
104
|
)
|
|
104
105
|
|
|
105
106
|
parser.add_argument(
|
|
106
107
|
"--no-escape-asterisks",
|
|
107
108
|
dest="escape_asterisks",
|
|
108
109
|
action="store_false",
|
|
109
|
-
help="
|
|
110
|
+
help="Don't escape asterisk (*) characters.",
|
|
110
111
|
)
|
|
111
112
|
|
|
112
113
|
parser.add_argument(
|
|
113
114
|
"--no-escape-underscores",
|
|
114
115
|
dest="escape_underscores",
|
|
115
116
|
action="store_false",
|
|
116
|
-
help="
|
|
117
|
+
help="Don't escape underscore (_) characters.",
|
|
117
118
|
)
|
|
118
119
|
|
|
119
120
|
parser.add_argument(
|
|
120
121
|
"--no-escape-misc",
|
|
121
122
|
dest="escape_misc",
|
|
122
123
|
action="store_false",
|
|
123
|
-
help="
|
|
124
|
+
help="Don't escape other special Markdown characters.",
|
|
124
125
|
)
|
|
125
126
|
|
|
126
127
|
parser.add_argument(
|
|
127
128
|
"-i",
|
|
128
129
|
"--keep-inline-images-in",
|
|
129
130
|
nargs="*",
|
|
130
|
-
help="
|
|
131
|
+
help="Parent tags where images remain inline (not converted to alt-text).",
|
|
131
132
|
)
|
|
132
133
|
|
|
133
|
-
parser.add_argument(
|
|
134
|
-
"-w",
|
|
135
|
-
"--wrap",
|
|
136
|
-
action="store_true",
|
|
137
|
-
help="Enable word wrapping for paragraphs at --wrap-width characters.",
|
|
138
|
-
)
|
|
134
|
+
parser.add_argument("-w", "--wrap", action="store_true", help="Enable text wrapping at --wrap-width characters.")
|
|
139
135
|
|
|
140
136
|
parser.add_argument(
|
|
141
137
|
"--wrap-width",
|
|
142
138
|
type=int,
|
|
143
139
|
default=80,
|
|
144
|
-
help="
|
|
140
|
+
help="Column width for text wrapping. Default: 80.",
|
|
145
141
|
)
|
|
146
142
|
|
|
147
143
|
parser.add_argument(
|
|
148
144
|
"--strip-newlines",
|
|
149
145
|
action="store_true",
|
|
150
|
-
help="Remove newlines from HTML input
|
|
146
|
+
help="Remove newlines from HTML input (helps with messy HTML formatting).",
|
|
151
147
|
)
|
|
152
148
|
|
|
153
149
|
parser.add_argument(
|
|
154
150
|
"--convert-as-inline",
|
|
155
151
|
action="store_true",
|
|
156
|
-
help="Treat
|
|
152
|
+
help="Treat all content as inline elements (no paragraph breaks).",
|
|
157
153
|
)
|
|
158
154
|
|
|
159
155
|
parser.add_argument(
|
|
160
156
|
"--no-extract-metadata",
|
|
161
157
|
dest="extract_metadata",
|
|
162
158
|
action="store_false",
|
|
163
|
-
help="
|
|
159
|
+
help="Don't extract metadata (title, meta tags) as comment header.",
|
|
164
160
|
)
|
|
165
161
|
|
|
166
162
|
parser.add_argument(
|
|
167
163
|
"--highlight-style",
|
|
168
164
|
default=DOUBLE_EQUAL,
|
|
169
165
|
choices=("double-equal", "html", "bold"),
|
|
170
|
-
help="
|
|
166
|
+
help="Highlighting style: 'double-equal' (==), 'html' (<mark>), or 'bold' (**). Default: double-equal.",
|
|
171
167
|
)
|
|
172
168
|
|
|
173
169
|
parser.add_argument(
|
|
174
170
|
"--stream-processing",
|
|
175
171
|
action="store_true",
|
|
176
|
-
help="
|
|
172
|
+
help="Process large documents in chunks to reduce memory usage.",
|
|
177
173
|
)
|
|
178
174
|
|
|
179
175
|
parser.add_argument(
|
|
180
176
|
"--chunk-size",
|
|
181
177
|
type=int,
|
|
182
178
|
default=1024,
|
|
183
|
-
help="
|
|
179
|
+
help="Chunk size for streaming processing. Default: 1024 characters.",
|
|
180
|
+
)
|
|
181
|
+
|
|
182
|
+
parser.add_argument("--show-progress", action="store_true", help="Show progress bar for large documents.")
|
|
183
|
+
|
|
184
|
+
parser.add_argument(
|
|
185
|
+
"--parser",
|
|
186
|
+
choices=("html.parser", "lxml", "html5lib"),
|
|
187
|
+
help="HTML parser: 'lxml', 'html.parser', or 'html5lib'. Default: auto-detect.",
|
|
188
|
+
)
|
|
189
|
+
|
|
190
|
+
parser.add_argument(
|
|
191
|
+
"--list-indent-type",
|
|
192
|
+
default="spaces",
|
|
193
|
+
choices=("spaces", "tabs"),
|
|
194
|
+
help="List indentation: 'spaces' or 'tabs'. Default: spaces.",
|
|
195
|
+
)
|
|
196
|
+
|
|
197
|
+
parser.add_argument(
|
|
198
|
+
"--list-indent-width",
|
|
199
|
+
type=int,
|
|
200
|
+
default=4,
|
|
201
|
+
help="Spaces per list indent level (use 2 for Discord/Slack). Default: 4.",
|
|
184
202
|
)
|
|
185
203
|
|
|
186
204
|
parser.add_argument(
|
|
187
|
-
"--
|
|
205
|
+
"--whitespace-mode",
|
|
206
|
+
default=WHITESPACE_NORMALIZED,
|
|
207
|
+
choices=(WHITESPACE_NORMALIZED, WHITESPACE_STRICT),
|
|
208
|
+
help="Whitespace handling: 'normalized' (clean) or 'strict' (preserve). Default: normalized.",
|
|
209
|
+
)
|
|
210
|
+
|
|
211
|
+
parser.add_argument(
|
|
212
|
+
"--preprocess-html",
|
|
188
213
|
action="store_true",
|
|
189
|
-
help="
|
|
214
|
+
help="Clean messy HTML (removes navigation, ads, forms, etc).",
|
|
215
|
+
)
|
|
216
|
+
|
|
217
|
+
parser.add_argument(
|
|
218
|
+
"--preprocessing-preset",
|
|
219
|
+
default="standard",
|
|
220
|
+
choices=("minimal", "standard", "aggressive"),
|
|
221
|
+
help="Cleaning level: 'minimal', 'standard', or 'aggressive'. Default: standard.",
|
|
222
|
+
)
|
|
223
|
+
|
|
224
|
+
parser.add_argument(
|
|
225
|
+
"--no-remove-forms",
|
|
226
|
+
dest="remove_forms",
|
|
227
|
+
action="store_false",
|
|
228
|
+
help="Keep form elements when preprocessing (normally removed).",
|
|
229
|
+
)
|
|
230
|
+
|
|
231
|
+
parser.add_argument(
|
|
232
|
+
"--no-remove-navigation",
|
|
233
|
+
dest="remove_navigation",
|
|
234
|
+
action="store_false",
|
|
235
|
+
help="Keep navigation elements when preprocessing (normally removed).",
|
|
190
236
|
)
|
|
191
237
|
|
|
192
238
|
args = parser.parse_args(argv)
|
|
193
239
|
|
|
194
240
|
base_args = {
|
|
195
|
-
"strip": args.strip,
|
|
196
|
-
"convert": args.convert,
|
|
197
241
|
"autolinks": args.autolinks,
|
|
198
|
-
"default_title": args.default_title,
|
|
199
|
-
"heading_style": args.heading_style,
|
|
200
242
|
"bullets": args.bullets,
|
|
201
|
-
"strong_em_symbol": args.strong_em_symbol,
|
|
202
|
-
"sub_symbol": args.sub_symbol,
|
|
203
|
-
"sup_symbol": args.sup_symbol,
|
|
204
|
-
"newline_style": args.newline_style,
|
|
205
243
|
"code_language": args.code_language,
|
|
244
|
+
"convert": args.convert,
|
|
245
|
+
"convert_as_inline": args.convert_as_inline,
|
|
246
|
+
"default_title": args.default_title,
|
|
206
247
|
"escape_asterisks": args.escape_asterisks,
|
|
207
|
-
"escape_underscores": args.escape_underscores,
|
|
208
248
|
"escape_misc": args.escape_misc,
|
|
249
|
+
"escape_underscores": args.escape_underscores,
|
|
250
|
+
"extract_metadata": args.extract_metadata,
|
|
251
|
+
"heading_style": args.heading_style,
|
|
252
|
+
"highlight_style": args.highlight_style,
|
|
209
253
|
"keep_inline_images_in": args.keep_inline_images_in,
|
|
254
|
+
"list_indent_type": args.list_indent_type,
|
|
255
|
+
"list_indent_width": args.list_indent_width,
|
|
256
|
+
"newline_style": args.newline_style,
|
|
257
|
+
"preprocess_html": args.preprocess_html,
|
|
258
|
+
"preprocessing_preset": args.preprocessing_preset,
|
|
259
|
+
"remove_forms": args.remove_forms,
|
|
260
|
+
"remove_navigation": args.remove_navigation,
|
|
261
|
+
"strip": args.strip,
|
|
262
|
+
"strip_newlines": args.strip_newlines,
|
|
263
|
+
"strong_em_symbol": args.strong_em_symbol,
|
|
264
|
+
"sub_symbol": args.sub_symbol,
|
|
265
|
+
"sup_symbol": args.sup_symbol,
|
|
266
|
+
"whitespace_mode": args.whitespace_mode,
|
|
210
267
|
"wrap": args.wrap,
|
|
211
268
|
"wrap_width": args.wrap_width,
|
|
212
|
-
"strip_newlines": args.strip_newlines,
|
|
213
|
-
"convert_as_inline": args.convert_as_inline,
|
|
214
|
-
"extract_metadata": args.extract_metadata,
|
|
215
|
-
"highlight_style": args.highlight_style,
|
|
216
269
|
}
|
|
217
270
|
|
|
271
|
+
if args.parser:
|
|
272
|
+
base_args["parser"] = args.parser
|
|
273
|
+
|
|
218
274
|
if args.stream_processing:
|
|
219
275
|
base_args["stream_processing"] = True
|
|
220
276
|
base_args["chunk_size"] = args.chunk_size
|