html-to-markdown 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

@@ -4,7 +4,6 @@ from html_to_markdown.cli import main
4
4
 
5
5
 
6
6
  def cli() -> None:
7
- """Main CLI entrypoint."""
8
7
  try:
9
8
  result = main(sys.argv[1:])
10
9
  print(result) # noqa: T201
html_to_markdown/cli.py CHANGED
@@ -10,15 +10,16 @@ from html_to_markdown.constants import (
10
10
  SPACES,
11
11
  UNDERLINED,
12
12
  UNDERSCORE,
13
+ WHITESPACE_NORMALIZED,
14
+ WHITESPACE_STRICT,
13
15
  )
14
16
  from html_to_markdown.processing import convert_to_markdown
15
17
 
16
18
 
17
19
  def main(argv: list[str]) -> str:
18
- """Command-line entry point."""
19
20
  parser = ArgumentParser(
20
21
  prog="html_to_markdown",
21
- description="Converts HTML to Markdown.",
22
+ description="Convert HTML to Markdown with comprehensive customization options.",
22
23
  )
23
24
 
24
25
  parser.add_argument(
@@ -33,188 +34,243 @@ def main(argv: list[str]) -> str:
33
34
  "-s",
34
35
  "--strip",
35
36
  nargs="*",
36
- help="A list of tags to strip from the conversion. Incompatible with the --convert option.",
37
+ help="HTML tags to remove from output. Incompatible with --convert.",
37
38
  )
38
39
 
39
40
  parser.add_argument(
40
41
  "-c",
41
42
  "--convert",
42
43
  nargs="*",
43
- help="A list of HTML tags to explicitly convert. Incompatible with the --strip option.",
44
+ help="HTML tags to convert (only these will be processed). Incompatible with --strip.",
44
45
  )
45
46
 
46
47
  parser.add_argument(
47
48
  "-a",
48
49
  "--autolinks",
49
50
  action="store_true",
50
- help="Automatically convert anchor links where the content matches the href.",
51
+ help="Convert URLs to automatic links when text matches href.",
51
52
  )
52
53
 
53
54
  parser.add_argument(
54
55
  "--default-title",
55
56
  action="store_true",
56
- help="Set the link title to its href when no title is provided.",
57
+ help="Use href as link title when no title is provided.",
57
58
  )
58
59
 
59
60
  parser.add_argument(
60
61
  "--heading-style",
61
62
  default=UNDERLINED,
62
63
  choices=(ATX, ATX_CLOSED, UNDERLINED),
63
- help="Defines the heading conversion style: 'atx', 'atx_closed', or 'underlined'. Defaults to 'underlined'.",
64
+ help="Header style: 'atx' (#), 'atx_closed' (# #), or 'underlined' (===). Default: underlined.",
64
65
  )
65
66
 
66
67
  parser.add_argument(
67
68
  "-b",
68
69
  "--bullets",
69
70
  default="*+-",
70
- help="A string of bullet styles to use for list items. The style alternates based on nesting level. Defaults to '*+-'.",
71
+ help="Characters for bullet points, alternates by nesting level. Default: '*+-'.",
71
72
  )
72
73
 
73
74
  parser.add_argument(
74
75
  "--strong-em-symbol",
75
76
  default=ASTERISK,
76
77
  choices=(ASTERISK, UNDERSCORE),
77
- help="Choose between '*' or '_' for strong and emphasized text. Defaults to '*'.",
78
+ help="Symbol for bold/italic text: '*' or '_'. Default: '*'.",
78
79
  )
79
80
 
80
81
  parser.add_argument(
81
82
  "--sub-symbol",
82
83
  default="",
83
- help="Define the characters used to surround <sub> text. Defaults to empty.",
84
+ help="Characters to surround subscript text. Default: none.",
84
85
  )
85
86
 
86
87
  parser.add_argument(
87
88
  "--sup-symbol",
88
89
  default="",
89
- help="Define the characters used to surround <sup> text. Defaults to empty.",
90
+ help="Characters to surround superscript text. Default: none.",
90
91
  )
91
92
 
92
93
  parser.add_argument(
93
94
  "--newline-style",
94
95
  default=SPACES,
95
96
  choices=(SPACES, BACKSLASH),
96
- help="Specify the <br> conversion style: two spaces (default) or a backslash at the end of the line.",
97
+ help="Line break style: 'spaces' (two spaces) or 'backslash' (\\). Default: spaces.",
97
98
  )
98
99
 
99
100
  parser.add_argument(
100
101
  "--code-language",
101
102
  default="",
102
- help="Specify the default language for code blocks inside <pre> tags. Defaults to empty.",
103
+ help="Default language for code blocks. Default: none.",
103
104
  )
104
105
 
105
106
  parser.add_argument(
106
107
  "--no-escape-asterisks",
107
108
  dest="escape_asterisks",
108
109
  action="store_false",
109
- help="Disable escaping of '*' characters in text to '\\*'.",
110
+ help="Don't escape asterisk (*) characters.",
110
111
  )
111
112
 
112
113
  parser.add_argument(
113
114
  "--no-escape-underscores",
114
115
  dest="escape_underscores",
115
116
  action="store_false",
116
- help="Disable escaping of '_' characters in text to '\\_'.",
117
+ help="Don't escape underscore (_) characters.",
117
118
  )
118
119
 
119
120
  parser.add_argument(
120
121
  "--no-escape-misc",
121
122
  dest="escape_misc",
122
123
  action="store_false",
123
- help="Disable escaping of miscellaneous characters to prevent conflicts in Markdown.",
124
+ help="Don't escape other special Markdown characters.",
124
125
  )
125
126
 
126
127
  parser.add_argument(
127
128
  "-i",
128
129
  "--keep-inline-images-in",
129
130
  nargs="*",
130
- help="Specify parent tags where inline images should be preserved as images, rather than converted to alt-text. Defaults to None.",
131
+ help="Parent tags where images remain inline (not converted to alt-text).",
131
132
  )
132
133
 
133
- parser.add_argument(
134
- "-w",
135
- "--wrap",
136
- action="store_true",
137
- help="Enable word wrapping for paragraphs at --wrap-width characters.",
138
- )
134
+ parser.add_argument("-w", "--wrap", action="store_true", help="Enable text wrapping at --wrap-width characters.")
139
135
 
140
136
  parser.add_argument(
141
137
  "--wrap-width",
142
138
  type=int,
143
139
  default=80,
144
- help="The number of characters at which text paragraphs should wrap. Defaults to 80.",
140
+ help="Column width for text wrapping. Default: 80.",
145
141
  )
146
142
 
147
143
  parser.add_argument(
148
144
  "--strip-newlines",
149
145
  action="store_true",
150
- help="Remove newlines from HTML input before processing. This helps flatten janky output from HTML with unnecessary line breaks.",
146
+ help="Remove newlines from HTML input (helps with messy HTML formatting).",
151
147
  )
152
148
 
153
149
  parser.add_argument(
154
150
  "--convert-as-inline",
155
151
  action="store_true",
156
- help="Treat the content as inline elements (no block elements like paragraphs).",
152
+ help="Treat all content as inline elements (no paragraph breaks).",
157
153
  )
158
154
 
159
155
  parser.add_argument(
160
156
  "--no-extract-metadata",
161
157
  dest="extract_metadata",
162
158
  action="store_false",
163
- help="Disable extraction of document metadata (title, meta tags) as a comment header.",
159
+ help="Don't extract metadata (title, meta tags) as comment header.",
164
160
  )
165
161
 
166
162
  parser.add_argument(
167
163
  "--highlight-style",
168
164
  default=DOUBLE_EQUAL,
169
165
  choices=("double-equal", "html", "bold"),
170
- help="Style to use for highlighted text (mark elements). Defaults to 'double-equal'.",
166
+ help="Highlighting style: 'double-equal' (==), 'html' (<mark>), or 'bold' (**). Default: double-equal.",
171
167
  )
172
168
 
173
169
  parser.add_argument(
174
170
  "--stream-processing",
175
171
  action="store_true",
176
- help="Use streaming processing for large documents to reduce memory usage.",
172
+ help="Process large documents in chunks to reduce memory usage.",
177
173
  )
178
174
 
179
175
  parser.add_argument(
180
176
  "--chunk-size",
181
177
  type=int,
182
178
  default=1024,
183
- help="Size of chunks when using streaming processing. Defaults to 1024 characters.",
179
+ help="Chunk size for streaming processing. Default: 1024 characters.",
180
+ )
181
+
182
+ parser.add_argument("--show-progress", action="store_true", help="Show progress bar for large documents.")
183
+
184
+ parser.add_argument(
185
+ "--parser",
186
+ choices=("html.parser", "lxml", "html5lib"),
187
+ help="HTML parser: 'lxml', 'html.parser', or 'html5lib'. Default: auto-detect.",
188
+ )
189
+
190
+ parser.add_argument(
191
+ "--list-indent-type",
192
+ default="spaces",
193
+ choices=("spaces", "tabs"),
194
+ help="List indentation: 'spaces' or 'tabs'. Default: spaces.",
195
+ )
196
+
197
+ parser.add_argument(
198
+ "--list-indent-width",
199
+ type=int,
200
+ default=4,
201
+ help="Spaces per list indent level (use 2 for Discord/Slack). Default: 4.",
184
202
  )
185
203
 
186
204
  parser.add_argument(
187
- "--show-progress",
205
+ "--whitespace-mode",
206
+ default=WHITESPACE_NORMALIZED,
207
+ choices=(WHITESPACE_NORMALIZED, WHITESPACE_STRICT),
208
+ help="Whitespace handling: 'normalized' (clean) or 'strict' (preserve). Default: normalized.",
209
+ )
210
+
211
+ parser.add_argument(
212
+ "--preprocess-html",
188
213
  action="store_true",
189
- help="Show progress information when processing large documents.",
214
+ help="Clean messy HTML (removes navigation, ads, forms, etc).",
215
+ )
216
+
217
+ parser.add_argument(
218
+ "--preprocessing-preset",
219
+ default="standard",
220
+ choices=("minimal", "standard", "aggressive"),
221
+ help="Cleaning level: 'minimal', 'standard', or 'aggressive'. Default: standard.",
222
+ )
223
+
224
+ parser.add_argument(
225
+ "--no-remove-forms",
226
+ dest="remove_forms",
227
+ action="store_false",
228
+ help="Keep form elements when preprocessing (normally removed).",
229
+ )
230
+
231
+ parser.add_argument(
232
+ "--no-remove-navigation",
233
+ dest="remove_navigation",
234
+ action="store_false",
235
+ help="Keep navigation elements when preprocessing (normally removed).",
190
236
  )
191
237
 
192
238
  args = parser.parse_args(argv)
193
239
 
194
240
  base_args = {
195
- "strip": args.strip,
196
- "convert": args.convert,
197
241
  "autolinks": args.autolinks,
198
- "default_title": args.default_title,
199
- "heading_style": args.heading_style,
200
242
  "bullets": args.bullets,
201
- "strong_em_symbol": args.strong_em_symbol,
202
- "sub_symbol": args.sub_symbol,
203
- "sup_symbol": args.sup_symbol,
204
- "newline_style": args.newline_style,
205
243
  "code_language": args.code_language,
244
+ "convert": args.convert,
245
+ "convert_as_inline": args.convert_as_inline,
246
+ "default_title": args.default_title,
206
247
  "escape_asterisks": args.escape_asterisks,
207
- "escape_underscores": args.escape_underscores,
208
248
  "escape_misc": args.escape_misc,
249
+ "escape_underscores": args.escape_underscores,
250
+ "extract_metadata": args.extract_metadata,
251
+ "heading_style": args.heading_style,
252
+ "highlight_style": args.highlight_style,
209
253
  "keep_inline_images_in": args.keep_inline_images_in,
254
+ "list_indent_type": args.list_indent_type,
255
+ "list_indent_width": args.list_indent_width,
256
+ "newline_style": args.newline_style,
257
+ "preprocess_html": args.preprocess_html,
258
+ "preprocessing_preset": args.preprocessing_preset,
259
+ "remove_forms": args.remove_forms,
260
+ "remove_navigation": args.remove_navigation,
261
+ "strip": args.strip,
262
+ "strip_newlines": args.strip_newlines,
263
+ "strong_em_symbol": args.strong_em_symbol,
264
+ "sub_symbol": args.sub_symbol,
265
+ "sup_symbol": args.sup_symbol,
266
+ "whitespace_mode": args.whitespace_mode,
210
267
  "wrap": args.wrap,
211
268
  "wrap_width": args.wrap_width,
212
- "strip_newlines": args.strip_newlines,
213
- "convert_as_inline": args.convert_as_inline,
214
- "extract_metadata": args.extract_metadata,
215
- "highlight_style": args.highlight_style,
216
269
  }
217
270
 
271
+ if args.parser:
272
+ base_args["parser"] = args.parser
273
+
218
274
  if args.stream_processing:
219
275
  base_args["stream_processing"] = True
220
276
  base_args["chunk_size"] = args.chunk_size
@@ -17,3 +17,6 @@ UNDERLINED: Final = "underlined"
17
17
  SPACES: Final = "spaces"
18
18
  UNDERSCORE: Final = "_"
19
19
  DOUBLE_EQUAL: Final = "double-equal"
20
+
21
+ WHITESPACE_NORMALIZED: Final = "normalized"
22
+ WHITESPACE_STRICT: Final = "strict"