html-to-markdown 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

html_to_markdown/__init__.py CHANGED
@@ -1,5 +1,6 @@
1
- from html_to_markdown.processing import convert_to_markdown
1
+ from html_to_markdown.processing import convert_to_markdown, convert_to_markdown_stream
2
2
 
3
+ # For backward compatibility and to maintain the existing API
3
4
  markdownify = convert_to_markdown
4
5
 
5
- __all__ = ["convert_to_markdown", "markdownify"]
6
+ __all__ = ["convert_to_markdown", "convert_to_markdown_stream", "markdownify"]
html_to_markdown/cli.py CHANGED
@@ -1,7 +1,16 @@
1
+ import sys
1
2
  from argparse import ArgumentParser, FileType
2
- from sys import stdin
3
3
 
4
- from html_to_markdown.constants import ASTERISK, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERLINED, UNDERSCORE
4
+ from html_to_markdown.constants import (
5
+ ASTERISK,
6
+ ATX,
7
+ ATX_CLOSED,
8
+ BACKSLASH,
9
+ DOUBLE_EQUAL,
10
+ SPACES,
11
+ UNDERLINED,
12
+ UNDERSCORE,
13
+ )
5
14
  from html_to_markdown.processing import convert_to_markdown
6
15
 
7
16
 
@@ -16,7 +25,7 @@ def main(argv: list[str]) -> str:
16
25
  "html",
17
26
  nargs="?",
18
27
  type=FileType("r"),
19
- default=stdin,
28
+ default=sys.stdin,
20
29
  help="The HTML file to convert. Defaults to STDIN if not provided.",
21
30
  )
22
31
 
@@ -43,8 +52,8 @@ def main(argv: list[str]) -> str:
43
52
 
44
53
  parser.add_argument(
45
54
  "--default-title",
46
- action="store_false",
47
- help="Use this flag to disable setting the link title to its href when no title is provided.",
55
+ action="store_true",
56
+ help="Set the link title to its href when no title is provided.",
48
57
  )
49
58
 
50
59
  parser.add_argument(
@@ -107,6 +116,13 @@ def main(argv: list[str]) -> str:
107
116
  help="Disable escaping of '_' characters in text to '\\_'.",
108
117
  )
109
118
 
119
+ parser.add_argument(
120
+ "--no-escape-misc",
121
+ dest="escape_misc",
122
+ action="store_false",
123
+ help="Disable escaping of miscellaneous characters to prevent conflicts in Markdown.",
124
+ )
125
+
110
126
  parser.add_argument(
111
127
  "-i",
112
128
  "--keep-inline-images-in",
@@ -134,25 +150,87 @@ def main(argv: list[str]) -> str:
134
150
  help="Remove newlines from HTML input before processing. This helps flatten janky output from HTML with unnecessary line breaks.",
135
151
  )
136
152
 
137
- args = parser.parse_args(argv)
153
+ parser.add_argument(
154
+ "--convert-as-inline",
155
+ action="store_true",
156
+ help="Treat the content as inline elements (no block elements like paragraphs).",
157
+ )
158
+
159
+ parser.add_argument(
160
+ "--no-extract-metadata",
161
+ dest="extract_metadata",
162
+ action="store_false",
163
+ help="Disable extraction of document metadata (title, meta tags) as a comment header.",
164
+ )
165
+
166
+ parser.add_argument(
167
+ "--highlight-style",
168
+ default=DOUBLE_EQUAL,
169
+ choices=("double-equal", "html", "bold"),
170
+ help="Style to use for highlighted text (mark elements). Defaults to 'double-equal'.",
171
+ )
172
+
173
+ parser.add_argument(
174
+ "--stream-processing",
175
+ action="store_true",
176
+ help="Use streaming processing for large documents to reduce memory usage.",
177
+ )
138
178
 
139
- return convert_to_markdown(
140
- args.html.read(),
141
- strip=args.strip,
142
- convert=args.convert,
143
- autolinks=args.autolinks,
144
- default_title=args.default_title,
145
- heading_style=args.heading_style,
146
- bullets=args.bullets,
147
- strong_em_symbol=args.strong_em_symbol,
148
- sub_symbol=args.sub_symbol,
149
- sup_symbol=args.sup_symbol,
150
- newline_style=args.newline_style,
151
- code_language=args.code_language,
152
- escape_asterisks=args.escape_asterisks,
153
- escape_underscores=args.escape_underscores,
154
- keep_inline_images_in=args.keep_inline_images_in,
155
- wrap=args.wrap,
156
- wrap_width=args.wrap_width,
157
- strip_newlines=args.strip_newlines,
179
+ parser.add_argument(
180
+ "--chunk-size",
181
+ type=int,
182
+ default=1024,
183
+ help="Size of chunks when using streaming processing. Defaults to 1024 characters.",
158
184
  )
185
+
186
+ parser.add_argument(
187
+ "--show-progress",
188
+ action="store_true",
189
+ help="Show progress information when processing large documents.",
190
+ )
191
+
192
+ args = parser.parse_args(argv)
193
+
194
+ # Prepare base arguments
195
+ base_args = {
196
+ "strip": args.strip,
197
+ "convert": args.convert,
198
+ "autolinks": args.autolinks,
199
+ "default_title": args.default_title,
200
+ "heading_style": args.heading_style,
201
+ "bullets": args.bullets,
202
+ "strong_em_symbol": args.strong_em_symbol,
203
+ "sub_symbol": args.sub_symbol,
204
+ "sup_symbol": args.sup_symbol,
205
+ "newline_style": args.newline_style,
206
+ "code_language": args.code_language,
207
+ "escape_asterisks": args.escape_asterisks,
208
+ "escape_underscores": args.escape_underscores,
209
+ "escape_misc": args.escape_misc,
210
+ "keep_inline_images_in": args.keep_inline_images_in,
211
+ "wrap": args.wrap,
212
+ "wrap_width": args.wrap_width,
213
+ "strip_newlines": args.strip_newlines,
214
+ "convert_as_inline": args.convert_as_inline,
215
+ "extract_metadata": args.extract_metadata,
216
+ "highlight_style": args.highlight_style,
217
+ }
218
+
219
+ # Add streaming parameters only if streaming is enabled
220
+ if args.stream_processing:
221
+ base_args["stream_processing"] = True
222
+ base_args["chunk_size"] = args.chunk_size
223
+
224
+ # Progress callback for CLI
225
+ if args.show_progress:
226
+
227
+ def progress_callback(processed: int, total: int) -> None:
228
+ if total > 0:
229
+ percent = (processed / total) * 100
230
+ # Use sys.stderr to avoid ruff T201 error for progress output
231
+ sys.stderr.write(f"\rProgress: {percent:.1f}% ({processed}/{total} bytes)")
232
+ sys.stderr.flush()
233
+
234
+ base_args["progress_callback"] = progress_callback
235
+
236
+ return convert_to_markdown(args.html.read(), **base_args)
html_to_markdown/constants.py CHANGED
@@ -16,3 +16,4 @@ BACKSLASH: Final = "backslash"
16
16
  UNDERLINED: Final = "underlined"
17
17
  SPACES: Final = "spaces"
18
18
  UNDERSCORE: Final = "_"
19
+ DOUBLE_EQUAL: Final = "double-equal"