html-to-markdown 1.3.3__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

@@ -1,5 +1,6 @@
1
- from html_to_markdown.processing import convert_to_markdown
1
+ from html_to_markdown.processing import convert_to_markdown, convert_to_markdown_stream
2
2
 
3
+ # For backward compatibility and to maintain the existing API
3
4
  markdownify = convert_to_markdown
4
5
 
5
- __all__ = ["convert_to_markdown", "markdownify"]
6
+ __all__ = ["convert_to_markdown", "convert_to_markdown_stream", "markdownify"]
@@ -1,8 +1,10 @@
1
1
  import sys
2
2
 
3
- def cli():
4
- from html_to_markdown.cli import main
3
+ from html_to_markdown.cli import main
5
4
 
5
+
6
+ def cli() -> None:
7
+ """Main CLI entrypoint."""
6
8
  try:
7
9
  result = main(sys.argv[1:])
8
10
  print(result) # noqa: T201
@@ -10,5 +12,6 @@ def cli():
10
12
  print(str(e), file=sys.stderr) # noqa: T201
11
13
  sys.exit(1)
12
14
 
15
+
13
16
  if __name__ == "__main__":
14
17
  cli()
html_to_markdown/cli.py CHANGED
@@ -1,11 +1,21 @@
1
- def main(argv: list[str]) -> str:
2
- """Command-line entry point."""
3
- from argparse import ArgumentParser, FileType
4
- from sys import stdin
1
+ import sys
2
+ from argparse import ArgumentParser, FileType
3
+
4
+ from html_to_markdown.constants import (
5
+ ASTERISK,
6
+ ATX,
7
+ ATX_CLOSED,
8
+ BACKSLASH,
9
+ DOUBLE_EQUAL,
10
+ SPACES,
11
+ UNDERLINED,
12
+ UNDERSCORE,
13
+ )
14
+ from html_to_markdown.processing import convert_to_markdown
5
15
 
6
- from html_to_markdown.constants import ASTERISK, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERLINED, UNDERSCORE
7
- from html_to_markdown.processing import convert_to_markdown
8
16
 
17
+ def main(argv: list[str]) -> str:
18
+ """Command-line entry point."""
9
19
  parser = ArgumentParser(
10
20
  prog="html_to_markdown",
11
21
  description="Converts HTML to Markdown.",
@@ -15,7 +25,7 @@ def main(argv: list[str]) -> str:
15
25
  "html",
16
26
  nargs="?",
17
27
  type=FileType("r"),
18
- default=stdin,
28
+ default=sys.stdin,
19
29
  help="The HTML file to convert. Defaults to STDIN if not provided.",
20
30
  )
21
31
 
@@ -42,8 +52,8 @@ def main(argv: list[str]) -> str:
42
52
 
43
53
  parser.add_argument(
44
54
  "--default-title",
45
- action="store_false",
46
- help="Use this flag to disable setting the link title to its href when no title is provided.",
55
+ action="store_true",
56
+ help="Set the link title to its href when no title is provided.",
47
57
  )
48
58
 
49
59
  parser.add_argument(
@@ -106,6 +116,13 @@ def main(argv: list[str]) -> str:
106
116
  help="Disable escaping of '_' characters in text to '\\_'.",
107
117
  )
108
118
 
119
+ parser.add_argument(
120
+ "--no-escape-misc",
121
+ dest="escape_misc",
122
+ action="store_false",
123
+ help="Disable escaping of miscellaneous characters to prevent conflicts in Markdown.",
124
+ )
125
+
109
126
  parser.add_argument(
110
127
  "-i",
111
128
  "--keep-inline-images-in",
@@ -127,24 +144,93 @@ def main(argv: list[str]) -> str:
127
144
  help="The number of characters at which text paragraphs should wrap. Defaults to 80.",
128
145
  )
129
146
 
130
- args = parser.parse_args(argv)
147
+ parser.add_argument(
148
+ "--strip-newlines",
149
+ action="store_true",
150
+ help="Remove newlines from HTML input before processing. This helps flatten janky output from HTML with unnecessary line breaks.",
151
+ )
152
+
153
+ parser.add_argument(
154
+ "--convert-as-inline",
155
+ action="store_true",
156
+ help="Treat the content as inline elements (no block elements like paragraphs).",
157
+ )
158
+
159
+ parser.add_argument(
160
+ "--no-extract-metadata",
161
+ dest="extract_metadata",
162
+ action="store_false",
163
+ help="Disable extraction of document metadata (title, meta tags) as a comment header.",
164
+ )
165
+
166
+ parser.add_argument(
167
+ "--highlight-style",
168
+ default=DOUBLE_EQUAL,
169
+ choices=("double-equal", "html", "bold"),
170
+ help="Style to use for highlighted text (mark elements). Defaults to 'double-equal'.",
171
+ )
131
172
 
132
- return convert_to_markdown(
133
- args.html.read(),
134
- strip=args.strip,
135
- convert=args.convert,
136
- autolinks=args.autolinks,
137
- default_title=args.default_title,
138
- heading_style=args.heading_style,
139
- bullets=args.bullets,
140
- strong_em_symbol=args.strong_em_symbol,
141
- sub_symbol=args.sub_symbol,
142
- sup_symbol=args.sup_symbol,
143
- newline_style=args.newline_style,
144
- code_language=args.code_language,
145
- escape_asterisks=args.escape_asterisks,
146
- escape_underscores=args.escape_underscores,
147
- keep_inline_images_in=args.keep_inline_images_in,
148
- wrap=args.wrap,
149
- wrap_width=args.wrap_width,
173
+ parser.add_argument(
174
+ "--stream-processing",
175
+ action="store_true",
176
+ help="Use streaming processing for large documents to reduce memory usage.",
150
177
  )
178
+
179
+ parser.add_argument(
180
+ "--chunk-size",
181
+ type=int,
182
+ default=1024,
183
+ help="Size of chunks when using streaming processing. Defaults to 1024 characters.",
184
+ )
185
+
186
+ parser.add_argument(
187
+ "--show-progress",
188
+ action="store_true",
189
+ help="Show progress information when processing large documents.",
190
+ )
191
+
192
+ args = parser.parse_args(argv)
193
+
194
+ # Prepare base arguments
195
+ base_args = {
196
+ "strip": args.strip,
197
+ "convert": args.convert,
198
+ "autolinks": args.autolinks,
199
+ "default_title": args.default_title,
200
+ "heading_style": args.heading_style,
201
+ "bullets": args.bullets,
202
+ "strong_em_symbol": args.strong_em_symbol,
203
+ "sub_symbol": args.sub_symbol,
204
+ "sup_symbol": args.sup_symbol,
205
+ "newline_style": args.newline_style,
206
+ "code_language": args.code_language,
207
+ "escape_asterisks": args.escape_asterisks,
208
+ "escape_underscores": args.escape_underscores,
209
+ "escape_misc": args.escape_misc,
210
+ "keep_inline_images_in": args.keep_inline_images_in,
211
+ "wrap": args.wrap,
212
+ "wrap_width": args.wrap_width,
213
+ "strip_newlines": args.strip_newlines,
214
+ "convert_as_inline": args.convert_as_inline,
215
+ "extract_metadata": args.extract_metadata,
216
+ "highlight_style": args.highlight_style,
217
+ }
218
+
219
+ # Add streaming parameters only if streaming is enabled
220
+ if args.stream_processing:
221
+ base_args["stream_processing"] = True
222
+ base_args["chunk_size"] = args.chunk_size
223
+
224
+ # Progress callback for CLI
225
+ if args.show_progress:
226
+
227
+ def progress_callback(processed: int, total: int) -> None:
228
+ if total > 0:
229
+ percent = (processed / total) * 100
230
+ # Use sys.stderr to avoid ruff T201 error for progress output
231
+ sys.stderr.write(f"\rProgress: {percent:.1f}% ({processed}/{total} bytes)")
232
+ sys.stderr.flush()
233
+
234
+ base_args["progress_callback"] = progress_callback
235
+
236
+ return convert_to_markdown(args.html.read(), **base_args)
@@ -16,3 +16,4 @@ BACKSLASH: Final = "backslash"
16
16
  UNDERLINED: Final = "underlined"
17
17
  SPACES: Final = "spaces"
18
18
  UNDERSCORE: Final = "_"
19
+ DOUBLE_EQUAL: Final = "double-equal"