html-to-markdown 1.6.0__py3-none-any.whl → 1.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- html_to_markdown/__init__.py +3 -1
- html_to_markdown/cli.py +1 -4
- html_to_markdown/converters.py +375 -645
- html_to_markdown/preprocessor.py +407 -0
- html_to_markdown/processing.py +227 -87
- html_to_markdown/utils.py +12 -5
- {html_to_markdown-1.6.0.dist-info → html_to_markdown-1.9.0.dist-info}/METADATA +87 -14
- html_to_markdown-1.9.0.dist-info/RECORD +16 -0
- html_to_markdown-1.6.0.dist-info/RECORD +0 -15
- {html_to_markdown-1.6.0.dist-info → html_to_markdown-1.9.0.dist-info}/WHEEL +0 -0
- {html_to_markdown-1.6.0.dist-info → html_to_markdown-1.9.0.dist-info}/entry_points.txt +0 -0
- {html_to_markdown-1.6.0.dist-info → html_to_markdown-1.9.0.dist-info}/licenses/LICENSE +0 -0
- {html_to_markdown-1.6.0.dist-info → html_to_markdown-1.9.0.dist-info}/top_level.txt +0 -0
html_to_markdown/__init__.py
CHANGED
|
@@ -5,9 +5,9 @@ from html_to_markdown.exceptions import (
|
|
|
5
5
|
InvalidParserError,
|
|
6
6
|
MissingDependencyError,
|
|
7
7
|
)
|
|
8
|
+
from html_to_markdown.preprocessor import create_preprocessor, preprocess_html
|
|
8
9
|
from html_to_markdown.processing import convert_to_markdown, convert_to_markdown_stream
|
|
9
10
|
|
|
10
|
-
# For backward compatibility and to maintain the existing API
|
|
11
11
|
markdownify = convert_to_markdown
|
|
12
12
|
|
|
13
13
|
__all__ = [
|
|
@@ -18,5 +18,7 @@ __all__ = [
|
|
|
18
18
|
"MissingDependencyError",
|
|
19
19
|
"convert_to_markdown",
|
|
20
20
|
"convert_to_markdown_stream",
|
|
21
|
+
"create_preprocessor",
|
|
21
22
|
"markdownify",
|
|
23
|
+
"preprocess_html",
|
|
22
24
|
]
|
html_to_markdown/cli.py
CHANGED
|
@@ -191,7 +191,6 @@ def main(argv: list[str]) -> str:
|
|
|
191
191
|
|
|
192
192
|
args = parser.parse_args(argv)
|
|
193
193
|
|
|
194
|
-
# Prepare base arguments
|
|
195
194
|
base_args = {
|
|
196
195
|
"strip": args.strip,
|
|
197
196
|
"convert": args.convert,
|
|
@@ -216,18 +215,16 @@ def main(argv: list[str]) -> str:
|
|
|
216
215
|
"highlight_style": args.highlight_style,
|
|
217
216
|
}
|
|
218
217
|
|
|
219
|
-
# Add streaming parameters only if streaming is enabled
|
|
220
218
|
if args.stream_processing:
|
|
221
219
|
base_args["stream_processing"] = True
|
|
222
220
|
base_args["chunk_size"] = args.chunk_size
|
|
223
221
|
|
|
224
|
-
# Progress callback for CLI
|
|
225
222
|
if args.show_progress:
|
|
226
223
|
|
|
227
224
|
def progress_callback(processed: int, total: int) -> None:
|
|
228
225
|
if total > 0:
|
|
229
226
|
percent = (processed / total) * 100
|
|
230
|
-
|
|
227
|
+
|
|
231
228
|
sys.stderr.write(f"\rProgress: {percent:.1f}% ({processed}/{total} bytes)")
|
|
232
229
|
sys.stderr.flush()
|
|
233
230
|
|