html-to-markdown 2.2.0__tar.gz → 2.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/Cargo.lock +295 -5
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/Cargo.toml +2 -2
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/PKG-INFO +47 -8
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/README_PYPI.md +46 -7
- html_to_markdown-2.3.0/crates/html-to-markdown/README.md +124 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/crates/html-to-markdown/src/hocr/converter.rs +98 -12
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/crates/html-to-markdown-py/Cargo.toml +0 -6
- html_to_markdown-2.3.0/crates/html-to-markdown-py/README.md +159 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/html_to_markdown/__init__.py +1 -1
- html_to_markdown-2.3.0/html_to_markdown/bin/html-to-markdown +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/html_to_markdown/cli_proxy.py +1 -1
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/pyproject.toml +9 -4
- html_to_markdown-2.2.0/crates/html-to-markdown/README.md +0 -387
- html_to_markdown-2.2.0/crates/html-to-markdown-py/README.md +0 -86
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/LICENSE +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/crates/html-to-markdown/Cargo.toml +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/crates/html-to-markdown/benches/conversion_benchmark.rs +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/crates/html-to-markdown/benches/micro_benchmark.rs +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/crates/html-to-markdown/benches/profiling_benchmark.rs +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/crates/html-to-markdown/examples/basic.rs +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/crates/html-to-markdown/examples/table.rs +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/crates/html-to-markdown/examples/test_escape.rs +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/crates/html-to-markdown/examples/test_inline_formatting.rs +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/crates/html-to-markdown/examples/test_lists.rs +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/crates/html-to-markdown/examples/test_semantic_tags.rs +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/crates/html-to-markdown/examples/test_tables.rs +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/crates/html-to-markdown/examples/test_task_lists.rs +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/crates/html-to-markdown/examples/test_whitespace.rs +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/crates/html-to-markdown/src/converter.rs +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/crates/html-to-markdown/src/error.rs +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/crates/html-to-markdown/src/hocr/extractor.rs +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/crates/html-to-markdown/src/hocr/mod.rs +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/crates/html-to-markdown/src/hocr/parser.rs +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/crates/html-to-markdown/src/hocr/spatial.rs +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/crates/html-to-markdown/src/hocr/types.rs +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/crates/html-to-markdown/src/inline_images.rs +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/crates/html-to-markdown/src/lib.rs +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/crates/html-to-markdown/src/options.rs +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/crates/html-to-markdown/src/sanitizer.rs +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/crates/html-to-markdown/src/text.rs +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/crates/html-to-markdown/src/wrapper.rs +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/crates/html-to-markdown/tests/commonmark_compliance_test.rs +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/crates/html-to-markdown/tests/hocr_compliance_test.rs +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/crates/html-to-markdown/tests/integration_test.rs +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/crates/html-to-markdown-py/python/html_to_markdown/__init__.py +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/crates/html-to-markdown-py/python/html_to_markdown/_html_to_markdown.pyi +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/crates/html-to-markdown-py/src/lib.rs +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/crates/html-to-markdown-py/uv.lock +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/html_to_markdown/__main__.py +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/html_to_markdown/_rust.pyi +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/html_to_markdown/api.py +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/html_to_markdown/cli.py +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/html_to_markdown/exceptions.py +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/html_to_markdown/options.py +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/html_to_markdown/py.typed +0 -0
- {html_to_markdown-2.2.0 → html_to_markdown-2.3.0}/html_to_markdown/v1_compat.py +0 -0
|
@@ -155,6 +155,16 @@ version = "0.3.0"
|
|
|
155
155
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
156
156
|
checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
|
|
157
157
|
|
|
158
|
+
[[package]]
|
|
159
|
+
name = "cc"
|
|
160
|
+
version = "1.2.41"
|
|
161
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
162
|
+
checksum = "ac9fe6cdbb24b6ade63616c0a0688e45bb56732262c158df3c0c4bea4ca47cb7"
|
|
163
|
+
dependencies = [
|
|
164
|
+
"find-msvc-tools",
|
|
165
|
+
"shlex",
|
|
166
|
+
]
|
|
167
|
+
|
|
158
168
|
[[package]]
|
|
159
169
|
name = "cfg-if"
|
|
160
170
|
version = "1.0.3"
|
|
@@ -259,6 +269,25 @@ version = "1.0.4"
|
|
|
259
269
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
260
270
|
checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75"
|
|
261
271
|
|
|
272
|
+
[[package]]
|
|
273
|
+
name = "console_error_panic_hook"
|
|
274
|
+
version = "0.1.7"
|
|
275
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
276
|
+
checksum = "a06aeb73f470f66dcdbf7223caeebb85984942f22f1adb2a088cf9668146bbbc"
|
|
277
|
+
dependencies = [
|
|
278
|
+
"cfg-if",
|
|
279
|
+
"wasm-bindgen",
|
|
280
|
+
]
|
|
281
|
+
|
|
282
|
+
[[package]]
|
|
283
|
+
name = "convert_case"
|
|
284
|
+
version = "0.8.0"
|
|
285
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
286
|
+
checksum = "baaaa0ecca5b51987b9423ccdc971514dd8b0bb7b4060b983d3664dad3f1f89f"
|
|
287
|
+
dependencies = [
|
|
288
|
+
"unicode-segmentation",
|
|
289
|
+
]
|
|
290
|
+
|
|
262
291
|
[[package]]
|
|
263
292
|
name = "crc32fast"
|
|
264
293
|
version = "1.5.0"
|
|
@@ -358,6 +387,28 @@ dependencies = [
|
|
|
358
387
|
"syn",
|
|
359
388
|
]
|
|
360
389
|
|
|
390
|
+
[[package]]
|
|
391
|
+
name = "ctor"
|
|
392
|
+
version = "0.5.0"
|
|
393
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
394
|
+
checksum = "67773048316103656a637612c4a62477603b777d91d9c62ff2290f9cde178fdb"
|
|
395
|
+
dependencies = [
|
|
396
|
+
"ctor-proc-macro",
|
|
397
|
+
"dtor",
|
|
398
|
+
]
|
|
399
|
+
|
|
400
|
+
[[package]]
|
|
401
|
+
name = "ctor-proc-macro"
|
|
402
|
+
version = "0.0.6"
|
|
403
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
404
|
+
checksum = "e2931af7e13dc045d8e9d26afccc6fa115d64e115c9c84b1166288b46f6782c2"
|
|
405
|
+
|
|
406
|
+
[[package]]
|
|
407
|
+
name = "cty"
|
|
408
|
+
version = "0.2.2"
|
|
409
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
410
|
+
checksum = "b365fabc795046672053e29c954733ec3b05e4be654ab130fe8f1f94d7051f35"
|
|
411
|
+
|
|
361
412
|
[[package]]
|
|
362
413
|
name = "difflib"
|
|
363
414
|
version = "0.4.0"
|
|
@@ -396,6 +447,21 @@ dependencies = [
|
|
|
396
447
|
"dtoa",
|
|
397
448
|
]
|
|
398
449
|
|
|
450
|
+
[[package]]
|
|
451
|
+
name = "dtor"
|
|
452
|
+
version = "0.1.0"
|
|
453
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
454
|
+
checksum = "e58a0764cddb55ab28955347b45be00ade43d4d6f3ba4bf3dc354e4ec9432934"
|
|
455
|
+
dependencies = [
|
|
456
|
+
"dtor-proc-macro",
|
|
457
|
+
]
|
|
458
|
+
|
|
459
|
+
[[package]]
|
|
460
|
+
name = "dtor-proc-macro"
|
|
461
|
+
version = "0.0.6"
|
|
462
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
463
|
+
checksum = "f678cf4a922c215c63e0de95eb1ff08a958a81d47e485cf9da1e27bf6305cfa5"
|
|
464
|
+
|
|
399
465
|
[[package]]
|
|
400
466
|
name = "either"
|
|
401
467
|
version = "1.15.0"
|
|
@@ -436,6 +502,12 @@ dependencies = [
|
|
|
436
502
|
"simd-adler32",
|
|
437
503
|
]
|
|
438
504
|
|
|
505
|
+
[[package]]
|
|
506
|
+
name = "find-msvc-tools"
|
|
507
|
+
version = "0.1.4"
|
|
508
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
509
|
+
checksum = "52051878f80a721bb68ebfbc930e07b65ba72f2da88968ea5c06fd6ca3d3a127"
|
|
510
|
+
|
|
439
511
|
[[package]]
|
|
440
512
|
name = "flate2"
|
|
441
513
|
version = "1.1.4"
|
|
@@ -474,6 +546,19 @@ dependencies = [
|
|
|
474
546
|
"new_debug_unreachable",
|
|
475
547
|
]
|
|
476
548
|
|
|
549
|
+
[[package]]
|
|
550
|
+
name = "getrandom"
|
|
551
|
+
version = "0.2.16"
|
|
552
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
553
|
+
checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592"
|
|
554
|
+
dependencies = [
|
|
555
|
+
"cfg-if",
|
|
556
|
+
"js-sys",
|
|
557
|
+
"libc",
|
|
558
|
+
"wasi 0.11.1+wasi-snapshot-preview1",
|
|
559
|
+
"wasm-bindgen",
|
|
560
|
+
]
|
|
561
|
+
|
|
477
562
|
[[package]]
|
|
478
563
|
name = "getrandom"
|
|
479
564
|
version = "0.3.3"
|
|
@@ -483,7 +568,7 @@ dependencies = [
|
|
|
483
568
|
"cfg-if",
|
|
484
569
|
"libc",
|
|
485
570
|
"r-efi",
|
|
486
|
-
"wasi",
|
|
571
|
+
"wasi 0.14.7+wasi-0.2.4",
|
|
487
572
|
]
|
|
488
573
|
|
|
489
574
|
[[package]]
|
|
@@ -530,7 +615,7 @@ dependencies = [
|
|
|
530
615
|
|
|
531
616
|
[[package]]
|
|
532
617
|
name = "html-to-markdown-cli"
|
|
533
|
-
version = "2.2.0"
|
|
618
|
+
version = "2.3.0"
|
|
534
619
|
dependencies = [
|
|
535
620
|
"assert_cmd",
|
|
536
621
|
"clap",
|
|
@@ -542,9 +627,20 @@ dependencies = [
|
|
|
542
627
|
"tempfile",
|
|
543
628
|
]
|
|
544
629
|
|
|
630
|
+
[[package]]
|
|
631
|
+
name = "html-to-markdown-node"
|
|
632
|
+
version = "2.3.0"
|
|
633
|
+
dependencies = [
|
|
634
|
+
"html-to-markdown-rs",
|
|
635
|
+
"mimalloc-rust",
|
|
636
|
+
"napi",
|
|
637
|
+
"napi-build",
|
|
638
|
+
"napi-derive",
|
|
639
|
+
]
|
|
640
|
+
|
|
545
641
|
[[package]]
|
|
546
642
|
name = "html-to-markdown-py"
|
|
547
|
-
version = "2.2.0"
|
|
643
|
+
version = "2.3.0"
|
|
548
644
|
dependencies = [
|
|
549
645
|
"base64",
|
|
550
646
|
"html-to-markdown-rs",
|
|
@@ -554,7 +650,7 @@ dependencies = [
|
|
|
554
650
|
|
|
555
651
|
[[package]]
|
|
556
652
|
name = "html-to-markdown-rs"
|
|
557
|
-
version = "2.2.0"
|
|
653
|
+
version = "2.3.0"
|
|
558
654
|
dependencies = [
|
|
559
655
|
"ammonia",
|
|
560
656
|
"base64",
|
|
@@ -569,6 +665,20 @@ dependencies = [
|
|
|
569
665
|
"tl",
|
|
570
666
|
]
|
|
571
667
|
|
|
668
|
+
[[package]]
|
|
669
|
+
name = "html-to-markdown-wasm"
|
|
670
|
+
version = "2.3.0"
|
|
671
|
+
dependencies = [
|
|
672
|
+
"console_error_panic_hook",
|
|
673
|
+
"getrandom 0.2.16",
|
|
674
|
+
"html-to-markdown-rs",
|
|
675
|
+
"js-sys",
|
|
676
|
+
"serde",
|
|
677
|
+
"serde-wasm-bindgen",
|
|
678
|
+
"wasm-bindgen",
|
|
679
|
+
"wasm-bindgen-test",
|
|
680
|
+
]
|
|
681
|
+
|
|
572
682
|
[[package]]
|
|
573
683
|
name = "html5ever"
|
|
574
684
|
version = "0.35.0"
|
|
@@ -769,6 +879,16 @@ version = "0.2.177"
|
|
|
769
879
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
770
880
|
checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976"
|
|
771
881
|
|
|
882
|
+
[[package]]
|
|
883
|
+
name = "libloading"
|
|
884
|
+
version = "0.8.9"
|
|
885
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
886
|
+
checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55"
|
|
887
|
+
dependencies = [
|
|
888
|
+
"cfg-if",
|
|
889
|
+
"windows-link",
|
|
890
|
+
]
|
|
891
|
+
|
|
772
892
|
[[package]]
|
|
773
893
|
name = "linux-raw-sys"
|
|
774
894
|
version = "0.11.0"
|
|
@@ -845,6 +965,36 @@ dependencies = [
|
|
|
845
965
|
"autocfg",
|
|
846
966
|
]
|
|
847
967
|
|
|
968
|
+
[[package]]
|
|
969
|
+
name = "mimalloc-rust"
|
|
970
|
+
version = "0.2.1"
|
|
971
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
972
|
+
checksum = "5eb726c8298efb4010b2c46d8050e4be36cf807b9d9e98cb112f830914fc9bbe"
|
|
973
|
+
dependencies = [
|
|
974
|
+
"cty",
|
|
975
|
+
"mimalloc-rust-sys",
|
|
976
|
+
]
|
|
977
|
+
|
|
978
|
+
[[package]]
|
|
979
|
+
name = "mimalloc-rust-sys"
|
|
980
|
+
version = "1.7.9-source"
|
|
981
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
982
|
+
checksum = "6413e13241a9809f291568133eca6694572cf528c1a6175502d090adce5dd5db"
|
|
983
|
+
dependencies = [
|
|
984
|
+
"cc",
|
|
985
|
+
"cty",
|
|
986
|
+
]
|
|
987
|
+
|
|
988
|
+
[[package]]
|
|
989
|
+
name = "minicov"
|
|
990
|
+
version = "0.3.7"
|
|
991
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
992
|
+
checksum = "f27fe9f1cc3c22e1687f9446c2083c4c5fc7f0bcf1c7a86bdbded14985895b4b"
|
|
993
|
+
dependencies = [
|
|
994
|
+
"cc",
|
|
995
|
+
"walkdir",
|
|
996
|
+
]
|
|
997
|
+
|
|
848
998
|
[[package]]
|
|
849
999
|
name = "miniz_oxide"
|
|
850
1000
|
version = "0.8.9"
|
|
@@ -865,12 +1015,74 @@ dependencies = [
|
|
|
865
1015
|
"pxfm",
|
|
866
1016
|
]
|
|
867
1017
|
|
|
1018
|
+
[[package]]
|
|
1019
|
+
name = "napi"
|
|
1020
|
+
version = "3.3.0"
|
|
1021
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1022
|
+
checksum = "f1b74e3dce5230795bb4d2821b941706dee733c7308752507254b0497f39cad7"
|
|
1023
|
+
dependencies = [
|
|
1024
|
+
"bitflags",
|
|
1025
|
+
"ctor",
|
|
1026
|
+
"napi-build",
|
|
1027
|
+
"napi-sys",
|
|
1028
|
+
"nohash-hasher",
|
|
1029
|
+
"rustc-hash",
|
|
1030
|
+
]
|
|
1031
|
+
|
|
1032
|
+
[[package]]
|
|
1033
|
+
name = "napi-build"
|
|
1034
|
+
version = "2.2.3"
|
|
1035
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1036
|
+
checksum = "dcae8ad5609d14afb3a3b91dee88c757016261b151e9dcecabf1b2a31a6cab14"
|
|
1037
|
+
|
|
1038
|
+
[[package]]
|
|
1039
|
+
name = "napi-derive"
|
|
1040
|
+
version = "3.2.5"
|
|
1041
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1042
|
+
checksum = "7552d5a579b834614bbd496db5109f1b9f1c758f08224b0dee1e408333adf0d0"
|
|
1043
|
+
dependencies = [
|
|
1044
|
+
"convert_case",
|
|
1045
|
+
"ctor",
|
|
1046
|
+
"napi-derive-backend",
|
|
1047
|
+
"proc-macro2",
|
|
1048
|
+
"quote",
|
|
1049
|
+
"syn",
|
|
1050
|
+
]
|
|
1051
|
+
|
|
1052
|
+
[[package]]
|
|
1053
|
+
name = "napi-derive-backend"
|
|
1054
|
+
version = "2.2.0"
|
|
1055
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1056
|
+
checksum = "5f6a81ac7486b70f2532a289603340862c06eea5a1e650c1ffeda2ce1238516a"
|
|
1057
|
+
dependencies = [
|
|
1058
|
+
"convert_case",
|
|
1059
|
+
"proc-macro2",
|
|
1060
|
+
"quote",
|
|
1061
|
+
"semver",
|
|
1062
|
+
"syn",
|
|
1063
|
+
]
|
|
1064
|
+
|
|
1065
|
+
[[package]]
|
|
1066
|
+
name = "napi-sys"
|
|
1067
|
+
version = "3.0.0"
|
|
1068
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1069
|
+
checksum = "3e4e7135a8f97aa0f1509cce21a8a1f9dcec1b50d8dee006b48a5adb69a9d64d"
|
|
1070
|
+
dependencies = [
|
|
1071
|
+
"libloading",
|
|
1072
|
+
]
|
|
1073
|
+
|
|
868
1074
|
[[package]]
|
|
869
1075
|
name = "new_debug_unreachable"
|
|
870
1076
|
version = "1.0.6"
|
|
871
1077
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
872
1078
|
checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086"
|
|
873
1079
|
|
|
1080
|
+
[[package]]
|
|
1081
|
+
name = "nohash-hasher"
|
|
1082
|
+
version = "0.2.0"
|
|
1083
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1084
|
+
checksum = "2bf50223579dc7cdcfb3bfcacf7069ff68243f8c363f62ffa99cf000a6b9c451"
|
|
1085
|
+
|
|
874
1086
|
[[package]]
|
|
875
1087
|
name = "normalize-line-endings"
|
|
876
1088
|
version = "0.3.0"
|
|
@@ -1256,6 +1468,12 @@ version = "0.2.2"
|
|
|
1256
1468
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1257
1469
|
checksum = "88f8660c1ff60292143c98d08fc6e2f654d722db50410e3f3797d40baaf9d8f3"
|
|
1258
1470
|
|
|
1471
|
+
[[package]]
|
|
1472
|
+
name = "rustc-hash"
|
|
1473
|
+
version = "2.1.1"
|
|
1474
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1475
|
+
checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d"
|
|
1476
|
+
|
|
1259
1477
|
[[package]]
|
|
1260
1478
|
name = "rustix"
|
|
1261
1479
|
version = "1.1.2"
|
|
@@ -1296,6 +1514,12 @@ version = "1.2.0"
|
|
|
1296
1514
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1297
1515
|
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
|
|
1298
1516
|
|
|
1517
|
+
[[package]]
|
|
1518
|
+
name = "semver"
|
|
1519
|
+
version = "1.0.27"
|
|
1520
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1521
|
+
checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2"
|
|
1522
|
+
|
|
1299
1523
|
[[package]]
|
|
1300
1524
|
name = "serde"
|
|
1301
1525
|
version = "1.0.228"
|
|
@@ -1306,6 +1530,17 @@ dependencies = [
|
|
|
1306
1530
|
"serde_derive",
|
|
1307
1531
|
]
|
|
1308
1532
|
|
|
1533
|
+
[[package]]
|
|
1534
|
+
name = "serde-wasm-bindgen"
|
|
1535
|
+
version = "0.6.5"
|
|
1536
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1537
|
+
checksum = "8302e169f0eddcc139c70f139d19d6467353af16f9fce27e8c30158036a1e16b"
|
|
1538
|
+
dependencies = [
|
|
1539
|
+
"js-sys",
|
|
1540
|
+
"serde",
|
|
1541
|
+
"wasm-bindgen",
|
|
1542
|
+
]
|
|
1543
|
+
|
|
1309
1544
|
[[package]]
|
|
1310
1545
|
name = "serde_core"
|
|
1311
1546
|
version = "1.0.228"
|
|
@@ -1339,6 +1574,12 @@ dependencies = [
|
|
|
1339
1574
|
"serde_core",
|
|
1340
1575
|
]
|
|
1341
1576
|
|
|
1577
|
+
[[package]]
|
|
1578
|
+
name = "shlex"
|
|
1579
|
+
version = "1.3.0"
|
|
1580
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1581
|
+
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
|
|
1582
|
+
|
|
1342
1583
|
[[package]]
|
|
1343
1584
|
name = "simd-adler32"
|
|
1344
1585
|
version = "0.3.7"
|
|
@@ -1429,7 +1670,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
|
1429
1670
|
checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16"
|
|
1430
1671
|
dependencies = [
|
|
1431
1672
|
"fastrand",
|
|
1432
|
-
"getrandom",
|
|
1673
|
+
"getrandom 0.3.3",
|
|
1433
1674
|
"once_cell",
|
|
1434
1675
|
"rustix",
|
|
1435
1676
|
"windows-sys 0.61.2",
|
|
@@ -1504,6 +1745,12 @@ version = "1.0.19"
|
|
|
1504
1745
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1505
1746
|
checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d"
|
|
1506
1747
|
|
|
1748
|
+
[[package]]
|
|
1749
|
+
name = "unicode-segmentation"
|
|
1750
|
+
version = "1.12.0"
|
|
1751
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1752
|
+
checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493"
|
|
1753
|
+
|
|
1507
1754
|
[[package]]
|
|
1508
1755
|
name = "unindent"
|
|
1509
1756
|
version = "0.2.4"
|
|
@@ -1565,6 +1812,12 @@ dependencies = [
|
|
|
1565
1812
|
"winapi-util",
|
|
1566
1813
|
]
|
|
1567
1814
|
|
|
1815
|
+
[[package]]
|
|
1816
|
+
name = "wasi"
|
|
1817
|
+
version = "0.11.1+wasi-snapshot-preview1"
|
|
1818
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1819
|
+
checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
|
|
1820
|
+
|
|
1568
1821
|
[[package]]
|
|
1569
1822
|
name = "wasi"
|
|
1570
1823
|
version = "0.14.7+wasi-0.2.4"
|
|
@@ -1610,6 +1863,19 @@ dependencies = [
|
|
|
1610
1863
|
"wasm-bindgen-shared",
|
|
1611
1864
|
]
|
|
1612
1865
|
|
|
1866
|
+
[[package]]
|
|
1867
|
+
name = "wasm-bindgen-futures"
|
|
1868
|
+
version = "0.4.54"
|
|
1869
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1870
|
+
checksum = "7e038d41e478cc73bae0ff9b36c60cff1c98b8f38f8d7e8061e79ee63608ac5c"
|
|
1871
|
+
dependencies = [
|
|
1872
|
+
"cfg-if",
|
|
1873
|
+
"js-sys",
|
|
1874
|
+
"once_cell",
|
|
1875
|
+
"wasm-bindgen",
|
|
1876
|
+
"web-sys",
|
|
1877
|
+
]
|
|
1878
|
+
|
|
1613
1879
|
[[package]]
|
|
1614
1880
|
name = "wasm-bindgen-macro"
|
|
1615
1881
|
version = "0.2.104"
|
|
@@ -1642,6 +1908,30 @@ dependencies = [
|
|
|
1642
1908
|
"unicode-ident",
|
|
1643
1909
|
]
|
|
1644
1910
|
|
|
1911
|
+
[[package]]
|
|
1912
|
+
name = "wasm-bindgen-test"
|
|
1913
|
+
version = "0.3.54"
|
|
1914
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1915
|
+
checksum = "4e381134e148c1062f965a42ed1f5ee933eef2927c3f70d1812158f711d39865"
|
|
1916
|
+
dependencies = [
|
|
1917
|
+
"js-sys",
|
|
1918
|
+
"minicov",
|
|
1919
|
+
"wasm-bindgen",
|
|
1920
|
+
"wasm-bindgen-futures",
|
|
1921
|
+
"wasm-bindgen-test-macro",
|
|
1922
|
+
]
|
|
1923
|
+
|
|
1924
|
+
[[package]]
|
|
1925
|
+
name = "wasm-bindgen-test-macro"
|
|
1926
|
+
version = "0.3.54"
|
|
1927
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1928
|
+
checksum = "b673bca3298fe582aeef8352330ecbad91849f85090805582400850f8270a2e8"
|
|
1929
|
+
dependencies = [
|
|
1930
|
+
"proc-macro2",
|
|
1931
|
+
"quote",
|
|
1932
|
+
"syn",
|
|
1933
|
+
]
|
|
1934
|
+
|
|
1645
1935
|
[[package]]
|
|
1646
1936
|
name = "web-sys"
|
|
1647
1937
|
version = "0.3.81"
|
|
@@ -3,7 +3,7 @@ resolver = "2"
|
|
|
3
3
|
members = ["crates/html-to-markdown-py"]
|
|
4
4
|
|
|
5
5
|
[workspace.package]
|
|
6
|
-
version = "2.2.0"
|
|
6
|
+
version = "2.3.0"
|
|
7
7
|
edition = "2021"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
9
9
|
license = "MIT"
|
|
@@ -15,7 +15,7 @@ rust-version = "1.80"
|
|
|
15
15
|
|
|
16
16
|
[workspace.dependencies]
|
|
17
17
|
# Core library
|
|
18
|
-
html-to-markdown-rs = { version = "2.2.0", path = "crates/html-to-markdown" }
|
|
18
|
+
html-to-markdown-rs = { version = "2.3.0", path = "crates/html-to-markdown" }
|
|
19
19
|
|
|
20
20
|
# HTML parsing and sanitization
|
|
21
21
|
tl = "0.7"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 2.2.0
|
|
3
|
+
Version: 2.3.0
|
|
4
4
|
Classifier: Development Status :: 5 - Production/Stable
|
|
5
5
|
Classifier: Environment :: Console
|
|
6
6
|
Classifier: Intended Audience :: Developers
|
|
@@ -52,11 +52,11 @@ Apple M4 • Real Wikipedia documents • `convert()` (Python)
|
|
|
52
52
|
|
|
53
53
|
| Document | Size | Latency | Throughput | Docs/sec |
|
|
54
54
|
| ------------------- | ----- | ------- | ---------- | -------- |
|
|
55
|
-
| Lists (Timeline) | 129KB | 0.62ms | 208
|
|
56
|
-
| Tables (Countries) | 360KB | 2.02ms | 178
|
|
57
|
-
| Mixed (Python wiki) | 656KB | 4.56ms | 144
|
|
55
|
+
| Lists (Timeline) | 129KB | 0.62ms | 208 MB/s | 1,613 |
|
|
56
|
+
| Tables (Countries) | 360KB | 2.02ms | 178 MB/s | 495 |
|
|
57
|
+
| Mixed (Python wiki) | 656KB | 4.56ms | 144 MB/s | 219 |
|
|
58
58
|
|
|
59
|
-
> V1 averaged ~2.5
|
|
59
|
+
> V1 averaged ~2.5 MB/s (Python/BeautifulSoup). V2's Rust engine delivers 60–80× higher throughput.
|
|
60
60
|
|
|
61
61
|
## Quick Start
|
|
62
62
|
|
|
@@ -173,11 +173,50 @@ Key fields (see docstring for full matrix):
|
|
|
173
173
|
- `capture_svg`: collect inline `<svg>` (default `True`)
|
|
174
174
|
- `infer_dimensions`: decode raster images to obtain dimensions (default `False`)
|
|
175
175
|
|
|
176
|
+
## Performance: V2 vs V1 Compatibility Layer
|
|
177
|
+
|
|
178
|
+
### ⚠️ Important: Always Use V2 API
|
|
179
|
+
|
|
180
|
+
The v2 API (`convert()`) is **strongly recommended** for all code. The v1 compatibility layer adds significant overhead and should only be used for gradual migration:
|
|
181
|
+
|
|
182
|
+
```python
|
|
183
|
+
# ✅ RECOMMENDED - V2 Direct API (Fast)
|
|
184
|
+
from html_to_markdown import convert, ConversionOptions
|
|
185
|
+
|
|
186
|
+
markdown = convert(html) # Simple conversion - FAST
|
|
187
|
+
markdown = convert(html, ConversionOptions(heading_style="atx")) # With options - FAST
|
|
188
|
+
|
|
189
|
+
# ❌ AVOID - V1 Compatibility Layer (Slow)
|
|
190
|
+
from html_to_markdown import convert_to_markdown
|
|
191
|
+
|
|
192
|
+
markdown = convert_to_markdown(html, heading_style="atx") # Adds 77% overhead
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
### Performance Comparison
|
|
196
|
+
|
|
197
|
+
Benchmarked on Apple M4 with 25-paragraph HTML document:
|
|
198
|
+
|
|
199
|
+
| API | ops/sec | Relative Performance | Recommendation |
|
|
200
|
+
| ------------------------ | ---------------- | -------------------- | ------------------- |
|
|
201
|
+
| **V2 API** (`convert()`) | **129,822** | baseline | ✅ **Use this** |
|
|
202
|
+
| **V1 Compat Layer** | **67,673** | **77% slower** | ⚠️ Migration only |
|
|
203
|
+
| **CLI** | **150-210 MB/s** | Fastest | ✅ Batch processing |
|
|
204
|
+
|
|
205
|
+
The v1 compatibility layer creates extra Python objects and performs additional conversions, significantly impacting performance.
|
|
206
|
+
|
|
207
|
+
### When to Use Each
|
|
208
|
+
|
|
209
|
+
- **V2 API (`convert()`)**: All new code, production systems, performance-critical applications ← **Use this**
|
|
210
|
+
- **V1 Compat (`convert_to_markdown()`)**: Only for gradual migration from legacy codebases
|
|
211
|
+
- **CLI (`html-to-markdown`)**: Batch processing, shell scripts, maximum throughput
|
|
212
|
+
|
|
176
213
|
## v1 Compatibility
|
|
177
214
|
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
- **
|
|
215
|
+
A compatibility layer is provided to ease migration from v1.x:
|
|
216
|
+
|
|
217
|
+
- **Compat shim**: `html_to_markdown.v1_compat` exposes `convert_to_markdown`, `convert_to_markdown_stream`, and `markdownify`. Keyword mappings are listed in the [changelog](CHANGELOG.md#v200).
|
|
218
|
+
- **⚠️ Performance warning**: These compatibility functions add 77% overhead. Migrate to v2 API as soon as possible.
|
|
219
|
+
- **CLI**: The Rust CLI replaces the old Python script. New flags are documented via `html-to-markdown --help`.
|
|
181
220
|
- **Removed options**: `code_language_callback`, `strip`, and streaming APIs were removed; use `ConversionOptions`, `PreprocessingOptions`, and the inline-image helpers instead.
|
|
182
221
|
|
|
183
222
|
## Links
|
|
@@ -19,11 +19,11 @@ Apple M4 • Real Wikipedia documents • `convert()` (Python)
|
|
|
19
19
|
|
|
20
20
|
| Document | Size | Latency | Throughput | Docs/sec |
|
|
21
21
|
| ------------------- | ----- | ------- | ---------- | -------- |
|
|
22
|
-
| Lists (Timeline) | 129KB | 0.62ms | 208
|
|
23
|
-
| Tables (Countries) | 360KB | 2.02ms | 178
|
|
24
|
-
| Mixed (Python wiki) | 656KB | 4.56ms | 144
|
|
22
|
+
| Lists (Timeline) | 129KB | 0.62ms | 208 MB/s | 1,613 |
|
|
23
|
+
| Tables (Countries) | 360KB | 2.02ms | 178 MB/s | 495 |
|
|
24
|
+
| Mixed (Python wiki) | 656KB | 4.56ms | 144 MB/s | 219 |
|
|
25
25
|
|
|
26
|
-
> V1 averaged ~2.5
|
|
26
|
+
> V1 averaged ~2.5 MB/s (Python/BeautifulSoup). V2's Rust engine delivers 60–80× higher throughput.
|
|
27
27
|
|
|
28
28
|
## Quick Start
|
|
29
29
|
|
|
@@ -140,11 +140,50 @@ Key fields (see docstring for full matrix):
|
|
|
140
140
|
- `capture_svg`: collect inline `<svg>` (default `True`)
|
|
141
141
|
- `infer_dimensions`: decode raster images to obtain dimensions (default `False`)
|
|
142
142
|
|
|
143
|
+
## Performance: V2 vs V1 Compatibility Layer
|
|
144
|
+
|
|
145
|
+
### ⚠️ Important: Always Use V2 API
|
|
146
|
+
|
|
147
|
+
The v2 API (`convert()`) is **strongly recommended** for all code. The v1 compatibility layer adds significant overhead and should only be used for gradual migration:
|
|
148
|
+
|
|
149
|
+
```python
|
|
150
|
+
# ✅ RECOMMENDED - V2 Direct API (Fast)
|
|
151
|
+
from html_to_markdown import convert, ConversionOptions
|
|
152
|
+
|
|
153
|
+
markdown = convert(html) # Simple conversion - FAST
|
|
154
|
+
markdown = convert(html, ConversionOptions(heading_style="atx")) # With options - FAST
|
|
155
|
+
|
|
156
|
+
# ❌ AVOID - V1 Compatibility Layer (Slow)
|
|
157
|
+
from html_to_markdown import convert_to_markdown
|
|
158
|
+
|
|
159
|
+
markdown = convert_to_markdown(html, heading_style="atx") # Adds 77% overhead
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
### Performance Comparison
|
|
163
|
+
|
|
164
|
+
Benchmarked on Apple M4 with 25-paragraph HTML document:
|
|
165
|
+
|
|
166
|
+
| API | ops/sec | Relative Performance | Recommendation |
|
|
167
|
+
| ------------------------ | ---------------- | -------------------- | ------------------- |
|
|
168
|
+
| **V2 API** (`convert()`) | **129,822** | baseline | ✅ **Use this** |
|
|
169
|
+
| **V1 Compat Layer** | **67,673** | **77% slower** | ⚠️ Migration only |
|
|
170
|
+
| **CLI** | **150-210 MB/s** | Fastest | ✅ Batch processing |
|
|
171
|
+
|
|
172
|
+
The v1 compatibility layer creates extra Python objects and performs additional conversions, significantly impacting performance.
|
|
173
|
+
|
|
174
|
+
### When to Use Each
|
|
175
|
+
|
|
176
|
+
- **V2 API (`convert()`)**: All new code, production systems, performance-critical applications ← **Use this**
|
|
177
|
+
- **V1 Compat (`convert_to_markdown()`)**: Only for gradual migration from legacy codebases
|
|
178
|
+
- **CLI (`html-to-markdown`)**: Batch processing, shell scripts, maximum throughput
|
|
179
|
+
|
|
143
180
|
## v1 Compatibility
|
|
144
181
|
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
- **
|
|
182
|
+
A compatibility layer is provided to ease migration from v1.x:
|
|
183
|
+
|
|
184
|
+
- **Compat shim**: `html_to_markdown.v1_compat` exposes `convert_to_markdown`, `convert_to_markdown_stream`, and `markdownify`. Keyword mappings are listed in the [changelog](CHANGELOG.md#v200).
|
|
185
|
+
- **⚠️ Performance warning**: These compatibility functions add 77% overhead. Migrate to v2 API as soon as possible.
|
|
186
|
+
- **CLI**: The Rust CLI replaces the old Python script. New flags are documented via `html-to-markdown --help`.
|
|
148
187
|
- **Removed options**: `code_language_callback`, `strip`, and streaming APIs were removed; use `ConversionOptions`, `PreprocessingOptions`, and the inline-image helpers instead.
|
|
149
188
|
|
|
150
189
|
## Links
|