html-to-markdown 2.3.4__tar.gz → 2.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

Files changed (54) hide show
  1. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/Cargo.lock +55 -171
  2. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/Cargo.toml +3 -3
  3. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/PKG-INFO +1 -1
  4. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown/Cargo.toml +1 -1
  5. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/converter.rs +177 -1
  6. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/hocr/converter.rs +41 -5
  7. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/html_to_markdown/__init__.py +1 -1
  8. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/html_to_markdown/bin/html-to-markdown +0 -0
  9. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/pyproject.toml +4 -4
  10. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/LICENSE +0 -0
  11. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/README_PYPI.md +0 -0
  12. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown/README.md +0 -0
  13. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown/benches/conversion_benchmark.rs +0 -0
  14. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown/benches/micro_benchmark.rs +0 -0
  15. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown/benches/profiling_benchmark.rs +0 -0
  16. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown/examples/basic.rs +0 -0
  17. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown/examples/table.rs +0 -0
  18. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown/examples/test_escape.rs +0 -0
  19. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown/examples/test_inline_formatting.rs +0 -0
  20. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown/examples/test_lists.rs +0 -0
  21. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown/examples/test_semantic_tags.rs +0 -0
  22. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown/examples/test_tables.rs +0 -0
  23. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown/examples/test_task_lists.rs +0 -0
  24. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown/examples/test_whitespace.rs +0 -0
  25. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/error.rs +0 -0
  26. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/hocr/extractor.rs +0 -0
  27. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/hocr/mod.rs +0 -0
  28. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/hocr/parser.rs +0 -0
  29. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/hocr/spatial.rs +0 -0
  30. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/hocr/types.rs +0 -0
  31. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/inline_images.rs +0 -0
  32. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/lib.rs +0 -0
  33. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/options.rs +0 -0
  34. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/sanitizer.rs +0 -0
  35. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/text.rs +0 -0
  36. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/wrapper.rs +0 -0
  37. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown/tests/commonmark_compliance_test.rs +0 -0
  38. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown/tests/hocr_compliance_test.rs +0 -0
  39. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown/tests/integration_test.rs +0 -0
  40. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown-py/Cargo.toml +0 -0
  41. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown-py/README.md +0 -0
  42. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown-py/python/html_to_markdown/__init__.py +0 -0
  43. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown-py/python/html_to_markdown/_html_to_markdown.pyi +0 -0
  44. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown-py/src/lib.rs +0 -0
  45. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/crates/html-to-markdown-py/uv.lock +0 -0
  46. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/html_to_markdown/__main__.py +0 -0
  47. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/html_to_markdown/_rust.pyi +0 -0
  48. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/html_to_markdown/api.py +0 -0
  49. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/html_to_markdown/cli.py +0 -0
  50. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/html_to_markdown/cli_proxy.py +0 -0
  51. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/html_to_markdown/exceptions.py +0 -0
  52. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/html_to_markdown/options.py +0 -0
  53. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/html_to_markdown/py.typed +0 -0
  54. {html_to_markdown-2.3.4 → html_to_markdown-2.4.1}/html_to_markdown/v1_compat.py +0 -0
@@ -116,9 +116,9 @@ checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
116
116
 
117
117
  [[package]]
118
118
  name = "bitflags"
119
- version = "2.9.4"
119
+ version = "2.10.0"
120
120
  source = "registry+https://github.com/rust-lang/crates.io-index"
121
- checksum = "2261d10cca569e4643e526d8dc2e62e433cc8aba21ab764233731f8d369bf394"
121
+ checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3"
122
122
 
123
123
  [[package]]
124
124
  name = "bstr"
@@ -167,9 +167,9 @@ dependencies = [
167
167
 
168
168
  [[package]]
169
169
  name = "cfg-if"
170
- version = "1.0.3"
170
+ version = "1.0.4"
171
171
  source = "registry+https://github.com/rust-lang/crates.io-index"
172
- checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9"
172
+ checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
173
173
 
174
174
  [[package]]
175
175
  name = "ciborium"
@@ -200,9 +200,9 @@ dependencies = [
200
200
 
201
201
  [[package]]
202
202
  name = "clap"
203
- version = "4.5.49"
203
+ version = "4.5.50"
204
204
  source = "registry+https://github.com/rust-lang/crates.io-index"
205
- checksum = "f4512b90fa68d3a9932cea5184017c5d200f5921df706d45e853537dea51508f"
205
+ checksum = "0c2cfd7bf8a6017ddaa4e32ffe7403d547790db06bd171c1c53926faab501623"
206
206
  dependencies = [
207
207
  "clap_builder",
208
208
  "clap_derive",
@@ -210,9 +210,9 @@ dependencies = [
210
210
 
211
211
  [[package]]
212
212
  name = "clap_builder"
213
- version = "4.5.49"
213
+ version = "4.5.50"
214
214
  source = "registry+https://github.com/rust-lang/crates.io-index"
215
- checksum = "0025e98baa12e766c67ba13ff4695a887a1eba19569aad00a472546795bd6730"
215
+ checksum = "0a4c05b9e80c5ccd3a7ef080ad7b6ba7d6fc00a985b8b157197075677c82c7a0"
216
216
  dependencies = [
217
217
  "anstream",
218
218
  "anstyle",
@@ -299,25 +299,22 @@ dependencies = [
299
299
 
300
300
  [[package]]
301
301
  name = "criterion"
302
- version = "0.5.1"
302
+ version = "0.7.0"
303
303
  source = "registry+https://github.com/rust-lang/crates.io-index"
304
- checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f"
304
+ checksum = "e1c047a62b0cc3e145fa84415a3191f628e980b194c2755aa12300a4e6cbd928"
305
305
  dependencies = [
306
306
  "anes",
307
307
  "cast",
308
308
  "ciborium",
309
309
  "clap",
310
310
  "criterion-plot",
311
- "is-terminal",
312
311
  "itertools",
313
312
  "num-traits",
314
- "once_cell",
315
313
  "oorandom",
316
314
  "plotters",
317
315
  "rayon",
318
316
  "regex",
319
317
  "serde",
320
- "serde_derive",
321
318
  "serde_json",
322
319
  "tinytemplate",
323
320
  "walkdir",
@@ -325,9 +322,9 @@ dependencies = [
325
322
 
326
323
  [[package]]
327
324
  name = "criterion-plot"
328
- version = "0.5.0"
325
+ version = "0.6.0"
329
326
  source = "registry+https://github.com/rust-lang/crates.io-index"
330
- checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1"
327
+ checksum = "9b1bcc0dc7dfae599d84ad0b1a55f80cde8af3725da8313b528da95ef783e338"
331
328
  dependencies = [
332
329
  "cast",
333
330
  "itertools",
@@ -548,27 +545,16 @@ dependencies = [
548
545
 
549
546
  [[package]]
550
547
  name = "getrandom"
551
- version = "0.2.16"
548
+ version = "0.3.4"
552
549
  source = "registry+https://github.com/rust-lang/crates.io-index"
553
- checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592"
550
+ checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
554
551
  dependencies = [
555
552
  "cfg-if",
556
553
  "js-sys",
557
554
  "libc",
558
- "wasi 0.11.1+wasi-snapshot-preview1",
559
- "wasm-bindgen",
560
- ]
561
-
562
- [[package]]
563
- name = "getrandom"
564
- version = "0.3.3"
565
- source = "registry+https://github.com/rust-lang/crates.io-index"
566
- checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4"
567
- dependencies = [
568
- "cfg-if",
569
- "libc",
570
555
  "r-efi",
571
- "wasi 0.14.7+wasi-0.2.4",
556
+ "wasip2",
557
+ "wasm-bindgen",
572
558
  ]
573
559
 
574
560
  [[package]]
@@ -598,12 +584,6 @@ version = "0.5.0"
598
584
  source = "registry+https://github.com/rust-lang/crates.io-index"
599
585
  checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
600
586
 
601
- [[package]]
602
- name = "hermit-abi"
603
- version = "0.5.2"
604
- source = "registry+https://github.com/rust-lang/crates.io-index"
605
- checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c"
606
-
607
587
  [[package]]
608
588
  name = "html-escape"
609
589
  version = "0.2.13"
@@ -615,7 +595,7 @@ dependencies = [
615
595
 
616
596
  [[package]]
617
597
  name = "html-to-markdown-cli"
618
- version = "2.3.4"
598
+ version = "2.4.1"
619
599
  dependencies = [
620
600
  "assert_cmd",
621
601
  "clap",
@@ -629,7 +609,7 @@ dependencies = [
629
609
 
630
610
  [[package]]
631
611
  name = "html-to-markdown-node"
632
- version = "2.3.4"
612
+ version = "2.4.1"
633
613
  dependencies = [
634
614
  "html-to-markdown-rs",
635
615
  "mimalloc-rust",
@@ -640,7 +620,7 @@ dependencies = [
640
620
 
641
621
  [[package]]
642
622
  name = "html-to-markdown-py"
643
- version = "2.3.4"
623
+ version = "2.4.1"
644
624
  dependencies = [
645
625
  "base64",
646
626
  "html-to-markdown-rs",
@@ -650,7 +630,7 @@ dependencies = [
650
630
 
651
631
  [[package]]
652
632
  name = "html-to-markdown-rs"
653
- version = "2.3.4"
633
+ version = "2.4.1"
654
634
  dependencies = [
655
635
  "ammonia",
656
636
  "base64",
@@ -667,10 +647,10 @@ dependencies = [
667
647
 
668
648
  [[package]]
669
649
  name = "html-to-markdown-wasm"
670
- version = "2.3.4"
650
+ version = "2.4.1"
671
651
  dependencies = [
672
652
  "console_error_panic_hook",
673
- "getrandom 0.2.16",
653
+ "getrandom",
674
654
  "html-to-markdown-rs",
675
655
  "js-sys",
676
656
  "serde",
@@ -827,32 +807,24 @@ dependencies = [
827
807
 
828
808
  [[package]]
829
809
  name = "indoc"
830
- version = "2.0.6"
810
+ version = "2.0.7"
831
811
  source = "registry+https://github.com/rust-lang/crates.io-index"
832
- checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd"
833
-
834
- [[package]]
835
- name = "is-terminal"
836
- version = "0.4.16"
837
- source = "registry+https://github.com/rust-lang/crates.io-index"
838
- checksum = "e04d7f318608d35d4b61ddd75cbdaee86b023ebe2bd5a66ee0915f0bf93095a9"
812
+ checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706"
839
813
  dependencies = [
840
- "hermit-abi",
841
- "libc",
842
- "windows-sys 0.59.0",
814
+ "rustversion",
843
815
  ]
844
816
 
845
817
  [[package]]
846
818
  name = "is_terminal_polyfill"
847
- version = "1.70.1"
819
+ version = "1.70.2"
848
820
  source = "registry+https://github.com/rust-lang/crates.io-index"
849
- checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
821
+ checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
850
822
 
851
823
  [[package]]
852
824
  name = "itertools"
853
- version = "0.10.5"
825
+ version = "0.13.0"
854
826
  source = "registry+https://github.com/rust-lang/crates.io-index"
855
- checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473"
827
+ checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186"
856
828
  dependencies = [
857
829
  "either",
858
830
  ]
@@ -1106,9 +1078,9 @@ checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
1106
1078
 
1107
1079
  [[package]]
1108
1080
  name = "once_cell_polyfill"
1109
- version = "1.70.1"
1081
+ version = "1.70.2"
1110
1082
  source = "registry+https://github.com/rust-lang/crates.io-index"
1111
- checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad"
1083
+ checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
1112
1084
 
1113
1085
  [[package]]
1114
1086
  name = "oorandom"
@@ -1309,9 +1281,9 @@ dependencies = [
1309
1281
 
1310
1282
  [[package]]
1311
1283
  name = "pyo3"
1312
- version = "0.26.0"
1284
+ version = "0.27.1"
1313
1285
  source = "registry+https://github.com/rust-lang/crates.io-index"
1314
- checksum = "7ba0117f4212101ee6544044dae45abe1083d30ce7b29c4b5cbdfa2354e07383"
1286
+ checksum = "37a6df7eab65fc7bee654a421404947e10a0f7085b6951bf2ea395f4659fb0cf"
1315
1287
  dependencies = [
1316
1288
  "indoc",
1317
1289
  "libc",
@@ -1326,18 +1298,18 @@ dependencies = [
1326
1298
 
1327
1299
  [[package]]
1328
1300
  name = "pyo3-build-config"
1329
- version = "0.26.0"
1301
+ version = "0.27.1"
1330
1302
  source = "registry+https://github.com/rust-lang/crates.io-index"
1331
- checksum = "4fc6ddaf24947d12a9aa31ac65431fb1b851b8f4365426e182901eabfb87df5f"
1303
+ checksum = "f77d387774f6f6eec64a004eac0ed525aab7fa1966d94b42f743797b3e395afb"
1332
1304
  dependencies = [
1333
1305
  "target-lexicon",
1334
1306
  ]
1335
1307
 
1336
1308
  [[package]]
1337
1309
  name = "pyo3-ffi"
1338
- version = "0.26.0"
1310
+ version = "0.27.1"
1339
1311
  source = "registry+https://github.com/rust-lang/crates.io-index"
1340
- checksum = "025474d3928738efb38ac36d4744a74a400c901c7596199e20e45d98eb194105"
1312
+ checksum = "2dd13844a4242793e02df3e2ec093f540d948299a6a77ea9ce7afd8623f542be"
1341
1313
  dependencies = [
1342
1314
  "libc",
1343
1315
  "pyo3-build-config",
@@ -1345,9 +1317,9 @@ dependencies = [
1345
1317
 
1346
1318
  [[package]]
1347
1319
  name = "pyo3-macros"
1348
- version = "0.26.0"
1320
+ version = "0.27.1"
1349
1321
  source = "registry+https://github.com/rust-lang/crates.io-index"
1350
- checksum = "2e64eb489f22fe1c95911b77c44cc41e7c19f3082fc81cce90f657cdc42ffded"
1322
+ checksum = "eaf8f9f1108270b90d3676b8679586385430e5c0bb78bb5f043f95499c821a71"
1351
1323
  dependencies = [
1352
1324
  "proc-macro2",
1353
1325
  "pyo3-macros-backend",
@@ -1357,9 +1329,9 @@ dependencies = [
1357
1329
 
1358
1330
  [[package]]
1359
1331
  name = "pyo3-macros-backend"
1360
- version = "0.26.0"
1332
+ version = "0.27.1"
1361
1333
  source = "registry+https://github.com/rust-lang/crates.io-index"
1362
- checksum = "100246c0ecf400b475341b8455a9213344569af29a3c841d29270e53102e0fcf"
1334
+ checksum = "70a3b2274450ba5288bc9b8c1b69ff569d1d61189d4bff38f8d22e03d17f932b"
1363
1335
  dependencies = [
1364
1336
  "heck",
1365
1337
  "proc-macro2",
@@ -1637,9 +1609,9 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
1637
1609
 
1638
1610
  [[package]]
1639
1611
  name = "syn"
1640
- version = "2.0.106"
1612
+ version = "2.0.107"
1641
1613
  source = "registry+https://github.com/rust-lang/crates.io-index"
1642
- checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6"
1614
+ checksum = "2a26dbd934e5451d21ef060c018dae56fc073894c5a7896f882928a76e6d081b"
1643
1615
  dependencies = [
1644
1616
  "proc-macro2",
1645
1617
  "quote",
@@ -1670,7 +1642,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
1670
1642
  checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16"
1671
1643
  dependencies = [
1672
1644
  "fastrand",
1673
- "getrandom 0.3.3",
1645
+ "getrandom",
1674
1646
  "once_cell",
1675
1647
  "rustix",
1676
1648
  "windows-sys 0.61.2",
@@ -1741,9 +1713,9 @@ checksum = "b130bd8a58c163224b44e217b4239ca7b927d82bf6cc2fea1fc561d15056e3f7"
1741
1713
 
1742
1714
  [[package]]
1743
1715
  name = "unicode-ident"
1744
- version = "1.0.19"
1716
+ version = "1.0.20"
1745
1717
  source = "registry+https://github.com/rust-lang/crates.io-index"
1746
- checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d"
1718
+ checksum = "462eeb75aeb73aea900253ce739c8e18a67423fadf006037cd3ff27e82748a06"
1747
1719
 
1748
1720
  [[package]]
1749
1721
  name = "unicode-segmentation"
@@ -1812,21 +1784,6 @@ dependencies = [
1812
1784
  "winapi-util",
1813
1785
  ]
1814
1786
 
1815
- [[package]]
1816
- name = "wasi"
1817
- version = "0.11.1+wasi-snapshot-preview1"
1818
- source = "registry+https://github.com/rust-lang/crates.io-index"
1819
- checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
1820
-
1821
- [[package]]
1822
- name = "wasi"
1823
- version = "0.14.7+wasi-0.2.4"
1824
- source = "registry+https://github.com/rust-lang/crates.io-index"
1825
- checksum = "883478de20367e224c0090af9cf5f9fa85bed63a95c1abf3afc5c083ebc06e8c"
1826
- dependencies = [
1827
- "wasip2",
1828
- ]
1829
-
1830
1787
  [[package]]
1831
1788
  name = "wasip2"
1832
1789
  version = "1.0.1+wasi-0.2.4"
@@ -1975,22 +1932,13 @@ version = "0.2.1"
1975
1932
  source = "registry+https://github.com/rust-lang/crates.io-index"
1976
1933
  checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
1977
1934
 
1978
- [[package]]
1979
- name = "windows-sys"
1980
- version = "0.59.0"
1981
- source = "registry+https://github.com/rust-lang/crates.io-index"
1982
- checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
1983
- dependencies = [
1984
- "windows-targets 0.52.6",
1985
- ]
1986
-
1987
1935
  [[package]]
1988
1936
  name = "windows-sys"
1989
1937
  version = "0.60.2"
1990
1938
  source = "registry+https://github.com/rust-lang/crates.io-index"
1991
1939
  checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb"
1992
1940
  dependencies = [
1993
- "windows-targets 0.53.5",
1941
+ "windows-targets",
1994
1942
  ]
1995
1943
 
1996
1944
  [[package]]
@@ -2002,22 +1950,6 @@ dependencies = [
2002
1950
  "windows-link",
2003
1951
  ]
2004
1952
 
2005
- [[package]]
2006
- name = "windows-targets"
2007
- version = "0.52.6"
2008
- source = "registry+https://github.com/rust-lang/crates.io-index"
2009
- checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
2010
- dependencies = [
2011
- "windows_aarch64_gnullvm 0.52.6",
2012
- "windows_aarch64_msvc 0.52.6",
2013
- "windows_i686_gnu 0.52.6",
2014
- "windows_i686_gnullvm 0.52.6",
2015
- "windows_i686_msvc 0.52.6",
2016
- "windows_x86_64_gnu 0.52.6",
2017
- "windows_x86_64_gnullvm 0.52.6",
2018
- "windows_x86_64_msvc 0.52.6",
2019
- ]
2020
-
2021
1953
  [[package]]
2022
1954
  name = "windows-targets"
2023
1955
  version = "0.53.5"
@@ -2025,106 +1957,58 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
2025
1957
  checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3"
2026
1958
  dependencies = [
2027
1959
  "windows-link",
2028
- "windows_aarch64_gnullvm 0.53.1",
2029
- "windows_aarch64_msvc 0.53.1",
2030
- "windows_i686_gnu 0.53.1",
2031
- "windows_i686_gnullvm 0.53.1",
2032
- "windows_i686_msvc 0.53.1",
2033
- "windows_x86_64_gnu 0.53.1",
2034
- "windows_x86_64_gnullvm 0.53.1",
2035
- "windows_x86_64_msvc 0.53.1",
1960
+ "windows_aarch64_gnullvm",
1961
+ "windows_aarch64_msvc",
1962
+ "windows_i686_gnu",
1963
+ "windows_i686_gnullvm",
1964
+ "windows_i686_msvc",
1965
+ "windows_x86_64_gnu",
1966
+ "windows_x86_64_gnullvm",
1967
+ "windows_x86_64_msvc",
2036
1968
  ]
2037
1969
 
2038
- [[package]]
2039
- name = "windows_aarch64_gnullvm"
2040
- version = "0.52.6"
2041
- source = "registry+https://github.com/rust-lang/crates.io-index"
2042
- checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
2043
-
2044
1970
  [[package]]
2045
1971
  name = "windows_aarch64_gnullvm"
2046
1972
  version = "0.53.1"
2047
1973
  source = "registry+https://github.com/rust-lang/crates.io-index"
2048
1974
  checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53"
2049
1975
 
2050
- [[package]]
2051
- name = "windows_aarch64_msvc"
2052
- version = "0.52.6"
2053
- source = "registry+https://github.com/rust-lang/crates.io-index"
2054
- checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
2055
-
2056
1976
  [[package]]
2057
1977
  name = "windows_aarch64_msvc"
2058
1978
  version = "0.53.1"
2059
1979
  source = "registry+https://github.com/rust-lang/crates.io-index"
2060
1980
  checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006"
2061
1981
 
2062
- [[package]]
2063
- name = "windows_i686_gnu"
2064
- version = "0.52.6"
2065
- source = "registry+https://github.com/rust-lang/crates.io-index"
2066
- checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
2067
-
2068
1982
  [[package]]
2069
1983
  name = "windows_i686_gnu"
2070
1984
  version = "0.53.1"
2071
1985
  source = "registry+https://github.com/rust-lang/crates.io-index"
2072
1986
  checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3"
2073
1987
 
2074
- [[package]]
2075
- name = "windows_i686_gnullvm"
2076
- version = "0.52.6"
2077
- source = "registry+https://github.com/rust-lang/crates.io-index"
2078
- checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
2079
-
2080
1988
  [[package]]
2081
1989
  name = "windows_i686_gnullvm"
2082
1990
  version = "0.53.1"
2083
1991
  source = "registry+https://github.com/rust-lang/crates.io-index"
2084
1992
  checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c"
2085
1993
 
2086
- [[package]]
2087
- name = "windows_i686_msvc"
2088
- version = "0.52.6"
2089
- source = "registry+https://github.com/rust-lang/crates.io-index"
2090
- checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
2091
-
2092
1994
  [[package]]
2093
1995
  name = "windows_i686_msvc"
2094
1996
  version = "0.53.1"
2095
1997
  source = "registry+https://github.com/rust-lang/crates.io-index"
2096
1998
  checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2"
2097
1999
 
2098
- [[package]]
2099
- name = "windows_x86_64_gnu"
2100
- version = "0.52.6"
2101
- source = "registry+https://github.com/rust-lang/crates.io-index"
2102
- checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
2103
-
2104
2000
  [[package]]
2105
2001
  name = "windows_x86_64_gnu"
2106
2002
  version = "0.53.1"
2107
2003
  source = "registry+https://github.com/rust-lang/crates.io-index"
2108
2004
  checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499"
2109
2005
 
2110
- [[package]]
2111
- name = "windows_x86_64_gnullvm"
2112
- version = "0.52.6"
2113
- source = "registry+https://github.com/rust-lang/crates.io-index"
2114
- checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
2115
-
2116
2006
  [[package]]
2117
2007
  name = "windows_x86_64_gnullvm"
2118
2008
  version = "0.53.1"
2119
2009
  source = "registry+https://github.com/rust-lang/crates.io-index"
2120
2010
  checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1"
2121
2011
 
2122
- [[package]]
2123
- name = "windows_x86_64_msvc"
2124
- version = "0.52.6"
2125
- source = "registry+https://github.com/rust-lang/crates.io-index"
2126
- checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
2127
-
2128
2012
  [[package]]
2129
2013
  name = "windows_x86_64_msvc"
2130
2014
  version = "0.53.1"
@@ -3,7 +3,7 @@ resolver = "2"
3
3
  members = ["crates/html-to-markdown-py"]
4
4
 
5
5
  [workspace.package]
6
- version = "2.3.4"
6
+ version = "2.4.1"
7
7
  edition = "2021"
8
8
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
9
9
  license = "MIT"
@@ -15,7 +15,7 @@ rust-version = "1.80"
15
15
 
16
16
  [workspace.dependencies]
17
17
  # Core library
18
- html-to-markdown-rs = { version = "2.3.4", path = "crates/html-to-markdown" }
18
+ html-to-markdown-rs = { version = "2.4.1", path = "crates/html-to-markdown" }
19
19
 
20
20
  # HTML parsing and sanitization
21
21
  tl = "0.7"
@@ -34,7 +34,7 @@ clap_complete = "4.5"
34
34
  clap_mangen = "0.2"
35
35
 
36
36
  # Python bindings
37
- pyo3 = { version = "0.26.0", features = ["abi3-py310"] }
37
+ pyo3 = { version = "0.27.1", features = ["abi3-py310"] }
38
38
 
39
39
  [profile.release]
40
40
  lto = "thin"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: html-to-markdown
3
- Version: 2.3.4
3
+ Version: 2.4.1
4
4
  Classifier: Development Status :: 5 - Production/Stable
5
5
  Classifier: Environment :: Console
6
6
  Classifier: Intended Audience :: Developers
@@ -33,7 +33,7 @@ image = { version = "0.25", default-features = false, features = ["gif", "jpeg",
33
33
  [dev-dependencies]
34
34
  serde = { version = "1.0", features = ["derive"] }
35
35
  serde_json = "1.0"
36
- criterion = { version = "0.5", features = ["html_reports"] }
36
+ criterion = { version = "0.7", features = ["html_reports"] }
37
37
 
38
38
  [[bench]]
39
39
  name = "conversion_benchmark"
@@ -47,6 +47,9 @@ use std::collections::BTreeMap;
47
47
  #[cfg(feature = "inline-images")]
48
48
  use std::rc::Rc;
49
49
 
50
+ use std::borrow::Cow;
51
+ use std::str;
52
+
50
53
  use crate::error::Result;
51
54
  #[cfg(feature = "inline-images")]
52
55
  use crate::inline_images::{InlineImageCollector, InlineImageFormat, InlineImageSource};
@@ -971,7 +974,9 @@ fn convert_html_impl(
971
974
  .replace("<hr/>", "<hr>")
972
975
  .replace("<img/>", "<img>");
973
976
 
974
- let dom = tl::parse(&html, tl::ParserOptions::default())
977
+ let html = strip_script_and_style_sections(&html);
978
+
979
+ let dom = tl::parse(html.as_ref(), tl::ParserOptions::default())
975
980
  .map_err(|_| crate::error::ConversionError::ParseError("Failed to parse HTML".to_string()))?;
976
981
 
977
982
  let parser = dom.parser();
@@ -1075,6 +1080,161 @@ fn convert_html_impl(
1075
1080
  }
1076
1081
  }
1077
1082
 
1083
+ fn strip_script_and_style_sections(input: &str) -> Cow<'_, str> {
1084
+ const TAGS: [&[u8]; 2] = [b"script", b"style"];
1085
+ const SVG: &[u8] = b"svg";
1086
+
1087
+ let bytes = input.as_bytes();
1088
+ let len = bytes.len();
1089
+ let mut idx = 0;
1090
+ let mut last = 0;
1091
+ let mut output: Option<String> = None;
1092
+ let mut svg_depth = 0usize;
1093
+
1094
+ while idx < len {
1095
+ if bytes[idx] == b'<' {
1096
+ if matches_tag_start(bytes, idx + 1, SVG) {
1097
+ if let Some(open_end) = find_tag_end(bytes, idx + 1 + SVG.len()) {
1098
+ svg_depth += 1;
1099
+ idx = open_end;
1100
+ continue;
1101
+ }
1102
+ } else if matches_end_tag_start(bytes, idx + 1, SVG) {
1103
+ if let Some(close_end) = find_tag_end(bytes, idx + 2 + SVG.len()) {
1104
+ if svg_depth > 0 {
1105
+ svg_depth = svg_depth.saturating_sub(1);
1106
+ }
1107
+ idx = close_end;
1108
+ continue;
1109
+ }
1110
+ }
1111
+
1112
+ if svg_depth == 0 {
1113
+ let mut handled = false;
1114
+ for tag in TAGS {
1115
+ if matches_tag_start(bytes, idx + 1, tag) {
1116
+ if let Some(open_end) = find_tag_end(bytes, idx + 1 + tag.len()) {
1117
+ let remove_end = find_closing_tag(bytes, open_end, tag).unwrap_or(len);
1118
+ let out = output.get_or_insert_with(|| String::with_capacity(input.len()));
1119
+ out.push_str(&input[last..idx]);
1120
+ out.push_str(&input[idx..open_end]);
1121
+ out.push_str("</");
1122
+ out.push_str(str::from_utf8(tag).unwrap());
1123
+ out.push('>');
1124
+
1125
+ last = remove_end;
1126
+ idx = remove_end;
1127
+ handled = true;
1128
+ }
1129
+ }
1130
+
1131
+ if handled {
1132
+ break;
1133
+ }
1134
+ }
1135
+
1136
+ if handled {
1137
+ continue;
1138
+ }
1139
+ }
1140
+ }
1141
+
1142
+ idx += 1;
1143
+ }
1144
+
1145
+ if let Some(mut out) = output {
1146
+ if last < input.len() {
1147
+ out.push_str(&input[last..]);
1148
+ }
1149
+ Cow::Owned(out)
1150
+ } else {
1151
+ Cow::Borrowed(input)
1152
+ }
1153
+ }
1154
+
1155
/// Reports whether `tag` (compared ASCII case-insensitively) begins at
/// `bytes[start]` and is followed by a tag-name terminator.
///
/// A terminator is `>`, `/`, a space, tab, line feed or carriage return, or
/// the end of the input. `start` is expected to point just past the `<`.
/// Returns `false` when `start` is at or beyond the end of `bytes`.
fn matches_tag_start(bytes: &[u8], start: usize, tag: &[u8]) -> bool {
    if start >= bytes.len() {
        return false;
    }

    let name_end = start + tag.len();
    // `get` yields `None` when the candidate name would run past the input.
    let Some(candidate) = bytes.get(start..name_end) else {
        return false;
    };
    if !candidate.eq_ignore_ascii_case(tag) {
        return false;
    }

    // The name must end here; otherwise `<scripts` would match `script`.
    matches!(
        bytes.get(name_end),
        None | Some(b'>' | b'/' | b' ' | b'\t' | b'\n' | b'\r')
    )
}
1176
+
1177
/// Scans forward from `idx` for the `>` that terminates a tag.
///
/// A `>` inside a single- or double-quoted attribute value is skipped. A
/// quoted section is closed only by the same quote character that opened it.
///
/// Returns the index one past the terminating `>`, or `None` when the input
/// ends first (including when `idx` is already past the end).
fn find_tag_end(bytes: &[u8], idx: usize) -> Option<usize> {
    // The quote character currently open, if any.
    let mut open_quote: Option<u8> = None;

    for (pos, &byte) in bytes.iter().enumerate().skip(idx) {
        match (open_quote, byte) {
            (None, b'>') => return Some(pos + 1),
            (None, b'"' | b'\'') => open_quote = Some(byte),
            (Some(quote), _) if quote == byte => open_quote = None,
            _ => {}
        }
    }

    None
}
1200
+
1201
+ fn find_closing_tag(bytes: &[u8], mut idx: usize, tag: &[u8]) -> Option<usize> {
1202
+ let len = bytes.len();
1203
+ let mut depth = 1usize;
1204
+
1205
+ while idx < len {
1206
+ if bytes[idx] == b'<' {
1207
+ if matches_tag_start(bytes, idx + 1, tag) {
1208
+ if let Some(next) = find_tag_end(bytes, idx + 1 + tag.len()) {
1209
+ depth += 1;
1210
+ idx = next;
1211
+ continue;
1212
+ }
1213
+ } else if matches_end_tag_start(bytes, idx + 1, tag) {
1214
+ if let Some(close) = find_tag_end(bytes, idx + 2 + tag.len()) {
1215
+ depth -= 1;
1216
+ if depth == 0 {
1217
+ return Some(close);
1218
+ }
1219
+ idx = close;
1220
+ continue;
1221
+ }
1222
+ }
1223
+ }
1224
+
1225
+ idx += 1;
1226
+ }
1227
+
1228
+ None
1229
+ }
1230
+
1231
+ fn matches_end_tag_start(bytes: &[u8], start: usize, tag: &[u8]) -> bool {
1232
+ if start >= bytes.len() || bytes[start] != b'/' {
1233
+ return false;
1234
+ }
1235
+ matches_tag_start(bytes, start + 1, tag)
1236
+ }
1237
+
1078
1238
  /// Check if an element is inline (not block-level).
1079
1239
  fn is_inline_element(tag_name: &str) -> bool {
1080
1240
  matches!(
@@ -4002,6 +4162,22 @@ mod tests {
4002
4162
  assert_eq!(calculate_list_continuation_indent(4), 7);
4003
4163
  }
4004
4164
 
4165
#[test]
fn strips_script_sections_without_removing_following_content() {
    // The script body contains a bare `<`; the markup on both sides of the
    // script element must be returned unchanged, with only the body removed.
    let html = "<div>before</div><script>1 < 2</script><p>after</p>";
    assert_eq!(
        strip_script_and_style_sections(html),
        "<div>before</div><script></script><p>after</p>"
    );
}
4171
+
4172
#[test]
fn strips_multiline_script_sections() {
    // Same bare `<` in the script body, but with the surrounding document
    // spread over several lines.
    let html = "<html>\n<script>1 < 2</script>\nContent\n</html>";
    let result = strip_script_and_style_sections(html);

    // Text after the script survives and the script tag itself is kept...
    for kept in ["Content", "<script"] {
        assert!(result.contains(kept));
    }
    // ...while the script body is gone.
    assert!(!result.contains("1 < 2"));
}
4180
+
4005
4181
  #[test]
4006
4182
  fn test_add_list_continuation_indent_blank_line() {
4007
4183
  let opts = ConversionOptions::default();
@@ -92,6 +92,7 @@ pub fn convert_to_markdown_with_options(
92
92
  );
93
93
  }
94
94
 
95
+ collapse_extra_newlines(&mut output);
95
96
  output.trim().to_string()
96
97
  }
97
98
 
@@ -185,7 +186,7 @@ fn convert_element(
185
186
  if enable_spatial_tables {
186
187
  if let Some(table_markdown) = try_spatial_table_reconstruction(element) {
187
188
  output.push_str(&table_markdown);
188
- output.push_str("\n\n");
189
+ ensure_trailing_blank_line(output);
189
190
  return;
190
191
  }
191
192
  }
@@ -324,7 +325,7 @@ fn convert_element(
324
325
  if enable_spatial_tables {
325
326
  if let Some(table_markdown) = try_spatial_table_reconstruction(element) {
326
327
  output.push_str(&table_markdown);
327
- output.push_str("\n\n");
328
+ ensure_trailing_blank_line(output);
328
329
  } else {
329
330
  // Fallback: process children normally
330
331
  let mut sorted_children: Vec<_> = element.children.iter().collect();
@@ -334,7 +335,7 @@ fn convert_element(
334
335
  for child in sorted_children {
335
336
  convert_element(child, output, depth + 1, preserve_structure, enable_spatial_tables, ctx);
336
337
  }
337
- output.push_str("\n\n");
338
+ ensure_trailing_blank_line(output);
338
339
  }
339
340
  } else {
340
341
  // Fallback: process children normally
@@ -345,7 +346,7 @@ fn convert_element(
345
346
  for child in sorted_children {
346
347
  convert_element(child, output, depth + 1, preserve_structure, enable_spatial_tables, ctx);
347
348
  }
348
- output.push_str("\n\n");
349
+ ensure_trailing_blank_line(output);
349
350
  }
350
351
  }
351
352
 
@@ -361,7 +362,7 @@ fn convert_element(
361
362
  for child in sorted_children {
362
363
  convert_element(child, output, depth + 1, preserve_structure, enable_spatial_tables, ctx);
363
364
  }
364
- output.push_str("\n\n");
365
+ ensure_trailing_blank_line(output);
365
366
  }
366
367
 
367
368
  // Images - markdown image placeholder or alt text
@@ -463,6 +464,41 @@ fn convert_element(
463
464
  }
464
465
  }
465
466
 
467
/// Adjusts the tail of `output` so that it ends with exactly two newline
/// characters.
///
/// Missing newlines are appended and any beyond the second are removed; an
/// empty buffer becomes `"\n\n"`.
fn ensure_trailing_blank_line(output: &mut String) {
    let trailing = output.bytes().rev().take_while(|&b| b == b'\n').count();

    match trailing {
        0 => output.push_str("\n\n"),
        1 => output.push('\n'),
        2 => {}
        // '\n' is a single byte, so trimming by count stays on a char boundary.
        extra => output.truncate(output.len() - (extra - 2)),
    }
}
480
+
481
/// Collapses every run of three or more consecutive `'\n'` characters in
/// `output` down to exactly two, leaving all other characters untouched.
///
/// The filtering is done in place with `String::retain`, so no second buffer
/// is allocated and a string with nothing to collapse is left as it is.
fn collapse_extra_newlines(output: &mut String) {
    // Length of the current newline run. Saturating keeps the counter from
    // overflowing on arbitrarily long runs; only "more than two" matters.
    let mut run: u8 = 0;

    output.retain(|ch| {
        if ch == '\n' {
            run = run.saturating_add(1);
            // Keep the first two newlines of a run and drop the rest.
            run <= 2
        } else {
            run = 0;
            true
        }
    });
}
501
+
466
502
  fn append_text_and_children(
467
503
  element: &HocrElement,
468
504
  output: &mut String,
@@ -49,4 +49,4 @@ __all__ = [
49
49
  "markdownify",
50
50
  ]
51
51
 
52
- __version__ = "2.3.4"
52
+ __version__ = "2.4.1"
@@ -7,7 +7,7 @@ requires = [
7
7
 
8
8
  [project]
9
9
  name = "html-to-markdown"
10
- version = "2.3.4"
10
+ version = "2.4.1"
11
11
  description = "High-performance HTML to Markdown converter powered by Rust with a clean Python API"
12
12
  readme = "README_PYPI.md"
13
13
  keywords = [
@@ -54,15 +54,15 @@ urls.Repository = "https://github.com/Goldziher/html-to-markdown.git"
54
54
  [dependency-groups]
55
55
  dev = [
56
56
  "covdefaults>=2.3",
57
- "memray>=1.18; sys_platform!='win32'",
57
+ "memray>=1.19.1; sys_platform!='win32'",
58
58
  "mypy>=1.18.2",
59
59
  "pre-commit>=4.3",
60
- "psutil>=7.1; sys_platform!='win32'",
60
+ "psutil>=7.1.1; sys_platform!='win32'",
61
61
  "pytest>=8.4.2",
62
62
  "pytest-benchmark>=5.1",
63
63
  "pytest-cov>=7",
64
64
  "pytest-mock>=3.15.1",
65
- "ruff>=0.13.3",
65
+ "ruff>=0.14.1",
66
66
  "types-psutil>=7.0.0.20251001",
67
67
  "uv-bump",
68
68
  ]