html-to-markdown 2.4.1__tar.gz → 2.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/Cargo.lock +31 -34
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/Cargo.toml +2 -2
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/PKG-INFO +7 -4
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/README_PYPI.md +6 -3
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/README.md +35 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/converter.rs +327 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/hocr/converter.rs +14 -1
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/hocr/extractor.rs +39 -50
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/lib.rs +2 -2
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/options.rs +6 -1
- html_to_markdown-2.5.0/crates/html-to-markdown/src/sanitizer.rs +284 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown-py/Cargo.toml +1 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown-py/README.md +19 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown-py/src/lib.rs +104 -89
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/html_to_markdown/__init__.py +1 -1
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/html_to_markdown/api.py +1 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/html_to_markdown/bin/html-to-markdown +0 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/html_to_markdown/options.py +5 -2
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/pyproject.toml +1 -1
- html_to_markdown-2.4.1/crates/html-to-markdown/src/sanitizer.rs +0 -85
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/LICENSE +0 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/Cargo.toml +0 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/benches/conversion_benchmark.rs +0 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/benches/micro_benchmark.rs +0 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/benches/profiling_benchmark.rs +0 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/examples/basic.rs +0 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/examples/table.rs +0 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/examples/test_escape.rs +0 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/examples/test_inline_formatting.rs +0 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/examples/test_lists.rs +0 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/examples/test_semantic_tags.rs +0 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/examples/test_tables.rs +0 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/examples/test_task_lists.rs +0 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/examples/test_whitespace.rs +0 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/error.rs +0 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/hocr/mod.rs +0 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/hocr/parser.rs +0 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/hocr/spatial.rs +0 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/hocr/types.rs +0 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/inline_images.rs +0 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/text.rs +0 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/wrapper.rs +0 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/tests/commonmark_compliance_test.rs +0 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/tests/hocr_compliance_test.rs +0 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/tests/integration_test.rs +0 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown-py/python/html_to_markdown/__init__.py +0 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown-py/python/html_to_markdown/_html_to_markdown.pyi +0 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown-py/uv.lock +0 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/html_to_markdown/__main__.py +0 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/html_to_markdown/_rust.pyi +0 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/html_to_markdown/cli.py +0 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/html_to_markdown/cli_proxy.py +0 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/html_to_markdown/exceptions.py +0 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/html_to_markdown/py.typed +0 -0
- {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/html_to_markdown/v1_compat.py +0 -0
|
@@ -157,9 +157,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
|
|
|
157
157
|
|
|
158
158
|
[[package]]
|
|
159
159
|
name = "cc"
|
|
160
|
-
version = "1.2.
|
|
160
|
+
version = "1.2.42"
|
|
161
161
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
162
|
-
checksum = "
|
|
162
|
+
checksum = "81bbf3b3619004ad9bd139f62a9ab5cfe467f307455a0d307b0cf58bf070feaa"
|
|
163
163
|
dependencies = [
|
|
164
164
|
"find-msvc-tools",
|
|
165
165
|
"shlex",
|
|
@@ -249,9 +249,9 @@ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d"
|
|
|
249
249
|
|
|
250
250
|
[[package]]
|
|
251
251
|
name = "clap_mangen"
|
|
252
|
-
version = "0.2.
|
|
252
|
+
version = "0.2.31"
|
|
253
253
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
254
|
-
checksum = "
|
|
254
|
+
checksum = "439ea63a92086df93893164221ad4f24142086d535b3a0957b9b9bea2dc86301"
|
|
255
255
|
dependencies = [
|
|
256
256
|
"clap",
|
|
257
257
|
"roff",
|
|
@@ -386,9 +386,9 @@ dependencies = [
|
|
|
386
386
|
|
|
387
387
|
[[package]]
|
|
388
388
|
name = "ctor"
|
|
389
|
-
version = "0.
|
|
389
|
+
version = "0.6.0"
|
|
390
390
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
391
|
-
checksum = "
|
|
391
|
+
checksum = "59c9b8bdf64ee849747c1b12eb861d21aa47fa161564f48332f1afe2373bf899"
|
|
392
392
|
dependencies = [
|
|
393
393
|
"ctor-proc-macro",
|
|
394
394
|
"dtor",
|
|
@@ -396,9 +396,9 @@ dependencies = [
|
|
|
396
396
|
|
|
397
397
|
[[package]]
|
|
398
398
|
name = "ctor-proc-macro"
|
|
399
|
-
version = "0.0.
|
|
399
|
+
version = "0.0.7"
|
|
400
400
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
401
|
-
checksum = "
|
|
401
|
+
checksum = "52560adf09603e58c9a7ee1fe1dcb95a16927b17c127f0ac02d6e768a0e25bc1"
|
|
402
402
|
|
|
403
403
|
[[package]]
|
|
404
404
|
name = "cty"
|
|
@@ -425,9 +425,9 @@ dependencies = [
|
|
|
425
425
|
|
|
426
426
|
[[package]]
|
|
427
427
|
name = "doc-comment"
|
|
428
|
-
version = "0.3.
|
|
428
|
+
version = "0.3.4"
|
|
429
429
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
430
|
-
checksum = "
|
|
430
|
+
checksum = "780955b8b195a21ab8e4ac6b60dd1dbdcec1dc6c51c0617964b08c81785e12c9"
|
|
431
431
|
|
|
432
432
|
[[package]]
|
|
433
433
|
name = "dtoa"
|
|
@@ -507,9 +507,9 @@ checksum = "52051878f80a721bb68ebfbc930e07b65ba72f2da88968ea5c06fd6ca3d3a127"
|
|
|
507
507
|
|
|
508
508
|
[[package]]
|
|
509
509
|
name = "flate2"
|
|
510
|
-
version = "1.1.
|
|
510
|
+
version = "1.1.5"
|
|
511
511
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
512
|
-
checksum = "
|
|
512
|
+
checksum = "bfe33edd8e85a12a67454e37f8c75e730830d83e313556ab9ebf9ee7fbeb3bfb"
|
|
513
513
|
dependencies = [
|
|
514
514
|
"crc32fast",
|
|
515
515
|
"miniz_oxide",
|
|
@@ -550,11 +550,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
|
550
550
|
checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
|
|
551
551
|
dependencies = [
|
|
552
552
|
"cfg-if",
|
|
553
|
-
"js-sys",
|
|
554
553
|
"libc",
|
|
555
554
|
"r-efi",
|
|
556
555
|
"wasip2",
|
|
557
|
-
"wasm-bindgen",
|
|
558
556
|
]
|
|
559
557
|
|
|
560
558
|
[[package]]
|
|
@@ -595,7 +593,7 @@ dependencies = [
|
|
|
595
593
|
|
|
596
594
|
[[package]]
|
|
597
595
|
name = "html-to-markdown-cli"
|
|
598
|
-
version = "2.
|
|
596
|
+
version = "2.5.0"
|
|
599
597
|
dependencies = [
|
|
600
598
|
"assert_cmd",
|
|
601
599
|
"clap",
|
|
@@ -609,7 +607,7 @@ dependencies = [
|
|
|
609
607
|
|
|
610
608
|
[[package]]
|
|
611
609
|
name = "html-to-markdown-node"
|
|
612
|
-
version = "2.
|
|
610
|
+
version = "2.5.0"
|
|
613
611
|
dependencies = [
|
|
614
612
|
"html-to-markdown-rs",
|
|
615
613
|
"mimalloc-rust",
|
|
@@ -620,7 +618,7 @@ dependencies = [
|
|
|
620
618
|
|
|
621
619
|
[[package]]
|
|
622
620
|
name = "html-to-markdown-py"
|
|
623
|
-
version = "2.
|
|
621
|
+
version = "2.5.0"
|
|
624
622
|
dependencies = [
|
|
625
623
|
"base64",
|
|
626
624
|
"html-to-markdown-rs",
|
|
@@ -630,7 +628,7 @@ dependencies = [
|
|
|
630
628
|
|
|
631
629
|
[[package]]
|
|
632
630
|
name = "html-to-markdown-rs"
|
|
633
|
-
version = "2.
|
|
631
|
+
version = "2.5.0"
|
|
634
632
|
dependencies = [
|
|
635
633
|
"ammonia",
|
|
636
634
|
"base64",
|
|
@@ -647,10 +645,9 @@ dependencies = [
|
|
|
647
645
|
|
|
648
646
|
[[package]]
|
|
649
647
|
name = "html-to-markdown-wasm"
|
|
650
|
-
version = "2.
|
|
648
|
+
version = "2.5.0"
|
|
651
649
|
dependencies = [
|
|
652
650
|
"console_error_panic_hook",
|
|
653
|
-
"getrandom",
|
|
654
651
|
"html-to-markdown-rs",
|
|
655
652
|
"js-sys",
|
|
656
653
|
"serde",
|
|
@@ -989,9 +986,9 @@ dependencies = [
|
|
|
989
986
|
|
|
990
987
|
[[package]]
|
|
991
988
|
name = "napi"
|
|
992
|
-
version = "3.
|
|
989
|
+
version = "3.4.0"
|
|
993
990
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
994
|
-
checksum = "
|
|
991
|
+
checksum = "c3a1135cfe16ca43ac82ac05858554fc39c037d8e4592f2b4a83d7ef8e822f43"
|
|
995
992
|
dependencies = [
|
|
996
993
|
"bitflags",
|
|
997
994
|
"ctor",
|
|
@@ -1003,15 +1000,15 @@ dependencies = [
|
|
|
1003
1000
|
|
|
1004
1001
|
[[package]]
|
|
1005
1002
|
name = "napi-build"
|
|
1006
|
-
version = "2.2.
|
|
1003
|
+
version = "2.2.4"
|
|
1007
1004
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1008
|
-
checksum = "
|
|
1005
|
+
checksum = "3ae82775d1b06f3f07efd0666e59bbc175da8383bc372051031d7a447e94fbea"
|
|
1009
1006
|
|
|
1010
1007
|
[[package]]
|
|
1011
1008
|
name = "napi-derive"
|
|
1012
|
-
version = "3.
|
|
1009
|
+
version = "3.3.0"
|
|
1013
1010
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1014
|
-
checksum = "
|
|
1011
|
+
checksum = "78665d6bdf10e9a4e6b38123efb0f66962e6197c1aea2f07cff3f159a374696d"
|
|
1015
1012
|
dependencies = [
|
|
1016
1013
|
"convert_case",
|
|
1017
1014
|
"ctor",
|
|
@@ -1023,9 +1020,9 @@ dependencies = [
|
|
|
1023
1020
|
|
|
1024
1021
|
[[package]]
|
|
1025
1022
|
name = "napi-derive-backend"
|
|
1026
|
-
version = "
|
|
1023
|
+
version = "3.0.0"
|
|
1027
1024
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1028
|
-
checksum = "
|
|
1025
|
+
checksum = "42d55d01423e7264de3acc13b258fa48ca7cf38a4d25db848908ec3c1304a85a"
|
|
1029
1026
|
dependencies = [
|
|
1030
1027
|
"convert_case",
|
|
1031
1028
|
"proc-macro2",
|
|
@@ -1036,9 +1033,9 @@ dependencies = [
|
|
|
1036
1033
|
|
|
1037
1034
|
[[package]]
|
|
1038
1035
|
name = "napi-sys"
|
|
1039
|
-
version = "3.0.
|
|
1036
|
+
version = "3.0.1"
|
|
1040
1037
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1041
|
-
checksum = "
|
|
1038
|
+
checksum = "1ed8f0e23a62a3ce0fbb6527cdc056e9282ddd9916b068c46f8923e18eed5ee6"
|
|
1042
1039
|
dependencies = [
|
|
1043
1040
|
"libloading",
|
|
1044
1041
|
]
|
|
@@ -1263,9 +1260,9 @@ dependencies = [
|
|
|
1263
1260
|
|
|
1264
1261
|
[[package]]
|
|
1265
1262
|
name = "proc-macro2"
|
|
1266
|
-
version = "1.0.
|
|
1263
|
+
version = "1.0.103"
|
|
1267
1264
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1268
|
-
checksum = "
|
|
1265
|
+
checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8"
|
|
1269
1266
|
dependencies = [
|
|
1270
1267
|
"unicode-ident",
|
|
1271
1268
|
]
|
|
@@ -1609,9 +1606,9 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
|
|
|
1609
1606
|
|
|
1610
1607
|
[[package]]
|
|
1611
1608
|
name = "syn"
|
|
1612
|
-
version = "2.0.
|
|
1609
|
+
version = "2.0.108"
|
|
1613
1610
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1614
|
-
checksum = "
|
|
1611
|
+
checksum = "da58917d35242480a05c2897064da0a80589a2a0476c9a3f2fdc83b53502e917"
|
|
1615
1612
|
dependencies = [
|
|
1616
1613
|
"proc-macro2",
|
|
1617
1614
|
"quote",
|
|
@@ -3,7 +3,7 @@ resolver = "2"
|
|
|
3
3
|
members = ["crates/html-to-markdown-py"]
|
|
4
4
|
|
|
5
5
|
[workspace.package]
|
|
6
|
-
version = "2.
|
|
6
|
+
version = "2.5.0"
|
|
7
7
|
edition = "2021"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
9
9
|
license = "MIT"
|
|
@@ -15,7 +15,7 @@ rust-version = "1.80"
|
|
|
15
15
|
|
|
16
16
|
[workspace.dependencies]
|
|
17
17
|
# Core library
|
|
18
|
-
html-to-markdown-rs = { version = "2.
|
|
18
|
+
html-to-markdown-rs = { version = "2.5.0", path = "crates/html-to-markdown" }
|
|
19
19
|
|
|
20
20
|
# HTML parsing and sanitization
|
|
21
21
|
tl = "0.7"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.5.0
|
|
4
4
|
Classifier: Development Status :: 5 - Production/Stable
|
|
5
5
|
Classifier: Environment :: Console
|
|
6
6
|
Classifier: Intended Audience :: Developers
|
|
@@ -162,9 +162,12 @@ Key fields (see docstring for full matrix):
|
|
|
162
162
|
|
|
163
163
|
### `PreprocessingOptions`
|
|
164
164
|
|
|
165
|
-
- `enabled`: enable HTML sanitisation
|
|
166
|
-
- `preset`: `"minimal" | "standard" | "aggressive"`
|
|
167
|
-
- `remove_navigation
|
|
165
|
+
- `enabled`: enable HTML sanitisation (default: `True` since v2.4.2 for robust malformed HTML handling)
|
|
166
|
+
- `preset`: `"minimal" | "standard" | "aggressive"` (default: `"standard"`)
|
|
167
|
+
- `remove_navigation`: remove navigation elements (default: `True`)
|
|
168
|
+
- `remove_forms`: remove form elements (default: `True`)
|
|
169
|
+
|
|
170
|
+
**Note:** As of v2.4.2, preprocessing is enabled by default to ensure robust handling of malformed HTML (e.g., bare angle brackets like `1<2` in content). Set `enabled=False` if you need minimal preprocessing.
|
|
168
171
|
|
|
169
172
|
### `InlineImageConfig`
|
|
170
173
|
|
|
@@ -129,9 +129,12 @@ Key fields (see docstring for full matrix):
|
|
|
129
129
|
|
|
130
130
|
### `PreprocessingOptions`
|
|
131
131
|
|
|
132
|
-
- `enabled`: enable HTML sanitisation
|
|
133
|
-
- `preset`: `"minimal" | "standard" | "aggressive"`
|
|
134
|
-
- `remove_navigation
|
|
132
|
+
- `enabled`: enable HTML sanitisation (default: `True` since v2.4.2 for robust malformed HTML handling)
|
|
133
|
+
- `preset`: `"minimal" | "standard" | "aggressive"` (default: `"standard"`)
|
|
134
|
+
- `remove_navigation`: remove navigation elements (default: `True`)
|
|
135
|
+
- `remove_forms`: remove form elements (default: `True`)
|
|
136
|
+
|
|
137
|
+
**Note:** As of v2.4.2, preprocessing is enabled by default to ensure robust handling of malformed HTML (e.g., bare angle brackets like `1<2` in content). Set `enabled=False` if you need minimal preprocessing.
|
|
135
138
|
|
|
136
139
|
### `InlineImageConfig`
|
|
137
140
|
|
|
@@ -60,6 +60,41 @@ let options = ConversionOptions {
|
|
|
60
60
|
let markdown = convert(html, Some(options))?;
|
|
61
61
|
```
|
|
62
62
|
|
|
63
|
+
### Preserving HTML Tags
|
|
64
|
+
|
|
65
|
+
The `preserve_tags` option allows you to keep specific HTML tags in their original form instead of converting them to Markdown. This is useful for complex elements like tables that may not convert well:
|
|
66
|
+
|
|
67
|
+
```rust
|
|
68
|
+
use html_to_markdown_rs::{convert, ConversionOptions};
|
|
69
|
+
|
|
70
|
+
let html = r#"
|
|
71
|
+
<p>Before table</p>
|
|
72
|
+
<table class="data">
|
|
73
|
+
<tr><th>Name</th><th>Value</th></tr>
|
|
74
|
+
<tr><td>Item 1</td><td>100</td></tr>
|
|
75
|
+
</table>
|
|
76
|
+
<p>After table</p>
|
|
77
|
+
"#;
|
|
78
|
+
|
|
79
|
+
let options = ConversionOptions {
|
|
80
|
+
preserve_tags: vec!["table".to_string()],
|
|
81
|
+
..Default::default()
|
|
82
|
+
};
|
|
83
|
+
|
|
84
|
+
let markdown = convert(html, Some(options))?;
|
|
85
|
+
// Result: "Before table\n\n<table class=\"data\">...</table>\n\nAfter table\n"
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
You can preserve multiple tag types and combine with `strip_tags`:
|
|
89
|
+
|
|
90
|
+
```rust
|
|
91
|
+
let options = ConversionOptions {
|
|
92
|
+
preserve_tags: vec!["table".to_string(), "form".to_string()],
|
|
93
|
+
strip_tags: vec!["script".to_string(), "style".to_string()],
|
|
94
|
+
..Default::default()
|
|
95
|
+
};
|
|
96
|
+
```
|
|
97
|
+
|
|
63
98
|
## Web Scraping with Preprocessing
|
|
64
99
|
|
|
65
100
|
```rust
|
|
@@ -974,6 +974,9 @@ fn convert_html_impl(
|
|
|
974
974
|
.replace("<hr/>", "<hr>")
|
|
975
975
|
.replace("<img/>", "<img>");
|
|
976
976
|
|
|
977
|
+
// Escape malformed angle brackets in text content to prevent parser failures
|
|
978
|
+
let html = escape_malformed_angle_brackets(&html);
|
|
979
|
+
|
|
977
980
|
let html = strip_script_and_style_sections(&html);
|
|
978
981
|
|
|
979
982
|
let dom = tl::parse(html.as_ref(), tl::ParserOptions::default())
|
|
@@ -1080,6 +1083,151 @@ fn convert_html_impl(
|
|
|
1080
1083
|
}
|
|
1081
1084
|
}
|
|
1082
1085
|
|
|
1086
|
+
/// Escape malformed angle brackets in HTML that are not part of valid tags.
|
|
1087
|
+
///
|
|
1088
|
+
/// This function ensures robust parsing by escaping bare `<` and `>` characters
|
|
1089
|
+
/// that appear in text content and are not part of HTML tags. This prevents
|
|
1090
|
+
/// parser failures on malformed HTML like "1<2" or comparisons in text.
|
|
1091
|
+
///
|
|
1092
|
+
/// # Examples
|
|
1093
|
+
///
|
|
1094
|
+
/// - `1<2` becomes `1&lt;2`
|
|
1095
|
+
/// - `<div>1<2</div>` becomes `<div>1&lt;2</div>`
|
|
1096
|
+
/// - `<script>1 < 2</script>` remains unchanged (handled by script stripping)
|
|
1097
|
+
fn escape_malformed_angle_brackets(input: &str) -> Cow<'_, str> {
|
|
1098
|
+
let bytes = input.as_bytes();
|
|
1099
|
+
let len = bytes.len();
|
|
1100
|
+
let mut idx = 0;
|
|
1101
|
+
let mut last = 0;
|
|
1102
|
+
let mut output: Option<String> = None;
|
|
1103
|
+
|
|
1104
|
+
while idx < len {
|
|
1105
|
+
if bytes[idx] == b'<' {
|
|
1106
|
+
// Check if this is a valid tag start
|
|
1107
|
+
if idx + 1 < len {
|
|
1108
|
+
let next = bytes[idx + 1];
|
|
1109
|
+
|
|
1110
|
+
// Valid tag patterns: <tagname, </tagname, <!doctype, <!--
|
|
1111
|
+
let is_valid_tag = match next {
|
|
1112
|
+
b'!' => {
|
|
1113
|
+
// DOCTYPE or comment
|
|
1114
|
+
idx + 2 < len
|
|
1115
|
+
&& (bytes[idx + 2] == b'-'
|
|
1116
|
+
|| bytes[idx + 2].is_ascii_alphabetic()
|
|
1117
|
+
|| bytes[idx + 2].is_ascii_uppercase())
|
|
1118
|
+
}
|
|
1119
|
+
b'/' => {
|
|
1120
|
+
// Closing tag
|
|
1121
|
+
idx + 2 < len && (bytes[idx + 2].is_ascii_alphabetic() || bytes[idx + 2].is_ascii_uppercase())
|
|
1122
|
+
}
|
|
1123
|
+
b'?' => {
|
|
1124
|
+
// XML declaration
|
|
1125
|
+
true
|
|
1126
|
+
}
|
|
1127
|
+
c if c.is_ascii_alphabetic() || c.is_ascii_uppercase() => {
|
|
1128
|
+
// Opening tag
|
|
1129
|
+
true
|
|
1130
|
+
}
|
|
1131
|
+
_ => false,
|
|
1132
|
+
};
|
|
1133
|
+
|
|
1134
|
+
if !is_valid_tag {
|
|
1135
|
+
// This is a bare `<` that should be escaped
|
|
1136
|
+
let out = output.get_or_insert_with(|| String::with_capacity(input.len() + 4));
|
|
1137
|
+
out.push_str(&input[last..idx]);
|
|
1138
|
+
out.push_str("&lt;");
|
|
1139
|
+
last = idx + 1;
|
|
1140
|
+
}
|
|
1141
|
+
} else {
|
|
1142
|
+
// `<` at end of string - escape it
|
|
1143
|
+
let out = output.get_or_insert_with(|| String::with_capacity(input.len() + 4));
|
|
1144
|
+
out.push_str(&input[last..idx]);
|
|
1145
|
+
out.push_str("&lt;");
|
|
1146
|
+
last = idx + 1;
|
|
1147
|
+
}
|
|
1148
|
+
}
|
|
1149
|
+
idx += 1;
|
|
1150
|
+
}
|
|
1151
|
+
|
|
1152
|
+
if let Some(mut out) = output {
|
|
1153
|
+
if last < input.len() {
|
|
1154
|
+
out.push_str(&input[last..]);
|
|
1155
|
+
}
|
|
1156
|
+
Cow::Owned(out)
|
|
1157
|
+
} else {
|
|
1158
|
+
Cow::Borrowed(input)
|
|
1159
|
+
}
|
|
1160
|
+
}
|
|
1161
|
+
|
|
1162
|
+
/// Serialize a tag and its children back to HTML.
|
|
1163
|
+
///
|
|
1164
|
+
/// This is used for the preserve_tags feature to output original HTML for specific elements.
|
|
1165
|
+
fn serialize_tag_to_html(handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
|
|
1166
|
+
let mut html = String::new();
|
|
1167
|
+
serialize_node_to_html(handle, parser, &mut html);
|
|
1168
|
+
html
|
|
1169
|
+
}
|
|
1170
|
+
|
|
1171
|
+
/// Recursively serialize a node to HTML.
|
|
1172
|
+
fn serialize_node_to_html(handle: &tl::NodeHandle, parser: &tl::Parser, output: &mut String) {
|
|
1173
|
+
match handle.get(parser) {
|
|
1174
|
+
Some(tl::Node::Tag(tag)) => {
|
|
1175
|
+
let tag_name = tag.name().as_utf8_str();
|
|
1176
|
+
|
|
1177
|
+
// Opening tag
|
|
1178
|
+
output.push('<');
|
|
1179
|
+
output.push_str(&tag_name);
|
|
1180
|
+
|
|
1181
|
+
// Attributes
|
|
1182
|
+
for (key, value) in tag.attributes().iter() {
|
|
1183
|
+
output.push(' ');
|
|
1184
|
+
output.push_str(&key);
|
|
1185
|
+
if let Some(val) = value {
|
|
1186
|
+
output.push_str("=\"");
|
|
1187
|
+
output.push_str(&val);
|
|
1188
|
+
output.push('"');
|
|
1189
|
+
}
|
|
1190
|
+
}
|
|
1191
|
+
|
|
1192
|
+
output.push('>');
|
|
1193
|
+
|
|
1194
|
+
// Children
|
|
1195
|
+
let children = tag.children();
|
|
1196
|
+
for child_handle in children.top().iter() {
|
|
1197
|
+
serialize_node_to_html(child_handle, parser, output);
|
|
1198
|
+
}
|
|
1199
|
+
|
|
1200
|
+
// Closing tag (skip for self-closing tags)
|
|
1201
|
+
if !matches!(
|
|
1202
|
+
tag_name.as_ref(),
|
|
1203
|
+
"br" | "hr"
|
|
1204
|
+
| "img"
|
|
1205
|
+
| "input"
|
|
1206
|
+
| "meta"
|
|
1207
|
+
| "link"
|
|
1208
|
+
| "area"
|
|
1209
|
+
| "base"
|
|
1210
|
+
| "col"
|
|
1211
|
+
| "embed"
|
|
1212
|
+
| "param"
|
|
1213
|
+
| "source"
|
|
1214
|
+
| "track"
|
|
1215
|
+
| "wbr"
|
|
1216
|
+
) {
|
|
1217
|
+
output.push_str("</");
|
|
1218
|
+
output.push_str(&tag_name);
|
|
1219
|
+
output.push('>');
|
|
1220
|
+
}
|
|
1221
|
+
}
|
|
1222
|
+
Some(tl::Node::Raw(bytes)) => {
|
|
1223
|
+
if let Ok(text) = std::str::from_utf8(bytes.as_bytes()) {
|
|
1224
|
+
output.push_str(text);
|
|
1225
|
+
}
|
|
1226
|
+
}
|
|
1227
|
+
_ => {}
|
|
1228
|
+
}
|
|
1229
|
+
}
|
|
1230
|
+
|
|
1083
1231
|
fn strip_script_and_style_sections(input: &str) -> Cow<'_, str> {
|
|
1084
1232
|
const TAGS: [&[u8]; 2] = [b"script", b"style"];
|
|
1085
1233
|
const SVG: &[u8] = b"svg";
|
|
@@ -1478,6 +1626,13 @@ fn walk_node(
|
|
|
1478
1626
|
return;
|
|
1479
1627
|
}
|
|
1480
1628
|
|
|
1629
|
+
// Preserve tags: output original HTML
|
|
1630
|
+
if options.preserve_tags.iter().any(|t| t.as_str() == tag_name) {
|
|
1631
|
+
let html = serialize_tag_to_html(node_handle, parser);
|
|
1632
|
+
output.push_str(&html);
|
|
1633
|
+
return;
|
|
1634
|
+
}
|
|
1635
|
+
|
|
1481
1636
|
match tag_name.as_ref() {
|
|
1482
1637
|
"h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
|
|
1483
1638
|
let level = tag_name.chars().last().and_then(|c| c.to_digit(10)).unwrap_or(1) as usize;
|
|
@@ -4225,4 +4380,176 @@ mod tests {
|
|
|
4225
4380
|
add_list_continuation_indent(&mut output, 1, false, &opts);
|
|
4226
4381
|
assert_eq!(output, "* First\n ");
|
|
4227
4382
|
}
|
|
4383
|
+
|
|
4384
|
+
#[test]
|
|
4385
|
+
fn test_escape_malformed_angle_brackets_bare() {
|
|
4386
|
+
let input = "1<2";
|
|
4387
|
+
let escaped = escape_malformed_angle_brackets(input);
|
|
4388
|
+
assert_eq!(escaped, "1&lt;2");
|
|
4389
|
+
}
|
|
4390
|
+
|
|
4391
|
+
#[test]
|
|
4392
|
+
fn test_escape_malformed_angle_brackets_in_text() {
|
|
4393
|
+
let input = "<html>1<2 Content</html>";
|
|
4394
|
+
let escaped = escape_malformed_angle_brackets(input);
|
|
4395
|
+
assert_eq!(escaped, "<html>1&lt;2 Content</html>");
|
|
4396
|
+
}
|
|
4397
|
+
|
|
4398
|
+
#[test]
|
|
4399
|
+
fn test_escape_malformed_angle_brackets_multiple() {
|
|
4400
|
+
let input = "1 < 2 < 3";
|
|
4401
|
+
let escaped = escape_malformed_angle_brackets(input);
|
|
4402
|
+
assert_eq!(escaped, "1 &lt; 2 &lt; 3");
|
|
4403
|
+
}
|
|
4404
|
+
|
|
4405
|
+
#[test]
|
|
4406
|
+
fn test_escape_malformed_angle_brackets_preserves_valid_tags() {
|
|
4407
|
+
let input = "<div>content</div>";
|
|
4408
|
+
let escaped = escape_malformed_angle_brackets(input);
|
|
4409
|
+
assert_eq!(escaped, "<div>content</div>");
|
|
4410
|
+
}
|
|
4411
|
+
|
|
4412
|
+
#[test]
|
|
4413
|
+
fn test_escape_malformed_angle_brackets_mixed() {
|
|
4414
|
+
let input = "<div>1<2</div><p>3<4</p>";
|
|
4415
|
+
let escaped = escape_malformed_angle_brackets(input);
|
|
4416
|
+
assert_eq!(escaped, "<div>1&lt;2</div><p>3&lt;4</p>");
|
|
4417
|
+
}
|
|
4418
|
+
|
|
4419
|
+
#[test]
|
|
4420
|
+
fn test_escape_malformed_angle_brackets_at_end() {
|
|
4421
|
+
let input = "test<";
|
|
4422
|
+
let escaped = escape_malformed_angle_brackets(input);
|
|
4423
|
+
assert_eq!(escaped, "test&lt;");
|
|
4424
|
+
}
|
|
4425
|
+
|
|
4426
|
+
#[test]
|
|
4427
|
+
fn test_escape_malformed_angle_brackets_preserves_comments() {
|
|
4428
|
+
let input = "<!-- comment -->1<2";
|
|
4429
|
+
let escaped = escape_malformed_angle_brackets(input);
|
|
4430
|
+
assert_eq!(escaped, "<!-- comment -->1&lt;2");
|
|
4431
|
+
}
|
|
4432
|
+
|
|
4433
|
+
#[test]
|
|
4434
|
+
fn test_escape_malformed_angle_brackets_preserves_doctype() {
|
|
4435
|
+
let input = "<!DOCTYPE html>1<2";
|
|
4436
|
+
let escaped = escape_malformed_angle_brackets(input);
|
|
4437
|
+
assert_eq!(escaped, "<!DOCTYPE html>1&lt;2");
|
|
4438
|
+
}
|
|
4439
|
+
|
|
4440
|
+
#[test]
|
|
4441
|
+
fn test_convert_with_malformed_angle_brackets() {
|
|
4442
|
+
// Test the full conversion pipeline (issue #94)
|
|
4443
|
+
let html = "<html>1<2\nContent</html>";
|
|
4444
|
+
let result = convert_html(html, &ConversionOptions::default()).unwrap();
|
|
4445
|
+
assert!(
|
|
4446
|
+
result.contains("Content"),
|
|
4447
|
+
"Result should contain 'Content': {:?}",
|
|
4448
|
+
result
|
|
4449
|
+
);
|
|
4450
|
+
assert!(
|
|
4451
|
+
result.contains("1<2") || result.contains("1&lt;2"),
|
|
4452
|
+
"Result should contain escaped or unescaped comparison"
|
|
4453
|
+
);
|
|
4454
|
+
}
|
|
4455
|
+
|
|
4456
|
+
#[test]
|
|
4457
|
+
fn test_convert_with_malformed_angle_brackets_in_div() {
|
|
4458
|
+
let html = "<html><div>1<2</div><div>Content</div></html>";
|
|
4459
|
+
let result = convert_html(html, &ConversionOptions::default()).unwrap();
|
|
4460
|
+
assert!(
|
|
4461
|
+
result.contains("Content"),
|
|
4462
|
+
"Result should contain 'Content': {:?}",
|
|
4463
|
+
result
|
|
4464
|
+
);
|
|
4465
|
+
}
|
|
4466
|
+
|
|
4467
|
+
#[test]
|
|
4468
|
+
fn test_convert_with_multiple_malformed_angle_brackets() {
|
|
4469
|
+
let html = "<html>1 < 2 < 3<p>Content</p></html>";
|
|
4470
|
+
let result = convert_html(html, &ConversionOptions::default()).unwrap();
|
|
4471
|
+
assert!(
|
|
4472
|
+
result.contains("Content"),
|
|
4473
|
+
"Result should contain 'Content': {:?}",
|
|
4474
|
+
result
|
|
4475
|
+
);
|
|
4476
|
+
}
|
|
4477
|
+
|
|
4478
|
+
#[test]
|
|
4479
|
+
fn test_preserve_tags_simple_table() {
|
|
4480
|
+
let html = r#"<div><table><tr><td>Cell 1</td><td>Cell 2</td></tr></table><p>Text</p></div>"#;
|
|
4481
|
+
let mut options = ConversionOptions::default();
|
|
4482
|
+
options.preserve_tags = vec!["table".to_string()];
|
|
4483
|
+
let result = convert_html(html, &options).unwrap();
|
|
4484
|
+
|
|
4485
|
+
assert!(result.contains("<table>"), "Should preserve table tag");
|
|
4486
|
+
assert!(result.contains("</table>"), "Should have closing table tag");
|
|
4487
|
+
assert!(result.contains("<tr>"), "Should preserve tr tag");
|
|
4488
|
+
assert!(result.contains("<td>"), "Should preserve td tag");
|
|
4489
|
+
assert!(result.contains("Text"), "Should convert other elements");
|
|
4490
|
+
}
|
|
4491
|
+
|
|
4492
|
+
#[test]
|
|
4493
|
+
fn test_preserve_tags_with_attributes() {
|
|
4494
|
+
let html = r#"<table class="data" id="mytable"><tr><td>Data</td></tr></table>"#;
|
|
4495
|
+
let mut options = ConversionOptions::default();
|
|
4496
|
+
options.preserve_tags = vec!["table".to_string()];
|
|
4497
|
+
let result = convert_html(html, &options).unwrap();
|
|
4498
|
+
|
|
4499
|
+
assert!(result.contains("<table"), "Should preserve table tag");
|
|
4500
|
+
assert!(result.contains("class="), "Should preserve class attribute");
|
|
4501
|
+
assert!(result.contains("id="), "Should preserve id attribute");
|
|
4502
|
+
assert!(result.contains("</table>"), "Should have closing tag");
|
|
4503
|
+
}
|
|
4504
|
+
|
|
4505
|
+
#[test]
|
|
4506
|
+
fn test_preserve_tags_multiple_tags() {
|
|
4507
|
+
let html = r#"<div><table><tr><td>Table</td></tr></table><form><input type="text"/></form><p>Text</p></div>"#;
|
|
4508
|
+
let mut options = ConversionOptions::default();
|
|
4509
|
+
options.preserve_tags = vec!["table".to_string(), "form".to_string()];
|
|
4510
|
+
let result = convert_html(html, &options).unwrap();
|
|
4511
|
+
|
|
4512
|
+
assert!(result.contains("<table>"), "Should preserve table");
|
|
4513
|
+
assert!(result.contains("<form>"), "Should preserve form");
|
|
4514
|
+
assert!(result.contains("Text"), "Should convert paragraph");
|
|
4515
|
+
}
|
|
4516
|
+
|
|
4517
|
+
#[test]
|
|
4518
|
+
fn test_preserve_tags_nested_content() {
|
|
4519
|
+
let html = r#"<table><thead><tr><th>Header</th></tr></thead><tbody><tr><td>Data</td></tr></tbody></table>"#;
|
|
4520
|
+
let mut options = ConversionOptions::default();
|
|
4521
|
+
options.preserve_tags = vec!["table".to_string()];
|
|
4522
|
+
let result = convert_html(html, &options).unwrap();
|
|
4523
|
+
|
|
4524
|
+
assert!(result.contains("<thead>"), "Should preserve nested thead");
|
|
4525
|
+
assert!(result.contains("<tbody>"), "Should preserve nested tbody");
|
|
4526
|
+
assert!(result.contains("<th>"), "Should preserve th tag");
|
|
4527
|
+
assert!(result.contains("Header"), "Should preserve text content");
|
|
4528
|
+
}
|
|
4529
|
+
|
|
4530
|
+
#[test]
|
|
4531
|
+
fn test_preserve_tags_empty_list() {
|
|
4532
|
+
let html = r#"<table><tr><td>Cell</td></tr></table>"#;
|
|
4533
|
+
let options = ConversionOptions::default(); // No preserve_tags
|
|
4534
|
+
let result = convert_html(html, &options).unwrap();
|
|
4535
|
+
|
|
4536
|
+
// Should convert to markdown table (or at least not preserve HTML)
|
|
4537
|
+
assert!(
|
|
4538
|
+
!result.contains("<table>"),
|
|
4539
|
+
"Should not preserve table without preserve_tags"
|
|
4540
|
+
);
|
|
4541
|
+
}
|
|
4542
|
+
|
|
4543
|
+
#[test]
|
|
4544
|
+
fn test_preserve_tags_vs_strip_tags() {
|
|
4545
|
+
let html = r#"<table><tr><td>Table</td></tr></table><div><span>Text</span></div>"#;
|
|
4546
|
+
let mut options = ConversionOptions::default();
|
|
4547
|
+
options.preserve_tags = vec!["table".to_string()];
|
|
4548
|
+
options.strip_tags = vec!["span".to_string()];
|
|
4549
|
+
let result = convert_html(html, &options).unwrap();
|
|
4550
|
+
|
|
4551
|
+
assert!(result.contains("<table>"), "Should preserve table");
|
|
4552
|
+
assert!(!result.contains("<span>"), "Should strip span tag");
|
|
4553
|
+
assert!(result.contains("Text"), "Should keep span text content");
|
|
4554
|
+
}
|
|
4228
4555
|
}
|
{html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/hocr/converter.rs
RENAMED
|
@@ -237,9 +237,22 @@ fn convert_element(
|
|
|
237
237
|
|
|
238
238
|
// Words - join with space
|
|
239
239
|
HocrElementType::OcrxWord => {
|
|
240
|
+
// Ensure space before this word if output doesn't end with whitespace or markdown formatting
|
|
241
|
+
if !output.is_empty()
|
|
242
|
+
&& !output.ends_with(' ')
|
|
243
|
+
&& !output.ends_with('\t')
|
|
244
|
+
&& !output.ends_with('\n')
|
|
245
|
+
&& !output.ends_with('*') // Don't add space after italic/bold markers
|
|
246
|
+
&& !output.ends_with('`') // Don't add space after code markers
|
|
247
|
+
&& !output.ends_with('_') // Don't add space after underline markers
|
|
248
|
+
&& !output.ends_with('[')
|
|
249
|
+
// Don't add space after opening bracket (link/image alt)
|
|
250
|
+
{
|
|
251
|
+
output.push(' ');
|
|
252
|
+
}
|
|
253
|
+
|
|
240
254
|
if !element.text.is_empty() {
|
|
241
255
|
output.push_str(&element.text);
|
|
242
|
-
output.push(' ');
|
|
243
256
|
}
|
|
244
257
|
}
|
|
245
258
|
|