html-to-markdown 2.4.1__tar.gz → 2.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

Files changed (55)
  1. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/Cargo.lock +31 -34
  2. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/Cargo.toml +2 -2
  3. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/PKG-INFO +7 -4
  4. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/README_PYPI.md +6 -3
  5. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/README.md +35 -0
  6. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/converter.rs +327 -0
  7. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/hocr/converter.rs +14 -1
  8. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/hocr/extractor.rs +39 -50
  9. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/lib.rs +2 -2
  10. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/options.rs +6 -1
  11. html_to_markdown-2.5.0/crates/html-to-markdown/src/sanitizer.rs +284 -0
  12. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown-py/Cargo.toml +1 -0
  13. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown-py/README.md +19 -0
  14. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown-py/src/lib.rs +104 -89
  15. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/html_to_markdown/__init__.py +1 -1
  16. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/html_to_markdown/api.py +1 -0
  17. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/html_to_markdown/bin/html-to-markdown +0 -0
  18. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/html_to_markdown/options.py +5 -2
  19. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/pyproject.toml +1 -1
  20. html_to_markdown-2.4.1/crates/html-to-markdown/src/sanitizer.rs +0 -85
  21. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/LICENSE +0 -0
  22. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/Cargo.toml +0 -0
  23. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/benches/conversion_benchmark.rs +0 -0
  24. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/benches/micro_benchmark.rs +0 -0
  25. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/benches/profiling_benchmark.rs +0 -0
  26. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/examples/basic.rs +0 -0
  27. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/examples/table.rs +0 -0
  28. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/examples/test_escape.rs +0 -0
  29. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/examples/test_inline_formatting.rs +0 -0
  30. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/examples/test_lists.rs +0 -0
  31. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/examples/test_semantic_tags.rs +0 -0
  32. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/examples/test_tables.rs +0 -0
  33. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/examples/test_task_lists.rs +0 -0
  34. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/examples/test_whitespace.rs +0 -0
  35. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/error.rs +0 -0
  36. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/hocr/mod.rs +0 -0
  37. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/hocr/parser.rs +0 -0
  38. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/hocr/spatial.rs +0 -0
  39. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/hocr/types.rs +0 -0
  40. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/inline_images.rs +0 -0
  41. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/text.rs +0 -0
  42. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/wrapper.rs +0 -0
  43. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/tests/commonmark_compliance_test.rs +0 -0
  44. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/tests/hocr_compliance_test.rs +0 -0
  45. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown/tests/integration_test.rs +0 -0
  46. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown-py/python/html_to_markdown/__init__.py +0 -0
  47. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown-py/python/html_to_markdown/_html_to_markdown.pyi +0 -0
  48. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/crates/html-to-markdown-py/uv.lock +0 -0
  49. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/html_to_markdown/__main__.py +0 -0
  50. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/html_to_markdown/_rust.pyi +0 -0
  51. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/html_to_markdown/cli.py +0 -0
  52. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/html_to_markdown/cli_proxy.py +0 -0
  53. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/html_to_markdown/exceptions.py +0 -0
  54. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/html_to_markdown/py.typed +0 -0
  55. {html_to_markdown-2.4.1 → html_to_markdown-2.5.0}/html_to_markdown/v1_compat.py +0 -0
@@ -157,9 +157,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
157
157
 
158
158
  [[package]]
159
159
  name = "cc"
160
- version = "1.2.41"
160
+ version = "1.2.42"
161
161
  source = "registry+https://github.com/rust-lang/crates.io-index"
162
- checksum = "ac9fe6cdbb24b6ade63616c0a0688e45bb56732262c158df3c0c4bea4ca47cb7"
162
+ checksum = "81bbf3b3619004ad9bd139f62a9ab5cfe467f307455a0d307b0cf58bf070feaa"
163
163
  dependencies = [
164
164
  "find-msvc-tools",
165
165
  "shlex",
@@ -249,9 +249,9 @@ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d"
249
249
 
250
250
  [[package]]
251
251
  name = "clap_mangen"
252
- version = "0.2.30"
252
+ version = "0.2.31"
253
253
  source = "registry+https://github.com/rust-lang/crates.io-index"
254
- checksum = "263c8214a8e0cb8129f3c62036c50e9c6e15c7bd364c42e0437c492b9293f778"
254
+ checksum = "439ea63a92086df93893164221ad4f24142086d535b3a0957b9b9bea2dc86301"
255
255
  dependencies = [
256
256
  "clap",
257
257
  "roff",
@@ -386,9 +386,9 @@ dependencies = [
386
386
 
387
387
  [[package]]
388
388
  name = "ctor"
389
- version = "0.5.0"
389
+ version = "0.6.0"
390
390
  source = "registry+https://github.com/rust-lang/crates.io-index"
391
- checksum = "67773048316103656a637612c4a62477603b777d91d9c62ff2290f9cde178fdb"
391
+ checksum = "59c9b8bdf64ee849747c1b12eb861d21aa47fa161564f48332f1afe2373bf899"
392
392
  dependencies = [
393
393
  "ctor-proc-macro",
394
394
  "dtor",
@@ -396,9 +396,9 @@ dependencies = [
396
396
 
397
397
  [[package]]
398
398
  name = "ctor-proc-macro"
399
- version = "0.0.6"
399
+ version = "0.0.7"
400
400
  source = "registry+https://github.com/rust-lang/crates.io-index"
401
- checksum = "e2931af7e13dc045d8e9d26afccc6fa115d64e115c9c84b1166288b46f6782c2"
401
+ checksum = "52560adf09603e58c9a7ee1fe1dcb95a16927b17c127f0ac02d6e768a0e25bc1"
402
402
 
403
403
  [[package]]
404
404
  name = "cty"
@@ -425,9 +425,9 @@ dependencies = [
425
425
 
426
426
  [[package]]
427
427
  name = "doc-comment"
428
- version = "0.3.3"
428
+ version = "0.3.4"
429
429
  source = "registry+https://github.com/rust-lang/crates.io-index"
430
- checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10"
430
+ checksum = "780955b8b195a21ab8e4ac6b60dd1dbdcec1dc6c51c0617964b08c81785e12c9"
431
431
 
432
432
  [[package]]
433
433
  name = "dtoa"
@@ -507,9 +507,9 @@ checksum = "52051878f80a721bb68ebfbc930e07b65ba72f2da88968ea5c06fd6ca3d3a127"
507
507
 
508
508
  [[package]]
509
509
  name = "flate2"
510
- version = "1.1.4"
510
+ version = "1.1.5"
511
511
  source = "registry+https://github.com/rust-lang/crates.io-index"
512
- checksum = "dc5a4e564e38c699f2880d3fda590bedc2e69f3f84cd48b457bd892ce61d0aa9"
512
+ checksum = "bfe33edd8e85a12a67454e37f8c75e730830d83e313556ab9ebf9ee7fbeb3bfb"
513
513
  dependencies = [
514
514
  "crc32fast",
515
515
  "miniz_oxide",
@@ -550,11 +550,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
550
550
  checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
551
551
  dependencies = [
552
552
  "cfg-if",
553
- "js-sys",
554
553
  "libc",
555
554
  "r-efi",
556
555
  "wasip2",
557
- "wasm-bindgen",
558
556
  ]
559
557
 
560
558
  [[package]]
@@ -595,7 +593,7 @@ dependencies = [
595
593
 
596
594
  [[package]]
597
595
  name = "html-to-markdown-cli"
598
- version = "2.4.1"
596
+ version = "2.5.0"
599
597
  dependencies = [
600
598
  "assert_cmd",
601
599
  "clap",
@@ -609,7 +607,7 @@ dependencies = [
609
607
 
610
608
  [[package]]
611
609
  name = "html-to-markdown-node"
612
- version = "2.4.1"
610
+ version = "2.5.0"
613
611
  dependencies = [
614
612
  "html-to-markdown-rs",
615
613
  "mimalloc-rust",
@@ -620,7 +618,7 @@ dependencies = [
620
618
 
621
619
  [[package]]
622
620
  name = "html-to-markdown-py"
623
- version = "2.4.1"
621
+ version = "2.5.0"
624
622
  dependencies = [
625
623
  "base64",
626
624
  "html-to-markdown-rs",
@@ -630,7 +628,7 @@ dependencies = [
630
628
 
631
629
  [[package]]
632
630
  name = "html-to-markdown-rs"
633
- version = "2.4.1"
631
+ version = "2.5.0"
634
632
  dependencies = [
635
633
  "ammonia",
636
634
  "base64",
@@ -647,10 +645,9 @@ dependencies = [
647
645
 
648
646
  [[package]]
649
647
  name = "html-to-markdown-wasm"
650
- version = "2.4.1"
648
+ version = "2.5.0"
651
649
  dependencies = [
652
650
  "console_error_panic_hook",
653
- "getrandom",
654
651
  "html-to-markdown-rs",
655
652
  "js-sys",
656
653
  "serde",
@@ -989,9 +986,9 @@ dependencies = [
989
986
 
990
987
  [[package]]
991
988
  name = "napi"
992
- version = "3.3.0"
989
+ version = "3.4.0"
993
990
  source = "registry+https://github.com/rust-lang/crates.io-index"
994
- checksum = "f1b74e3dce5230795bb4d2821b941706dee733c7308752507254b0497f39cad7"
991
+ checksum = "c3a1135cfe16ca43ac82ac05858554fc39c037d8e4592f2b4a83d7ef8e822f43"
995
992
  dependencies = [
996
993
  "bitflags",
997
994
  "ctor",
@@ -1003,15 +1000,15 @@ dependencies = [
1003
1000
 
1004
1001
  [[package]]
1005
1002
  name = "napi-build"
1006
- version = "2.2.3"
1003
+ version = "2.2.4"
1007
1004
  source = "registry+https://github.com/rust-lang/crates.io-index"
1008
- checksum = "dcae8ad5609d14afb3a3b91dee88c757016261b151e9dcecabf1b2a31a6cab14"
1005
+ checksum = "3ae82775d1b06f3f07efd0666e59bbc175da8383bc372051031d7a447e94fbea"
1009
1006
 
1010
1007
  [[package]]
1011
1008
  name = "napi-derive"
1012
- version = "3.2.5"
1009
+ version = "3.3.0"
1013
1010
  source = "registry+https://github.com/rust-lang/crates.io-index"
1014
- checksum = "7552d5a579b834614bbd496db5109f1b9f1c758f08224b0dee1e408333adf0d0"
1011
+ checksum = "78665d6bdf10e9a4e6b38123efb0f66962e6197c1aea2f07cff3f159a374696d"
1015
1012
  dependencies = [
1016
1013
  "convert_case",
1017
1014
  "ctor",
@@ -1023,9 +1020,9 @@ dependencies = [
1023
1020
 
1024
1021
  [[package]]
1025
1022
  name = "napi-derive-backend"
1026
- version = "2.2.0"
1023
+ version = "3.0.0"
1027
1024
  source = "registry+https://github.com/rust-lang/crates.io-index"
1028
- checksum = "5f6a81ac7486b70f2532a289603340862c06eea5a1e650c1ffeda2ce1238516a"
1025
+ checksum = "42d55d01423e7264de3acc13b258fa48ca7cf38a4d25db848908ec3c1304a85a"
1029
1026
  dependencies = [
1030
1027
  "convert_case",
1031
1028
  "proc-macro2",
@@ -1036,9 +1033,9 @@ dependencies = [
1036
1033
 
1037
1034
  [[package]]
1038
1035
  name = "napi-sys"
1039
- version = "3.0.0"
1036
+ version = "3.0.1"
1040
1037
  source = "registry+https://github.com/rust-lang/crates.io-index"
1041
- checksum = "3e4e7135a8f97aa0f1509cce21a8a1f9dcec1b50d8dee006b48a5adb69a9d64d"
1038
+ checksum = "1ed8f0e23a62a3ce0fbb6527cdc056e9282ddd9916b068c46f8923e18eed5ee6"
1042
1039
  dependencies = [
1043
1040
  "libloading",
1044
1041
  ]
@@ -1263,9 +1260,9 @@ dependencies = [
1263
1260
 
1264
1261
  [[package]]
1265
1262
  name = "proc-macro2"
1266
- version = "1.0.101"
1263
+ version = "1.0.103"
1267
1264
  source = "registry+https://github.com/rust-lang/crates.io-index"
1268
- checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de"
1265
+ checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8"
1269
1266
  dependencies = [
1270
1267
  "unicode-ident",
1271
1268
  ]
@@ -1609,9 +1606,9 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
1609
1606
 
1610
1607
  [[package]]
1611
1608
  name = "syn"
1612
- version = "2.0.107"
1609
+ version = "2.0.108"
1613
1610
  source = "registry+https://github.com/rust-lang/crates.io-index"
1614
- checksum = "2a26dbd934e5451d21ef060c018dae56fc073894c5a7896f882928a76e6d081b"
1611
+ checksum = "da58917d35242480a05c2897064da0a80589a2a0476c9a3f2fdc83b53502e917"
1615
1612
  dependencies = [
1616
1613
  "proc-macro2",
1617
1614
  "quote",
@@ -3,7 +3,7 @@ resolver = "2"
3
3
  members = ["crates/html-to-markdown-py"]
4
4
 
5
5
  [workspace.package]
6
- version = "2.4.1"
6
+ version = "2.5.0"
7
7
  edition = "2021"
8
8
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
9
9
  license = "MIT"
@@ -15,7 +15,7 @@ rust-version = "1.80"
15
15
 
16
16
  [workspace.dependencies]
17
17
  # Core library
18
- html-to-markdown-rs = { version = "2.4.1", path = "crates/html-to-markdown" }
18
+ html-to-markdown-rs = { version = "2.5.0", path = "crates/html-to-markdown" }
19
19
 
20
20
  # HTML parsing and sanitization
21
21
  tl = "0.7"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: html-to-markdown
3
- Version: 2.4.1
3
+ Version: 2.5.0
4
4
  Classifier: Development Status :: 5 - Production/Stable
5
5
  Classifier: Environment :: Console
6
6
  Classifier: Intended Audience :: Developers
@@ -162,9 +162,12 @@ Key fields (see docstring for full matrix):
162
162
 
163
163
  ### `PreprocessingOptions`
164
164
 
165
- - `enabled`: enable HTML sanitisation
166
- - `preset`: `"minimal" | "standard" | "aggressive"`
167
- - `remove_navigation`, `remove_forms`
165
+ - `enabled`: enable HTML sanitisation (default: `True` since v2.4.2 for robust malformed HTML handling)
166
+ - `preset`: `"minimal" | "standard" | "aggressive"` (default: `"standard"`)
167
+ - `remove_navigation`: remove navigation elements (default: `True`)
168
+ - `remove_forms`: remove form elements (default: `True`)
169
+
170
+ **Note:** As of v2.4.2, preprocessing is enabled by default to ensure robust handling of malformed HTML (e.g., bare angle brackets like `1<2` in content). Set `enabled=False` if you need minimal preprocessing.
168
171
 
169
172
  ### `InlineImageConfig`
170
173
 
@@ -129,9 +129,12 @@ Key fields (see docstring for full matrix):
129
129
 
130
130
  ### `PreprocessingOptions`
131
131
 
132
- - `enabled`: enable HTML sanitisation
133
- - `preset`: `"minimal" | "standard" | "aggressive"`
134
- - `remove_navigation`, `remove_forms`
132
+ - `enabled`: enable HTML sanitisation (default: `True` since v2.4.2 for robust malformed HTML handling)
133
+ - `preset`: `"minimal" | "standard" | "aggressive"` (default: `"standard"`)
134
+ - `remove_navigation`: remove navigation elements (default: `True`)
135
+ - `remove_forms`: remove form elements (default: `True`)
136
+
137
+ **Note:** As of v2.4.2, preprocessing is enabled by default to ensure robust handling of malformed HTML (e.g., bare angle brackets like `1<2` in content). Set `enabled=False` if you need minimal preprocessing.
135
138
 
136
139
  ### `InlineImageConfig`
137
140
 
@@ -60,6 +60,41 @@ let options = ConversionOptions {
60
60
  let markdown = convert(html, Some(options))?;
61
61
  ```
62
62
 
63
+ ### Preserving HTML Tags
64
+
65
+ The `preserve_tags` option allows you to keep specific HTML tags in their original form instead of converting them to Markdown. This is useful for complex elements like tables that may not convert well:
66
+
67
+ ```rust
68
+ use html_to_markdown_rs::{convert, ConversionOptions};
69
+
70
+ let html = r#"
71
+ <p>Before table</p>
72
+ <table class="data">
73
+ <tr><th>Name</th><th>Value</th></tr>
74
+ <tr><td>Item 1</td><td>100</td></tr>
75
+ </table>
76
+ <p>After table</p>
77
+ "#;
78
+
79
+ let options = ConversionOptions {
80
+ preserve_tags: vec!["table".to_string()],
81
+ ..Default::default()
82
+ };
83
+
84
+ let markdown = convert(html, Some(options))?;
85
+ // Result: "Before table\n\n<table class=\"data\">...</table>\n\nAfter table\n"
86
+ ```
87
+
88
+ You can preserve multiple tag types and combine with `strip_tags`:
89
+
90
+ ```rust
91
+ let options = ConversionOptions {
92
+ preserve_tags: vec!["table".to_string(), "form".to_string()],
93
+ strip_tags: vec!["script".to_string(), "style".to_string()],
94
+ ..Default::default()
95
+ };
96
+ ```
97
+
63
98
  ## Web Scraping with Preprocessing
64
99
 
65
100
  ```rust
@@ -974,6 +974,9 @@ fn convert_html_impl(
974
974
  .replace("<hr/>", "<hr>")
975
975
  .replace("<img/>", "<img>");
976
976
 
977
+ // Escape malformed angle brackets in text content to prevent parser failures
978
+ let html = escape_malformed_angle_brackets(&html);
979
+
977
980
  let html = strip_script_and_style_sections(&html);
978
981
 
979
982
  let dom = tl::parse(html.as_ref(), tl::ParserOptions::default())
@@ -1080,6 +1083,151 @@ fn convert_html_impl(
1080
1083
  }
1081
1084
  }
1082
1085
 
1086
+ /// Escape malformed angle brackets in HTML that are not part of valid tags.
1087
+ ///
1088
+ /// This function ensures robust parsing by escaping bare `<` and `>` characters
1089
+ /// that appear in text content and are not part of HTML tags. This prevents
1090
+ /// parser failures on malformed HTML like "1<2" or comparisons in text.
1091
+ ///
1092
+ /// # Examples
1093
+ ///
1094
+ /// - `1<2` becomes `1&lt;2`
1095
+ /// - `<div>1<2</div>` becomes `<div>1&lt;2</div>`
1096
+ /// - `<script>1 < 2</script>` remains unchanged (handled by script stripping)
1097
+ fn escape_malformed_angle_brackets(input: &str) -> Cow<'_, str> {
1098
+ let bytes = input.as_bytes();
1099
+ let len = bytes.len();
1100
+ let mut idx = 0;
1101
+ let mut last = 0;
1102
+ let mut output: Option<String> = None;
1103
+
1104
+ while idx < len {
1105
+ if bytes[idx] == b'<' {
1106
+ // Check if this is a valid tag start
1107
+ if idx + 1 < len {
1108
+ let next = bytes[idx + 1];
1109
+
1110
+ // Valid tag patterns: <tagname, </tagname, <!doctype, <!--
1111
+ let is_valid_tag = match next {
1112
+ b'!' => {
1113
+ // DOCTYPE or comment
1114
+ idx + 2 < len
1115
+ && (bytes[idx + 2] == b'-'
1116
+ || bytes[idx + 2].is_ascii_alphabetic()
1117
+ || bytes[idx + 2].is_ascii_uppercase())
1118
+ }
1119
+ b'/' => {
1120
+ // Closing tag
1121
+ idx + 2 < len && (bytes[idx + 2].is_ascii_alphabetic() || bytes[idx + 2].is_ascii_uppercase())
1122
+ }
1123
+ b'?' => {
1124
+ // XML declaration
1125
+ true
1126
+ }
1127
+ c if c.is_ascii_alphabetic() || c.is_ascii_uppercase() => {
1128
+ // Opening tag
1129
+ true
1130
+ }
1131
+ _ => false,
1132
+ };
1133
+
1134
+ if !is_valid_tag {
1135
+ // This is a bare `<` that should be escaped
1136
+ let out = output.get_or_insert_with(|| String::with_capacity(input.len() + 4));
1137
+ out.push_str(&input[last..idx]);
1138
+ out.push_str("&lt;");
1139
+ last = idx + 1;
1140
+ }
1141
+ } else {
1142
+ // `<` at end of string - escape it
1143
+ let out = output.get_or_insert_with(|| String::with_capacity(input.len() + 4));
1144
+ out.push_str(&input[last..idx]);
1145
+ out.push_str("&lt;");
1146
+ last = idx + 1;
1147
+ }
1148
+ }
1149
+ idx += 1;
1150
+ }
1151
+
1152
+ if let Some(mut out) = output {
1153
+ if last < input.len() {
1154
+ out.push_str(&input[last..]);
1155
+ }
1156
+ Cow::Owned(out)
1157
+ } else {
1158
+ Cow::Borrowed(input)
1159
+ }
1160
+ }
1161
+
1162
+ /// Serialize a tag and its children back to HTML.
1163
+ ///
1164
+ /// This is used for the preserve_tags feature to output original HTML for specific elements.
1165
+ fn serialize_tag_to_html(handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
1166
+ let mut html = String::new();
1167
+ serialize_node_to_html(handle, parser, &mut html);
1168
+ html
1169
+ }
1170
+
1171
+ /// Recursively serialize a node to HTML.
1172
+ fn serialize_node_to_html(handle: &tl::NodeHandle, parser: &tl::Parser, output: &mut String) {
1173
+ match handle.get(parser) {
1174
+ Some(tl::Node::Tag(tag)) => {
1175
+ let tag_name = tag.name().as_utf8_str();
1176
+
1177
+ // Opening tag
1178
+ output.push('<');
1179
+ output.push_str(&tag_name);
1180
+
1181
+ // Attributes
1182
+ for (key, value) in tag.attributes().iter() {
1183
+ output.push(' ');
1184
+ output.push_str(&key);
1185
+ if let Some(val) = value {
1186
+ output.push_str("=\"");
1187
+ output.push_str(&val);
1188
+ output.push('"');
1189
+ }
1190
+ }
1191
+
1192
+ output.push('>');
1193
+
1194
+ // Children
1195
+ let children = tag.children();
1196
+ for child_handle in children.top().iter() {
1197
+ serialize_node_to_html(child_handle, parser, output);
1198
+ }
1199
+
1200
+ // Closing tag (skip for self-closing tags)
1201
+ if !matches!(
1202
+ tag_name.as_ref(),
1203
+ "br" | "hr"
1204
+ | "img"
1205
+ | "input"
1206
+ | "meta"
1207
+ | "link"
1208
+ | "area"
1209
+ | "base"
1210
+ | "col"
1211
+ | "embed"
1212
+ | "param"
1213
+ | "source"
1214
+ | "track"
1215
+ | "wbr"
1216
+ ) {
1217
+ output.push_str("</");
1218
+ output.push_str(&tag_name);
1219
+ output.push('>');
1220
+ }
1221
+ }
1222
+ Some(tl::Node::Raw(bytes)) => {
1223
+ if let Ok(text) = std::str::from_utf8(bytes.as_bytes()) {
1224
+ output.push_str(text);
1225
+ }
1226
+ }
1227
+ _ => {}
1228
+ }
1229
+ }
1230
+
1083
1231
  fn strip_script_and_style_sections(input: &str) -> Cow<'_, str> {
1084
1232
  const TAGS: [&[u8]; 2] = [b"script", b"style"];
1085
1233
  const SVG: &[u8] = b"svg";
@@ -1478,6 +1626,13 @@ fn walk_node(
1478
1626
  return;
1479
1627
  }
1480
1628
 
1629
+ // Preserve tags: output original HTML
1630
+ if options.preserve_tags.iter().any(|t| t.as_str() == tag_name) {
1631
+ let html = serialize_tag_to_html(node_handle, parser);
1632
+ output.push_str(&html);
1633
+ return;
1634
+ }
1635
+
1481
1636
  match tag_name.as_ref() {
1482
1637
  "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
1483
1638
  let level = tag_name.chars().last().and_then(|c| c.to_digit(10)).unwrap_or(1) as usize;
@@ -4225,4 +4380,176 @@ mod tests {
4225
4380
  add_list_continuation_indent(&mut output, 1, false, &opts);
4226
4381
  assert_eq!(output, "* First\n ");
4227
4382
  }
4383
+
4384
+ #[test]
4385
+ fn test_escape_malformed_angle_brackets_bare() {
4386
+ let input = "1<2";
4387
+ let escaped = escape_malformed_angle_brackets(input);
4388
+ assert_eq!(escaped, "1&lt;2");
4389
+ }
4390
+
4391
+ #[test]
4392
+ fn test_escape_malformed_angle_brackets_in_text() {
4393
+ let input = "<html>1<2 Content</html>";
4394
+ let escaped = escape_malformed_angle_brackets(input);
4395
+ assert_eq!(escaped, "<html>1&lt;2 Content</html>");
4396
+ }
4397
+
4398
+ #[test]
4399
+ fn test_escape_malformed_angle_brackets_multiple() {
4400
+ let input = "1 < 2 < 3";
4401
+ let escaped = escape_malformed_angle_brackets(input);
4402
+ assert_eq!(escaped, "1 &lt; 2 &lt; 3");
4403
+ }
4404
+
4405
+ #[test]
4406
+ fn test_escape_malformed_angle_brackets_preserves_valid_tags() {
4407
+ let input = "<div>content</div>";
4408
+ let escaped = escape_malformed_angle_brackets(input);
4409
+ assert_eq!(escaped, "<div>content</div>");
4410
+ }
4411
+
4412
+ #[test]
4413
+ fn test_escape_malformed_angle_brackets_mixed() {
4414
+ let input = "<div>1<2</div><p>3<4</p>";
4415
+ let escaped = escape_malformed_angle_brackets(input);
4416
+ assert_eq!(escaped, "<div>1&lt;2</div><p>3&lt;4</p>");
4417
+ }
4418
+
4419
+ #[test]
4420
+ fn test_escape_malformed_angle_brackets_at_end() {
4421
+ let input = "test<";
4422
+ let escaped = escape_malformed_angle_brackets(input);
4423
+ assert_eq!(escaped, "test&lt;");
4424
+ }
4425
+
4426
+ #[test]
4427
+ fn test_escape_malformed_angle_brackets_preserves_comments() {
4428
+ let input = "<!-- comment -->1<2";
4429
+ let escaped = escape_malformed_angle_brackets(input);
4430
+ assert_eq!(escaped, "<!-- comment -->1&lt;2");
4431
+ }
4432
+
4433
+ #[test]
4434
+ fn test_escape_malformed_angle_brackets_preserves_doctype() {
4435
+ let input = "<!DOCTYPE html>1<2";
4436
+ let escaped = escape_malformed_angle_brackets(input);
4437
+ assert_eq!(escaped, "<!DOCTYPE html>1&lt;2");
4438
+ }
4439
+
4440
+ #[test]
4441
+ fn test_convert_with_malformed_angle_brackets() {
4442
+ // Test the full conversion pipeline (issue #94)
4443
+ let html = "<html>1<2\nContent</html>";
4444
+ let result = convert_html(html, &ConversionOptions::default()).unwrap();
4445
+ assert!(
4446
+ result.contains("Content"),
4447
+ "Result should contain 'Content': {:?}",
4448
+ result
4449
+ );
4450
+ assert!(
4451
+ result.contains("1<2") || result.contains("1&lt;2"),
4452
+ "Result should contain escaped or unescaped comparison"
4453
+ );
4454
+ }
4455
+
4456
+ #[test]
4457
+ fn test_convert_with_malformed_angle_brackets_in_div() {
4458
+ let html = "<html><div>1<2</div><div>Content</div></html>";
4459
+ let result = convert_html(html, &ConversionOptions::default()).unwrap();
4460
+ assert!(
4461
+ result.contains("Content"),
4462
+ "Result should contain 'Content': {:?}",
4463
+ result
4464
+ );
4465
+ }
4466
+
4467
+ #[test]
4468
+ fn test_convert_with_multiple_malformed_angle_brackets() {
4469
+ let html = "<html>1 < 2 < 3<p>Content</p></html>";
4470
+ let result = convert_html(html, &ConversionOptions::default()).unwrap();
4471
+ assert!(
4472
+ result.contains("Content"),
4473
+ "Result should contain 'Content': {:?}",
4474
+ result
4475
+ );
4476
+ }
4477
+
4478
+ #[test]
4479
+ fn test_preserve_tags_simple_table() {
4480
+ let html = r#"<div><table><tr><td>Cell 1</td><td>Cell 2</td></tr></table><p>Text</p></div>"#;
4481
+ let mut options = ConversionOptions::default();
4482
+ options.preserve_tags = vec!["table".to_string()];
4483
+ let result = convert_html(html, &options).unwrap();
4484
+
4485
+ assert!(result.contains("<table>"), "Should preserve table tag");
4486
+ assert!(result.contains("</table>"), "Should have closing table tag");
4487
+ assert!(result.contains("<tr>"), "Should preserve tr tag");
4488
+ assert!(result.contains("<td>"), "Should preserve td tag");
4489
+ assert!(result.contains("Text"), "Should convert other elements");
4490
+ }
4491
+
4492
+ #[test]
4493
+ fn test_preserve_tags_with_attributes() {
4494
+ let html = r#"<table class="data" id="mytable"><tr><td>Data</td></tr></table>"#;
4495
+ let mut options = ConversionOptions::default();
4496
+ options.preserve_tags = vec!["table".to_string()];
4497
+ let result = convert_html(html, &options).unwrap();
4498
+
4499
+ assert!(result.contains("<table"), "Should preserve table tag");
4500
+ assert!(result.contains("class="), "Should preserve class attribute");
4501
+ assert!(result.contains("id="), "Should preserve id attribute");
4502
+ assert!(result.contains("</table>"), "Should have closing tag");
4503
+ }
4504
+
4505
+ #[test]
4506
+ fn test_preserve_tags_multiple_tags() {
4507
+ let html = r#"<div><table><tr><td>Table</td></tr></table><form><input type="text"/></form><p>Text</p></div>"#;
4508
+ let mut options = ConversionOptions::default();
4509
+ options.preserve_tags = vec!["table".to_string(), "form".to_string()];
4510
+ let result = convert_html(html, &options).unwrap();
4511
+
4512
+ assert!(result.contains("<table>"), "Should preserve table");
4513
+ assert!(result.contains("<form>"), "Should preserve form");
4514
+ assert!(result.contains("Text"), "Should convert paragraph");
4515
+ }
4516
+
4517
+ #[test]
4518
+ fn test_preserve_tags_nested_content() {
4519
+ let html = r#"<table><thead><tr><th>Header</th></tr></thead><tbody><tr><td>Data</td></tr></tbody></table>"#;
4520
+ let mut options = ConversionOptions::default();
4521
+ options.preserve_tags = vec!["table".to_string()];
4522
+ let result = convert_html(html, &options).unwrap();
4523
+
4524
+ assert!(result.contains("<thead>"), "Should preserve nested thead");
4525
+ assert!(result.contains("<tbody>"), "Should preserve nested tbody");
4526
+ assert!(result.contains("<th>"), "Should preserve th tag");
4527
+ assert!(result.contains("Header"), "Should preserve text content");
4528
+ }
4529
+
4530
+ #[test]
4531
+ fn test_preserve_tags_empty_list() {
4532
+ let html = r#"<table><tr><td>Cell</td></tr></table>"#;
4533
+ let options = ConversionOptions::default(); // No preserve_tags
4534
+ let result = convert_html(html, &options).unwrap();
4535
+
4536
+ // Should convert to markdown table (or at least not preserve HTML)
4537
+ assert!(
4538
+ !result.contains("<table>"),
4539
+ "Should not preserve table without preserve_tags"
4540
+ );
4541
+ }
4542
+
4543
+ #[test]
4544
+ fn test_preserve_tags_vs_strip_tags() {
4545
+ let html = r#"<table><tr><td>Table</td></tr></table><div><span>Text</span></div>"#;
4546
+ let mut options = ConversionOptions::default();
4547
+ options.preserve_tags = vec!["table".to_string()];
4548
+ options.strip_tags = vec!["span".to_string()];
4549
+ let result = convert_html(html, &options).unwrap();
4550
+
4551
+ assert!(result.contains("<table>"), "Should preserve table");
4552
+ assert!(!result.contains("<span>"), "Should strip span tag");
4553
+ assert!(result.contains("Text"), "Should keep span text content");
4554
+ }
4228
4555
  }
@@ -237,9 +237,22 @@ fn convert_element(
237
237
 
238
238
  // Words - join with space
239
239
  HocrElementType::OcrxWord => {
240
+ // Ensure space before this word if output doesn't end with whitespace or markdown formatting
241
+ if !output.is_empty()
242
+ && !output.ends_with(' ')
243
+ && !output.ends_with('\t')
244
+ && !output.ends_with('\n')
245
+ && !output.ends_with('*') // Don't add space after italic/bold markers
246
+ && !output.ends_with('`') // Don't add space after code markers
247
+ && !output.ends_with('_') // Don't add space after underline markers
248
+ && !output.ends_with('[')
249
+ // Don't add space after opening bracket (link/image alt)
250
+ {
251
+ output.push(' ');
252
+ }
253
+
240
254
  if !element.text.is_empty() {
241
255
  output.push_str(&element.text);
242
- output.push(' ');
243
256
  }
244
257
  }
245
258