html-to-markdown 2.4.0__tar.gz → 2.4.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

Files changed (54) hide show
  1. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/Cargo.lock +27 -27
  2. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/Cargo.toml +2 -2
  3. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/PKG-INFO +7 -4
  4. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/README_PYPI.md +6 -3
  5. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/src/converter.rs +350 -1
  6. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/src/options.rs +1 -1
  7. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/html_to_markdown/__init__.py +1 -1
  8. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/html_to_markdown/bin/html-to-markdown +0 -0
  9. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/html_to_markdown/options.py +2 -2
  10. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/pyproject.toml +1 -1
  11. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/LICENSE +0 -0
  12. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/Cargo.toml +0 -0
  13. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/README.md +0 -0
  14. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/benches/conversion_benchmark.rs +0 -0
  15. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/benches/micro_benchmark.rs +0 -0
  16. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/benches/profiling_benchmark.rs +0 -0
  17. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/examples/basic.rs +0 -0
  18. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/examples/table.rs +0 -0
  19. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/examples/test_escape.rs +0 -0
  20. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/examples/test_inline_formatting.rs +0 -0
  21. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/examples/test_lists.rs +0 -0
  22. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/examples/test_semantic_tags.rs +0 -0
  23. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/examples/test_tables.rs +0 -0
  24. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/examples/test_task_lists.rs +0 -0
  25. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/examples/test_whitespace.rs +0 -0
  26. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/src/error.rs +0 -0
  27. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/src/hocr/converter.rs +0 -0
  28. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/src/hocr/extractor.rs +0 -0
  29. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/src/hocr/mod.rs +0 -0
  30. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/src/hocr/parser.rs +0 -0
  31. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/src/hocr/spatial.rs +0 -0
  32. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/src/hocr/types.rs +0 -0
  33. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/src/inline_images.rs +0 -0
  34. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/src/lib.rs +0 -0
  35. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/src/sanitizer.rs +0 -0
  36. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/src/text.rs +0 -0
  37. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/src/wrapper.rs +0 -0
  38. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/tests/commonmark_compliance_test.rs +0 -0
  39. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/tests/hocr_compliance_test.rs +0 -0
  40. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/tests/integration_test.rs +0 -0
  41. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown-py/Cargo.toml +0 -0
  42. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown-py/README.md +0 -0
  43. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown-py/python/html_to_markdown/__init__.py +0 -0
  44. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown-py/python/html_to_markdown/_html_to_markdown.pyi +0 -0
  45. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown-py/src/lib.rs +0 -0
  46. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown-py/uv.lock +0 -0
  47. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/html_to_markdown/__main__.py +0 -0
  48. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/html_to_markdown/_rust.pyi +0 -0
  49. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/html_to_markdown/api.py +0 -0
  50. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/html_to_markdown/cli.py +0 -0
  51. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/html_to_markdown/cli_proxy.py +0 -0
  52. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/html_to_markdown/exceptions.py +0 -0
  53. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/html_to_markdown/py.typed +0 -0
  54. {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/html_to_markdown/v1_compat.py +0 -0
@@ -249,9 +249,9 @@ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d"
249
249
 
250
250
  [[package]]
251
251
  name = "clap_mangen"
252
- version = "0.2.30"
252
+ version = "0.2.31"
253
253
  source = "registry+https://github.com/rust-lang/crates.io-index"
254
- checksum = "263c8214a8e0cb8129f3c62036c50e9c6e15c7bd364c42e0437c492b9293f778"
254
+ checksum = "439ea63a92086df93893164221ad4f24142086d535b3a0957b9b9bea2dc86301"
255
255
  dependencies = [
256
256
  "clap",
257
257
  "roff",
@@ -386,9 +386,9 @@ dependencies = [
386
386
 
387
387
  [[package]]
388
388
  name = "ctor"
389
- version = "0.5.0"
389
+ version = "0.6.0"
390
390
  source = "registry+https://github.com/rust-lang/crates.io-index"
391
- checksum = "67773048316103656a637612c4a62477603b777d91d9c62ff2290f9cde178fdb"
391
+ checksum = "59c9b8bdf64ee849747c1b12eb861d21aa47fa161564f48332f1afe2373bf899"
392
392
  dependencies = [
393
393
  "ctor-proc-macro",
394
394
  "dtor",
@@ -396,9 +396,9 @@ dependencies = [
396
396
 
397
397
  [[package]]
398
398
  name = "ctor-proc-macro"
399
- version = "0.0.6"
399
+ version = "0.0.7"
400
400
  source = "registry+https://github.com/rust-lang/crates.io-index"
401
- checksum = "e2931af7e13dc045d8e9d26afccc6fa115d64e115c9c84b1166288b46f6782c2"
401
+ checksum = "52560adf09603e58c9a7ee1fe1dcb95a16927b17c127f0ac02d6e768a0e25bc1"
402
402
 
403
403
  [[package]]
404
404
  name = "cty"
@@ -507,9 +507,9 @@ checksum = "52051878f80a721bb68ebfbc930e07b65ba72f2da88968ea5c06fd6ca3d3a127"
507
507
 
508
508
  [[package]]
509
509
  name = "flate2"
510
- version = "1.1.4"
510
+ version = "1.1.5"
511
511
  source = "registry+https://github.com/rust-lang/crates.io-index"
512
- checksum = "dc5a4e564e38c699f2880d3fda590bedc2e69f3f84cd48b457bd892ce61d0aa9"
512
+ checksum = "bfe33edd8e85a12a67454e37f8c75e730830d83e313556ab9ebf9ee7fbeb3bfb"
513
513
  dependencies = [
514
514
  "crc32fast",
515
515
  "miniz_oxide",
@@ -595,7 +595,7 @@ dependencies = [
595
595
 
596
596
  [[package]]
597
597
  name = "html-to-markdown-cli"
598
- version = "2.4.0"
598
+ version = "2.4.2"
599
599
  dependencies = [
600
600
  "assert_cmd",
601
601
  "clap",
@@ -609,7 +609,7 @@ dependencies = [
609
609
 
610
610
  [[package]]
611
611
  name = "html-to-markdown-node"
612
- version = "2.4.0"
612
+ version = "2.4.2"
613
613
  dependencies = [
614
614
  "html-to-markdown-rs",
615
615
  "mimalloc-rust",
@@ -620,7 +620,7 @@ dependencies = [
620
620
 
621
621
  [[package]]
622
622
  name = "html-to-markdown-py"
623
- version = "2.4.0"
623
+ version = "2.4.2"
624
624
  dependencies = [
625
625
  "base64",
626
626
  "html-to-markdown-rs",
@@ -630,7 +630,7 @@ dependencies = [
630
630
 
631
631
  [[package]]
632
632
  name = "html-to-markdown-rs"
633
- version = "2.4.0"
633
+ version = "2.4.2"
634
634
  dependencies = [
635
635
  "ammonia",
636
636
  "base64",
@@ -647,7 +647,7 @@ dependencies = [
647
647
 
648
648
  [[package]]
649
649
  name = "html-to-markdown-wasm"
650
- version = "2.4.0"
650
+ version = "2.4.2"
651
651
  dependencies = [
652
652
  "console_error_panic_hook",
653
653
  "getrandom",
@@ -989,9 +989,9 @@ dependencies = [
989
989
 
990
990
  [[package]]
991
991
  name = "napi"
992
- version = "3.3.0"
992
+ version = "3.4.0"
993
993
  source = "registry+https://github.com/rust-lang/crates.io-index"
994
- checksum = "f1b74e3dce5230795bb4d2821b941706dee733c7308752507254b0497f39cad7"
994
+ checksum = "c3a1135cfe16ca43ac82ac05858554fc39c037d8e4592f2b4a83d7ef8e822f43"
995
995
  dependencies = [
996
996
  "bitflags",
997
997
  "ctor",
@@ -1003,15 +1003,15 @@ dependencies = [
1003
1003
 
1004
1004
  [[package]]
1005
1005
  name = "napi-build"
1006
- version = "2.2.3"
1006
+ version = "2.2.4"
1007
1007
  source = "registry+https://github.com/rust-lang/crates.io-index"
1008
- checksum = "dcae8ad5609d14afb3a3b91dee88c757016261b151e9dcecabf1b2a31a6cab14"
1008
+ checksum = "3ae82775d1b06f3f07efd0666e59bbc175da8383bc372051031d7a447e94fbea"
1009
1009
 
1010
1010
  [[package]]
1011
1011
  name = "napi-derive"
1012
- version = "3.2.5"
1012
+ version = "3.3.0"
1013
1013
  source = "registry+https://github.com/rust-lang/crates.io-index"
1014
- checksum = "7552d5a579b834614bbd496db5109f1b9f1c758f08224b0dee1e408333adf0d0"
1014
+ checksum = "78665d6bdf10e9a4e6b38123efb0f66962e6197c1aea2f07cff3f159a374696d"
1015
1015
  dependencies = [
1016
1016
  "convert_case",
1017
1017
  "ctor",
@@ -1023,9 +1023,9 @@ dependencies = [
1023
1023
 
1024
1024
  [[package]]
1025
1025
  name = "napi-derive-backend"
1026
- version = "2.2.0"
1026
+ version = "3.0.0"
1027
1027
  source = "registry+https://github.com/rust-lang/crates.io-index"
1028
- checksum = "5f6a81ac7486b70f2532a289603340862c06eea5a1e650c1ffeda2ce1238516a"
1028
+ checksum = "42d55d01423e7264de3acc13b258fa48ca7cf38a4d25db848908ec3c1304a85a"
1029
1029
  dependencies = [
1030
1030
  "convert_case",
1031
1031
  "proc-macro2",
@@ -1036,9 +1036,9 @@ dependencies = [
1036
1036
 
1037
1037
  [[package]]
1038
1038
  name = "napi-sys"
1039
- version = "3.0.0"
1039
+ version = "3.0.1"
1040
1040
  source = "registry+https://github.com/rust-lang/crates.io-index"
1041
- checksum = "3e4e7135a8f97aa0f1509cce21a8a1f9dcec1b50d8dee006b48a5adb69a9d64d"
1041
+ checksum = "1ed8f0e23a62a3ce0fbb6527cdc056e9282ddd9916b068c46f8923e18eed5ee6"
1042
1042
  dependencies = [
1043
1043
  "libloading",
1044
1044
  ]
@@ -1263,9 +1263,9 @@ dependencies = [
1263
1263
 
1264
1264
  [[package]]
1265
1265
  name = "proc-macro2"
1266
- version = "1.0.101"
1266
+ version = "1.0.103"
1267
1267
  source = "registry+https://github.com/rust-lang/crates.io-index"
1268
- checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de"
1268
+ checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8"
1269
1269
  dependencies = [
1270
1270
  "unicode-ident",
1271
1271
  ]
@@ -1609,9 +1609,9 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
1609
1609
 
1610
1610
  [[package]]
1611
1611
  name = "syn"
1612
- version = "2.0.107"
1612
+ version = "2.0.108"
1613
1613
  source = "registry+https://github.com/rust-lang/crates.io-index"
1614
- checksum = "2a26dbd934e5451d21ef060c018dae56fc073894c5a7896f882928a76e6d081b"
1614
+ checksum = "da58917d35242480a05c2897064da0a80589a2a0476c9a3f2fdc83b53502e917"
1615
1615
  dependencies = [
1616
1616
  "proc-macro2",
1617
1617
  "quote",
@@ -3,7 +3,7 @@ resolver = "2"
3
3
  members = ["crates/html-to-markdown-py"]
4
4
 
5
5
  [workspace.package]
6
- version = "2.4.0"
6
+ version = "2.4.2"
7
7
  edition = "2021"
8
8
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
9
9
  license = "MIT"
@@ -15,7 +15,7 @@ rust-version = "1.80"
15
15
 
16
16
  [workspace.dependencies]
17
17
  # Core library
18
- html-to-markdown-rs = { version = "2.4.0", path = "crates/html-to-markdown" }
18
+ html-to-markdown-rs = { version = "2.4.2", path = "crates/html-to-markdown" }
19
19
 
20
20
  # HTML parsing and sanitization
21
21
  tl = "0.7"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: html-to-markdown
3
- Version: 2.4.0
3
+ Version: 2.4.2
4
4
  Classifier: Development Status :: 5 - Production/Stable
5
5
  Classifier: Environment :: Console
6
6
  Classifier: Intended Audience :: Developers
@@ -162,9 +162,12 @@ Key fields (see docstring for full matrix):
162
162
 
163
163
  ### `PreprocessingOptions`
164
164
 
165
- - `enabled`: enable HTML sanitisation
166
- - `preset`: `"minimal" | "standard" | "aggressive"`
167
- - `remove_navigation`, `remove_forms`
165
+ - `enabled`: enable HTML sanitisation (default: `True` since v2.4.2 for robust malformed HTML handling)
166
+ - `preset`: `"minimal" | "standard" | "aggressive"` (default: `"standard"`)
167
+ - `remove_navigation`: remove navigation elements (default: `True`)
168
+ - `remove_forms`: remove form elements (default: `True`)
169
+
170
+ **Note:** As of v2.4.2, preprocessing is enabled by default to ensure robust handling of malformed HTML (e.g., bare angle brackets like `1<2` in content). Set `enabled=False` if you need minimal preprocessing.
168
171
 
169
172
  ### `InlineImageConfig`
170
173
 
@@ -129,9 +129,12 @@ Key fields (see docstring for full matrix):
129
129
 
130
130
  ### `PreprocessingOptions`
131
131
 
132
- - `enabled`: enable HTML sanitisation
133
- - `preset`: `"minimal" | "standard" | "aggressive"`
134
- - `remove_navigation`, `remove_forms`
132
+ - `enabled`: enable HTML sanitisation (default: `True` since v2.4.2 for robust malformed HTML handling)
133
+ - `preset`: `"minimal" | "standard" | "aggressive"` (default: `"standard"`)
134
+ - `remove_navigation`: remove navigation elements (default: `True`)
135
+ - `remove_forms`: remove form elements (default: `True`)
136
+
137
+ **Note:** As of v2.4.2, preprocessing is enabled by default to ensure robust handling of malformed HTML (e.g., bare angle brackets like `1<2` in content). Set `enabled=False` if you need minimal preprocessing.
135
138
 
136
139
  ### `InlineImageConfig`
137
140
 
@@ -47,6 +47,9 @@ use std::collections::BTreeMap;
47
47
  #[cfg(feature = "inline-images")]
48
48
  use std::rc::Rc;
49
49
 
50
+ use std::borrow::Cow;
51
+ use std::str;
52
+
50
53
  use crate::error::Result;
51
54
  #[cfg(feature = "inline-images")]
52
55
  use crate::inline_images::{InlineImageCollector, InlineImageFormat, InlineImageSource};
@@ -971,7 +974,12 @@ fn convert_html_impl(
971
974
  .replace("<hr/>", "<hr>")
972
975
  .replace("<img/>", "<img>");
973
976
 
974
- let dom = tl::parse(&html, tl::ParserOptions::default())
977
+ // Escape malformed angle brackets in text content to prevent parser failures
978
+ let html = escape_malformed_angle_brackets(&html);
979
+
980
+ let html = strip_script_and_style_sections(&html);
981
+
982
+ let dom = tl::parse(html.as_ref(), tl::ParserOptions::default())
975
983
  .map_err(|_| crate::error::ConversionError::ParseError("Failed to parse HTML".to_string()))?;
976
984
 
977
985
  let parser = dom.parser();
@@ -1075,6 +1083,237 @@ fn convert_html_impl(
1075
1083
  }
1076
1084
  }
1077
1085
 
1086
+ /// Escape malformed angle brackets in HTML that are not part of valid tags.
1087
+ ///
1088
+ /// This function ensures robust parsing by escaping bare `<` and `>` characters
1089
+ /// that appear in text content and are not part of HTML tags. This prevents
1090
+ /// parser failures on malformed HTML like "1<2" or comparisons in text.
1091
+ ///
1092
+ /// # Examples
1093
+ ///
1094
+ /// - `1<2` becomes `1&lt;2`
1095
+ /// - `<div>1<2</div>` becomes `<div>1&lt;2</div>`
1096
+ /// - `<script>1 < 2</script>` remains unchanged (handled by script stripping)
1097
+ fn escape_malformed_angle_brackets(input: &str) -> Cow<'_, str> {
1098
+ let bytes = input.as_bytes();
1099
+ let len = bytes.len();
1100
+ let mut idx = 0;
1101
+ let mut last = 0;
1102
+ let mut output: Option<String> = None;
1103
+
1104
+ while idx < len {
1105
+ if bytes[idx] == b'<' {
1106
+ // Check if this is a valid tag start
1107
+ if idx + 1 < len {
1108
+ let next = bytes[idx + 1];
1109
+
1110
+ // Valid tag patterns: <tagname, </tagname, <!doctype, <!--
1111
+ let is_valid_tag = match next {
1112
+ b'!' => {
1113
+ // DOCTYPE or comment
1114
+ idx + 2 < len
1115
+ && (bytes[idx + 2] == b'-'
1116
+ || bytes[idx + 2].is_ascii_alphabetic()
1117
+ || bytes[idx + 2].is_ascii_uppercase())
1118
+ }
1119
+ b'/' => {
1120
+ // Closing tag
1121
+ idx + 2 < len && (bytes[idx + 2].is_ascii_alphabetic() || bytes[idx + 2].is_ascii_uppercase())
1122
+ }
1123
+ b'?' => {
1124
+ // XML declaration
1125
+ true
1126
+ }
1127
+ c if c.is_ascii_alphabetic() || c.is_ascii_uppercase() => {
1128
+ // Opening tag
1129
+ true
1130
+ }
1131
+ _ => false,
1132
+ };
1133
+
1134
+ if !is_valid_tag {
1135
+ // This is a bare `<` that should be escaped
1136
+ let out = output.get_or_insert_with(|| String::with_capacity(input.len() + 4));
1137
+ out.push_str(&input[last..idx]);
1138
+ out.push_str("&lt;");
1139
+ last = idx + 1;
1140
+ }
1141
+ } else {
1142
+ // `<` at end of string - escape it
1143
+ let out = output.get_or_insert_with(|| String::with_capacity(input.len() + 4));
1144
+ out.push_str(&input[last..idx]);
1145
+ out.push_str("&lt;");
1146
+ last = idx + 1;
1147
+ }
1148
+ }
1149
+ idx += 1;
1150
+ }
1151
+
1152
+ if let Some(mut out) = output {
1153
+ if last < input.len() {
1154
+ out.push_str(&input[last..]);
1155
+ }
1156
+ Cow::Owned(out)
1157
+ } else {
1158
+ Cow::Borrowed(input)
1159
+ }
1160
+ }
1161
+
1162
+ fn strip_script_and_style_sections(input: &str) -> Cow<'_, str> {
1163
+ const TAGS: [&[u8]; 2] = [b"script", b"style"];
1164
+ const SVG: &[u8] = b"svg";
1165
+
1166
+ let bytes = input.as_bytes();
1167
+ let len = bytes.len();
1168
+ let mut idx = 0;
1169
+ let mut last = 0;
1170
+ let mut output: Option<String> = None;
1171
+ let mut svg_depth = 0usize;
1172
+
1173
+ while idx < len {
1174
+ if bytes[idx] == b'<' {
1175
+ if matches_tag_start(bytes, idx + 1, SVG) {
1176
+ if let Some(open_end) = find_tag_end(bytes, idx + 1 + SVG.len()) {
1177
+ svg_depth += 1;
1178
+ idx = open_end;
1179
+ continue;
1180
+ }
1181
+ } else if matches_end_tag_start(bytes, idx + 1, SVG) {
1182
+ if let Some(close_end) = find_tag_end(bytes, idx + 2 + SVG.len()) {
1183
+ if svg_depth > 0 {
1184
+ svg_depth = svg_depth.saturating_sub(1);
1185
+ }
1186
+ idx = close_end;
1187
+ continue;
1188
+ }
1189
+ }
1190
+
1191
+ if svg_depth == 0 {
1192
+ let mut handled = false;
1193
+ for tag in TAGS {
1194
+ if matches_tag_start(bytes, idx + 1, tag) {
1195
+ if let Some(open_end) = find_tag_end(bytes, idx + 1 + tag.len()) {
1196
+ let remove_end = find_closing_tag(bytes, open_end, tag).unwrap_or(len);
1197
+ let out = output.get_or_insert_with(|| String::with_capacity(input.len()));
1198
+ out.push_str(&input[last..idx]);
1199
+ out.push_str(&input[idx..open_end]);
1200
+ out.push_str("</");
1201
+ out.push_str(str::from_utf8(tag).unwrap());
1202
+ out.push('>');
1203
+
1204
+ last = remove_end;
1205
+ idx = remove_end;
1206
+ handled = true;
1207
+ }
1208
+ }
1209
+
1210
+ if handled {
1211
+ break;
1212
+ }
1213
+ }
1214
+
1215
+ if handled {
1216
+ continue;
1217
+ }
1218
+ }
1219
+ }
1220
+
1221
+ idx += 1;
1222
+ }
1223
+
1224
+ if let Some(mut out) = output {
1225
+ if last < input.len() {
1226
+ out.push_str(&input[last..]);
1227
+ }
1228
+ Cow::Owned(out)
1229
+ } else {
1230
+ Cow::Borrowed(input)
1231
+ }
1232
+ }
1233
+
1234
+ fn matches_tag_start(bytes: &[u8], mut start: usize, tag: &[u8]) -> bool {
1235
+ if start >= bytes.len() {
1236
+ return false;
1237
+ }
1238
+
1239
+ if start + tag.len() > bytes.len() {
1240
+ return false;
1241
+ }
1242
+
1243
+ if !bytes[start..start + tag.len()].eq_ignore_ascii_case(tag) {
1244
+ return false;
1245
+ }
1246
+
1247
+ start += tag.len();
1248
+
1249
+ match bytes.get(start) {
1250
+ Some(b'>' | b'/' | b' ' | b'\t' | b'\n' | b'\r') => true,
1251
+ Some(_) => false,
1252
+ None => true,
1253
+ }
1254
+ }
1255
+
1256
+ fn find_tag_end(bytes: &[u8], mut idx: usize) -> Option<usize> {
1257
+ let len = bytes.len();
1258
+ let mut in_quote: Option<u8> = None;
1259
+
1260
+ while idx < len {
1261
+ match bytes[idx] {
1262
+ b'"' | b'\'' => {
1263
+ if let Some(current) = in_quote {
1264
+ if current == bytes[idx] {
1265
+ in_quote = None;
1266
+ }
1267
+ } else {
1268
+ in_quote = Some(bytes[idx]);
1269
+ }
1270
+ }
1271
+ b'>' if in_quote.is_none() => return Some(idx + 1),
1272
+ _ => {}
1273
+ }
1274
+ idx += 1;
1275
+ }
1276
+
1277
+ None
1278
+ }
1279
+
1280
+ fn find_closing_tag(bytes: &[u8], mut idx: usize, tag: &[u8]) -> Option<usize> {
1281
+ let len = bytes.len();
1282
+ let mut depth = 1usize;
1283
+
1284
+ while idx < len {
1285
+ if bytes[idx] == b'<' {
1286
+ if matches_tag_start(bytes, idx + 1, tag) {
1287
+ if let Some(next) = find_tag_end(bytes, idx + 1 + tag.len()) {
1288
+ depth += 1;
1289
+ idx = next;
1290
+ continue;
1291
+ }
1292
+ } else if matches_end_tag_start(bytes, idx + 1, tag) {
1293
+ if let Some(close) = find_tag_end(bytes, idx + 2 + tag.len()) {
1294
+ depth -= 1;
1295
+ if depth == 0 {
1296
+ return Some(close);
1297
+ }
1298
+ idx = close;
1299
+ continue;
1300
+ }
1301
+ }
1302
+ }
1303
+
1304
+ idx += 1;
1305
+ }
1306
+
1307
+ None
1308
+ }
1309
+
1310
+ fn matches_end_tag_start(bytes: &[u8], start: usize, tag: &[u8]) -> bool {
1311
+ if start >= bytes.len() || bytes[start] != b'/' {
1312
+ return false;
1313
+ }
1314
+ matches_tag_start(bytes, start + 1, tag)
1315
+ }
1316
+
1078
1317
  /// Check if an element is inline (not block-level).
1079
1318
  fn is_inline_element(tag_name: &str) -> bool {
1080
1319
  matches!(
@@ -4002,6 +4241,22 @@ mod tests {
4002
4241
  assert_eq!(calculate_list_continuation_indent(4), 7);
4003
4242
  }
4004
4243
 
4244
+ #[test]
4245
+ fn strips_script_sections_without_removing_following_content() {
4246
+ let input = "<div>before</div><script>1 < 2</script><p>after</p>";
4247
+ let stripped = strip_script_and_style_sections(input);
4248
+ assert_eq!(stripped, "<div>before</div><script></script><p>after</p>");
4249
+ }
4250
+
4251
+ #[test]
4252
+ fn strips_multiline_script_sections() {
4253
+ let input = "<html>\n<script>1 < 2</script>\nContent\n</html>";
4254
+ let stripped = strip_script_and_style_sections(input);
4255
+ assert!(stripped.contains("Content"));
4256
+ assert!(stripped.contains("<script"));
4257
+ assert!(!stripped.contains("1 < 2"));
4258
+ }
4259
+
4005
4260
  #[test]
4006
4261
  fn test_add_list_continuation_indent_blank_line() {
4007
4262
  let opts = ConversionOptions::default();
@@ -4049,4 +4304,98 @@ mod tests {
4049
4304
  add_list_continuation_indent(&mut output, 1, false, &opts);
4050
4305
  assert_eq!(output, "* First\n ");
4051
4306
  }
4307
+
4308
+ #[test]
4309
+ fn test_escape_malformed_angle_brackets_bare() {
4310
+ let input = "1<2";
4311
+ let escaped = escape_malformed_angle_brackets(input);
4312
+ assert_eq!(escaped, "1&lt;2");
4313
+ }
4314
+
4315
+ #[test]
4316
+ fn test_escape_malformed_angle_brackets_in_text() {
4317
+ let input = "<html>1<2 Content</html>";
4318
+ let escaped = escape_malformed_angle_brackets(input);
4319
+ assert_eq!(escaped, "<html>1&lt;2 Content</html>");
4320
+ }
4321
+
4322
+ #[test]
4323
+ fn test_escape_malformed_angle_brackets_multiple() {
4324
+ let input = "1 < 2 < 3";
4325
+ let escaped = escape_malformed_angle_brackets(input);
4326
+ assert_eq!(escaped, "1 &lt; 2 &lt; 3");
4327
+ }
4328
+
4329
+ #[test]
4330
+ fn test_escape_malformed_angle_brackets_preserves_valid_tags() {
4331
+ let input = "<div>content</div>";
4332
+ let escaped = escape_malformed_angle_brackets(input);
4333
+ assert_eq!(escaped, "<div>content</div>");
4334
+ }
4335
+
4336
+ #[test]
4337
+ fn test_escape_malformed_angle_brackets_mixed() {
4338
+ let input = "<div>1<2</div><p>3<4</p>";
4339
+ let escaped = escape_malformed_angle_brackets(input);
4340
+ assert_eq!(escaped, "<div>1&lt;2</div><p>3&lt;4</p>");
4341
+ }
4342
+
4343
+ #[test]
4344
+ fn test_escape_malformed_angle_brackets_at_end() {
4345
+ let input = "test<";
4346
+ let escaped = escape_malformed_angle_brackets(input);
4347
+ assert_eq!(escaped, "test&lt;");
4348
+ }
4349
+
4350
+ #[test]
4351
+ fn test_escape_malformed_angle_brackets_preserves_comments() {
4352
+ let input = "<!-- comment -->1<2";
4353
+ let escaped = escape_malformed_angle_brackets(input);
4354
+ assert_eq!(escaped, "<!-- comment -->1&lt;2");
4355
+ }
4356
+
4357
+ #[test]
4358
+ fn test_escape_malformed_angle_brackets_preserves_doctype() {
4359
+ let input = "<!DOCTYPE html>1<2";
4360
+ let escaped = escape_malformed_angle_brackets(input);
4361
+ assert_eq!(escaped, "<!DOCTYPE html>1&lt;2");
4362
+ }
4363
+
4364
+ #[test]
4365
+ fn test_convert_with_malformed_angle_brackets() {
4366
+ // Test the full conversion pipeline (issue #94)
4367
+ let html = "<html>1<2\nContent</html>";
4368
+ let result = convert_html(html, &ConversionOptions::default()).unwrap();
4369
+ assert!(
4370
+ result.contains("Content"),
4371
+ "Result should contain 'Content': {:?}",
4372
+ result
4373
+ );
4374
+ assert!(
4375
+ result.contains("1<2") || result.contains("1&lt;2"),
4376
+ "Result should contain escaped or unescaped comparison"
4377
+ );
4378
+ }
4379
+
4380
+ #[test]
4381
+ fn test_convert_with_malformed_angle_brackets_in_div() {
4382
+ let html = "<html><div>1<2</div><div>Content</div></html>";
4383
+ let result = convert_html(html, &ConversionOptions::default()).unwrap();
4384
+ assert!(
4385
+ result.contains("Content"),
4386
+ "Result should contain 'Content': {:?}",
4387
+ result
4388
+ );
4389
+ }
4390
+
4391
+ #[test]
4392
+ fn test_convert_with_multiple_malformed_angle_brackets() {
4393
+ let html = "<html>1 < 2 < 3<p>Content</p></html>";
4394
+ let result = convert_html(html, &ConversionOptions::default()).unwrap();
4395
+ assert!(
4396
+ result.contains("Content"),
4397
+ "Result should contain 'Content': {:?}",
4398
+ result
4399
+ );
4400
+ }
4052
4401
  }
@@ -258,7 +258,7 @@ pub struct PreprocessingOptions {
258
258
  impl Default for PreprocessingOptions {
259
259
  fn default() -> Self {
260
260
  Self {
261
- enabled: false,
261
+ enabled: true,
262
262
  preset: PreprocessingPreset::default(),
263
263
  remove_navigation: true,
264
264
  remove_forms: true,
@@ -49,4 +49,4 @@ __all__ = [
49
49
  "markdownify",
50
50
  ]
51
51
 
52
- __version__ = "2.4.0"
52
+ __version__ = "2.4.2"
@@ -128,8 +128,8 @@ class PreprocessingOptions:
128
128
  ... )
129
129
  """
130
130
 
131
- enabled: bool = False
132
- """Whether to enable HTML preprocessing (disabled by default for minimal transformation)."""
131
+ enabled: bool = True
132
+ """Whether to enable HTML preprocessing (enabled by default for robust handling of malformed HTML)."""
133
133
 
134
134
  preset: Literal["minimal", "standard", "aggressive"] = "standard"
135
135
  """Preprocessing aggressiveness level."""
@@ -7,7 +7,7 @@ requires = [
7
7
 
8
8
  [project]
9
9
  name = "html-to-markdown"
10
- version = "2.4.0"
10
+ version = "2.4.2"
11
11
  description = "High-performance HTML to Markdown converter powered by Rust with a clean Python API"
12
12
  readme = "README_PYPI.md"
13
13
  keywords = [