html-to-markdown 2.4.0__tar.gz → 2.4.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/Cargo.lock +27 -27
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/Cargo.toml +2 -2
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/PKG-INFO +7 -4
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/README_PYPI.md +6 -3
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/src/converter.rs +350 -1
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/src/options.rs +1 -1
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/html_to_markdown/__init__.py +1 -1
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/html_to_markdown/bin/html-to-markdown +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/html_to_markdown/options.py +2 -2
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/pyproject.toml +1 -1
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/LICENSE +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/Cargo.toml +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/README.md +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/benches/conversion_benchmark.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/benches/micro_benchmark.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/benches/profiling_benchmark.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/examples/basic.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/examples/table.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/examples/test_escape.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/examples/test_inline_formatting.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/examples/test_lists.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/examples/test_semantic_tags.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/examples/test_tables.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/examples/test_task_lists.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/examples/test_whitespace.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/src/error.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/src/hocr/converter.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/src/hocr/extractor.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/src/hocr/mod.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/src/hocr/parser.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/src/hocr/spatial.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/src/hocr/types.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/src/inline_images.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/src/lib.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/src/sanitizer.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/src/text.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/src/wrapper.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/tests/commonmark_compliance_test.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/tests/hocr_compliance_test.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/tests/integration_test.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown-py/Cargo.toml +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown-py/README.md +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown-py/python/html_to_markdown/__init__.py +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown-py/python/html_to_markdown/_html_to_markdown.pyi +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown-py/src/lib.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown-py/uv.lock +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/html_to_markdown/__main__.py +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/html_to_markdown/_rust.pyi +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/html_to_markdown/api.py +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/html_to_markdown/cli.py +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/html_to_markdown/cli_proxy.py +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/html_to_markdown/exceptions.py +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/html_to_markdown/py.typed +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/html_to_markdown/v1_compat.py +0 -0
|
@@ -249,9 +249,9 @@ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d"
|
|
|
249
249
|
|
|
250
250
|
[[package]]
|
|
251
251
|
name = "clap_mangen"
|
|
252
|
-
version = "0.2.
|
|
252
|
+
version = "0.2.31"
|
|
253
253
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
254
|
-
checksum = "
|
|
254
|
+
checksum = "439ea63a92086df93893164221ad4f24142086d535b3a0957b9b9bea2dc86301"
|
|
255
255
|
dependencies = [
|
|
256
256
|
"clap",
|
|
257
257
|
"roff",
|
|
@@ -386,9 +386,9 @@ dependencies = [
|
|
|
386
386
|
|
|
387
387
|
[[package]]
|
|
388
388
|
name = "ctor"
|
|
389
|
-
version = "0.
|
|
389
|
+
version = "0.6.0"
|
|
390
390
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
391
|
-
checksum = "
|
|
391
|
+
checksum = "59c9b8bdf64ee849747c1b12eb861d21aa47fa161564f48332f1afe2373bf899"
|
|
392
392
|
dependencies = [
|
|
393
393
|
"ctor-proc-macro",
|
|
394
394
|
"dtor",
|
|
@@ -396,9 +396,9 @@ dependencies = [
|
|
|
396
396
|
|
|
397
397
|
[[package]]
|
|
398
398
|
name = "ctor-proc-macro"
|
|
399
|
-
version = "0.0.
|
|
399
|
+
version = "0.0.7"
|
|
400
400
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
401
|
-
checksum = "
|
|
401
|
+
checksum = "52560adf09603e58c9a7ee1fe1dcb95a16927b17c127f0ac02d6e768a0e25bc1"
|
|
402
402
|
|
|
403
403
|
[[package]]
|
|
404
404
|
name = "cty"
|
|
@@ -507,9 +507,9 @@ checksum = "52051878f80a721bb68ebfbc930e07b65ba72f2da88968ea5c06fd6ca3d3a127"
|
|
|
507
507
|
|
|
508
508
|
[[package]]
|
|
509
509
|
name = "flate2"
|
|
510
|
-
version = "1.1.
|
|
510
|
+
version = "1.1.5"
|
|
511
511
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
512
|
-
checksum = "
|
|
512
|
+
checksum = "bfe33edd8e85a12a67454e37f8c75e730830d83e313556ab9ebf9ee7fbeb3bfb"
|
|
513
513
|
dependencies = [
|
|
514
514
|
"crc32fast",
|
|
515
515
|
"miniz_oxide",
|
|
@@ -595,7 +595,7 @@ dependencies = [
|
|
|
595
595
|
|
|
596
596
|
[[package]]
|
|
597
597
|
name = "html-to-markdown-cli"
|
|
598
|
-
version = "2.4.0"
|
|
598
|
+
version = "2.4.2"
|
|
599
599
|
dependencies = [
|
|
600
600
|
"assert_cmd",
|
|
601
601
|
"clap",
|
|
@@ -609,7 +609,7 @@ dependencies = [
|
|
|
609
609
|
|
|
610
610
|
[[package]]
|
|
611
611
|
name = "html-to-markdown-node"
|
|
612
|
-
version = "2.4.0"
|
|
612
|
+
version = "2.4.2"
|
|
613
613
|
dependencies = [
|
|
614
614
|
"html-to-markdown-rs",
|
|
615
615
|
"mimalloc-rust",
|
|
@@ -620,7 +620,7 @@ dependencies = [
|
|
|
620
620
|
|
|
621
621
|
[[package]]
|
|
622
622
|
name = "html-to-markdown-py"
|
|
623
|
-
version = "2.4.0"
|
|
623
|
+
version = "2.4.2"
|
|
624
624
|
dependencies = [
|
|
625
625
|
"base64",
|
|
626
626
|
"html-to-markdown-rs",
|
|
@@ -630,7 +630,7 @@ dependencies = [
|
|
|
630
630
|
|
|
631
631
|
[[package]]
|
|
632
632
|
name = "html-to-markdown-rs"
|
|
633
|
-
version = "2.4.0"
|
|
633
|
+
version = "2.4.2"
|
|
634
634
|
dependencies = [
|
|
635
635
|
"ammonia",
|
|
636
636
|
"base64",
|
|
@@ -647,7 +647,7 @@ dependencies = [
|
|
|
647
647
|
|
|
648
648
|
[[package]]
|
|
649
649
|
name = "html-to-markdown-wasm"
|
|
650
|
-
version = "2.4.0"
|
|
650
|
+
version = "2.4.2"
|
|
651
651
|
dependencies = [
|
|
652
652
|
"console_error_panic_hook",
|
|
653
653
|
"getrandom",
|
|
@@ -989,9 +989,9 @@ dependencies = [
|
|
|
989
989
|
|
|
990
990
|
[[package]]
|
|
991
991
|
name = "napi"
|
|
992
|
-
version = "3.
|
|
992
|
+
version = "3.4.0"
|
|
993
993
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
994
|
-
checksum = "
|
|
994
|
+
checksum = "c3a1135cfe16ca43ac82ac05858554fc39c037d8e4592f2b4a83d7ef8e822f43"
|
|
995
995
|
dependencies = [
|
|
996
996
|
"bitflags",
|
|
997
997
|
"ctor",
|
|
@@ -1003,15 +1003,15 @@ dependencies = [
|
|
|
1003
1003
|
|
|
1004
1004
|
[[package]]
|
|
1005
1005
|
name = "napi-build"
|
|
1006
|
-
version = "2.2.
|
|
1006
|
+
version = "2.2.4"
|
|
1007
1007
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1008
|
-
checksum = "
|
|
1008
|
+
checksum = "3ae82775d1b06f3f07efd0666e59bbc175da8383bc372051031d7a447e94fbea"
|
|
1009
1009
|
|
|
1010
1010
|
[[package]]
|
|
1011
1011
|
name = "napi-derive"
|
|
1012
|
-
version = "3.
|
|
1012
|
+
version = "3.3.0"
|
|
1013
1013
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1014
|
-
checksum = "
|
|
1014
|
+
checksum = "78665d6bdf10e9a4e6b38123efb0f66962e6197c1aea2f07cff3f159a374696d"
|
|
1015
1015
|
dependencies = [
|
|
1016
1016
|
"convert_case",
|
|
1017
1017
|
"ctor",
|
|
@@ -1023,9 +1023,9 @@ dependencies = [
|
|
|
1023
1023
|
|
|
1024
1024
|
[[package]]
|
|
1025
1025
|
name = "napi-derive-backend"
|
|
1026
|
-
version = "
|
|
1026
|
+
version = "3.0.0"
|
|
1027
1027
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1028
|
-
checksum = "
|
|
1028
|
+
checksum = "42d55d01423e7264de3acc13b258fa48ca7cf38a4d25db848908ec3c1304a85a"
|
|
1029
1029
|
dependencies = [
|
|
1030
1030
|
"convert_case",
|
|
1031
1031
|
"proc-macro2",
|
|
@@ -1036,9 +1036,9 @@ dependencies = [
|
|
|
1036
1036
|
|
|
1037
1037
|
[[package]]
|
|
1038
1038
|
name = "napi-sys"
|
|
1039
|
-
version = "3.0.
|
|
1039
|
+
version = "3.0.1"
|
|
1040
1040
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1041
|
-
checksum = "
|
|
1041
|
+
checksum = "1ed8f0e23a62a3ce0fbb6527cdc056e9282ddd9916b068c46f8923e18eed5ee6"
|
|
1042
1042
|
dependencies = [
|
|
1043
1043
|
"libloading",
|
|
1044
1044
|
]
|
|
@@ -1263,9 +1263,9 @@ dependencies = [
|
|
|
1263
1263
|
|
|
1264
1264
|
[[package]]
|
|
1265
1265
|
name = "proc-macro2"
|
|
1266
|
-
version = "1.0.
|
|
1266
|
+
version = "1.0.103"
|
|
1267
1267
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1268
|
-
checksum = "
|
|
1268
|
+
checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8"
|
|
1269
1269
|
dependencies = [
|
|
1270
1270
|
"unicode-ident",
|
|
1271
1271
|
]
|
|
@@ -1609,9 +1609,9 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
|
|
|
1609
1609
|
|
|
1610
1610
|
[[package]]
|
|
1611
1611
|
name = "syn"
|
|
1612
|
-
version = "2.0.
|
|
1612
|
+
version = "2.0.108"
|
|
1613
1613
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1614
|
-
checksum = "
|
|
1614
|
+
checksum = "da58917d35242480a05c2897064da0a80589a2a0476c9a3f2fdc83b53502e917"
|
|
1615
1615
|
dependencies = [
|
|
1616
1616
|
"proc-macro2",
|
|
1617
1617
|
"quote",
|
|
@@ -3,7 +3,7 @@ resolver = "2"
|
|
|
3
3
|
members = ["crates/html-to-markdown-py"]
|
|
4
4
|
|
|
5
5
|
[workspace.package]
|
|
6
|
-
version = "2.4.0"
|
|
6
|
+
version = "2.4.2"
|
|
7
7
|
edition = "2021"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
9
9
|
license = "MIT"
|
|
@@ -15,7 +15,7 @@ rust-version = "1.80"
|
|
|
15
15
|
|
|
16
16
|
[workspace.dependencies]
|
|
17
17
|
# Core library
|
|
18
|
-
html-to-markdown-rs = { version = "2.4.0", path = "crates/html-to-markdown" }
|
|
18
|
+
html-to-markdown-rs = { version = "2.4.2", path = "crates/html-to-markdown" }
|
|
19
19
|
|
|
20
20
|
# HTML parsing and sanitization
|
|
21
21
|
tl = "0.7"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 2.4.0
|
|
3
|
+
Version: 2.4.2
|
|
4
4
|
Classifier: Development Status :: 5 - Production/Stable
|
|
5
5
|
Classifier: Environment :: Console
|
|
6
6
|
Classifier: Intended Audience :: Developers
|
|
@@ -162,9 +162,12 @@ Key fields (see docstring for full matrix):
|
|
|
162
162
|
|
|
163
163
|
### `PreprocessingOptions`
|
|
164
164
|
|
|
165
|
-
- `enabled`: enable HTML sanitisation
|
|
166
|
-
- `preset`: `"minimal" | "standard" | "aggressive"`
|
|
167
|
-
- `remove_navigation
|
|
165
|
+
- `enabled`: enable HTML sanitisation (default: `True` since v2.4.2 for robust malformed HTML handling)
|
|
166
|
+
- `preset`: `"minimal" | "standard" | "aggressive"` (default: `"standard"`)
|
|
167
|
+
- `remove_navigation`: remove navigation elements (default: `True`)
|
|
168
|
+
- `remove_forms`: remove form elements (default: `True`)
|
|
169
|
+
|
|
170
|
+
**Note:** As of v2.4.2, preprocessing is enabled by default to ensure robust handling of malformed HTML (e.g., bare angle brackets like `1<2` in content). Set `enabled=False` if you need minimal preprocessing.
|
|
168
171
|
|
|
169
172
|
### `InlineImageConfig`
|
|
170
173
|
|
|
@@ -129,9 +129,12 @@ Key fields (see docstring for full matrix):
|
|
|
129
129
|
|
|
130
130
|
### `PreprocessingOptions`
|
|
131
131
|
|
|
132
|
-
- `enabled`: enable HTML sanitisation
|
|
133
|
-
- `preset`: `"minimal" | "standard" | "aggressive"`
|
|
134
|
-
- `remove_navigation
|
|
132
|
+
- `enabled`: enable HTML sanitisation (default: `True` since v2.4.2 for robust malformed HTML handling)
|
|
133
|
+
- `preset`: `"minimal" | "standard" | "aggressive"` (default: `"standard"`)
|
|
134
|
+
- `remove_navigation`: remove navigation elements (default: `True`)
|
|
135
|
+
- `remove_forms`: remove form elements (default: `True`)
|
|
136
|
+
|
|
137
|
+
**Note:** As of v2.4.2, preprocessing is enabled by default to ensure robust handling of malformed HTML (e.g., bare angle brackets like `1<2` in content). Set `enabled=False` if you need minimal preprocessing.
|
|
135
138
|
|
|
136
139
|
### `InlineImageConfig`
|
|
137
140
|
|
|
@@ -47,6 +47,9 @@ use std::collections::BTreeMap;
|
|
|
47
47
|
#[cfg(feature = "inline-images")]
|
|
48
48
|
use std::rc::Rc;
|
|
49
49
|
|
|
50
|
+
use std::borrow::Cow;
|
|
51
|
+
use std::str;
|
|
52
|
+
|
|
50
53
|
use crate::error::Result;
|
|
51
54
|
#[cfg(feature = "inline-images")]
|
|
52
55
|
use crate::inline_images::{InlineImageCollector, InlineImageFormat, InlineImageSource};
|
|
@@ -971,7 +974,12 @@ fn convert_html_impl(
|
|
|
971
974
|
.replace("<hr/>", "<hr>")
|
|
972
975
|
.replace("<img/>", "<img>");
|
|
973
976
|
|
|
974
|
-
|
|
977
|
+
// Escape malformed angle brackets in text content to prevent parser failures
|
|
978
|
+
let html = escape_malformed_angle_brackets(&html);
|
|
979
|
+
|
|
980
|
+
let html = strip_script_and_style_sections(&html);
|
|
981
|
+
|
|
982
|
+
let dom = tl::parse(html.as_ref(), tl::ParserOptions::default())
|
|
975
983
|
.map_err(|_| crate::error::ConversionError::ParseError("Failed to parse HTML".to_string()))?;
|
|
976
984
|
|
|
977
985
|
let parser = dom.parser();
|
|
@@ -1075,6 +1083,237 @@ fn convert_html_impl(
|
|
|
1075
1083
|
}
|
|
1076
1084
|
}
|
|
1077
1085
|
|
|
1086
|
+
/// Escape malformed angle brackets in HTML that are not part of valid tags.
|
|
1087
|
+
///
|
|
1088
|
+
/// This function ensures robust parsing by escaping bare `<` and `>` characters
|
|
1089
|
+
/// that appear in text content and are not part of HTML tags. This prevents
|
|
1090
|
+
/// parser failures on malformed HTML like "1<2" or comparisons in text.
|
|
1091
|
+
///
|
|
1092
|
+
/// # Examples
|
|
1093
|
+
///
|
|
1094
|
+
/// - `1<2` becomes `1&lt;2`
|
|
1095
|
+
/// - `<div>1<2</div>` becomes `<div>1&lt;2</div>`
|
|
1096
|
+
/// - `<script>1 < 2</script>` remains unchanged (handled by script stripping)
|
|
1097
|
+
fn escape_malformed_angle_brackets(input: &str) -> Cow<'_, str> {
|
|
1098
|
+
let bytes = input.as_bytes();
|
|
1099
|
+
let len = bytes.len();
|
|
1100
|
+
let mut idx = 0;
|
|
1101
|
+
let mut last = 0;
|
|
1102
|
+
let mut output: Option<String> = None;
|
|
1103
|
+
|
|
1104
|
+
while idx < len {
|
|
1105
|
+
if bytes[idx] == b'<' {
|
|
1106
|
+
// Check if this is a valid tag start
|
|
1107
|
+
if idx + 1 < len {
|
|
1108
|
+
let next = bytes[idx + 1];
|
|
1109
|
+
|
|
1110
|
+
// Valid tag patterns: <tagname, </tagname, <!doctype, <!--
|
|
1111
|
+
let is_valid_tag = match next {
|
|
1112
|
+
b'!' => {
|
|
1113
|
+
// DOCTYPE or comment
|
|
1114
|
+
idx + 2 < len
|
|
1115
|
+
&& (bytes[idx + 2] == b'-'
|
|
1116
|
+
|| bytes[idx + 2].is_ascii_alphabetic()
|
|
1117
|
+
|| bytes[idx + 2].is_ascii_uppercase())
|
|
1118
|
+
}
|
|
1119
|
+
b'/' => {
|
|
1120
|
+
// Closing tag
|
|
1121
|
+
idx + 2 < len && (bytes[idx + 2].is_ascii_alphabetic() || bytes[idx + 2].is_ascii_uppercase())
|
|
1122
|
+
}
|
|
1123
|
+
b'?' => {
|
|
1124
|
+
// XML declaration
|
|
1125
|
+
true
|
|
1126
|
+
}
|
|
1127
|
+
c if c.is_ascii_alphabetic() || c.is_ascii_uppercase() => {
|
|
1128
|
+
// Opening tag
|
|
1129
|
+
true
|
|
1130
|
+
}
|
|
1131
|
+
_ => false,
|
|
1132
|
+
};
|
|
1133
|
+
|
|
1134
|
+
if !is_valid_tag {
|
|
1135
|
+
// This is a bare `<` that should be escaped
|
|
1136
|
+
let out = output.get_or_insert_with(|| String::with_capacity(input.len() + 4));
|
|
1137
|
+
out.push_str(&input[last..idx]);
|
|
1138
|
+
out.push_str("&lt;");
|
|
1139
|
+
last = idx + 1;
|
|
1140
|
+
}
|
|
1141
|
+
} else {
|
|
1142
|
+
// `<` at end of string - escape it
|
|
1143
|
+
let out = output.get_or_insert_with(|| String::with_capacity(input.len() + 4));
|
|
1144
|
+
out.push_str(&input[last..idx]);
|
|
1145
|
+
out.push_str("&lt;");
|
|
1146
|
+
last = idx + 1;
|
|
1147
|
+
}
|
|
1148
|
+
}
|
|
1149
|
+
idx += 1;
|
|
1150
|
+
}
|
|
1151
|
+
|
|
1152
|
+
if let Some(mut out) = output {
|
|
1153
|
+
if last < input.len() {
|
|
1154
|
+
out.push_str(&input[last..]);
|
|
1155
|
+
}
|
|
1156
|
+
Cow::Owned(out)
|
|
1157
|
+
} else {
|
|
1158
|
+
Cow::Borrowed(input)
|
|
1159
|
+
}
|
|
1160
|
+
}
|
|
1161
|
+
|
|
1162
|
+
fn strip_script_and_style_sections(input: &str) -> Cow<'_, str> {
|
|
1163
|
+
const TAGS: [&[u8]; 2] = [b"script", b"style"];
|
|
1164
|
+
const SVG: &[u8] = b"svg";
|
|
1165
|
+
|
|
1166
|
+
let bytes = input.as_bytes();
|
|
1167
|
+
let len = bytes.len();
|
|
1168
|
+
let mut idx = 0;
|
|
1169
|
+
let mut last = 0;
|
|
1170
|
+
let mut output: Option<String> = None;
|
|
1171
|
+
let mut svg_depth = 0usize;
|
|
1172
|
+
|
|
1173
|
+
while idx < len {
|
|
1174
|
+
if bytes[idx] == b'<' {
|
|
1175
|
+
if matches_tag_start(bytes, idx + 1, SVG) {
|
|
1176
|
+
if let Some(open_end) = find_tag_end(bytes, idx + 1 + SVG.len()) {
|
|
1177
|
+
svg_depth += 1;
|
|
1178
|
+
idx = open_end;
|
|
1179
|
+
continue;
|
|
1180
|
+
}
|
|
1181
|
+
} else if matches_end_tag_start(bytes, idx + 1, SVG) {
|
|
1182
|
+
if let Some(close_end) = find_tag_end(bytes, idx + 2 + SVG.len()) {
|
|
1183
|
+
if svg_depth > 0 {
|
|
1184
|
+
svg_depth = svg_depth.saturating_sub(1);
|
|
1185
|
+
}
|
|
1186
|
+
idx = close_end;
|
|
1187
|
+
continue;
|
|
1188
|
+
}
|
|
1189
|
+
}
|
|
1190
|
+
|
|
1191
|
+
if svg_depth == 0 {
|
|
1192
|
+
let mut handled = false;
|
|
1193
|
+
for tag in TAGS {
|
|
1194
|
+
if matches_tag_start(bytes, idx + 1, tag) {
|
|
1195
|
+
if let Some(open_end) = find_tag_end(bytes, idx + 1 + tag.len()) {
|
|
1196
|
+
let remove_end = find_closing_tag(bytes, open_end, tag).unwrap_or(len);
|
|
1197
|
+
let out = output.get_or_insert_with(|| String::with_capacity(input.len()));
|
|
1198
|
+
out.push_str(&input[last..idx]);
|
|
1199
|
+
out.push_str(&input[idx..open_end]);
|
|
1200
|
+
out.push_str("</");
|
|
1201
|
+
out.push_str(str::from_utf8(tag).unwrap());
|
|
1202
|
+
out.push('>');
|
|
1203
|
+
|
|
1204
|
+
last = remove_end;
|
|
1205
|
+
idx = remove_end;
|
|
1206
|
+
handled = true;
|
|
1207
|
+
}
|
|
1208
|
+
}
|
|
1209
|
+
|
|
1210
|
+
if handled {
|
|
1211
|
+
break;
|
|
1212
|
+
}
|
|
1213
|
+
}
|
|
1214
|
+
|
|
1215
|
+
if handled {
|
|
1216
|
+
continue;
|
|
1217
|
+
}
|
|
1218
|
+
}
|
|
1219
|
+
}
|
|
1220
|
+
|
|
1221
|
+
idx += 1;
|
|
1222
|
+
}
|
|
1223
|
+
|
|
1224
|
+
if let Some(mut out) = output {
|
|
1225
|
+
if last < input.len() {
|
|
1226
|
+
out.push_str(&input[last..]);
|
|
1227
|
+
}
|
|
1228
|
+
Cow::Owned(out)
|
|
1229
|
+
} else {
|
|
1230
|
+
Cow::Borrowed(input)
|
|
1231
|
+
}
|
|
1232
|
+
}
|
|
1233
|
+
|
|
1234
|
+
fn matches_tag_start(bytes: &[u8], mut start: usize, tag: &[u8]) -> bool {
|
|
1235
|
+
if start >= bytes.len() {
|
|
1236
|
+
return false;
|
|
1237
|
+
}
|
|
1238
|
+
|
|
1239
|
+
if start + tag.len() > bytes.len() {
|
|
1240
|
+
return false;
|
|
1241
|
+
}
|
|
1242
|
+
|
|
1243
|
+
if !bytes[start..start + tag.len()].eq_ignore_ascii_case(tag) {
|
|
1244
|
+
return false;
|
|
1245
|
+
}
|
|
1246
|
+
|
|
1247
|
+
start += tag.len();
|
|
1248
|
+
|
|
1249
|
+
match bytes.get(start) {
|
|
1250
|
+
Some(b'>' | b'/' | b' ' | b'\t' | b'\n' | b'\r') => true,
|
|
1251
|
+
Some(_) => false,
|
|
1252
|
+
None => true,
|
|
1253
|
+
}
|
|
1254
|
+
}
|
|
1255
|
+
|
|
1256
|
+
fn find_tag_end(bytes: &[u8], mut idx: usize) -> Option<usize> {
|
|
1257
|
+
let len = bytes.len();
|
|
1258
|
+
let mut in_quote: Option<u8> = None;
|
|
1259
|
+
|
|
1260
|
+
while idx < len {
|
|
1261
|
+
match bytes[idx] {
|
|
1262
|
+
b'"' | b'\'' => {
|
|
1263
|
+
if let Some(current) = in_quote {
|
|
1264
|
+
if current == bytes[idx] {
|
|
1265
|
+
in_quote = None;
|
|
1266
|
+
}
|
|
1267
|
+
} else {
|
|
1268
|
+
in_quote = Some(bytes[idx]);
|
|
1269
|
+
}
|
|
1270
|
+
}
|
|
1271
|
+
b'>' if in_quote.is_none() => return Some(idx + 1),
|
|
1272
|
+
_ => {}
|
|
1273
|
+
}
|
|
1274
|
+
idx += 1;
|
|
1275
|
+
}
|
|
1276
|
+
|
|
1277
|
+
None
|
|
1278
|
+
}
|
|
1279
|
+
|
|
1280
|
+
fn find_closing_tag(bytes: &[u8], mut idx: usize, tag: &[u8]) -> Option<usize> {
|
|
1281
|
+
let len = bytes.len();
|
|
1282
|
+
let mut depth = 1usize;
|
|
1283
|
+
|
|
1284
|
+
while idx < len {
|
|
1285
|
+
if bytes[idx] == b'<' {
|
|
1286
|
+
if matches_tag_start(bytes, idx + 1, tag) {
|
|
1287
|
+
if let Some(next) = find_tag_end(bytes, idx + 1 + tag.len()) {
|
|
1288
|
+
depth += 1;
|
|
1289
|
+
idx = next;
|
|
1290
|
+
continue;
|
|
1291
|
+
}
|
|
1292
|
+
} else if matches_end_tag_start(bytes, idx + 1, tag) {
|
|
1293
|
+
if let Some(close) = find_tag_end(bytes, idx + 2 + tag.len()) {
|
|
1294
|
+
depth -= 1;
|
|
1295
|
+
if depth == 0 {
|
|
1296
|
+
return Some(close);
|
|
1297
|
+
}
|
|
1298
|
+
idx = close;
|
|
1299
|
+
continue;
|
|
1300
|
+
}
|
|
1301
|
+
}
|
|
1302
|
+
}
|
|
1303
|
+
|
|
1304
|
+
idx += 1;
|
|
1305
|
+
}
|
|
1306
|
+
|
|
1307
|
+
None
|
|
1308
|
+
}
|
|
1309
|
+
|
|
1310
|
+
fn matches_end_tag_start(bytes: &[u8], start: usize, tag: &[u8]) -> bool {
|
|
1311
|
+
if start >= bytes.len() || bytes[start] != b'/' {
|
|
1312
|
+
return false;
|
|
1313
|
+
}
|
|
1314
|
+
matches_tag_start(bytes, start + 1, tag)
|
|
1315
|
+
}
|
|
1316
|
+
|
|
1078
1317
|
/// Check if an element is inline (not block-level).
|
|
1079
1318
|
fn is_inline_element(tag_name: &str) -> bool {
|
|
1080
1319
|
matches!(
|
|
@@ -4002,6 +4241,22 @@ mod tests {
|
|
|
4002
4241
|
assert_eq!(calculate_list_continuation_indent(4), 7);
|
|
4003
4242
|
}
|
|
4004
4243
|
|
|
4244
|
+
#[test]
|
|
4245
|
+
fn strips_script_sections_without_removing_following_content() {
|
|
4246
|
+
let input = "<div>before</div><script>1 < 2</script><p>after</p>";
|
|
4247
|
+
let stripped = strip_script_and_style_sections(input);
|
|
4248
|
+
assert_eq!(stripped, "<div>before</div><script></script><p>after</p>");
|
|
4249
|
+
}
|
|
4250
|
+
|
|
4251
|
+
#[test]
|
|
4252
|
+
fn strips_multiline_script_sections() {
|
|
4253
|
+
let input = "<html>\n<script>1 < 2</script>\nContent\n</html>";
|
|
4254
|
+
let stripped = strip_script_and_style_sections(input);
|
|
4255
|
+
assert!(stripped.contains("Content"));
|
|
4256
|
+
assert!(stripped.contains("<script"));
|
|
4257
|
+
assert!(!stripped.contains("1 < 2"));
|
|
4258
|
+
}
|
|
4259
|
+
|
|
4005
4260
|
#[test]
|
|
4006
4261
|
fn test_add_list_continuation_indent_blank_line() {
|
|
4007
4262
|
let opts = ConversionOptions::default();
|
|
@@ -4049,4 +4304,98 @@ mod tests {
|
|
|
4049
4304
|
add_list_continuation_indent(&mut output, 1, false, &opts);
|
|
4050
4305
|
assert_eq!(output, "* First\n ");
|
|
4051
4306
|
}
|
|
4307
|
+
|
|
4308
|
+
#[test]
|
|
4309
|
+
fn test_escape_malformed_angle_brackets_bare() {
|
|
4310
|
+
let input = "1<2";
|
|
4311
|
+
let escaped = escape_malformed_angle_brackets(input);
|
|
4312
|
+
assert_eq!(escaped, "1&lt;2");
|
|
4313
|
+
}
|
|
4314
|
+
|
|
4315
|
+
#[test]
|
|
4316
|
+
fn test_escape_malformed_angle_brackets_in_text() {
|
|
4317
|
+
let input = "<html>1<2 Content</html>";
|
|
4318
|
+
let escaped = escape_malformed_angle_brackets(input);
|
|
4319
|
+
assert_eq!(escaped, "<html>1&lt;2 Content</html>");
|
|
4320
|
+
}
|
|
4321
|
+
|
|
4322
|
+
#[test]
|
|
4323
|
+
fn test_escape_malformed_angle_brackets_multiple() {
|
|
4324
|
+
let input = "1 < 2 < 3";
|
|
4325
|
+
let escaped = escape_malformed_angle_brackets(input);
|
|
4326
|
+
assert_eq!(escaped, "1 &lt; 2 &lt; 3");
|
|
4327
|
+
}
|
|
4328
|
+
|
|
4329
|
+
#[test]
|
|
4330
|
+
fn test_escape_malformed_angle_brackets_preserves_valid_tags() {
|
|
4331
|
+
let input = "<div>content</div>";
|
|
4332
|
+
let escaped = escape_malformed_angle_brackets(input);
|
|
4333
|
+
assert_eq!(escaped, "<div>content</div>");
|
|
4334
|
+
}
|
|
4335
|
+
|
|
4336
|
+
#[test]
|
|
4337
|
+
fn test_escape_malformed_angle_brackets_mixed() {
|
|
4338
|
+
let input = "<div>1<2</div><p>3<4</p>";
|
|
4339
|
+
let escaped = escape_malformed_angle_brackets(input);
|
|
4340
|
+
assert_eq!(escaped, "<div>1&lt;2</div><p>3&lt;4</p>");
|
|
4341
|
+
}
|
|
4342
|
+
|
|
4343
|
+
#[test]
|
|
4344
|
+
fn test_escape_malformed_angle_brackets_at_end() {
|
|
4345
|
+
let input = "test<";
|
|
4346
|
+
let escaped = escape_malformed_angle_brackets(input);
|
|
4347
|
+
assert_eq!(escaped, "test&lt;");
|
|
4348
|
+
}
|
|
4349
|
+
|
|
4350
|
+
#[test]
|
|
4351
|
+
fn test_escape_malformed_angle_brackets_preserves_comments() {
|
|
4352
|
+
let input = "<!-- comment -->1<2";
|
|
4353
|
+
let escaped = escape_malformed_angle_brackets(input);
|
|
4354
|
+
assert_eq!(escaped, "<!-- comment -->1&lt;2");
|
|
4355
|
+
}
|
|
4356
|
+
|
|
4357
|
+
#[test]
|
|
4358
|
+
fn test_escape_malformed_angle_brackets_preserves_doctype() {
|
|
4359
|
+
let input = "<!DOCTYPE html>1<2";
|
|
4360
|
+
let escaped = escape_malformed_angle_brackets(input);
|
|
4361
|
+
assert_eq!(escaped, "<!DOCTYPE html>1&lt;2");
|
|
4362
|
+
}
|
|
4363
|
+
|
|
4364
|
+
#[test]
|
|
4365
|
+
fn test_convert_with_malformed_angle_brackets() {
|
|
4366
|
+
// Test the full conversion pipeline (issue #94)
|
|
4367
|
+
let html = "<html>1<2\nContent</html>";
|
|
4368
|
+
let result = convert_html(html, &ConversionOptions::default()).unwrap();
|
|
4369
|
+
assert!(
|
|
4370
|
+
result.contains("Content"),
|
|
4371
|
+
"Result should contain 'Content': {:?}",
|
|
4372
|
+
result
|
|
4373
|
+
);
|
|
4374
|
+
assert!(
|
|
4375
|
+
result.contains("1<2") || result.contains("1&lt;2"),
|
|
4376
|
+
"Result should contain escaped or unescaped comparison"
|
|
4377
|
+
);
|
|
4378
|
+
}
|
|
4379
|
+
|
|
4380
|
+
#[test]
|
|
4381
|
+
fn test_convert_with_malformed_angle_brackets_in_div() {
|
|
4382
|
+
let html = "<html><div>1<2</div><div>Content</div></html>";
|
|
4383
|
+
let result = convert_html(html, &ConversionOptions::default()).unwrap();
|
|
4384
|
+
assert!(
|
|
4385
|
+
result.contains("Content"),
|
|
4386
|
+
"Result should contain 'Content': {:?}",
|
|
4387
|
+
result
|
|
4388
|
+
);
|
|
4389
|
+
}
|
|
4390
|
+
|
|
4391
|
+
#[test]
|
|
4392
|
+
fn test_convert_with_multiple_malformed_angle_brackets() {
|
|
4393
|
+
let html = "<html>1 < 2 < 3<p>Content</p></html>";
|
|
4394
|
+
let result = convert_html(html, &ConversionOptions::default()).unwrap();
|
|
4395
|
+
assert!(
|
|
4396
|
+
result.contains("Content"),
|
|
4397
|
+
"Result should contain 'Content': {:?}",
|
|
4398
|
+
result
|
|
4399
|
+
);
|
|
4400
|
+
}
|
|
4052
4401
|
}
|
|
Binary file
|
|
@@ -128,8 +128,8 @@ class PreprocessingOptions:
|
|
|
128
128
|
... )
|
|
129
129
|
"""
|
|
130
130
|
|
|
131
|
-
enabled: bool = False
|
|
132
|
-
"""Whether to enable HTML preprocessing (
|
|
131
|
+
enabled: bool = True
|
|
132
|
+
"""Whether to enable HTML preprocessing (enabled by default for robust handling of malformed HTML)."""
|
|
133
133
|
|
|
134
134
|
preset: Literal["minimal", "standard", "aggressive"] = "standard"
|
|
135
135
|
"""Preprocessing aggressiveness level."""
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/benches/micro_benchmark.rs
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/examples/test_escape.rs
RENAMED
|
File without changes
|
|
File without changes
|
{html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/examples/test_lists.rs
RENAMED
|
File without changes
|
|
File without changes
|
{html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/examples/test_tables.rs
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/src/hocr/converter.rs
RENAMED
|
File without changes
|
{html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/src/hocr/extractor.rs
RENAMED
|
File without changes
|
|
File without changes
|
{html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/src/hocr/parser.rs
RENAMED
|
File without changes
|
{html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/src/hocr/spatial.rs
RENAMED
|
File without changes
|
|
File without changes
|
{html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/src/inline_images.rs
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{html_to_markdown-2.4.0 → html_to_markdown-2.4.2}/crates/html-to-markdown/tests/integration_test.rs
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|