html-to-markdown 2.4.2__tar.gz → 2.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/Cargo.lock +9 -12
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/Cargo.toml +2 -2
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/PKG-INFO +1 -1
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/README.md +35 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/converter.rs +154 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/hocr/converter.rs +14 -1
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/hocr/extractor.rs +39 -50
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/lib.rs +2 -2
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/options.rs +5 -0
- html_to_markdown-2.5.0/crates/html-to-markdown/src/sanitizer.rs +284 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown-py/Cargo.toml +1 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown-py/README.md +19 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown-py/src/lib.rs +104 -89
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/html_to_markdown/__init__.py +1 -1
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/html_to_markdown/api.py +1 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/html_to_markdown/bin/html-to-markdown +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/html_to_markdown/options.py +3 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/pyproject.toml +1 -1
- html_to_markdown-2.4.2/crates/html-to-markdown/src/sanitizer.rs +0 -85
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/LICENSE +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/README_PYPI.md +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/Cargo.toml +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/benches/conversion_benchmark.rs +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/benches/micro_benchmark.rs +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/benches/profiling_benchmark.rs +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/examples/basic.rs +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/examples/table.rs +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/examples/test_escape.rs +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/examples/test_inline_formatting.rs +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/examples/test_lists.rs +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/examples/test_semantic_tags.rs +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/examples/test_tables.rs +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/examples/test_task_lists.rs +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/examples/test_whitespace.rs +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/error.rs +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/hocr/mod.rs +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/hocr/parser.rs +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/hocr/spatial.rs +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/hocr/types.rs +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/inline_images.rs +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/text.rs +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/wrapper.rs +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/tests/commonmark_compliance_test.rs +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/tests/hocr_compliance_test.rs +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/tests/integration_test.rs +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown-py/python/html_to_markdown/__init__.py +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown-py/python/html_to_markdown/_html_to_markdown.pyi +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown-py/uv.lock +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/html_to_markdown/__main__.py +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/html_to_markdown/_rust.pyi +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/html_to_markdown/cli.py +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/html_to_markdown/cli_proxy.py +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/html_to_markdown/exceptions.py +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/html_to_markdown/py.typed +0 -0
- {html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/html_to_markdown/v1_compat.py +0 -0
|
@@ -157,9 +157,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
|
|
|
157
157
|
|
|
158
158
|
[[package]]
|
|
159
159
|
name = "cc"
|
|
160
|
-
version = "1.2.
|
|
160
|
+
version = "1.2.42"
|
|
161
161
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
162
|
-
checksum = "
|
|
162
|
+
checksum = "81bbf3b3619004ad9bd139f62a9ab5cfe467f307455a0d307b0cf58bf070feaa"
|
|
163
163
|
dependencies = [
|
|
164
164
|
"find-msvc-tools",
|
|
165
165
|
"shlex",
|
|
@@ -425,9 +425,9 @@ dependencies = [
|
|
|
425
425
|
|
|
426
426
|
[[package]]
|
|
427
427
|
name = "doc-comment"
|
|
428
|
-
version = "0.3.
|
|
428
|
+
version = "0.3.4"
|
|
429
429
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
430
|
-
checksum = "
|
|
430
|
+
checksum = "780955b8b195a21ab8e4ac6b60dd1dbdcec1dc6c51c0617964b08c81785e12c9"
|
|
431
431
|
|
|
432
432
|
[[package]]
|
|
433
433
|
name = "dtoa"
|
|
@@ -550,11 +550,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
|
550
550
|
checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
|
|
551
551
|
dependencies = [
|
|
552
552
|
"cfg-if",
|
|
553
|
-
"js-sys",
|
|
554
553
|
"libc",
|
|
555
554
|
"r-efi",
|
|
556
555
|
"wasip2",
|
|
557
|
-
"wasm-bindgen",
|
|
558
556
|
]
|
|
559
557
|
|
|
560
558
|
[[package]]
|
|
@@ -595,7 +593,7 @@ dependencies = [
|
|
|
595
593
|
|
|
596
594
|
[[package]]
|
|
597
595
|
name = "html-to-markdown-cli"
|
|
598
|
-
version = "2.4.2"
|
|
596
|
+
version = "2.5.0"
|
|
599
597
|
dependencies = [
|
|
600
598
|
"assert_cmd",
|
|
601
599
|
"clap",
|
|
@@ -609,7 +607,7 @@ dependencies = [
|
|
|
609
607
|
|
|
610
608
|
[[package]]
|
|
611
609
|
name = "html-to-markdown-node"
|
|
612
|
-
version = "2.4.2"
|
|
610
|
+
version = "2.5.0"
|
|
613
611
|
dependencies = [
|
|
614
612
|
"html-to-markdown-rs",
|
|
615
613
|
"mimalloc-rust",
|
|
@@ -620,7 +618,7 @@ dependencies = [
|
|
|
620
618
|
|
|
621
619
|
[[package]]
|
|
622
620
|
name = "html-to-markdown-py"
|
|
623
|
-
version = "2.4.2"
|
|
621
|
+
version = "2.5.0"
|
|
624
622
|
dependencies = [
|
|
625
623
|
"base64",
|
|
626
624
|
"html-to-markdown-rs",
|
|
@@ -630,7 +628,7 @@ dependencies = [
|
|
|
630
628
|
|
|
631
629
|
[[package]]
|
|
632
630
|
name = "html-to-markdown-rs"
|
|
633
|
-
version = "2.4.2"
|
|
631
|
+
version = "2.5.0"
|
|
634
632
|
dependencies = [
|
|
635
633
|
"ammonia",
|
|
636
634
|
"base64",
|
|
@@ -647,10 +645,9 @@ dependencies = [
|
|
|
647
645
|
|
|
648
646
|
[[package]]
|
|
649
647
|
name = "html-to-markdown-wasm"
|
|
650
|
-
version = "2.4.2"
|
|
648
|
+
version = "2.5.0"
|
|
651
649
|
dependencies = [
|
|
652
650
|
"console_error_panic_hook",
|
|
653
|
-
"getrandom",
|
|
654
651
|
"html-to-markdown-rs",
|
|
655
652
|
"js-sys",
|
|
656
653
|
"serde",
|
|
@@ -3,7 +3,7 @@ resolver = "2"
|
|
|
3
3
|
members = ["crates/html-to-markdown-py"]
|
|
4
4
|
|
|
5
5
|
[workspace.package]
|
|
6
|
-
version = "2.4.2"
|
|
6
|
+
version = "2.5.0"
|
|
7
7
|
edition = "2021"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
9
9
|
license = "MIT"
|
|
@@ -15,7 +15,7 @@ rust-version = "1.80"
|
|
|
15
15
|
|
|
16
16
|
[workspace.dependencies]
|
|
17
17
|
# Core library
|
|
18
|
-
html-to-markdown-rs = { version = "2.4.2", path = "crates/html-to-markdown" }
|
|
18
|
+
html-to-markdown-rs = { version = "2.5.0", path = "crates/html-to-markdown" }
|
|
19
19
|
|
|
20
20
|
# HTML parsing and sanitization
|
|
21
21
|
tl = "0.7"
|
|
@@ -60,6 +60,41 @@ let options = ConversionOptions {
|
|
|
60
60
|
let markdown = convert(html, Some(options))?;
|
|
61
61
|
```
|
|
62
62
|
|
|
63
|
+
### Preserving HTML Tags
|
|
64
|
+
|
|
65
|
+
The `preserve_tags` option allows you to keep specific HTML tags in their original form instead of converting them to Markdown. This is useful for complex elements like tables that may not convert well:
|
|
66
|
+
|
|
67
|
+
```rust
|
|
68
|
+
use html_to_markdown_rs::{convert, ConversionOptions};
|
|
69
|
+
|
|
70
|
+
let html = r#"
|
|
71
|
+
<p>Before table</p>
|
|
72
|
+
<table class="data">
|
|
73
|
+
<tr><th>Name</th><th>Value</th></tr>
|
|
74
|
+
<tr><td>Item 1</td><td>100</td></tr>
|
|
75
|
+
</table>
|
|
76
|
+
<p>After table</p>
|
|
77
|
+
"#;
|
|
78
|
+
|
|
79
|
+
let options = ConversionOptions {
|
|
80
|
+
preserve_tags: vec!["table".to_string()],
|
|
81
|
+
..Default::default()
|
|
82
|
+
};
|
|
83
|
+
|
|
84
|
+
let markdown = convert(html, Some(options))?;
|
|
85
|
+
// Result: "Before table\n\n<table class=\"data\">...</table>\n\nAfter table\n"
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
You can preserve multiple tag types and combine with `strip_tags`:
|
|
89
|
+
|
|
90
|
+
```rust
|
|
91
|
+
let options = ConversionOptions {
|
|
92
|
+
preserve_tags: vec!["table".to_string(), "form".to_string()],
|
|
93
|
+
strip_tags: vec!["script".to_string(), "style".to_string()],
|
|
94
|
+
..Default::default()
|
|
95
|
+
};
|
|
96
|
+
```
|
|
97
|
+
|
|
63
98
|
## Web Scraping with Preprocessing
|
|
64
99
|
|
|
65
100
|
```rust
|
|
@@ -1159,6 +1159,75 @@ fn escape_malformed_angle_brackets(input: &str) -> Cow<'_, str> {
|
|
|
1159
1159
|
}
|
|
1160
1160
|
}
|
|
1161
1161
|
|
|
1162
|
+
/// Serialize a tag and its children back to HTML.
|
|
1163
|
+
///
|
|
1164
|
+
/// This is used for the preserve_tags feature to output original HTML for specific elements.
|
|
1165
|
+
fn serialize_tag_to_html(handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
|
|
1166
|
+
let mut html = String::new();
|
|
1167
|
+
serialize_node_to_html(handle, parser, &mut html);
|
|
1168
|
+
html
|
|
1169
|
+
}
|
|
1170
|
+
|
|
1171
|
+
/// Recursively serialize a node to HTML.
|
|
1172
|
+
fn serialize_node_to_html(handle: &tl::NodeHandle, parser: &tl::Parser, output: &mut String) {
|
|
1173
|
+
match handle.get(parser) {
|
|
1174
|
+
Some(tl::Node::Tag(tag)) => {
|
|
1175
|
+
let tag_name = tag.name().as_utf8_str();
|
|
1176
|
+
|
|
1177
|
+
// Opening tag
|
|
1178
|
+
output.push('<');
|
|
1179
|
+
output.push_str(&tag_name);
|
|
1180
|
+
|
|
1181
|
+
// Attributes
|
|
1182
|
+
for (key, value) in tag.attributes().iter() {
|
|
1183
|
+
output.push(' ');
|
|
1184
|
+
output.push_str(&key);
|
|
1185
|
+
if let Some(val) = value {
|
|
1186
|
+
output.push_str("=\"");
|
|
1187
|
+
output.push_str(&val);
|
|
1188
|
+
output.push('"');
|
|
1189
|
+
}
|
|
1190
|
+
}
|
|
1191
|
+
|
|
1192
|
+
output.push('>');
|
|
1193
|
+
|
|
1194
|
+
// Children
|
|
1195
|
+
let children = tag.children();
|
|
1196
|
+
for child_handle in children.top().iter() {
|
|
1197
|
+
serialize_node_to_html(child_handle, parser, output);
|
|
1198
|
+
}
|
|
1199
|
+
|
|
1200
|
+
// Closing tag (skip for self-closing tags)
|
|
1201
|
+
if !matches!(
|
|
1202
|
+
tag_name.as_ref(),
|
|
1203
|
+
"br" | "hr"
|
|
1204
|
+
| "img"
|
|
1205
|
+
| "input"
|
|
1206
|
+
| "meta"
|
|
1207
|
+
| "link"
|
|
1208
|
+
| "area"
|
|
1209
|
+
| "base"
|
|
1210
|
+
| "col"
|
|
1211
|
+
| "embed"
|
|
1212
|
+
| "param"
|
|
1213
|
+
| "source"
|
|
1214
|
+
| "track"
|
|
1215
|
+
| "wbr"
|
|
1216
|
+
) {
|
|
1217
|
+
output.push_str("</");
|
|
1218
|
+
output.push_str(&tag_name);
|
|
1219
|
+
output.push('>');
|
|
1220
|
+
}
|
|
1221
|
+
}
|
|
1222
|
+
Some(tl::Node::Raw(bytes)) => {
|
|
1223
|
+
if let Ok(text) = std::str::from_utf8(bytes.as_bytes()) {
|
|
1224
|
+
output.push_str(text);
|
|
1225
|
+
}
|
|
1226
|
+
}
|
|
1227
|
+
_ => {}
|
|
1228
|
+
}
|
|
1229
|
+
}
|
|
1230
|
+
|
|
1162
1231
|
fn strip_script_and_style_sections(input: &str) -> Cow<'_, str> {
|
|
1163
1232
|
const TAGS: [&[u8]; 2] = [b"script", b"style"];
|
|
1164
1233
|
const SVG: &[u8] = b"svg";
|
|
@@ -1557,6 +1626,13 @@ fn walk_node(
|
|
|
1557
1626
|
return;
|
|
1558
1627
|
}
|
|
1559
1628
|
|
|
1629
|
+
// Preserve tags: output original HTML
|
|
1630
|
+
if options.preserve_tags.iter().any(|t| t.as_str() == tag_name) {
|
|
1631
|
+
let html = serialize_tag_to_html(node_handle, parser);
|
|
1632
|
+
output.push_str(&html);
|
|
1633
|
+
return;
|
|
1634
|
+
}
|
|
1635
|
+
|
|
1560
1636
|
match tag_name.as_ref() {
|
|
1561
1637
|
"h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
|
|
1562
1638
|
let level = tag_name.chars().last().and_then(|c| c.to_digit(10)).unwrap_or(1) as usize;
|
|
@@ -4398,4 +4474,82 @@ mod tests {
|
|
|
4398
4474
|
result
|
|
4399
4475
|
);
|
|
4400
4476
|
}
|
|
4477
|
+
|
|
4478
|
+
#[test]
fn test_preserve_tags_simple_table() {
    // A preserved <table> keeps its markup while the sibling <p> is still converted.
    let html = r#"<div><table><tr><td>Cell 1</td><td>Cell 2</td></tr></table><p>Text</p></div>"#;
    let options = ConversionOptions {
        preserve_tags: vec!["table".to_string()],
        ..Default::default()
    };
    let result = convert_html(html, &options).unwrap();

    let expectations = [
        ("<table>", "Should preserve table tag"),
        ("</table>", "Should have closing table tag"),
        ("<tr>", "Should preserve tr tag"),
        ("<td>", "Should preserve td tag"),
        ("Text", "Should convert other elements"),
    ];
    for (needle, message) in expectations {
        assert!(result.contains(needle), "{message}");
    }
}
|
|
4491
|
+
|
|
4492
|
+
#[test]
fn test_preserve_tags_with_attributes() {
    // Attributes on a preserved element must survive serialization.
    let html = r#"<table class="data" id="mytable"><tr><td>Data</td></tr></table>"#;
    let options = ConversionOptions {
        preserve_tags: vec!["table".to_string()],
        ..Default::default()
    };
    let result = convert_html(html, &options).unwrap();

    let expectations = [
        ("<table", "Should preserve table tag"),
        ("class=", "Should preserve class attribute"),
        ("id=", "Should preserve id attribute"),
        ("</table>", "Should have closing tag"),
    ];
    for (needle, message) in expectations {
        assert!(result.contains(needle), "{message}");
    }
}
|
|
4504
|
+
|
|
4505
|
+
#[test]
fn test_preserve_tags_multiple_tags() {
    // Several tag names can be preserved in the same conversion.
    let html = r#"<div><table><tr><td>Table</td></tr></table><form><input type="text"/></form><p>Text</p></div>"#;
    let options = ConversionOptions {
        preserve_tags: vec!["table".to_string(), "form".to_string()],
        ..Default::default()
    };
    let result = convert_html(html, &options).unwrap();

    let expectations = [
        ("<table>", "Should preserve table"),
        ("<form>", "Should preserve form"),
        ("Text", "Should convert paragraph"),
    ];
    for (needle, message) in expectations {
        assert!(result.contains(needle), "{message}");
    }
}
|
|
4516
|
+
|
|
4517
|
+
#[test]
fn test_preserve_tags_nested_content() {
    // Descendants of a preserved element are serialized too, not converted.
    let html = r#"<table><thead><tr><th>Header</th></tr></thead><tbody><tr><td>Data</td></tr></tbody></table>"#;
    let options = ConversionOptions {
        preserve_tags: vec!["table".to_string()],
        ..Default::default()
    };
    let result = convert_html(html, &options).unwrap();

    let expectations = [
        ("<thead>", "Should preserve nested thead"),
        ("<tbody>", "Should preserve nested tbody"),
        ("<th>", "Should preserve th tag"),
        ("Header", "Should preserve text content"),
    ];
    for (needle, message) in expectations {
        assert!(result.contains(needle), "{message}");
    }
}
|
|
4529
|
+
|
|
4530
|
+
#[test]
fn test_preserve_tags_empty_list() {
    // With the default (empty) preserve_tags list, raw table markup must not
    // appear in the output.
    let html = r#"<table><tr><td>Cell</td></tr></table>"#;
    let default_options = ConversionOptions::default();
    let result = convert_html(html, &default_options).unwrap();

    let kept_raw_table = result.contains("<table>");
    assert!(!kept_raw_table, "Should not preserve table without preserve_tags");
}
|
|
4542
|
+
|
|
4543
|
+
#[test]
fn test_preserve_tags_vs_strip_tags() {
    // preserve_tags and strip_tags act independently on different elements.
    let html = r#"<table><tr><td>Table</td></tr></table><div><span>Text</span></div>"#;
    let options = ConversionOptions {
        preserve_tags: vec!["table".to_string()],
        strip_tags: vec!["span".to_string()],
        ..Default::default()
    };
    let result = convert_html(html, &options).unwrap();

    let has_table = result.contains("<table>");
    let has_span = result.contains("<span>");
    let has_text = result.contains("Text");

    assert!(has_table, "Should preserve table");
    assert!(!has_span, "Should strip span tag");
    assert!(has_text, "Should keep span text content");
}
|
|
4401
4555
|
}
|
{html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/hocr/converter.rs
RENAMED
|
@@ -237,9 +237,22 @@ fn convert_element(
|
|
|
237
237
|
|
|
238
238
|
// Words - join with space
|
|
239
239
|
HocrElementType::OcrxWord => {
|
|
240
|
+
// Ensure space before this word if output doesn't end with whitespace or markdown formatting
|
|
241
|
+
if !output.is_empty()
|
|
242
|
+
&& !output.ends_with(' ')
|
|
243
|
+
&& !output.ends_with('\t')
|
|
244
|
+
&& !output.ends_with('\n')
|
|
245
|
+
&& !output.ends_with('*') // Don't add space after italic/bold markers
|
|
246
|
+
&& !output.ends_with('`') // Don't add space after code markers
|
|
247
|
+
&& !output.ends_with('_') // Don't add space after underline markers
|
|
248
|
+
&& !output.ends_with('[')
|
|
249
|
+
// Don't add space after opening bracket (link/image alt)
|
|
250
|
+
{
|
|
251
|
+
output.push(' ');
|
|
252
|
+
}
|
|
253
|
+
|
|
240
254
|
if !element.text.is_empty() {
|
|
241
255
|
output.push_str(&element.text);
|
|
242
|
-
output.push(' ');
|
|
243
256
|
}
|
|
244
257
|
}
|
|
245
258
|
|
{html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/hocr/extractor.rs
RENAMED
|
@@ -78,69 +78,58 @@ fn collect_hocr_elements(
|
|
|
78
78
|
}
|
|
79
79
|
}
|
|
80
80
|
|
|
81
|
-
/// Extract hOCR metadata from HTML head
|
|
81
|
+
/// Extract hOCR metadata from HTML head (or from orphaned meta tags after sanitization)
|
|
82
82
|
fn extract_metadata(dom: &tl::VDom) -> HocrMetadata {
|
|
83
83
|
let mut metadata = HocrMetadata::default();
|
|
84
84
|
let parser = dom.parser();
|
|
85
85
|
|
|
86
|
-
//
|
|
87
|
-
fn
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
86
|
+
// Helper function to extract metadata from a single meta tag
|
|
87
|
+
fn extract_from_meta_tag(meta_tag: &tl::HTMLTag, metadata: &mut HocrMetadata) {
|
|
88
|
+
let attrs = meta_tag.attributes();
|
|
89
|
+
if let (Some(name), Some(content)) = (attrs.get("name").flatten(), attrs.get("content").flatten()) {
|
|
90
|
+
let name_str = name.as_utf8_str();
|
|
91
|
+
let content_str = content.as_utf8_str().to_string();
|
|
92
|
+
|
|
93
|
+
match name_str.as_ref() {
|
|
94
|
+
"ocr-system" => metadata.ocr_system = Some(content_str),
|
|
95
|
+
"ocr-capabilities" => {
|
|
96
|
+
metadata.ocr_capabilities = content_str.split_whitespace().map(|s| s.to_string()).collect();
|
|
97
|
+
}
|
|
98
|
+
"ocr-number-of-pages" => {
|
|
99
|
+
metadata.ocr_number_of_pages = content_str.parse().ok();
|
|
100
|
+
}
|
|
101
|
+
"ocr-langs" => {
|
|
102
|
+
metadata.ocr_langs = content_str.split_whitespace().map(|s| s.to_string()).collect();
|
|
103
|
+
}
|
|
104
|
+
"ocr-scripts" => {
|
|
105
|
+
metadata.ocr_scripts = content_str.split_whitespace().map(|s| s.to_string()).collect();
|
|
106
|
+
}
|
|
107
|
+
_ => {}
|
|
108
|
+
}
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
// Recursively search for meta tags (either inside head or as orphans after sanitization)
|
|
113
|
+
fn find_meta_tags<'a>(node_handle: &tl::NodeHandle, parser: &'a tl::Parser<'a>, metadata: &mut HocrMetadata) {
|
|
92
114
|
if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
|
|
93
115
|
let tag_name = tag.name().as_utf8_str();
|
|
94
116
|
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
{
|
|
105
|
-
let name_str = name.as_utf8_str();
|
|
106
|
-
let content_str = content.as_utf8_str().to_string();
|
|
107
|
-
|
|
108
|
-
match name_str.as_ref() {
|
|
109
|
-
"ocr-system" => metadata.ocr_system = Some(content_str),
|
|
110
|
-
"ocr-capabilities" => {
|
|
111
|
-
metadata.ocr_capabilities =
|
|
112
|
-
content_str.split_whitespace().map(|s| s.to_string()).collect();
|
|
113
|
-
}
|
|
114
|
-
"ocr-number-of-pages" => {
|
|
115
|
-
metadata.ocr_number_of_pages = content_str.parse().ok();
|
|
116
|
-
}
|
|
117
|
-
"ocr-langs" => {
|
|
118
|
-
metadata.ocr_langs =
|
|
119
|
-
content_str.split_whitespace().map(|s| s.to_string()).collect();
|
|
120
|
-
}
|
|
121
|
-
"ocr-scripts" => {
|
|
122
|
-
metadata.ocr_scripts =
|
|
123
|
-
content_str.split_whitespace().map(|s| s.to_string()).collect();
|
|
124
|
-
}
|
|
125
|
-
_ => {}
|
|
126
|
-
}
|
|
127
|
-
}
|
|
128
|
-
}
|
|
129
|
-
}
|
|
130
|
-
}
|
|
131
|
-
} else {
|
|
132
|
-
// Keep searching in children
|
|
133
|
-
let children = tag.children();
|
|
134
|
-
for child_handle in children.top().iter() {
|
|
135
|
-
find_head_and_extract(child_handle, parser, metadata);
|
|
136
|
-
}
|
|
117
|
+
// Extract from meta tags directly (handles both meta inside head and orphaned meta)
|
|
118
|
+
if tag_name == "meta" {
|
|
119
|
+
extract_from_meta_tag(tag, metadata);
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
// Recursively search children
|
|
123
|
+
let children = tag.children();
|
|
124
|
+
for child_handle in children.top().iter() {
|
|
125
|
+
find_meta_tags(child_handle, parser, metadata);
|
|
137
126
|
}
|
|
138
127
|
}
|
|
139
128
|
}
|
|
140
129
|
|
|
141
130
|
// Search from root
|
|
142
131
|
for child_handle in dom.children().iter() {
|
|
143
|
-
|
|
132
|
+
find_meta_tags(child_handle, parser, &mut metadata);
|
|
144
133
|
}
|
|
145
134
|
|
|
146
135
|
metadata
|
|
@@ -49,7 +49,7 @@ pub fn convert(html: &str, options: Option<ConversionOptions>) -> Result<String>
|
|
|
49
49
|
let normalized_html = html.replace("\r\n", "\n").replace('\r', "\n");
|
|
50
50
|
|
|
51
51
|
let clean_html = if options.preprocessing.enabled {
|
|
52
|
-
sanitizer::sanitize(&normalized_html, &options.preprocessing)?
|
|
52
|
+
sanitizer::sanitize(&normalized_html, &options.preprocessing, &options.preserve_tags)?
|
|
53
53
|
} else {
|
|
54
54
|
normalized_html
|
|
55
55
|
};
|
|
@@ -86,7 +86,7 @@ pub fn convert_with_inline_images(
|
|
|
86
86
|
let normalized_html = html.replace("\r\n", "\n").replace('\r', "\n");
|
|
87
87
|
|
|
88
88
|
let clean_html = if options.preprocessing.enabled {
|
|
89
|
-
sanitizer::sanitize(&normalized_html, &options.preprocessing)?
|
|
89
|
+
sanitizer::sanitize(&normalized_html, &options.preprocessing, &options.preserve_tags)?
|
|
90
90
|
} else {
|
|
91
91
|
normalized_html
|
|
92
92
|
};
|
|
@@ -200,6 +200,10 @@ pub struct ConversionOptions {
|
|
|
200
200
|
|
|
201
201
|
/// List of HTML tags to strip (output only text content, no markdown conversion)
|
|
202
202
|
pub strip_tags: Vec<String>,
|
|
203
|
+
|
|
204
|
+
/// List of HTML tags to preserve as-is in the output (keep original HTML)
|
|
205
|
+
/// Useful for complex elements like tables that don't convert well to Markdown
|
|
206
|
+
pub preserve_tags: Vec<String>,
|
|
203
207
|
}
|
|
204
208
|
|
|
205
209
|
impl Default for ConversionOptions {
|
|
@@ -235,6 +239,7 @@ impl Default for ConversionOptions {
|
|
|
235
239
|
encoding: "utf-8".to_string(),
|
|
236
240
|
debug: false,
|
|
237
241
|
strip_tags: Vec::new(),
|
|
242
|
+
preserve_tags: Vec::new(),
|
|
238
243
|
}
|
|
239
244
|
}
|
|
240
245
|
}
|