html-to-markdown 2.4.0__tar.gz → 2.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/Cargo.lock +5 -5
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/Cargo.toml +2 -2
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/PKG-INFO +1 -1
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/converter.rs +177 -1
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/html_to_markdown/__init__.py +1 -1
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/html_to_markdown/bin/html-to-markdown +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/pyproject.toml +1 -1
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/LICENSE +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/README_PYPI.md +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/Cargo.toml +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/README.md +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/benches/conversion_benchmark.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/benches/micro_benchmark.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/benches/profiling_benchmark.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/examples/basic.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/examples/table.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/examples/test_escape.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/examples/test_inline_formatting.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/examples/test_lists.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/examples/test_semantic_tags.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/examples/test_tables.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/examples/test_task_lists.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/examples/test_whitespace.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/error.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/hocr/converter.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/hocr/extractor.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/hocr/mod.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/hocr/parser.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/hocr/spatial.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/hocr/types.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/inline_images.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/lib.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/options.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/sanitizer.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/text.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/wrapper.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/tests/commonmark_compliance_test.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/tests/hocr_compliance_test.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/tests/integration_test.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown-py/Cargo.toml +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown-py/README.md +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown-py/python/html_to_markdown/__init__.py +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown-py/python/html_to_markdown/_html_to_markdown.pyi +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown-py/src/lib.rs +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown-py/uv.lock +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/html_to_markdown/__main__.py +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/html_to_markdown/_rust.pyi +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/html_to_markdown/api.py +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/html_to_markdown/cli.py +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/html_to_markdown/cli_proxy.py +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/html_to_markdown/exceptions.py +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/html_to_markdown/options.py +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/html_to_markdown/py.typed +0 -0
- {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/html_to_markdown/v1_compat.py +0 -0
|
@@ -595,7 +595,7 @@ dependencies = [
|
|
|
595
595
|
|
|
596
596
|
[[package]]
|
|
597
597
|
name = "html-to-markdown-cli"
|
|
598
|
-
version = "2.4.0"
|
|
598
|
+
version = "2.4.1"
|
|
599
599
|
dependencies = [
|
|
600
600
|
"assert_cmd",
|
|
601
601
|
"clap",
|
|
@@ -609,7 +609,7 @@ dependencies = [
|
|
|
609
609
|
|
|
610
610
|
[[package]]
|
|
611
611
|
name = "html-to-markdown-node"
|
|
612
|
-
version = "2.4.0"
|
|
612
|
+
version = "2.4.1"
|
|
613
613
|
dependencies = [
|
|
614
614
|
"html-to-markdown-rs",
|
|
615
615
|
"mimalloc-rust",
|
|
@@ -620,7 +620,7 @@ dependencies = [
|
|
|
620
620
|
|
|
621
621
|
[[package]]
|
|
622
622
|
name = "html-to-markdown-py"
|
|
623
|
-
version = "2.4.0"
|
|
623
|
+
version = "2.4.1"
|
|
624
624
|
dependencies = [
|
|
625
625
|
"base64",
|
|
626
626
|
"html-to-markdown-rs",
|
|
@@ -630,7 +630,7 @@ dependencies = [
|
|
|
630
630
|
|
|
631
631
|
[[package]]
|
|
632
632
|
name = "html-to-markdown-rs"
|
|
633
|
-
version = "2.4.0"
|
|
633
|
+
version = "2.4.1"
|
|
634
634
|
dependencies = [
|
|
635
635
|
"ammonia",
|
|
636
636
|
"base64",
|
|
@@ -647,7 +647,7 @@ dependencies = [
|
|
|
647
647
|
|
|
648
648
|
[[package]]
|
|
649
649
|
name = "html-to-markdown-wasm"
|
|
650
|
-
version = "2.4.0"
|
|
650
|
+
version = "2.4.1"
|
|
651
651
|
dependencies = [
|
|
652
652
|
"console_error_panic_hook",
|
|
653
653
|
"getrandom",
|
|
@@ -3,7 +3,7 @@ resolver = "2"
|
|
|
3
3
|
members = ["crates/html-to-markdown-py"]
|
|
4
4
|
|
|
5
5
|
[workspace.package]
|
|
6
|
-
version = "2.4.0"
|
|
6
|
+
version = "2.4.1"
|
|
7
7
|
edition = "2021"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
9
9
|
license = "MIT"
|
|
@@ -15,7 +15,7 @@ rust-version = "1.80"
|
|
|
15
15
|
|
|
16
16
|
[workspace.dependencies]
|
|
17
17
|
# Core library
|
|
18
|
-
html-to-markdown-rs = { version = "2.4.0", path = "crates/html-to-markdown" }
|
|
18
|
+
html-to-markdown-rs = { version = "2.4.1", path = "crates/html-to-markdown" }
|
|
19
19
|
|
|
20
20
|
# HTML parsing and sanitization
|
|
21
21
|
tl = "0.7"
|
|
@@ -47,6 +47,9 @@ use std::collections::BTreeMap;
|
|
|
47
47
|
#[cfg(feature = "inline-images")]
|
|
48
48
|
use std::rc::Rc;
|
|
49
49
|
|
|
50
|
+
use std::borrow::Cow;
|
|
51
|
+
use std::str;
|
|
52
|
+
|
|
50
53
|
use crate::error::Result;
|
|
51
54
|
#[cfg(feature = "inline-images")]
|
|
52
55
|
use crate::inline_images::{InlineImageCollector, InlineImageFormat, InlineImageSource};
|
|
@@ -971,7 +974,9 @@ fn convert_html_impl(
|
|
|
971
974
|
.replace("<hr/>", "<hr>")
|
|
972
975
|
.replace("<img/>", "<img>");
|
|
973
976
|
|
|
974
|
-
let dom = tl::parse(&html, tl::ParserOptions::default())
|
|
977
|
+
let html = strip_script_and_style_sections(&html);
|
|
978
|
+
|
|
979
|
+
let dom = tl::parse(html.as_ref(), tl::ParserOptions::default())
|
|
975
980
|
.map_err(|_| crate::error::ConversionError::ParseError("Failed to parse HTML".to_string()))?;
|
|
976
981
|
|
|
977
982
|
let parser = dom.parser();
|
|
@@ -1075,6 +1080,161 @@ fn convert_html_impl(
|
|
|
1075
1080
|
}
|
|
1076
1081
|
}
|
|
1077
1082
|
|
|
1083
|
+
fn strip_script_and_style_sections(input: &str) -> Cow<'_, str> {
|
|
1084
|
+
const TAGS: [&[u8]; 2] = [b"script", b"style"];
|
|
1085
|
+
const SVG: &[u8] = b"svg";
|
|
1086
|
+
|
|
1087
|
+
let bytes = input.as_bytes();
|
|
1088
|
+
let len = bytes.len();
|
|
1089
|
+
let mut idx = 0;
|
|
1090
|
+
let mut last = 0;
|
|
1091
|
+
let mut output: Option<String> = None;
|
|
1092
|
+
let mut svg_depth = 0usize;
|
|
1093
|
+
|
|
1094
|
+
while idx < len {
|
|
1095
|
+
if bytes[idx] == b'<' {
|
|
1096
|
+
if matches_tag_start(bytes, idx + 1, SVG) {
|
|
1097
|
+
if let Some(open_end) = find_tag_end(bytes, idx + 1 + SVG.len()) {
|
|
1098
|
+
svg_depth += 1;
|
|
1099
|
+
idx = open_end;
|
|
1100
|
+
continue;
|
|
1101
|
+
}
|
|
1102
|
+
} else if matches_end_tag_start(bytes, idx + 1, SVG) {
|
|
1103
|
+
if let Some(close_end) = find_tag_end(bytes, idx + 2 + SVG.len()) {
|
|
1104
|
+
if svg_depth > 0 {
|
|
1105
|
+
svg_depth = svg_depth.saturating_sub(1);
|
|
1106
|
+
}
|
|
1107
|
+
idx = close_end;
|
|
1108
|
+
continue;
|
|
1109
|
+
}
|
|
1110
|
+
}
|
|
1111
|
+
|
|
1112
|
+
if svg_depth == 0 {
|
|
1113
|
+
let mut handled = false;
|
|
1114
|
+
for tag in TAGS {
|
|
1115
|
+
if matches_tag_start(bytes, idx + 1, tag) {
|
|
1116
|
+
if let Some(open_end) = find_tag_end(bytes, idx + 1 + tag.len()) {
|
|
1117
|
+
let remove_end = find_closing_tag(bytes, open_end, tag).unwrap_or(len);
|
|
1118
|
+
let out = output.get_or_insert_with(|| String::with_capacity(input.len()));
|
|
1119
|
+
out.push_str(&input[last..idx]);
|
|
1120
|
+
out.push_str(&input[idx..open_end]);
|
|
1121
|
+
out.push_str("</");
|
|
1122
|
+
out.push_str(str::from_utf8(tag).unwrap());
|
|
1123
|
+
out.push('>');
|
|
1124
|
+
|
|
1125
|
+
last = remove_end;
|
|
1126
|
+
idx = remove_end;
|
|
1127
|
+
handled = true;
|
|
1128
|
+
}
|
|
1129
|
+
}
|
|
1130
|
+
|
|
1131
|
+
if handled {
|
|
1132
|
+
break;
|
|
1133
|
+
}
|
|
1134
|
+
}
|
|
1135
|
+
|
|
1136
|
+
if handled {
|
|
1137
|
+
continue;
|
|
1138
|
+
}
|
|
1139
|
+
}
|
|
1140
|
+
}
|
|
1141
|
+
|
|
1142
|
+
idx += 1;
|
|
1143
|
+
}
|
|
1144
|
+
|
|
1145
|
+
if let Some(mut out) = output {
|
|
1146
|
+
if last < input.len() {
|
|
1147
|
+
out.push_str(&input[last..]);
|
|
1148
|
+
}
|
|
1149
|
+
Cow::Owned(out)
|
|
1150
|
+
} else {
|
|
1151
|
+
Cow::Borrowed(input)
|
|
1152
|
+
}
|
|
1153
|
+
}
|
|
1154
|
+
|
|
1155
|
+
fn matches_tag_start(bytes: &[u8], mut start: usize, tag: &[u8]) -> bool {
|
|
1156
|
+
if start >= bytes.len() {
|
|
1157
|
+
return false;
|
|
1158
|
+
}
|
|
1159
|
+
|
|
1160
|
+
if start + tag.len() > bytes.len() {
|
|
1161
|
+
return false;
|
|
1162
|
+
}
|
|
1163
|
+
|
|
1164
|
+
if !bytes[start..start + tag.len()].eq_ignore_ascii_case(tag) {
|
|
1165
|
+
return false;
|
|
1166
|
+
}
|
|
1167
|
+
|
|
1168
|
+
start += tag.len();
|
|
1169
|
+
|
|
1170
|
+
match bytes.get(start) {
|
|
1171
|
+
Some(b'>' | b'/' | b' ' | b'\t' | b'\n' | b'\r') => true,
|
|
1172
|
+
Some(_) => false,
|
|
1173
|
+
None => true,
|
|
1174
|
+
}
|
|
1175
|
+
}
|
|
1176
|
+
|
|
1177
|
+
fn find_tag_end(bytes: &[u8], mut idx: usize) -> Option<usize> {
|
|
1178
|
+
let len = bytes.len();
|
|
1179
|
+
let mut in_quote: Option<u8> = None;
|
|
1180
|
+
|
|
1181
|
+
while idx < len {
|
|
1182
|
+
match bytes[idx] {
|
|
1183
|
+
b'"' | b'\'' => {
|
|
1184
|
+
if let Some(current) = in_quote {
|
|
1185
|
+
if current == bytes[idx] {
|
|
1186
|
+
in_quote = None;
|
|
1187
|
+
}
|
|
1188
|
+
} else {
|
|
1189
|
+
in_quote = Some(bytes[idx]);
|
|
1190
|
+
}
|
|
1191
|
+
}
|
|
1192
|
+
b'>' if in_quote.is_none() => return Some(idx + 1),
|
|
1193
|
+
_ => {}
|
|
1194
|
+
}
|
|
1195
|
+
idx += 1;
|
|
1196
|
+
}
|
|
1197
|
+
|
|
1198
|
+
None
|
|
1199
|
+
}
|
|
1200
|
+
|
|
1201
|
+
fn find_closing_tag(bytes: &[u8], mut idx: usize, tag: &[u8]) -> Option<usize> {
|
|
1202
|
+
let len = bytes.len();
|
|
1203
|
+
let mut depth = 1usize;
|
|
1204
|
+
|
|
1205
|
+
while idx < len {
|
|
1206
|
+
if bytes[idx] == b'<' {
|
|
1207
|
+
if matches_tag_start(bytes, idx + 1, tag) {
|
|
1208
|
+
if let Some(next) = find_tag_end(bytes, idx + 1 + tag.len()) {
|
|
1209
|
+
depth += 1;
|
|
1210
|
+
idx = next;
|
|
1211
|
+
continue;
|
|
1212
|
+
}
|
|
1213
|
+
} else if matches_end_tag_start(bytes, idx + 1, tag) {
|
|
1214
|
+
if let Some(close) = find_tag_end(bytes, idx + 2 + tag.len()) {
|
|
1215
|
+
depth -= 1;
|
|
1216
|
+
if depth == 0 {
|
|
1217
|
+
return Some(close);
|
|
1218
|
+
}
|
|
1219
|
+
idx = close;
|
|
1220
|
+
continue;
|
|
1221
|
+
}
|
|
1222
|
+
}
|
|
1223
|
+
}
|
|
1224
|
+
|
|
1225
|
+
idx += 1;
|
|
1226
|
+
}
|
|
1227
|
+
|
|
1228
|
+
None
|
|
1229
|
+
}
|
|
1230
|
+
|
|
1231
|
+
fn matches_end_tag_start(bytes: &[u8], start: usize, tag: &[u8]) -> bool {
|
|
1232
|
+
if start >= bytes.len() || bytes[start] != b'/' {
|
|
1233
|
+
return false;
|
|
1234
|
+
}
|
|
1235
|
+
matches_tag_start(bytes, start + 1, tag)
|
|
1236
|
+
}
|
|
1237
|
+
|
|
1078
1238
|
/// Check if an element is inline (not block-level).
|
|
1079
1239
|
fn is_inline_element(tag_name: &str) -> bool {
|
|
1080
1240
|
matches!(
|
|
@@ -4002,6 +4162,22 @@ mod tests {
|
|
|
4002
4162
|
assert_eq!(calculate_list_continuation_indent(4), 7);
|
|
4003
4163
|
}
|
|
4004
4164
|
|
|
4165
|
+
#[test]
|
|
4166
|
+
fn strips_script_sections_without_removing_following_content() {
|
|
4167
|
+
let input = "<div>before</div><script>1 < 2</script><p>after</p>";
|
|
4168
|
+
let stripped = strip_script_and_style_sections(input);
|
|
4169
|
+
assert_eq!(stripped, "<div>before</div><script></script><p>after</p>");
|
|
4170
|
+
}
|
|
4171
|
+
|
|
4172
|
+
#[test]
|
|
4173
|
+
fn strips_multiline_script_sections() {
|
|
4174
|
+
let input = "<html>\n<script>1 < 2</script>\nContent\n</html>";
|
|
4175
|
+
let stripped = strip_script_and_style_sections(input);
|
|
4176
|
+
assert!(stripped.contains("Content"));
|
|
4177
|
+
assert!(stripped.contains("<script"));
|
|
4178
|
+
assert!(!stripped.contains("1 < 2"));
|
|
4179
|
+
}
|
|
4180
|
+
|
|
4005
4181
|
#[test]
|
|
4006
4182
|
fn test_add_list_continuation_indent_blank_line() {
|
|
4007
4183
|
let opts = ConversionOptions::default();
|
|
Binary file
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/benches/micro_benchmark.rs
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/examples/test_escape.rs
RENAMED
|
File without changes
|
|
File without changes
|
{html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/examples/test_lists.rs
RENAMED
|
File without changes
|
|
File without changes
|
{html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/examples/test_tables.rs
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/hocr/converter.rs
RENAMED
|
File without changes
|
{html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/hocr/extractor.rs
RENAMED
|
File without changes
|
|
File without changes
|
{html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/hocr/parser.rs
RENAMED
|
File without changes
|
{html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/hocr/spatial.rs
RENAMED
|
File without changes
|
|
File without changes
|
{html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/inline_images.rs
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/tests/integration_test.rs
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|