html-to-markdown 2.4.0__tar.gz → 2.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

Files changed (54)
  1. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/Cargo.lock +5 -5
  2. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/Cargo.toml +2 -2
  3. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/PKG-INFO +1 -1
  4. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/converter.rs +177 -1
  5. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/html_to_markdown/__init__.py +1 -1
  6. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/html_to_markdown/bin/html-to-markdown +0 -0
  7. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/pyproject.toml +1 -1
  8. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/LICENSE +0 -0
  9. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/README_PYPI.md +0 -0
  10. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/Cargo.toml +0 -0
  11. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/README.md +0 -0
  12. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/benches/conversion_benchmark.rs +0 -0
  13. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/benches/micro_benchmark.rs +0 -0
  14. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/benches/profiling_benchmark.rs +0 -0
  15. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/examples/basic.rs +0 -0
  16. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/examples/table.rs +0 -0
  17. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/examples/test_escape.rs +0 -0
  18. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/examples/test_inline_formatting.rs +0 -0
  19. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/examples/test_lists.rs +0 -0
  20. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/examples/test_semantic_tags.rs +0 -0
  21. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/examples/test_tables.rs +0 -0
  22. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/examples/test_task_lists.rs +0 -0
  23. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/examples/test_whitespace.rs +0 -0
  24. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/error.rs +0 -0
  25. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/hocr/converter.rs +0 -0
  26. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/hocr/extractor.rs +0 -0
  27. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/hocr/mod.rs +0 -0
  28. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/hocr/parser.rs +0 -0
  29. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/hocr/spatial.rs +0 -0
  30. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/hocr/types.rs +0 -0
  31. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/inline_images.rs +0 -0
  32. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/lib.rs +0 -0
  33. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/options.rs +0 -0
  34. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/sanitizer.rs +0 -0
  35. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/text.rs +0 -0
  36. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/src/wrapper.rs +0 -0
  37. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/tests/commonmark_compliance_test.rs +0 -0
  38. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/tests/hocr_compliance_test.rs +0 -0
  39. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown/tests/integration_test.rs +0 -0
  40. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown-py/Cargo.toml +0 -0
  41. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown-py/README.md +0 -0
  42. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown-py/python/html_to_markdown/__init__.py +0 -0
  43. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown-py/python/html_to_markdown/_html_to_markdown.pyi +0 -0
  44. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown-py/src/lib.rs +0 -0
  45. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/crates/html-to-markdown-py/uv.lock +0 -0
  46. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/html_to_markdown/__main__.py +0 -0
  47. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/html_to_markdown/_rust.pyi +0 -0
  48. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/html_to_markdown/api.py +0 -0
  49. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/html_to_markdown/cli.py +0 -0
  50. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/html_to_markdown/cli_proxy.py +0 -0
  51. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/html_to_markdown/exceptions.py +0 -0
  52. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/html_to_markdown/options.py +0 -0
  53. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/html_to_markdown/py.typed +0 -0
  54. {html_to_markdown-2.4.0 → html_to_markdown-2.4.1}/html_to_markdown/v1_compat.py +0 -0
@@ -595,7 +595,7 @@ dependencies = [
595
595
 
596
596
  [[package]]
597
597
  name = "html-to-markdown-cli"
598
- version = "2.4.0"
598
+ version = "2.4.1"
599
599
  dependencies = [
600
600
  "assert_cmd",
601
601
  "clap",
@@ -609,7 +609,7 @@ dependencies = [
609
609
 
610
610
  [[package]]
611
611
  name = "html-to-markdown-node"
612
- version = "2.4.0"
612
+ version = "2.4.1"
613
613
  dependencies = [
614
614
  "html-to-markdown-rs",
615
615
  "mimalloc-rust",
@@ -620,7 +620,7 @@ dependencies = [
620
620
 
621
621
  [[package]]
622
622
  name = "html-to-markdown-py"
623
- version = "2.4.0"
623
+ version = "2.4.1"
624
624
  dependencies = [
625
625
  "base64",
626
626
  "html-to-markdown-rs",
@@ -630,7 +630,7 @@ dependencies = [
630
630
 
631
631
  [[package]]
632
632
  name = "html-to-markdown-rs"
633
- version = "2.4.0"
633
+ version = "2.4.1"
634
634
  dependencies = [
635
635
  "ammonia",
636
636
  "base64",
@@ -647,7 +647,7 @@ dependencies = [
647
647
 
648
648
  [[package]]
649
649
  name = "html-to-markdown-wasm"
650
- version = "2.4.0"
650
+ version = "2.4.1"
651
651
  dependencies = [
652
652
  "console_error_panic_hook",
653
653
  "getrandom",
@@ -3,7 +3,7 @@ resolver = "2"
3
3
  members = ["crates/html-to-markdown-py"]
4
4
 
5
5
  [workspace.package]
6
- version = "2.4.0"
6
+ version = "2.4.1"
7
7
  edition = "2021"
8
8
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
9
9
  license = "MIT"
@@ -15,7 +15,7 @@ rust-version = "1.80"
15
15
 
16
16
  [workspace.dependencies]
17
17
  # Core library
18
- html-to-markdown-rs = { version = "2.4.0", path = "crates/html-to-markdown" }
18
+ html-to-markdown-rs = { version = "2.4.1", path = "crates/html-to-markdown" }
19
19
 
20
20
  # HTML parsing and sanitization
21
21
  tl = "0.7"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: html-to-markdown
3
- Version: 2.4.0
3
+ Version: 2.4.1
4
4
  Classifier: Development Status :: 5 - Production/Stable
5
5
  Classifier: Environment :: Console
6
6
  Classifier: Intended Audience :: Developers
@@ -47,6 +47,9 @@ use std::collections::BTreeMap;
47
47
  #[cfg(feature = "inline-images")]
48
48
  use std::rc::Rc;
49
49
 
50
+ use std::borrow::Cow;
51
+ use std::str;
52
+
50
53
  use crate::error::Result;
51
54
  #[cfg(feature = "inline-images")]
52
55
  use crate::inline_images::{InlineImageCollector, InlineImageFormat, InlineImageSource};
@@ -971,7 +974,9 @@ fn convert_html_impl(
971
974
  .replace("<hr/>", "<hr>")
972
975
  .replace("<img/>", "<img>");
973
976
 
974
- let dom = tl::parse(&html, tl::ParserOptions::default())
977
+ let html = strip_script_and_style_sections(&html);
978
+
979
+ let dom = tl::parse(html.as_ref(), tl::ParserOptions::default())
975
980
  .map_err(|_| crate::error::ConversionError::ParseError("Failed to parse HTML".to_string()))?;
976
981
 
977
982
  let parser = dom.parser();
@@ -1075,6 +1080,161 @@ fn convert_html_impl(
1075
1080
  }
1076
1081
  }
1077
1082
 
1083
+ fn strip_script_and_style_sections(input: &str) -> Cow<'_, str> {
1084
+ const TAGS: [&[u8]; 2] = [b"script", b"style"];
1085
+ const SVG: &[u8] = b"svg";
1086
+
1087
+ let bytes = input.as_bytes();
1088
+ let len = bytes.len();
1089
+ let mut idx = 0;
1090
+ let mut last = 0;
1091
+ let mut output: Option<String> = None;
1092
+ let mut svg_depth = 0usize;
1093
+
1094
+ while idx < len {
1095
+ if bytes[idx] == b'<' {
1096
+ if matches_tag_start(bytes, idx + 1, SVG) {
1097
+ if let Some(open_end) = find_tag_end(bytes, idx + 1 + SVG.len()) {
1098
+ svg_depth += 1;
1099
+ idx = open_end;
1100
+ continue;
1101
+ }
1102
+ } else if matches_end_tag_start(bytes, idx + 1, SVG) {
1103
+ if let Some(close_end) = find_tag_end(bytes, idx + 2 + SVG.len()) {
1104
+ if svg_depth > 0 {
1105
+ svg_depth = svg_depth.saturating_sub(1);
1106
+ }
1107
+ idx = close_end;
1108
+ continue;
1109
+ }
1110
+ }
1111
+
1112
+ if svg_depth == 0 {
1113
+ let mut handled = false;
1114
+ for tag in TAGS {
1115
+ if matches_tag_start(bytes, idx + 1, tag) {
1116
+ if let Some(open_end) = find_tag_end(bytes, idx + 1 + tag.len()) {
1117
+ let remove_end = find_closing_tag(bytes, open_end, tag).unwrap_or(len);
1118
+ let out = output.get_or_insert_with(|| String::with_capacity(input.len()));
1119
+ out.push_str(&input[last..idx]);
1120
+ out.push_str(&input[idx..open_end]);
1121
+ out.push_str("</");
1122
+ out.push_str(str::from_utf8(tag).unwrap());
1123
+ out.push('>');
1124
+
1125
+ last = remove_end;
1126
+ idx = remove_end;
1127
+ handled = true;
1128
+ }
1129
+ }
1130
+
1131
+ if handled {
1132
+ break;
1133
+ }
1134
+ }
1135
+
1136
+ if handled {
1137
+ continue;
1138
+ }
1139
+ }
1140
+ }
1141
+
1142
+ idx += 1;
1143
+ }
1144
+
1145
+ if let Some(mut out) = output {
1146
+ if last < input.len() {
1147
+ out.push_str(&input[last..]);
1148
+ }
1149
+ Cow::Owned(out)
1150
+ } else {
1151
+ Cow::Borrowed(input)
1152
+ }
1153
+ }
1154
+
1155
+ fn matches_tag_start(bytes: &[u8], mut start: usize, tag: &[u8]) -> bool {
1156
+ if start >= bytes.len() {
1157
+ return false;
1158
+ }
1159
+
1160
+ if start + tag.len() > bytes.len() {
1161
+ return false;
1162
+ }
1163
+
1164
+ if !bytes[start..start + tag.len()].eq_ignore_ascii_case(tag) {
1165
+ return false;
1166
+ }
1167
+
1168
+ start += tag.len();
1169
+
1170
+ match bytes.get(start) {
1171
+ Some(b'>' | b'/' | b' ' | b'\t' | b'\n' | b'\r') => true,
1172
+ Some(_) => false,
1173
+ None => true,
1174
+ }
1175
+ }
1176
+
1177
+ fn find_tag_end(bytes: &[u8], mut idx: usize) -> Option<usize> {
1178
+ let len = bytes.len();
1179
+ let mut in_quote: Option<u8> = None;
1180
+
1181
+ while idx < len {
1182
+ match bytes[idx] {
1183
+ b'"' | b'\'' => {
1184
+ if let Some(current) = in_quote {
1185
+ if current == bytes[idx] {
1186
+ in_quote = None;
1187
+ }
1188
+ } else {
1189
+ in_quote = Some(bytes[idx]);
1190
+ }
1191
+ }
1192
+ b'>' if in_quote.is_none() => return Some(idx + 1),
1193
+ _ => {}
1194
+ }
1195
+ idx += 1;
1196
+ }
1197
+
1198
+ None
1199
+ }
1200
+
1201
+ fn find_closing_tag(bytes: &[u8], mut idx: usize, tag: &[u8]) -> Option<usize> {
1202
+ let len = bytes.len();
1203
+ let mut depth = 1usize;
1204
+
1205
+ while idx < len {
1206
+ if bytes[idx] == b'<' {
1207
+ if matches_tag_start(bytes, idx + 1, tag) {
1208
+ if let Some(next) = find_tag_end(bytes, idx + 1 + tag.len()) {
1209
+ depth += 1;
1210
+ idx = next;
1211
+ continue;
1212
+ }
1213
+ } else if matches_end_tag_start(bytes, idx + 1, tag) {
1214
+ if let Some(close) = find_tag_end(bytes, idx + 2 + tag.len()) {
1215
+ depth -= 1;
1216
+ if depth == 0 {
1217
+ return Some(close);
1218
+ }
1219
+ idx = close;
1220
+ continue;
1221
+ }
1222
+ }
1223
+ }
1224
+
1225
+ idx += 1;
1226
+ }
1227
+
1228
+ None
1229
+ }
1230
+
1231
+ fn matches_end_tag_start(bytes: &[u8], start: usize, tag: &[u8]) -> bool {
1232
+ if start >= bytes.len() || bytes[start] != b'/' {
1233
+ return false;
1234
+ }
1235
+ matches_tag_start(bytes, start + 1, tag)
1236
+ }
1237
+
1078
1238
  /// Check if an element is inline (not block-level).
1079
1239
  fn is_inline_element(tag_name: &str) -> bool {
1080
1240
  matches!(
@@ -4002,6 +4162,22 @@ mod tests {
4002
4162
  assert_eq!(calculate_list_continuation_indent(4), 7);
4003
4163
  }
4004
4164
 
4165
+ #[test]
4166
+ fn strips_script_sections_without_removing_following_content() {
4167
+ let input = "<div>before</div><script>1 < 2</script><p>after</p>";
4168
+ let stripped = strip_script_and_style_sections(input);
4169
+ assert_eq!(stripped, "<div>before</div><script></script><p>after</p>");
4170
+ }
4171
+
4172
+ #[test]
4173
+ fn strips_multiline_script_sections() {
4174
+ let input = "<html>\n<script>1 < 2</script>\nContent\n</html>";
4175
+ let stripped = strip_script_and_style_sections(input);
4176
+ assert!(stripped.contains("Content"));
4177
+ assert!(stripped.contains("<script"));
4178
+ assert!(!stripped.contains("1 < 2"));
4179
+ }
4180
+
4005
4181
  #[test]
4006
4182
  fn test_add_list_continuation_indent_blank_line() {
4007
4183
  let opts = ConversionOptions::default();
@@ -49,4 +49,4 @@ __all__ = [
49
49
  "markdownify",
50
50
  ]
51
51
 
52
- __version__ = "2.4.0"
52
+ __version__ = "2.4.1"
@@ -7,7 +7,7 @@ requires = [
7
7
 
8
8
  [project]
9
9
  name = "html-to-markdown"
10
- version = "2.4.0"
10
+ version = "2.4.1"
11
11
  description = "High-performance HTML to Markdown converter powered by Rust with a clean Python API"
12
12
  readme = "README_PYPI.md"
13
13
  keywords = [