html-to-markdown 2.6.0__tar.gz → 2.6.2__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

Files changed (54)
  1. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/Cargo.lock +6 -6
  2. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/Cargo.toml +2 -2
  3. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/PKG-INFO +1 -1
  4. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/benches/conversion_benchmark.rs +2 -1
  5. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/benches/micro_benchmark.rs +2 -1
  6. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/src/converter.rs +35 -14
  7. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/html_to_markdown/bin/html-to-markdown +0 -0
  8. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/pyproject.toml +1 -1
  9. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/LICENSE +0 -0
  10. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/README.md +0 -0
  11. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/Cargo.toml +0 -0
  12. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/README.md +0 -0
  13. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/benches/profiling_benchmark.rs +0 -0
  14. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/examples/basic.rs +0 -0
  15. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/examples/table.rs +0 -0
  16. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/examples/test_escape.rs +0 -0
  17. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/examples/test_inline_formatting.rs +0 -0
  18. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/examples/test_lists.rs +0 -0
  19. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/examples/test_semantic_tags.rs +0 -0
  20. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/examples/test_tables.rs +0 -0
  21. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/examples/test_task_lists.rs +0 -0
  22. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/examples/test_whitespace.rs +0 -0
  23. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/src/error.rs +0 -0
  24. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/src/hocr/converter.rs +0 -0
  25. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/src/hocr/extractor.rs +0 -0
  26. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/src/hocr/mod.rs +0 -0
  27. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/src/hocr/parser.rs +0 -0
  28. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/src/hocr/spatial.rs +0 -0
  29. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/src/hocr/types.rs +0 -0
  30. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/src/inline_images.rs +0 -0
  31. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/src/lib.rs +0 -0
  32. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/src/options.rs +0 -0
  33. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/src/sanitizer.rs +0 -0
  34. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/src/text.rs +0 -0
  35. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/src/wrapper.rs +0 -0
  36. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/tests/commonmark_compliance_test.rs +0 -0
  37. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/tests/hocr_compliance_test.rs +0 -0
  38. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/tests/integration_test.rs +0 -0
  39. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown-py/Cargo.toml +0 -0
  40. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown-py/README.md +0 -0
  41. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown-py/python/html_to_markdown/__init__.py +0 -0
  42. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown-py/python/html_to_markdown/_html_to_markdown.pyi +0 -0
  43. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown-py/src/lib.rs +0 -0
  44. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown-py/uv.lock +0 -0
  45. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/html_to_markdown/__init__.py +0 -0
  46. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/html_to_markdown/__main__.py +0 -0
  47. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/html_to_markdown/_rust.pyi +0 -0
  48. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/html_to_markdown/api.py +0 -0
  49. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/html_to_markdown/cli.py +0 -0
  50. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/html_to_markdown/cli_proxy.py +0 -0
  51. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/html_to_markdown/exceptions.py +0 -0
  52. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/html_to_markdown/options.py +0 -0
  53. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/html_to_markdown/py.typed +0 -0
  54. {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/html_to_markdown/v1_compat.py +0 -0
@@ -997,7 +997,7 @@ dependencies = [
997
997
 
998
998
  [[package]]
999
999
  name = "html-to-markdown-cli"
1000
- version = "2.6.0"
1000
+ version = "2.6.2"
1001
1001
  dependencies = [
1002
1002
  "assert_cmd",
1003
1003
  "clap",
@@ -1011,7 +1011,7 @@ dependencies = [
1011
1011
 
1012
1012
  [[package]]
1013
1013
  name = "html-to-markdown-node"
1014
- version = "2.6.0"
1014
+ version = "2.6.2"
1015
1015
  dependencies = [
1016
1016
  "html-to-markdown-rs",
1017
1017
  "mimalloc-rust",
@@ -1022,7 +1022,7 @@ dependencies = [
1022
1022
 
1023
1023
  [[package]]
1024
1024
  name = "html-to-markdown-php"
1025
- version = "2.6.0"
1025
+ version = "2.6.2"
1026
1026
  dependencies = [
1027
1027
  "ext-php-rs",
1028
1028
  "html-to-markdown-rs",
@@ -1030,7 +1030,7 @@ dependencies = [
1030
1030
 
1031
1031
  [[package]]
1032
1032
  name = "html-to-markdown-py"
1033
- version = "2.6.0"
1033
+ version = "2.6.2"
1034
1034
  dependencies = [
1035
1035
  "base64",
1036
1036
  "html-to-markdown-rs",
@@ -1049,7 +1049,7 @@ dependencies = [
1049
1049
 
1050
1050
  [[package]]
1051
1051
  name = "html-to-markdown-rs"
1052
- version = "2.6.0"
1052
+ version = "2.6.2"
1053
1053
  dependencies = [
1054
1054
  "ammonia",
1055
1055
  "base64",
@@ -1066,7 +1066,7 @@ dependencies = [
1066
1066
 
1067
1067
  [[package]]
1068
1068
  name = "html-to-markdown-wasm"
1069
- version = "2.6.0"
1069
+ version = "2.6.2"
1070
1070
  dependencies = [
1071
1071
  "console_error_panic_hook",
1072
1072
  "html-to-markdown-rs",
@@ -3,7 +3,7 @@ resolver = "2"
3
3
  members = ["crates/html-to-markdown-py"]
4
4
 
5
5
  [workspace.package]
6
- version = "2.6.0"
6
+ version = "2.6.2"
7
7
  edition = "2021"
8
8
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
9
9
  license = "MIT"
@@ -15,7 +15,7 @@ rust-version = "1.80"
15
15
 
16
16
  [workspace.dependencies]
17
17
  # Core library
18
- html-to-markdown-rs = { version = "2.6.0", path = "crates/html-to-markdown" }
18
+ html-to-markdown-rs = { version = "2.6.1", path = "crates/html-to-markdown" }
19
19
 
20
20
  # HTML parsing and sanitization
21
21
  tl = "0.7"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: html-to-markdown
3
- Version: 2.6.0
3
+ Version: 2.6.2
4
4
  Classifier: Development Status :: 5 - Production/Stable
5
5
  Classifier: Environment :: Console
6
6
  Classifier: Intended Audience :: Developers
@@ -3,8 +3,9 @@
3
3
  //! Measures throughput (ops/sec, MB/sec) and provides performance baselines
4
4
  //! for the core Rust conversion engine.
5
5
 
6
- use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
6
+ use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
7
7
  use html_to_markdown_rs::{convert, CodeBlockStyle, ConversionOptions, HeadingStyle};
8
+ use std::hint::black_box;
8
9
  use std::time::Duration;
9
10
 
10
11
  /// Generate HTML with varying complexity
@@ -1,7 +1,8 @@
1
1
  //! Micro-benchmarks for specific operations
2
2
 
3
- use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
3
+ use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
4
4
  use html_to_markdown_rs::{convert, ConversionOptions};
5
+ use std::hint::black_box;
5
6
 
6
7
  /// Benchmark text-heavy documents
7
8
  fn bench_text_operations(c: &mut Criterion) {
@@ -4522,7 +4522,7 @@ fn convert_table_row(
4522
4522
  options: &ConversionOptions,
4523
4523
  ctx: &Context,
4524
4524
  row_index: usize,
4525
- rowspan_tracker: &mut std::collections::HashMap<usize, usize>,
4525
+ rowspan_tracker: &mut std::collections::HashMap<usize, (String, usize)>,
4526
4526
  dom_ctx: &DomContext,
4527
4527
  ) {
4528
4528
  let mut row_text = String::with_capacity(256);
@@ -4546,8 +4546,10 @@ fn convert_table_row(
4546
4546
  let mut cell_iter = cells.iter();
4547
4547
 
4548
4548
  loop {
4549
- if let Some(remaining_rows) = rowspan_tracker.get_mut(&col_index) {
4549
+ if let Some((content, remaining_rows)) = rowspan_tracker.get_mut(&col_index) {
4550
4550
  if *remaining_rows > 0 {
4551
+ row_text.push(' ');
4552
+ row_text.push_str(content);
4551
4553
  row_text.push_str(" |");
4552
4554
  *remaining_rows -= 1;
4553
4555
  if *remaining_rows == 0 {
@@ -4559,12 +4561,21 @@ fn convert_table_row(
4559
4561
  }
4560
4562
 
4561
4563
  if let Some(cell_handle) = cell_iter.next() {
4564
+ let cell_start = row_text.len();
4562
4565
  convert_table_cell(cell_handle, parser, &mut row_text, options, ctx, "", dom_ctx);
4563
4566
 
4564
4567
  let (colspan, rowspan) = get_colspan_rowspan(cell_handle, parser);
4565
4568
 
4566
4569
  if rowspan > 1 {
4567
- rowspan_tracker.insert(col_index, rowspan - 1);
4570
+ // Extract the cell content that was just added (without separators)
4571
+ let cell_text = &row_text[cell_start..];
4572
+ // Strip leading space and trailing " |"
4573
+ let cell_content = cell_text
4574
+ .trim_start_matches(' ')
4575
+ .trim_end_matches(" |")
4576
+ .trim()
4577
+ .to_string();
4578
+ rowspan_tracker.insert(col_index, (cell_content, rowspan - 1));
4568
4579
  }
4569
4580
 
4570
4581
  col_index += colspan;
@@ -4920,8 +4931,10 @@ mod tests {
4920
4931
  #[test]
4921
4932
  fn test_preserve_tags_simple_table() {
4922
4933
  let html = r#"<div><table><tr><td>Cell 1</td><td>Cell 2</td></tr></table><p>Text</p></div>"#;
4923
- let mut options = ConversionOptions::default();
4924
- options.preserve_tags = vec!["table".to_string()];
4934
+ let options = ConversionOptions {
4935
+ preserve_tags: vec!["table".to_string()],
4936
+ ..Default::default()
4937
+ };
4925
4938
  let result = convert_html(html, &options).unwrap();
4926
4939
 
4927
4940
  assert!(result.contains("<table>"), "Should preserve table tag");
@@ -4934,8 +4947,10 @@ mod tests {
4934
4947
  #[test]
4935
4948
  fn test_preserve_tags_with_attributes() {
4936
4949
  let html = r#"<table class="data" id="mytable"><tr><td>Data</td></tr></table>"#;
4937
- let mut options = ConversionOptions::default();
4938
- options.preserve_tags = vec!["table".to_string()];
4950
+ let options = ConversionOptions {
4951
+ preserve_tags: vec!["table".to_string()],
4952
+ ..Default::default()
4953
+ };
4939
4954
  let result = convert_html(html, &options).unwrap();
4940
4955
 
4941
4956
  assert!(result.contains("<table"), "Should preserve table tag");
@@ -4947,8 +4962,10 @@ mod tests {
4947
4962
  #[test]
4948
4963
  fn test_preserve_tags_multiple_tags() {
4949
4964
  let html = r#"<div><table><tr><td>Table</td></tr></table><form><input type="text"/></form><p>Text</p></div>"#;
4950
- let mut options = ConversionOptions::default();
4951
- options.preserve_tags = vec!["table".to_string(), "form".to_string()];
4965
+ let options = ConversionOptions {
4966
+ preserve_tags: vec!["table".to_string(), "form".to_string()],
4967
+ ..Default::default()
4968
+ };
4952
4969
  let result = convert_html(html, &options).unwrap();
4953
4970
 
4954
4971
  assert!(result.contains("<table>"), "Should preserve table");
@@ -4959,8 +4976,10 @@ mod tests {
4959
4976
  #[test]
4960
4977
  fn test_preserve_tags_nested_content() {
4961
4978
  let html = r#"<table><thead><tr><th>Header</th></tr></thead><tbody><tr><td>Data</td></tr></tbody></table>"#;
4962
- let mut options = ConversionOptions::default();
4963
- options.preserve_tags = vec!["table".to_string()];
4979
+ let options = ConversionOptions {
4980
+ preserve_tags: vec!["table".to_string()],
4981
+ ..Default::default()
4982
+ };
4964
4983
  let result = convert_html(html, &options).unwrap();
4965
4984
 
4966
4985
  assert!(result.contains("<thead>"), "Should preserve nested thead");
@@ -4985,9 +5004,11 @@ mod tests {
4985
5004
  #[test]
4986
5005
  fn test_preserve_tags_vs_strip_tags() {
4987
5006
  let html = r#"<table><tr><td>Table</td></tr></table><div><span>Text</span></div>"#;
4988
- let mut options = ConversionOptions::default();
4989
- options.preserve_tags = vec!["table".to_string()];
4990
- options.strip_tags = vec!["span".to_string()];
5007
+ let options = ConversionOptions {
5008
+ preserve_tags: vec!["table".to_string()],
5009
+ strip_tags: vec!["span".to_string()],
5010
+ ..Default::default()
5011
+ };
4991
5012
  let result = convert_html(html, &options).unwrap();
4992
5013
 
4993
5014
  assert!(result.contains("<table>"), "Should preserve table");
@@ -7,7 +7,7 @@ requires = [
7
7
 
8
8
  [project]
9
9
  name = "html-to-markdown"
10
- version = "2.6.0"
10
+ version = "2.6.2"
11
11
  description = "High-performance HTML to Markdown converter powered by Rust with a clean Python API"
12
12
  readme = "README.md"
13
13
  keywords = [