html-to-markdown 2.6.0__tar.gz → 2.6.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/Cargo.lock +6 -6
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/Cargo.toml +2 -2
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/PKG-INFO +1 -1
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/benches/conversion_benchmark.rs +2 -1
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/benches/micro_benchmark.rs +2 -1
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/src/converter.rs +35 -14
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/html_to_markdown/bin/html-to-markdown +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/pyproject.toml +1 -1
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/LICENSE +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/README.md +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/Cargo.toml +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/README.md +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/benches/profiling_benchmark.rs +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/examples/basic.rs +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/examples/table.rs +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/examples/test_escape.rs +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/examples/test_inline_formatting.rs +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/examples/test_lists.rs +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/examples/test_semantic_tags.rs +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/examples/test_tables.rs +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/examples/test_task_lists.rs +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/examples/test_whitespace.rs +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/src/error.rs +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/src/hocr/converter.rs +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/src/hocr/extractor.rs +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/src/hocr/mod.rs +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/src/hocr/parser.rs +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/src/hocr/spatial.rs +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/src/hocr/types.rs +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/src/inline_images.rs +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/src/lib.rs +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/src/options.rs +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/src/sanitizer.rs +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/src/text.rs +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/src/wrapper.rs +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/tests/commonmark_compliance_test.rs +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/tests/hocr_compliance_test.rs +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/tests/integration_test.rs +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown-py/Cargo.toml +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown-py/README.md +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown-py/python/html_to_markdown/__init__.py +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown-py/python/html_to_markdown/_html_to_markdown.pyi +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown-py/src/lib.rs +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown-py/uv.lock +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/html_to_markdown/__init__.py +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/html_to_markdown/__main__.py +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/html_to_markdown/_rust.pyi +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/html_to_markdown/api.py +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/html_to_markdown/cli.py +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/html_to_markdown/cli_proxy.py +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/html_to_markdown/exceptions.py +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/html_to_markdown/options.py +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/html_to_markdown/py.typed +0 -0
- {html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/html_to_markdown/v1_compat.py +0 -0
|
@@ -997,7 +997,7 @@ dependencies = [
|
|
|
997
997
|
|
|
998
998
|
[[package]]
|
|
999
999
|
name = "html-to-markdown-cli"
|
|
1000
|
-
version = "2.6.0"
|
|
1000
|
+
version = "2.6.2"
|
|
1001
1001
|
dependencies = [
|
|
1002
1002
|
"assert_cmd",
|
|
1003
1003
|
"clap",
|
|
@@ -1011,7 +1011,7 @@ dependencies = [
|
|
|
1011
1011
|
|
|
1012
1012
|
[[package]]
|
|
1013
1013
|
name = "html-to-markdown-node"
|
|
1014
|
-
version = "2.6.0"
|
|
1014
|
+
version = "2.6.2"
|
|
1015
1015
|
dependencies = [
|
|
1016
1016
|
"html-to-markdown-rs",
|
|
1017
1017
|
"mimalloc-rust",
|
|
@@ -1022,7 +1022,7 @@ dependencies = [
|
|
|
1022
1022
|
|
|
1023
1023
|
[[package]]
|
|
1024
1024
|
name = "html-to-markdown-php"
|
|
1025
|
-
version = "2.6.0"
|
|
1025
|
+
version = "2.6.2"
|
|
1026
1026
|
dependencies = [
|
|
1027
1027
|
"ext-php-rs",
|
|
1028
1028
|
"html-to-markdown-rs",
|
|
@@ -1030,7 +1030,7 @@ dependencies = [
|
|
|
1030
1030
|
|
|
1031
1031
|
[[package]]
|
|
1032
1032
|
name = "html-to-markdown-py"
|
|
1033
|
-
version = "2.6.0"
|
|
1033
|
+
version = "2.6.2"
|
|
1034
1034
|
dependencies = [
|
|
1035
1035
|
"base64",
|
|
1036
1036
|
"html-to-markdown-rs",
|
|
@@ -1049,7 +1049,7 @@ dependencies = [
|
|
|
1049
1049
|
|
|
1050
1050
|
[[package]]
|
|
1051
1051
|
name = "html-to-markdown-rs"
|
|
1052
|
-
version = "2.6.0"
|
|
1052
|
+
version = "2.6.2"
|
|
1053
1053
|
dependencies = [
|
|
1054
1054
|
"ammonia",
|
|
1055
1055
|
"base64",
|
|
@@ -1066,7 +1066,7 @@ dependencies = [
|
|
|
1066
1066
|
|
|
1067
1067
|
[[package]]
|
|
1068
1068
|
name = "html-to-markdown-wasm"
|
|
1069
|
-
version = "2.6.0"
|
|
1069
|
+
version = "2.6.2"
|
|
1070
1070
|
dependencies = [
|
|
1071
1071
|
"console_error_panic_hook",
|
|
1072
1072
|
"html-to-markdown-rs",
|
|
@@ -3,7 +3,7 @@ resolver = "2"
|
|
|
3
3
|
members = ["crates/html-to-markdown-py"]
|
|
4
4
|
|
|
5
5
|
[workspace.package]
|
|
6
|
-
version = "2.6.0"
|
|
6
|
+
version = "2.6.2"
|
|
7
7
|
edition = "2021"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
9
9
|
license = "MIT"
|
|
@@ -15,7 +15,7 @@ rust-version = "1.80"
|
|
|
15
15
|
|
|
16
16
|
[workspace.dependencies]
|
|
17
17
|
# Core library
|
|
18
|
-
html-to-markdown-rs = { version = "2.6.0", path = "crates/html-to-markdown" }
|
|
18
|
+
html-to-markdown-rs = { version = "2.6.1", path = "crates/html-to-markdown" }
|
|
19
19
|
|
|
20
20
|
# HTML parsing and sanitization
|
|
21
21
|
tl = "0.7"
|
|
@@ -3,8 +3,9 @@
|
|
|
3
3
|
//! Measures throughput (ops/sec, MB/sec) and provides performance baselines
|
|
4
4
|
//! for the core Rust conversion engine.
|
|
5
5
|
|
|
6
|
-
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
|
|
6
|
+
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
|
|
7
7
|
use html_to_markdown_rs::{convert, CodeBlockStyle, ConversionOptions, HeadingStyle};
|
|
8
|
+
use std::hint::black_box;
|
|
8
9
|
use std::time::Duration;
|
|
9
10
|
|
|
10
11
|
/// Generate HTML with varying complexity
|
{html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/benches/micro_benchmark.rs
RENAMED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
//! Micro-benchmarks for specific operations
|
|
2
2
|
|
|
3
|
-
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
|
|
3
|
+
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
|
|
4
4
|
use html_to_markdown_rs::{convert, ConversionOptions};
|
|
5
|
+
use std::hint::black_box;
|
|
5
6
|
|
|
6
7
|
/// Benchmark text-heavy documents
|
|
7
8
|
fn bench_text_operations(c: &mut Criterion) {
|
|
@@ -4522,7 +4522,7 @@ fn convert_table_row(
|
|
|
4522
4522
|
options: &ConversionOptions,
|
|
4523
4523
|
ctx: &Context,
|
|
4524
4524
|
row_index: usize,
|
|
4525
|
-
rowspan_tracker: &mut std::collections::HashMap<usize, usize>,
|
|
4525
|
+
rowspan_tracker: &mut std::collections::HashMap<usize, (String, usize)>,
|
|
4526
4526
|
dom_ctx: &DomContext,
|
|
4527
4527
|
) {
|
|
4528
4528
|
let mut row_text = String::with_capacity(256);
|
|
@@ -4546,8 +4546,10 @@ fn convert_table_row(
|
|
|
4546
4546
|
let mut cell_iter = cells.iter();
|
|
4547
4547
|
|
|
4548
4548
|
loop {
|
|
4549
|
-
if let Some(remaining_rows) = rowspan_tracker.get_mut(&col_index) {
|
|
4549
|
+
if let Some((content, remaining_rows)) = rowspan_tracker.get_mut(&col_index) {
|
|
4550
4550
|
if *remaining_rows > 0 {
|
|
4551
|
+
row_text.push(' ');
|
|
4552
|
+
row_text.push_str(content);
|
|
4551
4553
|
row_text.push_str(" |");
|
|
4552
4554
|
*remaining_rows -= 1;
|
|
4553
4555
|
if *remaining_rows == 0 {
|
|
@@ -4559,12 +4561,21 @@ fn convert_table_row(
|
|
|
4559
4561
|
}
|
|
4560
4562
|
|
|
4561
4563
|
if let Some(cell_handle) = cell_iter.next() {
|
|
4564
|
+
let cell_start = row_text.len();
|
|
4562
4565
|
convert_table_cell(cell_handle, parser, &mut row_text, options, ctx, "", dom_ctx);
|
|
4563
4566
|
|
|
4564
4567
|
let (colspan, rowspan) = get_colspan_rowspan(cell_handle, parser);
|
|
4565
4568
|
|
|
4566
4569
|
if rowspan > 1 {
|
|
4567
|
-
|
|
4570
|
+
// Extract the cell content that was just added (without separators)
|
|
4571
|
+
let cell_text = &row_text[cell_start..];
|
|
4572
|
+
// Strip leading space and trailing " |"
|
|
4573
|
+
let cell_content = cell_text
|
|
4574
|
+
.trim_start_matches(' ')
|
|
4575
|
+
.trim_end_matches(" |")
|
|
4576
|
+
.trim()
|
|
4577
|
+
.to_string();
|
|
4578
|
+
rowspan_tracker.insert(col_index, (cell_content, rowspan - 1));
|
|
4568
4579
|
}
|
|
4569
4580
|
|
|
4570
4581
|
col_index += colspan;
|
|
@@ -4920,8 +4931,10 @@ mod tests {
|
|
|
4920
4931
|
#[test]
|
|
4921
4932
|
fn test_preserve_tags_simple_table() {
|
|
4922
4933
|
let html = r#"<div><table><tr><td>Cell 1</td><td>Cell 2</td></tr></table><p>Text</p></div>"#;
|
|
4923
|
-
let
|
|
4924
|
-
|
|
4934
|
+
let options = ConversionOptions {
|
|
4935
|
+
preserve_tags: vec!["table".to_string()],
|
|
4936
|
+
..Default::default()
|
|
4937
|
+
};
|
|
4925
4938
|
let result = convert_html(html, &options).unwrap();
|
|
4926
4939
|
|
|
4927
4940
|
assert!(result.contains("<table>"), "Should preserve table tag");
|
|
@@ -4934,8 +4947,10 @@ mod tests {
|
|
|
4934
4947
|
#[test]
|
|
4935
4948
|
fn test_preserve_tags_with_attributes() {
|
|
4936
4949
|
let html = r#"<table class="data" id="mytable"><tr><td>Data</td></tr></table>"#;
|
|
4937
|
-
let
|
|
4938
|
-
|
|
4950
|
+
let options = ConversionOptions {
|
|
4951
|
+
preserve_tags: vec!["table".to_string()],
|
|
4952
|
+
..Default::default()
|
|
4953
|
+
};
|
|
4939
4954
|
let result = convert_html(html, &options).unwrap();
|
|
4940
4955
|
|
|
4941
4956
|
assert!(result.contains("<table"), "Should preserve table tag");
|
|
@@ -4947,8 +4962,10 @@ mod tests {
|
|
|
4947
4962
|
#[test]
|
|
4948
4963
|
fn test_preserve_tags_multiple_tags() {
|
|
4949
4964
|
let html = r#"<div><table><tr><td>Table</td></tr></table><form><input type="text"/></form><p>Text</p></div>"#;
|
|
4950
|
-
let
|
|
4951
|
-
|
|
4965
|
+
let options = ConversionOptions {
|
|
4966
|
+
preserve_tags: vec!["table".to_string(), "form".to_string()],
|
|
4967
|
+
..Default::default()
|
|
4968
|
+
};
|
|
4952
4969
|
let result = convert_html(html, &options).unwrap();
|
|
4953
4970
|
|
|
4954
4971
|
assert!(result.contains("<table>"), "Should preserve table");
|
|
@@ -4959,8 +4976,10 @@ mod tests {
|
|
|
4959
4976
|
#[test]
|
|
4960
4977
|
fn test_preserve_tags_nested_content() {
|
|
4961
4978
|
let html = r#"<table><thead><tr><th>Header</th></tr></thead><tbody><tr><td>Data</td></tr></tbody></table>"#;
|
|
4962
|
-
let
|
|
4963
|
-
|
|
4979
|
+
let options = ConversionOptions {
|
|
4980
|
+
preserve_tags: vec!["table".to_string()],
|
|
4981
|
+
..Default::default()
|
|
4982
|
+
};
|
|
4964
4983
|
let result = convert_html(html, &options).unwrap();
|
|
4965
4984
|
|
|
4966
4985
|
assert!(result.contains("<thead>"), "Should preserve nested thead");
|
|
@@ -4985,9 +5004,11 @@ mod tests {
|
|
|
4985
5004
|
#[test]
|
|
4986
5005
|
fn test_preserve_tags_vs_strip_tags() {
|
|
4987
5006
|
let html = r#"<table><tr><td>Table</td></tr></table><div><span>Text</span></div>"#;
|
|
4988
|
-
let
|
|
4989
|
-
|
|
4990
|
-
|
|
5007
|
+
let options = ConversionOptions {
|
|
5008
|
+
preserve_tags: vec!["table".to_string()],
|
|
5009
|
+
strip_tags: vec!["span".to_string()],
|
|
5010
|
+
..Default::default()
|
|
5011
|
+
};
|
|
4991
5012
|
let result = convert_html(html, &options).unwrap();
|
|
4992
5013
|
|
|
4993
5014
|
assert!(result.contains("<table>"), "Should preserve table");
|
|
Binary file
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/examples/test_escape.rs
RENAMED
|
File without changes
|
|
File without changes
|
{html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/examples/test_lists.rs
RENAMED
|
File without changes
|
|
File without changes
|
{html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/examples/test_tables.rs
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/src/hocr/converter.rs
RENAMED
|
File without changes
|
{html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/src/hocr/extractor.rs
RENAMED
|
File without changes
|
|
File without changes
|
{html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/src/hocr/parser.rs
RENAMED
|
File without changes
|
{html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/src/hocr/spatial.rs
RENAMED
|
File without changes
|
|
File without changes
|
{html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/src/inline_images.rs
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{html_to_markdown-2.6.0 → html_to_markdown-2.6.2}/crates/html-to-markdown/tests/integration_test.rs
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|