html-to-markdown 2.30.0 → 3.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +6 -19
- data/README.md +37 -50
- data/ext/html-to-markdown-rb/native/Cargo.lock +13 -701
- data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
- data/ext/html-to-markdown-rb/native/README.md +4 -13
- data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
- data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
- data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
- data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
- data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +13 -194
- data/sig/html_to_markdown.rbs +12 -373
- data/vendor/Cargo.toml +6 -3
- data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
- data/vendor/html-to-markdown-rs/README.md +126 -52
- data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
- data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
- data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
- data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
- data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
- data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
- data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
- data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
- data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
- data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
- data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
- data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
- data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
- data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
- data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
- data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
- data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
- data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
- data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
- data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
- data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
- data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -68
- data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
- data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
- data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
- data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
- data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
- data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
- data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
- data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
- data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
- data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
- data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
- data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
- data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
- data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
- data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
- data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
- data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
- data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
- data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
- data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
- data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
- data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
- data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -323
- data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
- data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
- data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
- data/vendor/html-to-markdown-rs/src/text.rs +25 -14
- data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
- data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
- data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
- data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
- data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
- data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
- data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
- data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
- data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
- data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
- data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
- data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
- data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
- data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
- data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
- data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
- data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
- data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
- data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
- data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
- data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
- metadata +9 -37
- data/bin/benchmark.rb +0 -232
- data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
- data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
- data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
- data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
- data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
- data/spec/convert_spec.rb +0 -77
- data/spec/convert_with_tables_spec.rb +0 -194
- data/spec/metadata_extraction_spec.rb +0 -437
- data/spec/visitor_issue_187_spec.rb +0 -605
- data/spec/visitor_spec.rb +0 -1149
- data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
- data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
- data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
- data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
- data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
- data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
- data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
- data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
- data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
- data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -42
- data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
- data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
- data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
- data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
- data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
- data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
- data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
- data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
- data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
- data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
- data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "html-to-markdown-rs"
|
|
3
|
-
version = "
|
|
3
|
+
version = "3.0.1"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
6
6
|
license = "MIT"
|
|
@@ -14,25 +14,22 @@ keywords = ["html", "markdown", "converter", "astral-tl", "doc-processing"]
|
|
|
14
14
|
categories = ["parsing", "text-processing", "web-programming"]
|
|
15
15
|
|
|
16
16
|
[package.metadata.cargo-machete]
|
|
17
|
-
ignored = ["once_cell", "
|
|
17
|
+
ignored = ["once_cell", "ahash"]
|
|
18
18
|
|
|
19
19
|
[lib]
|
|
20
20
|
crate-type = ["rlib"]
|
|
21
21
|
|
|
22
22
|
[features]
|
|
23
23
|
default = ["metadata"]
|
|
24
|
-
full = ["inline-images", "metadata", "visitor", "
|
|
24
|
+
full = ["inline-images", "metadata", "visitor", "serde"]
|
|
25
25
|
inline-images = ["dep:image"]
|
|
26
26
|
metadata = ["dep:serde", "dep:serde_json"]
|
|
27
27
|
visitor = []
|
|
28
|
-
async-visitor = ["visitor", "dep:async-trait", "dep:futures", "dep:tokio"]
|
|
29
28
|
serde = ["dep:serde", "dep:serde_json"]
|
|
30
29
|
|
|
31
30
|
[dependencies]
|
|
32
31
|
ahash = { version = "0.8", features = ["std", "compile-time-rng"], default-features = false }
|
|
33
|
-
async-trait = { version = "0.1", optional = true }
|
|
34
32
|
base64 = "0.22"
|
|
35
|
-
futures = { version = "0.3", optional = true }
|
|
36
33
|
html-escape = "0.2.13"
|
|
37
34
|
html5ever = "0.39.0"
|
|
38
35
|
image = { version = "0.25", default-features = false, features = [
|
|
@@ -43,16 +40,13 @@ image = { version = "0.25", default-features = false, features = [
|
|
|
43
40
|
"webp",
|
|
44
41
|
], optional = true }
|
|
45
42
|
lru = "0.16"
|
|
43
|
+
memchr = "2"
|
|
46
44
|
once_cell = "1.21"
|
|
47
45
|
regex = "1.12"
|
|
48
46
|
serde = { version = "1.0", features = ["derive"], optional = true }
|
|
49
47
|
serde_json = { version = "1.0", optional = true }
|
|
50
48
|
thiserror = "2.0"
|
|
51
49
|
tl = { package = "astral-tl", version = "0.7.11" }
|
|
52
|
-
tokio = { version = "1.50", features = [
|
|
53
|
-
"rt-multi-thread",
|
|
54
|
-
"sync",
|
|
55
|
-
], optional = true }
|
|
56
50
|
|
|
57
51
|
[dev-dependencies]
|
|
58
52
|
serde = { version = "1.0", features = ["derive"] }
|
|
@@ -18,42 +18,80 @@ Fast, reliable HTML to Markdown conversion with full CommonMark compliance. Buil
|
|
|
18
18
|
|
|
19
19
|
```toml
|
|
20
20
|
[dependencies]
|
|
21
|
-
html-to-markdown-rs = "
|
|
21
|
+
html-to-markdown-rs = "3.0"
|
|
22
22
|
```
|
|
23
23
|
|
|
24
24
|
## Basic Usage
|
|
25
25
|
|
|
26
|
+
`convert()` returns a structured `ConversionResult` with the converted text, metadata, tables, and more:
|
|
27
|
+
|
|
26
28
|
```rust
|
|
27
|
-
use html_to_markdown_rs::
|
|
29
|
+
use html_to_markdown_rs::convert;
|
|
28
30
|
|
|
29
31
|
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
30
32
|
let html = r#"
|
|
31
|
-
<
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
<
|
|
35
|
-
<
|
|
36
|
-
|
|
33
|
+
<html lang="en">
|
|
34
|
+
<head><title>Welcome</title></head>
|
|
35
|
+
<body>
|
|
36
|
+
<h1>Welcome</h1>
|
|
37
|
+
<p>This is <strong>fast</strong> conversion!</p>
|
|
38
|
+
<ul>
|
|
39
|
+
<li>Built with Rust</li>
|
|
40
|
+
<li>CommonMark compliant</li>
|
|
41
|
+
</ul>
|
|
42
|
+
</body>
|
|
43
|
+
</html>
|
|
37
44
|
"#;
|
|
38
45
|
|
|
39
|
-
let
|
|
40
|
-
println!("{}",
|
|
46
|
+
let result = convert(html, None)?;
|
|
47
|
+
println!("{}", result.content.unwrap_or_default());
|
|
48
|
+
|
|
49
|
+
if let Some(metadata) = &result.metadata {
|
|
50
|
+
println!("Title: {:?}", metadata.document.title);
|
|
51
|
+
println!("Headers: {:?}", metadata.headers);
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
for table in &result.tables {
|
|
55
|
+
println!("Table with {} rows", table.cells.len());
|
|
56
|
+
}
|
|
57
|
+
|
|
41
58
|
Ok(())
|
|
42
59
|
}
|
|
43
60
|
```
|
|
44
61
|
|
|
45
62
|
## Error Handling
|
|
46
63
|
|
|
47
|
-
Conversion returns a `Result<
|
|
64
|
+
Conversion returns a `Result<ConversionResult, ConversionError>`. Inputs that look like binary data are rejected with
|
|
48
65
|
`ConversionError::InvalidInput` to prevent runaway allocations. Table `colspan`/`rowspan` values are also clamped
|
|
49
66
|
internally to keep output sizes bounded.
|
|
50
67
|
|
|
51
68
|
## Configuration
|
|
52
69
|
|
|
70
|
+
### Builder Pattern
|
|
71
|
+
|
|
72
|
+
```rust
|
|
73
|
+
use html_to_markdown_rs::{
|
|
74
|
+
convert, ConversionOptions, HeadingStyle, CodeBlockStyle,
|
|
75
|
+
};
|
|
76
|
+
|
|
77
|
+
let options = ConversionOptions::builder()
|
|
78
|
+
.heading_style(HeadingStyle::Atx)
|
|
79
|
+
.list_indent_width(2)
|
|
80
|
+
.bullets("-")
|
|
81
|
+
.autolinks(true)
|
|
82
|
+
.wrap(true)
|
|
83
|
+
.wrap_width(80)
|
|
84
|
+
.build();
|
|
85
|
+
|
|
86
|
+
let result = convert(html, Some(options))?;
|
|
87
|
+
println!("{}", result.content.unwrap_or_default());
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### Struct Literal
|
|
91
|
+
|
|
53
92
|
```rust
|
|
54
93
|
use html_to_markdown_rs::{
|
|
55
94
|
convert, ConversionOptions, HeadingStyle, ListIndentType,
|
|
56
|
-
PreprocessingOptions, PreprocessingPreset,
|
|
57
95
|
};
|
|
58
96
|
|
|
59
97
|
let options = ConversionOptions {
|
|
@@ -69,12 +107,13 @@ let options = ConversionOptions {
|
|
|
69
107
|
..Default::default()
|
|
70
108
|
};
|
|
71
109
|
|
|
72
|
-
let
|
|
110
|
+
let result = convert(html, Some(options))?;
|
|
111
|
+
println!("{}", result.content.unwrap_or_default());
|
|
73
112
|
```
|
|
74
113
|
|
|
75
114
|
### Preserving HTML Tags
|
|
76
115
|
|
|
77
|
-
The `preserve_tags` option allows you to keep specific HTML tags in their original form instead of converting them to Markdown
|
|
116
|
+
The `preserve_tags` option allows you to keep specific HTML tags in their original form instead of converting them to Markdown:
|
|
78
117
|
|
|
79
118
|
```rust
|
|
80
119
|
use html_to_markdown_rs::{convert, ConversionOptions};
|
|
@@ -93,18 +132,8 @@ let options = ConversionOptions {
|
|
|
93
132
|
..Default::default()
|
|
94
133
|
};
|
|
95
134
|
|
|
96
|
-
let
|
|
97
|
-
//
|
|
98
|
-
```
|
|
99
|
-
|
|
100
|
-
You can preserve multiple tag types and combine with `strip_tags`:
|
|
101
|
-
|
|
102
|
-
```rust
|
|
103
|
-
let options = ConversionOptions {
|
|
104
|
-
preserve_tags: vec!["table".to_string(), "form".to_string()],
|
|
105
|
-
strip_tags: vec!["script".to_string(), "style".to_string()],
|
|
106
|
-
..Default::default()
|
|
107
|
-
};
|
|
135
|
+
let result = convert(html, Some(options))?;
|
|
136
|
+
// result.content => "Before table\n\n<table class=\"data\">...</table>\n\nAfter table\n"
|
|
108
137
|
```
|
|
109
138
|
|
|
110
139
|
## Web Scraping with Preprocessing
|
|
@@ -118,46 +147,62 @@ options.preprocessing.preset = html_to_markdown_rs::PreprocessingPreset::Aggress
|
|
|
118
147
|
options.preprocessing.remove_navigation = true;
|
|
119
148
|
options.preprocessing.remove_forms = true;
|
|
120
149
|
|
|
121
|
-
let
|
|
150
|
+
let result = convert(scraped_html, Some(options))?;
|
|
151
|
+
println!("{}", result.content.unwrap_or_default());
|
|
122
152
|
```
|
|
123
153
|
|
|
124
|
-
##
|
|
154
|
+
## Metadata Extraction
|
|
125
155
|
|
|
126
|
-
|
|
156
|
+
Metadata is automatically included in the result. Configure which fields to extract via `MetadataConfig`:
|
|
127
157
|
|
|
128
158
|
```rust
|
|
129
|
-
use html_to_markdown_rs::convert;
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
159
|
+
use html_to_markdown_rs::{convert, ConversionOptions, MetadataConfig};
|
|
160
|
+
|
|
161
|
+
let options = ConversionOptions::builder()
|
|
162
|
+
.metadata_config(MetadataConfig {
|
|
163
|
+
extract_headers: true,
|
|
164
|
+
extract_links: true,
|
|
165
|
+
extract_images: false,
|
|
166
|
+
..Default::default()
|
|
167
|
+
})
|
|
168
|
+
.build();
|
|
169
|
+
|
|
170
|
+
let result = convert(html, Some(options))?;
|
|
171
|
+
if let Some(metadata) = &result.metadata {
|
|
172
|
+
println!("Title: {:?}", metadata.document.title);
|
|
173
|
+
for header in &metadata.headers {
|
|
174
|
+
println!("H{}: {}", header.level, header.text);
|
|
175
|
+
}
|
|
176
|
+
for link in &metadata.links {
|
|
177
|
+
println!("Link: {} -> {}", link.text, link.href);
|
|
178
|
+
}
|
|
179
|
+
}
|
|
134
180
|
```
|
|
135
181
|
|
|
136
|
-
##
|
|
182
|
+
## Image Extraction
|
|
137
183
|
|
|
138
184
|
```rust
|
|
139
|
-
use html_to_markdown_rs::{
|
|
140
|
-
|
|
141
|
-
let config = InlineImageConfig::new(5 * 1024 * 1024) // 5MB max
|
|
142
|
-
.with_infer_dimensions(true)
|
|
143
|
-
.with_filename_prefix("img_".to_string());
|
|
185
|
+
use html_to_markdown_rs::{convert, ConversionOptions};
|
|
144
186
|
|
|
145
|
-
let
|
|
187
|
+
let options = ConversionOptions::builder()
|
|
188
|
+
.extract_images(true)
|
|
189
|
+
.max_image_size(5 * 1024 * 1024) // 5 MB max
|
|
190
|
+
.infer_dimensions(true)
|
|
191
|
+
.build();
|
|
146
192
|
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
193
|
+
let result = convert(html, Some(options))?;
|
|
194
|
+
println!("{}", result.content.unwrap_or_default());
|
|
195
|
+
for img in &result.images {
|
|
196
|
+
println!("Image: {} ({} bytes)", img.src, img.data.as_ref().map_or(0, |d| d.len()));
|
|
150
197
|
}
|
|
151
198
|
```
|
|
152
199
|
|
|
153
200
|
## Table Extraction
|
|
154
201
|
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
Requires the `visitor` feature.
|
|
202
|
+
Structured table data is always included in `ConversionResult.tables`:
|
|
158
203
|
|
|
159
204
|
```rust
|
|
160
|
-
use html_to_markdown_rs::
|
|
205
|
+
use html_to_markdown_rs::convert;
|
|
161
206
|
|
|
162
207
|
let html = r#"
|
|
163
208
|
<table>
|
|
@@ -167,9 +212,9 @@ let html = r#"
|
|
|
167
212
|
</table>
|
|
168
213
|
"#;
|
|
169
214
|
|
|
170
|
-
let result =
|
|
215
|
+
let result = convert(html, None)?;
|
|
171
216
|
|
|
172
|
-
println!("{}", result.content);
|
|
217
|
+
println!("{}", result.content.unwrap_or_default());
|
|
173
218
|
for table in &result.tables {
|
|
174
219
|
println!("Table with {} rows:", table.cells.len());
|
|
175
220
|
for (i, row) in table.cells.iter().enumerate() {
|
|
@@ -179,6 +224,34 @@ for table in &result.tables {
|
|
|
179
224
|
}
|
|
180
225
|
```
|
|
181
226
|
|
|
227
|
+
## Custom Visitors
|
|
228
|
+
|
|
229
|
+
```rust
|
|
230
|
+
use html_to_markdown_rs::{convert, ConversionOptions};
|
|
231
|
+
use html_to_markdown_rs::visitor::{HtmlVisitor, NodeContext, VisitResult};
|
|
232
|
+
|
|
233
|
+
struct NoImagesVisitor;
|
|
234
|
+
|
|
235
|
+
impl HtmlVisitor for NoImagesVisitor {
|
|
236
|
+
fn visit_image(
|
|
237
|
+
&mut self,
|
|
238
|
+
_ctx: &NodeContext,
|
|
239
|
+
_src: &str,
|
|
240
|
+
_alt: &str,
|
|
241
|
+
_title: Option<&str>,
|
|
242
|
+
) -> VisitResult {
|
|
243
|
+
VisitResult::Skip
|
|
244
|
+
}
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
let options = ConversionOptions::builder()
|
|
248
|
+
.visitor(Box::new(NoImagesVisitor))
|
|
249
|
+
.build();
|
|
250
|
+
|
|
251
|
+
let result = convert(html, Some(options))?;
|
|
252
|
+
println!("{}", result.content.unwrap_or_default());
|
|
253
|
+
```
|
|
254
|
+
|
|
182
255
|
## Other Language Bindings
|
|
183
256
|
|
|
184
257
|
This is the core Rust library. For other languages:
|
|
@@ -191,13 +264,14 @@ This is the core Rust library. For other languages:
|
|
|
191
264
|
|
|
192
265
|
## Documentation
|
|
193
266
|
|
|
194
|
-
- [Full Documentation](https://
|
|
267
|
+
- [Full Documentation](https://docs.html-to-markdown.kreuzberg.dev)
|
|
195
268
|
- [API Reference](https://docs.rs/html-to-markdown-rs)
|
|
269
|
+
- [Migration Guide (v2 -> v3)](https://docs.html-to-markdown.kreuzberg.dev/migration/v3/)
|
|
196
270
|
- [Contributing Guide](https://github.com/kreuzberg-dev/html-to-markdown/blob/main/CONTRIBUTING.md)
|
|
197
271
|
|
|
198
272
|
## Performance
|
|
199
273
|
|
|
200
|
-
10-30x faster than pure Python/JavaScript implementations, delivering 150-
|
|
274
|
+
10-30x faster than pure Python/JavaScript implementations, delivering 150-280 MB/s throughput.
|
|
201
275
|
|
|
202
276
|
## License
|
|
203
277
|
|
|
@@ -1,6 +1,11 @@
|
|
|
1
1
|
//! Example: Basic HTML to Markdown conversion
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
fn convert(
|
|
4
|
+
html: &str,
|
|
5
|
+
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
|
+
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
+
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
8
|
+
}
|
|
4
9
|
|
|
5
10
|
fn main() {
|
|
6
11
|
let html = "<h1>Hello World</h1><p>This is a <strong>test</strong>.</p>";
|
|
@@ -1,6 +1,11 @@
|
|
|
1
1
|
//! Example: Converting HTML tables to Markdown
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
fn convert(
|
|
4
|
+
html: &str,
|
|
5
|
+
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
|
+
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
+
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
8
|
+
}
|
|
4
9
|
|
|
5
10
|
fn main() {
|
|
6
11
|
let html = r"<table>
|
|
@@ -1,6 +1,11 @@
|
|
|
1
1
|
//! Example: Testing HTML escape sequences and special characters
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
fn convert(
|
|
4
|
+
html: &str,
|
|
5
|
+
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
|
+
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
+
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
8
|
+
}
|
|
4
9
|
|
|
5
10
|
fn main() {
|
|
6
11
|
let html = "<p>Use *wildcards* for search</p>";
|
|
@@ -1,6 +1,12 @@
|
|
|
1
|
-
|
|
1
|
+
#![allow(missing_docs)]
|
|
2
|
+
fn convert(
|
|
3
|
+
html: &str,
|
|
4
|
+
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
5
|
+
) -> html_to_markdown_rs::error::Result<String> {
|
|
6
|
+
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
7
|
+
}
|
|
2
8
|
|
|
3
|
-
use html_to_markdown_rs::
|
|
9
|
+
use html_to_markdown_rs::ConversionOptions;
|
|
4
10
|
|
|
5
11
|
fn main() {
|
|
6
12
|
let html = "<p>This is <mark>highlighted</mark> text</p>";
|
|
@@ -1,6 +1,11 @@
|
|
|
1
1
|
//! Example: Testing HTML list conversion (ordered and unordered lists)
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
fn convert(
|
|
4
|
+
html: &str,
|
|
5
|
+
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
|
+
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
+
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
8
|
+
}
|
|
4
9
|
|
|
5
10
|
fn main() {
|
|
6
11
|
let html = "<ol><li>First item</li><li>Second item</li><li>Third item</li></ol>";
|
|
@@ -1,6 +1,11 @@
|
|
|
1
1
|
//! Example: Testing HTML5 semantic tags (article, section, nav, etc.)
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
fn convert(
|
|
4
|
+
html: &str,
|
|
5
|
+
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
|
+
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
+
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
8
|
+
}
|
|
4
9
|
|
|
5
10
|
fn main() {
|
|
6
11
|
let html = r"<article>
|
|
@@ -1,6 +1,11 @@
|
|
|
1
1
|
//! Example: Converting HTML tables to Markdown
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
fn convert(
|
|
4
|
+
html: &str,
|
|
5
|
+
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
|
+
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
+
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
8
|
+
}
|
|
4
9
|
|
|
5
10
|
fn main() {
|
|
6
11
|
let html = r"<table>
|
|
@@ -1,6 +1,11 @@
|
|
|
1
1
|
//! Example: Testing task list conversion (checkboxes)
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
fn convert(
|
|
4
|
+
html: &str,
|
|
5
|
+
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
|
+
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
+
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
8
|
+
}
|
|
4
9
|
|
|
5
10
|
fn main() {
|
|
6
11
|
let html = r#"<ul>
|
|
@@ -1,6 +1,11 @@
|
|
|
1
1
|
//! Example: Testing whitespace handling and normalization
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
fn convert(
|
|
4
|
+
html: &str,
|
|
5
|
+
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
|
+
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
+
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
8
|
+
}
|
|
4
9
|
|
|
5
10
|
fn main() {
|
|
6
11
|
let html = "<p>text with multiple spaces</p>";
|