html-to-markdown 2.29.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +18 -41
- data/README.md +37 -50
- data/ext/html-to-markdown-rb/native/Cargo.lock +17 -705
- data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
- data/ext/html-to-markdown-rb/native/README.md +4 -13
- data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
- data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
- data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
- data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
- data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +13 -194
- data/sig/html_to_markdown.rbs +12 -373
- data/vendor/Cargo.toml +7 -4
- data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
- data/vendor/html-to-markdown-rs/README.md +127 -51
- data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
- data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
- data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
- data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
- data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
- data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
- data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
- data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
- data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
- data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
- data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
- data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
- data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
- data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
- data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
- data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
- data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
- data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
- data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
- data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
- data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
- data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -67
- data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
- data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
- data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
- data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
- data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
- data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
- data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
- data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
- data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
- data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
- data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
- data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
- data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
- data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
- data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
- data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
- data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
- data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
- data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
- data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
- data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
- data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
- data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -319
- data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
- data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
- data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
- data/vendor/html-to-markdown-rs/src/text.rs +25 -14
- data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
- data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
- data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
- data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
- data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
- data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
- data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
- data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
- data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
- data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
- data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
- data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
- data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
- data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
- data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
- data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
- data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
- data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
- data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
- data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
- data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
- metadata +9 -37
- data/bin/benchmark.rb +0 -232
- data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
- data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
- data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
- data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
- data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
- data/spec/convert_spec.rb +0 -77
- data/spec/convert_with_tables_spec.rb +0 -194
- data/spec/metadata_extraction_spec.rb +0 -437
- data/spec/visitor_issue_187_spec.rb +0 -605
- data/spec/visitor_spec.rb +0 -1149
- data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
- data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
- data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
- data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
- data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
- data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
- data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
- data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
- data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
- data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -31
- data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
- data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
- data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
- data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
- data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
- data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
- data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
- data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
- data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
- data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
- data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "html-to-markdown-rs"
|
|
3
|
-
version = "2.29.0"
|
|
3
|
+
version = "3.0.0"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
6
6
|
license = "MIT"
|
|
@@ -14,25 +14,22 @@ keywords = ["html", "markdown", "converter", "astral-tl", "doc-processing"]
|
|
|
14
14
|
categories = ["parsing", "text-processing", "web-programming"]
|
|
15
15
|
|
|
16
16
|
[package.metadata.cargo-machete]
|
|
17
|
-
ignored = ["once_cell", "
|
|
17
|
+
ignored = ["once_cell", "ahash"]
|
|
18
18
|
|
|
19
19
|
[lib]
|
|
20
20
|
crate-type = ["rlib"]
|
|
21
21
|
|
|
22
22
|
[features]
|
|
23
23
|
default = ["metadata"]
|
|
24
|
-
full = ["inline-images", "metadata", "visitor", "
|
|
24
|
+
full = ["inline-images", "metadata", "visitor", "serde"]
|
|
25
25
|
inline-images = ["dep:image"]
|
|
26
26
|
metadata = ["dep:serde", "dep:serde_json"]
|
|
27
27
|
visitor = []
|
|
28
|
-
async-visitor = ["visitor", "dep:async-trait", "dep:futures", "dep:tokio"]
|
|
29
28
|
serde = ["dep:serde", "dep:serde_json"]
|
|
30
29
|
|
|
31
30
|
[dependencies]
|
|
32
31
|
ahash = { version = "0.8", features = ["std", "compile-time-rng"], default-features = false }
|
|
33
|
-
async-trait = { version = "0.1", optional = true }
|
|
34
32
|
base64 = "0.22"
|
|
35
|
-
futures = { version = "0.3", optional = true }
|
|
36
33
|
html-escape = "0.2.13"
|
|
37
34
|
html5ever = "0.39.0"
|
|
38
35
|
image = { version = "0.25", default-features = false, features = [
|
|
@@ -43,16 +40,13 @@ image = { version = "0.25", default-features = false, features = [
|
|
|
43
40
|
"webp",
|
|
44
41
|
], optional = true }
|
|
45
42
|
lru = "0.16"
|
|
43
|
+
memchr = "2"
|
|
46
44
|
once_cell = "1.21"
|
|
47
45
|
regex = "1.12"
|
|
48
46
|
serde = { version = "1.0", features = ["derive"], optional = true }
|
|
49
47
|
serde_json = { version = "1.0", optional = true }
|
|
50
48
|
thiserror = "2.0"
|
|
51
49
|
tl = { package = "astral-tl", version = "0.7.11" }
|
|
52
|
-
tokio = { version = "1.50", features = [
|
|
53
|
-
"rt-multi-thread",
|
|
54
|
-
"sync",
|
|
55
|
-
], optional = true }
|
|
56
50
|
|
|
57
51
|
[dev-dependencies]
|
|
58
52
|
serde = { version = "1.0", features = ["derive"] }
|
|
@@ -18,42 +18,80 @@ Fast, reliable HTML to Markdown conversion with full CommonMark compliance. Buil
|
|
|
18
18
|
|
|
19
19
|
```toml
|
|
20
20
|
[dependencies]
|
|
21
|
-
html-to-markdown-rs = "
|
|
21
|
+
html-to-markdown-rs = "3.0"
|
|
22
22
|
```
|
|
23
23
|
|
|
24
24
|
## Basic Usage
|
|
25
25
|
|
|
26
|
+
`convert()` returns a structured `ConversionResult` with the converted text, metadata, tables, and more:
|
|
27
|
+
|
|
26
28
|
```rust
|
|
27
|
-
use html_to_markdown_rs::
|
|
29
|
+
use html_to_markdown_rs::convert;
|
|
28
30
|
|
|
29
31
|
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
30
32
|
let html = r#"
|
|
31
|
-
<
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
<
|
|
35
|
-
<
|
|
36
|
-
|
|
33
|
+
<html lang="en">
|
|
34
|
+
<head><title>Welcome</title></head>
|
|
35
|
+
<body>
|
|
36
|
+
<h1>Welcome</h1>
|
|
37
|
+
<p>This is <strong>fast</strong> conversion!</p>
|
|
38
|
+
<ul>
|
|
39
|
+
<li>Built with Rust</li>
|
|
40
|
+
<li>CommonMark compliant</li>
|
|
41
|
+
</ul>
|
|
42
|
+
</body>
|
|
43
|
+
</html>
|
|
37
44
|
"#;
|
|
38
45
|
|
|
39
|
-
let
|
|
40
|
-
println!("{}",
|
|
46
|
+
let result = convert(html, None)?;
|
|
47
|
+
println!("{}", result.content.unwrap_or_default());
|
|
48
|
+
|
|
49
|
+
if let Some(metadata) = &result.metadata {
|
|
50
|
+
println!("Title: {:?}", metadata.document.title);
|
|
51
|
+
println!("Headers: {:?}", metadata.headers);
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
for table in &result.tables {
|
|
55
|
+
println!("Table with {} rows", table.cells.len());
|
|
56
|
+
}
|
|
57
|
+
|
|
41
58
|
Ok(())
|
|
42
59
|
}
|
|
43
60
|
```
|
|
44
61
|
|
|
45
62
|
## Error Handling
|
|
46
63
|
|
|
47
|
-
Conversion returns a `Result<String, ConversionError>`. Inputs that look like binary data are rejected with
|
|
64
|
+
Conversion returns a `Result<ConversionResult, ConversionError>`. Inputs that look like binary data are rejected with
|
|
48
65
|
`ConversionError::InvalidInput` to prevent runaway allocations. Table `colspan`/`rowspan` values are also clamped
|
|
49
66
|
internally to keep output sizes bounded.
|
|
50
67
|
|
|
51
68
|
## Configuration
|
|
52
69
|
|
|
70
|
+
### Builder Pattern
|
|
71
|
+
|
|
72
|
+
```rust
|
|
73
|
+
use html_to_markdown_rs::{
|
|
74
|
+
convert, ConversionOptions, HeadingStyle, CodeBlockStyle,
|
|
75
|
+
};
|
|
76
|
+
|
|
77
|
+
let options = ConversionOptions::builder()
|
|
78
|
+
.heading_style(HeadingStyle::Atx)
|
|
79
|
+
.list_indent_width(2)
|
|
80
|
+
.bullets("-")
|
|
81
|
+
.autolinks(true)
|
|
82
|
+
.wrap(true)
|
|
83
|
+
.wrap_width(80)
|
|
84
|
+
.build();
|
|
85
|
+
|
|
86
|
+
let result = convert(html, Some(options))?;
|
|
87
|
+
println!("{}", result.content.unwrap_or_default());
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### Struct Literal
|
|
91
|
+
|
|
53
92
|
```rust
|
|
54
93
|
use html_to_markdown_rs::{
|
|
55
94
|
convert, ConversionOptions, HeadingStyle, ListIndentType,
|
|
56
|
-
PreprocessingOptions, PreprocessingPreset,
|
|
57
95
|
};
|
|
58
96
|
|
|
59
97
|
let options = ConversionOptions {
|
|
@@ -69,12 +107,13 @@ let options = ConversionOptions {
|
|
|
69
107
|
..Default::default()
|
|
70
108
|
};
|
|
71
109
|
|
|
72
|
-
let
|
|
110
|
+
let result = convert(html, Some(options))?;
|
|
111
|
+
println!("{}", result.content.unwrap_or_default());
|
|
73
112
|
```
|
|
74
113
|
|
|
75
114
|
### Preserving HTML Tags
|
|
76
115
|
|
|
77
|
-
The `preserve_tags` option allows you to keep specific HTML tags in their original form instead of converting them to Markdown
|
|
116
|
+
The `preserve_tags` option allows you to keep specific HTML tags in their original form instead of converting them to Markdown:
|
|
78
117
|
|
|
79
118
|
```rust
|
|
80
119
|
use html_to_markdown_rs::{convert, ConversionOptions};
|
|
@@ -93,18 +132,8 @@ let options = ConversionOptions {
|
|
|
93
132
|
..Default::default()
|
|
94
133
|
};
|
|
95
134
|
|
|
96
|
-
let
|
|
97
|
-
//
|
|
98
|
-
```
|
|
99
|
-
|
|
100
|
-
You can preserve multiple tag types and combine with `strip_tags`:
|
|
101
|
-
|
|
102
|
-
```rust
|
|
103
|
-
let options = ConversionOptions {
|
|
104
|
-
preserve_tags: vec!["table".to_string(), "form".to_string()],
|
|
105
|
-
strip_tags: vec!["script".to_string(), "style".to_string()],
|
|
106
|
-
..Default::default()
|
|
107
|
-
};
|
|
135
|
+
let result = convert(html, Some(options))?;
|
|
136
|
+
// result.content => "Before table\n\n<table class=\"data\">...</table>\n\nAfter table\n"
|
|
108
137
|
```
|
|
109
138
|
|
|
110
139
|
## Web Scraping with Preprocessing
|
|
@@ -118,44 +147,62 @@ options.preprocessing.preset = html_to_markdown_rs::PreprocessingPreset::Aggress
|
|
|
118
147
|
options.preprocessing.remove_navigation = true;
|
|
119
148
|
options.preprocessing.remove_forms = true;
|
|
120
149
|
|
|
121
|
-
let
|
|
150
|
+
let result = convert(scraped_html, Some(options))?;
|
|
151
|
+
println!("{}", result.content.unwrap_or_default());
|
|
122
152
|
```
|
|
123
153
|
|
|
124
|
-
##
|
|
154
|
+
## Metadata Extraction
|
|
125
155
|
|
|
126
|
-
|
|
127
|
-
use html_to_markdown_rs::convert;
|
|
156
|
+
Metadata is automatically included in the result. Configure which fields to extract via `MetadataConfig`:
|
|
128
157
|
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
158
|
+
```rust
|
|
159
|
+
use html_to_markdown_rs::{convert, ConversionOptions, MetadataConfig};
|
|
160
|
+
|
|
161
|
+
let options = ConversionOptions::builder()
|
|
162
|
+
.metadata_config(MetadataConfig {
|
|
163
|
+
extract_headers: true,
|
|
164
|
+
extract_links: true,
|
|
165
|
+
extract_images: false,
|
|
166
|
+
..Default::default()
|
|
167
|
+
})
|
|
168
|
+
.build();
|
|
169
|
+
|
|
170
|
+
let result = convert(html, Some(options))?;
|
|
171
|
+
if let Some(metadata) = &result.metadata {
|
|
172
|
+
println!("Title: {:?}", metadata.document.title);
|
|
173
|
+
for header in &metadata.headers {
|
|
174
|
+
println!("H{}: {}", header.level, header.text);
|
|
175
|
+
}
|
|
176
|
+
for link in &metadata.links {
|
|
177
|
+
println!("Link: {} -> {}", link.text, link.href);
|
|
178
|
+
}
|
|
179
|
+
}
|
|
132
180
|
```
|
|
133
181
|
|
|
134
|
-
##
|
|
182
|
+
## Image Extraction
|
|
135
183
|
|
|
136
184
|
```rust
|
|
137
|
-
use html_to_markdown_rs::{
|
|
138
|
-
|
|
139
|
-
let config = InlineImageConfig::new(5 * 1024 * 1024) // 5MB max
|
|
140
|
-
.with_infer_dimensions(true)
|
|
141
|
-
.with_filename_prefix("img_".to_string());
|
|
185
|
+
use html_to_markdown_rs::{convert, ConversionOptions};
|
|
142
186
|
|
|
143
|
-
let
|
|
187
|
+
let options = ConversionOptions::builder()
|
|
188
|
+
.extract_images(true)
|
|
189
|
+
.max_image_size(5 * 1024 * 1024) // 5 MB max
|
|
190
|
+
.infer_dimensions(true)
|
|
191
|
+
.build();
|
|
144
192
|
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
193
|
+
let result = convert(html, Some(options))?;
|
|
194
|
+
println!("{}", result.content.unwrap_or_default());
|
|
195
|
+
for img in &result.images {
|
|
196
|
+
println!("Image: {} ({} bytes)", img.src, img.data.as_ref().map_or(0, |d| d.len()));
|
|
148
197
|
}
|
|
149
198
|
```
|
|
150
199
|
|
|
151
200
|
## Table Extraction
|
|
152
201
|
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
Requires the `visitor` feature.
|
|
202
|
+
Structured table data is always included in `ConversionResult.tables`:
|
|
156
203
|
|
|
157
204
|
```rust
|
|
158
|
-
use html_to_markdown_rs::
|
|
205
|
+
use html_to_markdown_rs::convert;
|
|
159
206
|
|
|
160
207
|
let html = r#"
|
|
161
208
|
<table>
|
|
@@ -165,9 +212,9 @@ let html = r#"
|
|
|
165
212
|
</table>
|
|
166
213
|
"#;
|
|
167
214
|
|
|
168
|
-
let result =
|
|
215
|
+
let result = convert(html, None)?;
|
|
169
216
|
|
|
170
|
-
println!("{}", result.content);
|
|
217
|
+
println!("{}", result.content.unwrap_or_default());
|
|
171
218
|
for table in &result.tables {
|
|
172
219
|
println!("Table with {} rows:", table.cells.len());
|
|
173
220
|
for (i, row) in table.cells.iter().enumerate() {
|
|
@@ -177,6 +224,34 @@ for table in &result.tables {
|
|
|
177
224
|
}
|
|
178
225
|
```
|
|
179
226
|
|
|
227
|
+
## Custom Visitors
|
|
228
|
+
|
|
229
|
+
```rust
|
|
230
|
+
use html_to_markdown_rs::{convert, ConversionOptions};
|
|
231
|
+
use html_to_markdown_rs::visitor::{HtmlVisitor, NodeContext, VisitResult};
|
|
232
|
+
|
|
233
|
+
struct NoImagesVisitor;
|
|
234
|
+
|
|
235
|
+
impl HtmlVisitor for NoImagesVisitor {
|
|
236
|
+
fn visit_image(
|
|
237
|
+
&mut self,
|
|
238
|
+
_ctx: &NodeContext,
|
|
239
|
+
_src: &str,
|
|
240
|
+
_alt: &str,
|
|
241
|
+
_title: Option<&str>,
|
|
242
|
+
) -> VisitResult {
|
|
243
|
+
VisitResult::Skip
|
|
244
|
+
}
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
let options = ConversionOptions::builder()
|
|
248
|
+
.visitor(Box::new(NoImagesVisitor))
|
|
249
|
+
.build();
|
|
250
|
+
|
|
251
|
+
let result = convert(html, Some(options))?;
|
|
252
|
+
println!("{}", result.content.unwrap_or_default());
|
|
253
|
+
```
|
|
254
|
+
|
|
180
255
|
## Other Language Bindings
|
|
181
256
|
|
|
182
257
|
This is the core Rust library. For other languages:
|
|
@@ -189,13 +264,14 @@ This is the core Rust library. For other languages:
|
|
|
189
264
|
|
|
190
265
|
## Documentation
|
|
191
266
|
|
|
192
|
-
- [Full Documentation](https://
|
|
267
|
+
- [Full Documentation](https://docs.html-to-markdown.kreuzberg.dev)
|
|
193
268
|
- [API Reference](https://docs.rs/html-to-markdown-rs)
|
|
269
|
+
- [Migration Guide (v2 -> v3)](https://docs.html-to-markdown.kreuzberg.dev/migration/v3/)
|
|
194
270
|
- [Contributing Guide](https://github.com/kreuzberg-dev/html-to-markdown/blob/main/CONTRIBUTING.md)
|
|
195
271
|
|
|
196
272
|
## Performance
|
|
197
273
|
|
|
198
|
-
10-30x faster than pure Python/JavaScript implementations, delivering 150-
|
|
274
|
+
10-30x faster than pure Python/JavaScript implementations, delivering 150-280 MB/s throughput.
|
|
199
275
|
|
|
200
276
|
## License
|
|
201
277
|
|
|
@@ -1,6 +1,11 @@
|
|
|
1
1
|
//! Example: Basic HTML to Markdown conversion
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
fn convert(
|
|
4
|
+
html: &str,
|
|
5
|
+
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
|
+
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
+
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
8
|
+
}
|
|
4
9
|
|
|
5
10
|
fn main() {
|
|
6
11
|
let html = "<h1>Hello World</h1><p>This is a <strong>test</strong>.</p>";
|
|
@@ -1,6 +1,11 @@
|
|
|
1
1
|
//! Example: Converting HTML tables to Markdown
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
fn convert(
|
|
4
|
+
html: &str,
|
|
5
|
+
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
|
+
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
+
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
8
|
+
}
|
|
4
9
|
|
|
5
10
|
fn main() {
|
|
6
11
|
let html = r"<table>
|
|
@@ -1,6 +1,11 @@
|
|
|
1
1
|
//! Example: Testing HTML escape sequences and special characters
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
fn convert(
|
|
4
|
+
html: &str,
|
|
5
|
+
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
|
+
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
+
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
8
|
+
}
|
|
4
9
|
|
|
5
10
|
fn main() {
|
|
6
11
|
let html = "<p>Use *wildcards* for search</p>";
|
|
@@ -1,6 +1,12 @@
|
|
|
1
|
-
|
|
1
|
+
#![allow(missing_docs)]
|
|
2
|
+
fn convert(
|
|
3
|
+
html: &str,
|
|
4
|
+
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
5
|
+
) -> html_to_markdown_rs::error::Result<String> {
|
|
6
|
+
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
7
|
+
}
|
|
2
8
|
|
|
3
|
-
use html_to_markdown_rs::
|
|
9
|
+
use html_to_markdown_rs::ConversionOptions;
|
|
4
10
|
|
|
5
11
|
fn main() {
|
|
6
12
|
let html = "<p>This is <mark>highlighted</mark> text</p>";
|
|
@@ -1,6 +1,11 @@
|
|
|
1
1
|
//! Example: Testing HTML list conversion (ordered and unordered lists)
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
fn convert(
|
|
4
|
+
html: &str,
|
|
5
|
+
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
|
+
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
+
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
8
|
+
}
|
|
4
9
|
|
|
5
10
|
fn main() {
|
|
6
11
|
let html = "<ol><li>First item</li><li>Second item</li><li>Third item</li></ol>";
|
|
@@ -1,6 +1,11 @@
|
|
|
1
1
|
//! Example: Testing HTML5 semantic tags (article, section, nav, etc.)
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
fn convert(
|
|
4
|
+
html: &str,
|
|
5
|
+
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
|
+
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
+
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
8
|
+
}
|
|
4
9
|
|
|
5
10
|
fn main() {
|
|
6
11
|
let html = r"<article>
|
|
@@ -1,6 +1,11 @@
|
|
|
1
1
|
//! Example: Converting HTML tables to Markdown
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
fn convert(
|
|
4
|
+
html: &str,
|
|
5
|
+
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
|
+
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
+
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
8
|
+
}
|
|
4
9
|
|
|
5
10
|
fn main() {
|
|
6
11
|
let html = r"<table>
|
|
@@ -1,6 +1,11 @@
|
|
|
1
1
|
//! Example: Testing task list conversion (checkboxes)
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
fn convert(
|
|
4
|
+
html: &str,
|
|
5
|
+
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
|
+
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
+
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
8
|
+
}
|
|
4
9
|
|
|
5
10
|
fn main() {
|
|
6
11
|
let html = r#"<ul>
|
|
@@ -1,6 +1,11 @@
|
|
|
1
1
|
//! Example: Testing whitespace handling and normalization
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
fn convert(
|
|
4
|
+
html: &str,
|
|
5
|
+
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
|
+
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
+
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
8
|
+
}
|
|
4
9
|
|
|
5
10
|
fn main() {
|
|
6
11
|
let html = "<p>text with multiple spaces</p>";
|