html-to-markdown 2.29.0 → 3.0.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (166)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +18 -41
  3. data/README.md +37 -50
  4. data/ext/html-to-markdown-rb/native/Cargo.lock +17 -705
  5. data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
  6. data/ext/html-to-markdown-rb/native/README.md +4 -13
  7. data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
  8. data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
  9. data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
  10. data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
  11. data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
  12. data/lib/html_to_markdown/version.rb +1 -1
  13. data/lib/html_to_markdown.rb +13 -194
  14. data/sig/html_to_markdown.rbs +12 -373
  15. data/vendor/Cargo.toml +7 -4
  16. data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
  17. data/vendor/html-to-markdown-rs/README.md +127 -51
  18. data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
  19. data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
  20. data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
  21. data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
  22. data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
  23. data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
  24. data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
  25. data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
  26. data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
  27. data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
  28. data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
  29. data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
  30. data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
  31. data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
  32. data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
  33. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
  34. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
  35. data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
  36. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
  37. data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
  38. data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
  39. data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
  40. data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
  41. data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
  42. data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
  43. data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
  44. data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
  45. data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
  46. data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
  47. data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
  48. data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
  49. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
  50. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
  51. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
  52. data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
  53. data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
  54. data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
  55. data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
  56. data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -67
  57. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
  58. data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
  59. data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
  60. data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
  61. data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
  62. data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
  63. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
  64. data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
  65. data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
  66. data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
  67. data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
  68. data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
  69. data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
  70. data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
  71. data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
  72. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
  73. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
  74. data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
  75. data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
  76. data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
  77. data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
  78. data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
  79. data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
  80. data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
  81. data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
  82. data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -319
  83. data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
  84. data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
  85. data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
  86. data/vendor/html-to-markdown-rs/src/text.rs +25 -14
  87. data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
  88. data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
  89. data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
  90. data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
  91. data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
  92. data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
  93. data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
  94. data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
  95. data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
  96. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
  97. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
  98. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
  99. data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
  100. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
  101. data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
  102. data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
  103. data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
  104. data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
  105. data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
  106. data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
  107. data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
  108. data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
  109. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
  110. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
  111. data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
  112. data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
  113. data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
  114. data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
  115. data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
  116. data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
  117. data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
  118. data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
  119. data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
  120. data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
  121. data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
  122. data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
  123. data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
  124. data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
  125. data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
  126. data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
  127. data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
  128. data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
  129. data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
  130. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
  131. metadata +9 -37
  132. data/bin/benchmark.rb +0 -232
  133. data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
  134. data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
  135. data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
  136. data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
  137. data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
  138. data/spec/convert_spec.rb +0 -77
  139. data/spec/convert_with_tables_spec.rb +0 -194
  140. data/spec/metadata_extraction_spec.rb +0 -437
  141. data/spec/visitor_issue_187_spec.rb +0 -605
  142. data/spec/visitor_spec.rb +0 -1149
  143. data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
  144. data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
  145. data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
  146. data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
  147. data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
  148. data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
  149. data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
  150. data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
  151. data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
  152. data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -31
  153. data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
  154. data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
  155. data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
  156. data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
  157. data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
  158. data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
  159. data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
  160. data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
  161. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
  162. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
  163. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
  164. data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
  165. data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
  166. data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "html-to-markdown-rs"
3
- version = "2.29.0"
3
+ version = "3.0.0"
4
4
  edition = "2024"
5
5
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
6
6
  license = "MIT"
@@ -14,25 +14,22 @@ keywords = ["html", "markdown", "converter", "astral-tl", "doc-processing"]
14
14
  categories = ["parsing", "text-processing", "web-programming"]
15
15
 
16
16
  [package.metadata.cargo-machete]
17
- ignored = ["once_cell", "futures"]
17
+ ignored = ["once_cell", "ahash"]
18
18
 
19
19
  [lib]
20
20
  crate-type = ["rlib"]
21
21
 
22
22
  [features]
23
23
  default = ["metadata"]
24
- full = ["inline-images", "metadata", "visitor", "async-visitor", "serde"]
24
+ full = ["inline-images", "metadata", "visitor", "serde"]
25
25
  inline-images = ["dep:image"]
26
26
  metadata = ["dep:serde", "dep:serde_json"]
27
27
  visitor = []
28
- async-visitor = ["visitor", "dep:async-trait", "dep:futures", "dep:tokio"]
29
28
  serde = ["dep:serde", "dep:serde_json"]
30
29
 
31
30
  [dependencies]
32
31
  ahash = { version = "0.8", features = ["std", "compile-time-rng"], default-features = false }
33
- async-trait = { version = "0.1", optional = true }
34
32
  base64 = "0.22"
35
- futures = { version = "0.3", optional = true }
36
33
  html-escape = "0.2.13"
37
34
  html5ever = "0.39.0"
38
35
  image = { version = "0.25", default-features = false, features = [
@@ -43,16 +40,13 @@ image = { version = "0.25", default-features = false, features = [
43
40
  "webp",
44
41
  ], optional = true }
45
42
  lru = "0.16"
43
+ memchr = "2"
46
44
  once_cell = "1.21"
47
45
  regex = "1.12"
48
46
  serde = { version = "1.0", features = ["derive"], optional = true }
49
47
  serde_json = { version = "1.0", optional = true }
50
48
  thiserror = "2.0"
51
49
  tl = { package = "astral-tl", version = "0.7.11" }
52
- tokio = { version = "1.50", features = [
53
- "rt-multi-thread",
54
- "sync",
55
- ], optional = true }
56
50
 
57
51
  [dev-dependencies]
58
52
  serde = { version = "1.0", features = ["derive"] }
@@ -18,42 +18,80 @@ Fast, reliable HTML to Markdown conversion with full CommonMark compliance. Buil
18
18
 
19
19
  ```toml
20
20
  [dependencies]
21
- html-to-markdown-rs = "2.3"
21
+ html-to-markdown-rs = "3.0"
22
22
  ```
23
23
 
24
24
  ## Basic Usage
25
25
 
26
+ `convert()` returns a structured `ConversionResult` with the converted text, metadata, tables, and more:
27
+
26
28
  ```rust
27
- use html_to_markdown_rs::{convert, ConversionOptions};
29
+ use html_to_markdown_rs::convert;
28
30
 
29
31
  fn main() -> Result<(), Box<dyn std::error::Error>> {
30
32
  let html = r#"
31
- <h1>Welcome</h1>
32
- <p>This is <strong>fast</strong> conversion!</p>
33
- <ul>
34
- <li>Built with Rust</li>
35
- <li>CommonMark compliant</li>
36
- </ul>
33
+ <html lang="en">
34
+ <head><title>Welcome</title></head>
35
+ <body>
36
+ <h1>Welcome</h1>
37
+ <p>This is <strong>fast</strong> conversion!</p>
38
+ <ul>
39
+ <li>Built with Rust</li>
40
+ <li>CommonMark compliant</li>
41
+ </ul>
42
+ </body>
43
+ </html>
37
44
  "#;
38
45
 
39
- let markdown = convert(html, None)?;
40
- println!("{}", markdown);
46
+ let result = convert(html, None)?;
47
+ println!("{}", result.content.unwrap_or_default());
48
+
49
+ if let Some(metadata) = &result.metadata {
50
+ println!("Title: {:?}", metadata.document.title);
51
+ println!("Headers: {:?}", metadata.headers);
52
+ }
53
+
54
+ for table in &result.tables {
55
+ println!("Table with {} rows", table.cells.len());
56
+ }
57
+
41
58
  Ok(())
42
59
  }
43
60
  ```
44
61
 
45
62
  ## Error Handling
46
63
 
47
- Conversion returns a `Result<String, ConversionError>`. Inputs that look like binary data are rejected with
64
+ Conversion returns a `Result<ConversionResult, ConversionError>`. Inputs that look like binary data are rejected with
48
65
  `ConversionError::InvalidInput` to prevent runaway allocations. Table `colspan`/`rowspan` values are also clamped
49
66
  internally to keep output sizes bounded.
50
67
 
51
68
  ## Configuration
52
69
 
70
+ ### Builder Pattern
71
+
72
+ ```rust
73
+ use html_to_markdown_rs::{
74
+ convert, ConversionOptions, HeadingStyle, CodeBlockStyle,
75
+ };
76
+
77
+ let options = ConversionOptions::builder()
78
+ .heading_style(HeadingStyle::Atx)
79
+ .list_indent_width(2)
80
+ .bullets("-")
81
+ .autolinks(true)
82
+ .wrap(true)
83
+ .wrap_width(80)
84
+ .build();
85
+
86
+ let result = convert(html, Some(options))?;
87
+ println!("{}", result.content.unwrap_or_default());
88
+ ```
89
+
90
+ ### Struct Literal
91
+
53
92
  ```rust
54
93
  use html_to_markdown_rs::{
55
94
  convert, ConversionOptions, HeadingStyle, ListIndentType,
56
- PreprocessingOptions, PreprocessingPreset,
57
95
  };
58
96
 
59
97
  let options = ConversionOptions {
@@ -69,12 +107,13 @@ let options = ConversionOptions {
69
107
  ..Default::default()
70
108
  };
71
109
 
72
- let markdown = convert(html, Some(options))?;
110
+ let result = convert(html, Some(options))?;
111
+ println!("{}", result.content.unwrap_or_default());
73
112
  ```
74
113
 
75
114
  ### Preserving HTML Tags
76
115
 
77
- The `preserve_tags` option allows you to keep specific HTML tags in their original form instead of converting them to Markdown. This is useful for complex elements like tables that may not convert well:
116
+ The `preserve_tags` option allows you to keep specific HTML tags in their original form instead of converting them to Markdown:
78
117
 
79
118
  ```rust
80
119
  use html_to_markdown_rs::{convert, ConversionOptions};
@@ -93,18 +132,8 @@ let options = ConversionOptions {
93
132
  ..Default::default()
94
133
  };
95
134
 
96
- let markdown = convert(html, Some(options))?;
97
- // Result: "Before table\n\n<table class=\"data\">...</table>\n\nAfter table\n"
98
- ```
99
-
100
- You can preserve multiple tag types and combine with `strip_tags`:
101
-
102
- ```rust
103
- let options = ConversionOptions {
104
- preserve_tags: vec!["table".to_string(), "form".to_string()],
105
- strip_tags: vec!["script".to_string(), "style".to_string()],
106
- ..Default::default()
107
- };
135
+ let result = convert(html, Some(options))?;
136
+ // result.content => "Before table\n\n<table class=\"data\">...</table>\n\nAfter table\n"
108
137
  ```
109
138
 
110
139
  ## Web Scraping with Preprocessing
@@ -118,44 +147,62 @@ options.preprocessing.preset = html_to_markdown_rs::PreprocessingPreset::Aggress
118
147
  options.preprocessing.remove_navigation = true;
119
148
  options.preprocessing.remove_forms = true;
120
149
 
121
- let markdown = convert(scraped_html, Some(options))?;
150
+ let result = convert(scraped_html, Some(options))?;
151
+ println!("{}", result.content.unwrap_or_default());
122
152
  ```
123
153
 
124
- ## hOCR Table Extraction
154
+ ## Metadata Extraction
125
155
 
126
- ```rust
127
- use html_to_markdown_rs::convert;
156
+ Metadata is automatically included in the result. Configure which fields to extract via `MetadataConfig`:
128
157
 
129
- // hOCR documents (from Tesseract, etc.) are detected automatically.
130
- // Tables and spatial layout are reconstructed without additional options.
131
- let markdown = convert(hocr_html, None)?;
158
+ ```rust
159
+ use html_to_markdown_rs::{convert, ConversionOptions, MetadataConfig};
160
+
161
+ let options = ConversionOptions::builder()
162
+ .metadata_config(MetadataConfig {
163
+ extract_headers: true,
164
+ extract_links: true,
165
+ extract_images: false,
166
+ ..Default::default()
167
+ })
168
+ .build();
169
+
170
+ let result = convert(html, Some(options))?;
171
+ if let Some(metadata) = &result.metadata {
172
+ println!("Title: {:?}", metadata.document.title);
173
+ for header in &metadata.headers {
174
+ println!("H{}: {}", header.level, header.text);
175
+ }
176
+ for link in &metadata.links {
177
+ println!("Link: {} -> {}", link.text, link.href);
178
+ }
179
+ }
132
180
  ```
133
181
 
134
- ## Inline Image Extraction
182
+ ## Image Extraction
135
183
 
136
184
  ```rust
137
- use html_to_markdown_rs::{convert_with_inline_images, InlineImageConfig};
138
-
139
- let config = InlineImageConfig::new(5 * 1024 * 1024) // 5MB max
140
- .with_infer_dimensions(true)
141
- .with_filename_prefix("img_".to_string());
185
+ use html_to_markdown_rs::{convert, ConversionOptions};
142
186
 
143
- let extraction = convert_with_inline_images(html, None, config)?;
187
+ let options = ConversionOptions::builder()
188
+ .extract_images(true)
189
+ .max_image_size(5 * 1024 * 1024) // 5 MB max
190
+ .infer_dimensions(true)
191
+ .build();
144
192
 
145
- println!("{}", extraction.markdown);
146
- for (i, img) in extraction.inline_images.iter().enumerate() {
147
- println!("Image {}: {} ({} bytes)", i, img.format, img.data.len());
193
+ let result = convert(html, Some(options))?;
194
+ println!("{}", result.content.unwrap_or_default());
195
+ for img in &result.images {
196
+ println!("Image: {} ({} bytes)", img.src, img.data.as_ref().map_or(0, |d| d.len()));
148
197
  }
149
198
  ```
150
199
 
151
200
  ## Table Extraction
152
201
 
153
- Extract structured table data alongside the Markdown conversion. Each table found in the HTML is returned with its cell contents, header row flags, and rendered Markdown output.
154
-
155
- Requires the `visitor` feature.
202
+ Structured table data is always included in `ConversionResult.tables`:
156
203
 
157
204
  ```rust
158
- use html_to_markdown_rs::convert_with_tables;
205
+ use html_to_markdown_rs::convert;
159
206
 
160
207
  let html = r#"
161
208
  <table>
@@ -165,9 +212,9 @@ let html = r#"
165
212
  </table>
166
213
  "#;
167
214
 
168
- let result = convert_with_tables(html, None, None)?;
215
+ let result = convert(html, None)?;
169
216
 
170
- println!("{}", result.content);
217
+ println!("{}", result.content.unwrap_or_default());
171
218
  for table in &result.tables {
172
219
  println!("Table with {} rows:", table.cells.len());
173
220
  for (i, row) in table.cells.iter().enumerate() {
@@ -177,6 +224,34 @@ for table in &result.tables {
177
224
  }
178
225
  ```
179
226
 
227
+ ## Custom Visitors
228
+
229
+ ```rust
230
+ use html_to_markdown_rs::{convert, ConversionOptions};
231
+ use html_to_markdown_rs::visitor::{HtmlVisitor, NodeContext, VisitResult};
232
+
233
+ struct NoImagesVisitor;
234
+
235
+ impl HtmlVisitor for NoImagesVisitor {
236
+ fn visit_image(
237
+ &mut self,
238
+ _ctx: &NodeContext,
239
+ _src: &str,
240
+ _alt: &str,
241
+ _title: Option<&str>,
242
+ ) -> VisitResult {
243
+ VisitResult::Skip
244
+ }
245
+ }
246
+
247
+ let options = ConversionOptions::builder()
248
+ .visitor(Box::new(NoImagesVisitor))
249
+ .build();
250
+
251
+ let result = convert(html, Some(options))?;
252
+ println!("{}", result.content.unwrap_or_default());
253
+ ```
254
+
180
255
  ## Other Language Bindings
181
256
 
182
257
  This is the core Rust library. For other languages:
@@ -189,13 +264,14 @@ This is the core Rust library. For other languages:
189
264
 
190
265
  ## Documentation
191
266
 
192
- - [Full Documentation](https://github.com/kreuzberg-dev/html-to-markdown/blob/main/README.md)
267
+ - [Full Documentation](https://docs.html-to-markdown.kreuzberg.dev)
193
268
  - [API Reference](https://docs.rs/html-to-markdown-rs)
269
+ - [Migration Guide (v2 -> v3)](https://docs.html-to-markdown.kreuzberg.dev/migration/v3/)
194
270
  - [Contributing Guide](https://github.com/kreuzberg-dev/html-to-markdown/blob/main/CONTRIBUTING.md)
195
271
 
196
272
  ## Performance
197
273
 
198
- 10-30x faster than pure Python/JavaScript implementations, delivering 150-210 MB/s throughput.
274
+ 10-30x faster than pure Python/JavaScript implementations, delivering 150-280 MB/s throughput.
199
275
 
200
276
  ## License
201
277
 
@@ -1,6 +1,11 @@
1
1
  //! Example: Basic HTML to Markdown conversion
2
2
 
3
- use html_to_markdown_rs::convert;
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
4
9
 
5
10
  fn main() {
6
11
  let html = "<h1>Hello World</h1><p>This is a <strong>test</strong>.</p>";
@@ -1,6 +1,11 @@
1
1
  //! Example: Converting HTML tables to Markdown
2
2
 
3
- use html_to_markdown_rs::convert;
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
4
9
 
5
10
  fn main() {
6
11
  let html = r"<table>
@@ -1,6 +1,11 @@
1
1
  //! Example: Testing HTML escape sequences and special characters
2
2
 
3
- use html_to_markdown_rs::convert;
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
4
9
 
5
10
  fn main() {
6
11
  let html = "<p>Use *wildcards* for search</p>";
@@ -1,6 +1,12 @@
1
- //! Example: Testing inline formatting (bold, italic, code, etc.)
1
+ #![allow(missing_docs)]
2
+ fn convert(
3
+ html: &str,
4
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
5
+ ) -> html_to_markdown_rs::error::Result<String> {
6
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
7
+ }
2
8
 
3
- use html_to_markdown_rs::{ConversionOptions, convert};
9
+ use html_to_markdown_rs::ConversionOptions;
4
10
 
5
11
  fn main() {
6
12
  let html = "<p>This is <mark>highlighted</mark> text</p>";
@@ -1,6 +1,11 @@
1
1
  //! Example: Testing HTML list conversion (ordered and unordered lists)
2
2
 
3
- use html_to_markdown_rs::convert;
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
4
9
 
5
10
  fn main() {
6
11
  let html = "<ol><li>First item</li><li>Second item</li><li>Third item</li></ol>";
@@ -1,6 +1,11 @@
1
1
  //! Example: Testing HTML5 semantic tags (article, section, nav, etc.)
2
2
 
3
- use html_to_markdown_rs::convert;
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
4
9
 
5
10
  fn main() {
6
11
  let html = r"<article>
@@ -1,6 +1,11 @@
1
1
  //! Example: Converting HTML tables to Markdown
2
2
 
3
- use html_to_markdown_rs::convert;
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
4
9
 
5
10
  fn main() {
6
11
  let html = r"<table>
@@ -1,6 +1,11 @@
1
1
  //! Example: Testing task list conversion (checkboxes)
2
2
 
3
- use html_to_markdown_rs::convert;
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
4
9
 
5
10
  fn main() {
6
11
  let html = r#"<ul>
@@ -1,6 +1,11 @@
1
1
  //! Example: Testing whitespace handling and normalization
2
2
 
3
- use html_to_markdown_rs::convert;
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
4
9
 
5
10
  fn main() {
6
11
  let html = "<p>text with multiple spaces</p>";