html-to-markdown 2.30.0 → 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +6 -19
  3. data/README.md +37 -50
  4. data/ext/html-to-markdown-rb/native/Cargo.lock +13 -701
  5. data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
  6. data/ext/html-to-markdown-rb/native/README.md +4 -13
  7. data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
  8. data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
  9. data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
  10. data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
  11. data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
  12. data/lib/html_to_markdown/version.rb +1 -1
  13. data/lib/html_to_markdown.rb +13 -194
  14. data/sig/html_to_markdown.rbs +12 -373
  15. data/vendor/Cargo.toml +6 -3
  16. data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
  17. data/vendor/html-to-markdown-rs/README.md +126 -52
  18. data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
  19. data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
  20. data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
  21. data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
  22. data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
  23. data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
  24. data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
  25. data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
  26. data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
  27. data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
  28. data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
  29. data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
  30. data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
  31. data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
  32. data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
  33. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
  34. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
  35. data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
  36. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
  37. data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
  38. data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
  39. data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
  40. data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
  41. data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
  42. data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
  43. data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
  44. data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
  45. data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
  46. data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
  47. data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
  48. data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
  49. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
  50. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
  51. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
  52. data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
  53. data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
  54. data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
  55. data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
  56. data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -68
  57. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
  58. data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
  59. data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
  60. data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
  61. data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
  62. data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
  63. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
  64. data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
  65. data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
  66. data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
  67. data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
  68. data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
  69. data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
  70. data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
  71. data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
  72. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
  73. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
  74. data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
  75. data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
  76. data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
  77. data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
  78. data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
  79. data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
  80. data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
  81. data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
  82. data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -323
  83. data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
  84. data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
  85. data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
  86. data/vendor/html-to-markdown-rs/src/text.rs +25 -14
  87. data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
  88. data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
  89. data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
  90. data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
  91. data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
  92. data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
  93. data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
  94. data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
  95. data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
  96. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
  97. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
  98. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
  99. data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
  100. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
  101. data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
  102. data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
  103. data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
  104. data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
  105. data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
  106. data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
  107. data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
  108. data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
  109. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
  110. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
  111. data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
  112. data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
  113. data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
  114. data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
  115. data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
  116. data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
  117. data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
  118. data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
  119. data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
  120. data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
  121. data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
  122. data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
  123. data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
  124. data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
  125. data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
  126. data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
  127. data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
  128. data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
  129. data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
  130. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
  131. metadata +9 -37
  132. data/bin/benchmark.rb +0 -232
  133. data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
  134. data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
  135. data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
  136. data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
  137. data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
  138. data/spec/convert_spec.rb +0 -77
  139. data/spec/convert_with_tables_spec.rb +0 -194
  140. data/spec/metadata_extraction_spec.rb +0 -437
  141. data/spec/visitor_issue_187_spec.rb +0 -605
  142. data/spec/visitor_spec.rb +0 -1149
  143. data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
  144. data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
  145. data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
  146. data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
  147. data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
  148. data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
  149. data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
  150. data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
  151. data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
  152. data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -42
  153. data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
  154. data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
  155. data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
  156. data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
  157. data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
  158. data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
  159. data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
  160. data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
  161. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
  162. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
  163. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
  164. data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
  165. data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
  166. data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "html-to-markdown-rs"
3
- version = "2.30.0"
3
+ version = "3.0.1"
4
4
  edition = "2024"
5
5
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
6
6
  license = "MIT"
@@ -14,25 +14,22 @@ keywords = ["html", "markdown", "converter", "astral-tl", "doc-processing"]
14
14
  categories = ["parsing", "text-processing", "web-programming"]
15
15
 
16
16
  [package.metadata.cargo-machete]
17
- ignored = ["once_cell", "futures"]
17
+ ignored = ["once_cell", "ahash"]
18
18
 
19
19
  [lib]
20
20
  crate-type = ["rlib"]
21
21
 
22
22
  [features]
23
23
  default = ["metadata"]
24
- full = ["inline-images", "metadata", "visitor", "async-visitor", "serde"]
24
+ full = ["inline-images", "metadata", "visitor", "serde"]
25
25
  inline-images = ["dep:image"]
26
26
  metadata = ["dep:serde", "dep:serde_json"]
27
27
  visitor = []
28
- async-visitor = ["visitor", "dep:async-trait", "dep:futures", "dep:tokio"]
29
28
  serde = ["dep:serde", "dep:serde_json"]
30
29
 
31
30
  [dependencies]
32
31
  ahash = { version = "0.8", features = ["std", "compile-time-rng"], default-features = false }
33
- async-trait = { version = "0.1", optional = true }
34
32
  base64 = "0.22"
35
- futures = { version = "0.3", optional = true }
36
33
  html-escape = "0.2.13"
37
34
  html5ever = "0.39.0"
38
35
  image = { version = "0.25", default-features = false, features = [
@@ -43,16 +40,13 @@ image = { version = "0.25", default-features = false, features = [
43
40
  "webp",
44
41
  ], optional = true }
45
42
  lru = "0.16"
43
+ memchr = "2"
46
44
  once_cell = "1.21"
47
45
  regex = "1.12"
48
46
  serde = { version = "1.0", features = ["derive"], optional = true }
49
47
  serde_json = { version = "1.0", optional = true }
50
48
  thiserror = "2.0"
51
49
  tl = { package = "astral-tl", version = "0.7.11" }
52
- tokio = { version = "1.50", features = [
53
- "rt-multi-thread",
54
- "sync",
55
- ], optional = true }
56
50
 
57
51
  [dev-dependencies]
58
52
  serde = { version = "1.0", features = ["derive"] }
@@ -18,42 +18,80 @@ Fast, reliable HTML to Markdown conversion with full CommonMark compliance. Buil
18
18
 
19
19
  ```toml
20
20
  [dependencies]
21
- html-to-markdown-rs = "2.3"
21
+ html-to-markdown-rs = "3.0"
22
22
  ```
23
23
 
24
24
  ## Basic Usage
25
25
 
26
+ `convert()` returns a structured `ConversionResult` with the converted text, metadata, tables, and more:
27
+
26
28
  ```rust
27
- use html_to_markdown_rs::{convert, ConversionOptions};
29
+ use html_to_markdown_rs::convert;
28
30
 
29
31
  fn main() -> Result<(), Box<dyn std::error::Error>> {
30
32
  let html = r#"
31
- <h1>Welcome</h1>
32
- <p>This is <strong>fast</strong> conversion!</p>
33
- <ul>
34
- <li>Built with Rust</li>
35
- <li>CommonMark compliant</li>
36
- </ul>
33
+ <html lang="en">
34
+ <head><title>Welcome</title></head>
35
+ <body>
36
+ <h1>Welcome</h1>
37
+ <p>This is <strong>fast</strong> conversion!</p>
38
+ <ul>
39
+ <li>Built with Rust</li>
40
+ <li>CommonMark compliant</li>
41
+ </ul>
42
+ </body>
43
+ </html>
37
44
  "#;
38
45
 
39
- let markdown = convert(html, None)?;
40
- println!("{}", markdown);
46
+ let result = convert(html, None)?;
47
+ println!("{}", result.content.unwrap_or_default());
48
+
49
+ if let Some(metadata) = &result.metadata {
50
+ println!("Title: {:?}", metadata.document.title);
51
+ println!("Headers: {:?}", metadata.headers);
52
+ }
53
+
54
+ for table in &result.tables {
55
+ println!("Table with {} rows", table.cells.len());
56
+ }
57
+
41
58
  Ok(())
42
59
  }
43
60
  ```
44
61
 
45
62
  ## Error Handling
46
63
 
47
- Conversion returns a `Result<String, ConversionError>`. Inputs that look like binary data are rejected with
64
+ Conversion returns a `Result<ConversionResult, ConversionError>`. Inputs that look like binary data are rejected with
48
65
  `ConversionError::InvalidInput` to prevent runaway allocations. Table `colspan`/`rowspan` values are also clamped
49
66
  internally to keep output sizes bounded.
50
67
 
51
68
  ## Configuration
52
69
 
70
+ ### Builder Pattern
71
+
72
+ ```rust
73
+ use html_to_markdown_rs::{
74
+ convert, ConversionOptions, HeadingStyle, CodeBlockStyle,
75
+ };
76
+
77
+ let options = ConversionOptions::builder()
78
+ .heading_style(HeadingStyle::Atx)
79
+ .list_indent_width(2)
80
+ .bullets("-")
81
+ .autolinks(true)
82
+ .wrap(true)
83
+ .wrap_width(80)
84
+ .build();
85
+
86
+ let result = convert(html, Some(options))?;
87
+ println!("{}", result.content.unwrap_or_default());
88
+ ```
89
+
90
+ ### Struct Literal
91
+
53
92
  ```rust
54
93
  use html_to_markdown_rs::{
55
94
  convert, ConversionOptions, HeadingStyle, ListIndentType,
56
- PreprocessingOptions, PreprocessingPreset,
57
95
  };
58
96
 
59
97
  let options = ConversionOptions {
@@ -69,12 +107,13 @@ let options = ConversionOptions {
69
107
  ..Default::default()
70
108
  };
71
109
 
72
- let markdown = convert(html, Some(options))?;
110
+ let result = convert(html, Some(options))?;
111
+ println!("{}", result.content.unwrap_or_default());
73
112
  ```
74
113
 
75
114
  ### Preserving HTML Tags
76
115
 
77
- The `preserve_tags` option allows you to keep specific HTML tags in their original form instead of converting them to Markdown. This is useful for complex elements like tables that may not convert well:
116
+ The `preserve_tags` option allows you to keep specific HTML tags in their original form instead of converting them to Markdown:
78
117
 
79
118
  ```rust
80
119
  use html_to_markdown_rs::{convert, ConversionOptions};
@@ -93,18 +132,8 @@ let options = ConversionOptions {
93
132
  ..Default::default()
94
133
  };
95
134
 
96
- let markdown = convert(html, Some(options))?;
97
- // Result: "Before table\n\n<table class=\"data\">...</table>\n\nAfter table\n"
98
- ```
99
-
100
- You can preserve multiple tag types and combine with `strip_tags`:
101
-
102
- ```rust
103
- let options = ConversionOptions {
104
- preserve_tags: vec!["table".to_string(), "form".to_string()],
105
- strip_tags: vec!["script".to_string(), "style".to_string()],
106
- ..Default::default()
107
- };
135
+ let result = convert(html, Some(options))?;
136
+ // result.content => "Before table\n\n<table class=\"data\">...</table>\n\nAfter table\n"
108
137
  ```
109
138
 
110
139
  ## Web Scraping with Preprocessing
@@ -118,46 +147,62 @@ options.preprocessing.preset = html_to_markdown_rs::PreprocessingPreset::Aggress
118
147
  options.preprocessing.remove_navigation = true;
119
148
  options.preprocessing.remove_forms = true;
120
149
 
121
- let markdown = convert(scraped_html, Some(options))?;
150
+ let result = convert(scraped_html, Some(options))?;
151
+ println!("{}", result.content.unwrap_or_default());
122
152
  ```
123
153
 
124
- ## hOCR Table Extraction (Deprecated)
154
+ ## Metadata Extraction
125
155
 
126
- > **Deprecated since 2.30.0**: hOCR support will be removed in v3.
156
+ Metadata is automatically included in the result. Configure which fields to extract via `MetadataConfig`:
127
157
 
128
158
  ```rust
129
- use html_to_markdown_rs::convert;
130
-
131
- // hOCR documents (from Tesseract, etc.) are detected automatically.
132
- // Tables and spatial layout are reconstructed without additional options.
133
- let markdown = convert(hocr_html, None)?;
159
+ use html_to_markdown_rs::{convert, ConversionOptions, MetadataConfig};
160
+
161
+ let options = ConversionOptions::builder()
162
+ .metadata_config(MetadataConfig {
163
+ extract_headers: true,
164
+ extract_links: true,
165
+ extract_images: false,
166
+ ..Default::default()
167
+ })
168
+ .build();
169
+
170
+ let result = convert(html, Some(options))?;
171
+ if let Some(metadata) = &result.metadata {
172
+ println!("Title: {:?}", metadata.document.title);
173
+ for header in &metadata.headers {
174
+ println!("H{}: {}", header.level, header.text);
175
+ }
176
+ for link in &metadata.links {
177
+ println!("Link: {} -> {}", link.text, link.href);
178
+ }
179
+ }
134
180
  ```
135
181
 
136
- ## Inline Image Extraction
182
+ ## Image Extraction
137
183
 
138
184
  ```rust
139
- use html_to_markdown_rs::{convert_with_inline_images, InlineImageConfig};
140
-
141
- let config = InlineImageConfig::new(5 * 1024 * 1024) // 5MB max
142
- .with_infer_dimensions(true)
143
- .with_filename_prefix("img_".to_string());
185
+ use html_to_markdown_rs::{convert, ConversionOptions};
144
186
 
145
- let extraction = convert_with_inline_images(html, None, config)?;
187
+ let options = ConversionOptions::builder()
188
+ .extract_images(true)
189
+ .max_image_size(5 * 1024 * 1024) // 5 MB max
190
+ .infer_dimensions(true)
191
+ .build();
146
192
 
147
- println!("{}", extraction.markdown);
148
- for (i, img) in extraction.inline_images.iter().enumerate() {
149
- println!("Image {}: {} ({} bytes)", i, img.format, img.data.len());
193
+ let result = convert(html, Some(options))?;
194
+ println!("{}", result.content.unwrap_or_default());
195
+ for img in &result.images {
196
+ println!("Image: {} ({} bytes)", img.src, img.data.as_ref().map_or(0, |d| d.len()));
150
197
  }
151
198
  ```
152
199
 
153
200
  ## Table Extraction
154
201
 
155
- Extract structured table data alongside the Markdown conversion. Each table found in the HTML is returned with its cell contents, header row flags, and rendered Markdown output.
156
-
157
- Requires the `visitor` feature.
202
+ Structured table data is always included in `ConversionResult.tables`:
158
203
 
159
204
  ```rust
160
- use html_to_markdown_rs::convert_with_tables;
205
+ use html_to_markdown_rs::convert;
161
206
 
162
207
  let html = r#"
163
208
  <table>
@@ -167,9 +212,9 @@ let html = r#"
167
212
  </table>
168
213
  "#;
169
214
 
170
- let result = convert_with_tables(html, None, None)?;
215
+ let result = convert(html, None)?;
171
216
 
172
- println!("{}", result.content);
217
+ println!("{}", result.content.unwrap_or_default());
173
218
  for table in &result.tables {
174
219
  println!("Table with {} rows:", table.cells.len());
175
220
  for (i, row) in table.cells.iter().enumerate() {
@@ -179,6 +224,34 @@ for table in &result.tables {
179
224
  }
180
225
  ```
181
226
 
227
+ ## Custom Visitors
228
+
229
+ ```rust
230
+ use html_to_markdown_rs::{convert, ConversionOptions};
231
+ use html_to_markdown_rs::visitor::{HtmlVisitor, NodeContext, VisitResult};
232
+
233
+ struct NoImagesVisitor;
234
+
235
+ impl HtmlVisitor for NoImagesVisitor {
236
+ fn visit_image(
237
+ &mut self,
238
+ _ctx: &NodeContext,
239
+ _src: &str,
240
+ _alt: &str,
241
+ _title: Option<&str>,
242
+ ) -> VisitResult {
243
+ VisitResult::Skip
244
+ }
245
+ }
246
+
247
+ let options = ConversionOptions::builder()
248
+ .visitor(Box::new(NoImagesVisitor))
249
+ .build();
250
+
251
+ let result = convert(html, Some(options))?;
252
+ println!("{}", result.content.unwrap_or_default());
253
+ ```
254
+
182
255
  ## Other Language Bindings
183
256
 
184
257
  This is the core Rust library. For other languages:
@@ -191,13 +264,14 @@ This is the core Rust library. For other languages:
191
264
 
192
265
  ## Documentation
193
266
 
194
- - [Full Documentation](https://github.com/kreuzberg-dev/html-to-markdown/blob/main/README.md)
267
+ - [Full Documentation](https://docs.html-to-markdown.kreuzberg.dev)
195
268
  - [API Reference](https://docs.rs/html-to-markdown-rs)
269
+ - [Migration Guide (v2 -> v3)](https://docs.html-to-markdown.kreuzberg.dev/migration/v3/)
196
270
  - [Contributing Guide](https://github.com/kreuzberg-dev/html-to-markdown/blob/main/CONTRIBUTING.md)
197
271
 
198
272
  ## Performance
199
273
 
200
- 10-30x faster than pure Python/JavaScript implementations, delivering 150-210 MB/s throughput.
274
+ 10-30x faster than pure Python/JavaScript implementations, delivering 150-280 MB/s throughput.
201
275
 
202
276
  ## License
203
277
 
@@ -1,6 +1,11 @@
1
1
  //! Example: Basic HTML to Markdown conversion
2
2
 
3
- use html_to_markdown_rs::convert;
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
4
9
 
5
10
  fn main() {
6
11
  let html = "<h1>Hello World</h1><p>This is a <strong>test</strong>.</p>";
@@ -1,6 +1,11 @@
1
1
  //! Example: Converting HTML tables to Markdown
2
2
 
3
- use html_to_markdown_rs::convert;
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
4
9
 
5
10
  fn main() {
6
11
  let html = r"<table>
@@ -1,6 +1,11 @@
1
1
  //! Example: Testing HTML escape sequences and special characters
2
2
 
3
- use html_to_markdown_rs::convert;
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
4
9
 
5
10
  fn main() {
6
11
  let html = "<p>Use *wildcards* for search</p>";
@@ -1,6 +1,12 @@
1
- //! Example: Testing inline formatting (bold, italic, code, etc.)
1
+ #![allow(missing_docs)]
2
+ fn convert(
3
+ html: &str,
4
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
5
+ ) -> html_to_markdown_rs::error::Result<String> {
6
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
7
+ }
2
8
 
3
- use html_to_markdown_rs::{ConversionOptions, convert};
9
+ use html_to_markdown_rs::ConversionOptions;
4
10
 
5
11
  fn main() {
6
12
  let html = "<p>This is <mark>highlighted</mark> text</p>";
@@ -1,6 +1,11 @@
1
1
  //! Example: Testing HTML list conversion (ordered and unordered lists)
2
2
 
3
- use html_to_markdown_rs::convert;
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
4
9
 
5
10
  fn main() {
6
11
  let html = "<ol><li>First item</li><li>Second item</li><li>Third item</li></ol>";
@@ -1,6 +1,11 @@
1
1
  //! Example: Testing HTML5 semantic tags (article, section, nav, etc.)
2
2
 
3
- use html_to_markdown_rs::convert;
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
4
9
 
5
10
  fn main() {
6
11
  let html = r"<article>
@@ -1,6 +1,11 @@
1
1
  //! Example: Converting HTML tables to Markdown
2
2
 
3
- use html_to_markdown_rs::convert;
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
4
9
 
5
10
  fn main() {
6
11
  let html = r"<table>
@@ -1,6 +1,11 @@
1
1
  //! Example: Testing task list conversion (checkboxes)
2
2
 
3
- use html_to_markdown_rs::convert;
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
4
9
 
5
10
  fn main() {
6
11
  let html = r#"<ul>
@@ -1,6 +1,11 @@
1
1
  //! Example: Testing whitespace handling and normalization
2
2
 
3
- use html_to_markdown_rs::convert;
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
4
9
 
5
10
  fn main() {
6
11
  let html = "<p>text with multiple spaces</p>";