html-to-markdown 2.30.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +4 -14
  3. data/README.md +37 -50
  4. data/ext/html-to-markdown-rb/native/Cargo.lock +13 -701
  5. data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
  6. data/ext/html-to-markdown-rb/native/README.md +4 -13
  7. data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
  8. data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
  9. data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
  10. data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
  11. data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
  12. data/lib/html_to_markdown/version.rb +1 -1
  13. data/lib/html_to_markdown.rb +13 -194
  14. data/sig/html_to_markdown.rbs +12 -373
  15. data/vendor/Cargo.toml +5 -2
  16. data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
  17. data/vendor/html-to-markdown-rs/README.md +126 -52
  18. data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
  19. data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
  20. data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
  21. data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
  22. data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
  23. data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
  24. data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
  25. data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
  26. data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
  27. data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
  28. data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
  29. data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
  30. data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
  31. data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
  32. data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
  33. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
  34. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
  35. data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
  36. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
  37. data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
  38. data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
  39. data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
  40. data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
  41. data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
  42. data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
  43. data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
  44. data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
  45. data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
  46. data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
  47. data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
  48. data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
  49. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
  50. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
  51. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
  52. data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
  53. data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
  54. data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
  55. data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
  56. data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -68
  57. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
  58. data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
  59. data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
  60. data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
  61. data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
  62. data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
  63. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
  64. data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
  65. data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
  66. data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
  67. data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
  68. data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
  69. data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
  70. data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
  71. data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
  72. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
  73. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
  74. data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
  75. data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
  76. data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
  77. data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
  78. data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
  79. data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
  80. data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
  81. data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
  82. data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -323
  83. data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
  84. data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
  85. data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
  86. data/vendor/html-to-markdown-rs/src/text.rs +25 -14
  87. data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
  88. data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
  89. data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
  90. data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
  91. data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
  92. data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
  93. data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
  94. data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
  95. data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
  96. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
  97. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
  98. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
  99. data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
  100. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
  101. data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
  102. data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
  103. data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
  104. data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
  105. data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
  106. data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
  107. data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
  108. data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
  109. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
  110. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
  111. data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
  112. data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
  113. data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
  114. data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
  115. data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
  116. data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
  117. data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
  118. data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
  119. data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
  120. data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
  121. data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
  122. data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
  123. data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
  124. data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
  125. data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
  126. data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
  127. data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
  128. data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
  129. data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
  130. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
  131. metadata +9 -37
  132. data/bin/benchmark.rb +0 -232
  133. data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
  134. data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
  135. data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
  136. data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
  137. data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
  138. data/spec/convert_spec.rb +0 -77
  139. data/spec/convert_with_tables_spec.rb +0 -194
  140. data/spec/metadata_extraction_spec.rb +0 -437
  141. data/spec/visitor_issue_187_spec.rb +0 -605
  142. data/spec/visitor_spec.rb +0 -1149
  143. data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
  144. data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
  145. data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
  146. data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
  147. data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
  148. data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
  149. data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
  150. data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
  151. data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
  152. data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -42
  153. data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
  154. data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
  155. data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
  156. data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
  157. data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
  158. data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
  159. data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
  160. data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
  161. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
  162. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
  163. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
  164. data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
  165. data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
  166. data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
@@ -1,26 +1,23 @@
1
- //! Main HTML to Markdown conversion APIs.
1
+ //! Main HTML to Markdown conversion API.
2
2
  //!
3
- //! This module provides the primary public functions for converting HTML to Markdown,
4
- //! including support for metadata extraction, inline image collection, and custom visitors.
3
+ //! This module provides the primary `convert()` function for converting HTML to Markdown.
5
4
 
6
5
  use std::borrow::Cow;
7
6
 
8
7
  use crate::error::Result;
9
8
  use crate::options::{ConversionOptions, WhitespaceMode};
10
9
  use crate::text;
10
+ use crate::types::ConversionResult;
11
11
  use crate::validation::{Utf16Encoding, detect_utf16_encoding, validate_input};
12
12
  use crate::{ConversionError, ConversionOptionsUpdate};
13
13
 
14
- #[cfg(feature = "visitor")]
15
- use crate::visitor;
16
- #[cfg(feature = "async-visitor")]
17
- use crate::visitor_helpers;
18
- #[cfg(feature = "metadata")]
19
- use crate::{ExtendedMetadata, MetadataConfig};
20
14
  #[cfg(feature = "inline-images")]
21
- use crate::{HtmlExtraction, InlineImageConfig};
15
+ use crate::InlineImageConfig;
16
+ #[cfg(feature = "metadata")]
17
+ use crate::{HtmlMetadata, MetadataConfig};
22
18
 
23
- /// Convert HTML to Markdown.
19
+ /// Convert HTML to Markdown, returning a [`ConversionResult`] with content, metadata, images,
20
+ /// and warnings.
24
21
  ///
25
22
  /// # Arguments
26
23
  ///
@@ -33,265 +30,121 @@ use crate::{HtmlExtraction, InlineImageConfig};
33
30
  /// use html_to_markdown_rs::{convert, ConversionOptions};
34
31
  ///
35
32
  /// let html = "<h1>Hello World</h1>";
36
- /// let markdown = convert(html, None).unwrap();
37
- /// assert!(markdown.contains("Hello World"));
33
+ /// let result = convert(html, None).unwrap();
34
+ /// assert!(result.content.as_deref().unwrap_or("").contains("Hello World"));
38
35
  /// ```
36
+ ///
39
37
  /// # Errors
40
38
  ///
41
39
  /// Returns an error if HTML parsing fails or if the input contains invalid UTF-8.
42
- pub fn convert(html: &str, options: Option<ConversionOptions>) -> Result<String> {
40
+ pub fn convert(html: &str, options: Option<ConversionOptions>) -> Result<ConversionResult> {
41
+ use std::cell::RefCell;
42
+ use std::rc::Rc;
43
+
43
44
  let options = options.unwrap_or_default();
44
45
 
45
46
  let normalized_html = normalize_input(html)?;
46
47
 
48
+ // Fast path: plain text with no HTML tags — skip full parsing pipeline.
47
49
  if !options.wrap {
48
50
  if let Some(markdown) = fast_text_only(normalized_html.as_ref(), &options) {
49
- return Ok(markdown);
51
+ return Ok(ConversionResult {
52
+ content: Some(markdown),
53
+ ..ConversionResult::default()
54
+ });
50
55
  }
51
56
  }
52
57
 
53
- let markdown = crate::converter::convert_html(normalized_html.as_ref(), &options)?;
54
-
55
- if options.wrap {
56
- Ok(crate::wrapper::wrap_markdown(&markdown, &options))
57
- } else {
58
- Ok(markdown)
59
- }
60
- }
61
-
62
- /// Convert HTML to Markdown while collecting inline image assets (requires the `inline-images` feature).
63
- ///
64
- /// Extracts inline image data URIs and inline `<svg>` elements alongside Markdown conversion.
65
- ///
66
- /// # Arguments
67
- ///
68
- /// * `html` - The HTML string to convert
69
- /// * `options` - Optional conversion options (defaults to `ConversionOptions::default()`)
70
- /// * `image_cfg` - Configuration controlling inline image extraction
71
- /// * `visitor` - Optional visitor for customizing conversion behavior. Only used if `visitor` feature is enabled.
72
- /// # Errors
73
- ///
74
- /// Returns an error if HTML parsing fails or if the input contains invalid UTF-8.
75
- #[cfg(feature = "inline-images")]
76
- pub fn convert_with_inline_images(
77
- html: &str,
78
- options: Option<ConversionOptions>,
79
- image_cfg: InlineImageConfig,
80
- #[cfg(feature = "visitor")] visitor: Option<visitor::VisitorHandle>,
81
- #[cfg(not(feature = "visitor"))] _visitor: Option<()>,
82
- ) -> Result<HtmlExtraction> {
83
- use std::cell::RefCell;
84
- use std::rc::Rc;
85
-
86
- let options = options.unwrap_or_default();
87
-
88
- let normalized_html = normalize_input(html)?;
89
-
90
- let collector = Rc::new(RefCell::new(crate::inline_images::InlineImageCollector::new(
91
- image_cfg,
92
- )?));
58
+ // Determine whether metadata / inline-image extraction is requested.
59
+ #[cfg(feature = "metadata")]
60
+ let wants_metadata = options.extract_metadata;
61
+ #[cfg(not(feature = "metadata"))]
62
+ let wants_metadata = false;
93
63
 
94
- #[cfg(feature = "visitor")]
95
- let markdown = crate::converter::convert_html_impl(
96
- normalized_html.as_ref(),
97
- &options,
98
- Some(Rc::clone(&collector)),
99
- None,
100
- visitor,
101
- )?;
102
- #[cfg(not(feature = "visitor"))]
103
- let markdown = crate::converter::convert_html_impl(
104
- normalized_html.as_ref(),
105
- &options,
106
- Some(Rc::clone(&collector)),
107
- None,
108
- None,
109
- )?;
64
+ #[cfg(feature = "inline-images")]
65
+ let wants_images = options.extract_images;
66
+ #[cfg(not(feature = "inline-images"))]
67
+ let wants_images = false;
110
68
 
111
- let markdown = if options.wrap {
112
- crate::wrapper::wrap_markdown(&markdown, &options)
69
+ // Build optional collectors based on requested features.
70
+ #[cfg(feature = "metadata")]
71
+ let metadata_collector = if wants_metadata {
72
+ Some(Rc::new(RefCell::new(crate::metadata::MetadataCollector::new(
73
+ MetadataConfig::default(),
74
+ ))))
113
75
  } else {
114
- markdown
76
+ None
115
77
  };
116
78
 
117
- let collector = Rc::try_unwrap(collector)
118
- .map_err(|_| ConversionError::Other("failed to recover inline image state".to_string()))?
119
- .into_inner();
120
- let (inline_images, warnings) = collector.finish();
121
-
122
- Ok(HtmlExtraction {
123
- markdown,
124
- inline_images,
125
- warnings,
126
- })
127
- }
128
-
129
- /// Convert HTML to Markdown with comprehensive metadata extraction (requires the `metadata` feature).
130
- ///
131
- /// Performs HTML-to-Markdown conversion while simultaneously extracting structured metadata in a
132
- /// single pass for maximum efficiency. Ideal for content analysis, SEO optimization, and document
133
- /// indexing workflows.
134
- ///
135
- /// # Arguments
136
- ///
137
- /// * `html` - The HTML string to convert. Will normalize line endings (CRLF → LF).
138
- /// * `options` - Optional conversion configuration. Defaults to `ConversionOptions::default()` if `None`.
139
- /// Controls heading style, list indentation, escape behavior, wrapping, and other output formatting.
140
- /// * `metadata_cfg` - Configuration for metadata extraction granularity. Use `MetadataConfig::default()`
141
- /// to extract all metadata types, or customize with selective extraction flags.
142
- /// * `visitor` - Optional visitor for customizing conversion behavior. Only used if `visitor` feature is enabled.
143
- ///
144
- /// # Returns
145
- ///
146
- /// On success, returns a tuple of:
147
- /// - `String`: The converted Markdown output
148
- /// - `ExtendedMetadata`: Comprehensive metadata containing:
149
- /// - `document`: Title, description, author, language, Open Graph, Twitter Card, and other meta tags
150
- /// - `headers`: All heading elements (h1-h6) with hierarchy and IDs
151
- /// - `links`: Hyperlinks classified as anchor, internal, external, email, or phone
152
- /// - `images`: Image elements with source, dimensions, and alt text
153
- /// - `structured_data`: JSON-LD, Microdata, and `RDFa` blocks
154
- ///
155
- /// # Errors
156
- ///
157
- /// Returns `ConversionError` if:
158
- /// - HTML parsing fails
159
- /// - Invalid UTF-8 sequences encountered
160
- /// - Internal panic during conversion (wrapped in `ConversionError::Panic`)
161
- /// - Configuration size limits exceeded
162
- ///
163
- /// # Performance Notes
164
- ///
165
- /// - Single-pass collection: metadata extraction has minimal overhead
166
- /// - Zero cost when metadata feature is disabled
167
- /// - Pre-allocated buffers: typically handles 50+ headers, 100+ links, 20+ images efficiently
168
- /// - Structured data size-limited to prevent memory exhaustion (configurable)
169
- ///
170
- /// # Example: Basic Usage
171
- ///
172
- /// ```ignore
173
- /// use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
174
- ///
175
- /// let html = r#"
176
- /// <html lang="en">
177
- /// <head><title>My Article</title></head>
178
- /// <body>
179
- /// <h1 id="intro">Introduction</h1>
180
- /// <p>Welcome to <a href="https://example.com">our site</a></p>
181
- /// </body>
182
- /// </html>
183
- /// "#;
184
- ///
185
- /// let (markdown, metadata) = convert_with_metadata(html, None, MetadataConfig::default(), None)?;
186
- ///
187
- /// assert_eq!(metadata.document.title, Some("My Article".to_string()));
188
- /// assert_eq!(metadata.document.language, Some("en".to_string()));
189
- /// assert_eq!(metadata.headers[0].text, "Introduction");
190
- /// assert_eq!(metadata.headers[0].id, Some("intro".to_string()));
191
- /// assert_eq!(metadata.links.len(), 1);
192
- /// # Ok::<(), html_to_markdown_rs::ConversionError>(())
193
- /// ```
194
- ///
195
- /// # Example: Selective Metadata Extraction
196
- ///
197
- /// ```ignore
198
- /// use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
199
- ///
200
- /// let html = "<html><body><h1>Title</h1><a href='#anchor'>Link</a></body></html>";
201
- ///
202
- /// // Extract only headers and document metadata, skip links/images
203
- /// let config = MetadataConfig {
204
- /// extract_headers: true,
205
- /// extract_links: false,
206
- /// extract_images: false,
207
- /// extract_structured_data: false,
208
- /// max_structured_data_size: 0,
209
- /// };
210
- ///
211
- /// let (markdown, metadata) = convert_with_metadata(html, None, config, None)?;
212
- /// assert!(metadata.headers.len() > 0);
213
- /// assert!(metadata.links.is_empty()); // Not extracted
214
- /// # Ok::<(), html_to_markdown_rs::ConversionError>(())
215
- /// ```
216
- ///
217
- /// # Example: With Conversion Options and Metadata Config
218
- ///
219
- /// ```ignore
220
- /// use html_to_markdown_rs::{convert_with_metadata, ConversionOptions, MetadataConfig, HeadingStyle};
221
- ///
222
- /// let html = "<html><head><title>Blog Post</title></head><body><h1>Hello</h1></body></html>";
223
- ///
224
- /// let options = ConversionOptions {
225
- /// heading_style: HeadingStyle::Atx,
226
- /// wrap: true,
227
- /// wrap_width: 80,
228
- /// ..Default::default()
229
- /// };
230
- ///
231
- /// let metadata_cfg = MetadataConfig::default();
232
- ///
233
- /// let (markdown, metadata) = convert_with_metadata(html, Some(options), metadata_cfg, None)?;
234
- /// // Markdown will use ATX-style headings (# H1, ## H2, etc.)
235
- /// // Wrapped at 80 characters
236
- /// // All metadata extracted
237
- /// # Ok::<(), html_to_markdown_rs::ConversionError>(())
238
- /// ```
239
- ///
240
- /// # See Also
241
- ///
242
- /// - [`convert`] - Simple HTML to Markdown conversion without metadata
243
- /// - [`convert_with_inline_images`] - Conversion with inline image extraction
244
- /// - [`MetadataConfig`] - Configuration for metadata extraction
245
- /// - [`ExtendedMetadata`] - Metadata structure documentation
246
- /// - [`metadata`] module - Detailed type documentation for metadata components
247
- #[cfg(feature = "metadata")]
248
- pub fn convert_with_metadata(
249
- html: &str,
250
- options: Option<ConversionOptions>,
251
- metadata_cfg: MetadataConfig,
252
- #[cfg(feature = "visitor")] visitor: Option<visitor::VisitorHandle>,
253
- #[cfg(not(feature = "visitor"))] _visitor: Option<()>,
254
- ) -> Result<(String, ExtendedMetadata)> {
255
- use std::cell::RefCell;
256
- use std::rc::Rc;
257
-
258
- // Disable YAML frontmatter prepending: metadata is returned as a struct,
259
- // so embedding it in the content string is redundant and pollutes the output.
260
- let mut options = options.unwrap_or_default();
261
- options.extract_metadata = false;
79
+ #[cfg(feature = "inline-images")]
80
+ let image_collector = if wants_images {
81
+ use crate::inline_images::{DEFAULT_INLINE_IMAGE_LIMIT, InlineImageConfig as IIC};
82
+ Some(Rc::new(RefCell::new(crate::inline_images::InlineImageCollector::new(
83
+ IIC::new(DEFAULT_INLINE_IMAGE_LIMIT),
84
+ )?)))
85
+ } else {
86
+ None
87
+ };
262
88
 
263
- let normalized_html = normalize_input(html)?;
264
- if !metadata_cfg.any_enabled() {
265
- #[cfg(feature = "visitor")]
266
- let markdown = crate::converter::convert_html_impl(normalized_html.as_ref(), &options, None, None, visitor)?;
267
- #[cfg(not(feature = "visitor"))]
268
- let markdown = crate::converter::convert_html_impl(normalized_html.as_ref(), &options, None, None, None)?;
269
- let markdown = if options.wrap {
270
- crate::wrapper::wrap_markdown(&markdown, &options)
89
+ // Build optional structure collector when requested.
90
+ let structure_collector: Option<std::rc::Rc<std::cell::RefCell<crate::types::StructureCollector>>> =
91
+ if options.include_document_structure {
92
+ Some(std::rc::Rc::new(std::cell::RefCell::new(
93
+ crate::types::StructureCollector::new(),
94
+ )))
271
95
  } else {
272
- markdown
96
+ None
273
97
  };
274
- return Ok((markdown, ExtendedMetadata::default()));
275
- }
276
98
 
277
- let metadata_collector = Rc::new(RefCell::new(crate::metadata::MetadataCollector::new(metadata_cfg)));
278
-
279
- #[cfg(feature = "visitor")]
280
- let markdown = crate::converter::convert_html_impl(
281
- normalized_html.as_ref(),
282
- &options,
283
- None,
284
- Some(Rc::clone(&metadata_collector)),
285
- visitor,
286
- )?;
287
- #[cfg(not(feature = "visitor"))]
288
- let markdown = crate::converter::convert_html_impl(
289
- normalized_html.as_ref(),
290
- &options,
291
- None,
292
- Some(Rc::clone(&metadata_collector)),
293
- None,
294
- )?;
99
+ // Run the conversion pipeline.
100
+ // Pass structure_collector by value — convert_html_impl will consume it via Rc::try_unwrap
101
+ // to return the finished DocumentStructure. We must not hold a second Rc reference.
102
+ let (markdown, document) = {
103
+ #[cfg(all(feature = "metadata", feature = "inline-images"))]
104
+ {
105
+ crate::converter::convert_html_impl(
106
+ normalized_html.as_ref(),
107
+ &options,
108
+ image_collector.as_ref().map(Rc::clone),
109
+ metadata_collector.as_ref().map(Rc::clone),
110
+ None,
111
+ structure_collector,
112
+ )?
113
+ }
114
+ #[cfg(all(feature = "metadata", not(feature = "inline-images")))]
115
+ {
116
+ crate::converter::convert_html_impl(
117
+ normalized_html.as_ref(),
118
+ &options,
119
+ None,
120
+ metadata_collector.as_ref().map(Rc::clone),
121
+ None,
122
+ structure_collector,
123
+ )?
124
+ }
125
+ #[cfg(all(not(feature = "metadata"), feature = "inline-images"))]
126
+ {
127
+ crate::converter::convert_html_impl(
128
+ normalized_html.as_ref(),
129
+ &options,
130
+ image_collector.as_ref().map(Rc::clone),
131
+ None,
132
+ None,
133
+ structure_collector,
134
+ )?
135
+ }
136
+ #[cfg(all(not(feature = "metadata"), not(feature = "inline-images")))]
137
+ {
138
+ crate::converter::convert_html_impl(
139
+ normalized_html.as_ref(),
140
+ &options,
141
+ None,
142
+ None,
143
+ None,
144
+ structure_collector,
145
+ )?
146
+ }
147
+ };
295
148
 
296
149
  let markdown = if options.wrap {
297
150
  crate::wrapper::wrap_markdown(&markdown, &options)
@@ -299,146 +152,67 @@ pub fn convert_with_metadata(
299
152
  markdown
300
153
  };
301
154
 
302
- let metadata_collector = Rc::try_unwrap(metadata_collector)
303
- .map_err(|_| ConversionError::Other("failed to recover metadata state".to_string()))?
304
- .into_inner();
305
- let metadata = metadata_collector.finish();
155
+ // Collect metadata if extracted.
156
+ #[cfg(feature = "metadata")]
157
+ let metadata = if let Some(collector) = metadata_collector {
158
+ Rc::try_unwrap(collector)
159
+ .map_err(|_| ConversionError::Other("failed to recover metadata state".to_string()))?
160
+ .into_inner()
161
+ .finish()
162
+ } else {
163
+ HtmlMetadata::default()
164
+ };
165
+
166
+ // Collect inline images if extracted.
167
+ #[cfg(feature = "inline-images")]
168
+ let (images, image_warnings) = if let Some(collector) = image_collector {
169
+ let c = Rc::try_unwrap(collector)
170
+ .map_err(|_| ConversionError::Other("failed to recover inline image state".to_string()))?
171
+ .into_inner();
172
+ c.finish()
173
+ } else {
174
+ (Vec::new(), Vec::new())
175
+ };
306
176
 
307
- Ok((markdown, metadata))
177
+ // Map InlineImageWarnings → ProcessingWarnings.
178
+ #[cfg(feature = "inline-images")]
179
+ let warnings: Vec<crate::types::ProcessingWarning> = image_warnings
180
+ .into_iter()
181
+ .map(|w| crate::types::ProcessingWarning {
182
+ kind: crate::types::WarningKind::ImageExtractionFailed,
183
+ message: w.message,
184
+ })
185
+ .collect();
186
+ #[cfg(not(feature = "inline-images"))]
187
+ let warnings: Vec<crate::types::ProcessingWarning> = Vec::new();
188
+
189
+ let _ = wants_metadata;
190
+ let _ = wants_images;
191
+
192
+ Ok(ConversionResult {
193
+ content: Some(markdown),
194
+ document,
195
+ #[cfg(feature = "metadata")]
196
+ metadata,
197
+ tables: Vec::new(),
198
+ #[cfg(feature = "inline-images")]
199
+ images,
200
+ warnings,
201
+ })
308
202
  }
309
203
 
310
- /// Convert HTML to Markdown with a custom visitor callback.
311
- ///
312
- /// This function allows you to provide a visitor implementation that can inspect,
313
- /// modify, or replace the default conversion behavior for any HTML element type.
314
- ///
315
- /// # Arguments
316
- ///
317
- /// * `html` - The HTML input to convert
318
- /// * `options` - Optional conversion options (uses defaults if None)
319
- /// * `visitor` - Mutable reference to visitor implementation for customization
320
- ///
321
- /// # Example
322
- ///
323
- /// ```ignore
324
- /// use html_to_markdown_rs::convert_with_visitor;
325
- /// use html_to_markdown_rs::visitor::{HtmlVisitor, NodeContext, VisitResult};
326
- ///
327
- /// #[derive(Debug)]
328
- /// struct CustomVisitor;
329
- ///
330
- /// impl HtmlVisitor for CustomVisitor {
331
- /// fn visit_code_block(
332
- /// &mut self,
333
- /// _ctx: &NodeContext,
334
- /// language: Option<&str>,
335
- /// code: &str,
336
- /// ) -> VisitResult {
337
- /// VisitResult::Custom(format!("```{}\n{}\n```", language.unwrap_or(""), code))
338
- /// }
339
- /// }
340
- ///
341
- /// let html = "<pre><code class=\"language-rust\">fn main() {}</code></pre>";
342
- /// let mut visitor = CustomVisitor;
343
- /// let markdown = convert_with_visitor(html, None, &mut visitor).unwrap();
344
- /// ```
204
+ /// Internal: convert with visitor support. Used by FFI crate.
205
+ /// Will be removed when convert() accepts visitor parameter directly.
345
206
  #[cfg(feature = "visitor")]
346
- /// # Errors
347
- ///
348
- /// Returns an error if HTML parsing fails or if the input contains invalid UTF-8.
207
+ #[doc(hidden)]
349
208
  pub fn convert_with_visitor(
350
209
  html: &str,
351
210
  options: Option<ConversionOptions>,
352
- visitor: Option<visitor::VisitorHandle>,
211
+ visitor: Option<crate::visitor::VisitorHandle>,
353
212
  ) -> Result<String> {
354
213
  let options = options.unwrap_or_default();
355
-
356
214
  let normalized_html = normalize_input(html)?;
357
-
358
215
  let markdown = crate::converter::convert_html_with_visitor(normalized_html.as_ref(), &options, visitor)?;
359
-
360
- if options.wrap {
361
- Ok(crate::wrapper::wrap_markdown(&markdown, &options))
362
- } else {
363
- Ok(markdown)
364
- }
365
- }
366
-
367
- #[cfg(feature = "async-visitor")]
368
- /// Convert HTML to Markdown with an async visitor callback.
369
- ///
370
- /// This async function allows you to provide an async visitor implementation that can inspect,
371
- /// modify, or replace the default conversion behavior for any HTML element type.
372
- ///
373
- /// This function is useful for:
374
- /// - Python async functions (with `async def` and `asyncio`)
375
- /// - TypeScript/JavaScript async functions (with `Promise`-based callbacks)
376
- /// - Elixir processes (with message-passing async operations)
377
- ///
378
- /// For synchronous languages (Ruby, PHP, Go, Java, C#), use `convert_with_visitor` instead.
379
- ///
380
- /// # Note
381
- ///
382
- /// The async visitor trait (`AsyncHtmlVisitor`) and async dispatch helpers are designed to be
383
- /// consumed by language bindings (`PyO3`, NAPI-RS, Magnus, etc.) which can bridge async/await
384
- /// semantics from their host languages. The conversion pipeline wraps async visitor calls using
385
- /// tokio's runtime to support both multi-threaded and current_thread runtimes (like NAPI's).
386
- ///
387
- /// Binding implementations will be responsible for running async callbacks on appropriate
388
- /// event loops (asyncio for Python, Promise chains for TypeScript, etc.).
389
- ///
390
- /// # Arguments
391
- ///
392
- /// * `html` - The HTML input to convert
393
- /// * `options` - Optional conversion options (uses defaults if None)
394
- /// * `visitor` - Optional async visitor implementing `AsyncHtmlVisitor` trait for customization
395
- ///
396
- /// # Example (Rust-like async)
397
- ///
398
- /// ```ignore
399
- /// use html_to_markdown_rs::convert_with_async_visitor;
400
- /// use html_to_markdown_rs::visitor::{AsyncHtmlVisitor, NodeContext, VisitResult};
401
- /// use async_trait::async_trait;
402
- /// use std::rc::Rc;
403
- /// use std::cell::RefCell;
404
- ///
405
- /// #[derive(Debug)]
406
- /// struct CustomAsyncVisitor;
407
- ///
408
- /// #[async_trait]
409
- /// impl AsyncHtmlVisitor for CustomAsyncVisitor {
410
- /// async fn visit_code_block(
411
- /// &mut self,
412
- /// _ctx: &NodeContext,
413
- /// language: Option<&str>,
414
- /// code: &str,
415
- /// ) -> VisitResult {
416
- /// // Can perform async operations here (e.g., syntax highlighting via service)
417
- /// VisitResult::Custom(format!("```{}\n{}\n```", language.unwrap_or(""), code))
418
- /// }
419
- /// }
420
- ///
421
- /// let html = "<pre><code class=\"language-rust\">fn main() {}</code></pre>";
422
- /// let visitor = Some(Rc::new(RefCell::new(CustomAsyncVisitor) as _));
423
- /// let markdown = convert_with_async_visitor(html, None, visitor).await.unwrap();
424
- /// ```
425
- #[allow(clippy::future_not_send)]
426
- /// # Errors
427
- ///
428
- /// Returns an error if HTML parsing fails or if the input contains invalid UTF-8.
429
- pub async fn convert_with_async_visitor(
430
- html: &str,
431
- options: Option<ConversionOptions>,
432
- visitor: Option<visitor_helpers::AsyncVisitorHandle>,
433
- ) -> Result<String> {
434
- let options = options.unwrap_or_default();
435
-
436
- let normalized_html = normalize_input(html)?;
437
-
438
- // Use the async implementation that properly awaits visitor callbacks
439
- let markdown =
440
- crate::converter::convert_html_with_visitor_async(normalized_html.as_ref(), &options, visitor).await?;
441
-
442
216
  if options.wrap {
443
217
  Ok(crate::wrapper::wrap_markdown(&markdown, &options))
444
218
  } else {
@@ -681,371 +455,3 @@ pub fn metadata_config_from_json(json: &str) -> Result<MetadataConfig> {
681
455
  let update: crate::MetadataConfigUpdate = parse_json(json)?;
682
456
  Ok(MetadataConfig::from(update))
683
457
  }
684
-
685
- // ============================================================================
686
- // Table Extraction API (requires visitor feature)
687
- // ============================================================================
688
-
689
- /// Extracted table data from HTML conversion.
690
- ///
691
- /// Each instance represents a single `<table>` element found during conversion.
692
- /// Tables are collected in document order.
693
- #[cfg(feature = "visitor")]
694
- #[derive(Debug, Clone)]
695
- #[cfg_attr(
696
- any(feature = "serde", feature = "metadata"),
697
- derive(serde::Serialize, serde::Deserialize)
698
- )]
699
- pub struct TableData {
700
- /// Table cells organized as rows x columns. Cell contents are already
701
- /// converted to the target output format (markdown/djot/plain).
702
- pub cells: Vec<Vec<String>>,
703
- /// Complete rendered table in the target output format.
704
- pub markdown: String,
705
- /// Per-row flag indicating whether the row was inside `<thead>`.
706
- pub is_header_row: Vec<bool>,
707
- }
708
-
709
- /// Result of HTML-to-markdown conversion with extracted table data.
710
- #[cfg(feature = "visitor")]
711
- #[derive(Debug, Clone)]
712
- #[cfg_attr(
713
- any(feature = "serde", feature = "metadata"),
714
- derive(serde::Serialize, serde::Deserialize)
715
- )]
716
- pub struct ConversionWithTables {
717
- /// Converted markdown/djot/plain text content.
718
- pub content: String,
719
- /// Extended metadata (if metadata extraction was requested).
720
- #[cfg(feature = "metadata")]
721
- pub metadata: Option<ExtendedMetadata>,
722
- /// All tables found in the HTML, in document order.
723
- pub tables: Vec<TableData>,
724
- }
725
-
726
- #[cfg(feature = "visitor")]
727
- #[derive(Debug)]
728
- struct TableCollector {
729
- tables: Vec<TableData>,
730
- current_rows: Vec<Vec<String>>,
731
- current_is_header: Vec<bool>,
732
- }
733
-
734
- #[cfg(feature = "visitor")]
735
- impl TableCollector {
736
- fn new() -> Self {
737
- Self {
738
- tables: Vec::new(),
739
- current_rows: Vec::new(),
740
- current_is_header: Vec::new(),
741
- }
742
- }
743
- }
744
-
745
- #[cfg(feature = "visitor")]
746
- impl visitor::HtmlVisitor for TableCollector {
747
- fn visit_table_start(&mut self, _ctx: &visitor::NodeContext) -> visitor::VisitResult {
748
- self.current_rows.clear();
749
- self.current_is_header.clear();
750
- visitor::VisitResult::Continue
751
- }
752
-
753
- fn visit_table_row(
754
- &mut self,
755
- _ctx: &visitor::NodeContext,
756
- cells: &[String],
757
- is_header: bool,
758
- ) -> visitor::VisitResult {
759
- self.current_rows.push(cells.to_vec());
760
- self.current_is_header.push(is_header);
761
- visitor::VisitResult::Continue
762
- }
763
-
764
- fn visit_table_end(&mut self, _ctx: &visitor::NodeContext, output: &str) -> visitor::VisitResult {
765
- if !self.current_rows.is_empty() {
766
- self.tables.push(TableData {
767
- cells: std::mem::take(&mut self.current_rows),
768
- markdown: output.to_string(),
769
- is_header_row: std::mem::take(&mut self.current_is_header),
770
- });
771
- }
772
- visitor::VisitResult::Continue
773
- }
774
- }
775
-
776
- /// Convert HTML to markdown/djot/plain text with structured table extraction.
777
- ///
778
- /// Combines conversion, optional metadata extraction, and table data collection
779
- /// in a single DOM walk. Each table found in the HTML is returned with its
780
- /// cell contents (already converted to the target format) and rendered output.
781
- ///
782
- /// # Arguments
783
- ///
784
- /// * `html` - The HTML string to convert
785
- /// * `options` - Optional conversion options (defaults to `ConversionOptions::default()`)
786
- /// * `metadata_cfg` - Optional metadata extraction configuration (requires `metadata` feature)
787
- ///
788
- /// # Example
789
- ///
790
- /// ```ignore
791
- /// use html_to_markdown_rs::convert_with_tables;
792
- ///
793
- /// let html = r#"<table><tr><th>Name</th><th>Age</th></tr><tr><td>Alice</td><td>30</td></tr></table>"#;
794
- /// let result = convert_with_tables(html, None, None).unwrap();
795
- /// assert_eq!(result.tables.len(), 1);
796
- /// assert_eq!(result.tables[0].cells[0], vec!["Name", "Age"]);
797
- /// ```
798
- ///
799
- /// # Errors
800
- ///
801
- /// Returns an error if HTML parsing fails or if the input contains invalid UTF-8.
802
- #[cfg(feature = "visitor")]
803
- pub fn convert_with_tables(
804
- html: &str,
805
- options: Option<ConversionOptions>,
806
- #[cfg(feature = "metadata")] metadata_cfg: Option<MetadataConfig>,
807
- #[cfg(not(feature = "metadata"))] _metadata_cfg: Option<()>,
808
- ) -> Result<ConversionWithTables> {
809
- use std::cell::RefCell;
810
- use std::rc::Rc;
811
-
812
- let collector = Rc::new(RefCell::new(TableCollector::new()));
813
- let visitor_handle: visitor::VisitorHandle = Rc::clone(&collector) as visitor::VisitorHandle;
814
-
815
- #[cfg(feature = "metadata")]
816
- let result = {
817
- let metadata_config = metadata_cfg.unwrap_or_default();
818
- let (content, metadata) = convert_with_metadata(html, options, metadata_config, Some(visitor_handle))?;
819
- let tables = Rc::try_unwrap(collector)
820
- .map_err(|_| ConversionError::Other("failed to recover table collector state".into()))?
821
- .into_inner()
822
- .tables;
823
- ConversionWithTables {
824
- content,
825
- metadata: Some(metadata),
826
- tables,
827
- }
828
- };
829
-
830
- #[cfg(not(feature = "metadata"))]
831
- let result = {
832
- let content = convert_with_visitor(html, options, Some(visitor_handle))?;
833
- let tables = Rc::try_unwrap(collector)
834
- .map_err(|_| ConversionError::Other("failed to recover table collector state".into()))?
835
- .into_inner()
836
- .tables;
837
- ConversionWithTables { content, tables }
838
- };
839
-
840
- Ok(result)
841
- }
842
-
843
- #[cfg(test)]
844
- #[cfg(feature = "visitor")]
845
- mod table_extraction_tests {
846
- use super::*;
847
-
848
- fn tables_from_html(html: &str) -> ConversionWithTables {
849
- convert_with_tables(
850
- html,
851
- None,
852
- #[cfg(feature = "metadata")]
853
- None,
854
- #[cfg(not(feature = "metadata"))]
855
- None,
856
- )
857
- .unwrap()
858
- }
859
-
860
- #[test]
861
- fn test_convert_with_tables_basic() {
862
- let html = r"<table><tr><th>Name</th><th>Age</th></tr><tr><td>Alice</td><td>30</td></tr></table>";
863
- let result = tables_from_html(html);
864
- assert_eq!(result.tables.len(), 1);
865
- assert_eq!(result.tables[0].cells.len(), 2);
866
- assert_eq!(result.tables[0].cells[0], vec!["Name", "Age"]);
867
- assert_eq!(result.tables[0].cells[1], vec!["Alice", "30"]);
868
- assert!(result.tables[0].is_header_row[0]);
869
- assert!(!result.tables[0].is_header_row[1]);
870
- assert!(result.tables[0].markdown.contains('|'));
871
- }
872
-
873
- #[test]
874
- fn test_convert_with_tables_nested() {
875
- let html = r"
876
- <table>
877
- <tr><th>Category</th><th>Details</th></tr>
878
- <tr>
879
- <td>Project Alpha</td>
880
- <td>
881
- <table>
882
- <tr><th>Task</th><th>Status</th></tr>
883
- <tr><td>001</td><td>Done</td></tr>
884
- </table>
885
- </td>
886
- </tr>
887
- </table>";
888
- let result = tables_from_html(html);
889
- assert!(
890
- result.tables.len() >= 2,
891
- "Expected at least 2 tables (outer + nested), got {}",
892
- result.tables.len()
893
- );
894
- }
895
-
896
- #[test]
897
- fn test_convert_with_tables_no_tables() {
898
- let html = "<p>No tables here</p>";
899
- let result = tables_from_html(html);
900
- assert!(result.tables.is_empty());
901
- assert!(result.content.contains("No tables here"));
902
- }
903
-
904
- #[test]
905
- fn test_convert_with_tables_empty_table() {
906
- let result = tables_from_html("<table></table>");
907
- assert!(result.tables.is_empty(), "Empty table should not produce TableData");
908
- }
909
-
910
- #[test]
911
- fn test_convert_with_tables_headers_only() {
912
- let html = r"<table><thead><tr><th>A</th><th>B</th></tr></thead></table>";
913
- let result = tables_from_html(html);
914
- assert_eq!(result.tables.len(), 1);
915
- assert!(result.tables[0].is_header_row[0]);
916
- assert_eq!(result.tables[0].cells[0], vec!["A", "B"]);
917
- }
918
-
919
- #[test]
920
- fn test_convert_with_tables_thead_tbody_tfoot() {
921
- let html = r"
922
- <table>
923
- <thead><tr><th>H1</th></tr></thead>
924
- <tbody><tr><td>B1</td></tr></tbody>
925
- <tfoot><tr><td>F1</td></tr></tfoot>
926
- </table>";
927
- let result = tables_from_html(html);
928
- assert_eq!(result.tables.len(), 1);
929
- let t = &result.tables[0];
930
- assert!(t.is_header_row[0], "thead row should be header");
931
- assert!(!t.is_header_row[1], "tbody row should not be header");
932
- assert_eq!(t.cells[0], vec!["H1"]);
933
- assert_eq!(t.cells[1], vec!["B1"]);
934
- }
935
-
936
- #[test]
937
- fn test_convert_with_tables_multiple_separate() {
938
- let html = r"
939
- <table><tr><td>T1</td></tr></table>
940
- <p>Between tables</p>
941
- <table><tr><td>T2</td></tr></table>";
942
- let result = tables_from_html(html);
943
- assert_eq!(result.tables.len(), 2, "Should find 2 separate tables");
944
- }
945
-
946
- #[test]
947
- fn test_convert_with_tables_special_chars() {
948
- let html = r"<table><tr><td>a | b</td><td>c*d</td></tr></table>";
949
- let result = tables_from_html(html);
950
- assert_eq!(result.tables.len(), 1);
951
- assert!(!result.tables[0].cells[0].is_empty());
952
- }
953
-
954
- #[test]
955
- fn test_convert_with_tables_single_cell() {
956
- let html = r"<table><tr><td>Only cell</td></tr></table>";
957
- let result = tables_from_html(html);
958
- assert_eq!(result.tables.len(), 1);
959
- assert_eq!(result.tables[0].cells.len(), 1);
960
- assert_eq!(result.tables[0].cells[0], vec!["Only cell"]);
961
- }
962
-
963
- #[test]
964
- fn test_convert_with_tables_content_preserved() {
965
- let html = r"<p>Before</p><table><tr><td>Cell</td></tr></table><p>After</p>";
966
- let result = tables_from_html(html);
967
- assert!(result.content.contains("Before"));
968
- assert!(result.content.contains("After"));
969
- assert!(result.content.contains('|'), "Markdown table should appear in content");
970
- }
971
-
972
- #[test]
973
- fn test_convert_with_tables_with_options() {
974
- let options = ConversionOptions {
975
- heading_style: crate::options::HeadingStyle::Underlined,
976
- ..ConversionOptions::default()
977
- };
978
- let html = r"<h1>Title</h1><table><tr><td>Cell</td></tr></table>";
979
- let result = convert_with_tables(
980
- html,
981
- Some(options),
982
- #[cfg(feature = "metadata")]
983
- None,
984
- #[cfg(not(feature = "metadata"))]
985
- None,
986
- )
987
- .unwrap();
988
- assert_eq!(result.tables.len(), 1);
989
- assert!(result.content.contains("Title"));
990
- }
991
-
992
- #[test]
993
- fn test_convert_with_tables_plain_text_format() {
994
- let options = ConversionOptions {
995
- output_format: crate::options::OutputFormat::Plain,
996
- ..ConversionOptions::default()
997
- };
998
- let html = r"<table><tr><th>Name</th></tr><tr><td>Alice</td></tr></table>";
999
- let result = convert_with_tables(
1000
- html,
1001
- Some(options),
1002
- #[cfg(feature = "metadata")]
1003
- None,
1004
- #[cfg(not(feature = "metadata"))]
1005
- None,
1006
- )
1007
- .unwrap();
1008
- assert!(
1009
- !result.tables.is_empty(),
1010
- "Tables should be populated even with plain text output format"
1011
- );
1012
- assert_eq!(result.tables[0].cells[0], vec!["Name"]);
1013
- }
1014
-
1015
- #[cfg(feature = "metadata")]
1016
- #[test]
1017
- fn test_convert_with_tables_metadata_integration() {
1018
- let html = r#"<html lang="en"><head><title>Test</title></head><body>
1019
- <table><tr><th>Col</th></tr><tr><td>Val</td></tr></table>
1020
- </body></html>"#;
1021
- let config = MetadataConfig::default();
1022
- let result = convert_with_tables(html, None, Some(config)).unwrap();
1023
- assert_eq!(result.tables.len(), 1);
1024
- let meta = result.metadata.as_ref().expect("metadata should be present");
1025
- assert_eq!(meta.document.language, Some("en".to_string()));
1026
- }
1027
-
1028
- #[cfg(feature = "metadata")]
1029
- #[test]
1030
- fn test_convert_with_tables_plain_text_metadata() {
1031
- let options = ConversionOptions {
1032
- output_format: crate::options::OutputFormat::Plain,
1033
- ..ConversionOptions::default()
1034
- };
1035
- let html = r#"<html lang="fr"><body>
1036
- <table><tr><td>Cell</td></tr></table>
1037
- </body></html>"#;
1038
- let config = MetadataConfig::default();
1039
- let result = convert_with_tables(html, Some(options), Some(config)).unwrap();
1040
- assert!(
1041
- !result.tables.is_empty(),
1042
- "Tables should be populated in plain text mode"
1043
- );
1044
- let meta = result.metadata.as_ref().expect("metadata should be present");
1045
- assert_eq!(
1046
- meta.document.language,
1047
- Some("fr".to_string()),
1048
- "Metadata should be populated in plain text mode"
1049
- );
1050
- }
1051
- }