html-to-markdown 2.30.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +4 -14
  3. data/README.md +37 -50
  4. data/ext/html-to-markdown-rb/native/Cargo.lock +13 -701
  5. data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
  6. data/ext/html-to-markdown-rb/native/README.md +4 -13
  7. data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
  8. data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
  9. data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
  10. data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
  11. data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
  12. data/lib/html_to_markdown/version.rb +1 -1
  13. data/lib/html_to_markdown.rb +13 -194
  14. data/sig/html_to_markdown.rbs +12 -373
  15. data/vendor/Cargo.toml +5 -2
  16. data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
  17. data/vendor/html-to-markdown-rs/README.md +126 -52
  18. data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
  19. data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
  20. data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
  21. data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
  22. data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
  23. data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
  24. data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
  25. data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
  26. data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
  27. data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
  28. data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
  29. data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
  30. data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
  31. data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
  32. data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
  33. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
  34. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
  35. data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
  36. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
  37. data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
  38. data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
  39. data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
  40. data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
  41. data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
  42. data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
  43. data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
  44. data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
  45. data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
  46. data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
  47. data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
  48. data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
  49. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
  50. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
  51. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
  52. data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
  53. data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
  54. data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
  55. data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
  56. data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -68
  57. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
  58. data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
  59. data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
  60. data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
  61. data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
  62. data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
  63. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
  64. data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
  65. data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
  66. data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
  67. data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
  68. data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
  69. data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
  70. data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
  71. data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
  72. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
  73. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
  74. data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
  75. data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
  76. data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
  77. data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
  78. data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
  79. data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
  80. data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
  81. data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
  82. data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -323
  83. data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
  84. data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
  85. data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
  86. data/vendor/html-to-markdown-rs/src/text.rs +25 -14
  87. data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
  88. data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
  89. data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
  90. data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
  91. data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
  92. data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
  93. data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
  94. data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
  95. data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
  96. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
  97. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
  98. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
  99. data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
  100. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
  101. data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
  102. data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
  103. data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
  104. data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
  105. data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
  106. data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
  107. data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
  108. data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
  109. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
  110. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
  111. data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
  112. data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
  113. data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
  114. data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
  115. data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
  116. data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
  117. data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
  118. data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
  119. data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
  120. data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
  121. data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
  122. data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
  123. data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
  124. data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
  125. data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
  126. data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
  127. data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
  128. data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
  129. data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
  130. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
  131. metadata +9 -37
  132. data/bin/benchmark.rb +0 -232
  133. data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
  134. data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
  135. data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
  136. data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
  137. data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
  138. data/spec/convert_spec.rb +0 -77
  139. data/spec/convert_with_tables_spec.rb +0 -194
  140. data/spec/metadata_extraction_spec.rb +0 -437
  141. data/spec/visitor_issue_187_spec.rb +0 -605
  142. data/spec/visitor_spec.rb +0 -1149
  143. data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
  144. data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
  145. data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
  146. data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
  147. data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
  148. data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
  149. data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
  150. data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
  151. data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
  152. data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -42
  153. data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
  154. data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
  155. data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
  156. data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
  157. data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
  158. data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
  159. data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
  160. data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
  161. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
  162. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
  163. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
  164. data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
  165. data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
  166. data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
@@ -33,7 +33,6 @@
33
33
  clippy::assigning_clones,
34
34
  clippy::uninlined_format_args
35
35
  )]
36
- #![allow(dead_code)]
37
36
 
38
37
  //! High-performance HTML to Markdown converter.
39
38
  //!
@@ -50,7 +49,6 @@
50
49
 
51
50
  pub mod converter;
52
51
  pub mod error;
53
- pub mod hocr;
54
52
  #[cfg(feature = "inline-images")]
55
53
  mod inline_images;
56
54
  #[cfg(feature = "metadata")]
@@ -58,6 +56,7 @@ pub mod metadata;
58
56
  pub mod options;
59
57
  pub mod safety;
60
58
  pub mod text;
59
+ pub mod types;
61
60
  #[cfg(feature = "visitor")]
62
61
  pub mod visitor;
63
62
  #[cfg(feature = "visitor")]
@@ -76,6 +75,11 @@ mod validation;
76
75
  // ============================================================================
77
76
 
78
77
  pub use exports::*;
78
+ pub use types::{
79
+ AnnotationKind, ConversionResult, DocumentNode, DocumentStructure, GridCell, NodeContent, ProcessingWarning,
80
+ TableGrid, TextAnnotation, WarningKind,
81
+ };
82
+ // Note: types::TableData will replace convert_api::TableData when convert() is refactored
79
83
 
80
84
  // ============================================================================
81
85
  // Main Public API Functions
@@ -90,141 +94,15 @@ pub use convert_api::{conversion_options_from_json, conversion_options_update_fr
90
94
  pub use convert_api::metadata_config_from_json;
91
95
 
92
96
  #[cfg(feature = "inline-images")]
93
- pub use convert_api::{convert_with_inline_images, inline_image_config_from_json};
94
-
95
- #[cfg(feature = "metadata")]
96
- pub use convert_api::convert_with_metadata;
97
+ pub use convert_api::inline_image_config_from_json;
97
98
 
98
99
  #[cfg(feature = "visitor")]
100
+ #[doc(hidden)]
99
101
  pub use convert_api::convert_with_visitor;
100
102
 
101
- #[cfg(feature = "visitor")]
102
- pub use convert_api::{ConversionWithTables, TableData, convert_with_tables};
103
-
104
- #[cfg(feature = "async-visitor")]
105
- pub use convert_api::convert_with_async_visitor;
106
-
107
103
  // Tests
108
104
  // ============================================================================
109
105
 
110
- #[cfg(all(test, feature = "metadata"))]
111
- mod tests {
112
- use super::*;
113
-
114
- #[test]
115
- fn test_convert_with_metadata_full_workflow() {
116
- let html = "<html lang=\"en\" dir=\"ltr\"><head><title>Test Article</title></head><body><h1 id=\"main-title\">Main Title</h1><p>This is a paragraph with a <a href=\"https://example.com\">link</a>.</p><h2>Subsection</h2><p>Another paragraph with <a href=\"#main-title\">internal link</a>.</p><img src=\"https://example.com/image.jpg\" alt=\"Test image\" title=\"Image title\"></body></html>";
117
-
118
- let config = MetadataConfig {
119
- extract_document: true,
120
- extract_headers: true,
121
- extract_links: true,
122
- extract_images: true,
123
- extract_structured_data: true,
124
- max_structured_data_size: metadata::DEFAULT_MAX_STRUCTURED_DATA_SIZE,
125
- };
126
-
127
- let (markdown, metadata) = convert_with_metadata(html, None, config, None).expect("conversion should succeed");
128
-
129
- assert!(!markdown.is_empty());
130
- assert!(markdown.contains("Main Title"));
131
- assert!(markdown.contains("Subsection"));
132
-
133
- assert_eq!(metadata.document.language, Some("en".to_string()));
134
-
135
- assert_eq!(metadata.headers.len(), 2);
136
- assert_eq!(metadata.headers[0].level, 1);
137
- assert_eq!(metadata.headers[0].text, "Main Title");
138
- assert_eq!(metadata.headers[0].id, Some("main-title".to_string()));
139
- assert_eq!(metadata.headers[1].level, 2);
140
- assert_eq!(metadata.headers[1].text, "Subsection");
141
-
142
- assert!(metadata.links.len() >= 2);
143
- let external_link = metadata.links.iter().find(|l| l.link_type == LinkType::External);
144
- assert!(external_link.is_some());
145
- let anchor_link = metadata.links.iter().find(|l| l.link_type == LinkType::Anchor);
146
- assert!(anchor_link.is_some());
147
-
148
- assert_eq!(metadata.images.len(), 1);
149
- assert_eq!(metadata.images[0].alt, Some("Test image".to_string()));
150
- assert_eq!(metadata.images[0].title, Some("Image title".to_string()));
151
- assert_eq!(metadata.images[0].image_type, ImageType::External);
152
- }
153
-
154
- #[test]
155
- fn test_convert_with_metadata_document_fields() {
156
- let html = "<html lang=\"en\"><head><title>Test Article</title><meta name=\"description\" content=\"Desc\"><meta name=\"author\" content=\"Author\"><meta property=\"og:title\" content=\"OG Title\"><meta property=\"og:description\" content=\"OG Desc\"></head><body><h1>Heading</h1></body></html>";
157
-
158
- let (_markdown, metadata) =
159
- convert_with_metadata(html, None, MetadataConfig::default(), None).expect("conversion should succeed");
160
-
161
- assert_eq!(
162
- metadata.document.title,
163
- Some("Test Article".to_string()),
164
- "document: {:?}",
165
- metadata.document
166
- );
167
- assert_eq!(metadata.document.description, Some("Desc".to_string()));
168
- assert_eq!(metadata.document.author, Some("Author".to_string()));
169
- assert_eq!(metadata.document.language, Some("en".to_string()));
170
- assert_eq!(metadata.document.open_graph.get("title"), Some(&"OG Title".to_string()));
171
- assert_eq!(
172
- metadata.document.open_graph.get("description"),
173
- Some(&"OG Desc".to_string())
174
- );
175
- }
176
-
177
- #[test]
178
- fn test_convert_with_metadata_empty_config() {
179
- let html = "<html lang=\"en\"><head><title>Test</title></head><body><h1>Title</h1><a href=\"#\">Link</a></body></html>";
180
-
181
- let config = MetadataConfig {
182
- extract_document: false,
183
- extract_headers: false,
184
- extract_links: false,
185
- extract_images: false,
186
- extract_structured_data: false,
187
- max_structured_data_size: 0,
188
- };
189
-
190
- let (_markdown, metadata) = convert_with_metadata(html, None, config, None).expect("conversion should succeed");
191
-
192
- assert!(metadata.headers.is_empty());
193
- assert!(metadata.links.is_empty());
194
- assert!(metadata.images.is_empty());
195
- assert_eq!(metadata.document.language, None);
196
- }
197
-
198
- #[test]
199
- fn test_convert_with_metadata_data_uri_image() {
200
- let html = "<html><body><img src=\"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==\" alt=\"Pixel\"></body></html>";
201
-
202
- let config = MetadataConfig::default();
203
-
204
- let (_markdown, metadata) = convert_with_metadata(html, None, config, None).expect("conversion should succeed");
205
-
206
- assert_eq!(metadata.images.len(), 1);
207
- assert_eq!(metadata.images[0].image_type, ImageType::DataUri);
208
- assert_eq!(metadata.images[0].alt, Some("Pixel".to_string()));
209
- }
210
-
211
- #[test]
212
- fn test_convert_with_metadata_relative_paths() {
213
- let html = r#"<html><body><a href="/page">Internal</a><a href="../other">Relative</a></body></html>"#;
214
-
215
- let config = MetadataConfig::default();
216
-
217
- let (_markdown, metadata) = convert_with_metadata(html, None, config, None).expect("conversion should succeed");
218
-
219
- let internal_link_count = metadata
220
- .links
221
- .iter()
222
- .filter(|l| l.link_type == LinkType::Internal)
223
- .count();
224
- assert_eq!(internal_link_count, 2);
225
- }
226
- }
227
-
228
106
  #[cfg(test)]
229
107
  mod basic_tests {
230
108
  use super::*;
@@ -253,7 +131,8 @@ mod basic_tests {
253
131
  #[test]
254
132
  fn test_plain_text_allowed() {
255
133
  let result = convert("Just text", None).unwrap();
256
- assert!(result.contains("Just text"));
134
+ let content = result.content.unwrap_or_default();
135
+ assert!(content.contains("Just text"));
257
136
  }
258
137
 
259
138
  #[test]
@@ -264,7 +143,8 @@ mod basic_tests {
264
143
  ..ConversionOptions::default()
265
144
  };
266
145
  let result = convert("Text *asterisks* _underscores_", Some(options)).unwrap();
267
- assert!(result.contains(r"\*asterisks\*"));
268
- assert!(result.contains(r"\_underscores\_"));
146
+ let content = result.content.unwrap_or_default();
147
+ assert!(content.contains(r"\*asterisks\*"));
148
+ assert!(content.contains(r"\_underscores\_"));
269
149
  }
270
150
  }
@@ -2,7 +2,7 @@
2
2
 
3
3
  use super::config::MetadataConfig;
4
4
  use super::extraction::{extract_document_metadata, extract_structured_data};
5
- use super::types::{ExtendedMetadata, ImageMetadata, ImageType, LinkMetadata};
5
+ use super::types::{HtmlMetadata, ImageMetadata, ImageType, LinkMetadata};
6
6
  use std::collections::BTreeMap;
7
7
 
8
8
  /// Internal metadata collector for single-pass extraction.
@@ -256,13 +256,13 @@ impl MetadataCollector {
256
256
  /// Finish collection and return all extracted metadata.
257
257
  ///
258
258
  /// Performs final processing, validation, and consolidation of all
259
- /// collected data into the [`ExtendedMetadata`] output structure.
259
+ /// collected data into the [`HtmlMetadata`] output structure.
260
260
  #[allow(dead_code)]
261
- pub(crate) fn finish(self) -> ExtendedMetadata {
261
+ pub(crate) fn finish(self) -> HtmlMetadata {
262
262
  let structured_data = extract_structured_data(self.json_ld);
263
263
  let document = extract_document_metadata(self.head_metadata, self.lang, self.dir);
264
264
 
265
- ExtendedMetadata {
265
+ HtmlMetadata {
266
266
  document,
267
267
  headers: self.headers,
268
268
  links: self.links,
@@ -40,14 +40,14 @@
40
40
  //! - [`ImageMetadata`]: Image element with src, alt, title, dimensions, type, and attributes
41
41
  //! - [`StructuredData`]: Structured data block with type and raw JSON
42
42
  //! - [`MetadataConfig`]: Configuration controlling extraction granularity and size limits
43
- //! - [`ExtendedMetadata`]: Top-level result containing all extracted metadata
43
+ //! - [`HtmlMetadata`]: Top-level result containing all extracted metadata
44
44
  //!
45
45
  //! # Examples
46
46
  //!
47
- //! ## Basic Usage with `convert_with_metadata`
47
+ //! ## Basic Usage with `convert()`
48
48
  //!
49
49
  //! ```ignore
50
- //! use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
50
+ //! use html_to_markdown_rs::convert;
51
51
  //!
52
52
  //! let html = r#"
53
53
  //! <html lang="en">
@@ -63,8 +63,8 @@
63
63
  //! </html>
64
64
  //! "#;
65
65
  //!
66
- //! let config = MetadataConfig::default();
67
- //! let (markdown, metadata) = convert_with_metadata(html, None, config)?;
66
+ //! let result = convert(html, None)?;
67
+ //! let metadata = result.metadata.unwrap();
68
68
  //!
69
69
  //! // Access document metadata
70
70
  //! assert_eq!(metadata.document.title, Some("My Article".to_string()));
@@ -88,28 +88,26 @@
88
88
  //! ## Selective Extraction
89
89
  //!
90
90
  //! ```ignore
91
- //! use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
91
+ //! use html_to_markdown_rs::{convert, ConversionOptions};
92
92
  //!
93
- //! let config = MetadataConfig {
94
- //! extract_headers: true,
95
- //! extract_links: true,
96
- //! extract_images: false, // Skip images
97
- //! extract_structured_data: false, // Skip structured data
98
- //! max_structured_data_size: 0,
93
+ //! let options = ConversionOptions {
94
+ //! extract_metadata: false, // Disable metadata extraction
95
+ //! ..Default::default()
99
96
  //! };
100
97
  //!
101
- //! let (markdown, metadata) = convert_with_metadata(html, None, config)?;
102
- //! assert_eq!(metadata.images.len(), 0); // Images not extracted
98
+ //! let result = convert(html, Some(options))?;
99
+ //! assert!(result.metadata.is_none()); // Metadata not extracted
103
100
  //! # Ok::<(), html_to_markdown_rs::ConversionError>(())
104
101
  //! ```
105
102
  //!
106
103
  //! ## Analyzing Link Types
107
104
  //!
108
105
  //! ```ignore
109
- //! use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
106
+ //! use html_to_markdown_rs::convert;
110
107
  //! use html_to_markdown_rs::metadata::LinkType;
111
108
  //!
112
- //! let (_markdown, metadata) = convert_with_metadata(html, None, MetadataConfig::default())?;
109
+ //! let result = convert(html, None)?;
110
+ //! let metadata = result.metadata.unwrap();
113
111
  //!
114
112
  //! for link in &metadata.links {
115
113
  //! match link.link_type {
@@ -129,11 +127,13 @@
129
127
  //! This enables easy export to JSON, YAML, or other formats:
130
128
  //!
131
129
  //! ```ignore
132
- //! use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
130
+ //! use html_to_markdown_rs::convert;
133
131
  //!
134
- //! let (_markdown, metadata) = convert_with_metadata(html, None, MetadataConfig::default())?;
135
- //! let json = serde_json::to_string_pretty(&metadata)?;
136
- //! println!("{}", json);
132
+ //! let result = convert(html, None)?;
133
+ //! if let Some(metadata) = &result.metadata {
134
+ //! let json = serde_json::to_string_pretty(metadata)?;
135
+ //! println!("{}", json);
136
+ //! }
137
137
  //! # Ok::<(), Box<dyn std::error::Error>>(())
138
138
  //! ```
139
139
 
@@ -146,8 +146,8 @@ pub mod types;
146
146
  pub use collector::MetadataCollector;
147
147
  pub use config::{DEFAULT_MAX_STRUCTURED_DATA_SIZE, MetadataConfig, MetadataConfigUpdate};
148
148
  pub use types::{
149
- DocumentMetadata, ExtendedMetadata, HeaderMetadata, ImageMetadata, ImageType, LinkMetadata, LinkType,
150
- StructuredData, StructuredDataType, TextDirection,
149
+ DocumentMetadata, HeaderMetadata, HtmlMetadata, ImageMetadata, ImageType, LinkMetadata, LinkType, StructuredData,
150
+ StructuredDataType, TextDirection,
151
151
  };
152
152
 
153
153
  // Internal handle type for shared mutable access during tree traversal
@@ -446,8 +446,8 @@ pub struct StructuredData {
446
446
  /// # Examples
447
447
  ///
448
448
  /// ```
449
- /// # use html_to_markdown_rs::metadata::ExtendedMetadata;
450
- /// let metadata = ExtendedMetadata {
449
+ /// # use html_to_markdown_rs::metadata::HtmlMetadata;
450
+ /// let metadata = HtmlMetadata {
451
451
  /// document: Default::default(),
452
452
  /// headers: Vec::new(),
453
453
  /// links: Vec::new(),
@@ -459,7 +459,7 @@ pub struct StructuredData {
459
459
  /// ```
460
460
  #[derive(Debug, Clone, Default)]
461
461
  #[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
462
- pub struct ExtendedMetadata {
462
+ pub struct HtmlMetadata {
463
463
  /// Document-level metadata (title, description, canonical, etc.)
464
464
  pub document: DocumentMetadata,
465
465