html-to-markdown 3.0.2 → 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +42 -12
  3. data/Gemfile +1 -0
  4. data/Gemfile.lock +27 -55
  5. data/README.md +9 -10
  6. data/Rakefile +4 -10
  7. data/ext/html-to-markdown_rb/Cargo.toml +14 -0
  8. data/ext/html_to_markdown_rb/Cargo.toml +16 -0
  9. data/ext/html_to_markdown_rb/extconf.rb +10 -0
  10. data/ext/html_to_markdown_rb/src/html_to_markdown_rs/version.rb +6 -0
  11. data/ext/html_to_markdown_rb/src/html_to_markdown_rs.rb +9 -0
  12. data/ext/html_to_markdown_rb/src/lib.rs +3941 -0
  13. data/html-to-markdown-rb.gemspec +1 -1
  14. data/lib/html_to_markdown/version.rb +1 -1
  15. data/lib/html_to_markdown.rb +31 -21
  16. data/{ext/html-to-markdown-rb/native/extconf.rb → lib/html_to_markdown_rs.rb} +1 -1
  17. data/sig/html_to_markdown.rbs +17 -5
  18. data/vendor/Cargo.toml +4 -4
  19. data/vendor/html-to-markdown-rs/Cargo.toml +2 -2
  20. data/vendor/html-to-markdown-rs/examples/test_deser.rs +12 -0
  21. data/vendor/html-to-markdown-rs/src/converter/block/mod.rs +1 -1
  22. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +1 -1
  23. data/vendor/html-to-markdown-rs/src/converter/context.rs +5 -0
  24. data/vendor/html-to-markdown-rs/src/converter/form/mod.rs +1 -1
  25. data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +38 -14
  26. data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +56 -17
  27. data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +11 -0
  28. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +17 -0
  29. data/vendor/html-to-markdown-rs/src/converter/inline/mod.rs +1 -1
  30. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +10 -2
  31. data/vendor/html-to-markdown-rs/src/converter/main.rs +25 -0
  32. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +42 -15
  33. data/vendor/html-to-markdown-rs/src/converter/mod.rs +3 -2
  34. data/vendor/html-to-markdown-rs/src/converter/reference_collector.rs +69 -0
  35. data/vendor/html-to-markdown-rs/src/converter/semantic/mod.rs +1 -1
  36. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +1 -1
  37. data/vendor/html-to-markdown-rs/src/exports.rs +3 -2
  38. data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
  39. data/vendor/html-to-markdown-rs/src/lib.rs +1 -2
  40. data/vendor/html-to-markdown-rs/src/metadata/config.rs +1 -1
  41. data/vendor/html-to-markdown-rs/src/metadata/mod.rs +5 -5
  42. data/vendor/html-to-markdown-rs/src/options/conversion.rs +14 -13
  43. data/vendor/html-to-markdown-rs/src/options/mod.rs +2 -2
  44. data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +3 -9
  45. data/vendor/html-to-markdown-rs/src/options/validation.rs +46 -4
  46. data/vendor/html-to-markdown-rs/src/types/document.rs +11 -0
  47. data/vendor/html-to-markdown-rs/src/types/result.rs +5 -2
  48. data/vendor/html-to-markdown-rs/src/types/tables.rs +1 -1
  49. data/vendor/html-to-markdown-rs/src/visitor/mod.rs +1 -1
  50. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/state.rs +1 -1
  51. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/traversal.rs +1 -1
  52. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +8 -8
  53. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +6 -0
  54. data/vendor/html-to-markdown-rs/tests/integration_test.rs +27 -3
  55. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -2
  56. data/vendor/html-to-markdown-rs/tests/lists_test.rs +4 -4
  57. data/vendor/html-to-markdown-rs/tests/reference_links_test.rs +169 -0
  58. metadata +13 -18
  59. data/ext/html-to-markdown-rb/extconf.rb +0 -41
  60. data/ext/html-to-markdown-rb/native/Cargo.lock +0 -934
  61. data/ext/html-to-markdown-rb/native/Cargo.toml +0 -48
  62. data/ext/html-to-markdown-rb/native/README.md +0 -215
  63. data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +0 -54
  64. data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +0 -158
  65. data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -11
  66. data/ext/html-to-markdown-rb/native/src/lib.rs +0 -128
  67. data/ext/html-to-markdown-rb/native/src/options.rs +0 -238
  68. data/ext/html-to-markdown-rb/native/src/types.rs +0 -24
  69. data/lib/html_to_markdown/cli.rb +0 -21
  70. data/lib/html_to_markdown/cli_proxy.rb +0 -74
  71. data/spec/cli_proxy_spec.rb +0 -42
  72. data/spec/spec_helper.rb +0 -10
@@ -78,11 +78,20 @@ pub(crate) fn handle_audio(
78
78
  };
79
79
 
80
80
  if should_output_media_link(&src) {
81
- output.push('[');
82
- output.push_str(&src);
83
- output.push_str("](");
84
- output.push_str(&src);
85
- output.push(')');
81
+ if let Some(ref collector) = ctx.reference_collector {
82
+ let ref_num = collector.borrow_mut().get_or_insert(&src, None);
83
+ output.push('[');
84
+ output.push_str(&src);
85
+ output.push_str("][");
86
+ output.push_str(&ref_num.to_string());
87
+ output.push(']');
88
+ } else {
89
+ output.push('[');
90
+ output.push_str(&src);
91
+ output.push_str("](");
92
+ output.push_str(&src);
93
+ output.push(')');
94
+ }
86
95
  if !ctx.in_paragraph && !ctx.convert_as_inline {
87
96
  output.push_str("\n\n");
88
97
  }
@@ -132,11 +141,20 @@ pub(crate) fn handle_video(
132
141
  };
133
142
 
134
143
  if should_output_media_link(&src) {
135
- output.push('[');
136
- output.push_str(&src);
137
- output.push_str("](");
138
- output.push_str(&src);
139
- output.push(')');
144
+ if let Some(ref collector) = ctx.reference_collector {
145
+ let ref_num = collector.borrow_mut().get_or_insert(&src, None);
146
+ output.push('[');
147
+ output.push_str(&src);
148
+ output.push_str("][");
149
+ output.push_str(&ref_num.to_string());
150
+ output.push(']');
151
+ } else {
152
+ output.push('[');
153
+ output.push_str(&src);
154
+ output.push_str("](");
155
+ output.push_str(&src);
156
+ output.push(')');
157
+ }
140
158
  if !ctx.in_paragraph && !ctx.convert_as_inline {
141
159
  output.push_str("\n\n");
142
160
  }
@@ -199,11 +217,20 @@ pub(crate) fn handle_iframe(tag: &HTMLTag, output: &mut String, ctx: &Context) {
199
217
  .map_or(Cow::Borrowed(""), |v| v.as_utf8_str());
200
218
 
201
219
  if !src.is_empty() {
202
- output.push('[');
203
- output.push_str(&src);
204
- output.push_str("](");
205
- output.push_str(&src);
206
- output.push(')');
220
+ if let Some(ref collector) = ctx.reference_collector {
221
+ let ref_num = collector.borrow_mut().get_or_insert(&src, None);
222
+ output.push('[');
223
+ output.push_str(&src);
224
+ output.push_str("][");
225
+ output.push_str(&ref_num.to_string());
226
+ output.push(']');
227
+ } else {
228
+ output.push('[');
229
+ output.push_str(&src);
230
+ output.push_str("](");
231
+ output.push_str(&src);
232
+ output.push(')');
233
+ }
207
234
  if !ctx.in_paragraph && !ctx.convert_as_inline {
208
235
  output.push_str("\n\n");
209
236
  }
@@ -40,7 +40,7 @@
40
40
  //!
41
41
  //! Each submodule (block, inline, list, etc.) follows a consistent pattern:
42
42
  //!
43
- //! ```ignore
43
+ //! ```text
44
44
  //! // Module declares handlers for specific element types
45
45
  //! pub fn dispatch_<category>_handler(
46
46
  //! tag_name: &str,
@@ -74,7 +74,7 @@
74
74
  //! Once `converter.rs` is refactored to use `converter/main.rs`, the walk_node function
75
75
  //! will use dispatch functions like:
76
76
  //!
77
- //! ```ignore
77
+ //! ```text
78
78
  //! use crate::converter::{block, inline, list, media, semantic, form};
79
79
  //!
80
80
  //! fn walk_node(...) {
@@ -103,6 +103,7 @@ pub mod media;
103
103
  mod metadata;
104
104
  pub mod plain_text;
105
105
  pub mod preprocessing_helpers;
106
+ pub mod reference_collector;
106
107
  pub mod semantic;
107
108
  pub mod text;
108
109
  mod text_node;
@@ -0,0 +1,69 @@
1
+ //! Collector for reference-style link definitions.
2
+
3
+ use std::cell::RefCell;
4
+ use std::collections::HashMap;
5
+ use std::rc::Rc;
6
+
7
+ /// Shared handle for passing the collector through the conversion context.
8
+ pub type ReferenceCollectorHandle = Rc<RefCell<ReferenceCollector>>;
9
+
10
+ #[derive(Debug, Clone, Hash, Eq, PartialEq)]
11
+ struct ReferenceKey {
12
+ url: String,
13
+ title: Option<String>,
14
+ }
15
+
16
+ /// Collects link/image references during conversion and produces a reference
17
+ /// definitions section at the end of the document.
18
+ #[derive(Debug, Default)]
19
+ pub struct ReferenceCollector {
20
+ map: HashMap<ReferenceKey, usize>,
21
+ entries: Vec<(usize, String, Option<String>)>,
22
+ }
23
+
24
+ impl ReferenceCollector {
25
+ /// Create a new, empty reference collector.
26
+ pub fn new() -> Self {
27
+ Self::default()
28
+ }
29
+
30
+ /// Register a URL (and optional title) and return its 1-based reference number.
31
+ ///
32
+ /// If the same URL+title pair was already registered, the existing number is returned.
33
+ pub fn get_or_insert(&mut self, url: &str, title: Option<&str>) -> usize {
34
+ let key = ReferenceKey {
35
+ url: url.to_string(),
36
+ title: title.map(String::from),
37
+ };
38
+ if let Some(&num) = self.map.get(&key) {
39
+ return num;
40
+ }
41
+ let num = self.entries.len() + 1;
42
+ self.map.insert(key, num);
43
+ self.entries.push((num, url.to_string(), title.map(String::from)));
44
+ num
45
+ }
46
+
47
+ /// Produce the reference definitions section.
48
+ ///
49
+ /// Returns an empty string when no references were collected.
50
+ pub fn finish(&self) -> String {
51
+ if self.entries.is_empty() {
52
+ return String::new();
53
+ }
54
+ let mut out = String::new();
55
+ for (num, url, title) in &self.entries {
56
+ out.push('[');
57
+ out.push_str(&num.to_string());
58
+ out.push_str("]: ");
59
+ out.push_str(url);
60
+ if let Some(t) = title {
61
+ out.push_str(" \"");
62
+ out.push_str(&t.replace('"', "\\\""));
63
+ out.push('"');
64
+ }
65
+ out.push('\n');
66
+ }
67
+ out
68
+ }
69
+ }
@@ -64,7 +64,7 @@ pub use summary::handle as handle_summary;
64
64
  ///
65
65
  /// # Example
66
66
  ///
67
- /// ```ignore
67
+ /// ```text
68
68
  /// if dispatch_semantic_handler(tag_name, &node_handle, &parser, output, options, ctx, depth, dom_ctx) {
69
69
  /// // Tag was handled
70
70
  /// } else {
@@ -166,7 +166,7 @@ pub(crate) fn is_block_level_element(tag_name: &str) -> bool {
166
166
  /// If `index` is already a char boundary it is returned unchanged.
167
167
  /// Otherwise it walks backwards to find one. Returns 0 if no boundary
168
168
  /// is found before `index`.
169
- pub(crate) fn floor_char_boundary(s: &str, index: usize) -> usize {
169
+ pub fn floor_char_boundary(s: &str, index: usize) -> usize {
170
170
  if index >= s.len() {
171
171
  s.len()
172
172
  } else {
@@ -18,6 +18,7 @@ pub use crate::metadata::{
18
18
  };
19
19
 
20
20
  pub use crate::options::{
21
- CodeBlockStyle, ConversionOptions, ConversionOptionsUpdate, HeadingStyle, HighlightStyle, ListIndentType,
22
- NewlineStyle, OutputFormat, PreprocessingOptions, PreprocessingOptionsUpdate, PreprocessingPreset, WhitespaceMode,
21
+ CodeBlockStyle, ConversionOptions, ConversionOptionsBuilder, ConversionOptionsUpdate, HeadingStyle, HighlightStyle,
22
+ LinkStyle, ListIndentType, NewlineStyle, OutputFormat, PreprocessingOptions, PreprocessingOptionsUpdate,
23
+ PreprocessingPreset, WhitespaceMode,
23
24
  };
@@ -26,7 +26,7 @@ pub const DEFAULT_INLINE_IMAGE_LIMIT: u64 = 5 * 1024 * 1024;
26
26
  /// corresponding fields unchanged when applied via [`InlineImageConfig::apply_update`].
27
27
  #[derive(Debug, Clone, Default)]
28
28
  #[cfg_attr(any(feature = "serde", feature = "metadata"), derive(serde::Deserialize))]
29
- #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(rename_all = "camelCase"))]
29
+ #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(deny_unknown_fields))]
30
30
  pub struct InlineImageConfigUpdate {
31
31
  /// Optional maximum decoded size override in bytes.
32
32
  pub max_decoded_size_bytes: Option<u64>,
@@ -77,9 +77,8 @@ mod validation;
77
77
  pub use exports::*;
78
78
  pub use types::{
79
79
  AnnotationKind, ConversionResult, DocumentNode, DocumentStructure, GridCell, NodeContent, ProcessingWarning,
80
- TableGrid, TextAnnotation, WarningKind,
80
+ TableData, TableGrid, TextAnnotation, WarningKind,
81
81
  };
82
- // Note: types::TableData will replace convert_api::TableData when convert() is refactored
83
82
 
84
83
  // ============================================================================
85
84
  // Main Public API Functions
@@ -133,7 +133,7 @@ pub struct MetadataConfig {
133
133
  /// ```
134
134
  #[derive(Debug, Clone, Default)]
135
135
  #[cfg_attr(any(feature = "serde", feature = "metadata"), derive(serde::Deserialize))]
136
- #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(rename_all = "camelCase"))]
136
+ #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(deny_unknown_fields))]
137
137
  pub struct MetadataConfigUpdate {
138
138
  /// Optional override for extracting document-level metadata.
139
139
  ///
@@ -46,7 +46,7 @@
46
46
  //!
47
47
  //! ## Basic Usage with `convert()`
48
48
  //!
49
- //! ```ignore
49
+ //! ```text
50
50
  //! use html_to_markdown_rs::convert;
51
51
  //!
52
52
  //! let html = r#"
@@ -87,7 +87,7 @@
87
87
  //!
88
88
  //! ## Selective Extraction
89
89
  //!
90
- //! ```ignore
90
+ //! ```text
91
91
  //! use html_to_markdown_rs::{convert, ConversionOptions};
92
92
  //!
93
93
  //! let options = ConversionOptions {
@@ -102,7 +102,7 @@
102
102
  //!
103
103
  //! ## Analyzing Link Types
104
104
  //!
105
- //! ```ignore
105
+ //! ```text
106
106
  //! use html_to_markdown_rs::convert;
107
107
  //! use html_to_markdown_rs::metadata::LinkType;
108
108
  //!
@@ -126,7 +126,7 @@
126
126
  //! All types in this module support serialization via `serde` when the `metadata` feature is enabled.
127
127
  //! This enables easy export to JSON, YAML, or other formats:
128
128
  //!
129
- //! ```ignore
129
+ //! ```text
130
130
  //! use html_to_markdown_rs::convert;
131
131
  //!
132
132
  //! let result = convert(html, None)?;
@@ -160,7 +160,7 @@ use std::rc::Rc;
160
160
  ///
161
161
  /// # Examples
162
162
  ///
163
- /// ```ignore
163
+ /// ```text
164
164
  /// let collector = MetadataCollector::new(MetadataConfig::default());
165
165
  /// let handle = Rc::new(RefCell::new(collector));
166
166
  ///
@@ -4,7 +4,7 @@
4
4
 
5
5
  use crate::options::preprocessing::PreprocessingOptions;
6
6
  use crate::options::validation::{
7
- CodeBlockStyle, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, OutputFormat, WhitespaceMode,
7
+ CodeBlockStyle, HeadingStyle, HighlightStyle, LinkStyle, ListIndentType, NewlineStyle, OutputFormat, WhitespaceMode,
8
8
  };
9
9
 
10
10
  /// Main conversion options for HTML to Markdown conversion.
@@ -13,7 +13,7 @@ use crate::options::validation::{
13
13
  ///
14
14
  /// # Example
15
15
  ///
16
- /// ```rust,ignore
16
+ /// ```text
17
17
  /// use html_to_markdown_rs::ConversionOptions;
18
18
  ///
19
19
  /// let options = ConversionOptions::builder()
@@ -27,10 +27,7 @@ use crate::options::validation::{
27
27
  any(feature = "serde", feature = "metadata"),
28
28
  derive(serde::Serialize, serde::Deserialize)
29
29
  )]
30
- #[cfg_attr(
31
- any(feature = "serde", feature = "metadata"),
32
- serde(rename_all = "camelCase", default, deny_unknown_fields)
33
- )]
30
+ #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(default, deny_unknown_fields))]
34
31
  pub struct ConversionOptions {
35
32
  /// Heading style to use in Markdown output (ATX `#` or Setext underline).
36
33
  pub heading_style: HeadingStyle,
@@ -94,6 +91,8 @@ pub struct ConversionOptions {
94
91
  pub preserve_tags: Vec<String>,
95
92
  /// Skip conversion of `<img>` elements (omit images from output).
96
93
  pub skip_images: bool,
94
+ /// Link rendering style (inline or reference).
95
+ pub link_style: LinkStyle,
97
96
  /// Target output format (Markdown, plain text, etc.).
98
97
  pub output_format: OutputFormat,
99
98
  /// Include structured document tree in result.
@@ -114,7 +113,7 @@ impl Default for ConversionOptions {
114
113
  heading_style: HeadingStyle::default(),
115
114
  list_indent_type: ListIndentType::default(),
116
115
  list_indent_width: 2,
117
- bullets: "-".to_string(),
116
+ bullets: "-*+".to_string(),
118
117
  strong_em_symbol: '*',
119
118
  escape_asterisks: false,
120
119
  escape_underscores: false,
@@ -142,6 +141,7 @@ impl Default for ConversionOptions {
142
141
  strip_tags: Vec::new(),
143
142
  preserve_tags: Vec::new(),
144
143
  skip_images: false,
144
+ link_style: LinkStyle::default(),
145
145
  output_format: OutputFormat::default(),
146
146
  include_document_structure: false,
147
147
  extract_images: false,
@@ -207,6 +207,7 @@ impl ConversionOptionsBuilder {
207
207
  builder_setter!(newline_style, NewlineStyle);
208
208
  builder_setter!(highlight_style, HighlightStyle);
209
209
  builder_setter_into!(code_language, String);
210
+ builder_setter!(link_style, LinkStyle);
210
211
  builder_setter!(autolinks, bool);
211
212
  builder_setter!(default_title, bool);
212
213
  builder_setter!(br_in_tables, bool);
@@ -289,10 +290,7 @@ use crate::options::preprocessing::PreprocessingOptionsUpdate;
289
290
  any(feature = "serde", feature = "metadata"),
290
291
  derive(serde::Serialize, serde::Deserialize)
291
292
  )]
292
- #[cfg_attr(
293
- any(feature = "serde", feature = "metadata"),
294
- serde(rename_all = "camelCase", deny_unknown_fields)
295
- )]
293
+ #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(deny_unknown_fields))]
296
294
  pub struct ConversionOptionsUpdate {
297
295
  /// Optional override for [`ConversionOptions::heading_style`].
298
296
  pub heading_style: Option<HeadingStyle>,
@@ -356,6 +354,8 @@ pub struct ConversionOptionsUpdate {
356
354
  pub preserve_tags: Option<Vec<String>>,
357
355
  /// Optional override for [`ConversionOptions::skip_images`].
358
356
  pub skip_images: Option<bool>,
357
+ /// Optional override for [`ConversionOptions::link_style`].
358
+ pub link_style: Option<LinkStyle>,
359
359
  /// Optional override for [`ConversionOptions::output_format`].
360
360
  pub output_format: Option<OutputFormat>,
361
361
  /// Optional override for [`ConversionOptions::include_document_structure`].
@@ -410,6 +410,7 @@ impl ConversionOptions {
410
410
  apply!(strip_tags);
411
411
  apply!(preserve_tags);
412
412
  apply!(skip_images);
413
+ apply!(link_style);
413
414
  apply!(output_format);
414
415
  apply!(include_document_structure);
415
416
  apply!(extract_images);
@@ -465,8 +466,8 @@ mod tests {
465
466
  #[test]
466
467
  fn test_conversion_options_partial_deserialization() {
467
468
  let partial_json = r#"{
468
- "headingStyle": "atxClosed",
469
- "listIndentWidth": 4,
469
+ "heading_style": "atxclosed",
470
+ "list_indent_width": 4,
470
471
  "bullets": "*"
471
472
  }"#;
472
473
 
@@ -10,10 +10,10 @@ pub mod preprocessing;
10
10
  pub mod validation;
11
11
 
12
12
  // Re-exports for easy access
13
- pub use conversion::{ConversionOptions, ConversionOptionsUpdate};
13
+ pub use conversion::{ConversionOptions, ConversionOptionsBuilder, ConversionOptionsUpdate};
14
14
  pub use preprocessing::{PreprocessingOptions, PreprocessingOptionsUpdate, PreprocessingPreset};
15
15
  pub use validation::{
16
- CodeBlockStyle, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, OutputFormat, WhitespaceMode,
16
+ CodeBlockStyle, HeadingStyle, HighlightStyle, LinkStyle, ListIndentType, NewlineStyle, OutputFormat, WhitespaceMode,
17
17
  };
18
18
 
19
19
  // Note: InlineImageConfig is re-exported from the inline_images module,
@@ -42,10 +42,7 @@ impl PreprocessingPreset {
42
42
  any(feature = "serde", feature = "metadata"),
43
43
  derive(serde::Serialize, serde::Deserialize)
44
44
  )]
45
- #[cfg_attr(
46
- any(feature = "serde", feature = "metadata"),
47
- serde(rename_all = "camelCase", deny_unknown_fields)
48
- )]
45
+ #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(default, deny_unknown_fields))]
49
46
  pub struct PreprocessingOptions {
50
47
  /// Enable HTML preprocessing globally
51
48
  pub enabled: bool,
@@ -70,10 +67,7 @@ pub struct PreprocessingOptions {
70
67
  any(feature = "serde", feature = "metadata"),
71
68
  derive(serde::Serialize, serde::Deserialize)
72
69
  )]
73
- #[cfg_attr(
74
- any(feature = "serde", feature = "metadata"),
75
- serde(rename_all = "camelCase", deny_unknown_fields)
76
- )]
70
+ #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(deny_unknown_fields))]
77
71
  pub struct PreprocessingOptionsUpdate {
78
72
  /// Optional global preprocessing enablement override
79
73
  pub enabled: Option<bool>,
@@ -91,7 +85,7 @@ pub struct PreprocessingOptionsUpdate {
91
85
  impl Default for PreprocessingOptions {
92
86
  fn default() -> Self {
93
87
  Self {
94
- enabled: false,
88
+ enabled: true,
95
89
  preset: PreprocessingPreset::default(),
96
90
  remove_navigation: true,
97
91
  remove_forms: true,
@@ -115,10 +115,10 @@ impl NewlineStyle {
115
115
  /// Determines how code blocks (`<pre><code>`) are rendered in Markdown.
116
116
  #[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
117
117
  pub enum CodeBlockStyle {
118
- /// Indented code blocks (4 spaces). Default. `CommonMark` standard.
119
- #[default]
118
+ /// Indented code blocks (4 spaces). `CommonMark` standard.
120
119
  Indented,
121
- /// Fenced code blocks with backticks (```). Supports language hints.
120
+ /// Fenced code blocks with backticks (```). Default (GFM). Supports language hints.
121
+ #[default]
122
122
  Backticks,
123
123
  /// Fenced code blocks with tildes (~~~). Supports language hints.
124
124
  Tildes,
@@ -172,6 +172,33 @@ impl HighlightStyle {
172
172
  }
173
173
  }
174
174
 
175
+ /// Link rendering style in Markdown output.
176
+ ///
177
+ /// Controls whether links and images use inline `[text](url)` syntax or
178
+ /// reference-style `[text][1]` syntax with definitions collected at the end.
179
+ #[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
180
+ pub enum LinkStyle {
181
+ /// Inline links: `[text](url)`. Default.
182
+ #[default]
183
+ Inline,
184
+ /// Reference-style links: `[text][1]` with `[1]: url` at end of document.
185
+ Reference,
186
+ }
187
+
188
+ impl LinkStyle {
189
+ /// Parse a link style from a string.
190
+ ///
191
+ /// Accepts "reference" or defaults to Inline.
192
+ /// Input is normalized (lowercased, alphanumeric only).
193
+ #[must_use]
194
+ pub fn parse(value: &str) -> Self {
195
+ match normalize_token(value).as_str() {
196
+ "reference" => Self::Reference,
197
+ _ => Self::Inline,
198
+ }
199
+ }
200
+ }
201
+
175
202
  /// Output format for conversion.
176
203
  ///
177
204
  /// Specifies the target markup language format for the conversion output.
@@ -215,7 +242,8 @@ pub(crate) fn normalize_token(value: &str) -> String {
215
242
  #[cfg(any(feature = "serde", feature = "metadata"))]
216
243
  mod serde_impls {
217
244
  use super::{
218
- CodeBlockStyle, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, OutputFormat, WhitespaceMode,
245
+ CodeBlockStyle, HeadingStyle, HighlightStyle, LinkStyle, ListIndentType, NewlineStyle, OutputFormat,
246
+ WhitespaceMode,
219
247
  };
220
248
  use serde::{Deserialize, Serialize, Serializer};
221
249
 
@@ -239,6 +267,7 @@ mod serde_impls {
239
267
  impl_deserialize_from_parse!(NewlineStyle, NewlineStyle::parse);
240
268
  impl_deserialize_from_parse!(CodeBlockStyle, CodeBlockStyle::parse);
241
269
  impl_deserialize_from_parse!(HighlightStyle, HighlightStyle::parse);
270
+ impl_deserialize_from_parse!(LinkStyle, LinkStyle::parse);
242
271
  impl_deserialize_from_parse!(OutputFormat, OutputFormat::parse);
243
272
 
244
273
  // Serialize implementations that convert enum variants to their string representations
@@ -324,6 +353,19 @@ mod serde_impls {
324
353
  }
325
354
  }
326
355
 
356
+ impl Serialize for LinkStyle {
357
+ fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
358
+ where
359
+ S: Serializer,
360
+ {
361
+ let s = match self {
362
+ Self::Inline => "inline",
363
+ Self::Reference => "reference",
364
+ };
365
+ serializer.serialize_str(s)
366
+ }
367
+ }
368
+
327
369
  impl Serialize for OutputFormat {
328
370
  fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
329
371
  where
@@ -147,8 +147,10 @@ pub struct TextAnnotation {
147
147
  /// Uses internally tagged representation (`"annotation_type": "bold"`) for JSON serialization.
148
148
  #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
149
149
  #[serde(tag = "annotation_type", rename_all = "snake_case")]
150
+ #[derive(Default)]
150
151
  pub enum AnnotationKind {
151
152
  /// Bold / strong emphasis.
153
+ #[default]
152
154
  Bold,
153
155
  /// Italic / emphasis.
154
156
  Italic,
@@ -173,3 +175,12 @@ pub enum AnnotationKind {
173
175
  title: Option<String>,
174
176
  },
175
177
  }
178
+
179
+ impl Default for NodeContent {
180
+ fn default() -> Self {
181
+ Self::Heading {
182
+ level: 1,
183
+ text: String::new(),
184
+ }
185
+ }
186
+ }
@@ -1,5 +1,7 @@
1
1
  //! The primary result type for HTML conversion and extraction.
2
2
 
3
+ use serde::{Deserialize, Serialize};
4
+
3
5
  use super::document::DocumentStructure;
4
6
  use super::tables::TableData;
5
7
  use super::warnings::ProcessingWarning;
@@ -11,14 +13,14 @@ use super::warnings::ProcessingWarning;
11
13
  ///
12
14
  /// # Example
13
15
  ///
14
- /// ```rust,ignore
16
+ /// ```text
15
17
  /// use html_to_markdown_rs::{convert, ConversionOptions};
16
18
  ///
17
19
  /// let result = convert("<h1>Hello</h1><p>World</p>", None)?;
18
20
  /// assert!(result.content.is_some());
19
21
  /// assert!(result.warnings.is_empty());
20
22
  /// ```
21
- #[derive(Debug, Clone, Default)]
23
+ #[derive(Debug, Clone, Default, Serialize, Deserialize)]
22
24
  pub struct ConversionResult {
23
25
  /// Converted text output (markdown, djot, or plain text).
24
26
  ///
@@ -42,6 +44,7 @@ pub struct ConversionResult {
42
44
  ///
43
45
  /// Populated when `extract_images` is `true` in options.
44
46
  #[cfg(feature = "inline-images")]
47
+ #[serde(skip)]
45
48
  pub images: Vec<crate::inline_images::InlineImage>,
46
49
 
47
50
  /// Non-fatal processing warnings.
@@ -3,7 +3,7 @@
3
3
  use serde::{Deserialize, Serialize};
4
4
 
5
5
  /// A structured table grid with cell-level data including spans.
6
- #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
6
+ #[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
7
7
  pub struct TableGrid {
8
8
  /// Number of rows.
9
9
  pub rows: u32,
@@ -13,7 +13,7 @@
13
13
  //!
14
14
  //! # Example
15
15
  //!
16
- //! ```ignore
16
+ //! ```text
17
17
  //! use html_to_markdown_rs::visitor::{HtmlVisitor, NodeContext, VisitResult};
18
18
  //!
19
19
  //! struct CustomVisitor;
@@ -42,7 +42,7 @@ use crate::visitor::NodeType;
42
42
  ///
43
43
  /// # Examples
44
44
  ///
45
- /// ```ignore
45
+ /// ```text
46
46
  /// let ctx = build_node_context(
47
47
  /// NodeType::Heading,
48
48
  /// "h1",
@@ -51,7 +51,7 @@ use super::content::VisitorDispatch;
51
51
  ///
52
52
  /// # Examples
53
53
  ///
54
- /// ```ignore
54
+ /// ```text
55
55
  /// let result = dispatch_visitor(
56
56
  /// &visitor,
57
57
  /// |v| v.visit_heading(&ctx, level, text, id),