html-to-markdown 3.1.0 → 3.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +42 -12
  3. data/Gemfile +1 -0
  4. data/Gemfile.lock +27 -55
  5. data/README.md +9 -10
  6. data/Rakefile +4 -10
  7. data/ext/html-to-markdown_rb/Cargo.toml +14 -0
  8. data/ext/html_to_markdown_rb/Cargo.toml +16 -0
  9. data/ext/html_to_markdown_rb/extconf.rb +10 -0
  10. data/ext/html_to_markdown_rb/src/html_to_markdown_rs/version.rb +6 -0
  11. data/ext/html_to_markdown_rb/src/html_to_markdown_rs.rb +9 -0
  12. data/ext/html_to_markdown_rb/src/lib.rs +3941 -0
  13. data/html-to-markdown-rb.gemspec +1 -1
  14. data/lib/html_to_markdown/version.rb +1 -1
  15. data/lib/html_to_markdown.rb +31 -21
  16. data/{ext/html-to-markdown-rb/native/extconf.rb → lib/html_to_markdown_rs.rb} +1 -1
  17. data/sig/html_to_markdown.rbs +17 -5
  18. data/vendor/Cargo.toml +4 -4
  19. data/vendor/html-to-markdown-rs/Cargo.toml +2 -2
  20. data/vendor/html-to-markdown-rs/examples/test_deser.rs +12 -0
  21. data/vendor/html-to-markdown-rs/src/converter/block/mod.rs +1 -1
  22. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +1 -1
  23. data/vendor/html-to-markdown-rs/src/converter/form/mod.rs +1 -1
  24. data/vendor/html-to-markdown-rs/src/converter/inline/mod.rs +1 -1
  25. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +10 -2
  26. data/vendor/html-to-markdown-rs/src/converter/mod.rs +2 -2
  27. data/vendor/html-to-markdown-rs/src/converter/semantic/mod.rs +1 -1
  28. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +1 -1
  29. data/vendor/html-to-markdown-rs/src/exports.rs +3 -3
  30. data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
  31. data/vendor/html-to-markdown-rs/src/lib.rs +1 -2
  32. data/vendor/html-to-markdown-rs/src/metadata/config.rs +1 -1
  33. data/vendor/html-to-markdown-rs/src/metadata/mod.rs +5 -5
  34. data/vendor/html-to-markdown-rs/src/options/conversion.rs +6 -12
  35. data/vendor/html-to-markdown-rs/src/options/mod.rs +1 -1
  36. data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +3 -9
  37. data/vendor/html-to-markdown-rs/src/options/validation.rs +3 -3
  38. data/vendor/html-to-markdown-rs/src/types/document.rs +11 -0
  39. data/vendor/html-to-markdown-rs/src/types/result.rs +5 -2
  40. data/vendor/html-to-markdown-rs/src/types/tables.rs +1 -1
  41. data/vendor/html-to-markdown-rs/src/visitor/mod.rs +1 -1
  42. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/state.rs +1 -1
  43. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/traversal.rs +1 -1
  44. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +8 -8
  45. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +6 -0
  46. data/vendor/html-to-markdown-rs/tests/integration_test.rs +3 -3
  47. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -2
  48. data/vendor/html-to-markdown-rs/tests/lists_test.rs +4 -4
  49. metadata +11 -18
  50. data/ext/html-to-markdown-rb/extconf.rb +0 -41
  51. data/ext/html-to-markdown-rb/native/Cargo.lock +0 -934
  52. data/ext/html-to-markdown-rb/native/Cargo.toml +0 -48
  53. data/ext/html-to-markdown-rb/native/README.md +0 -215
  54. data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +0 -54
  55. data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +0 -158
  56. data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -11
  57. data/ext/html-to-markdown-rb/native/src/lib.rs +0 -128
  58. data/ext/html-to-markdown-rb/native/src/options.rs +0 -238
  59. data/ext/html-to-markdown-rb/native/src/types.rs +0 -24
  60. data/lib/html_to_markdown/cli.rb +0 -21
  61. data/lib/html_to_markdown/cli_proxy.rb +0 -74
  62. data/spec/cli_proxy_spec.rb +0 -42
  63. data/spec/spec_helper.rb +0 -10
@@ -87,7 +87,7 @@ Gem::Specification.new do |spec|
87
87
  spec.files = files
88
88
  spec.extra_rdoc_files = ['README.md']
89
89
 
90
- spec.extensions = ['ext/html-to-markdown-rb/extconf.rb']
90
+ spec.extensions = ['ext/html_to_markdown_rb/extconf.rb']
91
91
 
92
92
  spec.add_dependency 'rb_sys', '>= 0.9', '< 1.0'
93
93
  spec.metadata['rubygems_mfa_required'] = 'true'
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HtmlToMarkdown
4
- VERSION = '3.1.0'
4
+ VERSION = '3.2.1'
5
5
  end
@@ -3,28 +3,38 @@
3
3
  require_relative 'html_to_markdown/version'
4
4
  require 'html_to_markdown_rb'
5
5
 
6
+ # High-performance HTML to Markdown conversion.
7
+ #
8
+ # @example Simple conversion
9
+ # HtmlToMarkdown.convert('<h1>Hello</h1>') # => "# Hello\n\n"
10
+ #
11
+ # @example With options
12
+ # HtmlToMarkdown.convert('<h1>Hello</h1>', heading_style: 'atx')
6
13
  module HtmlToMarkdown
7
- autoload :CLI, 'html_to_markdown/cli'
8
- autoload :CLIProxy, 'html_to_markdown/cli_proxy'
9
-
10
- class << self
11
- alias native_convert convert
12
- end
13
-
14
- module_function
15
-
16
- # Convert HTML to Markdown, returning a Hash with:
17
- # - :content [String, nil] the converted Markdown output
18
- # - :document [nil] document structure (not yet exposed)
19
- # - :metadata [Hash, nil] extracted HTML metadata
20
- # - :tables [Array<Hash>] extracted tables with :grid and :markdown
21
- # - :images [Array<Hash>] extracted inline images
22
- # - :warnings [Array<Hash>] processing warnings
14
+ # Convert HTML to Markdown.
23
15
  #
24
- # @param html [String] HTML string to convert
25
- # @param options [Hash, nil] optional conversion options
26
- # @return [Hash] conversion result
27
- def convert(html, options = nil)
28
- native_convert(html.to_s, options)
16
+ # @param html [String] The HTML content to convert.
17
+ # @param options [Hash] Optional conversion options.
18
+ # Supported keys (all optional):
19
+ # - :heading_style - 'atx', 'atx_closed', 'setext', 'underlined'
20
+ # - :code_block_style - 'backticks', 'tildes', 'indented'
21
+ # - :escape_asterisks - Boolean
22
+ # - :escape_underscores - Boolean
23
+ # - :escape_misc - Boolean
24
+ # - :escape_ascii - Boolean
25
+ # - :strip_newlines - Boolean
26
+ # - :keep_inline_images_in - Array of tag names
27
+ # - :strip_tags - Array of tag names to strip
28
+ # - :preserve_tags - Array of tag names to preserve verbatim
29
+ # (and more, matching ConversionOptions fields)
30
+ # @return [String] The converted Markdown content.
31
+ def self.convert(html, options = {})
32
+ opts = if options.nil? || options.empty?
33
+ nil
34
+ else
35
+ HtmlToMarkdownRs::ConversionOptions.new(options)
36
+ end
37
+ result = HtmlToMarkdownRs.convert(html, opts)
38
+ result.content || ''
29
39
  end
30
40
  end
@@ -1,3 +1,3 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require_relative '../extconf'
3
+ require 'html_to_markdown_rb'
@@ -1,3 +1,16 @@
1
+ # Native extension module (Magnus/rb-sys)
2
+ module HtmlToMarkdownRs
3
+ class ConversionOptions
4
+ def initialize: (Hash[Symbol, untyped]) -> void
5
+ end
6
+
7
+ class ConversionResult
8
+ def content: () -> String?
9
+ end
10
+
11
+ def self.convert: (String html, ConversionOptions? options) -> ConversionResult
12
+ end
13
+
1
14
  # Type definitions for HtmlToMarkdown Ruby gem
2
15
  module HtmlToMarkdown
3
16
  VERSION: String
@@ -8,6 +21,7 @@ module HtmlToMarkdown
8
21
  type whitespace_mode = :normalized | :strict
9
22
  type newline_style = :spaces | :backslash
10
23
  type code_block_style = :indented | :backticks | :tildes
24
+ type link_style = :inline | :reference
11
25
  type output_format = :markdown | :djot
12
26
  type preprocessing_preset = :minimal | :standard | :aggressive
13
27
 
@@ -49,6 +63,7 @@ module HtmlToMarkdown
49
63
  debug?: bool,
50
64
  strip_tags?: Array[String],
51
65
  preserve_tags?: Array[String],
66
+ link_style?: link_style,
52
67
  output_format?: output_format,
53
68
  skip_images?: bool,
54
69
  include_document_structure?: bool,
@@ -126,12 +141,9 @@ module HtmlToMarkdown
126
141
 
127
142
  public
128
143
 
129
- # Convert HTML to Markdown, returning a Hash with content, metadata, tables, images, and warnings.
144
+ # Convert HTML to Markdown, returning the markdown content string.
130
145
  #
131
146
  # Example:
132
147
  # result = HtmlToMarkdown.convert(html)
133
- def self.convert: (String html, ?conversion_options options) -> Hash[String, untyped]
134
-
135
- # Instance method version (created by module_function)
136
- def convert: (String html, ?conversion_options options) -> Hash[String, untyped]
148
+ def self.convert: (String html, ?conversion_options options) -> String
137
149
  end
data/vendor/Cargo.toml CHANGED
@@ -3,7 +3,7 @@ members = ["html-to-markdown-rs"]
3
3
  resolver = "2"
4
4
 
5
5
  [workspace.package]
6
- version = "3.1.0"
6
+ version = "3.2.1"
7
7
  edition = "2024"
8
8
  rust-version = "1.85"
9
9
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -18,11 +18,11 @@ clap = { version = "4.6", features = ["derive"] }
18
18
  clap_complete = "4.6"
19
19
  clap_mangen = "0.3"
20
20
  encoding_rs = "0.8"
21
- ext-php-rs = "0.15.8"
21
+ ext-php-rs = "0.15.10"
22
22
  html5ever = "0.39.0"
23
23
  once_cell = "1.21"
24
- pyo3 = { version = "0.28.2", features = ["abi3-py310"] }
25
- rayon = "1.11"
24
+ pyo3 = { version = "0.28.3", features = ["abi3-py310"] }
25
+ rayon = "1.12"
26
26
  regex = "1.12"
27
27
  serde = { version = "1.0", features = ["derive"] }
28
28
  serde_json = "1.0"
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "html-to-markdown-rs"
3
- version = "3.1.0"
3
+ version = "3.2.1"
4
4
  edition = "2024"
5
5
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
6
6
  license = "MIT"
@@ -39,7 +39,7 @@ image = { version = "0.25", default-features = false, features = [
39
39
  "bmp",
40
40
  "webp",
41
41
  ], optional = true }
42
- lru = "0.16"
42
+ lru = "0.17"
43
43
  memchr = "2"
44
44
  once_cell = "1.21"
45
45
  regex = "1.12"
@@ -0,0 +1,12 @@
1
+ #![allow(missing_docs)]
2
+ use html_to_markdown_rs::ConversionOptions;
3
+
4
+ fn main() {
5
+ let json = r#"{"headingStyle":"","listIndentType":"","listIndentWidth":2,"bullets":"-*+","strongEmSymbol":"*","escapeAsterisks":false,"escapeUnderscores":false,"escapeMisc":false,"escapeAscii":false,"codeLanguage":"","autolinks":true,"defaultTitle":false,"brInTables":false,"highlightStyle":"","extractMetadata":true,"whitespaceMode":"","stripNewlines":false,"wrap":false,"wrapWidth":80,"convertAsInline":false,"subSymbol":"","supSymbol":"","newlineStyle":"spaces","codeBlockStyle":"tildes","keepInlineImagesIn":null,"preprocessing":{"enabled":false,"preset":"","removeNavigation":false,"removeForms":false},"encoding":"utf-8","debug":false,"stripTags":null,"preserveTags":null,"skipImages":false,"linkStyle":"","outputFormat":"","includeDocumentStructure":false,"extractImages":false,"maxImageSize":5242880,"captureSvg":false,"inferDimensions":true}"#;
6
+
7
+ let opts: ConversionOptions = serde_json::from_str(json).unwrap();
8
+ println!("code_block_style: {:?}", opts.code_block_style);
9
+
10
+ let result = html_to_markdown_rs::convert("<pre><code>some code</code></pre>", Some(opts)).unwrap();
11
+ println!("result: {:?}", result.content);
12
+ }
@@ -40,7 +40,7 @@ pub use super::{Context, DomContext};
40
40
  /// element was handled, `false` otherwise.
41
41
  ///
42
42
  /// # Usage in converter.rs
43
- /// ```ignore
43
+ /// ```text
44
44
  /// if crate::converter::block::dispatch_block_handler(
45
45
  /// &tag_name,
46
46
  /// node_handle,
@@ -29,7 +29,7 @@ pub(crate) use caption::handle_caption;
29
29
  /// Dispatches table element handling to the main convert_table function.
30
30
  ///
31
31
  /// # Usage in converter.rs
32
- /// ```ignore
32
+ /// ```text
33
33
  /// if "table" == tag_name {
34
34
  /// crate::converter::block::table::handle_table(
35
35
  /// node_handle,
@@ -53,7 +53,7 @@ pub use elements::handle as handle_form_elements;
53
53
  ///
54
54
  /// # Example
55
55
  ///
56
- /// ```ignore
56
+ /// ```text
57
57
  /// if dispatch_form_handler(tag_name, &node_handle, &parser, output, options, ctx, depth, dom_ctx) {
58
58
  /// // Tag was handled
59
59
  /// } else {
@@ -64,7 +64,7 @@ pub use super::{Context, DomContext};
64
64
  ///
65
65
  /// # Usage in converter.rs
66
66
  ///
67
- /// ```ignore
67
+ /// ```text
68
68
  /// if crate::converter::inline::dispatch_inline_handler(
69
69
  /// &tag_name,
70
70
  /// &node_handle,
@@ -198,7 +198,11 @@ pub(crate) fn handle_li(
198
198
  } else {
199
199
  let bullets: Vec<char> = options.bullets.chars().collect();
200
200
  let bullet_index = if ctx.ul_depth > 0 { ctx.ul_depth - 1 } else { 0 };
201
- let bullet = bullets.get(bullet_index % bullets.len()).copied().unwrap_or('*');
201
+ let bullet = if bullets.is_empty() {
202
+ '*'
203
+ } else {
204
+ bullets[bullet_index % bullets.len()]
205
+ };
202
206
  output.push(bullet);
203
207
  output.push(' ');
204
208
  }
@@ -265,7 +269,11 @@ pub(crate) fn handle_li(
265
269
  } else {
266
270
  let bullets: Vec<char> = options.bullets.chars().collect();
267
271
  let bullet_index = if ctx.ul_depth > 0 { ctx.ul_depth - 1 } else { 0 };
268
- let bullet = bullets.get(bullet_index % bullets.len()).copied().unwrap_or('*');
272
+ let bullet = if bullets.is_empty() {
273
+ '*'
274
+ } else {
275
+ bullets[bullet_index % bullets.len()]
276
+ };
269
277
  let bullet_str = bullet.to_string();
270
278
  let text_start = last_line.find(bullet).map_or(0, |pos| pos + 1);
271
279
  (bullet_str, last_line[text_start..].trim().to_string())
@@ -40,7 +40,7 @@
40
40
  //!
41
41
  //! Each submodule (block, inline, list, etc.) follows a consistent pattern:
42
42
  //!
43
- //! ```ignore
43
+ //! ```text
44
44
  //! // Module declares handlers for specific element types
45
45
  //! pub fn dispatch_<category>_handler(
46
46
  //! tag_name: &str,
@@ -74,7 +74,7 @@
74
74
  //! Once `converter.rs` is refactored to use `converter/main.rs`, the walk_node function
75
75
  //! will use dispatch functions like:
76
76
  //!
77
- //! ```ignore
77
+ //! ```text
78
78
  //! use crate::converter::{block, inline, list, media, semantic, form};
79
79
  //!
80
80
  //! fn walk_node(...) {
@@ -64,7 +64,7 @@ pub use summary::handle as handle_summary;
64
64
  ///
65
65
  /// # Example
66
66
  ///
67
- /// ```ignore
67
+ /// ```text
68
68
  /// if dispatch_semantic_handler(tag_name, &node_handle, &parser, output, options, ctx, depth, dom_ctx) {
69
69
  /// // Tag was handled
70
70
  /// } else {
@@ -166,7 +166,7 @@ pub(crate) fn is_block_level_element(tag_name: &str) -> bool {
166
166
  /// If `index` is already a char boundary it is returned unchanged.
167
167
  /// Otherwise it walks backwards to find one. Returns 0 if no boundary
168
168
  /// is found before `index`.
169
- pub(crate) fn floor_char_boundary(s: &str, index: usize) -> usize {
169
+ pub fn floor_char_boundary(s: &str, index: usize) -> usize {
170
170
  if index >= s.len() {
171
171
  s.len()
172
172
  } else {
@@ -18,7 +18,7 @@ pub use crate::metadata::{
18
18
  };
19
19
 
20
20
  pub use crate::options::{
21
- CodeBlockStyle, ConversionOptions, ConversionOptionsUpdate, HeadingStyle, HighlightStyle, LinkStyle,
22
- ListIndentType, NewlineStyle, OutputFormat, PreprocessingOptions, PreprocessingOptionsUpdate, PreprocessingPreset,
23
- WhitespaceMode,
21
+ CodeBlockStyle, ConversionOptions, ConversionOptionsBuilder, ConversionOptionsUpdate, HeadingStyle, HighlightStyle,
22
+ LinkStyle, ListIndentType, NewlineStyle, OutputFormat, PreprocessingOptions, PreprocessingOptionsUpdate,
23
+ PreprocessingPreset, WhitespaceMode,
24
24
  };
@@ -26,7 +26,7 @@ pub const DEFAULT_INLINE_IMAGE_LIMIT: u64 = 5 * 1024 * 1024;
26
26
  /// corresponding fields unchanged when applied via [`InlineImageConfig::apply_update`].
27
27
  #[derive(Debug, Clone, Default)]
28
28
  #[cfg_attr(any(feature = "serde", feature = "metadata"), derive(serde::Deserialize))]
29
- #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(rename_all = "camelCase"))]
29
+ #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(deny_unknown_fields))]
30
30
  pub struct InlineImageConfigUpdate {
31
31
  /// Optional maximum decoded size override in bytes.
32
32
  pub max_decoded_size_bytes: Option<u64>,
@@ -77,9 +77,8 @@ mod validation;
77
77
  pub use exports::*;
78
78
  pub use types::{
79
79
  AnnotationKind, ConversionResult, DocumentNode, DocumentStructure, GridCell, NodeContent, ProcessingWarning,
80
- TableGrid, TextAnnotation, WarningKind,
80
+ TableData, TableGrid, TextAnnotation, WarningKind,
81
81
  };
82
- // Note: types::TableData will replace convert_api::TableData when convert() is refactored
83
82
 
84
83
  // ============================================================================
85
84
  // Main Public API Functions
@@ -133,7 +133,7 @@ pub struct MetadataConfig {
133
133
  /// ```
134
134
  #[derive(Debug, Clone, Default)]
135
135
  #[cfg_attr(any(feature = "serde", feature = "metadata"), derive(serde::Deserialize))]
136
- #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(rename_all = "camelCase"))]
136
+ #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(deny_unknown_fields))]
137
137
  pub struct MetadataConfigUpdate {
138
138
  /// Optional override for extracting document-level metadata.
139
139
  ///
@@ -46,7 +46,7 @@
46
46
  //!
47
47
  //! ## Basic Usage with `convert()`
48
48
  //!
49
- //! ```ignore
49
+ //! ```text
50
50
  //! use html_to_markdown_rs::convert;
51
51
  //!
52
52
  //! let html = r#"
@@ -87,7 +87,7 @@
87
87
  //!
88
88
  //! ## Selective Extraction
89
89
  //!
90
- //! ```ignore
90
+ //! ```text
91
91
  //! use html_to_markdown_rs::{convert, ConversionOptions};
92
92
  //!
93
93
  //! let options = ConversionOptions {
@@ -102,7 +102,7 @@
102
102
  //!
103
103
  //! ## Analyzing Link Types
104
104
  //!
105
- //! ```ignore
105
+ //! ```text
106
106
  //! use html_to_markdown_rs::convert;
107
107
  //! use html_to_markdown_rs::metadata::LinkType;
108
108
  //!
@@ -126,7 +126,7 @@
126
126
  //! All types in this module support serialization via `serde` when the `metadata` feature is enabled.
127
127
  //! This enables easy export to JSON, YAML, or other formats:
128
128
  //!
129
- //! ```ignore
129
+ //! ```text
130
130
  //! use html_to_markdown_rs::convert;
131
131
  //!
132
132
  //! let result = convert(html, None)?;
@@ -160,7 +160,7 @@ use std::rc::Rc;
160
160
  ///
161
161
  /// # Examples
162
162
  ///
163
- /// ```ignore
163
+ /// ```text
164
164
  /// let collector = MetadataCollector::new(MetadataConfig::default());
165
165
  /// let handle = Rc::new(RefCell::new(collector));
166
166
  ///
@@ -13,7 +13,7 @@ use crate::options::validation::{
13
13
  ///
14
14
  /// # Example
15
15
  ///
16
- /// ```rust,ignore
16
+ /// ```text
17
17
  /// use html_to_markdown_rs::ConversionOptions;
18
18
  ///
19
19
  /// let options = ConversionOptions::builder()
@@ -27,10 +27,7 @@ use crate::options::validation::{
27
27
  any(feature = "serde", feature = "metadata"),
28
28
  derive(serde::Serialize, serde::Deserialize)
29
29
  )]
30
- #[cfg_attr(
31
- any(feature = "serde", feature = "metadata"),
32
- serde(rename_all = "camelCase", default, deny_unknown_fields)
33
- )]
30
+ #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(default, deny_unknown_fields))]
34
31
  pub struct ConversionOptions {
35
32
  /// Heading style to use in Markdown output (ATX `#` or Setext underline).
36
33
  pub heading_style: HeadingStyle,
@@ -116,7 +113,7 @@ impl Default for ConversionOptions {
116
113
  heading_style: HeadingStyle::default(),
117
114
  list_indent_type: ListIndentType::default(),
118
115
  list_indent_width: 2,
119
- bullets: "-".to_string(),
116
+ bullets: "-*+".to_string(),
120
117
  strong_em_symbol: '*',
121
118
  escape_asterisks: false,
122
119
  escape_underscores: false,
@@ -293,10 +290,7 @@ use crate::options::preprocessing::PreprocessingOptionsUpdate;
293
290
  any(feature = "serde", feature = "metadata"),
294
291
  derive(serde::Serialize, serde::Deserialize)
295
292
  )]
296
- #[cfg_attr(
297
- any(feature = "serde", feature = "metadata"),
298
- serde(rename_all = "camelCase", deny_unknown_fields)
299
- )]
293
+ #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(deny_unknown_fields))]
300
294
  pub struct ConversionOptionsUpdate {
301
295
  /// Optional override for [`ConversionOptions::heading_style`].
302
296
  pub heading_style: Option<HeadingStyle>,
@@ -472,8 +466,8 @@ mod tests {
472
466
  #[test]
473
467
  fn test_conversion_options_partial_deserialization() {
474
468
  let partial_json = r#"{
475
- "headingStyle": "atxClosed",
476
- "listIndentWidth": 4,
469
+ "heading_style": "atxclosed",
470
+ "list_indent_width": 4,
477
471
  "bullets": "*"
478
472
  }"#;
479
473
 
@@ -10,7 +10,7 @@ pub mod preprocessing;
10
10
  pub mod validation;
11
11
 
12
12
  // Re-exports for easy access
13
- pub use conversion::{ConversionOptions, ConversionOptionsUpdate};
13
+ pub use conversion::{ConversionOptions, ConversionOptionsBuilder, ConversionOptionsUpdate};
14
14
  pub use preprocessing::{PreprocessingOptions, PreprocessingOptionsUpdate, PreprocessingPreset};
15
15
  pub use validation::{
16
16
  CodeBlockStyle, HeadingStyle, HighlightStyle, LinkStyle, ListIndentType, NewlineStyle, OutputFormat, WhitespaceMode,
@@ -42,10 +42,7 @@ impl PreprocessingPreset {
42
42
  any(feature = "serde", feature = "metadata"),
43
43
  derive(serde::Serialize, serde::Deserialize)
44
44
  )]
45
- #[cfg_attr(
46
- any(feature = "serde", feature = "metadata"),
47
- serde(rename_all = "camelCase", deny_unknown_fields)
48
- )]
45
+ #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(default, deny_unknown_fields))]
49
46
  pub struct PreprocessingOptions {
50
47
  /// Enable HTML preprocessing globally
51
48
  pub enabled: bool,
@@ -70,10 +67,7 @@ pub struct PreprocessingOptions {
70
67
  any(feature = "serde", feature = "metadata"),
71
68
  derive(serde::Serialize, serde::Deserialize)
72
69
  )]
73
- #[cfg_attr(
74
- any(feature = "serde", feature = "metadata"),
75
- serde(rename_all = "camelCase", deny_unknown_fields)
76
- )]
70
+ #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(deny_unknown_fields))]
77
71
  pub struct PreprocessingOptionsUpdate {
78
72
  /// Optional global preprocessing enablement override
79
73
  pub enabled: Option<bool>,
@@ -91,7 +85,7 @@ pub struct PreprocessingOptionsUpdate {
91
85
  impl Default for PreprocessingOptions {
92
86
  fn default() -> Self {
93
87
  Self {
94
- enabled: false,
88
+ enabled: true,
95
89
  preset: PreprocessingPreset::default(),
96
90
  remove_navigation: true,
97
91
  remove_forms: true,
@@ -115,10 +115,10 @@ impl NewlineStyle {
115
115
  /// Determines how code blocks (`<pre><code>`) are rendered in Markdown.
116
116
  #[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
117
117
  pub enum CodeBlockStyle {
118
- /// Indented code blocks (4 spaces). Default. `CommonMark` standard.
119
- #[default]
118
+ /// Indented code blocks (4 spaces). `CommonMark` standard.
120
119
  Indented,
121
- /// Fenced code blocks with backticks (```). Supports language hints.
120
+ /// Fenced code blocks with backticks (```). Default (GFM). Supports language hints.
121
+ #[default]
122
122
  Backticks,
123
123
  /// Fenced code blocks with tildes (~~~). Supports language hints.
124
124
  Tildes,
@@ -147,8 +147,10 @@ pub struct TextAnnotation {
147
147
  /// Uses internally tagged representation (`"annotation_type": "bold"`) for JSON serialization.
148
148
  #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
149
149
  #[serde(tag = "annotation_type", rename_all = "snake_case")]
150
+ #[derive(Default)]
150
151
  pub enum AnnotationKind {
151
152
  /// Bold / strong emphasis.
153
+ #[default]
152
154
  Bold,
153
155
  /// Italic / emphasis.
154
156
  Italic,
@@ -173,3 +175,12 @@ pub enum AnnotationKind {
173
175
  title: Option<String>,
174
176
  },
175
177
  }
178
+
179
+ impl Default for NodeContent {
180
+ fn default() -> Self {
181
+ Self::Heading {
182
+ level: 1,
183
+ text: String::new(),
184
+ }
185
+ }
186
+ }
@@ -1,5 +1,7 @@
1
1
  //! The primary result type for HTML conversion and extraction.
2
2
 
3
+ use serde::{Deserialize, Serialize};
4
+
3
5
  use super::document::DocumentStructure;
4
6
  use super::tables::TableData;
5
7
  use super::warnings::ProcessingWarning;
@@ -11,14 +13,14 @@ use super::warnings::ProcessingWarning;
11
13
  ///
12
14
  /// # Example
13
15
  ///
14
- /// ```rust,ignore
16
+ /// ```text
15
17
  /// use html_to_markdown_rs::{convert, ConversionOptions};
16
18
  ///
17
19
  /// let result = convert("<h1>Hello</h1><p>World</p>", None)?;
18
20
  /// assert!(result.content.is_some());
19
21
  /// assert!(result.warnings.is_empty());
20
22
  /// ```
21
- #[derive(Debug, Clone, Default)]
23
+ #[derive(Debug, Clone, Default, Serialize, Deserialize)]
22
24
  pub struct ConversionResult {
23
25
  /// Converted text output (markdown, djot, or plain text).
24
26
  ///
@@ -42,6 +44,7 @@ pub struct ConversionResult {
42
44
  ///
43
45
  /// Populated when `extract_images` is `true` in options.
44
46
  #[cfg(feature = "inline-images")]
47
+ #[serde(skip)]
45
48
  pub images: Vec<crate::inline_images::InlineImage>,
46
49
 
47
50
  /// Non-fatal processing warnings.
@@ -3,7 +3,7 @@
3
3
  use serde::{Deserialize, Serialize};
4
4
 
5
5
  /// A structured table grid with cell-level data including spans.
6
- #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
6
+ #[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
7
7
  pub struct TableGrid {
8
8
  /// Number of rows.
9
9
  pub rows: u32,
@@ -13,7 +13,7 @@
13
13
  //!
14
14
  //! # Example
15
15
  //!
16
- //! ```ignore
16
+ //! ```text
17
17
  //! use html_to_markdown_rs::visitor::{HtmlVisitor, NodeContext, VisitResult};
18
18
  //!
19
19
  //! struct CustomVisitor;
@@ -42,7 +42,7 @@ use crate::visitor::NodeType;
42
42
  ///
43
43
  /// # Examples
44
44
  ///
45
- /// ```ignore
45
+ /// ```text
46
46
  /// let ctx = build_node_context(
47
47
  /// NodeType::Heading,
48
48
  /// "h1",
@@ -51,7 +51,7 @@ use super::content::VisitorDispatch;
51
51
  ///
52
52
  /// # Examples
53
53
  ///
54
- /// ```ignore
54
+ /// ```text
55
55
  /// let result = dispatch_visitor(
56
56
  /// &visitor,
57
57
  /// |v| v.visit_heading(&ctx, level, text, id),