html-to-markdown 2.6.6 → 2.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 957492df2cee14d93b3cd74ac48f819058c886ecf93b63fe3649afc0f9e19935
4
- data.tar.gz: 97c0dce3aa1621c33b42d82a2c1f2253b0d02b71ea9ff6cbd7f10ad9ba3a0d22
3
+ metadata.gz: 92b1acd7c60d7aa288f3a73ee5d5f67e0397719f61879edd0d8fa4d8e3b09601
4
+ data.tar.gz: 687dabe472299a6007d1cc0462acb8a5103b6a41d63c4987788c8d915bdfe8c1
5
5
  SHA512:
6
- metadata.gz: 74eb762ff081d375a44cac7265950b4648a068a5a5c9125dd8d4f8aa3ca1a3a6083c2bc37f393ae4ade99ce15958bbf12a7d908881760ac28985aafb93f6c92f
7
- data.tar.gz: 80ab5cefd151fb887ddd792875d16146228253dc32d4b31d3ab3b033b25726a85ccbee345d7bdb71ee592de7ce882ad648b45dd92f79076a2b7ed04c8bec81b9
6
+ metadata.gz: c98e25f2a37a2cedec0fa611e0460aaa6f26e7be19b3fba461f9a5a4fa6ebcc8bd76e0698489e1ba0c5e8a8a172596e67cdae9ba8dc0409c7466fb34329adb93
7
+ data.tar.gz: 8cea9bc49e6156ce2242c155959793bd778f8747aaa6f343a8eec8d14285d503b2ef659d3cc3304b955f614dbc8fe123c7e2b7a878d6ddc965ae0a4350fab443
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- html-to-markdown (2.6.6)
4
+ html-to-markdown (2.7.1)
5
5
  rb_sys (>= 0.9, < 1.0)
6
6
 
7
7
  GEM
data/README.md CHANGED
@@ -62,6 +62,23 @@ Apple M4 • Real Wikipedia documents • `HtmlToMarkdown.convert` (Ruby)
62
62
 
63
63
  > Same core, same benchmarks: the Ruby extension stays within single-digit % of the Rust CLI and mirrors the Python/Node numbers.
64
64
 
65
+ ### Benchmark Fixtures (Apple M4)
66
+
67
+ Measured via `task bench:bindings -- --language ruby` with the shared Wikipedia + hOCR suite:
68
+
69
+ | Document | Size | ops/sec (Ruby) |
70
+ | ---------------------- | ------ | -------------- |
71
+ | Lists (Timeline) | 129 KB | 1,349 |
72
+ | Tables (Countries) | 360 KB | 326 |
73
+ | Medium (Python) | 657 KB | 157 |
74
+ | Large (Rust) | 567 KB | 174 |
75
+ | Small (Intro) | 463 KB | 214 |
76
+ | hOCR German PDF | 44 KB | 2,936 |
77
+ | hOCR Invoice | 4 KB | 25,740 |
78
+ | hOCR Embedded Tables | 37 KB | 3,328 |
79
+
80
+ > These numbers line up with the Python/Node bindings because everything flows through the same Rust engine.
81
+
65
82
  ## Quick Start
66
83
 
67
84
  ```ruby
@@ -109,6 +126,18 @@ markdown = HtmlToMarkdown.convert(
109
126
  puts markdown
110
127
  ```
111
128
 
129
+ ### Reusing Options
130
+
131
+ If you’re running tight loops or benchmarks, build the options once and pass the handle back into `convert_with_options`:
132
+
133
+ ```ruby
134
+ handle = HtmlToMarkdown.options(hocr_spatial_tables: false)
135
+
136
+ 100.times do
137
+ HtmlToMarkdown.convert_with_options('<h1>Handles</h1>', handle)
138
+ end
139
+ ```
140
+
112
141
  ### HTML Preprocessing
113
142
 
114
143
  Clean up scraped HTML (navigation, forms, malformed markup) before conversion:
data/bin/benchmark.rb ADDED
@@ -0,0 +1,94 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require 'optparse'
5
+ require 'time'
6
+
7
+ $LOAD_PATH.unshift(File.expand_path('../lib', __dir__))
8
+ require 'html_to_markdown'
9
+
10
+ def json_escape(value)
11
+ value.to_s.gsub(/["\\\n\r]/) do |char|
12
+ case char
13
+ when '"', '\\'
14
+ "\\#{char}"
15
+ when "\n"
16
+ '\\n'
17
+ when "\r"
18
+ '\\r'
19
+ end
20
+ end
21
+ end
22
+
23
+ options = {
24
+ iterations: 50,
25
+ format: 'html'
26
+ }
27
+
28
+ OptionParser.new do |parser|
29
+ parser.banner = 'ruby benchmark.rb --file path/to/fixture.html [--iterations 200]'
30
+
31
+ parser.on('--file FILE', 'HTML fixture to convert repeatedly') do |file|
32
+ options[:file] = file
33
+ end
34
+
35
+ parser.on('--iterations N', Integer, 'Number of conversion iterations (default: 50)') do |n|
36
+ options[:iterations] = n.positive? ? n : 1
37
+ end
38
+
39
+ parser.on('--format FORMAT', 'Fixture format (html or hocr)') do |format|
40
+ options[:format] = format.downcase
41
+ end
42
+ end.parse!
43
+
44
+ fixture = options.fetch(:file) do
45
+ warn 'Missing --file parameter'
46
+ exit 1
47
+ end
48
+
49
+ unless File.exist?(fixture)
50
+ warn "Fixture not found: #{fixture}"
51
+ exit 1
52
+ end
53
+
54
+ unless %w[html hocr].include?(options[:format])
55
+ warn "Unsupported format: #{options[:format]}"
56
+ exit 1
57
+ end
58
+
59
+ html = File.binread(fixture)
60
+ html.force_encoding(Encoding::UTF_8)
61
+ html.freeze
62
+ iterations = options[:iterations]
63
+ options_handle = HtmlToMarkdown.options(
64
+ options[:format] == 'hocr' ? { hocr_spatial_tables: false } : nil
65
+ )
66
+
67
+ def convert_document(html, options_handle)
68
+ HtmlToMarkdown.convert_with_options(html, options_handle)
69
+ end
70
+
71
+ convert_document(html, options_handle)
72
+
73
+ start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
74
+ iterations.times { convert_document(html, options_handle) }
75
+ elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start
76
+
77
+ payload_size_bytes = html.bytesize
78
+ bytes_processed = payload_size_bytes * iterations
79
+ ops_per_sec = iterations / elapsed
80
+ mb_per_sec = (bytes_processed.to_f / (1024 * 1024)) / elapsed
81
+
82
+ payload = %({
83
+ "language":"ruby",
84
+ "fixture":"#{json_escape(File.basename(fixture))}",
85
+ "fixture_path":"#{json_escape(fixture)}",
86
+ "iterations":#{iterations},
87
+ "elapsed_seconds":#{format('%.8f', elapsed)},
88
+ "ops_per_sec":#{format('%.4f', ops_per_sec)},
89
+ "mb_per_sec":#{format('%.4f', mb_per_sec)},
90
+ "bytes_processed":#{bytes_processed},
91
+ "payload_size_bytes":#{payload_size_bytes}
92
+ })
93
+
94
+ puts payload.strip
@@ -3,6 +3,7 @@
3
3
  require 'mkmf'
4
4
  require 'rb_sys/mkmf'
5
5
  require 'rbconfig'
6
+ require 'pathname'
6
7
 
7
8
  if RbConfig::CONFIG['host_os'] =~ /mswin|mingw/
8
9
  devkit = ENV.fetch('RI_DEVKIT', nil)
@@ -24,5 +25,14 @@ default_profile = ENV.fetch('CARGO_PROFILE', 'release')
24
25
 
25
26
  create_rust_makefile('html_to_markdown_rb') do |config|
26
27
  config.profile = default_profile.to_sym
27
- config.ext_dir = File.expand_path('native', __dir__)
28
+
29
+ native_dir = File.expand_path('native', __dir__)
30
+ relative_native =
31
+ begin
32
+ Pathname.new(native_dir).relative_path_from(Pathname.new(__dir__)).to_s
33
+ rescue ArgumentError
34
+ native_dir
35
+ end
36
+
37
+ config.ext_dir = relative_native
28
38
  end
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "html-to-markdown-rb"
3
- version = "2.6.6"
3
+ version = "2.7.1"
4
4
  edition = "2024"
5
5
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
6
6
  license = "MIT"
@@ -21,7 +21,7 @@ crate-type = ["cdylib", "rlib"]
21
21
  default = []
22
22
 
23
23
  [dependencies]
24
- html-to-markdown-rs = { version = "2.6.5", features = ["inline-images"] }
24
+ html-to-markdown-rs = { version = "2.7.1", features = ["inline-images"] }
25
25
  magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = ["rb-sys"] }
26
26
 
27
27
  [dev-dependencies]
@@ -5,8 +5,13 @@ use html_to_markdown_rs::{
5
5
  convert_with_inline_images as convert_with_inline_images_inner, error::ConversionError,
6
6
  };
7
7
  use magnus::prelude::*;
8
+ use magnus::r_hash::ForEach;
8
9
  use magnus::{Error, RArray, RHash, Ruby, Symbol, TryConvert, Value, function, scan_args::scan_args};
9
10
 
11
+ #[derive(Clone)]
12
+ #[magnus::wrap(class = "HtmlToMarkdown::Options", free_immediately)]
13
+ struct OptionsHandle(ConversionOptions);
14
+
10
15
  const DEFAULT_INLINE_IMAGE_LIMIT: u64 = 5 * 1024 * 1024;
11
16
 
12
17
  fn conversion_error(err: ConversionError) -> Error {
@@ -34,11 +39,6 @@ fn symbol_to_string(value: Value) -> Result<String, Error> {
34
39
  }
35
40
  }
36
41
 
37
- fn get_kw(ruby: &Ruby, hash: RHash, name: &str) -> Option<Value> {
38
- let sym = ruby.intern(name);
39
- hash.get(sym).or_else(|| hash.get(name))
40
- }
41
-
42
42
  fn parse_heading_style(value: Value) -> Result<HeadingStyle, Error> {
43
43
  match symbol_to_string(value)?.as_str() {
44
44
  "underlined" => Ok(HeadingStyle::Underlined),
@@ -106,26 +106,30 @@ fn parse_vec_of_strings(value: Value) -> Result<Vec<String>, Error> {
106
106
  array.to_vec::<String>()
107
107
  }
108
108
 
109
- fn parse_preprocessing_options(ruby: &Ruby, value: Value) -> Result<PreprocessingOptions, Error> {
109
+ fn parse_preprocessing_options(_ruby: &Ruby, value: Value) -> Result<PreprocessingOptions, Error> {
110
110
  let hash = RHash::from_value(value).ok_or_else(|| arg_error("expected preprocessing to be a Hash"))?;
111
111
 
112
112
  let mut opts = PreprocessingOptions::default();
113
113
 
114
- if let Some(enabled) = get_kw(ruby, hash, "enabled") {
115
- opts.enabled = bool::try_convert(enabled)?;
116
- }
117
-
118
- if let Some(preset) = get_kw(ruby, hash, "preset") {
119
- opts.preset = parse_preset(preset)?;
120
- }
121
-
122
- if let Some(remove_navigation) = get_kw(ruby, hash, "remove_navigation") {
123
- opts.remove_navigation = bool::try_convert(remove_navigation)?;
124
- }
125
-
126
- if let Some(remove_forms) = get_kw(ruby, hash, "remove_forms") {
127
- opts.remove_forms = bool::try_convert(remove_forms)?;
128
- }
114
+ hash.foreach(|key: Value, val: Value| {
115
+ let key_name = symbol_to_string(key)?;
116
+ match key_name.as_str() {
117
+ "enabled" => {
118
+ opts.enabled = bool::try_convert(val)?;
119
+ }
120
+ "preset" => {
121
+ opts.preset = parse_preset(val)?;
122
+ }
123
+ "remove_navigation" => {
124
+ opts.remove_navigation = bool::try_convert(val)?;
125
+ }
126
+ "remove_forms" => {
127
+ opts.remove_forms = bool::try_convert(val)?;
128
+ }
129
+ _ => {}
130
+ }
131
+ Ok(ForEach::Continue)
132
+ })?;
129
133
 
130
134
  Ok(opts)
131
135
  }
@@ -143,142 +147,119 @@ fn build_conversion_options(ruby: &Ruby, options: Option<Value>) -> Result<Conve
143
147
 
144
148
  let hash = RHash::from_value(options).ok_or_else(|| arg_error("options must be provided as a Hash"))?;
145
149
 
146
- if let Some(heading_style) = get_kw(ruby, hash, "heading_style") {
147
- opts.heading_style = parse_heading_style(heading_style)?;
148
- }
149
-
150
- if let Some(list_indent_type) = get_kw(ruby, hash, "list_indent_type") {
151
- opts.list_indent_type = parse_list_indent_type(list_indent_type)?;
152
- }
153
-
154
- if let Some(list_indent_width) = get_kw(ruby, hash, "list_indent_width") {
155
- opts.list_indent_width = usize::try_convert(list_indent_width)?;
156
- }
157
-
158
- if let Some(bullets) = get_kw(ruby, hash, "bullets") {
159
- opts.bullets = String::try_convert(bullets)?;
160
- }
161
-
162
- if let Some(strong_em_symbol) = get_kw(ruby, hash, "strong_em_symbol") {
163
- let value = String::try_convert(strong_em_symbol)?;
164
- let mut chars = value.chars();
165
- let ch = chars
166
- .next()
167
- .ok_or_else(|| arg_error("strong_em_symbol must not be empty"))?;
168
- if chars.next().is_some() {
169
- return Err(arg_error("strong_em_symbol must be a single character"));
150
+ hash.foreach(|key: Value, val: Value| {
151
+ let key_name = symbol_to_string(key)?;
152
+ match key_name.as_str() {
153
+ "heading_style" => {
154
+ opts.heading_style = parse_heading_style(val)?;
155
+ }
156
+ "list_indent_type" => {
157
+ opts.list_indent_type = parse_list_indent_type(val)?;
158
+ }
159
+ "list_indent_width" => {
160
+ opts.list_indent_width = usize::try_convert(val)?;
161
+ }
162
+ "bullets" => {
163
+ opts.bullets = String::try_convert(val)?;
164
+ }
165
+ "strong_em_symbol" => {
166
+ let value = String::try_convert(val)?;
167
+ let mut chars = value.chars();
168
+ let ch = chars
169
+ .next()
170
+ .ok_or_else(|| arg_error("strong_em_symbol must not be empty"))?;
171
+ if chars.next().is_some() {
172
+ return Err(arg_error("strong_em_symbol must be a single character"));
173
+ }
174
+ opts.strong_em_symbol = ch;
175
+ }
176
+ "escape_asterisks" => {
177
+ opts.escape_asterisks = bool::try_convert(val)?;
178
+ }
179
+ "escape_underscores" => {
180
+ opts.escape_underscores = bool::try_convert(val)?;
181
+ }
182
+ "escape_misc" => {
183
+ opts.escape_misc = bool::try_convert(val)?;
184
+ }
185
+ "escape_ascii" => {
186
+ opts.escape_ascii = bool::try_convert(val)?;
187
+ }
188
+ "code_language" => {
189
+ opts.code_language = String::try_convert(val)?;
190
+ }
191
+ "autolinks" => {
192
+ opts.autolinks = bool::try_convert(val)?;
193
+ }
194
+ "default_title" => {
195
+ opts.default_title = bool::try_convert(val)?;
196
+ }
197
+ "br_in_tables" => {
198
+ opts.br_in_tables = bool::try_convert(val)?;
199
+ }
200
+ "hocr_spatial_tables" => {
201
+ opts.hocr_spatial_tables = bool::try_convert(val)?;
202
+ }
203
+ "highlight_style" => {
204
+ opts.highlight_style = parse_highlight_style(val)?;
205
+ }
206
+ "extract_metadata" => {
207
+ opts.extract_metadata = bool::try_convert(val)?;
208
+ }
209
+ "whitespace_mode" => {
210
+ opts.whitespace_mode = parse_whitespace_mode(val)?;
211
+ }
212
+ "strip_newlines" => {
213
+ opts.strip_newlines = bool::try_convert(val)?;
214
+ }
215
+ "wrap" => {
216
+ opts.wrap = bool::try_convert(val)?;
217
+ }
218
+ "wrap_width" => {
219
+ opts.wrap_width = usize::try_convert(val)?;
220
+ }
221
+ "convert_as_inline" => {
222
+ opts.convert_as_inline = bool::try_convert(val)?;
223
+ }
224
+ "sub_symbol" => {
225
+ opts.sub_symbol = String::try_convert(val)?;
226
+ }
227
+ "sup_symbol" => {
228
+ opts.sup_symbol = String::try_convert(val)?;
229
+ }
230
+ "newline_style" => {
231
+ opts.newline_style = parse_newline_style(val)?;
232
+ }
233
+ "code_block_style" => {
234
+ opts.code_block_style = parse_code_block_style(val)?;
235
+ }
236
+ "keep_inline_images_in" => {
237
+ opts.keep_inline_images_in = parse_vec_of_strings(val)?;
238
+ }
239
+ "preprocessing" => {
240
+ opts.preprocessing = parse_preprocessing_options(ruby, val)?;
241
+ }
242
+ "encoding" => {
243
+ opts.encoding = String::try_convert(val)?;
244
+ }
245
+ "debug" => {
246
+ opts.debug = bool::try_convert(val)?;
247
+ }
248
+ "strip_tags" => {
249
+ opts.strip_tags = parse_vec_of_strings(val)?;
250
+ }
251
+ "preserve_tags" => {
252
+ opts.preserve_tags = parse_vec_of_strings(val)?;
253
+ }
254
+ _ => {}
170
255
  }
171
- opts.strong_em_symbol = ch;
172
- }
173
-
174
- if let Some(escape_asterisks) = get_kw(ruby, hash, "escape_asterisks") {
175
- opts.escape_asterisks = bool::try_convert(escape_asterisks)?;
176
- }
177
-
178
- if let Some(escape_underscores) = get_kw(ruby, hash, "escape_underscores") {
179
- opts.escape_underscores = bool::try_convert(escape_underscores)?;
180
- }
181
-
182
- if let Some(escape_misc) = get_kw(ruby, hash, "escape_misc") {
183
- opts.escape_misc = bool::try_convert(escape_misc)?;
184
- }
185
-
186
- if let Some(escape_ascii) = get_kw(ruby, hash, "escape_ascii") {
187
- opts.escape_ascii = bool::try_convert(escape_ascii)?;
188
- }
189
-
190
- if let Some(code_language) = get_kw(ruby, hash, "code_language") {
191
- opts.code_language = String::try_convert(code_language)?;
192
- }
193
-
194
- if let Some(autolinks) = get_kw(ruby, hash, "autolinks") {
195
- opts.autolinks = bool::try_convert(autolinks)?;
196
- }
197
-
198
- if let Some(default_title) = get_kw(ruby, hash, "default_title") {
199
- opts.default_title = bool::try_convert(default_title)?;
200
- }
201
-
202
- if let Some(br_in_tables) = get_kw(ruby, hash, "br_in_tables") {
203
- opts.br_in_tables = bool::try_convert(br_in_tables)?;
204
- }
205
-
206
- if let Some(hocr_spatial_tables) = get_kw(ruby, hash, "hocr_spatial_tables") {
207
- opts.hocr_spatial_tables = bool::try_convert(hocr_spatial_tables)?;
208
- }
209
-
210
- if let Some(highlight_style) = get_kw(ruby, hash, "highlight_style") {
211
- opts.highlight_style = parse_highlight_style(highlight_style)?;
212
- }
213
-
214
- if let Some(extract_metadata) = get_kw(ruby, hash, "extract_metadata") {
215
- opts.extract_metadata = bool::try_convert(extract_metadata)?;
216
- }
217
-
218
- if let Some(whitespace_mode) = get_kw(ruby, hash, "whitespace_mode") {
219
- opts.whitespace_mode = parse_whitespace_mode(whitespace_mode)?;
220
- }
221
-
222
- if let Some(strip_newlines) = get_kw(ruby, hash, "strip_newlines") {
223
- opts.strip_newlines = bool::try_convert(strip_newlines)?;
224
- }
225
-
226
- if let Some(wrap) = get_kw(ruby, hash, "wrap") {
227
- opts.wrap = bool::try_convert(wrap)?;
228
- }
229
-
230
- if let Some(wrap_width) = get_kw(ruby, hash, "wrap_width") {
231
- opts.wrap_width = usize::try_convert(wrap_width)?;
232
- }
233
-
234
- if let Some(convert_as_inline) = get_kw(ruby, hash, "convert_as_inline") {
235
- opts.convert_as_inline = bool::try_convert(convert_as_inline)?;
236
- }
237
-
238
- if let Some(sub_symbol) = get_kw(ruby, hash, "sub_symbol") {
239
- opts.sub_symbol = String::try_convert(sub_symbol)?;
240
- }
241
-
242
- if let Some(sup_symbol) = get_kw(ruby, hash, "sup_symbol") {
243
- opts.sup_symbol = String::try_convert(sup_symbol)?;
244
- }
245
-
246
- if let Some(newline_style) = get_kw(ruby, hash, "newline_style") {
247
- opts.newline_style = parse_newline_style(newline_style)?;
248
- }
249
-
250
- if let Some(code_block_style) = get_kw(ruby, hash, "code_block_style") {
251
- opts.code_block_style = parse_code_block_style(code_block_style)?;
252
- }
253
-
254
- if let Some(keep_inline_images_in) = get_kw(ruby, hash, "keep_inline_images_in") {
255
- opts.keep_inline_images_in = parse_vec_of_strings(keep_inline_images_in)?;
256
- }
257
-
258
- if let Some(preprocessing) = get_kw(ruby, hash, "preprocessing") {
259
- opts.preprocessing = parse_preprocessing_options(ruby, preprocessing)?;
260
- }
261
-
262
- if let Some(encoding) = get_kw(ruby, hash, "encoding") {
263
- opts.encoding = String::try_convert(encoding)?;
264
- }
265
-
266
- if let Some(debug) = get_kw(ruby, hash, "debug") {
267
- opts.debug = bool::try_convert(debug)?;
268
- }
269
-
270
- if let Some(strip_tags) = get_kw(ruby, hash, "strip_tags") {
271
- opts.strip_tags = parse_vec_of_strings(strip_tags)?;
272
- }
273
-
274
- if let Some(preserve_tags) = get_kw(ruby, hash, "preserve_tags") {
275
- opts.preserve_tags = parse_vec_of_strings(preserve_tags)?;
276
- }
256
+ Ok(ForEach::Continue)
257
+ })?;
277
258
 
278
259
  Ok(opts)
279
260
  }
280
261
 
281
- fn build_inline_image_config(ruby: &Ruby, config: Option<Value>) -> Result<InlineImageConfig, Error> {
262
+ fn build_inline_image_config(_ruby: &Ruby, config: Option<Value>) -> Result<InlineImageConfig, Error> {
282
263
  let mut cfg = InlineImageConfig::new(DEFAULT_INLINE_IMAGE_LIMIT);
283
264
 
284
265
  let Some(config) = config else {
@@ -291,25 +272,29 @@ fn build_inline_image_config(ruby: &Ruby, config: Option<Value>) -> Result<Inlin
291
272
 
292
273
  let hash = RHash::from_value(config).ok_or_else(|| arg_error("inline image config must be provided as a Hash"))?;
293
274
 
294
- if let Some(limit) = get_kw(ruby, hash, "max_decoded_size_bytes") {
295
- cfg.max_decoded_size_bytes = u64::try_convert(limit)?;
296
- }
297
-
298
- if let Some(prefix) = get_kw(ruby, hash, "filename_prefix") {
299
- cfg.filename_prefix = if prefix.is_nil() {
300
- None
301
- } else {
302
- Some(String::try_convert(prefix)?)
303
- };
304
- }
305
-
306
- if let Some(capture_svg) = get_kw(ruby, hash, "capture_svg") {
307
- cfg.capture_svg = bool::try_convert(capture_svg)?;
308
- }
309
-
310
- if let Some(infer_dimensions) = get_kw(ruby, hash, "infer_dimensions") {
311
- cfg.infer_dimensions = bool::try_convert(infer_dimensions)?;
312
- }
275
+ hash.foreach(|key: Value, val: Value| {
276
+ let key_name = symbol_to_string(key)?;
277
+ match key_name.as_str() {
278
+ "max_decoded_size_bytes" => {
279
+ cfg.max_decoded_size_bytes = u64::try_convert(val)?;
280
+ }
281
+ "filename_prefix" => {
282
+ cfg.filename_prefix = if val.is_nil() {
283
+ None
284
+ } else {
285
+ Some(String::try_convert(val)?)
286
+ };
287
+ }
288
+ "capture_svg" => {
289
+ cfg.capture_svg = bool::try_convert(val)?;
290
+ }
291
+ "infer_dimensions" => {
292
+ cfg.infer_dimensions = bool::try_convert(val)?;
293
+ }
294
+ _ => {}
295
+ }
296
+ Ok(ForEach::Continue)
297
+ })?;
313
298
 
314
299
  Ok(cfg)
315
300
  }
@@ -408,6 +393,19 @@ fn convert_fn(ruby: &Ruby, args: &[Value]) -> Result<String, Error> {
408
393
  convert_inner(&html, Some(options)).map_err(conversion_error)
409
394
  }
410
395
 
396
+ fn options_handle_fn(ruby: &Ruby, args: &[Value]) -> Result<OptionsHandle, Error> {
397
+ let parsed = scan_args::<(), (Option<Value>,), (), (), (), ()>(args)?;
398
+ let options = build_conversion_options(ruby, parsed.optional.0)?;
399
+ Ok(OptionsHandle(options))
400
+ }
401
+
402
+ fn convert_with_options_handle_fn(_ruby: &Ruby, args: &[Value]) -> Result<String, Error> {
403
+ let parsed = scan_args::<(String, &OptionsHandle), (), (), (), (), ()>(args)?;
404
+ let html = parsed.required.0;
405
+ let handle = parsed.required.1;
406
+ convert_inner(&html, Some(handle.0.clone())).map_err(conversion_error)
407
+ }
408
+
411
409
  fn convert_with_inline_images_fn(ruby: &Ruby, args: &[Value]) -> Result<Value, Error> {
412
410
  let parsed = scan_args::<(String,), (Option<Value>, Option<Value>), (), (), (), ()>(args)?;
413
411
  let html = parsed.required.0;
@@ -423,6 +421,8 @@ fn convert_with_inline_images_fn(ruby: &Ruby, args: &[Value]) -> Result<Value, E
423
421
  fn init(ruby: &Ruby) -> Result<(), Error> {
424
422
  let module = ruby.define_module("HtmlToMarkdown")?;
425
423
  module.define_singleton_method("convert", function!(convert_fn, -1))?;
424
+ module.define_singleton_method("options", function!(options_handle_fn, -1))?;
425
+ module.define_singleton_method("convert_with_options", function!(convert_with_options_handle_fn, -1))?;
426
426
  module.define_singleton_method(
427
427
  "convert_with_inline_images",
428
428
  function!(convert_with_inline_images_fn, -1),
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HtmlToMarkdown
4
- VERSION = '2.6.6'
4
+ VERSION = '2.7.1'
5
5
  end
@@ -7,9 +7,13 @@ module HtmlToMarkdown
7
7
  autoload :CLI, 'html_to_markdown/cli'
8
8
  autoload :CLIProxy, 'html_to_markdown/cli_proxy'
9
9
 
10
+ class Options; end # rubocop:disable Lint/EmptyClass
11
+
10
12
  class << self
11
13
  alias native_convert convert
12
14
  alias native_convert_with_inline_images convert_with_inline_images
15
+ alias native_options options
16
+ alias native_convert_with_options convert_with_options
13
17
  end
14
18
 
15
19
  module_function
@@ -18,7 +22,15 @@ module HtmlToMarkdown
18
22
  native_convert(html.to_s, options)
19
23
  end
20
24
 
25
+ def convert_with_options(html, options_handle)
26
+ native_convert_with_options(html.to_s, options_handle)
27
+ end
28
+
21
29
  def convert_with_inline_images(html, options = nil, image_config = nil)
22
30
  native_convert_with_inline_images(html.to_s, options, image_config)
23
31
  end
32
+
33
+ def options(options_hash = nil)
34
+ native_options(options_hash)
35
+ end
24
36
  end
data/spec/convert_spec.rb CHANGED
@@ -26,4 +26,13 @@ RSpec.describe HtmlToMarkdown do
26
26
  expect(extraction[:inline_images].first[:description]).to eq('fake')
27
27
  end
28
28
  end
29
+
30
+ describe '.options' do
31
+ it 'returns a reusable options handle' do
32
+ handle = described_class.options(heading_style: :atx_closed)
33
+ expect(handle).to be_a(HtmlToMarkdown::Options)
34
+ result = described_class.convert_with_options('<h1>Hello</h1>', handle)
35
+ expect(result).to include('# Hello #')
36
+ end
37
+ end
29
38
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html-to-markdown
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.6.6
4
+ version: 2.7.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-11-10 00:00:00.000000000 Z
11
+ date: 2025-11-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -47,6 +47,7 @@ files:
47
47
  - Gemfile.lock
48
48
  - README.md
49
49
  - Rakefile
50
+ - bin/benchmark.rb
50
51
  - exe/html-to-markdown
51
52
  - ext/html-to-markdown-rb/extconf.rb
52
53
  - ext/html-to-markdown-rb/native/Cargo.toml