html-to-markdown 2.9.2 → 2.11.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,432 +1,438 @@
1
- use html_to_markdown_rs::{
2
- CodeBlockStyle, ConversionOptions, HeadingStyle, HighlightStyle, HtmlExtraction, InlineImage, InlineImageConfig,
3
- InlineImageFormat, InlineImageSource, InlineImageWarning, ListIndentType, NewlineStyle, PreprocessingOptions,
4
- PreprocessingPreset, WhitespaceMode, convert as convert_inner,
5
- convert_with_inline_images as convert_with_inline_images_inner, error::ConversionError,
6
- };
7
- use magnus::prelude::*;
8
- use magnus::r_hash::ForEach;
9
- use magnus::{Error, RArray, RHash, Ruby, Symbol, TryConvert, Value, function, scan_args::scan_args};
10
-
11
- #[derive(Clone)]
12
- #[magnus::wrap(class = "HtmlToMarkdown::Options", free_immediately)]
13
- struct OptionsHandle(ConversionOptions);
14
-
15
- const DEFAULT_INLINE_IMAGE_LIMIT: u64 = 5 * 1024 * 1024;
16
-
17
- fn conversion_error(err: ConversionError) -> Error {
18
- match err {
19
- ConversionError::ConfigError(msg) => arg_error(msg),
20
- other => runtime_error(other.to_string()),
21
- }
22
- }
23
-
24
- fn arg_error(message: impl Into<String>) -> Error {
25
- let ruby = Ruby::get().expect("Ruby not initialised");
26
- Error::new(ruby.exception_arg_error(), message.into())
27
- }
28
-
29
- fn runtime_error(message: impl Into<String>) -> Error {
30
- let ruby = Ruby::get().expect("Ruby not initialised");
31
- Error::new(ruby.exception_runtime_error(), message.into())
32
- }
33
-
34
- fn symbol_to_string(value: Value) -> Result<String, Error> {
35
- if let Some(symbol) = Symbol::from_value(value) {
36
- Ok(symbol.name()?.to_string())
37
- } else {
38
- String::try_convert(value)
39
- }
40
- }
41
-
42
- fn parse_heading_style(value: Value) -> Result<HeadingStyle, Error> {
43
- match symbol_to_string(value)?.as_str() {
44
- "underlined" => Ok(HeadingStyle::Underlined),
45
- "atx" => Ok(HeadingStyle::Atx),
46
- "atx_closed" => Ok(HeadingStyle::AtxClosed),
47
- other => Err(arg_error(format!("invalid heading_style: {other}"))),
48
- }
49
- }
50
-
51
- fn parse_list_indent_type(value: Value) -> Result<ListIndentType, Error> {
52
- match symbol_to_string(value)?.as_str() {
53
- "spaces" => Ok(ListIndentType::Spaces),
54
- "tabs" => Ok(ListIndentType::Tabs),
55
- other => Err(arg_error(format!("invalid list_indent_type: {other}"))),
56
- }
57
- }
58
-
59
- fn parse_highlight_style(value: Value) -> Result<HighlightStyle, Error> {
60
- match symbol_to_string(value)?.as_str() {
61
- "double_equal" => Ok(HighlightStyle::DoubleEqual),
62
- "html" => Ok(HighlightStyle::Html),
63
- "bold" => Ok(HighlightStyle::Bold),
64
- "none" => Ok(HighlightStyle::None),
65
- other => Err(arg_error(format!("invalid highlight_style: {other}"))),
66
- }
67
- }
68
-
69
- fn parse_whitespace_mode(value: Value) -> Result<WhitespaceMode, Error> {
70
- match symbol_to_string(value)?.as_str() {
71
- "normalized" => Ok(WhitespaceMode::Normalized),
72
- "strict" => Ok(WhitespaceMode::Strict),
73
- other => Err(arg_error(format!("invalid whitespace_mode: {other}"))),
74
- }
75
- }
76
-
77
- fn parse_newline_style(value: Value) -> Result<NewlineStyle, Error> {
78
- match symbol_to_string(value)?.as_str() {
79
- "spaces" => Ok(NewlineStyle::Spaces),
80
- "backslash" => Ok(NewlineStyle::Backslash),
81
- other => Err(arg_error(format!("invalid newline_style: {other}"))),
82
- }
83
- }
84
-
85
- fn parse_code_block_style(value: Value) -> Result<CodeBlockStyle, Error> {
86
- match symbol_to_string(value)?.as_str() {
87
- "indented" => Ok(CodeBlockStyle::Indented),
88
- "backticks" => Ok(CodeBlockStyle::Backticks),
89
- "tildes" => Ok(CodeBlockStyle::Tildes),
90
- other => Err(arg_error(format!("invalid code_block_style: {other}"))),
91
- }
92
- }
93
-
94
- fn parse_preset(value: Value) -> Result<PreprocessingPreset, Error> {
95
- match symbol_to_string(value)?.as_str() {
96
- "minimal" => Ok(PreprocessingPreset::Minimal),
97
- "standard" => Ok(PreprocessingPreset::Standard),
98
- "aggressive" => Ok(PreprocessingPreset::Aggressive),
99
- other => Err(arg_error(format!("invalid preprocessing preset: {other}"))),
100
- }
101
- }
102
-
103
- fn parse_vec_of_strings(value: Value) -> Result<Vec<String>, Error> {
104
- let array = RArray::from_value(value).ok_or_else(|| arg_error("expected an Array of strings"))?;
105
-
106
- array.to_vec::<String>()
107
- }
108
-
109
- fn parse_preprocessing_options(_ruby: &Ruby, value: Value) -> Result<PreprocessingOptions, Error> {
110
- let hash = RHash::from_value(value).ok_or_else(|| arg_error("expected preprocessing to be a Hash"))?;
111
-
112
- let mut opts = PreprocessingOptions::default();
113
-
114
- hash.foreach(|key: Value, val: Value| {
115
- let key_name = symbol_to_string(key)?;
116
- match key_name.as_str() {
117
- "enabled" => {
118
- opts.enabled = bool::try_convert(val)?;
119
- }
120
- "preset" => {
121
- opts.preset = parse_preset(val)?;
122
- }
123
- "remove_navigation" => {
124
- opts.remove_navigation = bool::try_convert(val)?;
125
- }
126
- "remove_forms" => {
127
- opts.remove_forms = bool::try_convert(val)?;
128
- }
129
- _ => {}
130
- }
131
- Ok(ForEach::Continue)
132
- })?;
133
-
134
- Ok(opts)
135
- }
136
-
137
- fn build_conversion_options(ruby: &Ruby, options: Option<Value>) -> Result<ConversionOptions, Error> {
138
- let mut opts = ConversionOptions::default();
139
-
140
- let Some(options) = options else {
141
- return Ok(opts);
142
- };
143
-
144
- if options.is_nil() {
145
- return Ok(opts);
146
- }
147
-
148
- let hash = RHash::from_value(options).ok_or_else(|| arg_error("options must be provided as a Hash"))?;
149
-
150
- hash.foreach(|key: Value, val: Value| {
151
- let key_name = symbol_to_string(key)?;
152
- match key_name.as_str() {
153
- "heading_style" => {
154
- opts.heading_style = parse_heading_style(val)?;
155
- }
156
- "list_indent_type" => {
157
- opts.list_indent_type = parse_list_indent_type(val)?;
158
- }
159
- "list_indent_width" => {
160
- opts.list_indent_width = usize::try_convert(val)?;
161
- }
162
- "bullets" => {
163
- opts.bullets = String::try_convert(val)?;
164
- }
165
- "strong_em_symbol" => {
166
- let value = String::try_convert(val)?;
167
- let mut chars = value.chars();
168
- let ch = chars
169
- .next()
170
- .ok_or_else(|| arg_error("strong_em_symbol must not be empty"))?;
171
- if chars.next().is_some() {
172
- return Err(arg_error("strong_em_symbol must be a single character"));
173
- }
174
- opts.strong_em_symbol = ch;
175
- }
176
- "escape_asterisks" => {
177
- opts.escape_asterisks = bool::try_convert(val)?;
178
- }
179
- "escape_underscores" => {
180
- opts.escape_underscores = bool::try_convert(val)?;
181
- }
182
- "escape_misc" => {
183
- opts.escape_misc = bool::try_convert(val)?;
184
- }
185
- "escape_ascii" => {
186
- opts.escape_ascii = bool::try_convert(val)?;
187
- }
188
- "code_language" => {
189
- opts.code_language = String::try_convert(val)?;
190
- }
191
- "autolinks" => {
192
- opts.autolinks = bool::try_convert(val)?;
193
- }
194
- "default_title" => {
195
- opts.default_title = bool::try_convert(val)?;
196
- }
197
- "br_in_tables" => {
198
- opts.br_in_tables = bool::try_convert(val)?;
199
- }
200
- "hocr_spatial_tables" => {
201
- opts.hocr_spatial_tables = bool::try_convert(val)?;
202
- }
203
- "highlight_style" => {
204
- opts.highlight_style = parse_highlight_style(val)?;
205
- }
206
- "extract_metadata" => {
207
- opts.extract_metadata = bool::try_convert(val)?;
208
- }
209
- "whitespace_mode" => {
210
- opts.whitespace_mode = parse_whitespace_mode(val)?;
211
- }
212
- "strip_newlines" => {
213
- opts.strip_newlines = bool::try_convert(val)?;
214
- }
215
- "wrap" => {
216
- opts.wrap = bool::try_convert(val)?;
217
- }
218
- "wrap_width" => {
219
- opts.wrap_width = usize::try_convert(val)?;
220
- }
221
- "convert_as_inline" => {
222
- opts.convert_as_inline = bool::try_convert(val)?;
223
- }
224
- "sub_symbol" => {
225
- opts.sub_symbol = String::try_convert(val)?;
226
- }
227
- "sup_symbol" => {
228
- opts.sup_symbol = String::try_convert(val)?;
229
- }
230
- "newline_style" => {
231
- opts.newline_style = parse_newline_style(val)?;
232
- }
233
- "code_block_style" => {
234
- opts.code_block_style = parse_code_block_style(val)?;
235
- }
236
- "keep_inline_images_in" => {
237
- opts.keep_inline_images_in = parse_vec_of_strings(val)?;
238
- }
239
- "preprocessing" => {
240
- opts.preprocessing = parse_preprocessing_options(ruby, val)?;
241
- }
242
- "encoding" => {
243
- opts.encoding = String::try_convert(val)?;
244
- }
245
- "debug" => {
246
- opts.debug = bool::try_convert(val)?;
247
- }
248
- "strip_tags" => {
249
- opts.strip_tags = parse_vec_of_strings(val)?;
250
- }
251
- "preserve_tags" => {
252
- opts.preserve_tags = parse_vec_of_strings(val)?;
253
- }
254
- _ => {}
255
- }
256
- Ok(ForEach::Continue)
257
- })?;
258
-
259
- Ok(opts)
260
- }
261
-
262
- fn build_inline_image_config(_ruby: &Ruby, config: Option<Value>) -> Result<InlineImageConfig, Error> {
263
- let mut cfg = InlineImageConfig::new(DEFAULT_INLINE_IMAGE_LIMIT);
264
-
265
- let Some(config) = config else {
266
- return Ok(cfg);
267
- };
268
-
269
- if config.is_nil() {
270
- return Ok(cfg);
271
- }
272
-
273
- let hash = RHash::from_value(config).ok_or_else(|| arg_error("inline image config must be provided as a Hash"))?;
274
-
275
- hash.foreach(|key: Value, val: Value| {
276
- let key_name = symbol_to_string(key)?;
277
- match key_name.as_str() {
278
- "max_decoded_size_bytes" => {
279
- cfg.max_decoded_size_bytes = u64::try_convert(val)?;
280
- }
281
- "filename_prefix" => {
282
- cfg.filename_prefix = if val.is_nil() {
283
- None
284
- } else {
285
- Some(String::try_convert(val)?)
286
- };
287
- }
288
- "capture_svg" => {
289
- cfg.capture_svg = bool::try_convert(val)?;
290
- }
291
- "infer_dimensions" => {
292
- cfg.infer_dimensions = bool::try_convert(val)?;
293
- }
294
- _ => {}
295
- }
296
- Ok(ForEach::Continue)
297
- })?;
298
-
299
- Ok(cfg)
300
- }
301
-
302
- fn inline_image_to_value(ruby: &Ruby, image: InlineImage) -> Result<Value, Error> {
303
- let InlineImage {
304
- data,
305
- format,
306
- filename,
307
- description,
308
- dimensions,
309
- source,
310
- attributes,
311
- } = image;
312
-
313
- let hash = ruby.hash_new();
314
- let data_value = ruby.str_from_slice(&data);
315
- hash.aset(ruby.intern("data"), data_value)?;
316
-
317
- let format_value = match format {
318
- InlineImageFormat::Png => "png".to_string(),
319
- InlineImageFormat::Jpeg => "jpeg".to_string(),
320
- InlineImageFormat::Gif => "gif".to_string(),
321
- InlineImageFormat::Bmp => "bmp".to_string(),
322
- InlineImageFormat::Webp => "webp".to_string(),
323
- InlineImageFormat::Svg => "svg".to_string(),
324
- InlineImageFormat::Other(other) => other,
325
- };
326
- hash.aset(ruby.intern("format"), format_value)?;
327
-
328
- match filename {
329
- Some(name) => hash.aset(ruby.intern("filename"), name)?,
330
- None => hash.aset(ruby.intern("filename"), ruby.qnil())?,
331
- }
332
-
333
- match description {
334
- Some(desc) => hash.aset(ruby.intern("description"), desc)?,
335
- None => hash.aset(ruby.intern("description"), ruby.qnil())?,
336
- }
337
-
338
- if let Some((width, height)) = dimensions {
339
- let dims = ruby.ary_new();
340
- dims.push(width as i64)?;
341
- dims.push(height as i64)?;
342
- hash.aset(ruby.intern("dimensions"), dims)?;
343
- } else {
344
- hash.aset(ruby.intern("dimensions"), ruby.qnil())?;
345
- }
346
-
347
- let source_value = match source {
348
- InlineImageSource::ImgDataUri => "img_data_uri",
349
- InlineImageSource::SvgElement => "svg_element",
350
- };
351
- hash.aset(ruby.intern("source"), source_value)?;
352
-
353
- let attrs = ruby.hash_new();
354
- for (key, value) in attributes {
355
- attrs.aset(key, value)?;
356
- }
357
- hash.aset(ruby.intern("attributes"), attrs)?;
358
-
359
- Ok(hash.as_value())
360
- }
361
-
362
- fn warning_to_value(ruby: &Ruby, warning: InlineImageWarning) -> Result<Value, Error> {
363
- let hash = ruby.hash_new();
364
- hash.aset(ruby.intern("index"), warning.index as i64)?;
365
- hash.aset(ruby.intern("message"), warning.message)?;
366
- Ok(hash.as_value())
367
- }
368
-
369
- fn extraction_to_value(ruby: &Ruby, extraction: HtmlExtraction) -> Result<Value, Error> {
370
- let hash = ruby.hash_new();
371
- hash.aset(ruby.intern("markdown"), extraction.markdown)?;
372
-
373
- let inline_images = ruby.ary_new();
374
- for image in extraction.inline_images {
375
- inline_images.push(inline_image_to_value(ruby, image)?)?;
376
- }
377
- hash.aset(ruby.intern("inline_images"), inline_images)?;
378
-
379
- let warnings = ruby.ary_new();
380
- for warning in extraction.warnings {
381
- warnings.push(warning_to_value(ruby, warning)?)?;
382
- }
383
- hash.aset(ruby.intern("warnings"), warnings)?;
384
-
385
- Ok(hash.as_value())
386
- }
387
-
388
- fn convert_fn(ruby: &Ruby, args: &[Value]) -> Result<String, Error> {
389
- let parsed = scan_args::<(String,), (Option<Value>,), (), (), (), ()>(args)?;
390
- let html = parsed.required.0;
391
- let options = build_conversion_options(ruby, parsed.optional.0)?;
392
-
393
- convert_inner(&html, Some(options)).map_err(conversion_error)
394
- }
395
-
396
- fn options_handle_fn(ruby: &Ruby, args: &[Value]) -> Result<OptionsHandle, Error> {
397
- let parsed = scan_args::<(), (Option<Value>,), (), (), (), ()>(args)?;
398
- let options = build_conversion_options(ruby, parsed.optional.0)?;
399
- Ok(OptionsHandle(options))
400
- }
401
-
402
- fn convert_with_options_handle_fn(_ruby: &Ruby, args: &[Value]) -> Result<String, Error> {
403
- let parsed = scan_args::<(String, &OptionsHandle), (), (), (), (), ()>(args)?;
404
- let html = parsed.required.0;
405
- let handle = parsed.required.1;
406
- convert_inner(&html, Some(handle.0.clone())).map_err(conversion_error)
407
- }
408
-
409
- fn convert_with_inline_images_fn(ruby: &Ruby, args: &[Value]) -> Result<Value, Error> {
410
- let parsed = scan_args::<(String,), (Option<Value>, Option<Value>), (), (), (), ()>(args)?;
411
- let html = parsed.required.0;
412
- let options = build_conversion_options(ruby, parsed.optional.0)?;
413
- let config = build_inline_image_config(ruby, parsed.optional.1)?;
414
-
415
- let extraction = convert_with_inline_images_inner(&html, Some(options), config).map_err(conversion_error)?;
416
-
417
- extraction_to_value(ruby, extraction)
418
- }
419
-
420
- #[magnus::init]
421
- fn init(ruby: &Ruby) -> Result<(), Error> {
422
- let module = ruby.define_module("HtmlToMarkdown")?;
423
- module.define_singleton_method("convert", function!(convert_fn, -1))?;
424
- module.define_singleton_method("options", function!(options_handle_fn, -1))?;
425
- module.define_singleton_method("convert_with_options", function!(convert_with_options_handle_fn, -1))?;
426
- module.define_singleton_method(
427
- "convert_with_inline_images",
428
- function!(convert_with_inline_images_fn, -1),
429
- )?;
430
-
431
- Ok(())
432
- }
1
+ use html_to_markdown_rs::{
2
+ CodeBlockStyle, ConversionOptions, HeadingStyle, HighlightStyle, HtmlExtraction, InlineImage, InlineImageConfig,
3
+ InlineImageFormat, InlineImageSource, InlineImageWarning, ListIndentType, NewlineStyle, PreprocessingOptions,
4
+ PreprocessingPreset, WhitespaceMode, convert as convert_inner,
5
+ convert_with_inline_images as convert_with_inline_images_inner, error::ConversionError, safety::guard_panic,
6
+ };
7
+ use magnus::prelude::*;
8
+ use magnus::r_hash::ForEach;
9
+ use magnus::{Error, RArray, RHash, Ruby, Symbol, TryConvert, Value, function, scan_args::scan_args};
10
+
11
+ #[derive(Clone)]
12
+ #[magnus::wrap(class = "HtmlToMarkdown::Options", free_immediately)]
13
+ struct OptionsHandle(ConversionOptions);
14
+
15
+ const DEFAULT_INLINE_IMAGE_LIMIT: u64 = 5 * 1024 * 1024;
16
+
17
+ fn conversion_error(err: ConversionError) -> Error {
18
+ match err {
19
+ ConversionError::ConfigError(msg) => arg_error(msg),
20
+ ConversionError::Panic(message) => {
21
+ runtime_error(format!("html-to-markdown panic during conversion: {message}"))
22
+ }
23
+ other => runtime_error(other.to_string()),
24
+ }
25
+ }
26
+
27
+ fn arg_error(message: impl Into<String>) -> Error {
28
+ let ruby = Ruby::get().expect("Ruby not initialised");
29
+ Error::new(ruby.exception_arg_error(), message.into())
30
+ }
31
+
32
+ fn runtime_error(message: impl Into<String>) -> Error {
33
+ let ruby = Ruby::get().expect("Ruby not initialised");
34
+ Error::new(ruby.exception_runtime_error(), message.into())
35
+ }
36
+
37
+ fn symbol_to_string(value: Value) -> Result<String, Error> {
38
+ if let Some(symbol) = Symbol::from_value(value) {
39
+ Ok(symbol.name()?.to_string())
40
+ } else {
41
+ String::try_convert(value)
42
+ }
43
+ }
44
+
45
+ fn parse_heading_style(value: Value) -> Result<HeadingStyle, Error> {
46
+ match symbol_to_string(value)?.as_str() {
47
+ "underlined" => Ok(HeadingStyle::Underlined),
48
+ "atx" => Ok(HeadingStyle::Atx),
49
+ "atx_closed" => Ok(HeadingStyle::AtxClosed),
50
+ other => Err(arg_error(format!("invalid heading_style: {other}"))),
51
+ }
52
+ }
53
+
54
+ fn parse_list_indent_type(value: Value) -> Result<ListIndentType, Error> {
55
+ match symbol_to_string(value)?.as_str() {
56
+ "spaces" => Ok(ListIndentType::Spaces),
57
+ "tabs" => Ok(ListIndentType::Tabs),
58
+ other => Err(arg_error(format!("invalid list_indent_type: {other}"))),
59
+ }
60
+ }
61
+
62
+ fn parse_highlight_style(value: Value) -> Result<HighlightStyle, Error> {
63
+ match symbol_to_string(value)?.as_str() {
64
+ "double_equal" => Ok(HighlightStyle::DoubleEqual),
65
+ "html" => Ok(HighlightStyle::Html),
66
+ "bold" => Ok(HighlightStyle::Bold),
67
+ "none" => Ok(HighlightStyle::None),
68
+ other => Err(arg_error(format!("invalid highlight_style: {other}"))),
69
+ }
70
+ }
71
+
72
+ fn parse_whitespace_mode(value: Value) -> Result<WhitespaceMode, Error> {
73
+ match symbol_to_string(value)?.as_str() {
74
+ "normalized" => Ok(WhitespaceMode::Normalized),
75
+ "strict" => Ok(WhitespaceMode::Strict),
76
+ other => Err(arg_error(format!("invalid whitespace_mode: {other}"))),
77
+ }
78
+ }
79
+
80
+ fn parse_newline_style(value: Value) -> Result<NewlineStyle, Error> {
81
+ match symbol_to_string(value)?.as_str() {
82
+ "spaces" => Ok(NewlineStyle::Spaces),
83
+ "backslash" => Ok(NewlineStyle::Backslash),
84
+ other => Err(arg_error(format!("invalid newline_style: {other}"))),
85
+ }
86
+ }
87
+
88
+ fn parse_code_block_style(value: Value) -> Result<CodeBlockStyle, Error> {
89
+ match symbol_to_string(value)?.as_str() {
90
+ "indented" => Ok(CodeBlockStyle::Indented),
91
+ "backticks" => Ok(CodeBlockStyle::Backticks),
92
+ "tildes" => Ok(CodeBlockStyle::Tildes),
93
+ other => Err(arg_error(format!("invalid code_block_style: {other}"))),
94
+ }
95
+ }
96
+
97
+ fn parse_preset(value: Value) -> Result<PreprocessingPreset, Error> {
98
+ match symbol_to_string(value)?.as_str() {
99
+ "minimal" => Ok(PreprocessingPreset::Minimal),
100
+ "standard" => Ok(PreprocessingPreset::Standard),
101
+ "aggressive" => Ok(PreprocessingPreset::Aggressive),
102
+ other => Err(arg_error(format!("invalid preprocessing preset: {other}"))),
103
+ }
104
+ }
105
+
106
+ fn parse_vec_of_strings(value: Value) -> Result<Vec<String>, Error> {
107
+ let array = RArray::from_value(value).ok_or_else(|| arg_error("expected an Array of strings"))?;
108
+
109
+ array.to_vec::<String>()
110
+ }
111
+
112
+ fn parse_preprocessing_options(_ruby: &Ruby, value: Value) -> Result<PreprocessingOptions, Error> {
113
+ let hash = RHash::from_value(value).ok_or_else(|| arg_error("expected preprocessing to be a Hash"))?;
114
+
115
+ let mut opts = PreprocessingOptions::default();
116
+
117
+ hash.foreach(|key: Value, val: Value| {
118
+ let key_name = symbol_to_string(key)?;
119
+ match key_name.as_str() {
120
+ "enabled" => {
121
+ opts.enabled = bool::try_convert(val)?;
122
+ }
123
+ "preset" => {
124
+ opts.preset = parse_preset(val)?;
125
+ }
126
+ "remove_navigation" => {
127
+ opts.remove_navigation = bool::try_convert(val)?;
128
+ }
129
+ "remove_forms" => {
130
+ opts.remove_forms = bool::try_convert(val)?;
131
+ }
132
+ _ => {}
133
+ }
134
+ Ok(ForEach::Continue)
135
+ })?;
136
+
137
+ Ok(opts)
138
+ }
139
+
140
+ fn build_conversion_options(ruby: &Ruby, options: Option<Value>) -> Result<ConversionOptions, Error> {
141
+ let mut opts = ConversionOptions::default();
142
+
143
+ let Some(options) = options else {
144
+ return Ok(opts);
145
+ };
146
+
147
+ if options.is_nil() {
148
+ return Ok(opts);
149
+ }
150
+
151
+ let hash = RHash::from_value(options).ok_or_else(|| arg_error("options must be provided as a Hash"))?;
152
+
153
+ hash.foreach(|key: Value, val: Value| {
154
+ let key_name = symbol_to_string(key)?;
155
+ match key_name.as_str() {
156
+ "heading_style" => {
157
+ opts.heading_style = parse_heading_style(val)?;
158
+ }
159
+ "list_indent_type" => {
160
+ opts.list_indent_type = parse_list_indent_type(val)?;
161
+ }
162
+ "list_indent_width" => {
163
+ opts.list_indent_width = usize::try_convert(val)?;
164
+ }
165
+ "bullets" => {
166
+ opts.bullets = String::try_convert(val)?;
167
+ }
168
+ "strong_em_symbol" => {
169
+ let value = String::try_convert(val)?;
170
+ let mut chars = value.chars();
171
+ let ch = chars
172
+ .next()
173
+ .ok_or_else(|| arg_error("strong_em_symbol must not be empty"))?;
174
+ if chars.next().is_some() {
175
+ return Err(arg_error("strong_em_symbol must be a single character"));
176
+ }
177
+ opts.strong_em_symbol = ch;
178
+ }
179
+ "escape_asterisks" => {
180
+ opts.escape_asterisks = bool::try_convert(val)?;
181
+ }
182
+ "escape_underscores" => {
183
+ opts.escape_underscores = bool::try_convert(val)?;
184
+ }
185
+ "escape_misc" => {
186
+ opts.escape_misc = bool::try_convert(val)?;
187
+ }
188
+ "escape_ascii" => {
189
+ opts.escape_ascii = bool::try_convert(val)?;
190
+ }
191
+ "code_language" => {
192
+ opts.code_language = String::try_convert(val)?;
193
+ }
194
+ "autolinks" => {
195
+ opts.autolinks = bool::try_convert(val)?;
196
+ }
197
+ "default_title" => {
198
+ opts.default_title = bool::try_convert(val)?;
199
+ }
200
+ "br_in_tables" => {
201
+ opts.br_in_tables = bool::try_convert(val)?;
202
+ }
203
+ "hocr_spatial_tables" => {
204
+ opts.hocr_spatial_tables = bool::try_convert(val)?;
205
+ }
206
+ "highlight_style" => {
207
+ opts.highlight_style = parse_highlight_style(val)?;
208
+ }
209
+ "extract_metadata" => {
210
+ opts.extract_metadata = bool::try_convert(val)?;
211
+ }
212
+ "whitespace_mode" => {
213
+ opts.whitespace_mode = parse_whitespace_mode(val)?;
214
+ }
215
+ "strip_newlines" => {
216
+ opts.strip_newlines = bool::try_convert(val)?;
217
+ }
218
+ "wrap" => {
219
+ opts.wrap = bool::try_convert(val)?;
220
+ }
221
+ "wrap_width" => {
222
+ opts.wrap_width = usize::try_convert(val)?;
223
+ }
224
+ "convert_as_inline" => {
225
+ opts.convert_as_inline = bool::try_convert(val)?;
226
+ }
227
+ "sub_symbol" => {
228
+ opts.sub_symbol = String::try_convert(val)?;
229
+ }
230
+ "sup_symbol" => {
231
+ opts.sup_symbol = String::try_convert(val)?;
232
+ }
233
+ "newline_style" => {
234
+ opts.newline_style = parse_newline_style(val)?;
235
+ }
236
+ "code_block_style" => {
237
+ opts.code_block_style = parse_code_block_style(val)?;
238
+ }
239
+ "keep_inline_images_in" => {
240
+ opts.keep_inline_images_in = parse_vec_of_strings(val)?;
241
+ }
242
+ "preprocessing" => {
243
+ opts.preprocessing = parse_preprocessing_options(ruby, val)?;
244
+ }
245
+ "encoding" => {
246
+ opts.encoding = String::try_convert(val)?;
247
+ }
248
+ "debug" => {
249
+ opts.debug = bool::try_convert(val)?;
250
+ }
251
+ "strip_tags" => {
252
+ opts.strip_tags = parse_vec_of_strings(val)?;
253
+ }
254
+ "preserve_tags" => {
255
+ opts.preserve_tags = parse_vec_of_strings(val)?;
256
+ }
257
+ _ => {}
258
+ }
259
+ Ok(ForEach::Continue)
260
+ })?;
261
+
262
+ Ok(opts)
263
+ }
264
+
265
+ fn build_inline_image_config(_ruby: &Ruby, config: Option<Value>) -> Result<InlineImageConfig, Error> {
266
+ let mut cfg = InlineImageConfig::new(DEFAULT_INLINE_IMAGE_LIMIT);
267
+
268
+ let Some(config) = config else {
269
+ return Ok(cfg);
270
+ };
271
+
272
+ if config.is_nil() {
273
+ return Ok(cfg);
274
+ }
275
+
276
+ let hash = RHash::from_value(config).ok_or_else(|| arg_error("inline image config must be provided as a Hash"))?;
277
+
278
+ hash.foreach(|key: Value, val: Value| {
279
+ let key_name = symbol_to_string(key)?;
280
+ match key_name.as_str() {
281
+ "max_decoded_size_bytes" => {
282
+ cfg.max_decoded_size_bytes = u64::try_convert(val)?;
283
+ }
284
+ "filename_prefix" => {
285
+ cfg.filename_prefix = if val.is_nil() {
286
+ None
287
+ } else {
288
+ Some(String::try_convert(val)?)
289
+ };
290
+ }
291
+ "capture_svg" => {
292
+ cfg.capture_svg = bool::try_convert(val)?;
293
+ }
294
+ "infer_dimensions" => {
295
+ cfg.infer_dimensions = bool::try_convert(val)?;
296
+ }
297
+ _ => {}
298
+ }
299
+ Ok(ForEach::Continue)
300
+ })?;
301
+
302
+ Ok(cfg)
303
+ }
304
+
305
+ fn inline_image_to_value(ruby: &Ruby, image: InlineImage) -> Result<Value, Error> {
306
+ let InlineImage {
307
+ data,
308
+ format,
309
+ filename,
310
+ description,
311
+ dimensions,
312
+ source,
313
+ attributes,
314
+ } = image;
315
+
316
+ let hash = ruby.hash_new();
317
+ let data_value = ruby.str_from_slice(&data);
318
+ hash.aset(ruby.intern("data"), data_value)?;
319
+
320
+ let format_value = match format {
321
+ InlineImageFormat::Png => "png".to_string(),
322
+ InlineImageFormat::Jpeg => "jpeg".to_string(),
323
+ InlineImageFormat::Gif => "gif".to_string(),
324
+ InlineImageFormat::Bmp => "bmp".to_string(),
325
+ InlineImageFormat::Webp => "webp".to_string(),
326
+ InlineImageFormat::Svg => "svg".to_string(),
327
+ InlineImageFormat::Other(other) => other,
328
+ };
329
+ hash.aset(ruby.intern("format"), format_value)?;
330
+
331
+ match filename {
332
+ Some(name) => hash.aset(ruby.intern("filename"), name)?,
333
+ None => hash.aset(ruby.intern("filename"), ruby.qnil())?,
334
+ }
335
+
336
+ match description {
337
+ Some(desc) => hash.aset(ruby.intern("description"), desc)?,
338
+ None => hash.aset(ruby.intern("description"), ruby.qnil())?,
339
+ }
340
+
341
+ if let Some((width, height)) = dimensions {
342
+ let dims = ruby.ary_new();
343
+ dims.push(width as i64)?;
344
+ dims.push(height as i64)?;
345
+ hash.aset(ruby.intern("dimensions"), dims)?;
346
+ } else {
347
+ hash.aset(ruby.intern("dimensions"), ruby.qnil())?;
348
+ }
349
+
350
+ let source_value = match source {
351
+ InlineImageSource::ImgDataUri => "img_data_uri",
352
+ InlineImageSource::SvgElement => "svg_element",
353
+ };
354
+ hash.aset(ruby.intern("source"), source_value)?;
355
+
356
+ let attrs = ruby.hash_new();
357
+ for (key, value) in attributes {
358
+ attrs.aset(key, value)?;
359
+ }
360
+ hash.aset(ruby.intern("attributes"), attrs)?;
361
+
362
+ Ok(hash.as_value())
363
+ }
364
+
365
+ fn warning_to_value(ruby: &Ruby, warning: InlineImageWarning) -> Result<Value, Error> {
366
+ let hash = ruby.hash_new();
367
+ hash.aset(ruby.intern("index"), warning.index as i64)?;
368
+ hash.aset(ruby.intern("message"), warning.message)?;
369
+ Ok(hash.as_value())
370
+ }
371
+
372
+ fn extraction_to_value(ruby: &Ruby, extraction: HtmlExtraction) -> Result<Value, Error> {
373
+ let hash = ruby.hash_new();
374
+ hash.aset(ruby.intern("markdown"), extraction.markdown)?;
375
+
376
+ let inline_images = ruby.ary_new();
377
+ for image in extraction.inline_images {
378
+ inline_images.push(inline_image_to_value(ruby, image)?)?;
379
+ }
380
+ hash.aset(ruby.intern("inline_images"), inline_images)?;
381
+
382
+ let warnings = ruby.ary_new();
383
+ for warning in extraction.warnings {
384
+ warnings.push(warning_to_value(ruby, warning)?)?;
385
+ }
386
+ hash.aset(ruby.intern("warnings"), warnings)?;
387
+
388
+ Ok(hash.as_value())
389
+ }
390
+
391
+ fn convert_fn(ruby: &Ruby, args: &[Value]) -> Result<String, Error> {
392
+ let parsed = scan_args::<(String,), (Option<Value>,), (), (), (), ()>(args)?;
393
+ let html = parsed.required.0;
394
+ let options = build_conversion_options(ruby, parsed.optional.0)?;
395
+
396
+ guard_panic(|| convert_inner(&html, Some(options))).map_err(conversion_error)
397
+ }
398
+
399
+ fn options_handle_fn(ruby: &Ruby, args: &[Value]) -> Result<OptionsHandle, Error> {
400
+ let parsed = scan_args::<(), (Option<Value>,), (), (), (), ()>(args)?;
401
+ let options = build_conversion_options(ruby, parsed.optional.0)?;
402
+ Ok(OptionsHandle(options))
403
+ }
404
+
405
+ fn convert_with_options_handle_fn(_ruby: &Ruby, args: &[Value]) -> Result<String, Error> {
406
+ let parsed = scan_args::<(String, &OptionsHandle), (), (), (), (), ()>(args)?;
407
+ let html = parsed.required.0;
408
+ let handle = parsed.required.1;
409
+ let options = handle.0.clone();
410
+
411
+ guard_panic(|| convert_inner(&html, Some(options))).map_err(conversion_error)
412
+ }
413
+
414
+ fn convert_with_inline_images_fn(ruby: &Ruby, args: &[Value]) -> Result<Value, Error> {
415
+ let parsed = scan_args::<(String,), (Option<Value>, Option<Value>), (), (), (), ()>(args)?;
416
+ let html = parsed.required.0;
417
+ let options = build_conversion_options(ruby, parsed.optional.0)?;
418
+ let config = build_inline_image_config(ruby, parsed.optional.1)?;
419
+
420
+ let extraction =
421
+ guard_panic(|| convert_with_inline_images_inner(&html, Some(options), config)).map_err(conversion_error)?;
422
+
423
+ extraction_to_value(ruby, extraction)
424
+ }
425
+
426
+ #[magnus::init]
427
+ fn init(ruby: &Ruby) -> Result<(), Error> {
428
+ let module = ruby.define_module("HtmlToMarkdown")?;
429
+ module.define_singleton_method("convert", function!(convert_fn, -1))?;
430
+ module.define_singleton_method("options", function!(options_handle_fn, -1))?;
431
+ module.define_singleton_method("convert_with_options", function!(convert_with_options_handle_fn, -1))?;
432
+ module.define_singleton_method(
433
+ "convert_with_inline_images",
434
+ function!(convert_with_inline_images_fn, -1),
435
+ )?;
436
+
437
+ Ok(())
438
+ }