html-to-markdown 2.6.4 → 2.6.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +29 -29
- data/Gemfile +15 -15
- data/Gemfile.lock +2 -3
- data/README.md +209 -210
- data/Rakefile +24 -24
- data/exe/html-to-markdown +6 -6
- data/ext/html-to-markdown-rb/extconf.rb +28 -28
- data/ext/html-to-markdown-rb/native/Cargo.toml +28 -0
- data/ext/html-to-markdown-rb/native/README.md +209 -0
- data/ext/html-to-markdown-rb/native/extconf.rb +3 -0
- data/ext/html-to-markdown-rb/native/src/lib.rs +432 -0
- data/html-to-markdown-rb.gemspec +59 -59
- data/lib/html_to_markdown/cli.rb +21 -21
- data/lib/html_to_markdown/cli_proxy.rb +71 -71
- data/lib/html_to_markdown/version.rb +5 -5
- data/lib/html_to_markdown.rb +24 -24
- data/spec/cli_proxy_spec.rb +42 -42
- data/spec/convert_spec.rb +29 -29
- data/spec/spec_helper.rb +10 -10
- metadata +6 -2
|
@@ -0,0 +1,432 @@
|
|
|
1
|
+
use html_to_markdown_rs::{
|
|
2
|
+
CodeBlockStyle, ConversionOptions, HeadingStyle, HighlightStyle, HtmlExtraction, InlineImage, InlineImageConfig,
|
|
3
|
+
InlineImageFormat, InlineImageSource, InlineImageWarning, ListIndentType, NewlineStyle, PreprocessingOptions,
|
|
4
|
+
PreprocessingPreset, WhitespaceMode, convert as convert_inner,
|
|
5
|
+
convert_with_inline_images as convert_with_inline_images_inner, error::ConversionError,
|
|
6
|
+
};
|
|
7
|
+
use magnus::prelude::*;
|
|
8
|
+
use magnus::{Error, RArray, RHash, Ruby, Symbol, TryConvert, Value, function, scan_args::scan_args};
|
|
9
|
+
|
|
10
|
+
/// Limit handed to `InlineImageConfig::new` when building the default inline image
/// configuration: 5 * 1024 * 1024. Callers can override it through the
/// `max_decoded_size_bytes` config key (presumably a byte count, i.e. 5 MiB — confirm
/// against the core crate).
const DEFAULT_INLINE_IMAGE_LIMIT: u64 = 5 * 1024 * 1024;
|
|
11
|
+
|
|
12
|
+
fn conversion_error(err: ConversionError) -> Error {
|
|
13
|
+
match err {
|
|
14
|
+
ConversionError::ConfigError(msg) => arg_error(msg),
|
|
15
|
+
other => runtime_error(other.to_string()),
|
|
16
|
+
}
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
fn arg_error(message: impl Into<String>) -> Error {
|
|
20
|
+
let ruby = Ruby::get().expect("Ruby not initialised");
|
|
21
|
+
Error::new(ruby.exception_arg_error(), message.into())
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
fn runtime_error(message: impl Into<String>) -> Error {
|
|
25
|
+
let ruby = Ruby::get().expect("Ruby not initialised");
|
|
26
|
+
Error::new(ruby.exception_runtime_error(), message.into())
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
fn symbol_to_string(value: Value) -> Result<String, Error> {
|
|
30
|
+
if let Some(symbol) = Symbol::from_value(value) {
|
|
31
|
+
Ok(symbol.name()?.to_string())
|
|
32
|
+
} else {
|
|
33
|
+
String::try_convert(value)
|
|
34
|
+
}
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
/// Looks up `name` in `hash`, trying the Symbol key first and the String key second.
///
/// Returns `None` only when neither key yields a value.
fn get_kw(ruby: &Ruby, hash: RHash, name: &str) -> Option<Value> {
    match hash.get(ruby.intern(name)) {
        found @ Some(_) => found,
        None => hash.get(name),
    }
}
|
|
41
|
+
|
|
42
|
+
fn parse_heading_style(value: Value) -> Result<HeadingStyle, Error> {
|
|
43
|
+
match symbol_to_string(value)?.as_str() {
|
|
44
|
+
"underlined" => Ok(HeadingStyle::Underlined),
|
|
45
|
+
"atx" => Ok(HeadingStyle::Atx),
|
|
46
|
+
"atx_closed" => Ok(HeadingStyle::AtxClosed),
|
|
47
|
+
other => Err(arg_error(format!("invalid heading_style: {other}"))),
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
fn parse_list_indent_type(value: Value) -> Result<ListIndentType, Error> {
|
|
52
|
+
match symbol_to_string(value)?.as_str() {
|
|
53
|
+
"spaces" => Ok(ListIndentType::Spaces),
|
|
54
|
+
"tabs" => Ok(ListIndentType::Tabs),
|
|
55
|
+
other => Err(arg_error(format!("invalid list_indent_type: {other}"))),
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
fn parse_highlight_style(value: Value) -> Result<HighlightStyle, Error> {
|
|
60
|
+
match symbol_to_string(value)?.as_str() {
|
|
61
|
+
"double_equal" => Ok(HighlightStyle::DoubleEqual),
|
|
62
|
+
"html" => Ok(HighlightStyle::Html),
|
|
63
|
+
"bold" => Ok(HighlightStyle::Bold),
|
|
64
|
+
"none" => Ok(HighlightStyle::None),
|
|
65
|
+
other => Err(arg_error(format!("invalid highlight_style: {other}"))),
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
fn parse_whitespace_mode(value: Value) -> Result<WhitespaceMode, Error> {
|
|
70
|
+
match symbol_to_string(value)?.as_str() {
|
|
71
|
+
"normalized" => Ok(WhitespaceMode::Normalized),
|
|
72
|
+
"strict" => Ok(WhitespaceMode::Strict),
|
|
73
|
+
other => Err(arg_error(format!("invalid whitespace_mode: {other}"))),
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
fn parse_newline_style(value: Value) -> Result<NewlineStyle, Error> {
|
|
78
|
+
match symbol_to_string(value)?.as_str() {
|
|
79
|
+
"spaces" => Ok(NewlineStyle::Spaces),
|
|
80
|
+
"backslash" => Ok(NewlineStyle::Backslash),
|
|
81
|
+
other => Err(arg_error(format!("invalid newline_style: {other}"))),
|
|
82
|
+
}
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
fn parse_code_block_style(value: Value) -> Result<CodeBlockStyle, Error> {
|
|
86
|
+
match symbol_to_string(value)?.as_str() {
|
|
87
|
+
"indented" => Ok(CodeBlockStyle::Indented),
|
|
88
|
+
"backticks" => Ok(CodeBlockStyle::Backticks),
|
|
89
|
+
"tildes" => Ok(CodeBlockStyle::Tildes),
|
|
90
|
+
other => Err(arg_error(format!("invalid code_block_style: {other}"))),
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
fn parse_preset(value: Value) -> Result<PreprocessingPreset, Error> {
|
|
95
|
+
match symbol_to_string(value)?.as_str() {
|
|
96
|
+
"minimal" => Ok(PreprocessingPreset::Minimal),
|
|
97
|
+
"standard" => Ok(PreprocessingPreset::Standard),
|
|
98
|
+
"aggressive" => Ok(PreprocessingPreset::Aggressive),
|
|
99
|
+
other => Err(arg_error(format!("invalid preprocessing preset: {other}"))),
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
fn parse_vec_of_strings(value: Value) -> Result<Vec<String>, Error> {
|
|
104
|
+
let array = RArray::from_value(value).ok_or_else(|| arg_error("expected an Array of strings"))?;
|
|
105
|
+
|
|
106
|
+
array.to_vec::<String>()
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
fn parse_preprocessing_options(ruby: &Ruby, value: Value) -> Result<PreprocessingOptions, Error> {
|
|
110
|
+
let hash = RHash::from_value(value).ok_or_else(|| arg_error("expected preprocessing to be a Hash"))?;
|
|
111
|
+
|
|
112
|
+
let mut opts = PreprocessingOptions::default();
|
|
113
|
+
|
|
114
|
+
if let Some(enabled) = get_kw(ruby, hash, "enabled") {
|
|
115
|
+
opts.enabled = bool::try_convert(enabled)?;
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
if let Some(preset) = get_kw(ruby, hash, "preset") {
|
|
119
|
+
opts.preset = parse_preset(preset)?;
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
if let Some(remove_navigation) = get_kw(ruby, hash, "remove_navigation") {
|
|
123
|
+
opts.remove_navigation = bool::try_convert(remove_navigation)?;
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
if let Some(remove_forms) = get_kw(ruby, hash, "remove_forms") {
|
|
127
|
+
opts.remove_forms = bool::try_convert(remove_forms)?;
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
Ok(opts)
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
fn build_conversion_options(ruby: &Ruby, options: Option<Value>) -> Result<ConversionOptions, Error> {
|
|
134
|
+
let mut opts = ConversionOptions::default();
|
|
135
|
+
|
|
136
|
+
let Some(options) = options else {
|
|
137
|
+
return Ok(opts);
|
|
138
|
+
};
|
|
139
|
+
|
|
140
|
+
if options.is_nil() {
|
|
141
|
+
return Ok(opts);
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
let hash = RHash::from_value(options).ok_or_else(|| arg_error("options must be provided as a Hash"))?;
|
|
145
|
+
|
|
146
|
+
if let Some(heading_style) = get_kw(ruby, hash, "heading_style") {
|
|
147
|
+
opts.heading_style = parse_heading_style(heading_style)?;
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
if let Some(list_indent_type) = get_kw(ruby, hash, "list_indent_type") {
|
|
151
|
+
opts.list_indent_type = parse_list_indent_type(list_indent_type)?;
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
if let Some(list_indent_width) = get_kw(ruby, hash, "list_indent_width") {
|
|
155
|
+
opts.list_indent_width = usize::try_convert(list_indent_width)?;
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
if let Some(bullets) = get_kw(ruby, hash, "bullets") {
|
|
159
|
+
opts.bullets = String::try_convert(bullets)?;
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
if let Some(strong_em_symbol) = get_kw(ruby, hash, "strong_em_symbol") {
|
|
163
|
+
let value = String::try_convert(strong_em_symbol)?;
|
|
164
|
+
let mut chars = value.chars();
|
|
165
|
+
let ch = chars
|
|
166
|
+
.next()
|
|
167
|
+
.ok_or_else(|| arg_error("strong_em_symbol must not be empty"))?;
|
|
168
|
+
if chars.next().is_some() {
|
|
169
|
+
return Err(arg_error("strong_em_symbol must be a single character"));
|
|
170
|
+
}
|
|
171
|
+
opts.strong_em_symbol = ch;
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
if let Some(escape_asterisks) = get_kw(ruby, hash, "escape_asterisks") {
|
|
175
|
+
opts.escape_asterisks = bool::try_convert(escape_asterisks)?;
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
if let Some(escape_underscores) = get_kw(ruby, hash, "escape_underscores") {
|
|
179
|
+
opts.escape_underscores = bool::try_convert(escape_underscores)?;
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
if let Some(escape_misc) = get_kw(ruby, hash, "escape_misc") {
|
|
183
|
+
opts.escape_misc = bool::try_convert(escape_misc)?;
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
if let Some(escape_ascii) = get_kw(ruby, hash, "escape_ascii") {
|
|
187
|
+
opts.escape_ascii = bool::try_convert(escape_ascii)?;
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
if let Some(code_language) = get_kw(ruby, hash, "code_language") {
|
|
191
|
+
opts.code_language = String::try_convert(code_language)?;
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
if let Some(autolinks) = get_kw(ruby, hash, "autolinks") {
|
|
195
|
+
opts.autolinks = bool::try_convert(autolinks)?;
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
if let Some(default_title) = get_kw(ruby, hash, "default_title") {
|
|
199
|
+
opts.default_title = bool::try_convert(default_title)?;
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
if let Some(br_in_tables) = get_kw(ruby, hash, "br_in_tables") {
|
|
203
|
+
opts.br_in_tables = bool::try_convert(br_in_tables)?;
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
if let Some(hocr_spatial_tables) = get_kw(ruby, hash, "hocr_spatial_tables") {
|
|
207
|
+
opts.hocr_spatial_tables = bool::try_convert(hocr_spatial_tables)?;
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
if let Some(highlight_style) = get_kw(ruby, hash, "highlight_style") {
|
|
211
|
+
opts.highlight_style = parse_highlight_style(highlight_style)?;
|
|
212
|
+
}
|
|
213
|
+
|
|
214
|
+
if let Some(extract_metadata) = get_kw(ruby, hash, "extract_metadata") {
|
|
215
|
+
opts.extract_metadata = bool::try_convert(extract_metadata)?;
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
if let Some(whitespace_mode) = get_kw(ruby, hash, "whitespace_mode") {
|
|
219
|
+
opts.whitespace_mode = parse_whitespace_mode(whitespace_mode)?;
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
if let Some(strip_newlines) = get_kw(ruby, hash, "strip_newlines") {
|
|
223
|
+
opts.strip_newlines = bool::try_convert(strip_newlines)?;
|
|
224
|
+
}
|
|
225
|
+
|
|
226
|
+
if let Some(wrap) = get_kw(ruby, hash, "wrap") {
|
|
227
|
+
opts.wrap = bool::try_convert(wrap)?;
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
if let Some(wrap_width) = get_kw(ruby, hash, "wrap_width") {
|
|
231
|
+
opts.wrap_width = usize::try_convert(wrap_width)?;
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
if let Some(convert_as_inline) = get_kw(ruby, hash, "convert_as_inline") {
|
|
235
|
+
opts.convert_as_inline = bool::try_convert(convert_as_inline)?;
|
|
236
|
+
}
|
|
237
|
+
|
|
238
|
+
if let Some(sub_symbol) = get_kw(ruby, hash, "sub_symbol") {
|
|
239
|
+
opts.sub_symbol = String::try_convert(sub_symbol)?;
|
|
240
|
+
}
|
|
241
|
+
|
|
242
|
+
if let Some(sup_symbol) = get_kw(ruby, hash, "sup_symbol") {
|
|
243
|
+
opts.sup_symbol = String::try_convert(sup_symbol)?;
|
|
244
|
+
}
|
|
245
|
+
|
|
246
|
+
if let Some(newline_style) = get_kw(ruby, hash, "newline_style") {
|
|
247
|
+
opts.newline_style = parse_newline_style(newline_style)?;
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
if let Some(code_block_style) = get_kw(ruby, hash, "code_block_style") {
|
|
251
|
+
opts.code_block_style = parse_code_block_style(code_block_style)?;
|
|
252
|
+
}
|
|
253
|
+
|
|
254
|
+
if let Some(keep_inline_images_in) = get_kw(ruby, hash, "keep_inline_images_in") {
|
|
255
|
+
opts.keep_inline_images_in = parse_vec_of_strings(keep_inline_images_in)?;
|
|
256
|
+
}
|
|
257
|
+
|
|
258
|
+
if let Some(preprocessing) = get_kw(ruby, hash, "preprocessing") {
|
|
259
|
+
opts.preprocessing = parse_preprocessing_options(ruby, preprocessing)?;
|
|
260
|
+
}
|
|
261
|
+
|
|
262
|
+
if let Some(encoding) = get_kw(ruby, hash, "encoding") {
|
|
263
|
+
opts.encoding = String::try_convert(encoding)?;
|
|
264
|
+
}
|
|
265
|
+
|
|
266
|
+
if let Some(debug) = get_kw(ruby, hash, "debug") {
|
|
267
|
+
opts.debug = bool::try_convert(debug)?;
|
|
268
|
+
}
|
|
269
|
+
|
|
270
|
+
if let Some(strip_tags) = get_kw(ruby, hash, "strip_tags") {
|
|
271
|
+
opts.strip_tags = parse_vec_of_strings(strip_tags)?;
|
|
272
|
+
}
|
|
273
|
+
|
|
274
|
+
if let Some(preserve_tags) = get_kw(ruby, hash, "preserve_tags") {
|
|
275
|
+
opts.preserve_tags = parse_vec_of_strings(preserve_tags)?;
|
|
276
|
+
}
|
|
277
|
+
|
|
278
|
+
Ok(opts)
|
|
279
|
+
}
|
|
280
|
+
|
|
281
|
+
fn build_inline_image_config(ruby: &Ruby, config: Option<Value>) -> Result<InlineImageConfig, Error> {
|
|
282
|
+
let mut cfg = InlineImageConfig::new(DEFAULT_INLINE_IMAGE_LIMIT);
|
|
283
|
+
|
|
284
|
+
let Some(config) = config else {
|
|
285
|
+
return Ok(cfg);
|
|
286
|
+
};
|
|
287
|
+
|
|
288
|
+
if config.is_nil() {
|
|
289
|
+
return Ok(cfg);
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
let hash = RHash::from_value(config).ok_or_else(|| arg_error("inline image config must be provided as a Hash"))?;
|
|
293
|
+
|
|
294
|
+
if let Some(limit) = get_kw(ruby, hash, "max_decoded_size_bytes") {
|
|
295
|
+
cfg.max_decoded_size_bytes = u64::try_convert(limit)?;
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
if let Some(prefix) = get_kw(ruby, hash, "filename_prefix") {
|
|
299
|
+
cfg.filename_prefix = if prefix.is_nil() {
|
|
300
|
+
None
|
|
301
|
+
} else {
|
|
302
|
+
Some(String::try_convert(prefix)?)
|
|
303
|
+
};
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
if let Some(capture_svg) = get_kw(ruby, hash, "capture_svg") {
|
|
307
|
+
cfg.capture_svg = bool::try_convert(capture_svg)?;
|
|
308
|
+
}
|
|
309
|
+
|
|
310
|
+
if let Some(infer_dimensions) = get_kw(ruby, hash, "infer_dimensions") {
|
|
311
|
+
cfg.infer_dimensions = bool::try_convert(infer_dimensions)?;
|
|
312
|
+
}
|
|
313
|
+
|
|
314
|
+
Ok(cfg)
|
|
315
|
+
}
|
|
316
|
+
|
|
317
|
+
fn inline_image_to_value(ruby: &Ruby, image: InlineImage) -> Result<Value, Error> {
|
|
318
|
+
let InlineImage {
|
|
319
|
+
data,
|
|
320
|
+
format,
|
|
321
|
+
filename,
|
|
322
|
+
description,
|
|
323
|
+
dimensions,
|
|
324
|
+
source,
|
|
325
|
+
attributes,
|
|
326
|
+
} = image;
|
|
327
|
+
|
|
328
|
+
let hash = ruby.hash_new();
|
|
329
|
+
let data_value = ruby.str_from_slice(&data);
|
|
330
|
+
hash.aset(ruby.intern("data"), data_value)?;
|
|
331
|
+
|
|
332
|
+
let format_value = match format {
|
|
333
|
+
InlineImageFormat::Png => "png".to_string(),
|
|
334
|
+
InlineImageFormat::Jpeg => "jpeg".to_string(),
|
|
335
|
+
InlineImageFormat::Gif => "gif".to_string(),
|
|
336
|
+
InlineImageFormat::Bmp => "bmp".to_string(),
|
|
337
|
+
InlineImageFormat::Webp => "webp".to_string(),
|
|
338
|
+
InlineImageFormat::Svg => "svg".to_string(),
|
|
339
|
+
InlineImageFormat::Other(other) => other,
|
|
340
|
+
};
|
|
341
|
+
hash.aset(ruby.intern("format"), format_value)?;
|
|
342
|
+
|
|
343
|
+
match filename {
|
|
344
|
+
Some(name) => hash.aset(ruby.intern("filename"), name)?,
|
|
345
|
+
None => hash.aset(ruby.intern("filename"), ruby.qnil())?,
|
|
346
|
+
}
|
|
347
|
+
|
|
348
|
+
match description {
|
|
349
|
+
Some(desc) => hash.aset(ruby.intern("description"), desc)?,
|
|
350
|
+
None => hash.aset(ruby.intern("description"), ruby.qnil())?,
|
|
351
|
+
}
|
|
352
|
+
|
|
353
|
+
if let Some((width, height)) = dimensions {
|
|
354
|
+
let dims = ruby.ary_new();
|
|
355
|
+
dims.push(width as i64)?;
|
|
356
|
+
dims.push(height as i64)?;
|
|
357
|
+
hash.aset(ruby.intern("dimensions"), dims)?;
|
|
358
|
+
} else {
|
|
359
|
+
hash.aset(ruby.intern("dimensions"), ruby.qnil())?;
|
|
360
|
+
}
|
|
361
|
+
|
|
362
|
+
let source_value = match source {
|
|
363
|
+
InlineImageSource::ImgDataUri => "img_data_uri",
|
|
364
|
+
InlineImageSource::SvgElement => "svg_element",
|
|
365
|
+
};
|
|
366
|
+
hash.aset(ruby.intern("source"), source_value)?;
|
|
367
|
+
|
|
368
|
+
let attrs = ruby.hash_new();
|
|
369
|
+
for (key, value) in attributes {
|
|
370
|
+
attrs.aset(key, value)?;
|
|
371
|
+
}
|
|
372
|
+
hash.aset(ruby.intern("attributes"), attrs)?;
|
|
373
|
+
|
|
374
|
+
Ok(hash.as_value())
|
|
375
|
+
}
|
|
376
|
+
|
|
377
|
+
/// Converts an `InlineImageWarning` into a Ruby Hash with the Symbol keys `index`
/// (the warning's index cast to `i64`) and `message`.
fn warning_to_value(ruby: &Ruby, warning: InlineImageWarning) -> Result<Value, Error> {
    let entry = ruby.hash_new();

    let position = warning.index as i64;
    entry.aset(ruby.intern("index"), position)?;
    entry.aset(ruby.intern("message"), warning.message)?;

    Ok(entry.as_value())
}
|
|
383
|
+
|
|
384
|
+
fn extraction_to_value(ruby: &Ruby, extraction: HtmlExtraction) -> Result<Value, Error> {
|
|
385
|
+
let hash = ruby.hash_new();
|
|
386
|
+
hash.aset(ruby.intern("markdown"), extraction.markdown)?;
|
|
387
|
+
|
|
388
|
+
let inline_images = ruby.ary_new();
|
|
389
|
+
for image in extraction.inline_images {
|
|
390
|
+
inline_images.push(inline_image_to_value(ruby, image)?)?;
|
|
391
|
+
}
|
|
392
|
+
hash.aset(ruby.intern("inline_images"), inline_images)?;
|
|
393
|
+
|
|
394
|
+
let warnings = ruby.ary_new();
|
|
395
|
+
for warning in extraction.warnings {
|
|
396
|
+
warnings.push(warning_to_value(ruby, warning)?)?;
|
|
397
|
+
}
|
|
398
|
+
hash.aset(ruby.intern("warnings"), warnings)?;
|
|
399
|
+
|
|
400
|
+
Ok(hash.as_value())
|
|
401
|
+
}
|
|
402
|
+
|
|
403
|
+
fn convert_fn(ruby: &Ruby, args: &[Value]) -> Result<String, Error> {
|
|
404
|
+
let parsed = scan_args::<(String,), (Option<Value>,), (), (), (), ()>(args)?;
|
|
405
|
+
let html = parsed.required.0;
|
|
406
|
+
let options = build_conversion_options(ruby, parsed.optional.0)?;
|
|
407
|
+
|
|
408
|
+
convert_inner(&html, Some(options)).map_err(conversion_error)
|
|
409
|
+
}
|
|
410
|
+
|
|
411
|
+
fn convert_with_inline_images_fn(ruby: &Ruby, args: &[Value]) -> Result<Value, Error> {
|
|
412
|
+
let parsed = scan_args::<(String,), (Option<Value>, Option<Value>), (), (), (), ()>(args)?;
|
|
413
|
+
let html = parsed.required.0;
|
|
414
|
+
let options = build_conversion_options(ruby, parsed.optional.0)?;
|
|
415
|
+
let config = build_inline_image_config(ruby, parsed.optional.1)?;
|
|
416
|
+
|
|
417
|
+
let extraction = convert_with_inline_images_inner(&html, Some(options), config).map_err(conversion_error)?;
|
|
418
|
+
|
|
419
|
+
extraction_to_value(ruby, extraction)
|
|
420
|
+
}
|
|
421
|
+
|
|
422
|
+
#[magnus::init]
|
|
423
|
+
fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
424
|
+
let module = ruby.define_module("HtmlToMarkdown")?;
|
|
425
|
+
module.define_singleton_method("convert", function!(convert_fn, -1))?;
|
|
426
|
+
module.define_singleton_method(
|
|
427
|
+
"convert_with_inline_images",
|
|
428
|
+
function!(convert_with_inline_images_fn, -1),
|
|
429
|
+
)?;
|
|
430
|
+
|
|
431
|
+
Ok(())
|
|
432
|
+
}
|
data/html-to-markdown-rb.gemspec
CHANGED
|
@@ -1,59 +1,59 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require_relative 'lib/html_to_markdown/version'
|
|
4
|
-
|
|
5
|
-
repo_root = File.expand_path('../..', __dir__)
|
|
6
|
-
crate_prefix = 'packages/ruby/'
|
|
7
|
-
git_cmd = %(git -C "#{repo_root}" ls-files -z #{crate_prefix})
|
|
8
|
-
git_files =
|
|
9
|
-
`#{git_cmd}`.split("\x0")
|
|
10
|
-
.select { |path| path.start_with?(crate_prefix) }
|
|
11
|
-
.map { |path| path.delete_prefix(crate_prefix) }
|
|
12
|
-
fallback_files = Dir.chdir(__dir__) do
|
|
13
|
-
Dir.glob(
|
|
14
|
-
%w[
|
|
15
|
-
README.md
|
|
16
|
-
ext/html-to-markdown-rb/extconf.rb
|
|
17
|
-
exe/*
|
|
18
|
-
lib/**/*.rb
|
|
19
|
-
lib/bin/*
|
|
20
|
-
src/**/*.rs
|
|
21
|
-
spec/**/*.rb
|
|
22
|
-
]
|
|
23
|
-
)
|
|
24
|
-
end
|
|
25
|
-
files = git_files.empty? ? fallback_files : git_files
|
|
26
|
-
|
|
27
|
-
Gem::Specification.new do |spec|
|
|
28
|
-
spec.name = 'html-to-markdown'
|
|
29
|
-
spec.version = HtmlToMarkdown::VERSION
|
|
30
|
-
spec.authors = ["Na'aman Hirschfeld"]
|
|
31
|
-
spec.email = ['nhirschfeld@gmail.com']
|
|
32
|
-
|
|
33
|
-
spec.summary = 'Blazing-fast HTML to Markdown conversion for Ruby, powered by Rust.'
|
|
34
|
-
spec.description = <<~DESC.strip
|
|
35
|
-
html-to-markdown is a native Ruby extension built on the shared Rust engine that powers the html-to-markdown project.
|
|
36
|
-
It delivers identical HTML-to-Markdown output across languages, exposes inline image extraction, and ships with a CLI for automation workflows.
|
|
37
|
-
DESC
|
|
38
|
-
spec.homepage = 'https://github.com/Goldziher/html-to-markdown'
|
|
39
|
-
spec.license = 'MIT'
|
|
40
|
-
|
|
41
|
-
spec.required_ruby_version = Gem::Requirement.new('>= 3.2')
|
|
42
|
-
|
|
43
|
-
spec.bindir = 'exe'
|
|
44
|
-
spec.executables = ['html-to-markdown']
|
|
45
|
-
spec.require_paths = ['lib']
|
|
46
|
-
|
|
47
|
-
spec.files = files
|
|
48
|
-
spec.extra_rdoc_files = ['README.md']
|
|
49
|
-
|
|
50
|
-
spec.extensions = ['ext/html-to-markdown-rb/extconf.rb']
|
|
51
|
-
|
|
52
|
-
spec.add_dependency 'rb_sys', '>= 0.9', '< 1.0'
|
|
53
|
-
spec.metadata['rubygems_mfa_required'] = 'true'
|
|
54
|
-
spec.metadata['homepage_uri'] = 'https://github.com/Goldziher/html-to-markdown'
|
|
55
|
-
spec.metadata['source_code_uri'] = 'https://github.com/Goldziher/html-to-markdown'
|
|
56
|
-
spec.metadata['bug_tracker_uri'] = 'https://github.com/Goldziher/html-to-markdown/issues'
|
|
57
|
-
spec.metadata['changelog_uri'] = 'https://github.com/Goldziher/html-to-markdown/releases'
|
|
58
|
-
spec.metadata['documentation_uri'] = 'https://github.com/Goldziher/html-to-markdown/blob/main/packages/ruby/README.md'
|
|
59
|
-
end
|
|
1
|
+
# frozen_string_literal: true

require_relative 'lib/html_to_markdown/version'

# Tracked files are listed from the directory two levels above this gemspec and
# filtered down to the `packages/ruby/` prefix, then made relative to it.
repo_root = File.expand_path('../..', __dir__)
crate_prefix = 'packages/ruby/'

# Run git with an argv array (no shell), so a checkout path containing quotes,
# spaces or `$` cannot break or alter the command. A missing `git` executable
# raises a SystemCallError; treat that like "no tracked files" so the glob
# fallback below is used instead of aborting gemspec evaluation.
git_files =
  begin
    IO.popen(['git', '-C', repo_root, 'ls-files', '-z', crate_prefix], err: File::NULL, &:read)
      .split("\x0")
      .select { |path| path.start_with?(crate_prefix) }
      .map { |path| path.delete_prefix(crate_prefix) }
  rescue SystemCallError
    []
  end

# Used when git yields nothing (not a repository, git unavailable, ...).
fallback_files = Dir.chdir(__dir__) do
  Dir.glob(
    %w[
      README.md
      ext/html-to-markdown-rb/extconf.rb
      exe/*
      lib/**/*.rb
      lib/bin/*
      src/**/*.rs
      spec/**/*.rb
    ]
  )
end
files = git_files.empty? ? fallback_files : git_files

Gem::Specification.new do |spec|
  spec.name = 'html-to-markdown'
  spec.version = HtmlToMarkdown::VERSION
  spec.authors = ["Na'aman Hirschfeld"]
  spec.email = ['nhirschfeld@gmail.com']

  spec.summary = 'Blazing-fast HTML to Markdown conversion for Ruby, powered by Rust.'
  spec.description = <<~DESC.strip
    html-to-markdown is a native Ruby extension built on the shared Rust engine that powers the html-to-markdown project.
    It delivers identical HTML-to-Markdown output across languages, exposes inline image extraction, and ships with a CLI for automation workflows.
  DESC
  spec.homepage = 'https://github.com/Goldziher/html-to-markdown'
  spec.license = 'MIT'

  spec.required_ruby_version = Gem::Requirement.new('>= 3.2')

  spec.bindir = 'exe'
  spec.executables = ['html-to-markdown']
  spec.require_paths = ['lib']

  spec.files = files
  spec.extra_rdoc_files = ['README.md']

  spec.extensions = ['ext/html-to-markdown-rb/extconf.rb']

  spec.add_dependency 'rb_sys', '>= 0.9', '< 1.0'
  spec.metadata['rubygems_mfa_required'] = 'true'
  spec.metadata['homepage_uri'] = 'https://github.com/Goldziher/html-to-markdown'
  spec.metadata['source_code_uri'] = 'https://github.com/Goldziher/html-to-markdown'
  spec.metadata['bug_tracker_uri'] = 'https://github.com/Goldziher/html-to-markdown/issues'
  spec.metadata['changelog_uri'] = 'https://github.com/Goldziher/html-to-markdown/releases'
  spec.metadata['documentation_uri'] = 'https://github.com/Goldziher/html-to-markdown/blob/main/packages/ruby/README.md'
end
|
data/lib/html_to_markdown/cli.rb
CHANGED
|
@@ -1,21 +1,21 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'html_to_markdown/cli_proxy'
|
|
4
|
-
|
|
5
|
-
module HtmlToMarkdown
|
|
6
|
-
module CLI
|
|
7
|
-
module_function
|
|
8
|
-
|
|
9
|
-
def run(argv = ARGV, stdout: $stdout, stderr: $stderr)
|
|
10
|
-
output = CLIProxy.call(argv)
|
|
11
|
-
stdout.print(output)
|
|
12
|
-
0
|
|
13
|
-
rescue CLIProxy::CLIExecutionError => e
|
|
14
|
-
stderr.print(e.stderr)
|
|
15
|
-
e.status || 1
|
|
16
|
-
rescue CLIProxy::MissingBinaryError, CLIProxy::Error => e
|
|
17
|
-
stderr.puts(e.message)
|
|
18
|
-
1
|
|
19
|
-
end
|
|
20
|
-
end
|
|
21
|
-
end
|
|
1
|
+
# frozen_string_literal: true

require 'html_to_markdown/cli_proxy'

module HtmlToMarkdown
  # Command-line front end: forwards the arguments to CLIProxy and turns the
  # outcome into an integer exit status.
  module CLI
    module_function

    # Runs the CLI with +argv+.
    #
    # On success the proxy's output is printed to +stdout+ and 0 is returned.
    # A CLIProxy::CLIExecutionError prints the captured stderr text to +stderr+
    # and returns the error's status (1 when the status is nil). Any other
    # CLIProxy error prints its message to +stderr+ and returns 1.
    def run(argv = ARGV, stdout: $stdout, stderr: $stderr)
      begin
        result = CLIProxy.call(argv)
        stdout.print(result)
        0
      rescue CLIProxy::CLIExecutionError => failure
        stderr.print(failure.stderr)
        failure.status || 1
      rescue CLIProxy::MissingBinaryError, CLIProxy::Error => failure
        stderr.puts(failure.message)
        1
      end
    end
  end
end
|