html-to-markdown 2.6.6 → 2.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +12 -0
- data/bin/benchmark.rb +94 -0
- data/ext/html-to-markdown-rb/native/Cargo.toml +4 -4
- data/ext/html-to-markdown-rb/native/src/lib.rs +171 -171
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +12 -0
- data/spec/convert_spec.rb +9 -0
- metadata +3 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 56ca0b6b6d1c9e67dadddfa1865e693bcf986859cc203f7987a5f787203ff40f
|
|
4
|
+
data.tar.gz: 932a473d64548a6d976c452b4d82a357e10709f7975a43e5d699e543a9d3a372
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 0e36518ae77b2c25f0ac0c26686524d152c687a026fc0f2f4c7b3ed03bd219cc8a5a799ddb8f9b698a5d3e64db7d05ac7d936b535b797aafd806a276af97e993
|
|
7
|
+
data.tar.gz: 31ab40f1ff1daae0e2b06e485e3e85d15f59fca17c07a8b6b64785b2326e9a6d208381183b043dcac92e01c83fd98b6a8607438faffb2b1e01d4953cb96e19f0
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
|
@@ -109,6 +109,18 @@ markdown = HtmlToMarkdown.convert(
|
|
|
109
109
|
puts markdown
|
|
110
110
|
```
|
|
111
111
|
|
|
112
|
+
### Reusing Options
|
|
113
|
+
|
|
114
|
+
If you’re running tight loops or benchmarks, build the options once and pass the handle back into `convert_with_options`:
|
|
115
|
+
|
|
116
|
+
```ruby
|
|
117
|
+
handle = HtmlToMarkdown.options(hocr_spatial_tables: false)
|
|
118
|
+
|
|
119
|
+
100.times do
|
|
120
|
+
HtmlToMarkdown.convert_with_options('<h1>Handles</h1>', handle)
|
|
121
|
+
end
|
|
122
|
+
```
|
|
123
|
+
|
|
112
124
|
### HTML Preprocessing
|
|
113
125
|
|
|
114
126
|
Clean up scraped HTML (navigation, forms, malformed markup) before conversion:
|
data/bin/benchmark.rb
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
require 'optparse'
|
|
5
|
+
require 'time'
|
|
6
|
+
|
|
7
|
+
$LOAD_PATH.unshift(File.expand_path('../lib', __dir__))
|
|
8
|
+
require 'html_to_markdown'
|
|
9
|
+
|
|
10
|
+
# Escapes +value+ so it can be embedded between double quotes in a JSON document.
#
# The argument is coerced with +to_s+ first. Double quotes and backslashes are
# backslash-escaped, newline / carriage return / tab use their short escapes,
# and every other control character below U+0020 is written as a \u00XX
# escape, because JSON forbids raw control characters inside strings.
#
# @param value [#to_s] text to escape (e.g. a fixture path)
# @return [String] the escaped text, without surrounding quotes
def json_escape(value)
  value.to_s.gsub(/["\\\x00-\x1F]/) do |char|
    case char
    when '"', '\\'
      "\\#{char}"
    when "\n"
      '\\n'
    when "\r"
      '\\r'
    when "\t"
      '\\t'
    else
      # Remaining control characters have no short form that we emit.
      format('\\u%04x', char.ord)
    end
  end
end
|
|
22
|
+
|
|
23
|
+
# Benchmark settings; the defaults below are overridden by command-line flags.
options = {
  iterations: 50,
  format: 'html'
}

cli = OptionParser.new do |parser|
  parser.banner = 'ruby benchmark.rb --file path/to/fixture.html [--iterations 200]'

  parser.on('--file FILE', 'HTML fixture to convert repeatedly') do |file|
    options[:file] = file
  end

  parser.on('--iterations N', Integer, 'Number of conversion iterations (default: 50)') do |n|
    # Zero or negative counts fall back to a single iteration.
    options[:iterations] = n.positive? ? n : 1
  end

  parser.on('--format FORMAT', 'Fixture format (html or hocr)') do |format|
    options[:format] = format.downcase
  end
end

begin
  cli.parse!
rescue OptionParser::ParseError => e
  # Report malformed flags like every other input error in this script:
  # a short message on stderr and exit status 1, rather than a backtrace.
  warn e.message
  warn cli.banner
  exit 1
end
|
|
43
|
+
|
|
44
|
+
# Prints +message+ to stderr and terminates the script with exit status 1.
bail = lambda do |message|
  warn message
  exit 1
end

# Validate the command-line input before touching the fixture.
bail.call('Missing --file parameter') unless options.key?(:file)
fixture = options[:file]
bail.call("Fixture not found: #{fixture}") unless File.exist?(fixture)

requested_format = options[:format]
bail.call("Unsupported format: #{requested_format}") unless %w[html hocr].include?(requested_format)

# Read the fixture as raw bytes, label it UTF-8 and freeze it.
html = File.binread(fixture).force_encoding(Encoding::UTF_8).freeze
iterations = options[:iterations]

# Build the options handle once; only hOCR fixtures override a setting.
hocr_overrides = requested_format == 'hocr' ? { hocr_spatial_tables: false } : nil
options_handle = HtmlToMarkdown.options(hocr_overrides)
|
|
66
|
+
|
|
67
|
+
# Converts +html+ once using the prebuilt options handle.
#
# The untimed first call and the timed loop both go through this method,
# so they exercise the same code path.
#
# @param html [String] document to convert
# @param options_handle [Object] handle returned by HtmlToMarkdown.options
# @return the result of HtmlToMarkdown.convert_with_options
def convert_document(html, options_handle)
  HtmlToMarkdown.convert_with_options(html, options_handle)
end
|
|
70
|
+
|
|
71
|
+
# One untimed conversion before the clock starts.
convert_document(html, options_handle)

# Time the conversions with the monotonic clock.
monotonic_now = -> { Process.clock_gettime(Process::CLOCK_MONOTONIC) }
started_at = monotonic_now.call
iterations.times do
  convert_document(html, options_handle)
end
elapsed = monotonic_now.call - started_at

# Derive throughput figures from the fixture size and the elapsed time.
payload_size_bytes = html.bytesize
bytes_processed = payload_size_bytes * iterations
ops_per_sec = iterations / elapsed
mb_per_sec = bytes_processed.to_f / (1024 * 1024) / elapsed

# Hand-assembled JSON report; string fields go through json_escape.
payload = <<~JSON
  {
  "language":"ruby",
  "fixture":"#{json_escape(File.basename(fixture))}",
  "fixture_path":"#{json_escape(fixture)}",
  "iterations":#{iterations},
  "elapsed_seconds":#{format('%.8f', elapsed)},
  "ops_per_sec":#{format('%.4f', ops_per_sec)},
  "mb_per_sec":#{format('%.4f', mb_per_sec)},
  "bytes_processed":#{bytes_processed},
  "payload_size_bytes":#{payload_size_bytes}
  }
JSON

puts payload.strip
|
data/ext/html-to-markdown-rb/native/Cargo.toml
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "html-to-markdown-rb"
|
|
3
|
-
version = "2.6.6"
|
|
4
|
-
edition =
|
|
3
|
+
version = "2.7.0"
|
|
4
|
+
edition.workspace = true
|
|
5
5
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
6
6
|
license = "MIT"
|
|
7
7
|
repository = "https://github.com/Goldziher/html-to-markdown"
|
|
8
8
|
homepage = "https://github.com/Goldziher/html-to-markdown"
|
|
9
9
|
documentation = "https://docs.rs/html-to-markdown-rs"
|
|
10
10
|
readme = "README.md"
|
|
11
|
-
rust-version =
|
|
11
|
+
rust-version.workspace = true
|
|
12
12
|
description = "Ruby bindings (Magnus) for html-to-markdown - high-performance HTML to Markdown converter"
|
|
13
13
|
keywords = ["html", "markdown", "ruby", "magnus", "bindings"]
|
|
14
14
|
categories = ["api-bindings"]
|
|
@@ -21,7 +21,7 @@ crate-type = ["cdylib", "rlib"]
|
|
|
21
21
|
default = []
|
|
22
22
|
|
|
23
23
|
[dependencies]
|
|
24
|
-
html-to-markdown-rs = { version = "2.
|
|
24
|
+
html-to-markdown-rs = { version = "2.7.0", features = ["inline-images"] }
|
|
25
25
|
magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = ["rb-sys"] }
|
|
26
26
|
|
|
27
27
|
[dev-dependencies]
|
data/ext/html-to-markdown-rb/native/src/lib.rs
CHANGED
|
@@ -5,8 +5,13 @@ use html_to_markdown_rs::{
|
|
|
5
5
|
convert_with_inline_images as convert_with_inline_images_inner, error::ConversionError,
|
|
6
6
|
};
|
|
7
7
|
use magnus::prelude::*;
|
|
8
|
+
use magnus::r_hash::ForEach;
|
|
8
9
|
use magnus::{Error, RArray, RHash, Ruby, Symbol, TryConvert, Value, function, scan_args::scan_args};
|
|
9
10
|
|
|
11
|
+
/// Wrapper around a parsed `ConversionOptions`, exposed to Ruby as the
/// `HtmlToMarkdown::Options` class so the options can be built once and
/// passed back in for later conversions.
#[derive(Clone)]
|
|
12
|
+
#[magnus::wrap(class = "HtmlToMarkdown::Options", free_immediately)]
|
|
13
|
+
struct OptionsHandle(ConversionOptions);
|
|
14
|
+
|
|
10
15
|
const DEFAULT_INLINE_IMAGE_LIMIT: u64 = 5 * 1024 * 1024;
|
|
11
16
|
|
|
12
17
|
fn conversion_error(err: ConversionError) -> Error {
|
|
@@ -34,11 +39,6 @@ fn symbol_to_string(value: Value) -> Result<String, Error> {
|
|
|
34
39
|
}
|
|
35
40
|
}
|
|
36
41
|
|
|
37
|
-
fn get_kw(ruby: &Ruby, hash: RHash, name: &str) -> Option<Value> {
|
|
38
|
-
let sym = ruby.intern(name);
|
|
39
|
-
hash.get(sym).or_else(|| hash.get(name))
|
|
40
|
-
}
|
|
41
|
-
|
|
42
42
|
fn parse_heading_style(value: Value) -> Result<HeadingStyle, Error> {
|
|
43
43
|
match symbol_to_string(value)?.as_str() {
|
|
44
44
|
"underlined" => Ok(HeadingStyle::Underlined),
|
|
@@ -106,26 +106,30 @@ fn parse_vec_of_strings(value: Value) -> Result<Vec<String>, Error> {
|
|
|
106
106
|
array.to_vec::<String>()
|
|
107
107
|
}
|
|
108
108
|
|
|
109
|
-
fn parse_preprocessing_options(
|
|
109
|
+
fn parse_preprocessing_options(_ruby: &Ruby, value: Value) -> Result<PreprocessingOptions, Error> {
|
|
110
110
|
let hash = RHash::from_value(value).ok_or_else(|| arg_error("expected preprocessing to be a Hash"))?;
|
|
111
111
|
|
|
112
112
|
let mut opts = PreprocessingOptions::default();
|
|
113
113
|
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
114
|
+
hash.foreach(|key: Value, val: Value| {
|
|
115
|
+
let key_name = symbol_to_string(key)?;
|
|
116
|
+
match key_name.as_str() {
|
|
117
|
+
"enabled" => {
|
|
118
|
+
opts.enabled = bool::try_convert(val)?;
|
|
119
|
+
}
|
|
120
|
+
"preset" => {
|
|
121
|
+
opts.preset = parse_preset(val)?;
|
|
122
|
+
}
|
|
123
|
+
"remove_navigation" => {
|
|
124
|
+
opts.remove_navigation = bool::try_convert(val)?;
|
|
125
|
+
}
|
|
126
|
+
"remove_forms" => {
|
|
127
|
+
opts.remove_forms = bool::try_convert(val)?;
|
|
128
|
+
}
|
|
129
|
+
_ => {}
|
|
130
|
+
}
|
|
131
|
+
Ok(ForEach::Continue)
|
|
132
|
+
})?;
|
|
129
133
|
|
|
130
134
|
Ok(opts)
|
|
131
135
|
}
|
|
@@ -143,142 +147,119 @@ fn build_conversion_options(ruby: &Ruby, options: Option<Value>) -> Result<Conve
|
|
|
143
147
|
|
|
144
148
|
let hash = RHash::from_value(options).ok_or_else(|| arg_error("options must be provided as a Hash"))?;
|
|
145
149
|
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
150
|
+
hash.foreach(|key: Value, val: Value| {
|
|
151
|
+
let key_name = symbol_to_string(key)?;
|
|
152
|
+
match key_name.as_str() {
|
|
153
|
+
"heading_style" => {
|
|
154
|
+
opts.heading_style = parse_heading_style(val)?;
|
|
155
|
+
}
|
|
156
|
+
"list_indent_type" => {
|
|
157
|
+
opts.list_indent_type = parse_list_indent_type(val)?;
|
|
158
|
+
}
|
|
159
|
+
"list_indent_width" => {
|
|
160
|
+
opts.list_indent_width = usize::try_convert(val)?;
|
|
161
|
+
}
|
|
162
|
+
"bullets" => {
|
|
163
|
+
opts.bullets = String::try_convert(val)?;
|
|
164
|
+
}
|
|
165
|
+
"strong_em_symbol" => {
|
|
166
|
+
let value = String::try_convert(val)?;
|
|
167
|
+
let mut chars = value.chars();
|
|
168
|
+
let ch = chars
|
|
169
|
+
.next()
|
|
170
|
+
.ok_or_else(|| arg_error("strong_em_symbol must not be empty"))?;
|
|
171
|
+
if chars.next().is_some() {
|
|
172
|
+
return Err(arg_error("strong_em_symbol must be a single character"));
|
|
173
|
+
}
|
|
174
|
+
opts.strong_em_symbol = ch;
|
|
175
|
+
}
|
|
176
|
+
"escape_asterisks" => {
|
|
177
|
+
opts.escape_asterisks = bool::try_convert(val)?;
|
|
178
|
+
}
|
|
179
|
+
"escape_underscores" => {
|
|
180
|
+
opts.escape_underscores = bool::try_convert(val)?;
|
|
181
|
+
}
|
|
182
|
+
"escape_misc" => {
|
|
183
|
+
opts.escape_misc = bool::try_convert(val)?;
|
|
184
|
+
}
|
|
185
|
+
"escape_ascii" => {
|
|
186
|
+
opts.escape_ascii = bool::try_convert(val)?;
|
|
187
|
+
}
|
|
188
|
+
"code_language" => {
|
|
189
|
+
opts.code_language = String::try_convert(val)?;
|
|
190
|
+
}
|
|
191
|
+
"autolinks" => {
|
|
192
|
+
opts.autolinks = bool::try_convert(val)?;
|
|
193
|
+
}
|
|
194
|
+
"default_title" => {
|
|
195
|
+
opts.default_title = bool::try_convert(val)?;
|
|
196
|
+
}
|
|
197
|
+
"br_in_tables" => {
|
|
198
|
+
opts.br_in_tables = bool::try_convert(val)?;
|
|
199
|
+
}
|
|
200
|
+
"hocr_spatial_tables" => {
|
|
201
|
+
opts.hocr_spatial_tables = bool::try_convert(val)?;
|
|
202
|
+
}
|
|
203
|
+
"highlight_style" => {
|
|
204
|
+
opts.highlight_style = parse_highlight_style(val)?;
|
|
205
|
+
}
|
|
206
|
+
"extract_metadata" => {
|
|
207
|
+
opts.extract_metadata = bool::try_convert(val)?;
|
|
208
|
+
}
|
|
209
|
+
"whitespace_mode" => {
|
|
210
|
+
opts.whitespace_mode = parse_whitespace_mode(val)?;
|
|
211
|
+
}
|
|
212
|
+
"strip_newlines" => {
|
|
213
|
+
opts.strip_newlines = bool::try_convert(val)?;
|
|
214
|
+
}
|
|
215
|
+
"wrap" => {
|
|
216
|
+
opts.wrap = bool::try_convert(val)?;
|
|
217
|
+
}
|
|
218
|
+
"wrap_width" => {
|
|
219
|
+
opts.wrap_width = usize::try_convert(val)?;
|
|
220
|
+
}
|
|
221
|
+
"convert_as_inline" => {
|
|
222
|
+
opts.convert_as_inline = bool::try_convert(val)?;
|
|
223
|
+
}
|
|
224
|
+
"sub_symbol" => {
|
|
225
|
+
opts.sub_symbol = String::try_convert(val)?;
|
|
226
|
+
}
|
|
227
|
+
"sup_symbol" => {
|
|
228
|
+
opts.sup_symbol = String::try_convert(val)?;
|
|
229
|
+
}
|
|
230
|
+
"newline_style" => {
|
|
231
|
+
opts.newline_style = parse_newline_style(val)?;
|
|
232
|
+
}
|
|
233
|
+
"code_block_style" => {
|
|
234
|
+
opts.code_block_style = parse_code_block_style(val)?;
|
|
235
|
+
}
|
|
236
|
+
"keep_inline_images_in" => {
|
|
237
|
+
opts.keep_inline_images_in = parse_vec_of_strings(val)?;
|
|
238
|
+
}
|
|
239
|
+
"preprocessing" => {
|
|
240
|
+
opts.preprocessing = parse_preprocessing_options(ruby, val)?;
|
|
241
|
+
}
|
|
242
|
+
"encoding" => {
|
|
243
|
+
opts.encoding = String::try_convert(val)?;
|
|
244
|
+
}
|
|
245
|
+
"debug" => {
|
|
246
|
+
opts.debug = bool::try_convert(val)?;
|
|
247
|
+
}
|
|
248
|
+
"strip_tags" => {
|
|
249
|
+
opts.strip_tags = parse_vec_of_strings(val)?;
|
|
250
|
+
}
|
|
251
|
+
"preserve_tags" => {
|
|
252
|
+
opts.preserve_tags = parse_vec_of_strings(val)?;
|
|
253
|
+
}
|
|
254
|
+
_ => {}
|
|
170
255
|
}
|
|
171
|
-
|
|
172
|
-
}
|
|
173
|
-
|
|
174
|
-
if let Some(escape_asterisks) = get_kw(ruby, hash, "escape_asterisks") {
|
|
175
|
-
opts.escape_asterisks = bool::try_convert(escape_asterisks)?;
|
|
176
|
-
}
|
|
177
|
-
|
|
178
|
-
if let Some(escape_underscores) = get_kw(ruby, hash, "escape_underscores") {
|
|
179
|
-
opts.escape_underscores = bool::try_convert(escape_underscores)?;
|
|
180
|
-
}
|
|
181
|
-
|
|
182
|
-
if let Some(escape_misc) = get_kw(ruby, hash, "escape_misc") {
|
|
183
|
-
opts.escape_misc = bool::try_convert(escape_misc)?;
|
|
184
|
-
}
|
|
185
|
-
|
|
186
|
-
if let Some(escape_ascii) = get_kw(ruby, hash, "escape_ascii") {
|
|
187
|
-
opts.escape_ascii = bool::try_convert(escape_ascii)?;
|
|
188
|
-
}
|
|
189
|
-
|
|
190
|
-
if let Some(code_language) = get_kw(ruby, hash, "code_language") {
|
|
191
|
-
opts.code_language = String::try_convert(code_language)?;
|
|
192
|
-
}
|
|
193
|
-
|
|
194
|
-
if let Some(autolinks) = get_kw(ruby, hash, "autolinks") {
|
|
195
|
-
opts.autolinks = bool::try_convert(autolinks)?;
|
|
196
|
-
}
|
|
197
|
-
|
|
198
|
-
if let Some(default_title) = get_kw(ruby, hash, "default_title") {
|
|
199
|
-
opts.default_title = bool::try_convert(default_title)?;
|
|
200
|
-
}
|
|
201
|
-
|
|
202
|
-
if let Some(br_in_tables) = get_kw(ruby, hash, "br_in_tables") {
|
|
203
|
-
opts.br_in_tables = bool::try_convert(br_in_tables)?;
|
|
204
|
-
}
|
|
205
|
-
|
|
206
|
-
if let Some(hocr_spatial_tables) = get_kw(ruby, hash, "hocr_spatial_tables") {
|
|
207
|
-
opts.hocr_spatial_tables = bool::try_convert(hocr_spatial_tables)?;
|
|
208
|
-
}
|
|
209
|
-
|
|
210
|
-
if let Some(highlight_style) = get_kw(ruby, hash, "highlight_style") {
|
|
211
|
-
opts.highlight_style = parse_highlight_style(highlight_style)?;
|
|
212
|
-
}
|
|
213
|
-
|
|
214
|
-
if let Some(extract_metadata) = get_kw(ruby, hash, "extract_metadata") {
|
|
215
|
-
opts.extract_metadata = bool::try_convert(extract_metadata)?;
|
|
216
|
-
}
|
|
217
|
-
|
|
218
|
-
if let Some(whitespace_mode) = get_kw(ruby, hash, "whitespace_mode") {
|
|
219
|
-
opts.whitespace_mode = parse_whitespace_mode(whitespace_mode)?;
|
|
220
|
-
}
|
|
221
|
-
|
|
222
|
-
if let Some(strip_newlines) = get_kw(ruby, hash, "strip_newlines") {
|
|
223
|
-
opts.strip_newlines = bool::try_convert(strip_newlines)?;
|
|
224
|
-
}
|
|
225
|
-
|
|
226
|
-
if let Some(wrap) = get_kw(ruby, hash, "wrap") {
|
|
227
|
-
opts.wrap = bool::try_convert(wrap)?;
|
|
228
|
-
}
|
|
229
|
-
|
|
230
|
-
if let Some(wrap_width) = get_kw(ruby, hash, "wrap_width") {
|
|
231
|
-
opts.wrap_width = usize::try_convert(wrap_width)?;
|
|
232
|
-
}
|
|
233
|
-
|
|
234
|
-
if let Some(convert_as_inline) = get_kw(ruby, hash, "convert_as_inline") {
|
|
235
|
-
opts.convert_as_inline = bool::try_convert(convert_as_inline)?;
|
|
236
|
-
}
|
|
237
|
-
|
|
238
|
-
if let Some(sub_symbol) = get_kw(ruby, hash, "sub_symbol") {
|
|
239
|
-
opts.sub_symbol = String::try_convert(sub_symbol)?;
|
|
240
|
-
}
|
|
241
|
-
|
|
242
|
-
if let Some(sup_symbol) = get_kw(ruby, hash, "sup_symbol") {
|
|
243
|
-
opts.sup_symbol = String::try_convert(sup_symbol)?;
|
|
244
|
-
}
|
|
245
|
-
|
|
246
|
-
if let Some(newline_style) = get_kw(ruby, hash, "newline_style") {
|
|
247
|
-
opts.newline_style = parse_newline_style(newline_style)?;
|
|
248
|
-
}
|
|
249
|
-
|
|
250
|
-
if let Some(code_block_style) = get_kw(ruby, hash, "code_block_style") {
|
|
251
|
-
opts.code_block_style = parse_code_block_style(code_block_style)?;
|
|
252
|
-
}
|
|
253
|
-
|
|
254
|
-
if let Some(keep_inline_images_in) = get_kw(ruby, hash, "keep_inline_images_in") {
|
|
255
|
-
opts.keep_inline_images_in = parse_vec_of_strings(keep_inline_images_in)?;
|
|
256
|
-
}
|
|
257
|
-
|
|
258
|
-
if let Some(preprocessing) = get_kw(ruby, hash, "preprocessing") {
|
|
259
|
-
opts.preprocessing = parse_preprocessing_options(ruby, preprocessing)?;
|
|
260
|
-
}
|
|
261
|
-
|
|
262
|
-
if let Some(encoding) = get_kw(ruby, hash, "encoding") {
|
|
263
|
-
opts.encoding = String::try_convert(encoding)?;
|
|
264
|
-
}
|
|
265
|
-
|
|
266
|
-
if let Some(debug) = get_kw(ruby, hash, "debug") {
|
|
267
|
-
opts.debug = bool::try_convert(debug)?;
|
|
268
|
-
}
|
|
269
|
-
|
|
270
|
-
if let Some(strip_tags) = get_kw(ruby, hash, "strip_tags") {
|
|
271
|
-
opts.strip_tags = parse_vec_of_strings(strip_tags)?;
|
|
272
|
-
}
|
|
273
|
-
|
|
274
|
-
if let Some(preserve_tags) = get_kw(ruby, hash, "preserve_tags") {
|
|
275
|
-
opts.preserve_tags = parse_vec_of_strings(preserve_tags)?;
|
|
276
|
-
}
|
|
256
|
+
Ok(ForEach::Continue)
|
|
257
|
+
})?;
|
|
277
258
|
|
|
278
259
|
Ok(opts)
|
|
279
260
|
}
|
|
280
261
|
|
|
281
|
-
fn build_inline_image_config(
|
|
262
|
+
fn build_inline_image_config(_ruby: &Ruby, config: Option<Value>) -> Result<InlineImageConfig, Error> {
|
|
282
263
|
let mut cfg = InlineImageConfig::new(DEFAULT_INLINE_IMAGE_LIMIT);
|
|
283
264
|
|
|
284
265
|
let Some(config) = config else {
|
|
@@ -291,25 +272,29 @@ fn build_inline_image_config(ruby: &Ruby, config: Option<Value>) -> Result<Inlin
|
|
|
291
272
|
|
|
292
273
|
let hash = RHash::from_value(config).ok_or_else(|| arg_error("inline image config must be provided as a Hash"))?;
|
|
293
274
|
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
275
|
+
hash.foreach(|key: Value, val: Value| {
|
|
276
|
+
let key_name = symbol_to_string(key)?;
|
|
277
|
+
match key_name.as_str() {
|
|
278
|
+
"max_decoded_size_bytes" => {
|
|
279
|
+
cfg.max_decoded_size_bytes = u64::try_convert(val)?;
|
|
280
|
+
}
|
|
281
|
+
"filename_prefix" => {
|
|
282
|
+
cfg.filename_prefix = if val.is_nil() {
|
|
283
|
+
None
|
|
284
|
+
} else {
|
|
285
|
+
Some(String::try_convert(val)?)
|
|
286
|
+
};
|
|
287
|
+
}
|
|
288
|
+
"capture_svg" => {
|
|
289
|
+
cfg.capture_svg = bool::try_convert(val)?;
|
|
290
|
+
}
|
|
291
|
+
"infer_dimensions" => {
|
|
292
|
+
cfg.infer_dimensions = bool::try_convert(val)?;
|
|
293
|
+
}
|
|
294
|
+
_ => {}
|
|
295
|
+
}
|
|
296
|
+
Ok(ForEach::Continue)
|
|
297
|
+
})?;
|
|
313
298
|
|
|
314
299
|
Ok(cfg)
|
|
315
300
|
}
|
|
@@ -408,6 +393,19 @@ fn convert_fn(ruby: &Ruby, args: &[Value]) -> Result<String, Error> {
|
|
|
408
393
|
convert_inner(&html, Some(options)).map_err(conversion_error)
|
|
409
394
|
}
|
|
410
395
|
|
|
396
|
+
fn options_handle_fn(ruby: &Ruby, args: &[Value]) -> Result<OptionsHandle, Error> {
|
|
397
|
+
let parsed = scan_args::<(), (Option<Value>,), (), (), (), ()>(args)?;
|
|
398
|
+
let options = build_conversion_options(ruby, parsed.optional.0)?;
|
|
399
|
+
Ok(OptionsHandle(options))
|
|
400
|
+
}
|
|
401
|
+
|
|
402
|
+
fn convert_with_options_handle_fn(_ruby: &Ruby, args: &[Value]) -> Result<String, Error> {
|
|
403
|
+
let parsed = scan_args::<(String, &OptionsHandle), (), (), (), (), ()>(args)?;
|
|
404
|
+
let html = parsed.required.0;
|
|
405
|
+
let handle = parsed.required.1;
|
|
406
|
+
convert_inner(&html, Some(handle.0.clone())).map_err(conversion_error)
|
|
407
|
+
}
|
|
408
|
+
|
|
411
409
|
fn convert_with_inline_images_fn(ruby: &Ruby, args: &[Value]) -> Result<Value, Error> {
|
|
412
410
|
let parsed = scan_args::<(String,), (Option<Value>, Option<Value>), (), (), (), ()>(args)?;
|
|
413
411
|
let html = parsed.required.0;
|
|
@@ -423,6 +421,8 @@ fn convert_with_inline_images_fn(ruby: &Ruby, args: &[Value]) -> Result<Value, E
|
|
|
423
421
|
fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
424
422
|
let module = ruby.define_module("HtmlToMarkdown")?;
|
|
425
423
|
module.define_singleton_method("convert", function!(convert_fn, -1))?;
|
|
424
|
+
module.define_singleton_method("options", function!(options_handle_fn, -1))?;
|
|
425
|
+
module.define_singleton_method("convert_with_options", function!(convert_with_options_handle_fn, -1))?;
|
|
426
426
|
module.define_singleton_method(
|
|
427
427
|
"convert_with_inline_images",
|
|
428
428
|
function!(convert_with_inline_images_fn, -1),
|
data/lib/html_to_markdown.rb
CHANGED
|
@@ -7,9 +7,13 @@ module HtmlToMarkdown
|
|
|
7
7
|
autoload :CLI, 'html_to_markdown/cli'
|
|
8
8
|
autoload :CLIProxy, 'html_to_markdown/cli_proxy'
|
|
9
9
|
|
|
10
|
+
# Class of the reusable options handles returned by HtmlToMarkdown.options.
# NOTE(review): presumably declared empty here because the native extension
# supplies the wrapped data for "HtmlToMarkdown::Options" — confirm.
class Options; end # rubocop:disable Lint/EmptyClass
|
|
11
|
+
|
|
10
12
|
class << self
|
|
11
13
|
alias native_convert convert
|
|
12
14
|
alias native_convert_with_inline_images convert_with_inline_images
|
|
15
|
+
alias native_options options
|
|
16
|
+
alias native_convert_with_options convert_with_options
|
|
13
17
|
end
|
|
14
18
|
|
|
15
19
|
module_function
|
|
@@ -18,7 +22,15 @@ module HtmlToMarkdown
|
|
|
18
22
|
native_convert(html.to_s, options)
|
|
19
23
|
end
|
|
20
24
|
|
|
25
|
+
# Converts +html+ using a previously built options handle.
#
# @param html [#to_s] markup to convert; coerced with +to_s+
# @param options_handle [HtmlToMarkdown::Options] handle from HtmlToMarkdown.options
# @return the value returned by the native conversion
def convert_with_options(html, options_handle)
  markup = html.to_s
  native_convert_with_options(markup, options_handle)
end
|
|
28
|
+
|
|
21
29
|
def convert_with_inline_images(html, options = nil, image_config = nil)
|
|
22
30
|
native_convert_with_inline_images(html.to_s, options, image_config)
|
|
23
31
|
end
|
|
32
|
+
|
|
33
|
+
# Builds a reusable options handle by delegating to the native implementation.
#
# @param options_hash [Hash, nil] conversion options; nil is passed through as-is
# @return the handle produced by the native extension
def options(options_hash = nil)
  native_options(options_hash)
end
|
|
24
36
|
end
|
data/spec/convert_spec.rb
CHANGED
|
@@ -26,4 +26,13 @@ RSpec.describe HtmlToMarkdown do
|
|
|
26
26
|
expect(extraction[:inline_images].first[:description]).to eq('fake')
|
|
27
27
|
end
|
|
28
28
|
end
|
|
29
|
+
|
|
30
|
+
# Covers building an options handle once and using it for a conversion.
describe '.options' do
  it 'returns a reusable options handle' do
    reusable = described_class.options(heading_style: :atx_closed)
    expect(reusable).to be_a(HtmlToMarkdown::Options)

    markdown = described_class.convert_with_options('<h1>Hello</h1>', reusable)
    expect(markdown).to include('# Hello #')
  end
end
|
|
29
38
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html-to-markdown
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.6.6
|
|
4
|
+
version: 2.7.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2025-11-
|
|
11
|
+
date: 2025-11-11 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|
|
@@ -47,6 +47,7 @@ files:
|
|
|
47
47
|
- Gemfile.lock
|
|
48
48
|
- README.md
|
|
49
49
|
- Rakefile
|
|
50
|
+
- bin/benchmark.rb
|
|
50
51
|
- exe/html-to-markdown
|
|
51
52
|
- ext/html-to-markdown-rb/extconf.rb
|
|
52
53
|
- ext/html-to-markdown-rb/native/Cargo.toml
|