html-to-markdown 2.15.0 → 2.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +9 -9
- data/bin/benchmark.rb +58 -8
- data/ext/html-to-markdown-rb/extconf.rb +3 -0
- data/ext/html-to-markdown-rb/native/Cargo.toml +9 -6
- data/ext/html-to-markdown-rb/native/README.md +15 -0
- data/ext/html-to-markdown-rb/native/src/lib.rs +130 -119
- data/ext/html-to-markdown-rb/native/src/profiling.rs +211 -0
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +10 -0
- data/sig/html_to_markdown.rbs +40 -0
- metadata +3 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 98fb6b9702672ba55302719d35311f7a38bcd020a9a7830da656f38dc527c071
|
|
4
|
+
data.tar.gz: 91fb49273a53f2c9d5dcbb592a0c5ac8c995f4e15919ea0fc8bf7b2905d418d3
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 95a0de0a0486737f904d7504a2f45b7112021a2777abba28e3df1b64560337d1b1f08978c02a3b6b3e8e5a71cac2fefa9318cecc44591512c3ea9cea149551c2
|
|
7
|
+
data.tar.gz: 5f66690cc665c690083ce01e210eb164f839c6141bb8a1ef94b18bbdc5862cde769381179db20362b0162982128d1b60097abfaccf20cf7ba1f7187d20d1038e
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
|
@@ -69,18 +69,18 @@ Apple M4 • Real Wikipedia documents • `HtmlToMarkdown.convert` (Ruby)
|
|
|
69
69
|
|
|
70
70
|
### Benchmark Fixtures (Apple M4)
|
|
71
71
|
|
|
72
|
-
Measured via `task bench:
|
|
72
|
+
Measured via `task bench:harness` with the shared Wikipedia + hOCR suite:
|
|
73
73
|
|
|
74
74
|
| Document | Size | ops/sec (Ruby) |
|
|
75
75
|
| ---------------------- | ------ | -------------- |
|
|
76
|
-
| Lists (Timeline) | 129 KB |
|
|
77
|
-
| Tables (Countries) | 360 KB |
|
|
78
|
-
| Medium (Python) | 657 KB |
|
|
79
|
-
| Large (Rust) | 567 KB |
|
|
80
|
-
| Small (Intro) | 463 KB |
|
|
81
|
-
| hOCR German PDF | 44 KB |
|
|
82
|
-
| hOCR Invoice | 4 KB |
|
|
83
|
-
| hOCR Embedded Tables | 37 KB |
|
|
76
|
+
| Lists (Timeline) | 129 KB | 3,156 |
|
|
77
|
+
| Tables (Countries) | 360 KB | 921 |
|
|
78
|
+
| Medium (Python) | 657 KB | 469 |
|
|
79
|
+
| Large (Rust) | 567 KB | 534 |
|
|
80
|
+
| Small (Intro) | 463 KB | 629 |
|
|
81
|
+
| hOCR German PDF | 44 KB | 7,250 |
|
|
82
|
+
| hOCR Invoice | 4 KB | 83,883 |
|
|
83
|
+
| hOCR Embedded Tables | 37 KB | 7,890 |
|
|
84
84
|
|
|
85
85
|
> These numbers line up with the Python/Node bindings because everything flows through the same Rust engine.
|
|
86
86
|
|
data/bin/benchmark.rb
CHANGED
|
@@ -22,7 +22,8 @@ end
|
|
|
22
22
|
|
|
23
23
|
options = {
|
|
24
24
|
iterations: 50,
|
|
25
|
-
format: 'html'
|
|
25
|
+
format: 'html',
|
|
26
|
+
scenario: 'convert-default'
|
|
26
27
|
}
|
|
27
28
|
|
|
28
29
|
OptionParser.new do |parser|
|
|
@@ -36,6 +37,10 @@ OptionParser.new do |parser|
|
|
|
36
37
|
options[:iterations] = n.positive? ? n : 1
|
|
37
38
|
end
|
|
38
39
|
|
|
40
|
+
parser.on('--scenario SCENARIO', 'Scenario to benchmark') do |scenario|
|
|
41
|
+
options[:scenario] = scenario
|
|
42
|
+
end
|
|
43
|
+
|
|
39
44
|
parser.on('--format FORMAT', 'Fixture format (html or hocr)') do |format|
|
|
40
45
|
options[:format] = format.downcase
|
|
41
46
|
end
|
|
@@ -56,24 +61,68 @@ unless %w[html hocr].include?(options[:format])
|
|
|
56
61
|
exit 1
|
|
57
62
|
end
|
|
58
63
|
|
|
64
|
+
supported_scenarios = %w[
|
|
65
|
+
convert-default
|
|
66
|
+
convert-options
|
|
67
|
+
inline-images-default
|
|
68
|
+
inline-images-options
|
|
69
|
+
metadata-default
|
|
70
|
+
metadata-options
|
|
71
|
+
]
|
|
72
|
+
unless supported_scenarios.include?(options[:scenario])
|
|
73
|
+
warn "Unsupported scenario: #{options[:scenario]}"
|
|
74
|
+
exit 1
|
|
75
|
+
end
|
|
76
|
+
|
|
59
77
|
html = File.binread(fixture)
|
|
60
78
|
html.force_encoding(Encoding::UTF_8)
|
|
61
79
|
html.freeze
|
|
62
80
|
iterations = options[:iterations]
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
)
|
|
81
|
+
conversion_options = options[:format] == 'hocr' ? { hocr_spatial_tables: false } : {}
|
|
82
|
+
options_handle = if %w[convert-options inline-images-options metadata-options].include?(options[:scenario])
|
|
83
|
+
HtmlToMarkdown.options(conversion_options)
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
SCENARIO_RUNNERS = {
|
|
87
|
+
'convert-default' => ->(html, _options, _handle) { HtmlToMarkdown.convert(html) },
|
|
88
|
+
'convert-options' => lambda do |html, _options, handle|
|
|
89
|
+
raise ArgumentError, 'options handle required' unless handle
|
|
90
|
+
|
|
91
|
+
HtmlToMarkdown.convert_with_options(html, handle)
|
|
92
|
+
end,
|
|
93
|
+
'inline-images-default' => ->(html, _options, _handle) { HtmlToMarkdown.convert_with_inline_images(html, nil, nil) },
|
|
94
|
+
'inline-images-options' => lambda do |html, _options, handle|
|
|
95
|
+
raise ArgumentError, 'options handle required' unless handle
|
|
96
|
+
|
|
97
|
+
HtmlToMarkdown.convert_with_inline_images_handle(html, handle, nil)
|
|
98
|
+
end,
|
|
99
|
+
'metadata-default' => ->(html, _options, _handle) { HtmlToMarkdown.convert_with_metadata(html, nil, nil) },
|
|
100
|
+
'metadata-options' => lambda do |html, _options, handle|
|
|
101
|
+
raise ArgumentError, 'options handle required' unless handle
|
|
102
|
+
|
|
103
|
+
HtmlToMarkdown.convert_with_metadata_handle(html, handle, nil)
|
|
104
|
+
end
|
|
105
|
+
}.freeze
|
|
66
106
|
|
|
67
|
-
def
|
|
68
|
-
|
|
107
|
+
def run_scenario(html, scenario, options, handle)
|
|
108
|
+
runner = SCENARIO_RUNNERS.fetch(scenario) { raise ArgumentError, "Unsupported scenario: #{scenario}" }
|
|
109
|
+
runner.call(html, options, handle)
|
|
69
110
|
end
|
|
70
111
|
|
|
71
|
-
|
|
112
|
+
run_scenario(html, options[:scenario], conversion_options, options_handle)
|
|
113
|
+
|
|
114
|
+
profile_output = ENV.fetch('HTML_TO_MARKDOWN_PROFILE_OUTPUT', nil)
|
|
115
|
+
if profile_output && HtmlToMarkdown.respond_to?(:start_profiling)
|
|
116
|
+
freq = Integer(ENV.fetch('HTML_TO_MARKDOWN_PROFILE_FREQUENCY', '1000'), 10)
|
|
117
|
+
HtmlToMarkdown.start_profiling(profile_output, freq)
|
|
118
|
+
end
|
|
72
119
|
|
|
73
120
|
start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
74
|
-
iterations.times {
|
|
121
|
+
iterations.times { run_scenario(html, options[:scenario], conversion_options, options_handle) }
|
|
75
122
|
elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start
|
|
76
123
|
|
|
124
|
+
HtmlToMarkdown.stop_profiling if profile_output && HtmlToMarkdown.respond_to?(:stop_profiling)
|
|
125
|
+
|
|
77
126
|
payload_size_bytes = html.bytesize
|
|
78
127
|
bytes_processed = payload_size_bytes * iterations
|
|
79
128
|
ops_per_sec = iterations / elapsed
|
|
@@ -83,6 +132,7 @@ payload = %({
|
|
|
83
132
|
"language":"ruby",
|
|
84
133
|
"fixture":"#{json_escape(File.basename(fixture))}",
|
|
85
134
|
"fixture_path":"#{json_escape(fixture)}",
|
|
135
|
+
"scenario":"#{json_escape(options[:scenario])}",
|
|
86
136
|
"iterations":#{iterations},
|
|
87
137
|
"elapsed_seconds":#{format('%.8f', elapsed)},
|
|
88
138
|
"ops_per_sec":#{format('%.4f', ops_per_sec)},
|
|
@@ -25,6 +25,9 @@ default_profile = ENV.fetch('CARGO_PROFILE', 'release')
|
|
|
25
25
|
|
|
26
26
|
create_rust_makefile('html_to_markdown_rb') do |config|
|
|
27
27
|
config.profile = default_profile.to_sym
|
|
28
|
+
features_env = ENV.fetch('HTML_TO_MARKDOWN_CARGO_FEATURES', '')
|
|
29
|
+
features = features_env.split(',').map(&:strip).reject(&:empty?)
|
|
30
|
+
config.features = features unless features.empty?
|
|
28
31
|
|
|
29
32
|
native_dir = File.expand_path('native', __dir__)
|
|
30
33
|
relative_native =
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "html-to-markdown-rb"
|
|
3
|
-
version = "2.
|
|
3
|
+
version = "2.16.0"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
6
6
|
license = "MIT"
|
|
@@ -17,13 +17,16 @@ categories = ["api-bindings"]
|
|
|
17
17
|
name = "html_to_markdown_rb"
|
|
18
18
|
crate-type = ["cdylib", "rlib"]
|
|
19
19
|
|
|
20
|
-
[features]
|
|
21
|
-
default = ["metadata"]
|
|
22
|
-
metadata = ["html-to-markdown-rs/metadata"]
|
|
23
|
-
|
|
24
20
|
[dependencies]
|
|
25
|
-
html-to-markdown-rs = { version = "2.
|
|
21
|
+
html-to-markdown-rs = { version = "2.16.0", features = ["inline-images"] }
|
|
26
22
|
magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = ["rb-sys"] }
|
|
27
23
|
|
|
24
|
+
[target.'cfg(not(target_os = "windows"))'.dependencies]
|
|
25
|
+
pprof = { version = "0.15", features = ["flamegraph"], optional = true }
|
|
26
|
+
|
|
28
27
|
[dev-dependencies]
|
|
29
28
|
pretty_assertions = "1.4"
|
|
29
|
+
[features]
|
|
30
|
+
default = ["metadata"]
|
|
31
|
+
metadata = ["html-to-markdown-rs/metadata"]
|
|
32
|
+
profiling = ["dep:pprof"]
|
|
@@ -62,6 +62,21 @@ Apple M4 • Real Wikipedia documents • `HtmlToMarkdown.convert` (Ruby)
|
|
|
62
62
|
|
|
63
63
|
> Same core, same benchmarks: the Ruby extension stays within single-digit % of the Rust CLI and mirrors the Python/Node numbers.
|
|
64
64
|
|
|
65
|
+
### Benchmark Fixtures (Apple M4)
|
|
66
|
+
|
|
67
|
+
Measured via `task bench:harness` with the shared Wikipedia + hOCR suite:
|
|
68
|
+
|
|
69
|
+
| Document | Size | ops/sec (Ruby) |
|
|
70
|
+
| ---------------------- | ------ | -------------- |
|
|
71
|
+
| Lists (Timeline) | 129 KB | 3,156 |
|
|
72
|
+
| Tables (Countries) | 360 KB | 921 |
|
|
73
|
+
| Medium (Python) | 657 KB | 469 |
|
|
74
|
+
| Large (Rust) | 567 KB | 534 |
|
|
75
|
+
| Small (Intro) | 463 KB | 629 |
|
|
76
|
+
| hOCR German PDF | 44 KB | 7,250 |
|
|
77
|
+
| hOCR Invoice | 4 KB | 83,883 |
|
|
78
|
+
| hOCR Embedded Tables | 37 KB | 7,890 |
|
|
79
|
+
|
|
65
80
|
## Quick Start
|
|
66
81
|
|
|
67
82
|
```ruby
|
|
@@ -1,30 +1,29 @@
|
|
|
1
1
|
use html_to_markdown_rs::{
|
|
2
|
-
CodeBlockStyle, ConversionOptions,
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
convert_with_inline_images as convert_with_inline_images_inner, error::ConversionError,
|
|
2
|
+
CodeBlockStyle, ConversionOptions, ConversionOptionsUpdate, DEFAULT_INLINE_IMAGE_LIMIT, HeadingStyle,
|
|
3
|
+
HighlightStyle, HtmlExtraction, InlineImage, InlineImageConfig, InlineImageConfigUpdate, InlineImageWarning,
|
|
4
|
+
ListIndentType, NewlineStyle, PreprocessingOptionsUpdate, PreprocessingPreset, WhitespaceMode,
|
|
5
|
+
convert as convert_inner, convert_with_inline_images as convert_with_inline_images_inner, error::ConversionError,
|
|
6
|
+
safety::guard_panic,
|
|
6
7
|
};
|
|
7
8
|
|
|
8
9
|
#[cfg(feature = "metadata")]
|
|
9
10
|
use html_to_markdown_rs::convert_with_metadata as convert_with_metadata_inner;
|
|
11
|
+
mod profiling;
|
|
10
12
|
#[cfg(feature = "metadata")]
|
|
11
13
|
use html_to_markdown_rs::metadata::{
|
|
12
14
|
DocumentMetadata as RustDocumentMetadata, ExtendedMetadata as RustExtendedMetadata,
|
|
13
|
-
HeaderMetadata as RustHeaderMetadata, ImageMetadata as RustImageMetadata,
|
|
14
|
-
|
|
15
|
-
StructuredData as RustStructuredData, StructuredDataType as RustStructuredDataType,
|
|
16
|
-
TextDirection as RustTextDirection,
|
|
15
|
+
HeaderMetadata as RustHeaderMetadata, ImageMetadata as RustImageMetadata, LinkMetadata as RustLinkMetadata,
|
|
16
|
+
MetadataConfig as RustMetadataConfig, StructuredData as RustStructuredData, TextDirection as RustTextDirection,
|
|
17
17
|
};
|
|
18
18
|
use magnus::prelude::*;
|
|
19
19
|
use magnus::r_hash::ForEach;
|
|
20
20
|
use magnus::{Error, RArray, RHash, Ruby, Symbol, TryConvert, Value, function, scan_args::scan_args};
|
|
21
|
+
use std::path::PathBuf;
|
|
21
22
|
|
|
22
23
|
#[derive(Clone)]
|
|
23
24
|
#[magnus::wrap(class = "HtmlToMarkdown::Options", free_immediately)]
|
|
24
25
|
struct OptionsHandle(ConversionOptions);
|
|
25
26
|
|
|
26
|
-
const DEFAULT_INLINE_IMAGE_LIMIT: u64 = 5 * 1024 * 1024;
|
|
27
|
-
|
|
28
27
|
fn conversion_error(err: ConversionError) -> Error {
|
|
29
28
|
match err {
|
|
30
29
|
ConversionError::ConfigError(msg) => arg_error(msg),
|
|
@@ -120,43 +119,43 @@ fn parse_vec_of_strings(value: Value) -> Result<Vec<String>, Error> {
|
|
|
120
119
|
array.to_vec::<String>()
|
|
121
120
|
}
|
|
122
121
|
|
|
123
|
-
fn parse_preprocessing_options(_ruby: &Ruby, value: Value) -> Result<
|
|
122
|
+
fn parse_preprocessing_options(_ruby: &Ruby, value: Value) -> Result<PreprocessingOptionsUpdate, Error> {
|
|
124
123
|
let hash = RHash::from_value(value).ok_or_else(|| arg_error("expected preprocessing to be a Hash"))?;
|
|
125
124
|
|
|
126
|
-
let mut
|
|
125
|
+
let mut update = PreprocessingOptionsUpdate::default();
|
|
127
126
|
|
|
128
127
|
hash.foreach(|key: Value, val: Value| {
|
|
129
128
|
let key_name = symbol_to_string(key)?;
|
|
130
129
|
match key_name.as_str() {
|
|
131
130
|
"enabled" => {
|
|
132
|
-
|
|
131
|
+
update.enabled = Some(bool::try_convert(val)?);
|
|
133
132
|
}
|
|
134
133
|
"preset" => {
|
|
135
|
-
|
|
134
|
+
update.preset = Some(parse_preset(val)?);
|
|
136
135
|
}
|
|
137
136
|
"remove_navigation" => {
|
|
138
|
-
|
|
137
|
+
update.remove_navigation = Some(bool::try_convert(val)?);
|
|
139
138
|
}
|
|
140
139
|
"remove_forms" => {
|
|
141
|
-
|
|
140
|
+
update.remove_forms = Some(bool::try_convert(val)?);
|
|
142
141
|
}
|
|
143
142
|
_ => {}
|
|
144
143
|
}
|
|
145
144
|
Ok(ForEach::Continue)
|
|
146
145
|
})?;
|
|
147
146
|
|
|
148
|
-
Ok(
|
|
147
|
+
Ok(update)
|
|
149
148
|
}
|
|
150
149
|
|
|
151
150
|
fn build_conversion_options(ruby: &Ruby, options: Option<Value>) -> Result<ConversionOptions, Error> {
|
|
152
|
-
let mut
|
|
151
|
+
let mut update = ConversionOptionsUpdate::default();
|
|
153
152
|
|
|
154
153
|
let Some(options) = options else {
|
|
155
|
-
return Ok(
|
|
154
|
+
return Ok(ConversionOptions::default());
|
|
156
155
|
};
|
|
157
156
|
|
|
158
157
|
if options.is_nil() {
|
|
159
|
-
return Ok(
|
|
158
|
+
return Ok(ConversionOptions::default());
|
|
160
159
|
}
|
|
161
160
|
|
|
162
161
|
let hash = RHash::from_value(options).ok_or_else(|| arg_error("options must be provided as a Hash"))?;
|
|
@@ -165,16 +164,16 @@ fn build_conversion_options(ruby: &Ruby, options: Option<Value>) -> Result<Conve
|
|
|
165
164
|
let key_name = symbol_to_string(key)?;
|
|
166
165
|
match key_name.as_str() {
|
|
167
166
|
"heading_style" => {
|
|
168
|
-
|
|
167
|
+
update.heading_style = Some(parse_heading_style(val)?);
|
|
169
168
|
}
|
|
170
169
|
"list_indent_type" => {
|
|
171
|
-
|
|
170
|
+
update.list_indent_type = Some(parse_list_indent_type(val)?);
|
|
172
171
|
}
|
|
173
172
|
"list_indent_width" => {
|
|
174
|
-
|
|
173
|
+
update.list_indent_width = Some(usize::try_convert(val)?);
|
|
175
174
|
}
|
|
176
175
|
"bullets" => {
|
|
177
|
-
|
|
176
|
+
update.bullets = Some(String::try_convert(val)?);
|
|
178
177
|
}
|
|
179
178
|
"strong_em_symbol" => {
|
|
180
179
|
let value = String::try_convert(val)?;
|
|
@@ -185,103 +184,103 @@ fn build_conversion_options(ruby: &Ruby, options: Option<Value>) -> Result<Conve
|
|
|
185
184
|
if chars.next().is_some() {
|
|
186
185
|
return Err(arg_error("strong_em_symbol must be a single character"));
|
|
187
186
|
}
|
|
188
|
-
|
|
187
|
+
update.strong_em_symbol = Some(ch);
|
|
189
188
|
}
|
|
190
189
|
"escape_asterisks" => {
|
|
191
|
-
|
|
190
|
+
update.escape_asterisks = Some(bool::try_convert(val)?);
|
|
192
191
|
}
|
|
193
192
|
"escape_underscores" => {
|
|
194
|
-
|
|
193
|
+
update.escape_underscores = Some(bool::try_convert(val)?);
|
|
195
194
|
}
|
|
196
195
|
"escape_misc" => {
|
|
197
|
-
|
|
196
|
+
update.escape_misc = Some(bool::try_convert(val)?);
|
|
198
197
|
}
|
|
199
198
|
"escape_ascii" => {
|
|
200
|
-
|
|
199
|
+
update.escape_ascii = Some(bool::try_convert(val)?);
|
|
201
200
|
}
|
|
202
201
|
"code_language" => {
|
|
203
|
-
|
|
202
|
+
update.code_language = Some(String::try_convert(val)?);
|
|
204
203
|
}
|
|
205
204
|
"autolinks" => {
|
|
206
|
-
|
|
205
|
+
update.autolinks = Some(bool::try_convert(val)?);
|
|
207
206
|
}
|
|
208
207
|
"default_title" => {
|
|
209
|
-
|
|
208
|
+
update.default_title = Some(bool::try_convert(val)?);
|
|
210
209
|
}
|
|
211
210
|
"br_in_tables" => {
|
|
212
|
-
|
|
211
|
+
update.br_in_tables = Some(bool::try_convert(val)?);
|
|
213
212
|
}
|
|
214
213
|
"hocr_spatial_tables" => {
|
|
215
|
-
|
|
214
|
+
update.hocr_spatial_tables = Some(bool::try_convert(val)?);
|
|
216
215
|
}
|
|
217
216
|
"highlight_style" => {
|
|
218
|
-
|
|
217
|
+
update.highlight_style = Some(parse_highlight_style(val)?);
|
|
219
218
|
}
|
|
220
219
|
"extract_metadata" => {
|
|
221
|
-
|
|
220
|
+
update.extract_metadata = Some(bool::try_convert(val)?);
|
|
222
221
|
}
|
|
223
222
|
"whitespace_mode" => {
|
|
224
|
-
|
|
223
|
+
update.whitespace_mode = Some(parse_whitespace_mode(val)?);
|
|
225
224
|
}
|
|
226
225
|
"strip_newlines" => {
|
|
227
|
-
|
|
226
|
+
update.strip_newlines = Some(bool::try_convert(val)?);
|
|
228
227
|
}
|
|
229
228
|
"wrap" => {
|
|
230
|
-
|
|
229
|
+
update.wrap = Some(bool::try_convert(val)?);
|
|
231
230
|
}
|
|
232
231
|
"wrap_width" => {
|
|
233
|
-
|
|
232
|
+
update.wrap_width = Some(usize::try_convert(val)?);
|
|
234
233
|
}
|
|
235
234
|
"convert_as_inline" => {
|
|
236
|
-
|
|
235
|
+
update.convert_as_inline = Some(bool::try_convert(val)?);
|
|
237
236
|
}
|
|
238
237
|
"sub_symbol" => {
|
|
239
|
-
|
|
238
|
+
update.sub_symbol = Some(String::try_convert(val)?);
|
|
240
239
|
}
|
|
241
240
|
"sup_symbol" => {
|
|
242
|
-
|
|
241
|
+
update.sup_symbol = Some(String::try_convert(val)?);
|
|
243
242
|
}
|
|
244
243
|
"newline_style" => {
|
|
245
|
-
|
|
244
|
+
update.newline_style = Some(parse_newline_style(val)?);
|
|
246
245
|
}
|
|
247
246
|
"code_block_style" => {
|
|
248
|
-
|
|
247
|
+
update.code_block_style = Some(parse_code_block_style(val)?);
|
|
249
248
|
}
|
|
250
249
|
"keep_inline_images_in" => {
|
|
251
|
-
|
|
250
|
+
update.keep_inline_images_in = Some(parse_vec_of_strings(val)?);
|
|
252
251
|
}
|
|
253
252
|
"preprocessing" => {
|
|
254
|
-
|
|
253
|
+
update.preprocessing = Some(parse_preprocessing_options(ruby, val)?);
|
|
255
254
|
}
|
|
256
255
|
"encoding" => {
|
|
257
|
-
|
|
256
|
+
update.encoding = Some(String::try_convert(val)?);
|
|
258
257
|
}
|
|
259
258
|
"debug" => {
|
|
260
|
-
|
|
259
|
+
update.debug = Some(bool::try_convert(val)?);
|
|
261
260
|
}
|
|
262
261
|
"strip_tags" => {
|
|
263
|
-
|
|
262
|
+
update.strip_tags = Some(parse_vec_of_strings(val)?);
|
|
264
263
|
}
|
|
265
264
|
"preserve_tags" => {
|
|
266
|
-
|
|
265
|
+
update.preserve_tags = Some(parse_vec_of_strings(val)?);
|
|
267
266
|
}
|
|
268
267
|
_ => {}
|
|
269
268
|
}
|
|
270
269
|
Ok(ForEach::Continue)
|
|
271
270
|
})?;
|
|
272
271
|
|
|
273
|
-
Ok(
|
|
272
|
+
Ok(ConversionOptions::from(update))
|
|
274
273
|
}
|
|
275
274
|
|
|
276
275
|
fn build_inline_image_config(_ruby: &Ruby, config: Option<Value>) -> Result<InlineImageConfig, Error> {
|
|
277
|
-
let mut
|
|
276
|
+
let mut update = InlineImageConfigUpdate::default();
|
|
278
277
|
|
|
279
278
|
let Some(config) = config else {
|
|
280
|
-
return Ok(
|
|
279
|
+
return Ok(InlineImageConfig::new(DEFAULT_INLINE_IMAGE_LIMIT));
|
|
281
280
|
};
|
|
282
281
|
|
|
283
282
|
if config.is_nil() {
|
|
284
|
-
return Ok(
|
|
283
|
+
return Ok(InlineImageConfig::new(DEFAULT_INLINE_IMAGE_LIMIT));
|
|
285
284
|
}
|
|
286
285
|
|
|
287
286
|
let hash = RHash::from_value(config).ok_or_else(|| arg_error("inline image config must be provided as a Hash"))?;
|
|
@@ -290,27 +289,27 @@ fn build_inline_image_config(_ruby: &Ruby, config: Option<Value>) -> Result<Inli
|
|
|
290
289
|
let key_name = symbol_to_string(key)?;
|
|
291
290
|
match key_name.as_str() {
|
|
292
291
|
"max_decoded_size_bytes" => {
|
|
293
|
-
|
|
292
|
+
update.max_decoded_size_bytes = Some(u64::try_convert(val)?);
|
|
294
293
|
}
|
|
295
294
|
"filename_prefix" => {
|
|
296
|
-
|
|
295
|
+
update.filename_prefix = if val.is_nil() {
|
|
297
296
|
None
|
|
298
297
|
} else {
|
|
299
298
|
Some(String::try_convert(val)?)
|
|
300
299
|
};
|
|
301
300
|
}
|
|
302
301
|
"capture_svg" => {
|
|
303
|
-
|
|
302
|
+
update.capture_svg = Some(bool::try_convert(val)?);
|
|
304
303
|
}
|
|
305
304
|
"infer_dimensions" => {
|
|
306
|
-
|
|
305
|
+
update.infer_dimensions = Some(bool::try_convert(val)?);
|
|
307
306
|
}
|
|
308
307
|
_ => {}
|
|
309
308
|
}
|
|
310
309
|
Ok(ForEach::Continue)
|
|
311
310
|
})?;
|
|
312
311
|
|
|
313
|
-
Ok(
|
|
312
|
+
Ok(InlineImageConfig::from_update(update))
|
|
314
313
|
}
|
|
315
314
|
|
|
316
315
|
fn inline_image_to_value(ruby: &Ruby, image: InlineImage) -> Result<Value, Error> {
|
|
@@ -328,15 +327,7 @@ fn inline_image_to_value(ruby: &Ruby, image: InlineImage) -> Result<Value, Error
|
|
|
328
327
|
let data_value = ruby.str_from_slice(&data);
|
|
329
328
|
hash.aset(ruby.intern("data"), data_value)?;
|
|
330
329
|
|
|
331
|
-
let format_value =
|
|
332
|
-
InlineImageFormat::Png => "png".to_string(),
|
|
333
|
-
InlineImageFormat::Jpeg => "jpeg".to_string(),
|
|
334
|
-
InlineImageFormat::Gif => "gif".to_string(),
|
|
335
|
-
InlineImageFormat::Bmp => "bmp".to_string(),
|
|
336
|
-
InlineImageFormat::Webp => "webp".to_string(),
|
|
337
|
-
InlineImageFormat::Svg => "svg".to_string(),
|
|
338
|
-
InlineImageFormat::Other(other) => other,
|
|
339
|
-
};
|
|
330
|
+
let format_value = format.to_string();
|
|
340
331
|
hash.aset(ruby.intern("format"), format_value)?;
|
|
341
332
|
|
|
342
333
|
match filename {
|
|
@@ -358,10 +349,7 @@ fn inline_image_to_value(ruby: &Ruby, image: InlineImage) -> Result<Value, Error
|
|
|
358
349
|
hash.aset(ruby.intern("dimensions"), ruby.qnil())?;
|
|
359
350
|
}
|
|
360
351
|
|
|
361
|
-
let source_value =
|
|
362
|
-
InlineImageSource::ImgDataUri => "img_data_uri",
|
|
363
|
-
InlineImageSource::SvgElement => "svg_element",
|
|
364
|
-
};
|
|
352
|
+
let source_value = source.to_string();
|
|
365
353
|
hash.aset(ruby.intern("source"), source_value)?;
|
|
366
354
|
|
|
367
355
|
let attrs = ruby.hash_new();
|
|
@@ -404,7 +392,7 @@ fn convert_fn(ruby: &Ruby, args: &[Value]) -> Result<String, Error> {
|
|
|
404
392
|
let html = parsed.required.0;
|
|
405
393
|
let options = build_conversion_options(ruby, parsed.optional.0)?;
|
|
406
394
|
|
|
407
|
-
guard_panic(|| convert_inner(&html, Some(options))).map_err(conversion_error)
|
|
395
|
+
guard_panic(|| profiling::maybe_profile(|| convert_inner(&html, Some(options)))).map_err(conversion_error)
|
|
408
396
|
}
|
|
409
397
|
|
|
410
398
|
fn options_handle_fn(ruby: &Ruby, args: &[Value]) -> Result<OptionsHandle, Error> {
|
|
@@ -419,7 +407,7 @@ fn convert_with_options_handle_fn(_ruby: &Ruby, args: &[Value]) -> Result<String
|
|
|
419
407
|
let handle = parsed.required.1;
|
|
420
408
|
let options = handle.0.clone();
|
|
421
409
|
|
|
422
|
-
guard_panic(|| convert_inner(&html, Some(options))).map_err(conversion_error)
|
|
410
|
+
guard_panic(|| profiling::maybe_profile(|| convert_inner(&html, Some(options)))).map_err(conversion_error)
|
|
423
411
|
}
|
|
424
412
|
|
|
425
413
|
fn convert_with_inline_images_fn(ruby: &Ruby, args: &[Value]) -> Result<Value, Error> {
|
|
@@ -434,6 +422,19 @@ fn convert_with_inline_images_fn(ruby: &Ruby, args: &[Value]) -> Result<Value, E
|
|
|
434
422
|
extraction_to_value(ruby, extraction)
|
|
435
423
|
}
|
|
436
424
|
|
|
425
|
+
fn convert_with_inline_images_handle_fn(ruby: &Ruby, args: &[Value]) -> Result<Value, Error> {
|
|
426
|
+
let parsed = scan_args::<(String, &OptionsHandle), (Option<Value>,), (), (), (), ()>(args)?;
|
|
427
|
+
let html = parsed.required.0;
|
|
428
|
+
let handle = parsed.required.1;
|
|
429
|
+
let options = handle.0.clone();
|
|
430
|
+
let config = build_inline_image_config(ruby, parsed.optional.0)?;
|
|
431
|
+
|
|
432
|
+
let extraction =
|
|
433
|
+
guard_panic(|| convert_with_inline_images_inner(&html, Some(options), config)).map_err(conversion_error)?;
|
|
434
|
+
|
|
435
|
+
extraction_to_value(ruby, extraction)
|
|
436
|
+
}
|
|
437
|
+
|
|
437
438
|
#[cfg(feature = "metadata")]
|
|
438
439
|
fn build_metadata_config(_ruby: &Ruby, config: Option<Value>) -> Result<RustMetadataConfig, Error> {
|
|
439
440
|
let mut cfg = RustMetadataConfig::default();
|
|
@@ -451,6 +452,9 @@ fn build_metadata_config(_ruby: &Ruby, config: Option<Value>) -> Result<RustMeta
|
|
|
451
452
|
hash.foreach(|key: Value, val: Value| {
|
|
452
453
|
let key_name = symbol_to_string(key)?;
|
|
453
454
|
match key_name.as_str() {
|
|
455
|
+
"extract_document" => {
|
|
456
|
+
cfg.extract_document = bool::try_convert(val)?;
|
|
457
|
+
}
|
|
454
458
|
"extract_headers" => {
|
|
455
459
|
cfg.extract_headers = bool::try_convert(val)?;
|
|
456
460
|
}
|
|
@@ -492,44 +496,8 @@ fn btreemap_to_ruby_hash(ruby: &Ruby, map: std::collections::BTreeMap<String, St
|
|
|
492
496
|
}
|
|
493
497
|
|
|
494
498
|
#[cfg(feature = "metadata")]
|
|
495
|
-
fn text_direction_to_string(text_direction: Option<RustTextDirection>) -> Option
|
|
496
|
-
|
|
497
|
-
Some(RustTextDirection::LeftToRight) => Some("ltr"),
|
|
498
|
-
Some(RustTextDirection::RightToLeft) => Some("rtl"),
|
|
499
|
-
Some(RustTextDirection::Auto) => Some("auto"),
|
|
500
|
-
None => None,
|
|
501
|
-
}
|
|
502
|
-
}
|
|
503
|
-
|
|
504
|
-
#[cfg(feature = "metadata")]
|
|
505
|
-
fn link_type_to_string(link_type: &RustLinkType) -> &'static str {
|
|
506
|
-
match link_type {
|
|
507
|
-
RustLinkType::Anchor => "anchor",
|
|
508
|
-
RustLinkType::Internal => "internal",
|
|
509
|
-
RustLinkType::External => "external",
|
|
510
|
-
RustLinkType::Email => "email",
|
|
511
|
-
RustLinkType::Phone => "phone",
|
|
512
|
-
RustLinkType::Other => "other",
|
|
513
|
-
}
|
|
514
|
-
}
|
|
515
|
-
|
|
516
|
-
#[cfg(feature = "metadata")]
|
|
517
|
-
fn image_type_to_string(image_type: &RustImageType) -> &'static str {
|
|
518
|
-
match image_type {
|
|
519
|
-
RustImageType::DataUri => "data_uri",
|
|
520
|
-
RustImageType::InlineSvg => "inline_svg",
|
|
521
|
-
RustImageType::External => "external",
|
|
522
|
-
RustImageType::Relative => "relative",
|
|
523
|
-
}
|
|
524
|
-
}
|
|
525
|
-
|
|
526
|
-
#[cfg(feature = "metadata")]
|
|
527
|
-
fn structured_data_type_to_string(data_type: &RustStructuredDataType) -> &'static str {
|
|
528
|
-
match data_type {
|
|
529
|
-
RustStructuredDataType::JsonLd => "json_ld",
|
|
530
|
-
RustStructuredDataType::Microdata => "microdata",
|
|
531
|
-
RustStructuredDataType::RDFa => "rdfa",
|
|
532
|
-
}
|
|
499
|
+
fn text_direction_to_string(text_direction: Option<RustTextDirection>) -> Option<String> {
|
|
500
|
+
text_direction.map(|direction| direction.to_string())
|
|
533
501
|
}
|
|
534
502
|
|
|
535
503
|
#[cfg(feature = "metadata")]
|
|
@@ -591,7 +559,7 @@ fn links_to_ruby(ruby: &Ruby, links: Vec<RustLinkMetadata>) -> Result<Value, Err
|
|
|
591
559
|
hash.aset(ruby.intern("href"), link.href)?;
|
|
592
560
|
hash.aset(ruby.intern("text"), link.text)?;
|
|
593
561
|
hash.aset(ruby.intern("title"), opt_string_to_ruby(ruby, link.title)?)?;
|
|
594
|
-
hash.aset(ruby.intern("link_type"),
|
|
562
|
+
hash.aset(ruby.intern("link_type"), link.link_type.to_string())?;
|
|
595
563
|
|
|
596
564
|
let rel_array = ruby.ary_new();
|
|
597
565
|
for r in link.rel {
|
|
@@ -626,7 +594,7 @@ fn images_to_ruby(ruby: &Ruby, images: Vec<RustImageMetadata>) -> Result<Value,
|
|
|
626
594
|
}
|
|
627
595
|
}
|
|
628
596
|
|
|
629
|
-
hash.aset(ruby.intern("image_type"),
|
|
597
|
+
hash.aset(ruby.intern("image_type"), image.image_type.to_string())?;
|
|
630
598
|
hash.aset(
|
|
631
599
|
ruby.intern("attributes"),
|
|
632
600
|
btreemap_to_ruby_hash(ruby, image.attributes)?,
|
|
@@ -641,10 +609,7 @@ fn structured_data_to_ruby(ruby: &Ruby, data: Vec<RustStructuredData>) -> Result
|
|
|
641
609
|
let array = ruby.ary_new();
|
|
642
610
|
for item in data {
|
|
643
611
|
let hash = ruby.hash_new();
|
|
644
|
-
hash.aset(
|
|
645
|
-
ruby.intern("data_type"),
|
|
646
|
-
structured_data_type_to_string(&item.data_type),
|
|
647
|
-
)?;
|
|
612
|
+
hash.aset(ruby.intern("data_type"), item.data_type.to_string())?;
|
|
648
613
|
hash.aset(ruby.intern("raw_json"), item.raw_json)?;
|
|
649
614
|
hash.aset(ruby.intern("schema_type"), opt_string_to_ruby(ruby, item.schema_type)?)?;
|
|
650
615
|
array.push(hash)?;
|
|
@@ -688,6 +653,41 @@ fn convert_with_metadata_fn(ruby: &Ruby, args: &[Value]) -> Result<Value, Error>
|
|
|
688
653
|
Ok(array.as_value())
|
|
689
654
|
}
|
|
690
655
|
|
|
656
|
+
#[cfg(feature = "metadata")]
|
|
657
|
+
fn convert_with_metadata_handle_fn(ruby: &Ruby, args: &[Value]) -> Result<Value, Error> {
|
|
658
|
+
let parsed = scan_args::<(String, &OptionsHandle), (Option<Value>,), (), (), (), ()>(args)?;
|
|
659
|
+
let html = parsed.required.0;
|
|
660
|
+
let handle = parsed.required.1;
|
|
661
|
+
let options = handle.0.clone();
|
|
662
|
+
let metadata_config = build_metadata_config(ruby, parsed.optional.0)?;
|
|
663
|
+
|
|
664
|
+
let (markdown, metadata) =
|
|
665
|
+
guard_panic(|| convert_with_metadata_inner(&html, Some(options), metadata_config)).map_err(conversion_error)?;
|
|
666
|
+
|
|
667
|
+
let array = ruby.ary_new();
|
|
668
|
+
array.push(markdown)?;
|
|
669
|
+
array.push(extended_metadata_to_ruby(ruby, metadata)?)?;
|
|
670
|
+
|
|
671
|
+
Ok(array.as_value())
|
|
672
|
+
}
|
|
673
|
+
|
|
674
|
+
fn start_profiling_fn(_ruby: &Ruby, args: &[Value]) -> Result<bool, Error> {
|
|
675
|
+
let output = args.first().ok_or_else(|| arg_error("output_path required"))?;
|
|
676
|
+
let output: String = String::try_convert(*output)?;
|
|
677
|
+
let freq = if let Some(value) = args.get(1) {
|
|
678
|
+
i32::try_convert(*value)?
|
|
679
|
+
} else {
|
|
680
|
+
1000
|
|
681
|
+
};
|
|
682
|
+
profiling::start(PathBuf::from(output), freq).map_err(conversion_error)?;
|
|
683
|
+
Ok(true)
|
|
684
|
+
}
|
|
685
|
+
|
|
686
|
+
fn stop_profiling_fn(_ruby: &Ruby, _args: &[Value]) -> Result<bool, Error> {
|
|
687
|
+
profiling::stop().map_err(conversion_error)?;
|
|
688
|
+
Ok(true)
|
|
689
|
+
}
|
|
690
|
+
|
|
691
691
|
#[magnus::init]
|
|
692
692
|
fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
693
693
|
let module = ruby.define_module("HtmlToMarkdown")?;
|
|
@@ -698,9 +698,20 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
|
698
698
|
"convert_with_inline_images",
|
|
699
699
|
function!(convert_with_inline_images_fn, -1),
|
|
700
700
|
)?;
|
|
701
|
+
module.define_singleton_method(
|
|
702
|
+
"convert_with_inline_images_handle",
|
|
703
|
+
function!(convert_with_inline_images_handle_fn, -1),
|
|
704
|
+
)?;
|
|
701
705
|
|
|
702
706
|
#[cfg(feature = "metadata")]
|
|
703
707
|
module.define_singleton_method("convert_with_metadata", function!(convert_with_metadata_fn, -1))?;
|
|
708
|
+
#[cfg(feature = "metadata")]
|
|
709
|
+
module.define_singleton_method(
|
|
710
|
+
"convert_with_metadata_handle",
|
|
711
|
+
function!(convert_with_metadata_handle_fn, -1),
|
|
712
|
+
)?;
|
|
713
|
+
module.define_singleton_method("start_profiling", function!(start_profiling_fn, -1))?;
|
|
714
|
+
module.define_singleton_method("stop_profiling", function!(stop_profiling_fn, -1))?;
|
|
704
715
|
|
|
705
716
|
Ok(())
|
|
706
717
|
}
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
use html_to_markdown_rs::{ConversionError, Result};
|
|
2
|
+
use std::path::PathBuf;
|
|
3
|
+
|
|
4
|
+
/// pprof-backed profiling, compiled only on non-Windows targets with the
/// `profiling` feature enabled.
///
/// Two entry styles share one "profiler active" flag:
/// * `start` / `stop` — an explicit session whose flamegraph is written by `stop`.
/// * `maybe_profile` — wraps a single call and is driven by environment variables.
#[cfg(all(not(target_os = "windows"), feature = "profiling"))]
mod enabled {
    use super::{ConversionError, PathBuf, Result};
    use std::path::Path;
    use std::sync::atomic::{AtomicBool, Ordering};
    use std::sync::{Mutex, MutexGuard, OnceLock};

    /// Path of the flamegraph written by `maybe_profile`; unset or blank disables it.
    const ENV_OUTPUT: &str = "HTML_TO_MARKDOWN_PROFILE_OUTPUT";
    /// Sampling frequency used by `maybe_profile`.
    const ENV_FREQUENCY: &str = "HTML_TO_MARKDOWN_PROFILE_FREQUENCY";
    /// Set to `0`, `false` or `no` to profile every call instead of only the first.
    const ENV_ONCE: &str = "HTML_TO_MARKDOWN_PROFILE_ONCE";

    /// Frequency used when the environment does not supply a usable one.
    const DEFAULT_FREQUENCY: i32 = 1000;
    /// Library names handed to pprof's blocklist.
    const BLOCKLIST: [&str; 4] = ["libc", "libpthread", "libgcc", "libm"];

    /// Set once the env-driven "profile once" run has been claimed.
    static PROFILED_ONCE: AtomicBool = AtomicBool::new(false);
    /// True while either an explicit session or an env-driven run owns the profiler.
    static PROFILE_ACTIVE: AtomicBool = AtomicBool::new(false);

    /// Environment-derived settings, read once per process.
    struct EnvProfileConfig {
        output: Option<PathBuf>,
        profile_once: bool,
        frequency: i32,
    }

    /// Returns the cached environment configuration, reading it on first use.
    fn env_profile_config() -> &'static EnvProfileConfig {
        static ENV_CONFIG: OnceLock<EnvProfileConfig> = OnceLock::new();
        ENV_CONFIG.get_or_init(|| {
            let output = std::env::var(ENV_OUTPUT)
                .ok()
                .filter(|value| !value.trim().is_empty())
                .map(PathBuf::from);

            // Unset (or unreadable) means "profile once"; only an explicit
            // negative value turns it off. Whitespace and ASCII case are ignored.
            let profile_once = std::env::var(ENV_ONCE).map_or(true, |value| {
                !matches!(
                    value.trim().to_ascii_lowercase().as_str(),
                    "0" | "false" | "no"
                )
            });

            // Unparsable or non-positive values fall back to the default rate.
            let frequency = std::env::var(ENV_FREQUENCY)
                .ok()
                .and_then(|value| value.trim().parse::<i32>().ok())
                .filter(|frequency| *frequency > 0)
                .unwrap_or(DEFAULT_FREQUENCY);

            EnvProfileConfig {
                output,
                profile_once,
                frequency,
            }
        })
    }

    /// State of the explicit `start`/`stop` session.
    struct ProfileState {
        guard: Option<pprof::ProfilerGuard<'static>>,
        output: Option<PathBuf>,
    }

    /// Returns the process-wide session state, creating it empty on first use.
    fn state() -> &'static Mutex<ProfileState> {
        static STATE: OnceLock<Mutex<ProfileState>> = OnceLock::new();
        STATE.get_or_init(|| {
            Mutex::new(ProfileState {
                guard: None,
                output: None,
            })
        })
    }

    /// Locks the session state, mapping a poisoned lock to a `ConversionError`.
    fn lock_state() -> Result<MutexGuard<'static, ProfileState>> {
        state()
            .lock()
            .map_err(|_| ConversionError::Other("profiling state lock poisoned".to_string()))
    }

    /// Atomically claims the profiler; returns `false` if it is already claimed.
    fn try_claim_active() -> bool {
        PROFILE_ACTIVE
            .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire)
            .is_ok()
    }

    /// Releases the profiler claim when dropped, including on early returns.
    struct ActiveReset;

    impl Drop for ActiveReset {
        fn drop(&mut self) {
            PROFILE_ACTIVE.store(false, Ordering::Release);
        }
    }

    /// Builds a pprof guard sampling at `frequency`.
    ///
    /// A non-positive sampling rate is meaningless, so it is rejected up front
    /// instead of being handed to pprof.
    fn build_guard(frequency: i32) -> Result<pprof::ProfilerGuard<'static>> {
        if frequency <= 0 {
            return Err(ConversionError::Other(format!(
                "Profiling frequency must be positive, got {frequency}"
            )));
        }

        pprof::ProfilerGuardBuilder::default()
            .frequency(frequency)
            .blocklist(&BLOCKLIST)
            .build()
            .map_err(|err| ConversionError::Other(format!("Profiling init failed: {err}")))
    }

    /// Builds a report from `guard` and writes it as a flamegraph to
    /// `output_path`, creating the parent directory first.
    fn write_flamegraph(guard: &pprof::ProfilerGuard<'static>, output_path: &Path) -> Result<()> {
        if let Some(parent) = output_path.parent() {
            std::fs::create_dir_all(parent).map_err(ConversionError::IoError)?;
        }

        let report = guard
            .report()
            .build()
            .map_err(|err| ConversionError::Other(format!("Profiling report failed: {err}")))?;

        let file = std::fs::File::create(output_path).map_err(ConversionError::IoError)?;
        report
            .flamegraph(file)
            .map_err(|err| ConversionError::Other(format!("Flamegraph write failed: {err}")))
    }

    /// Starts an explicit profiling session.
    ///
    /// `output_path` is where `stop` will write the flamegraph; `frequency` is
    /// the sampling rate and must be positive.
    ///
    /// Errors if a session or an env-driven run is already active, if the
    /// frequency is not positive, or if pprof fails to initialise.
    pub fn start(output_path: PathBuf, frequency: i32) -> Result<()> {
        let mut state = lock_state()?;

        if state.guard.is_some() || !try_claim_active() {
            return Err(ConversionError::Other("profiling already active".to_string()));
        }

        match build_guard(frequency) {
            Ok(guard) => {
                state.guard = Some(guard);
                state.output = Some(output_path);
                Ok(())
            }
            Err(err) => {
                // Nothing was started, so give the claim back.
                PROFILE_ACTIVE.store(false, Ordering::Release);
                Err(err)
            }
        }
    }

    /// Stops the explicit session and writes its flamegraph.
    ///
    /// Errors if no session is active, if the output path is missing, or if
    /// building/writing the report fails. The active flag is released only
    /// when this call actually owned a session.
    pub fn stop() -> Result<()> {
        let (guard, output) = {
            let mut state = lock_state()?;
            (state.guard.take(), state.output.take())
        };

        let Some(guard) = guard else {
            return Err(ConversionError::Other("profiling not active".to_string()));
        };
        // From here on this call owns the session; release the flag on exit.
        let _reset = ActiveReset;

        let outcome = match output {
            Some(output_path) => write_flamegraph(&guard, &output_path),
            None => Err(ConversionError::Other(
                "profiling output path missing".to_string(),
            )),
        };

        // Tear the profiler down before the flag is released.
        drop(guard);
        outcome
    }

    /// Runs `f`, profiling it when `HTML_TO_MARKDOWN_PROFILE_OUTPUT` is set.
    ///
    /// `f` runs unprofiled when another session/run is active, when no output
    /// path is configured, or when "profile once" has already been used. The
    /// flamegraph is written only if `f` succeeds; profiler setup or report
    /// errors are returned in place of `f`'s result.
    pub fn maybe_profile<T, F>(f: F) -> Result<T>
    where
        F: FnOnce() -> Result<T>,
    {
        // Cheap early exit while someone else is profiling.
        if PROFILE_ACTIVE.load(Ordering::Relaxed) {
            return f();
        }

        let config = env_profile_config();
        let Some(output_path) = config.output.as_ref() else {
            return f();
        };

        if config.profile_once && PROFILED_ONCE.load(Ordering::Acquire) {
            return f();
        }

        // Claim atomically so two threads cannot both start a profiler.
        if !try_claim_active() {
            return f();
        }
        let _active = ActiveReset;

        // Re-check under the claim: another thread may have used the one-shot
        // between the load above and the claim.
        if config.profile_once && PROFILED_ONCE.swap(true, Ordering::SeqCst) {
            return f();
        }

        let guard = build_guard(config.frequency)?;
        let result = f();

        if result.is_ok() {
            write_flamegraph(&guard, output_path)?;
        }

        result
    }
}
|
|
173
|
+
|
|
174
|
+
#[cfg(all(not(target_os = "windows"), feature = "profiling"))]
|
|
175
|
+
pub use enabled::{maybe_profile, start, stop};
|
|
176
|
+
|
|
177
|
+
/// Windows stub for `start`: always returns an error because profiling is
/// not supported on this platform. Both arguments are ignored.
#[cfg(target_os = "windows")]
pub fn start(_output_path: PathBuf, _frequency: i32) -> Result<()> {
    let reason = String::from("Profiling is not supported on Windows");
    Err(ConversionError::Other(reason))
}
|
|
183
|
+
|
|
184
|
+
#[cfg(all(not(target_os = "windows"), not(feature = "profiling")))]
|
|
185
|
+
pub fn start(_output_path: PathBuf, _frequency: i32) -> Result<()> {
|
|
186
|
+
Err(ConversionError::Other(
|
|
187
|
+
"Profiling is disabled; rebuild with the profiling feature".to_string(),
|
|
188
|
+
))
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
/// Windows stub for `stop`: always returns an error because profiling is
/// not supported on this platform.
#[cfg(target_os = "windows")]
pub fn stop() -> Result<()> {
    let reason = String::from("Profiling is not supported on Windows");
    Err(ConversionError::Other(reason))
}
|
|
197
|
+
|
|
198
|
+
#[cfg(all(not(target_os = "windows"), not(feature = "profiling")))]
|
|
199
|
+
pub fn stop() -> Result<()> {
|
|
200
|
+
Err(ConversionError::Other(
|
|
201
|
+
"Profiling is disabled; rebuild with the profiling feature".to_string(),
|
|
202
|
+
))
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
#[cfg(any(target_os = "windows", not(feature = "profiling")))]
|
|
206
|
+
pub fn maybe_profile<T, F>(f: F) -> Result<T>
|
|
207
|
+
where
|
|
208
|
+
F: FnOnce() -> Result<T>,
|
|
209
|
+
{
|
|
210
|
+
f()
|
|
211
|
+
}
|
data/lib/html_to_markdown.rb
CHANGED
|
@@ -12,9 +12,11 @@ module HtmlToMarkdown
|
|
|
12
12
|
# Keep the existing singleton implementations reachable under a `native_`
# prefix so the Ruby wrappers defined below can delegate to them.
class << self
  alias_method :native_convert, :convert
  alias_method :native_convert_with_inline_images, :convert_with_inline_images
  alias_method :native_convert_with_inline_images_handle, :convert_with_inline_images_handle
  alias_method :native_options, :options
  alias_method :native_convert_with_options, :convert_with_options
  alias_method :native_convert_with_metadata, :convert_with_metadata
  alias_method :native_convert_with_metadata_handle, :convert_with_metadata_handle
end
|
|
19
21
|
|
|
20
22
|
module_function
|
|
@@ -31,6 +33,10 @@ module HtmlToMarkdown
|
|
|
31
33
|
native_convert_with_inline_images(html.to_s, options, image_config)
|
|
32
34
|
end
|
|
33
35
|
|
|
36
|
+
# Converts HTML with inline image extraction using a pre-built options handle.
#
# `html` is coerced with `to_s` before being handed to the native
# implementation; `image_config` defaults to nil.
def convert_with_inline_images_handle(html, options_handle, image_config = nil)
  source = html.to_s
  native_convert_with_inline_images_handle(source, options_handle, image_config)
end
|
|
39
|
+
|
|
34
40
|
# Delegates to `native_options` with the given hash (nil by default) and
# returns whatever it returns.
def options(options_hash = nil)
|
|
35
41
|
native_options(options_hash)
|
|
36
42
|
end
|
|
@@ -160,4 +166,8 @@ module HtmlToMarkdown
|
|
|
160
166
|
# Converts HTML via the native `convert_with_metadata` implementation.
#
# `html` is coerced with `to_s`; `options` and `metadata_config` are passed
# through unchanged and default to nil.
def convert_with_metadata(html, options = nil, metadata_config = nil)
  source = html.to_s
  native_convert_with_metadata(source, options, metadata_config)
end
|
|
169
|
+
|
|
170
|
+
# Same as `convert_with_metadata`, but takes a pre-built options handle
# instead of an options hash.
#
# `html` is coerced with `to_s`; `metadata_config` defaults to nil.
def convert_with_metadata_handle(html, options_handle, metadata_config = nil)
  source = html.to_s
  native_convert_with_metadata_handle(source, options_handle, metadata_config)
end
|
|
163
173
|
end
|
data/sig/html_to_markdown.rbs
CHANGED
|
@@ -162,11 +162,21 @@ module HtmlToMarkdown
|
|
|
162
162
|
def self.native_convert: (String html, conversion_options? options) -> String
|
|
163
163
|
def self.native_options: (conversion_options? options_hash) -> Options
|
|
164
164
|
def self.native_convert_with_options: (String html, Options options_handle) -> String
|
|
165
|
+
# `native_`-prefixed alias: inline-image conversion taking an Options handle.
def self.native_convert_with_inline_images_handle: (String html, Options options_handle, inline_image_config? image_config) -> html_extraction
|
|
165
170
|
def self.native_convert_with_inline_images: (
|
|
166
171
|
String html,
|
|
167
172
|
conversion_options? options,
|
|
168
173
|
inline_image_config? image_config
|
|
169
174
|
) -> html_extraction
|
|
175
|
+
# `native_`-prefixed alias: metadata conversion taking an Options handle.
def self.native_convert_with_metadata_handle: (String html, Options options_handle, metadata_config? metadata_config) -> [String, extended_metadata]
|
|
170
180
|
def self.native_convert_with_metadata: (
|
|
171
181
|
String html,
|
|
172
182
|
conversion_options? options,
|
|
@@ -176,11 +186,21 @@ module HtmlToMarkdown
|
|
|
176
186
|
def native_convert: (String html, conversion_options? options) -> String
|
|
177
187
|
def native_options: (conversion_options? options_hash) -> Options
|
|
178
188
|
def native_convert_with_options: (String html, Options options_handle) -> String
|
|
189
|
+
# Instance-level counterpart of `self.native_convert_with_inline_images_handle`.
def native_convert_with_inline_images_handle: (String html, Options options_handle, inline_image_config? image_config) -> html_extraction
|
|
179
194
|
def native_convert_with_inline_images: (
|
|
180
195
|
String html,
|
|
181
196
|
conversion_options? options,
|
|
182
197
|
inline_image_config? image_config
|
|
183
198
|
) -> html_extraction
|
|
199
|
+
# Instance-level counterpart of `self.native_convert_with_metadata_handle`.
def native_convert_with_metadata_handle: (String html, Options options_handle, metadata_config? metadata_config) -> [String, extended_metadata]
|
|
184
204
|
def native_convert_with_metadata: (
|
|
185
205
|
String html,
|
|
186
206
|
conversion_options? options,
|
|
@@ -197,6 +217,11 @@ module HtmlToMarkdown
|
|
|
197
217
|
|
|
198
218
|
# Convert HTML using a pre-built options handle
|
|
199
219
|
def self.convert_with_options: (String html, Options options_handle) -> String
|
|
220
|
+
# Convert HTML with inline image extraction using a pre-built options handle
def self.convert_with_inline_images_handle: (String html, Options options_handle, ?inline_image_config image_config) -> html_extraction
|
|
200
225
|
|
|
201
226
|
# Convert HTML with inline image extraction
|
|
202
227
|
def self.convert_with_inline_images: (
|
|
@@ -236,11 +261,21 @@ module HtmlToMarkdown
|
|
|
236
261
|
?conversion_options options,
|
|
237
262
|
?metadata_config metadata_config
|
|
238
263
|
) -> [String, extended_metadata]
|
|
264
|
+
# Metadata conversion using a pre-built options handle
def self.convert_with_metadata_handle: (String html, Options options_handle, ?metadata_config metadata_config) -> [String, extended_metadata]
|
|
239
269
|
|
|
240
270
|
# Instance method versions (created by module_function)
|
|
241
271
|
def convert: (String html, ?conversion_options options) -> String
|
|
242
272
|
def options: (?conversion_options options_hash) -> Options
|
|
243
273
|
def convert_with_options: (String html, Options options_handle) -> String
|
|
274
|
+
# Instance method version of `self.convert_with_inline_images_handle`
def convert_with_inline_images_handle: (String html, Options options_handle, ?inline_image_config image_config) -> html_extraction
|
|
244
279
|
def convert_with_inline_images: (
|
|
245
280
|
String html,
|
|
246
281
|
?conversion_options options,
|
|
@@ -251,4 +286,9 @@ module HtmlToMarkdown
|
|
|
251
286
|
?conversion_options options,
|
|
252
287
|
?metadata_config metadata_config
|
|
253
288
|
) -> [String, extended_metadata]
|
|
289
|
+
# Instance method version of `self.convert_with_metadata_handle`
def convert_with_metadata_handle: (String html, Options options_handle, ?metadata_config metadata_config) -> [String, extended_metadata]
|
|
254
294
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html-to-markdown
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.15.0
|
|
4
|
+
version: 2.16.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2025-12-
|
|
11
|
+
date: 2025-12-22 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|
|
@@ -56,6 +56,7 @@ files:
|
|
|
56
56
|
- ext/html-to-markdown-rb/native/README.md
|
|
57
57
|
- ext/html-to-markdown-rb/native/extconf.rb
|
|
58
58
|
- ext/html-to-markdown-rb/native/src/lib.rs
|
|
59
|
+
- ext/html-to-markdown-rb/native/src/profiling.rs
|
|
59
60
|
- html-to-markdown-rb.gemspec
|
|
60
61
|
- lib/html_to_markdown.rb
|
|
61
62
|
- lib/html_to_markdown/cli.rb
|