html-to-markdown 2.16.0 → 2.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +90 -10
- data/README.md +99 -302
- data/bin/benchmark.rb +100 -12
- data/ext/html-to-markdown-rb/native/Cargo.toml +7 -5
- data/ext/html-to-markdown-rb/native/README.md +5 -5
- data/ext/html-to-markdown-rb/native/src/lib.rs +951 -0
- data/ext/html-to-markdown-rb/native/src/profiling.rs +4 -0
- data/html-to-markdown-rb.gemspec +6 -6
- data/lib/html_to_markdown/version.rb +1 -1
- data/sig/html_to_markdown.rbs +110 -0
- data/spec/visitor_spec.rb +1149 -0
- metadata +9 -8
data/bin/benchmark.rb
CHANGED
|
@@ -23,7 +23,8 @@ end
|
|
|
23
23
|
options = {
|
|
24
24
|
iterations: 50,
|
|
25
25
|
format: 'html',
|
|
26
|
-
scenario: 'convert-default'
|
|
26
|
+
scenario: 'convert-default',
|
|
27
|
+
visitor: nil
|
|
27
28
|
}
|
|
28
29
|
|
|
29
30
|
OptionParser.new do |parser|
|
|
@@ -44,6 +45,10 @@ OptionParser.new do |parser|
|
|
|
44
45
|
parser.on('--format FORMAT', 'Fixture format (html or hocr)') do |format|
|
|
45
46
|
options[:format] = format.downcase
|
|
46
47
|
end
|
|
48
|
+
|
|
49
|
+
parser.on('--visitor VISITOR', 'Visitor type (noop, simple, custom, complex)') do |visitor|
|
|
50
|
+
options[:visitor] = visitor if %w[noop simple custom complex].include?(visitor)
|
|
51
|
+
end
|
|
47
52
|
end.parse!
|
|
48
53
|
|
|
49
54
|
fixture = options.fetch(:file) do
|
|
@@ -74,6 +79,70 @@ unless supported_scenarios.include?(options[:scenario])
|
|
|
74
79
|
exit 1
|
|
75
80
|
end
|
|
76
81
|
|
|
82
|
+
# Visitor factory functions
|
|
83
|
+
def create_noop_visitor
|
|
84
|
+
{
|
|
85
|
+
visit_text: proc { |_ctx, _text| 'continue' },
|
|
86
|
+
visit_heading: proc { |_ctx, _level, _text, _id| 'continue' },
|
|
87
|
+
visit_paragraph: proc { |_ctx, _text| 'continue' },
|
|
88
|
+
visit_link: proc { |_ctx, _href, _text, _title| 'continue' },
|
|
89
|
+
visit_image: proc { |_ctx, _src, _alt, _title| 'continue' },
|
|
90
|
+
visit_strong: proc { |_ctx, _text| 'continue' },
|
|
91
|
+
visit_em: proc { |_ctx, _text| 'continue' },
|
|
92
|
+
visit_code: proc { |_ctx, _text| 'continue' },
|
|
93
|
+
visit_br: proc { |_ctx| 'continue' }
|
|
94
|
+
}
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
def create_simple_visitor
|
|
98
|
+
{
|
|
99
|
+
text_count: 0,
|
|
100
|
+
link_count: 0,
|
|
101
|
+
image_count: 0,
|
|
102
|
+
visit_text: proc { |_ctx, _text| 'continue' },
|
|
103
|
+
visit_heading: proc { |_ctx, _level, _text, _id| 'continue' },
|
|
104
|
+
visit_paragraph: proc { |_ctx, _text| 'continue' },
|
|
105
|
+
visit_link: proc { |_ctx, _href, _text, _title| 'continue' },
|
|
106
|
+
visit_image: proc { |_ctx, _src, _alt, _title| 'continue' },
|
|
107
|
+
visit_strong: proc { |_ctx, _text| 'continue' },
|
|
108
|
+
visit_em: proc { |_ctx, _text| 'continue' },
|
|
109
|
+
visit_code: proc { |_ctx, _text| 'continue' },
|
|
110
|
+
visit_br: proc { |_ctx| 'continue' }
|
|
111
|
+
}
|
|
112
|
+
end
|
|
113
|
+
|
|
114
|
+
def create_custom_visitor
|
|
115
|
+
{
|
|
116
|
+
visit_text: proc { |_ctx, _text| 'continue' },
|
|
117
|
+
visit_heading: proc { |_ctx, _level, _text, _id| 'continue' },
|
|
118
|
+
visit_paragraph: proc { |_ctx, _text| 'continue' },
|
|
119
|
+
visit_link: proc { |_ctx, href, text, _title| ['custom', "LINK[#{text}](#{href})"] },
|
|
120
|
+
visit_image: proc { |_ctx, src, alt, _title| ['custom', "![#{alt}](#{src})"] },
|
|
121
|
+
visit_strong: proc { |_ctx, _text| 'continue' },
|
|
122
|
+
visit_em: proc { |_ctx, _text| 'continue' },
|
|
123
|
+
visit_code: proc { |_ctx, _text| 'continue' },
|
|
124
|
+
visit_br: proc { |_ctx| 'continue' }
|
|
125
|
+
}
|
|
126
|
+
end
|
|
127
|
+
|
|
128
|
+
def create_complex_visitor
|
|
129
|
+
{
|
|
130
|
+
texts: 0,
|
|
131
|
+
links: 0,
|
|
132
|
+
images: 0,
|
|
133
|
+
headings: 0,
|
|
134
|
+
visit_text: proc { |_ctx, _text| 'continue' },
|
|
135
|
+
visit_heading: proc { |_ctx, _level, _text, _id| 'continue' },
|
|
136
|
+
visit_paragraph: proc { |_ctx, _text| 'continue' },
|
|
137
|
+
visit_link: proc { |_ctx, href, text, _title| ['custom', "[#{text}](#{href})"] },
|
|
138
|
+
visit_image: proc { |_ctx, _src, _alt, _title| 'skip' },
|
|
139
|
+
visit_strong: proc { |_ctx, _text| 'continue' },
|
|
140
|
+
visit_em: proc { |_ctx, _text| 'continue' },
|
|
141
|
+
visit_code: proc { |_ctx, _text| 'continue' },
|
|
142
|
+
visit_br: proc { |_ctx| 'continue' }
|
|
143
|
+
}
|
|
144
|
+
end
|
|
145
|
+
|
|
77
146
|
html = File.binread(fixture)
|
|
78
147
|
html.force_encoding(Encoding::UTF_8)
|
|
79
148
|
html.freeze
|
|
@@ -83,33 +152,52 @@ options_handle = if %w[convert-options inline-images-options metadata-options].i
|
|
|
83
152
|
HtmlToMarkdown.options(conversion_options)
|
|
84
153
|
end
|
|
85
154
|
|
|
155
|
+
# Create visitor if specified
|
|
156
|
+
visitor = nil
|
|
157
|
+
if options[:visitor]
|
|
158
|
+
visitor_creators = {
|
|
159
|
+
'noop' => method(:create_noop_visitor),
|
|
160
|
+
'simple' => method(:create_simple_visitor),
|
|
161
|
+
'custom' => method(:create_custom_visitor),
|
|
162
|
+
'complex' => method(:create_complex_visitor)
|
|
163
|
+
}
|
|
164
|
+
creator = visitor_creators[options[:visitor]]
|
|
165
|
+
visitor = creator.call if creator
|
|
166
|
+
end
|
|
167
|
+
|
|
86
168
|
SCENARIO_RUNNERS = {
|
|
87
|
-
'convert-default' => ->(html, _options, _handle) { HtmlToMarkdown.convert(html) },
|
|
88
|
-
'convert-options' => lambda do |html, _options, handle|
|
|
169
|
+
'convert-default' => ->(html, _options, _handle, _visitor) { HtmlToMarkdown.convert(html) },
|
|
170
|
+
'convert-options' => lambda do |html, _options, handle, _visitor|
|
|
89
171
|
raise ArgumentError, 'options handle required' unless handle
|
|
90
172
|
|
|
91
173
|
HtmlToMarkdown.convert_with_options(html, handle)
|
|
92
174
|
end,
|
|
93
|
-
'inline-images-default' =>
|
|
94
|
-
|
|
175
|
+
'inline-images-default' => lambda { |html, _options, _handle, _visitor|
|
|
176
|
+
HtmlToMarkdown.convert_with_inline_images(html, nil, nil)
|
|
177
|
+
},
|
|
178
|
+
'inline-images-options' => lambda do |html, _options, handle, _visitor|
|
|
95
179
|
raise ArgumentError, 'options handle required' unless handle
|
|
96
180
|
|
|
97
181
|
HtmlToMarkdown.convert_with_inline_images_handle(html, handle, nil)
|
|
98
182
|
end,
|
|
99
|
-
'metadata-default' => ->(html, _options, _handle) { HtmlToMarkdown.convert_with_metadata(html, nil, nil) },
|
|
100
|
-
'metadata-options' => lambda do |html, _options, handle|
|
|
183
|
+
'metadata-default' => ->(html, _options, _handle, _visitor) { HtmlToMarkdown.convert_with_metadata(html, nil, nil) },
|
|
184
|
+
'metadata-options' => lambda do |html, _options, handle, _visitor|
|
|
101
185
|
raise ArgumentError, 'options handle required' unless handle
|
|
102
186
|
|
|
103
187
|
HtmlToMarkdown.convert_with_metadata_handle(html, handle, nil)
|
|
104
188
|
end
|
|
105
189
|
}.freeze
|
|
106
190
|
|
|
107
|
-
def run_scenario(html, scenario, options, handle)
|
|
108
|
-
|
|
109
|
-
|
|
191
|
+
def run_scenario(html, scenario, options, handle, visitor = nil)
|
|
192
|
+
if visitor
|
|
193
|
+
HtmlToMarkdown.convert_with_visitor(html, nil, visitor)
|
|
194
|
+
else
|
|
195
|
+
runner = SCENARIO_RUNNERS.fetch(scenario) { raise ArgumentError, "Unsupported scenario: #{scenario}" }
|
|
196
|
+
runner.call(html, options, handle, visitor)
|
|
197
|
+
end
|
|
110
198
|
end
|
|
111
199
|
|
|
112
|
-
run_scenario(html, options[:scenario], conversion_options, options_handle)
|
|
200
|
+
run_scenario(html, options[:scenario], conversion_options, options_handle, visitor)
|
|
113
201
|
|
|
114
202
|
profile_output = ENV.fetch('HTML_TO_MARKDOWN_PROFILE_OUTPUT', nil)
|
|
115
203
|
if profile_output && HtmlToMarkdown.respond_to?(:start_profiling)
|
|
@@ -118,7 +206,7 @@ if profile_output && HtmlToMarkdown.respond_to?(:start_profiling)
|
|
|
118
206
|
end
|
|
119
207
|
|
|
120
208
|
start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
121
|
-
iterations.times { run_scenario(html, options[:scenario], conversion_options, options_handle) }
|
|
209
|
+
iterations.times { run_scenario(html, options[:scenario], conversion_options, options_handle, visitor) }
|
|
122
210
|
elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start
|
|
123
211
|
|
|
124
212
|
HtmlToMarkdown.stop_profiling if profile_output && HtmlToMarkdown.respond_to?(:stop_profiling)
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "html-to-markdown-rb"
|
|
3
|
-
version = "2.16.0"
|
|
3
|
+
version = "2.18.0"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
6
6
|
license = "MIT"
|
|
7
|
-
repository = "https://github.com/
|
|
8
|
-
homepage = "https://github.com/
|
|
7
|
+
repository = "https://github.com/kreuzberg-dev/html-to-markdown"
|
|
8
|
+
homepage = "https://github.com/kreuzberg-dev/html-to-markdown"
|
|
9
9
|
documentation = "https://docs.rs/html-to-markdown-rs"
|
|
10
10
|
readme = "README.md"
|
|
11
11
|
rust-version = "1.85"
|
|
@@ -18,7 +18,7 @@ name = "html_to_markdown_rb"
|
|
|
18
18
|
crate-type = ["cdylib", "rlib"]
|
|
19
19
|
|
|
20
20
|
[dependencies]
|
|
21
|
-
html-to-markdown-rs = { version = "2.16.
|
|
21
|
+
html-to-markdown-rs = { version = "2.16.1", features = ["inline-images", "visitor", "metadata"] }
|
|
22
22
|
magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = ["rb-sys"] }
|
|
23
23
|
|
|
24
24
|
[target.'cfg(not(target_os = "windows"))'.dependencies]
|
|
@@ -27,6 +27,8 @@ pprof = { version = "0.15", features = ["flamegraph"], optional = true }
|
|
|
27
27
|
[dev-dependencies]
|
|
28
28
|
pretty_assertions = "1.4"
|
|
29
29
|
[features]
|
|
30
|
-
default = ["metadata"]
|
|
30
|
+
default = ["inline-images", "metadata", "visitor"]
|
|
31
|
+
inline-images = ["html-to-markdown-rs/inline-images"]
|
|
31
32
|
metadata = ["html-to-markdown-rs/metadata"]
|
|
33
|
+
visitor = ["html-to-markdown-rs/visitor"]
|
|
32
34
|
profiling = ["dep:pprof"]
|
|
@@ -8,7 +8,7 @@ Blazing-fast HTML → Markdown conversion for Ruby, powered by the same Rust eng
|
|
|
8
8
|
[](https://pypi.org/project/html-to-markdown/)
|
|
9
9
|
[](https://packagist.org/packages/goldziher/html-to-markdown)
|
|
10
10
|
[](https://rubygems.org/gems/html-to-markdown)
|
|
11
|
-
[](https://github.com/
|
|
11
|
+
[](https://github.com/kreuzberg-dev/html-to-markdown/blob/main/LICENSE)
|
|
12
12
|
|
|
13
13
|
## Features
|
|
14
14
|
|
|
@@ -21,10 +21,10 @@ Blazing-fast HTML → Markdown conversion for Ruby, powered by the same Rust eng
|
|
|
21
21
|
|
|
22
22
|
## Documentation & Support
|
|
23
23
|
|
|
24
|
-
- [GitHub repository](https://github.com/
|
|
25
|
-
- [Issue tracker](https://github.com/
|
|
26
|
-
- [Changelog](https://github.com/
|
|
27
|
-
- [Live demo (WASM)](https://
|
|
24
|
+
- [GitHub repository](https://github.com/kreuzberg-dev/html-to-markdown)
|
|
25
|
+
- [Issue tracker](https://github.com/kreuzberg-dev/html-to-markdown/issues)
|
|
26
|
+
- [Changelog](https://github.com/kreuzberg-dev/html-to-markdown/blob/main/CHANGELOG.md)
|
|
27
|
+
- [Live demo (WASM)](https://kreuzberg-dev.github.io/html-to-markdown/)
|
|
28
28
|
|
|
29
29
|
## Installation
|
|
30
30
|
|