html-to-markdown 2.29.0-arm64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.bundle/config +2 -0
- data/.gitignore +3 -0
- data/.rubocop.yml +29 -0
- data/Gemfile +17 -0
- data/Gemfile.lock +223 -0
- data/README.md +345 -0
- data/Rakefile +32 -0
- data/Steepfile +26 -0
- data/bin/benchmark.rb +232 -0
- data/exe/html-to-markdown +6 -0
- data/html-to-markdown-rb.gemspec +99 -0
- data/lib/html_to_markdown/cli.rb +21 -0
- data/lib/html_to_markdown/cli_proxy.rb +74 -0
- data/lib/html_to_markdown/version.rb +5 -0
- data/lib/html_to_markdown.rb +211 -0
- data/lib/html_to_markdown_rb.bundle +0 -0
- data/sig/html_to_markdown/cli.rbs +24 -0
- data/sig/html_to_markdown/cli_proxy.rbs +48 -0
- data/sig/html_to_markdown.rbs +498 -0
- data/sig/open3.rbs +12 -0
- data/spec/cli_proxy_spec.rb +42 -0
- data/spec/convert_spec.rb +77 -0
- data/spec/convert_with_tables_spec.rb +194 -0
- data/spec/metadata_extraction_spec.rb +437 -0
- data/spec/spec_helper.rb +10 -0
- data/spec/visitor_issue_187_spec.rb +605 -0
- data/spec/visitor_spec.rb +1149 -0
- metadata +80 -0
data/Steepfile
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Steepfile for type checking html-to-markdown Ruby gem
|
|
4
|
+
|
|
5
|
+
# Type-checking target for the gem's Ruby sources: code under lib/ is checked
# against the RBS signatures under sig/.
target :lib do
  signature 'sig'
  check 'lib'

  # Library signatures to load for the checked code.
  %w[pathname open3].each { |name| library name }

  # Paths left out of checking: the vendor, spec and bin directories, plus the
  # internal CLI implementation modules (not public API).
  %w[
    vendor
    spec
    bin
    lib/html_to_markdown/cli.rb
    lib/html_to_markdown/cli_proxy.rb
  ].each { |path| ignore path }
end
|
data/bin/benchmark.rb
ADDED
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
require 'optparse'
|
|
5
|
+
require 'time'
|
|
6
|
+
|
|
7
|
+
$LOAD_PATH.unshift(File.expand_path('../lib', __dir__))
|
|
8
|
+
require 'html_to_markdown'
|
|
9
|
+
|
|
10
|
+
# Escapes +value+ so it can be embedded between double quotes in a JSON document.
#
# The value is converted with #to_s first. Double quotes and backslashes are
# backslash-escaped; newline, carriage return and tab use their short escapes;
# every other control character below U+0020 is emitted as a \u00XX escape,
# because JSON forbids raw control characters inside strings.
#
# @param value [#to_s] value to escape
# @return [String] the escaped text, without surrounding quotes
def json_escape(value)
  value.to_s.gsub(/["\\\x00-\x1f]/) do |char|
    case char
    when '"', '\\' then "\\#{char}"
    when "\n" then '\\n'
    when "\r" then '\\r'
    when "\t" then '\\t'
    else format('\\u%04x', char.ord)
    end
  end
end
|
|
22
|
+
|
|
23
|
+
# Benchmark settings. Each entry holds its default until a command-line switch
# overrides it; :file has no default and is only present when --file is given.
options = {
  iterations: 50,
  format: 'html',
  scenario: 'convert-default',
  visitor: nil
}

# Fill +options+ from ARGV.
OptionParser.new do |parser|
  parser.banner = 'ruby benchmark.rb --file path/to/fixture.html [--iterations 200]'

  parser.on('--file FILE', 'HTML fixture to convert repeatedly') do |file|
    options[:file] = file
  end

  parser.on('--iterations N', Integer, 'Number of conversion iterations (default: 50)') do |n|
    # Zero or negative counts fall back to a single iteration.
    options[:iterations] = n.positive? ? n : 1
  end

  parser.on('--scenario SCENARIO', 'Scenario to benchmark') do |scenario|
    options[:scenario] = scenario
  end

  parser.on('--format FORMAT', 'Fixture format (html or hocr)') do |format|
    options[:format] = format.downcase
  end

  parser.on('--visitor VISITOR', 'Visitor type (noop, simple, custom, complex)') do |visitor|
    if %w[noop simple custom complex].include?(visitor)
      options[:visitor] = visitor
    else
      # An unknown name still leaves the visitor unset, but say so: otherwise a
      # typo silently benchmarks the plain scenario instead of the visitor path.
      warn "Ignoring unknown visitor: #{visitor}"
    end
  end
end.parse!
|
|
53
|
+
|
|
54
|
+
# Writes +message+ to stderr and ends the script with exit status 1.
bail = lambda do |message|
  warn message
  exit 1
end

# --file is mandatory and must name an existing file.
fixture = options.fetch(:file) { bail.call('Missing --file parameter') }
bail.call("Fixture not found: #{fixture}") unless File.exist?(fixture)

# Only the html and hocr fixture formats are accepted.
bail.call("Unsupported format: #{options[:format]}") unless %w[html hocr].include?(options[:format])

# Scenario names this script knows how to run.
supported_scenarios = %w[
  convert-default
  convert-options
  inline-images-default
  inline-images-options
  metadata-default
  metadata-options
]
bail.call("Unsupported scenario: #{options[:scenario]}") unless supported_scenarios.include?(options[:scenario])
|
|
81
|
+
|
|
82
|
+
# Visitor factory functions
|
|
83
|
+
# Builds a visitor hash in which every callback ignores its arguments and
# returns 'continue'.
#
# @return [Hash{Symbol => Proc}] callbacks keyed by visit_* name
def create_noop_visitor
  carry_on = 'continue'

  visitor = {}
  visitor[:visit_text] = proc { |_context, _content| carry_on }
  visitor[:visit_heading] = proc { |_context, _depth, _content, _anchor| carry_on }
  visitor[:visit_paragraph] = proc { |_context, _content| carry_on }
  visitor[:visit_link] = proc { |_context, _target, _content, _tooltip| carry_on }
  visitor[:visit_image] = proc { |_context, _source, _alternative, _tooltip| carry_on }
  visitor[:visit_strong] = proc { |_context, _content| carry_on }
  visitor[:visit_em] = proc { |_context, _content| carry_on }
  visitor[:visit_code] = proc { |_context, _content| carry_on }
  visitor[:visit_br] = proc { |_context| carry_on }
  visitor
end
|
|
96
|
+
|
|
97
|
+
# Builds a visitor hash that carries three zeroed counter entries ahead of the
# callbacks. Every callback returns 'continue'; none of the callbacks defined
# here modifies the counters.
#
# @return [Hash{Symbol => Integer, Proc}] counters followed by visit_* callbacks
def create_simple_visitor
  counters = { text_count: 0, link_count: 0, image_count: 0 }
  carry_on = 'continue'

  callbacks = {
    visit_text: proc { |_context, _content| carry_on },
    visit_heading: proc { |_context, _depth, _content, _anchor| carry_on },
    visit_paragraph: proc { |_context, _content| carry_on },
    visit_link: proc { |_context, _target, _content, _tooltip| carry_on },
    visit_image: proc { |_context, _source, _alternative, _tooltip| carry_on },
    visit_strong: proc { |_context, _content| carry_on },
    visit_em: proc { |_context, _content| carry_on },
    visit_code: proc { |_context, _content| carry_on },
    visit_br: proc { |_context| carry_on }
  }

  counters.merge(callbacks)
end
|
|
113
|
+
|
|
114
|
+
# Builds a visitor hash that supplies custom output for links and images and
# returns 'continue' for every other node type.
#
# Links produce ['custom', "LINK[text](href)"]. Images produce
# ['custom', "![alt](src)"], built from the alt and src arguments the callback
# receives (previously the replacement string was empty and both arguments
# went unused).
#
# @return [Hash{Symbol => Proc}] callbacks keyed by visit_* name
def create_custom_visitor
  {
    visit_text: proc { |_ctx, _text| 'continue' },
    visit_heading: proc { |_ctx, _level, _text, _id| 'continue' },
    visit_paragraph: proc { |_ctx, _text| 'continue' },
    visit_link: proc { |_ctx, href, text, _title| ['custom', "LINK[#{text}](#{href})"] },
    visit_image: proc { |_ctx, src, alt, _title| ['custom', "![#{alt}](#{src})"] },
    visit_strong: proc { |_ctx, _text| 'continue' },
    visit_em: proc { |_ctx, _text| 'continue' },
    visit_code: proc { |_ctx, _text| 'continue' },
    visit_br: proc { |_ctx| 'continue' }
  }
end
|
|
127
|
+
|
|
128
|
+
# Builds a visitor hash that mixes the three kinds of callback result used in
# this script: links yield ['custom', "[text](href)"], images yield 'skip', and
# all other callbacks yield 'continue'. Four zeroed counter entries come first;
# none of the callbacks defined here modifies them.
#
# @return [Hash{Symbol => Integer, Proc}] counters followed by visit_* callbacks
def create_complex_visitor
  tallies = { texts: 0, links: 0, images: 0, headings: 0 }
  carry_on = 'continue'

  callbacks = {
    visit_text: proc { |_context, _content| carry_on },
    visit_heading: proc { |_context, _depth, _content, _anchor| carry_on },
    visit_paragraph: proc { |_context, _content| carry_on },
    visit_link: proc { |_context, target, label, _tooltip| ['custom', "[#{label}](#{target})"] },
    visit_image: proc { |_context, _source, _alternative, _tooltip| 'skip' },
    visit_strong: proc { |_context, _content| carry_on },
    visit_em: proc { |_context, _content| carry_on },
    visit_code: proc { |_context, _content| carry_on },
    visit_br: proc { |_context| carry_on }
  }

  tallies.merge(callbacks)
end
|
|
145
|
+
|
|
146
|
+
# Read the fixture once as raw bytes, label it as UTF-8 and freeze it so every
# run receives the same immutable string.
html = File.binread(fixture).force_encoding(Encoding::UTF_8).freeze
iterations = options[:iterations]

# hOCR fixtures are converted with hocr_spatial_tables switched off; HTML
# fixtures use an empty option set.
conversion_options = {}
conversion_options = { hocr_spatial_tables: false } if options[:format] == 'hocr'

# Only the "*-options" scenarios get an options handle; for all others it is nil.
handle_scenarios = %w[convert-options inline-images-options metadata-options]
options_handle = handle_scenarios.include?(options[:scenario]) ? HtmlToMarkdown.options(conversion_options) : nil

# Create visitor if specified; stays nil when --visitor was not given.
visitor =
  case options[:visitor]
  when 'noop' then create_noop_visitor
  when 'simple' then create_simple_visitor
  when 'custom' then create_custom_visitor
  when 'complex' then create_complex_visitor
  end
|
|
167
|
+
|
|
168
|
+
# Returns +handle+ when it is truthy and raises ArgumentError otherwise; shared
# by the runners that cannot work without an options handle.
require_handle = ->(handle) { handle || raise(ArgumentError, 'options handle required') }

# Maps each scenario name to a lambda taking (html, options, handle, visitor)
# that performs one conversion through the matching HtmlToMarkdown entry point.
# The "*-default" runners pass nil for options and config; the "*-options"
# runners insist on an options handle before converting.
SCENARIO_RUNNERS = {
  'convert-default' => lambda { |html, _options, _handle, _visitor|
    HtmlToMarkdown.convert(html)
  },
  'convert-options' => lambda { |html, _options, handle, _visitor|
    checked = require_handle.call(handle)
    HtmlToMarkdown.convert_with_options(html, checked)
  },
  'inline-images-default' => lambda { |html, _options, _handle, _visitor|
    HtmlToMarkdown.convert_with_inline_images(html, nil, nil)
  },
  'inline-images-options' => lambda { |html, _options, handle, _visitor|
    checked = require_handle.call(handle)
    HtmlToMarkdown.convert_with_inline_images_handle(html, checked, nil)
  },
  'metadata-default' => lambda { |html, _options, _handle, _visitor|
    HtmlToMarkdown.convert_with_metadata(html, nil, nil)
  },
  'metadata-options' => lambda { |html, _options, handle, _visitor|
    checked = require_handle.call(handle)
    HtmlToMarkdown.convert_with_metadata_handle(html, checked, nil)
  }
}.freeze
|
|
190
|
+
|
|
191
|
+
# Performs a single conversion of +html+.
#
# When +visitor+ is truthy the scenario is bypassed and the HTML goes through
# HtmlToMarkdown.convert_with_visitor with nil options. Otherwise the runner
# registered for +scenario+ in SCENARIO_RUNNERS is invoked.
#
# @param html [String] markup to convert
# @param scenario [String] key into SCENARIO_RUNNERS
# @param options [Hash] conversion options forwarded to the runner
# @param handle [Object, nil] options handle forwarded to the runner
# @param visitor [Hash, nil] optional visitor callbacks
# @return [Object] whatever the selected conversion returns
# @raise [ArgumentError] if +scenario+ has no registered runner
def run_scenario(html, scenario, options, handle, visitor = nil)
  return HtmlToMarkdown.convert_with_visitor(html, nil, visitor) if visitor

  runner = SCENARIO_RUNNERS.fetch(scenario) do
    raise ArgumentError, "Unsupported scenario: #{scenario}"
  end
  runner.call(html, options, handle, visitor)
end
|
|
199
|
+
|
|
200
|
+
# One untimed run ahead of the measured loop (presumably a warm-up).
run_scenario(html, options[:scenario], conversion_options, options_handle, visitor)

# Profiling is optional: it starts only when HTML_TO_MARKDOWN_PROFILE_OUTPUT is
# set and HtmlToMarkdown responds to start_profiling. The sampling frequency is
# read from HTML_TO_MARKDOWN_PROFILE_FREQUENCY as a base-10 integer (default 1000).
profile_output = ENV.fetch('HTML_TO_MARKDOWN_PROFILE_OUTPUT', nil)
if profile_output && HtmlToMarkdown.respond_to?(:start_profiling)
  freq = Integer(ENV.fetch('HTML_TO_MARKDOWN_PROFILE_FREQUENCY', '1000'), 10)
  HtmlToMarkdown.start_profiling(profile_output, freq)
end

# Time the whole loop with the monotonic clock.
started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
iterations.times do
  run_scenario(html, options[:scenario], conversion_options, options_handle, visitor)
end
elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at

HtmlToMarkdown.stop_profiling if profile_output && HtmlToMarkdown.respond_to?(:stop_profiling)

# Throughput figures derived from the fixture size and the elapsed time.
payload_size_bytes = html.bytesize
bytes_processed = payload_size_bytes * iterations
ops_per_sec = iterations / elapsed
mb_per_sec = bytes_processed.to_f / (1024 * 1024) / elapsed

# The report is one JSON object printed to stdout, one member per line.
report_fields = [
  '"language":"ruby"',
  %("fixture":"#{json_escape(File.basename(fixture))}"),
  %("fixture_path":"#{json_escape(fixture)}"),
  %("scenario":"#{json_escape(options[:scenario])}"),
  %("iterations":#{iterations}),
  %("elapsed_seconds":#{format('%.8f', elapsed)}),
  %("ops_per_sec":#{format('%.4f', ops_per_sec)}),
  %("mb_per_sec":#{format('%.4f', mb_per_sec)}),
  %("bytes_processed":#{bytes_processed}),
  %("payload_size_bytes":#{payload_size_bytes})
]
payload = "{\n#{report_fields.join(",\n")}\n}"

puts payload.strip
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'lib/html_to_markdown/version'
|
|
4
|
+
|
|
5
|
+
# The gem lives in packages/ruby/ inside a larger repository, so git reports
# its files with that prefix when queried from the repository root.
repo_root = File.expand_path('../..', __dir__)
crate_prefix = 'packages/ruby/'

# List the tracked files below the gem directory. The command is passed as an
# argument vector, so repo_root is never parsed by a shell, and git's stderr is
# discarded. If git cannot be started at all, the listing is treated as empty
# and the glob-based fallback below takes over.
git_listing =
  begin
    IO.popen(['git', '-C', repo_root, 'ls-files', '-z', crate_prefix], err: File::NULL, &:read)
  rescue SystemCallError
    nil
  end
ruby_files =
  git_listing.to_s.split("\x0")
             .select { |path| path.start_with?(crate_prefix) }
             .map { |path| path.delete_prefix(crate_prefix) }

# Glob-based file list used whenever git returned nothing.
fallback_files = Dir.chdir(__dir__) do
  Dir.glob(
    %w[
      README.md
      ext/**/*
      exe/*
      lib/**/*.rb
      lib/bin/*
      src/**/*.rs
      spec/**/*.rb
      sig/**/*.rbs
    ]
  )
end

# Vendor files: include vendored crates and workspace Cargo.toml.
# Build output (target/) and editor/backup leftovers are skipped.
vendor_files = Dir.chdir(__dir__) do
  Dir.glob('vendor/html-to-markdown-rs/**/*', File::FNM_DOTMATCH)
     .select { |f| File.file?(f) }
     .grep_v(%r{/target/})
     .grep_v(/\.(swp|bak|tmp)$/)
end

# Include vendor/Cargo.toml (workspace definition) if it exists
workspace_toml = File.exist?(File.join(__dir__, 'vendor/Cargo.toml')) ? ['vendor/Cargo.toml'] : []

# When vendor exists, use ext/ files from filesystem (modified by vendor script)
# instead of git (which has the unmodified Cargo.toml with workspace paths)
ext_files_from_fs = Dir.chdir(__dir__) do
  Dir.glob('ext/**/*', File::FNM_DOTMATCH)
     .reject { |f| File.directory?(f) }
     .reject { |f| f.include?('/target/') }
end

# Include native artifacts (.so, .bundle, .dylib) if present (for platform gems)
native_files = Dir.chdir(__dir__) do
  Dir.glob('lib/**/*.{so,bundle,dylib}')
end

# Prefer git's listing and fall back to the globs when it is empty.
tracked_or_fallback = ruby_files.empty? ? fallback_files : ruby_files

files = if vendor_files.any?
          # Vendor exists: use ext/ from filesystem (has modified Cargo.toml)
          non_ext_ruby_files = tracked_or_fallback.reject { |f| f.start_with?('ext/') }
          non_ext_ruby_files + ext_files_from_fs + vendor_files + workspace_toml + native_files
        else
          tracked_or_fallback
        end

files = files.uniq
|
|
66
|
+
|
|
67
|
+
# Gem specification. The packaged file list comes from the `files` local
# computed earlier in this gemspec.
Gem::Specification.new do |gem|
  project_url = 'https://github.com/kreuzberg-dev/html-to-markdown'

  # Identity
  gem.name = 'html-to-markdown'
  gem.version = HtmlToMarkdown::VERSION
  gem.authors = ["Na'aman Hirschfeld"]
  gem.email = ['nhirschfeld@gmail.com']

  # Descriptive text shown on the registry
  gem.summary = 'Blazing-fast HTML to Markdown conversion for Ruby, powered by Rust.'
  gem.description = [
    'html-to-markdown is a native Ruby extension built on the shared Rust engine that powers the html-to-markdown project.',
    'It delivers identical HTML-to-Markdown output across languages, exposes inline image extraction, and ships with a CLI for automation workflows.'
  ].join("\n")
  gem.homepage = project_url
  gem.license = 'MIT'

  gem.required_ruby_version = Gem::Requirement.new('>= 3.2')

  # Executable and load path layout
  gem.bindir = 'exe'
  gem.executables = ['html-to-markdown']
  gem.require_paths = ['lib']

  gem.files = files
  gem.extra_rdoc_files = ['README.md']

  # Native extension entry point
  gem.extensions = ['ext/html-to-markdown-rb/extconf.rb']

  gem.add_dependency 'rb_sys', '>= 0.9', '< 1.0'

  gem.metadata = {
    'rubygems_mfa_required' => 'true',
    'homepage_uri' => project_url,
    'source_code_uri' => project_url,
    'bug_tracker_uri' => "#{project_url}/issues",
    'changelog_uri' => "#{project_url}/releases",
    'documentation_uri' => "#{project_url}/blob/main/packages/ruby/README.md"
  }
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'html_to_markdown/cli_proxy'
|
|
4
|
+
|
|
5
|
+
module HtmlToMarkdown
  # Thin command-line front end: forwards arguments to CLIProxy and turns the
  # outcome into output on the given streams plus an integer exit status.
  module CLI
    module_function

    # Runs the CLI with +argv+.
    #
    # @param argv [Array<String>] arguments handed to CLIProxy.call
    # @param stdout [IO] receives the output of a successful run
    # @param stderr [IO] receives error output
    # @return [Integer] 0 on success; the status carried by a
    #   CLIProxy::CLIExecutionError (1 when it carries none); 1 for any other
    #   CLIProxy::Error
    def run(argv = ARGV, stdout: $stdout, stderr: $stderr)
      stdout.print(CLIProxy.call(argv))
      0
    rescue CLIProxy::CLIExecutionError => error
      # Relay what the failed run wrote to its own stderr, unchanged.
      stderr.print(error.stderr)
      error.status || 1
    rescue CLIProxy::Error => error
      # Covers MissingBinaryError too, which is a CLIProxy::Error subclass.
      stderr.puts(error.message)
      1
    end
  end
end
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'open3'
|
|
4
|
+
require 'pathname'
|
|
5
|
+
|
|
6
|
+
module HtmlToMarkdown
  # Finds the compiled html-to-markdown CLI binary on disk and runs it as a
  # child process, returning what it printed.
  module CLIProxy
    # Base class for every error raised by this module.
    class Error < StandardError
    end

    # Raised when no CLI binary exists at any of the searched locations.
    class MissingBinaryError < Error
    end

    # Raised when the CLI process does not finish successfully.
    class CLIExecutionError < Error
      # stderr: text the process wrote to its standard error.
      # status: exit status of the process (nil when Process::Status reports
      # no exit status).
      attr_reader :stderr, :status

      # @param message [String] error message
      # @param stderr [String] captured standard error of the process
      # @param status [Integer, nil] exit status of the process
      def initialize(message, stderr:, status:)
        super(message)
        @stderr = stderr
        @status = status
      end
    end

    module_function

    # Runs the CLI binary with +argv+ and returns its standard output.
    #
    # @param argv [Array<#to_s>, #to_s, nil] arguments; each is converted with #to_s
    # @return [String] standard output of a successful run
    # @raise [MissingBinaryError] if the binary cannot be found
    # @raise [CLIExecutionError] if the process is unsuccessful
    def call(argv)
      binary = find_cli_binary.to_s
      args = Array(argv).map(&:to_s)
      # The [path, argv0] pair makes Ruby execute the file directly. A bare
      # command string with no further arguments would instead be split on
      # whitespace or passed to a shell, which breaks when the binary lives
      # under a path containing spaces or shell metacharacters.
      stdout, stderr, status = Open3.capture3([binary, binary], *args)
      return stdout if status.success?

      raise CLIExecutionError.new(
        "html-to-markdown CLI exited with status #{status.exitstatus}",
        stderr: stderr,
        status: status.exitstatus
      )
    end

    # Returns the first candidate path that is an existing regular file.
    #
    # @return [Pathname] location of the CLI binary
    # @raise [MissingBinaryError] if none of the candidates exists
    def find_cli_binary
      binary_name = Gem.win_platform? ? 'html-to-markdown.exe' : 'html-to-markdown'
      found = search_paths(binary_name).find(&:file?)
      return found if found

      raise MissingBinaryError, missing_binary_message
    end

    # Directory two levels above this file (memoized).
    #
    # @return [Pathname]
    def root_path
      @root_path ||= Pathname(__dir__.to_s).join('../..').expand_path
    end

    # Directory one level above this file (memoized).
    #
    # @return [Pathname]
    def lib_path
      @lib_path ||= Pathname(__dir__.to_s).join('..').expand_path
    end

    # Candidate locations for +binary_name+, in lookup order: the release
    # build directory under root_path, two locations under lib_path, and the
    # release build directory two levels above root_path.
    #
    # @param binary_name [String] file name of the CLI binary
    # @return [Array<Pathname>]
    def search_paths(binary_name)
      # Pathname#parent always returns a Pathname, so no nil guard is needed.
      workspace_root = root_path.parent.parent

      [
        root_path.join('target', 'release', binary_name),
        lib_path.join('bin', binary_name),
        lib_path.join(binary_name),
        workspace_root.join('target', 'release', binary_name)
      ]
    end

    # Message used for MissingBinaryError.
    #
    # @return [String]
    def missing_binary_message
      <<~MSG.strip
        html-to-markdown CLI binary not found. Build it with
        `cargo build --release --package html-to-markdown-cli`.
      MSG
    end
  end
end
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'html_to_markdown/version'
|
|
4
|
+
require 'html_to_markdown_rb'
|
|
5
|
+
|
|
6
|
+
# Ruby-facing API of the gem. The conversion routines are expected to have been
# defined on this module by the native extension before this file's body runs;
# they are kept under native_* aliases and wrapped by the methods below, which
# convert the +html+ argument with #to_s before delegating.
module HtmlToMarkdown
  autoload :CLI, 'html_to_markdown/cli'
  autoload :CLIProxy, 'html_to_markdown/cli_proxy'

  class Options; end # rubocop:disable Lint/EmptyClass

  # Keep every already-defined singleton method reachable as native_<name>.
  # This must happen before the wrappers below replace the public names.
  %i[
    convert
    convert_with_inline_images
    convert_with_inline_images_handle
    options
    convert_with_options
    convert_with_metadata
    convert_with_metadata_handle
    convert_with_visitor
    convert_with_tables
  ].each do |native_name|
    singleton_class.alias_method(:"native_#{native_name}", native_name)
  end

  module_function

  # Converts +html+, routing through the visitor-aware native call when a
  # visitor is supplied and through the plain native call otherwise.
  #
  # @param html [#to_s] markup to convert
  # @param options [Object, nil] conversion options passed through unchanged
  # @param visitor [Object, nil] visitor callbacks; a falsy value selects the plain path
  # @return [Object] result of the native conversion
  def convert(html, options = nil, visitor = nil)
    source = html.to_s
    return native_convert_with_visitor(source, options, visitor) if visitor

    native_convert(source, options)
  end

  # Converts +html+ using an options handle (see #options).
  #
  # @param html [#to_s] markup to convert
  # @param options_handle [Object] handle passed through unchanged
  # @return [Object] result of the native conversion
  def convert_with_options(html, options_handle)
    source = html.to_s
    native_convert_with_options(source, options_handle)
  end

  # Converts +html+ through the native inline-image conversion.
  #
  # NOTE: the visitor parameter is accepted for API compatibility but is not
  # used in inline images mode. The visitor pattern is only supported in the
  # standard convert() method.
  #
  # @param html [#to_s] markup to convert
  # @param options [Object, nil] conversion options passed through unchanged
  # @param image_config [Object, nil] image configuration passed through unchanged
  # @param _visitor [Object, nil] ignored
  # @return [Object] result of the native conversion
  def convert_with_inline_images(html, options = nil, image_config = nil, _visitor = nil)
    source = html.to_s
    native_convert_with_inline_images(source, options, image_config)
  end

  # Same as #convert_with_inline_images, but takes an options handle.
  #
  # @param html [#to_s] markup to convert
  # @param options_handle [Object] handle passed through unchanged
  # @param image_config [Object, nil] image configuration passed through unchanged
  # @return [Object] result of the native conversion
  def convert_with_inline_images_handle(html, options_handle, image_config = nil)
    source = html.to_s
    native_convert_with_inline_images_handle(source, options_handle, image_config)
  end

  # Delegates to the native options constructor.
  #
  # @param options_hash [Hash, nil] option values passed through unchanged
  # @return [Object] whatever the native call returns
  def options(options_hash = nil)
    native_options(options_hash)
  end

  # Convert HTML to Markdown with comprehensive metadata extraction.
  #
  # Performs HTML-to-Markdown conversion while extracting document metadata, headers,
  # links, images, and structured data in a single pass. Ideal for content analysis,
  # SEO workflows, and document indexing.
  #
  # @param html [String] HTML string to convert. Line endings are normalized (CRLF -> LF).
  # @param options [ConversionOptions, Hash, nil] Optional conversion configuration.
  #   When a Hash, keys should match ConversionOptions field names (as symbols or strings).
  #   Common options:
  #   - :heading_style [String] "atx", "atx_closed", or "underlined" (default: "underlined")
  #   - :list_indent_type [String] "spaces" or "tabs" (default: "spaces")
  #   - :list_indent_width [Integer] Spaces per indent level (default: 4)
  #   - :wrap [true, false] Enable text wrapping (default: false)
  #   - :wrap_width [Integer] Wrap at this column width (default: 80)
  #   See ConversionOptions documentation for complete list.
  #
  # @param metadata_config [Hash, nil] Optional metadata extraction configuration.
  #   Keys should be symbols or strings. Supported keys:
  #   - :extract_headers [true, false] Extract h1-h6 heading elements (default: true)
  #   - :extract_links [true, false] Extract hyperlinks with type classification (default: true)
  #   - :extract_images [true, false] Extract image elements (default: true)
  #   - :extract_structured_data [true, false] Extract JSON-LD/Microdata/RDFa (default: true)
  #   - :max_structured_data_size [Integer] Size limit for structured data in bytes (default: 1_000_000)
  #
  # @param _visitor [Object, nil] Accepted for API compatibility but not used in
  #   metadata extraction mode; the visitor pattern is only supported in the
  #   standard convert() method.
  #
  # @return [Array<String, Hash>] Tuple of [markdown_string, metadata_hash]
  #   markdown_string: String - The converted Markdown output
  #
  #   metadata_hash: Hash with keys:
  #   - :document [Hash] Document-level metadata:
  #     - :title [String, nil] From <title> tag
  #     - :description [String, nil] From <meta name="description">
  #     - :keywords [Array<String>] From <meta name="keywords">
  #     - :author [String, nil] From <meta name="author">
  #     - :language [String, nil] From lang attribute (e.g., "en")
  #     - :text_direction [String, nil] "ltr", "rtl", or "auto"
  #     - :canonical_url [String, nil] From <link rel="canonical">
  #     - :base_href [String, nil] From <base href="">
  #     - :open_graph [Hash<String, String>] Open Graph properties (og:* meta tags)
  #     - :twitter_card [Hash<String, String>] Twitter Card properties (twitter:* meta tags)
  #     - :meta_tags [Hash<String, String>] Other meta tags
  #
  #   - :headers [Array<Hash>] Heading elements:
  #     - :level [Integer] 1-6
  #     - :text [String] Header text content
  #     - :id [String, nil] HTML id attribute
  #     - :depth [Integer] Tree nesting depth
  #     - :html_offset [Integer] Byte offset in original HTML
  #
  #   - :links [Array<Hash>] Hyperlinks:
  #     - :href [String] Link URL
  #     - :text [String] Link text content
  #     - :title [String, nil] Title attribute
  #     - :link_type [String] "anchor", "internal", "external", "email", "phone", or "other"
  #     - :rel [Array<String>] Rel attribute values
  #     - :attributes [Hash<String, String>] Additional HTML attributes
  #
  #   - :images [Array<Hash>] Image elements:
  #     - :src [String] Image source URL or data URI
  #     - :alt [String, nil] Alt text for accessibility
  #     - :title [String, nil] Title attribute
  #     - :dimensions [Array<Integer>, nil] [width, height] if available
  #     - :image_type [String] "data_uri", "external", "relative", or "inline_svg"
  #     - :attributes [Hash<String, String>] Additional HTML attributes
  #
  #   - :structured_data [Array<Hash>] Structured data blocks:
  #     - :data_type [String] "json_ld", "microdata", or "rdfa"
  #     - :raw_json [String] Raw JSON content
  #     - :schema_type [String, nil] Schema type (e.g., "Article", "Event")
  #
  # @raise [StandardError] If conversion fails or invalid configuration
  #
  # @example Basic usage
  #   html = <<~HTML
  #     <html lang="en">
  #       <head>
  #         <title>My Article</title>
  #         <meta name="description" content="A great read">
  #       </head>
  #       <body>
  #         <h1 id="intro">Introduction</h1>
  #         <p>Visit <a href="https://example.com">our site</a></p>
  #         <img src="photo.jpg" alt="Beautiful landscape">
  #       </body>
  #     </html>
  #   HTML
  #
  #   markdown, metadata = HtmlToMarkdown.convert_with_metadata(html)
  #
  #   puts metadata[:document][:title]        # => "My Article"
  #   puts metadata[:document][:language]     # => "en"
  #   puts metadata[:headers].length          # => 1
  #   puts metadata[:headers][0][:text]       # => "Introduction"
  #   puts metadata[:links].length            # => 1
  #   puts metadata[:images].length           # => 1
  #
  # @example With selective metadata extraction
  #   config = {
  #     extract_headers: true,
  #     extract_links: true,
  #     extract_images: false,           # Skip images
  #     extract_structured_data: false   # Skip structured data
  #   }
  #
  #   markdown, metadata = HtmlToMarkdown.convert_with_metadata(html, nil, config)
  #   puts metadata[:images].empty?     # => true (not extracted)
  #
  # @example With conversion options
  #   options = {
  #     heading_style: "atx",    # Use # H1, ## H2 style
  #     wrap: true,
  #     wrap_width: 80
  #   }
  #
  #   config = { extract_headers: true }
  #
  #   markdown, metadata = HtmlToMarkdown.convert_with_metadata(html, options, config)
  #   # Markdown uses ATX-style headings and wraps at 80 characters
  #
  # @see #convert Simple conversion without metadata
  # @see #convert_with_inline_images Extract inline images during conversion
  # @see ConversionOptions Detailed conversion configuration
  def convert_with_metadata(html, options = nil, metadata_config = nil, _visitor = nil)
    source = html.to_s
    native_convert_with_metadata(source, options, metadata_config)
  end

  # Same as #convert_with_metadata, but takes an options handle.
  #
  # @param html [#to_s] markup to convert
  # @param options_handle [Object] handle passed through unchanged
  # @param metadata_config [Hash, nil] metadata configuration passed through unchanged
  # @return [Object] result of the native conversion
  def convert_with_metadata_handle(html, options_handle, metadata_config = nil)
    source = html.to_s
    native_convert_with_metadata_handle(source, options_handle, metadata_config)
  end

  # Convert HTML to Markdown with table extraction.
  #
  # Performs HTML-to-Markdown conversion while extracting structured table data
  # (cells, markdown representation, header row flags) in a single pass.
  #
  # @param html [String] HTML string to convert.
  # @param options [Hash, nil] Optional conversion configuration.
  # @param metadata_config [Hash, nil] Optional metadata extraction configuration.
  #
  # @return [Hash] A hash with keys:
  #   - :content [String] The converted Markdown output
  #   - :metadata [Hash, nil] Extended metadata (if metadata extraction was configured)
  #   - :tables [Array<Hash>] Extracted tables, each with:
  #     - :cells [Array<Array<String>>] Table cells organized as rows x columns
  #     - :markdown [String] Complete rendered table in Markdown format
  #     - :is_header_row [Array<Boolean>] Per-row flag indicating header rows
  #
  # @raise [StandardError] If conversion fails or invalid configuration
  #
  # @example Basic usage
  #   html = '<table><thead><tr><th>Name</th></tr></thead><tbody><tr><td>Alice</td></tr></tbody></table>'
  #   result = HtmlToMarkdown.convert_with_tables(html)
  #   puts result[:tables].length          # => 1
  #   puts result[:tables][0][:cells]      # => [["Name"], ["Alice"]]
  def convert_with_tables(html, options = nil, metadata_config = nil)
    source = html.to_s
    native_convert_with_tables(source, options, metadata_config)
  end
end
|
|
Binary file
|