html-to-markdown 2.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 9e0e6194a6f1f081dd93f13abb5e299118f8e92bb5a7f706d6dc3359439dd5ae
4
+ data.tar.gz: 7146575639b67ad477a74640d5cd8d19219bb58a60c0d1311595fd46ee9be2ad
5
+ SHA512:
6
+ metadata.gz: 0d55454f4a640146ddbe4ed0173f140901a9bfc6c1333827a51c5f404aed8b38d7c90bd6910e15aef0dd2da25f91fbfe6c08b1b5e914e6fc2d466caca148d1f2
7
+ data.tar.gz: 88fbedd30ddacdbac50be232944ff331edf344129d91c9d0799d999443833f0726a963881e6e608892923a115d7e3b01cf041443f46363059ab2e68a9e6cde2f
data/Cargo.toml ADDED
@@ -0,0 +1,28 @@
1
+ [package]
2
+ name = "html-to-markdown-rb"
3
+ version.workspace = true
4
+ edition.workspace = true
5
+ authors.workspace = true
6
+ license.workspace = true
7
+ repository.workspace = true
8
+ homepage.workspace = true
9
+ documentation.workspace = true
10
+ readme = "README.md"
11
+ rust-version.workspace = true
12
+ description = "Ruby bindings (Magnus) for html-to-markdown - high-performance HTML to Markdown converter"
13
+ keywords = ["html", "markdown", "ruby", "magnus", "bindings"]
14
+ categories = ["api-bindings"]
15
+
16
+ [lib]
17
+ name = "html_to_markdown_rb"
18
+ crate-type = ["cdylib", "rlib"]
19
+
20
+ [features]
21
+ default = []
22
+
23
+ [dependencies]
24
+ html-to-markdown-rs = { workspace = true, features = ["inline-images"] }
25
+ magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = ["rb-sys"] }
26
+
27
+ [dev-dependencies]
28
+ pretty_assertions = "1.4"
data/README.md ADDED
@@ -0,0 +1,146 @@
1
+ # html-to-markdown-rb
2
+
3
+ Ruby bindings for the `html-to-markdown` Rust engine – the same core that powers the Python wheels, Node.js NAPI bindings, WebAssembly package, and CLI. The gem exposes fast HTML → Markdown conversion with identical rendering behaviour across every supported language.
4
+
5
+ [![RubyGems](https://badge.fury.io/rb/html-to-markdown.svg)](https://rubygems.org/gems/html-to-markdown)
6
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/Goldziher/html-to-markdown/blob/main/LICENSE)
7
+
8
+ ## Installation
9
+
10
+ ```bash
11
+ bundle add html-to-markdown
12
+ # or
13
+ gem install html-to-markdown
14
+ ```
15
+
16
+ Add the gem to your project and Bundler will compile the native Rust extension on first install.
17
+
18
+ ### Requirements
19
+
20
+ - Ruby **3.2+** (Magnus relies on the fiber scheduler APIs added in 3.2)
21
+ - Rust toolchain **1.85+** with Cargo available on your `$PATH`
22
+ - Ruby development headers (`ruby-dev`, `ruby-devel`, or the platform equivalent)
23
+
24
+ **Windows**: install [RubyInstaller with MSYS2](https://rubyinstaller.org/) (UCRT64). Run once:
25
+
26
+ ```powershell
27
+ ridk exec pacman -S --needed --noconfirm base-devel mingw-w64-ucrt-x86_64-toolchain
28
+ ```
29
+
30
+ This provides the standard headers (including `strings.h`) required for the bindgen step.
31
+
32
+ ## Quick Start
33
+
34
+ ```ruby
35
+ require 'html_to_markdown'
36
+
37
+ html = <<~HTML
38
+ <h1>Welcome</h1>
39
+ <p>This is <strong>Rust-fast</strong> conversion!</p>
40
+ <ul>
41
+ <li>Native extension</li>
42
+ <li>Identical output across languages</li>
43
+ </ul>
44
+ HTML
45
+
46
+ markdown = HtmlToMarkdown.convert(html)
47
+ puts markdown
48
+ # # Welcome
49
+ #
50
+ # This is **Rust-fast** conversion!
51
+ #
52
+ # - Native extension
53
+ # - Identical output across languages
54
+ ```
55
+
56
+ ### Conversion with Options
57
+
58
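+ All configuration mirrors the Rust API. Options are passed as a Hash (or trailing keyword-style arguments); both the option keys and enum-like values such as `heading_style` may be given as symbols or strings, and the defaults match the other bindings.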
59
+
60
+ ```ruby
61
+ require 'html_to_markdown'
62
+
63
+ markdown = HtmlToMarkdown.convert(
64
+ '<pre><code class="language-ruby">puts "hi"</code></pre>',
65
+ heading_style: :atx,
66
+ code_block_style: :backticks,
67
+ bullets: '*-+',
68
+ wrap: true,
69
+ wrap_width: 80,
70
+ preserve_tags: %w[table figure]
71
+ )
72
+ ```
73
+
74
+ ### Inline Images
75
+
76
+ Extract inline binary data (data URIs, SVG) together with the converted Markdown.
77
+
78
+ ```ruby
79
+ require 'html_to_markdown'
80
+
81
+ result = HtmlToMarkdown.convert_with_inline_images(
82
+ '<img src="..." alt="Pixel">',
83
+ nil, { # no conversion options; the image config is the third positional argument
84
+ max_decoded_size_bytes: 1 * 1024 * 1024,
85
+ infer_dimensions: true,
86
+ filename_prefix: 'img_',
87
+ capture_svg: true
88
+ }
89
+ )
90
+
91
+ puts result[:markdown]
92
+ result[:inline_images].each do |img|
93
+ puts "#{img[:filename]} -> #{img[:format]} (#{img[:data].bytesize} bytes)"
94
+ end
95
+ ```
96
+
97
+ ### CLI Proxy
98
+
99
+ The gem bundles a small proxy for the Rust CLI binary. Use it when you need parity with the standalone `html-to-markdown` executable.
100
+
101
+ ```ruby
102
+ require 'html_to_markdown/cli'
103
+
104
+ HtmlToMarkdown::CLI.run(%w[--heading-style atx input.html], stdout: $stdout)
105
+ # => writes converted Markdown to STDOUT
106
+ ```
107
+
108
+ You can also call the CLI binary directly for scripting:
109
+
110
+ ```ruby
111
+ HtmlToMarkdown::CLIProxy.call(['--version'])
112
+ # => "html-to-markdown 2.5.1"
113
+ ```
114
+
115
+ ### Error Handling
116
+
117
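+ Conversion errors raise `ArgumentError` for invalid option values or configuration and `RuntimeError` for any other conversion failure; the message carries the Rust error context. `HtmlToMarkdown::CLIProxy.call` raises its own error classes, both of which inherit from `HtmlToMarkdown::CLIProxy::Error`: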
118
+
119
+ - `HtmlToMarkdown::CLIProxy::MissingBinaryError`
120
+ - `HtmlToMarkdown::CLIProxy::CLIExecutionError`
121
+
122
+ Rescue them to provide clearer feedback in your application.
123
+
124
+ ## Consistent Across Languages
125
+
126
+ The Ruby gem shares the exact Rust core with:
127
+
128
+ - [Python wheels](https://pypi.org/project/html-to-markdown/)
129
+ - [Node.js / Bun bindings](https://www.npmjs.com/package/html-to-markdown-node)
130
+ - [WebAssembly package](https://www.npmjs.com/package/html-to-markdown-wasm)
131
+ - The Rust crate and CLI
132
+
133
+ Use whichever runtime fits your stack while keeping formatting behaviour identical.
134
+
135
+ ## Development
136
+
137
+ ```bash
138
+ bundle exec rake compile # build the native extension
139
+ bundle exec rspec # run test suite
140
+ ```
141
+
142
+ The extension uses [Magnus](https://github.com/matsadler/magnus) plus `rb-sys` for bindgen. When editing the Rust code under `src/`, rerun `rake compile`.
143
+
144
+ ## License
145
+
146
+ MIT © Na'aman Hirschfeld
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require 'html_to_markdown/cli'
5
+
6
+ exit HtmlToMarkdown::CLI.run(ARGV)
data/extconf.rb ADDED
@@ -0,0 +1,27 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'mkmf'
4
+ require 'rb_sys/mkmf'
5
+ require 'rbconfig'
6
+
7
+ if RbConfig::CONFIG['host_os'] =~ /mswin|mingw/
8
+ devkit = ENV['RI_DEVKIT']
9
+ prefix = ENV['MSYSTEM_PREFIX'] || '/ucrt64'
10
+
11
+ if devkit
12
+ sysroot = "#{devkit}#{prefix}".tr('\\\\', '/')
13
+ extra_args = [
14
+ '--target=x86_64-pc-windows-gnu',
15
+ "--sysroot=#{sysroot}"
16
+ ]
17
+
18
+ existing = ENV['BINDGEN_EXTRA_CLANG_ARGS'].to_s.split(/\s+/)
19
+ ENV['BINDGEN_EXTRA_CLANG_ARGS'] = (existing + extra_args).uniq.join(' ')
20
+ end
21
+ end
22
+
23
+ default_profile = ENV.fetch('CARGO_PROFILE', 'release')
24
+
25
+ create_rust_makefile('html_to_markdown_rb') do |config|
26
+ config.profile = default_profile.to_sym
27
+ end
Binary file
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'html_to_markdown/cli_proxy'
4
+
5
+ module HtmlToMarkdown
6
+ module CLI
7
+ module_function
8
+
9
+ def run(argv = ARGV, stdout: $stdout, stderr: $stderr)
10
+ output = CLIProxy.call(argv)
11
+ stdout.print(output)
12
+ 0
13
+ rescue CLIProxy::CLIExecutionError => e
14
+ stderr.print(e.stderr)
15
+ e.status || 1
16
+ rescue CLIProxy::MissingBinaryError, CLIProxy::Error => e
17
+ stderr.puts(e.message)
18
+ 1
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,71 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'open3'
4
+ require 'pathname'
5
+
6
+ module HtmlToMarkdown
7
+ module CLIProxy
8
+ Error = Class.new(StandardError)
9
+ MissingBinaryError = Class.new(Error)
10
+
11
+ class CLIExecutionError < Error
12
+ attr_reader :stderr, :status
13
+
14
+ def initialize(message, stderr:, status:)
15
+ super(message)
16
+ @stderr = stderr
17
+ @status = status
18
+ end
19
+ end
20
+
21
+ module_function
22
+
23
+ def call(argv)
24
+ binary = find_cli_binary
25
+ args = Array(argv).map(&:to_s)
26
+ stdout, stderr, status = Open3.capture3(binary.to_s, *args)
27
+ return stdout if status.success?
28
+
29
+ raise CLIExecutionError.new(
30
+ "html-to-markdown CLI exited with status #{status.exitstatus}",
31
+ stderr: stderr,
32
+ status: status.exitstatus
33
+ )
34
+ end
35
+
36
+ def find_cli_binary
37
+ binary_name = Gem.win_platform? ? 'html-to-markdown.exe' : 'html-to-markdown'
38
+ found = search_paths(binary_name).find(&:file?)
39
+ return found if found
40
+
41
+ raise MissingBinaryError, missing_binary_message
42
+ end
43
+
44
+ def root_path
45
+ @root_path ||= Pathname(__dir__).join('../..').expand_path
46
+ end
47
+
48
+ def lib_path
49
+ @lib_path ||= Pathname(__dir__).join('..').expand_path
50
+ end
51
+
52
+ def search_paths(binary_name)
53
+ paths = [
54
+ root_path.join('target', 'release', binary_name),
55
+ lib_path.join('bin', binary_name),
56
+ lib_path.join(binary_name)
57
+ ]
58
+
59
+ workspace_root = root_path.parent&.parent
60
+ paths << workspace_root.join('target', 'release', binary_name) if workspace_root
61
+ paths
62
+ end
63
+
64
+ def missing_binary_message
65
+ <<~MSG.strip
66
+ html-to-markdown CLI binary not found. Build it with
67
+ `cargo build --release --package html-to-markdown-cli`.
68
+ MSG
69
+ end
70
+ end
71
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module HtmlToMarkdown
4
+ VERSION = '2.5.1'
5
+ end
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'html_to_markdown/version'
4
+ require 'html_to_markdown_rb'
5
+
6
+ module HtmlToMarkdown
7
+ autoload :CLI, 'html_to_markdown/cli'
8
+ autoload :CLIProxy, 'html_to_markdown/cli_proxy'
9
+
10
+ class << self
11
+ alias native_convert convert
12
+ alias native_convert_with_inline_images convert_with_inline_images
13
+ end
14
+
15
+ module_function
16
+
17
+ def convert(html, options = nil)
18
+ native_convert(html.to_s, options)
19
+ end
20
+
21
+ def convert_with_inline_images(html, options = nil, image_config = nil)
22
+ native_convert_with_inline_images(html.to_s, options, image_config)
23
+ end
24
+ end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+ require 'html_to_markdown/cli_proxy'
5
+ require 'html_to_markdown/cli'
6
+ require 'stringio'
7
+
8
+ RSpec.describe HtmlToMarkdown::CLIProxy do
9
+ describe '.call' do
10
+ it 'executes the CLI binary' do
11
+ begin
12
+ binary = described_class.find_cli_binary
13
+ rescue HtmlToMarkdown::CLIProxy::MissingBinaryError
14
+ skip 'CLI binary not built'
15
+ end
16
+
17
+ expect(binary).to be_file
18
+
19
+ output = described_class.call(['--version'])
20
+ expect(output).to include(HtmlToMarkdown::VERSION)
21
+ end
22
+ end
23
+
24
+ describe HtmlToMarkdown::CLI do
25
+ it 'writes CLI output to stdout' do
26
+ begin
27
+ HtmlToMarkdown::CLIProxy.find_cli_binary
28
+ rescue HtmlToMarkdown::CLIProxy::MissingBinaryError
29
+ skip 'CLI binary not built'
30
+ end
31
+
32
+ stdout = StringIO.new
33
+ stderr = StringIO.new
34
+
35
+ exit_code = described_class.run(['--version'], stdout: stdout, stderr: stderr)
36
+
37
+ expect(exit_code).to eq(0)
38
+ expect(stdout.string).to include(HtmlToMarkdown::VERSION)
39
+ expect(stderr.string).to be_empty
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+
5
+ RSpec.describe HtmlToMarkdown do
6
+ describe '.convert' do
7
+ it 'converts simple headings' do
8
+ expect(described_class.convert('<h1>Hello</h1>')).to eq("# Hello\n")
9
+ end
10
+
11
+ it 'accepts options hash' do
12
+ result = described_class.convert(
13
+ '<h1>Hello</h1>',
14
+ heading_style: :atx_closed,
15
+ default_title: true
16
+ )
17
+ expect(result).to include('Hello')
18
+ end
19
+ end
20
+
21
+ describe '.convert_with_inline_images' do
22
+ it 'returns inline images metadata' do
23
+ html = '<p><img src="" alt="fake"></p>'
24
+ extraction = described_class.convert_with_inline_images(html)
25
+ expect(extraction).to include(:markdown, :inline_images, :warnings)
26
+ expect(extraction[:inline_images].first[:description]).to eq('fake')
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'bundler/setup'
4
+ require 'html_to_markdown'
5
+
6
+ RSpec.configure do |config|
7
+ config.expect_with :rspec do |c|
8
+ c.syntax = :expect
9
+ end
10
+ end
data/src/lib.rs ADDED
@@ -0,0 +1,432 @@
1
+ use html_to_markdown_rs::{
2
+ convert as convert_inner, convert_with_inline_images as convert_with_inline_images_inner, error::ConversionError,
3
+ CodeBlockStyle, ConversionOptions, HeadingStyle, HighlightStyle, HtmlExtraction, InlineImage, InlineImageConfig,
4
+ InlineImageFormat, InlineImageSource, InlineImageWarning, ListIndentType, NewlineStyle, PreprocessingOptions,
5
+ PreprocessingPreset, WhitespaceMode,
6
+ };
7
+ use magnus::prelude::*;
8
+ use magnus::{function, scan_args::scan_args, Error, RArray, RHash, Ruby, Symbol, TryConvert, Value};
9
+
10
+ const DEFAULT_INLINE_IMAGE_LIMIT: u64 = 5 * 1024 * 1024;
11
+
12
+ fn conversion_error(err: ConversionError) -> Error {
13
+ match err {
14
+ ConversionError::ConfigError(msg) => arg_error(msg),
15
+ other => runtime_error(other.to_string()),
16
+ }
17
+ }
18
+
19
+ fn arg_error(message: impl Into<String>) -> Error {
20
+ let ruby = Ruby::get().expect("Ruby not initialised");
21
+ Error::new(ruby.exception_arg_error(), message.into())
22
+ }
23
+
24
+ fn runtime_error(message: impl Into<String>) -> Error {
25
+ let ruby = Ruby::get().expect("Ruby not initialised");
26
+ Error::new(ruby.exception_runtime_error(), message.into())
27
+ }
28
+
29
+ fn symbol_to_string(value: Value) -> Result<String, Error> {
30
+ if let Some(symbol) = Symbol::from_value(value) {
31
+ Ok(symbol.name()?.to_string())
32
+ } else {
33
+ String::try_convert(value)
34
+ }
35
+ }
36
+
37
+ fn get_kw(ruby: &Ruby, hash: RHash, name: &str) -> Option<Value> {
38
+ let sym = ruby.intern(name);
39
+ hash.get(sym).or_else(|| hash.get(name))
40
+ }
41
+
42
+ fn parse_heading_style(value: Value) -> Result<HeadingStyle, Error> {
43
+ match symbol_to_string(value)?.as_str() {
44
+ "underlined" => Ok(HeadingStyle::Underlined),
45
+ "atx" => Ok(HeadingStyle::Atx),
46
+ "atx_closed" => Ok(HeadingStyle::AtxClosed),
47
+ other => Err(arg_error(format!("invalid heading_style: {other}"))),
48
+ }
49
+ }
50
+
51
+ fn parse_list_indent_type(value: Value) -> Result<ListIndentType, Error> {
52
+ match symbol_to_string(value)?.as_str() {
53
+ "spaces" => Ok(ListIndentType::Spaces),
54
+ "tabs" => Ok(ListIndentType::Tabs),
55
+ other => Err(arg_error(format!("invalid list_indent_type: {other}"))),
56
+ }
57
+ }
58
+
59
+ fn parse_highlight_style(value: Value) -> Result<HighlightStyle, Error> {
60
+ match symbol_to_string(value)?.as_str() {
61
+ "double_equal" => Ok(HighlightStyle::DoubleEqual),
62
+ "html" => Ok(HighlightStyle::Html),
63
+ "bold" => Ok(HighlightStyle::Bold),
64
+ "none" => Ok(HighlightStyle::None),
65
+ other => Err(arg_error(format!("invalid highlight_style: {other}"))),
66
+ }
67
+ }
68
+
69
+ fn parse_whitespace_mode(value: Value) -> Result<WhitespaceMode, Error> {
70
+ match symbol_to_string(value)?.as_str() {
71
+ "normalized" => Ok(WhitespaceMode::Normalized),
72
+ "strict" => Ok(WhitespaceMode::Strict),
73
+ other => Err(arg_error(format!("invalid whitespace_mode: {other}"))),
74
+ }
75
+ }
76
+
77
+ fn parse_newline_style(value: Value) -> Result<NewlineStyle, Error> {
78
+ match symbol_to_string(value)?.as_str() {
79
+ "spaces" => Ok(NewlineStyle::Spaces),
80
+ "backslash" => Ok(NewlineStyle::Backslash),
81
+ other => Err(arg_error(format!("invalid newline_style: {other}"))),
82
+ }
83
+ }
84
+
85
+ fn parse_code_block_style(value: Value) -> Result<CodeBlockStyle, Error> {
86
+ match symbol_to_string(value)?.as_str() {
87
+ "indented" => Ok(CodeBlockStyle::Indented),
88
+ "backticks" => Ok(CodeBlockStyle::Backticks),
89
+ "tildes" => Ok(CodeBlockStyle::Tildes),
90
+ other => Err(arg_error(format!("invalid code_block_style: {other}"))),
91
+ }
92
+ }
93
+
94
+ fn parse_preset(value: Value) -> Result<PreprocessingPreset, Error> {
95
+ match symbol_to_string(value)?.as_str() {
96
+ "minimal" => Ok(PreprocessingPreset::Minimal),
97
+ "standard" => Ok(PreprocessingPreset::Standard),
98
+ "aggressive" => Ok(PreprocessingPreset::Aggressive),
99
+ other => Err(arg_error(format!("invalid preprocessing preset: {other}"))),
100
+ }
101
+ }
102
+
103
+ fn parse_vec_of_strings(value: Value) -> Result<Vec<String>, Error> {
104
+ let array = RArray::from_value(value).ok_or_else(|| arg_error("expected an Array of strings"))?;
105
+
106
+ array.to_vec::<String>()
107
+ }
108
+
109
+ fn parse_preprocessing_options(ruby: &Ruby, value: Value) -> Result<PreprocessingOptions, Error> {
110
+ let hash = RHash::from_value(value).ok_or_else(|| arg_error("expected preprocessing to be a Hash"))?;
111
+
112
+ let mut opts = PreprocessingOptions::default();
113
+
114
+ if let Some(enabled) = get_kw(ruby, hash, "enabled") {
115
+ opts.enabled = bool::try_convert(enabled)?;
116
+ }
117
+
118
+ if let Some(preset) = get_kw(ruby, hash, "preset") {
119
+ opts.preset = parse_preset(preset)?;
120
+ }
121
+
122
+ if let Some(remove_navigation) = get_kw(ruby, hash, "remove_navigation") {
123
+ opts.remove_navigation = bool::try_convert(remove_navigation)?;
124
+ }
125
+
126
+ if let Some(remove_forms) = get_kw(ruby, hash, "remove_forms") {
127
+ opts.remove_forms = bool::try_convert(remove_forms)?;
128
+ }
129
+
130
+ Ok(opts)
131
+ }
132
+
133
+ fn build_conversion_options(ruby: &Ruby, options: Option<Value>) -> Result<ConversionOptions, Error> {
134
+ let mut opts = ConversionOptions::default();
135
+
136
+ let Some(options) = options else {
137
+ return Ok(opts);
138
+ };
139
+
140
+ if options.is_nil() {
141
+ return Ok(opts);
142
+ }
143
+
144
+ let hash = RHash::from_value(options).ok_or_else(|| arg_error("options must be provided as a Hash"))?;
145
+
146
+ if let Some(heading_style) = get_kw(ruby, hash, "heading_style") {
147
+ opts.heading_style = parse_heading_style(heading_style)?;
148
+ }
149
+
150
+ if let Some(list_indent_type) = get_kw(ruby, hash, "list_indent_type") {
151
+ opts.list_indent_type = parse_list_indent_type(list_indent_type)?;
152
+ }
153
+
154
+ if let Some(list_indent_width) = get_kw(ruby, hash, "list_indent_width") {
155
+ opts.list_indent_width = usize::try_convert(list_indent_width)?;
156
+ }
157
+
158
+ if let Some(bullets) = get_kw(ruby, hash, "bullets") {
159
+ opts.bullets = String::try_convert(bullets)?;
160
+ }
161
+
162
+ if let Some(strong_em_symbol) = get_kw(ruby, hash, "strong_em_symbol") {
163
+ let value = String::try_convert(strong_em_symbol)?;
164
+ let mut chars = value.chars();
165
+ let ch = chars
166
+ .next()
167
+ .ok_or_else(|| arg_error("strong_em_symbol must not be empty"))?;
168
+ if chars.next().is_some() {
169
+ return Err(arg_error("strong_em_symbol must be a single character"));
170
+ }
171
+ opts.strong_em_symbol = ch;
172
+ }
173
+
174
+ if let Some(escape_asterisks) = get_kw(ruby, hash, "escape_asterisks") {
175
+ opts.escape_asterisks = bool::try_convert(escape_asterisks)?;
176
+ }
177
+
178
+ if let Some(escape_underscores) = get_kw(ruby, hash, "escape_underscores") {
179
+ opts.escape_underscores = bool::try_convert(escape_underscores)?;
180
+ }
181
+
182
+ if let Some(escape_misc) = get_kw(ruby, hash, "escape_misc") {
183
+ opts.escape_misc = bool::try_convert(escape_misc)?;
184
+ }
185
+
186
+ if let Some(escape_ascii) = get_kw(ruby, hash, "escape_ascii") {
187
+ opts.escape_ascii = bool::try_convert(escape_ascii)?;
188
+ }
189
+
190
+ if let Some(code_language) = get_kw(ruby, hash, "code_language") {
191
+ opts.code_language = String::try_convert(code_language)?;
192
+ }
193
+
194
+ if let Some(autolinks) = get_kw(ruby, hash, "autolinks") {
195
+ opts.autolinks = bool::try_convert(autolinks)?;
196
+ }
197
+
198
+ if let Some(default_title) = get_kw(ruby, hash, "default_title") {
199
+ opts.default_title = bool::try_convert(default_title)?;
200
+ }
201
+
202
+ if let Some(br_in_tables) = get_kw(ruby, hash, "br_in_tables") {
203
+ opts.br_in_tables = bool::try_convert(br_in_tables)?;
204
+ }
205
+
206
+ if let Some(hocr_spatial_tables) = get_kw(ruby, hash, "hocr_spatial_tables") {
207
+ opts.hocr_spatial_tables = bool::try_convert(hocr_spatial_tables)?;
208
+ }
209
+
210
+ if let Some(highlight_style) = get_kw(ruby, hash, "highlight_style") {
211
+ opts.highlight_style = parse_highlight_style(highlight_style)?;
212
+ }
213
+
214
+ if let Some(extract_metadata) = get_kw(ruby, hash, "extract_metadata") {
215
+ opts.extract_metadata = bool::try_convert(extract_metadata)?;
216
+ }
217
+
218
+ if let Some(whitespace_mode) = get_kw(ruby, hash, "whitespace_mode") {
219
+ opts.whitespace_mode = parse_whitespace_mode(whitespace_mode)?;
220
+ }
221
+
222
+ if let Some(strip_newlines) = get_kw(ruby, hash, "strip_newlines") {
223
+ opts.strip_newlines = bool::try_convert(strip_newlines)?;
224
+ }
225
+
226
+ if let Some(wrap) = get_kw(ruby, hash, "wrap") {
227
+ opts.wrap = bool::try_convert(wrap)?;
228
+ }
229
+
230
+ if let Some(wrap_width) = get_kw(ruby, hash, "wrap_width") {
231
+ opts.wrap_width = usize::try_convert(wrap_width)?;
232
+ }
233
+
234
+ if let Some(convert_as_inline) = get_kw(ruby, hash, "convert_as_inline") {
235
+ opts.convert_as_inline = bool::try_convert(convert_as_inline)?;
236
+ }
237
+
238
+ if let Some(sub_symbol) = get_kw(ruby, hash, "sub_symbol") {
239
+ opts.sub_symbol = String::try_convert(sub_symbol)?;
240
+ }
241
+
242
+ if let Some(sup_symbol) = get_kw(ruby, hash, "sup_symbol") {
243
+ opts.sup_symbol = String::try_convert(sup_symbol)?;
244
+ }
245
+
246
+ if let Some(newline_style) = get_kw(ruby, hash, "newline_style") {
247
+ opts.newline_style = parse_newline_style(newline_style)?;
248
+ }
249
+
250
+ if let Some(code_block_style) = get_kw(ruby, hash, "code_block_style") {
251
+ opts.code_block_style = parse_code_block_style(code_block_style)?;
252
+ }
253
+
254
+ if let Some(keep_inline_images_in) = get_kw(ruby, hash, "keep_inline_images_in") {
255
+ opts.keep_inline_images_in = parse_vec_of_strings(keep_inline_images_in)?;
256
+ }
257
+
258
+ if let Some(preprocessing) = get_kw(ruby, hash, "preprocessing") {
259
+ opts.preprocessing = parse_preprocessing_options(ruby, preprocessing)?;
260
+ }
261
+
262
+ if let Some(encoding) = get_kw(ruby, hash, "encoding") {
263
+ opts.encoding = String::try_convert(encoding)?;
264
+ }
265
+
266
+ if let Some(debug) = get_kw(ruby, hash, "debug") {
267
+ opts.debug = bool::try_convert(debug)?;
268
+ }
269
+
270
+ if let Some(strip_tags) = get_kw(ruby, hash, "strip_tags") {
271
+ opts.strip_tags = parse_vec_of_strings(strip_tags)?;
272
+ }
273
+
274
+ if let Some(preserve_tags) = get_kw(ruby, hash, "preserve_tags") {
275
+ opts.preserve_tags = parse_vec_of_strings(preserve_tags)?;
276
+ }
277
+
278
+ Ok(opts)
279
+ }
280
+
281
+ fn build_inline_image_config(ruby: &Ruby, config: Option<Value>) -> Result<InlineImageConfig, Error> {
282
+ let mut cfg = InlineImageConfig::new(DEFAULT_INLINE_IMAGE_LIMIT);
283
+
284
+ let Some(config) = config else {
285
+ return Ok(cfg);
286
+ };
287
+
288
+ if config.is_nil() {
289
+ return Ok(cfg);
290
+ }
291
+
292
+ let hash = RHash::from_value(config).ok_or_else(|| arg_error("inline image config must be provided as a Hash"))?;
293
+
294
+ if let Some(limit) = get_kw(ruby, hash, "max_decoded_size_bytes") {
295
+ cfg.max_decoded_size_bytes = u64::try_convert(limit)?;
296
+ }
297
+
298
+ if let Some(prefix) = get_kw(ruby, hash, "filename_prefix") {
299
+ cfg.filename_prefix = if prefix.is_nil() {
300
+ None
301
+ } else {
302
+ Some(String::try_convert(prefix)?)
303
+ };
304
+ }
305
+
306
+ if let Some(capture_svg) = get_kw(ruby, hash, "capture_svg") {
307
+ cfg.capture_svg = bool::try_convert(capture_svg)?;
308
+ }
309
+
310
+ if let Some(infer_dimensions) = get_kw(ruby, hash, "infer_dimensions") {
311
+ cfg.infer_dimensions = bool::try_convert(infer_dimensions)?;
312
+ }
313
+
314
+ Ok(cfg)
315
+ }
316
+
317
+ fn inline_image_to_value(ruby: &Ruby, image: InlineImage) -> Result<Value, Error> {
318
+ let InlineImage {
319
+ data,
320
+ format,
321
+ filename,
322
+ description,
323
+ dimensions,
324
+ source,
325
+ attributes,
326
+ } = image;
327
+
328
+ let hash = ruby.hash_new();
329
+ let data_value = ruby.str_from_slice(&data);
330
+ hash.aset(ruby.intern("data"), data_value)?;
331
+
332
+ let format_value = match format {
333
+ InlineImageFormat::Png => "png".to_string(),
334
+ InlineImageFormat::Jpeg => "jpeg".to_string(),
335
+ InlineImageFormat::Gif => "gif".to_string(),
336
+ InlineImageFormat::Bmp => "bmp".to_string(),
337
+ InlineImageFormat::Webp => "webp".to_string(),
338
+ InlineImageFormat::Svg => "svg".to_string(),
339
+ InlineImageFormat::Other(other) => other,
340
+ };
341
+ hash.aset(ruby.intern("format"), format_value)?;
342
+
343
+ match filename {
344
+ Some(name) => hash.aset(ruby.intern("filename"), name)?,
345
+ None => hash.aset(ruby.intern("filename"), ruby.qnil())?,
346
+ }
347
+
348
+ match description {
349
+ Some(desc) => hash.aset(ruby.intern("description"), desc)?,
350
+ None => hash.aset(ruby.intern("description"), ruby.qnil())?,
351
+ }
352
+
353
+ if let Some((width, height)) = dimensions {
354
+ let dims = ruby.ary_new();
355
+ dims.push(width as i64)?;
356
+ dims.push(height as i64)?;
357
+ hash.aset(ruby.intern("dimensions"), dims)?;
358
+ } else {
359
+ hash.aset(ruby.intern("dimensions"), ruby.qnil())?;
360
+ }
361
+
362
+ let source_value = match source {
363
+ InlineImageSource::ImgDataUri => "img_data_uri",
364
+ InlineImageSource::SvgElement => "svg_element",
365
+ };
366
+ hash.aset(ruby.intern("source"), source_value)?;
367
+
368
+ let attrs = ruby.hash_new();
369
+ for (key, value) in attributes {
370
+ attrs.aset(key, value)?;
371
+ }
372
+ hash.aset(ruby.intern("attributes"), attrs)?;
373
+
374
+ Ok(hash.as_value())
375
+ }
376
+
377
+ fn warning_to_value(ruby: &Ruby, warning: InlineImageWarning) -> Result<Value, Error> {
378
+ let hash = ruby.hash_new();
379
+ hash.aset(ruby.intern("index"), warning.index as i64)?;
380
+ hash.aset(ruby.intern("message"), warning.message)?;
381
+ Ok(hash.as_value())
382
+ }
383
+
384
+ fn extraction_to_value(ruby: &Ruby, extraction: HtmlExtraction) -> Result<Value, Error> {
385
+ let hash = ruby.hash_new();
386
+ hash.aset(ruby.intern("markdown"), extraction.markdown)?;
387
+
388
+ let inline_images = ruby.ary_new();
389
+ for image in extraction.inline_images {
390
+ inline_images.push(inline_image_to_value(ruby, image)?)?;
391
+ }
392
+ hash.aset(ruby.intern("inline_images"), inline_images)?;
393
+
394
+ let warnings = ruby.ary_new();
395
+ for warning in extraction.warnings {
396
+ warnings.push(warning_to_value(ruby, warning)?)?;
397
+ }
398
+ hash.aset(ruby.intern("warnings"), warnings)?;
399
+
400
+ Ok(hash.as_value())
401
+ }
402
+
403
+ fn convert_fn(ruby: &Ruby, args: &[Value]) -> Result<String, Error> {
404
+ let parsed = scan_args::<(String,), (Option<Value>,), (), (), (), ()>(args)?;
405
+ let html = parsed.required.0;
406
+ let options = build_conversion_options(ruby, parsed.optional.0)?;
407
+
408
+ convert_inner(&html, Some(options)).map_err(conversion_error)
409
+ }
410
+
411
+ fn convert_with_inline_images_fn(ruby: &Ruby, args: &[Value]) -> Result<Value, Error> {
412
+ let parsed = scan_args::<(String,), (Option<Value>, Option<Value>), (), (), (), ()>(args)?;
413
+ let html = parsed.required.0;
414
+ let options = build_conversion_options(ruby, parsed.optional.0)?;
415
+ let config = build_inline_image_config(ruby, parsed.optional.1)?;
416
+
417
+ let extraction = convert_with_inline_images_inner(&html, Some(options), config).map_err(conversion_error)?;
418
+
419
+ extraction_to_value(ruby, extraction)
420
+ }
421
+
422
+ #[magnus::init]
423
+ fn init(ruby: &Ruby) -> Result<(), Error> {
424
+ let module = ruby.define_module("HtmlToMarkdown")?;
425
+ module.define_singleton_method("convert", function!(convert_fn, -1))?;
426
+ module.define_singleton_method(
427
+ "convert_with_inline_images",
428
+ function!(convert_with_inline_images_fn, -1),
429
+ )?;
430
+
431
+ Ok(())
432
+ }
metadata ADDED
@@ -0,0 +1,82 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: html-to-markdown
3
+ version: !ruby/object:Gem::Version
4
+ version: 2.5.1
5
+ platform: ruby
6
+ authors:
7
+ - Na'aman Hirschfeld
8
+ bindir: exe
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: rb_sys
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ">="
17
+ - !ruby/object:Gem::Version
18
+ version: '0.9'
19
+ - - "<"
20
+ - !ruby/object:Gem::Version
21
+ version: '1.0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ version: '0.9'
29
+ - - "<"
30
+ - !ruby/object:Gem::Version
31
+ version: '1.0'
32
+ description: High-performance HTML to Markdown conversion from Ruby using Magnus and
33
+ rb-sys.
34
+ email:
35
+ - nhirschfeld@gmail.com
36
+ executables:
37
+ - html-to-markdown
38
+ extensions:
39
+ - extconf.rb
40
+ extra_rdoc_files: []
41
+ files:
42
+ - Cargo.toml
43
+ - README.md
44
+ - exe/html-to-markdown
45
+ - extconf.rb
46
+ - lib/bin/html-to-markdown
47
+ - lib/html_to_markdown.rb
48
+ - lib/html_to_markdown/cli.rb
49
+ - lib/html_to_markdown/cli_proxy.rb
50
+ - lib/html_to_markdown/version.rb
51
+ - spec/cli_proxy_spec.rb
52
+ - spec/convert_spec.rb
53
+ - spec/spec_helper.rb
54
+ - src/lib.rs
55
+ homepage: https://github.com/Goldziher/html-to-markdown
56
+ licenses:
57
+ - MIT
58
+ metadata:
59
+ rubygems_mfa_required: 'true'
60
+ homepage_uri: https://github.com/Goldziher/html-to-markdown
61
+ source_code_uri: https://github.com/Goldziher/html-to-markdown
62
+ bug_tracker_uri: https://github.com/Goldziher/html-to-markdown/issues
63
+ changelog_uri: https://github.com/Goldziher/html-to-markdown/releases
64
+ documentation_uri: https://github.com/Goldziher/html-to-markdown/blob/main/README.md
65
+ rdoc_options: []
66
+ require_paths:
67
+ - lib
68
+ required_ruby_version: !ruby/object:Gem::Requirement
69
+ requirements:
70
+ - - ">="
71
+ - !ruby/object:Gem::Version
72
+ version: '3.2'
73
+ required_rubygems_version: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - ">="
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ requirements: []
79
+ rubygems_version: 3.7.2
80
+ specification_version: 4
81
+ summary: Ruby bindings for the html-to-markdown Rust library
82
+ test_files: []