html-to-markdown 2.5.5 → 2.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: da35a384f2b4f769a94313ea7611f281e2d85f1c01b8862f28ee721cd6fcdd22
4
- data.tar.gz: e956bce4c519a62feb137aa6448b514a6099e75ee978d247846a31c35a466ce0
3
+ metadata.gz: 71fe440f4811019fdf9c1d88b72b092c97d8223dab27a83f6b23e2675d98317a
4
+ data.tar.gz: 1bbe81a26da2c50e16e5207e48bda25e309afde2f3e22600889b85855624de2c
5
5
  SHA512:
6
- metadata.gz: b84f0b4305bd792948ee2924190143cd2ec0161d84ab0853b50039e214bbff19c6806718c12cb51a086e39557cb70e1650ef2baf4840419ced2e659e7575a28f
7
- data.tar.gz: 636fc6ce5662a33ad62f2526a5b6105d755db0a57c6b99ed7c4c519ed0f6f44611be4e51932ccab868e30a6b89b87071678606f454e18d1f17b235546b7c91ee
6
+ metadata.gz: 49c83e6230f9df3a96088e7f0e4eeb885da4b9601ac7bad407675a1393d228eb4a9357ee5a036e328dd13d1b28aa3d74a50e216030acd6a359296ca5a16888d3
7
+ data.tar.gz: ee85846cd5a92e271719632edba3c2415e2263803a4bd74948e6bae8acd93176c6a4daec380d5423ce4670472219a5ed06bb61f047b47a53bf4778e85b9ed459
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- html-to-markdown (2.5.5)
4
+ html-to-markdown (2.6.0)
5
5
  rb_sys (>= 0.9, < 1.0)
6
6
 
7
7
  GEM
@@ -19,7 +19,7 @@ GEM
19
19
  prism (1.6.0)
20
20
  racc (1.8.1)
21
21
  rainbow (3.1.1)
22
- rake (13.3.0)
22
+ rake (13.3.1)
23
23
  rake-compiler (1.3.0)
24
24
  rake
25
25
  rake-compiler-dock (1.9.1)
@@ -35,11 +35,11 @@ GEM
35
35
  rspec-expectations (3.13.5)
36
36
  diff-lcs (>= 1.2.0, < 2.0)
37
37
  rspec-support (~> 3.13.0)
38
- rspec-mocks (3.13.6)
38
+ rspec-mocks (3.13.7)
39
39
  diff-lcs (>= 1.2.0, < 2.0)
40
40
  rspec-support (~> 3.13.0)
41
41
  rspec-support (3.13.6)
42
- rubocop (1.81.6)
42
+ rubocop (1.81.7)
43
43
  json (~> 2.3)
44
44
  language_server-protocol (~> 3.17.0.2)
45
45
  lint_roller (~> 1.1.0)
data/README.md CHANGED
@@ -3,9 +3,12 @@
3
3
  Blazing-fast HTML → Markdown conversion for Ruby, powered by the same Rust engine used by our Python, Node.js, and WebAssembly packages. Ship identical Markdown across every runtime while enjoying native extension performance.
4
4
 
5
5
  [![Crates.io](https://img.shields.io/crates/v/html-to-markdown-rs.svg)](https://crates.io/crates/html-to-markdown-rs)
6
- [![npm version](https://badge.fury.io/js/html-to-markdown-node.svg)](https://www.npmjs.com/package/html-to-markdown-node)
7
- [![PyPI version](https://badge.fury.io/py/html-to-markdown.svg)](https://pypi.org/project/html-to-markdown/)
8
- [![Gem Version](https://badge.fury.io/rb/html-to-markdown.svg)](https://rubygems.org/gems/html-to-markdown)
6
+ [![npm (node)](https://badge.fury.io/js/html-to-markdown-node.svg)](https://www.npmjs.com/package/html-to-markdown-node)
7
+ [![npm (wasm)](https://badge.fury.io/js/html-to-markdown-wasm.svg)](https://www.npmjs.com/package/html-to-markdown-wasm)
8
+ [![npm (typescript)](https://badge.fury.io/js/html-to-markdown.svg)](https://www.npmjs.com/package/html-to-markdown)
9
+ [![PyPI](https://badge.fury.io/py/html-to-markdown.svg)](https://pypi.org/project/html-to-markdown/)
10
+ [![Packagist](https://img.shields.io/packagist/v/goldziher/html-to-markdown.svg)](https://packagist.org/packages/goldziher/html-to-markdown)
11
+ [![RubyGems](https://badge.fury.io/rb/html-to-markdown.svg)](https://rubygems.org/gems/html-to-markdown)
9
12
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/Goldziher/html-to-markdown/blob/main/LICENSE)
10
13
 
11
14
  ## Features
@@ -17,6 +20,13 @@ Blazing-fast HTML → Markdown conversion for Ruby, powered by the same Rust eng
17
20
  - 🧰 **Bundled CLI proxy**: Call the Rust CLI straight from Ruby or shell scripts.
18
21
  - 🛠️ **First-class Rails support**: Works with `Gem.win_platform?` builds, supports Trusted Publishing, and compiles on install if no native gem matches.
19
22
 
23
+ ## Documentation & Support
24
+
25
+ - [GitHub repository](https://github.com/Goldziher/html-to-markdown)
26
+ - [Issue tracker](https://github.com/Goldziher/html-to-markdown/issues)
27
+ - [Changelog](https://github.com/Goldziher/html-to-markdown/blob/main/CHANGELOG.md)
28
+ - [Live demo (WASM)](https://goldziher.github.io/html-to-markdown/)
29
+
20
30
  ## Installation
21
31
 
22
32
  ```bash
@@ -156,7 +166,7 @@ You can also call the CLI binary directly for scripting:
156
166
 
157
167
  ```ruby
158
168
  HtmlToMarkdown::CLIProxy.call(['--version'])
159
- # => "html-to-markdown 2.5.5"
169
+ # => "html-to-markdown 2.5.7"
160
170
  ```
161
171
 
162
172
  Rebuild the CLI locally if you see `CLI binary not built` during tests:
data/Rakefile CHANGED
@@ -8,6 +8,7 @@ GEMSPEC = Gem::Specification.load(File.expand_path('html-to-markdown-rb.gemspec'
8
8
 
9
9
  RbSys::ExtensionTask.new('html-to-markdown-rb', GEMSPEC) do |ext|
10
10
  ext.lib_dir = 'lib'
11
+ ext.ext_dir = 'ext/html-to-markdown-rb'
11
12
  ext.cross_compile = true
12
13
  ext.cross_platform = %w[
13
14
  x86_64-linux
@@ -24,4 +24,5 @@ default_profile = ENV.fetch('CARGO_PROFILE', 'release')
24
24
 
25
25
  create_rust_makefile('html_to_markdown_rb') do |config|
26
26
  config.profile = default_profile.to_sym
27
+ config.ext_dir = '../../../../crates/html-to-markdown-rb'
27
28
  end
@@ -2,8 +2,27 @@
2
2
 
3
3
  require_relative 'lib/html_to_markdown/version'
4
4
 
5
- readme_path = File.expand_path('README.md', __dir__)
6
- readme_body = File.read(readme_path, encoding: 'UTF-8')
5
+ repo_root = File.expand_path('../..', __dir__)
6
+ crate_prefix = 'packages/ruby/'
7
+ git_cmd = %(git -C "#{repo_root}" ls-files -z #{crate_prefix})
8
+ git_files =
9
+ `#{git_cmd}`.split("\x0")
10
+ .select { |path| path.start_with?(crate_prefix) }
11
+ .map { |path| path.delete_prefix(crate_prefix) }
12
+ fallback_files = Dir.chdir(__dir__) do
13
+ Dir.glob(
14
+ %w[
15
+ README.md
16
+ ext/html-to-markdown-rb/extconf.rb
17
+ exe/*
18
+ lib/**/*.rb
19
+ lib/bin/*
20
+ src/**/*.rs
21
+ spec/**/*.rb
22
+ ]
23
+ )
24
+ end
25
+ files = git_files.empty? ? fallback_files : git_files
7
26
 
8
27
  Gem::Specification.new do |spec|
9
28
  spec.name = 'html-to-markdown'
@@ -12,7 +31,10 @@ Gem::Specification.new do |spec|
12
31
  spec.email = ['nhirschfeld@gmail.com']
13
32
 
14
33
  spec.summary = 'Blazing-fast HTML to Markdown conversion for Ruby, powered by Rust.'
15
- spec.description = readme_body
34
+ spec.description = <<~DESC.strip
35
+ html-to-markdown is a native Ruby extension built on the shared Rust engine that powers the html-to-markdown project.
36
+ It delivers identical HTML-to-Markdown output across languages, exposes inline image extraction, and ships with a CLI for automation workflows.
37
+ DESC
16
38
  spec.homepage = 'https://github.com/Goldziher/html-to-markdown'
17
39
  spec.license = 'MIT'
18
40
 
@@ -22,10 +44,10 @@ Gem::Specification.new do |spec|
22
44
  spec.executables = ['html-to-markdown']
23
45
  spec.require_paths = ['lib']
24
46
 
25
- spec.files = `git ls-files -z`.split("\x0")
47
+ spec.files = files
26
48
  spec.extra_rdoc_files = ['README.md']
27
49
 
28
- spec.extensions = ['extconf.rb']
50
+ spec.extensions = ['ext/html-to-markdown-rb/extconf.rb']
29
51
 
30
52
  spec.add_dependency 'rb_sys', '>= 0.9', '< 1.0'
31
53
  spec.metadata['rubygems_mfa_required'] = 'true'
@@ -33,5 +55,5 @@ Gem::Specification.new do |spec|
33
55
  spec.metadata['source_code_uri'] = 'https://github.com/Goldziher/html-to-markdown'
34
56
  spec.metadata['bug_tracker_uri'] = 'https://github.com/Goldziher/html-to-markdown/issues'
35
57
  spec.metadata['changelog_uri'] = 'https://github.com/Goldziher/html-to-markdown/releases'
36
- spec.metadata['documentation_uri'] = 'https://github.com/Goldziher/html-to-markdown/blob/main/README.md'
58
+ spec.metadata['documentation_uri'] = 'https://github.com/Goldziher/html-to-markdown/blob/main/packages/ruby/README.md'
37
59
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HtmlToMarkdown
4
- VERSION = '2.5.5'
4
+ VERSION = '2.6.0'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html-to-markdown
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.5.5
4
+ version: 2.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-10-29 00:00:00.000000000 Z
11
+ date: 2025-11-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -30,96 +30,25 @@ dependencies:
30
30
  - - "<"
31
31
  - !ruby/object:Gem::Version
32
32
  version: '1.0'
33
- description: "# html-to-markdown-rb\n\nBlazing-fast HTML → Markdown conversion for
34
- Ruby, powered by the same Rust engine used by our Python, Node.js, and WebAssembly
35
- packages. Ship identical Markdown across every runtime while enjoying native extension
36
- performance.\n\n[![Crates.io](https://img.shields.io/crates/v/html-to-markdown-rs.svg)](https://crates.io/crates/html-to-markdown-rs)\n[![npm
37
- version](https://badge.fury.io/js/html-to-markdown-node.svg)](https://www.npmjs.com/package/html-to-markdown-node)\n[![PyPI
38
- version](https://badge.fury.io/py/html-to-markdown.svg)](https://pypi.org/project/html-to-markdown/)\n[![Gem
39
- Version](https://badge.fury.io/rb/html-to-markdown.svg)](https://rubygems.org/gems/html-to-markdown)\n[![License:
40
- MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/Goldziher/html-to-markdown/blob/main/LICENSE)\n\n##
41
- Features\n\n- ⚡ **Rust-fast**: Ruby bindings around a highly optimised Rust core
42
- (60‑80× faster than BeautifulSoup-based converters).\n- \U0001F501 **Identical output**:
43
- Shares logic with the Python wheels, npm bindings, WASM package, and CLI — consistent
44
- Markdown everywhere.\n- ⚙️ **Rich configuration**: Control heading styles, list
45
- indentation, whitespace handling, HTML preprocessing, and more.\n- \U0001F5BC️ **Inline
46
- image extraction**: Pull out embedded images (PNG/JPEG/SVG/data URIs) alongside
47
- Markdown.\n- \U0001F9F0 **Bundled CLI proxy**: Call the Rust CLI straight from Ruby
48
- or shell scripts.\n- \U0001F6E0️ **First-class Rails support**: Works with `Gem.win_platform?`
49
- builds, supports Trusted Publishing, and compiles on install if no native gem matches.\n\n##
50
- Installation\n\n```bash\nbundle add html-to-markdown\n# or\ngem install html-to-markdown\n```\n\nAdd
51
- the gem to your project and Bundler will compile the native Rust extension on first
52
- install.\n\n### Requirements\n\n- Ruby **3.2+** (Magnus relies on the fiber scheduler
53
- APIs added in 3.2)\n- Rust toolchain **1.85+** with Cargo available on your `$PATH`\n-
54
- Ruby development headers (`ruby-dev`, `ruby-devel`, or the platform equivalent)\n\n**Windows**:
55
- install [RubyInstaller with MSYS2](https://rubyinstaller.org/) (UCRT64). Run once:\n\n```powershell\nridk
56
- exec pacman -S --needed --noconfirm base-devel mingw-w64-ucrt-x86_64-toolchain\n```\n\nThis
57
- provides the standard headers (including `strings.h`) required for the bindgen step.\n\n##
58
- Performance Snapshot\n\nApple M4 • Real Wikipedia documents • `HtmlToMarkdown.convert`
59
- (Ruby)\n\n| Document | Size | Latency | Throughput | Docs/sec |\n| -------------------
60
- | ----- | ------- | ---------- | -------- |\n| Lists (Timeline) | 129KB | 0.69ms
61
- \ | 187 MB/s | 1,450 |\n| Tables (Countries) | 360KB | 2.19ms | 164 MB/s
62
- \ | 456 |\n| Mixed (Python wiki) | 656KB | 4.88ms | 134 MB/s | 205 |\n\n>
63
- Same core, same benchmarks: the Ruby extension stays within single-digit % of the
64
- Rust CLI and mirrors the Python/Node numbers.\n\n## Quick Start\n\n```ruby\nrequire
65
- 'html_to_markdown'\n\nhtml = <<~HTML\n <h1>Welcome</h1>\n <p>This is <strong>Rust-fast</strong>
66
- conversion!</p>\n <ul>\n <li>Native extension</li>\n <li>Identical output
67
- across languages</li>\n </ul>\nHTML\n\nmarkdown = HtmlToMarkdown.convert(html)\nputs
68
- markdown\n# # Welcome\n#\n# This is **Rust-fast** conversion!\n#\n# - Native extension\n#
69
- - Identical output across languages\n```\n\n## API\n\n### Conversion Options\n\nPass
70
- a Ruby hash (string or symbol keys) to tweak rendering. Every option maps one-for-one
71
- with the Rust/Python/Node APIs.\n\n```ruby\nrequire 'html_to_markdown'\n\nmarkdown
72
- = HtmlToMarkdown.convert(\n '<pre><code class=\"language-ruby\">puts \"hi\"</code></pre>',\n
73
- \ heading_style: :atx,\n code_block_style: :fenced,\n bullets: '*+-',\n list_indent_type:
74
- :spaces,\n list_indent_width: 2,\n whitespace_mode: :normalized,\n highlight_style:
75
- :double_equal\n)\n\nputs markdown\n```\n\n### HTML Preprocessing\n\nClean up scraped
76
- HTML (navigation, forms, malformed markup) before conversion:\n\n```ruby\nrequire
77
- 'html_to_markdown'\n\nmarkdown = HtmlToMarkdown.convert(\n html,\n preprocessing:
78
- {\n enabled: true,\n preset: :aggressive, # :minimal, :standard, :aggressive\n
79
- \ remove_navigation: true,\n remove_forms: true\n }\n)\n```\n\n### Inline
80
- Images\n\nExtract inline binary data (data URIs, SVG) together with the converted
81
- Markdown.\n\n```ruby\nrequire 'html_to_markdown'\n\nresult = HtmlToMarkdown.convert_with_inline_images(\n
82
- \ '<img src=\"...\" alt=\"Pixel\">',\n image_config:
83
- {\n max_decoded_size_bytes: 1 * 1024 * 1024,\n infer_dimensions: true,\n filename_prefix:
84
- 'img_',\n capture_svg: true\n }\n)\n\nputs result.markdown\nresult.inline_images.each
85
- do |img|\n puts \"#{img.filename} -> #{img.format} (#{img.data.bytesize} bytes)\"\nend\n```\n\n##
86
- CLI\n\nThe gem bundles a small proxy for the Rust CLI binary. Use it when you need
87
- parity with the standalone `html-to-markdown` executable.\n\n```ruby\nrequire 'html_to_markdown/cli'\n\nHtmlToMarkdown::CLI.run(%w[--heading-style
88
- atx input.html], stdout: $stdout)\n# => writes converted Markdown to STDOUT\n```\n\nYou
89
- can also call the CLI binary directly for scripting:\n\n```ruby\nHtmlToMarkdown::CLIProxy.call(['--version'])\n#
90
- => \"html-to-markdown 2.5.5\"\n```\n\nRebuild the CLI locally if you see `CLI binary
91
- not built` during tests:\n\n```bash\nbundle exec rake compile # builds
92
- the extension\nbundle exec ruby scripts/prepare_ruby_gem.rb # copies the CLI into
93
- lib/bin/\n```\n\n## Error Handling\n\nConversion errors raise `HtmlToMarkdown::Error`
94
- (wrapping the Rust error context). CLI invocations use specialised subclasses:\n\n-
95
- `HtmlToMarkdown::CLIProxy::MissingBinaryError`\n- `HtmlToMarkdown::CLIProxy::CLIExecutionError`\n\nRescue
96
- them to provide clearer feedback in your application.\n\n## Consistent Across Languages\n\nThe
97
- Ruby gem shares the exact Rust core with:\n\n- [Python wheels](https://pypi.org/project/html-to-markdown/)\n-
98
- [Node.js / Bun bindings](https://www.npmjs.com/package/html-to-markdown-node)\n-
99
- [WebAssembly package](https://www.npmjs.com/package/html-to-markdown-wasm)\n- The
100
- Rust crate and CLI\n\nUse whichever runtime fits your stack while keeping formatting
101
- behaviour identical.\n\n## Development\n\n```bash\nbundle exec rake compile #
102
- build the native extension\nbundle exec rspec # run test suite\n```\n\nThe
103
- extension uses [Magnus](https://github.com/matsadler/magnus) plus `rb-sys` for bindgen.
104
- When editing the Rust code under `src/`, rerun `rake compile`.\n\n## License\n\nMIT
105
- © Na'aman Hirschfeld\n"
33
+ description: |-
34
+ html-to-markdown is a native Ruby extension built on the shared Rust engine that powers the html-to-markdown project.
35
+ It delivers identical HTML-to-Markdown output across languages, exposes inline image extraction, and ships with a CLI for automation workflows.
106
36
  email:
107
37
  - nhirschfeld@gmail.com
108
38
  executables:
109
39
  - html-to-markdown
110
40
  extensions:
111
- - extconf.rb
41
+ - ext/html-to-markdown-rb/extconf.rb
112
42
  extra_rdoc_files:
113
43
  - README.md
114
44
  files:
115
45
  - ".rubocop.yml"
116
- - Cargo.toml
117
46
  - Gemfile
118
47
  - Gemfile.lock
119
48
  - README.md
120
49
  - Rakefile
121
50
  - exe/html-to-markdown
122
- - extconf.rb
51
+ - ext/html-to-markdown-rb/extconf.rb
123
52
  - html-to-markdown-rb.gemspec
124
53
  - lib/html_to_markdown.rb
125
54
  - lib/html_to_markdown/cli.rb
@@ -128,7 +57,6 @@ files:
128
57
  - spec/cli_proxy_spec.rb
129
58
  - spec/convert_spec.rb
130
59
  - spec/spec_helper.rb
131
- - src/lib.rs
132
60
  homepage: https://github.com/Goldziher/html-to-markdown
133
61
  licenses:
134
62
  - MIT
@@ -138,7 +66,7 @@ metadata:
138
66
  source_code_uri: https://github.com/Goldziher/html-to-markdown
139
67
  bug_tracker_uri: https://github.com/Goldziher/html-to-markdown/issues
140
68
  changelog_uri: https://github.com/Goldziher/html-to-markdown/releases
141
- documentation_uri: https://github.com/Goldziher/html-to-markdown/blob/main/README.md
69
+ documentation_uri: https://github.com/Goldziher/html-to-markdown/blob/main/packages/ruby/README.md
142
70
  post_install_message:
143
71
  rdoc_options: []
144
72
  require_paths:
data/Cargo.toml DELETED
@@ -1,28 +0,0 @@
1
- [package]
2
- name = "html-to-markdown-rb"
3
- version = "2.5.5"
4
- edition = "2021"
5
- authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
6
- license = "MIT"
7
- repository = "https://github.com/Goldziher/html-to-markdown"
8
- homepage = "https://github.com/Goldziher/html-to-markdown"
9
- documentation = "https://docs.rs/html-to-markdown-rs"
10
- readme = "README.md"
11
- rust-version = "1.80"
12
- description = "Ruby bindings (Magnus) for html-to-markdown - high-performance HTML to Markdown converter"
13
- keywords = ["html", "markdown", "ruby", "magnus", "bindings"]
14
- categories = ["api-bindings"]
15
-
16
- [lib]
17
- name = "html_to_markdown_rb"
18
- crate-type = ["cdylib", "rlib"]
19
-
20
- [features]
21
- default = []
22
-
23
- [dependencies]
24
- html-to-markdown-rs = { version = "2.5.5", features = ["inline-images"] }
25
- magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = ["rb-sys"] }
26
-
27
- [dev-dependencies]
28
- pretty_assertions = "1.4"
data/src/lib.rs DELETED
@@ -1,432 +0,0 @@
1
- use html_to_markdown_rs::{
2
- convert as convert_inner, convert_with_inline_images as convert_with_inline_images_inner, error::ConversionError,
3
- CodeBlockStyle, ConversionOptions, HeadingStyle, HighlightStyle, HtmlExtraction, InlineImage, InlineImageConfig,
4
- InlineImageFormat, InlineImageSource, InlineImageWarning, ListIndentType, NewlineStyle, PreprocessingOptions,
5
- PreprocessingPreset, WhitespaceMode,
6
- };
7
- use magnus::prelude::*;
8
- use magnus::{function, scan_args::scan_args, Error, RArray, RHash, Ruby, Symbol, TryConvert, Value};
9
-
10
- const DEFAULT_INLINE_IMAGE_LIMIT: u64 = 5 * 1024 * 1024;
11
-
12
- fn conversion_error(err: ConversionError) -> Error {
13
- match err {
14
- ConversionError::ConfigError(msg) => arg_error(msg),
15
- other => runtime_error(other.to_string()),
16
- }
17
- }
18
-
19
- fn arg_error(message: impl Into<String>) -> Error {
20
- let ruby = Ruby::get().expect("Ruby not initialised");
21
- Error::new(ruby.exception_arg_error(), message.into())
22
- }
23
-
24
- fn runtime_error(message: impl Into<String>) -> Error {
25
- let ruby = Ruby::get().expect("Ruby not initialised");
26
- Error::new(ruby.exception_runtime_error(), message.into())
27
- }
28
-
29
- fn symbol_to_string(value: Value) -> Result<String, Error> {
30
- if let Some(symbol) = Symbol::from_value(value) {
31
- Ok(symbol.name()?.to_string())
32
- } else {
33
- String::try_convert(value)
34
- }
35
- }
36
-
37
- fn get_kw(ruby: &Ruby, hash: RHash, name: &str) -> Option<Value> {
38
- let sym = ruby.intern(name);
39
- hash.get(sym).or_else(|| hash.get(name))
40
- }
41
-
42
- fn parse_heading_style(value: Value) -> Result<HeadingStyle, Error> {
43
- match symbol_to_string(value)?.as_str() {
44
- "underlined" => Ok(HeadingStyle::Underlined),
45
- "atx" => Ok(HeadingStyle::Atx),
46
- "atx_closed" => Ok(HeadingStyle::AtxClosed),
47
- other => Err(arg_error(format!("invalid heading_style: {other}"))),
48
- }
49
- }
50
-
51
- fn parse_list_indent_type(value: Value) -> Result<ListIndentType, Error> {
52
- match symbol_to_string(value)?.as_str() {
53
- "spaces" => Ok(ListIndentType::Spaces),
54
- "tabs" => Ok(ListIndentType::Tabs),
55
- other => Err(arg_error(format!("invalid list_indent_type: {other}"))),
56
- }
57
- }
58
-
59
- fn parse_highlight_style(value: Value) -> Result<HighlightStyle, Error> {
60
- match symbol_to_string(value)?.as_str() {
61
- "double_equal" => Ok(HighlightStyle::DoubleEqual),
62
- "html" => Ok(HighlightStyle::Html),
63
- "bold" => Ok(HighlightStyle::Bold),
64
- "none" => Ok(HighlightStyle::None),
65
- other => Err(arg_error(format!("invalid highlight_style: {other}"))),
66
- }
67
- }
68
-
69
- fn parse_whitespace_mode(value: Value) -> Result<WhitespaceMode, Error> {
70
- match symbol_to_string(value)?.as_str() {
71
- "normalized" => Ok(WhitespaceMode::Normalized),
72
- "strict" => Ok(WhitespaceMode::Strict),
73
- other => Err(arg_error(format!("invalid whitespace_mode: {other}"))),
74
- }
75
- }
76
-
77
- fn parse_newline_style(value: Value) -> Result<NewlineStyle, Error> {
78
- match symbol_to_string(value)?.as_str() {
79
- "spaces" => Ok(NewlineStyle::Spaces),
80
- "backslash" => Ok(NewlineStyle::Backslash),
81
- other => Err(arg_error(format!("invalid newline_style: {other}"))),
82
- }
83
- }
84
-
85
- fn parse_code_block_style(value: Value) -> Result<CodeBlockStyle, Error> {
86
- match symbol_to_string(value)?.as_str() {
87
- "indented" => Ok(CodeBlockStyle::Indented),
88
- "backticks" => Ok(CodeBlockStyle::Backticks),
89
- "tildes" => Ok(CodeBlockStyle::Tildes),
90
- other => Err(arg_error(format!("invalid code_block_style: {other}"))),
91
- }
92
- }
93
-
94
- fn parse_preset(value: Value) -> Result<PreprocessingPreset, Error> {
95
- match symbol_to_string(value)?.as_str() {
96
- "minimal" => Ok(PreprocessingPreset::Minimal),
97
- "standard" => Ok(PreprocessingPreset::Standard),
98
- "aggressive" => Ok(PreprocessingPreset::Aggressive),
99
- other => Err(arg_error(format!("invalid preprocessing preset: {other}"))),
100
- }
101
- }
102
-
103
- fn parse_vec_of_strings(value: Value) -> Result<Vec<String>, Error> {
104
- let array = RArray::from_value(value).ok_or_else(|| arg_error("expected an Array of strings"))?;
105
-
106
- array.to_vec::<String>()
107
- }
108
-
109
- fn parse_preprocessing_options(ruby: &Ruby, value: Value) -> Result<PreprocessingOptions, Error> {
110
- let hash = RHash::from_value(value).ok_or_else(|| arg_error("expected preprocessing to be a Hash"))?;
111
-
112
- let mut opts = PreprocessingOptions::default();
113
-
114
- if let Some(enabled) = get_kw(ruby, hash, "enabled") {
115
- opts.enabled = bool::try_convert(enabled)?;
116
- }
117
-
118
- if let Some(preset) = get_kw(ruby, hash, "preset") {
119
- opts.preset = parse_preset(preset)?;
120
- }
121
-
122
- if let Some(remove_navigation) = get_kw(ruby, hash, "remove_navigation") {
123
- opts.remove_navigation = bool::try_convert(remove_navigation)?;
124
- }
125
-
126
- if let Some(remove_forms) = get_kw(ruby, hash, "remove_forms") {
127
- opts.remove_forms = bool::try_convert(remove_forms)?;
128
- }
129
-
130
- Ok(opts)
131
- }
132
-
133
- fn build_conversion_options(ruby: &Ruby, options: Option<Value>) -> Result<ConversionOptions, Error> {
134
- let mut opts = ConversionOptions::default();
135
-
136
- let Some(options) = options else {
137
- return Ok(opts);
138
- };
139
-
140
- if options.is_nil() {
141
- return Ok(opts);
142
- }
143
-
144
- let hash = RHash::from_value(options).ok_or_else(|| arg_error("options must be provided as a Hash"))?;
145
-
146
- if let Some(heading_style) = get_kw(ruby, hash, "heading_style") {
147
- opts.heading_style = parse_heading_style(heading_style)?;
148
- }
149
-
150
- if let Some(list_indent_type) = get_kw(ruby, hash, "list_indent_type") {
151
- opts.list_indent_type = parse_list_indent_type(list_indent_type)?;
152
- }
153
-
154
- if let Some(list_indent_width) = get_kw(ruby, hash, "list_indent_width") {
155
- opts.list_indent_width = usize::try_convert(list_indent_width)?;
156
- }
157
-
158
- if let Some(bullets) = get_kw(ruby, hash, "bullets") {
159
- opts.bullets = String::try_convert(bullets)?;
160
- }
161
-
162
- if let Some(strong_em_symbol) = get_kw(ruby, hash, "strong_em_symbol") {
163
- let value = String::try_convert(strong_em_symbol)?;
164
- let mut chars = value.chars();
165
- let ch = chars
166
- .next()
167
- .ok_or_else(|| arg_error("strong_em_symbol must not be empty"))?;
168
- if chars.next().is_some() {
169
- return Err(arg_error("strong_em_symbol must be a single character"));
170
- }
171
- opts.strong_em_symbol = ch;
172
- }
173
-
174
- if let Some(escape_asterisks) = get_kw(ruby, hash, "escape_asterisks") {
175
- opts.escape_asterisks = bool::try_convert(escape_asterisks)?;
176
- }
177
-
178
- if let Some(escape_underscores) = get_kw(ruby, hash, "escape_underscores") {
179
- opts.escape_underscores = bool::try_convert(escape_underscores)?;
180
- }
181
-
182
- if let Some(escape_misc) = get_kw(ruby, hash, "escape_misc") {
183
- opts.escape_misc = bool::try_convert(escape_misc)?;
184
- }
185
-
186
- if let Some(escape_ascii) = get_kw(ruby, hash, "escape_ascii") {
187
- opts.escape_ascii = bool::try_convert(escape_ascii)?;
188
- }
189
-
190
- if let Some(code_language) = get_kw(ruby, hash, "code_language") {
191
- opts.code_language = String::try_convert(code_language)?;
192
- }
193
-
194
- if let Some(autolinks) = get_kw(ruby, hash, "autolinks") {
195
- opts.autolinks = bool::try_convert(autolinks)?;
196
- }
197
-
198
- if let Some(default_title) = get_kw(ruby, hash, "default_title") {
199
- opts.default_title = bool::try_convert(default_title)?;
200
- }
201
-
202
- if let Some(br_in_tables) = get_kw(ruby, hash, "br_in_tables") {
203
- opts.br_in_tables = bool::try_convert(br_in_tables)?;
204
- }
205
-
206
- if let Some(hocr_spatial_tables) = get_kw(ruby, hash, "hocr_spatial_tables") {
207
- opts.hocr_spatial_tables = bool::try_convert(hocr_spatial_tables)?;
208
- }
209
-
210
- if let Some(highlight_style) = get_kw(ruby, hash, "highlight_style") {
211
- opts.highlight_style = parse_highlight_style(highlight_style)?;
212
- }
213
-
214
- if let Some(extract_metadata) = get_kw(ruby, hash, "extract_metadata") {
215
- opts.extract_metadata = bool::try_convert(extract_metadata)?;
216
- }
217
-
218
- if let Some(whitespace_mode) = get_kw(ruby, hash, "whitespace_mode") {
219
- opts.whitespace_mode = parse_whitespace_mode(whitespace_mode)?;
220
- }
221
-
222
- if let Some(strip_newlines) = get_kw(ruby, hash, "strip_newlines") {
223
- opts.strip_newlines = bool::try_convert(strip_newlines)?;
224
- }
225
-
226
- if let Some(wrap) = get_kw(ruby, hash, "wrap") {
227
- opts.wrap = bool::try_convert(wrap)?;
228
- }
229
-
230
- if let Some(wrap_width) = get_kw(ruby, hash, "wrap_width") {
231
- opts.wrap_width = usize::try_convert(wrap_width)?;
232
- }
233
-
234
- if let Some(convert_as_inline) = get_kw(ruby, hash, "convert_as_inline") {
235
- opts.convert_as_inline = bool::try_convert(convert_as_inline)?;
236
- }
237
-
238
- if let Some(sub_symbol) = get_kw(ruby, hash, "sub_symbol") {
239
- opts.sub_symbol = String::try_convert(sub_symbol)?;
240
- }
241
-
242
- if let Some(sup_symbol) = get_kw(ruby, hash, "sup_symbol") {
243
- opts.sup_symbol = String::try_convert(sup_symbol)?;
244
- }
245
-
246
- if let Some(newline_style) = get_kw(ruby, hash, "newline_style") {
247
- opts.newline_style = parse_newline_style(newline_style)?;
248
- }
249
-
250
- if let Some(code_block_style) = get_kw(ruby, hash, "code_block_style") {
251
- opts.code_block_style = parse_code_block_style(code_block_style)?;
252
- }
253
-
254
- if let Some(keep_inline_images_in) = get_kw(ruby, hash, "keep_inline_images_in") {
255
- opts.keep_inline_images_in = parse_vec_of_strings(keep_inline_images_in)?;
256
- }
257
-
258
- if let Some(preprocessing) = get_kw(ruby, hash, "preprocessing") {
259
- opts.preprocessing = parse_preprocessing_options(ruby, preprocessing)?;
260
- }
261
-
262
- if let Some(encoding) = get_kw(ruby, hash, "encoding") {
263
- opts.encoding = String::try_convert(encoding)?;
264
- }
265
-
266
- if let Some(debug) = get_kw(ruby, hash, "debug") {
267
- opts.debug = bool::try_convert(debug)?;
268
- }
269
-
270
- if let Some(strip_tags) = get_kw(ruby, hash, "strip_tags") {
271
- opts.strip_tags = parse_vec_of_strings(strip_tags)?;
272
- }
273
-
274
- if let Some(preserve_tags) = get_kw(ruby, hash, "preserve_tags") {
275
- opts.preserve_tags = parse_vec_of_strings(preserve_tags)?;
276
- }
277
-
278
- Ok(opts)
279
- }
280
-
281
- fn build_inline_image_config(ruby: &Ruby, config: Option<Value>) -> Result<InlineImageConfig, Error> {
282
- let mut cfg = InlineImageConfig::new(DEFAULT_INLINE_IMAGE_LIMIT);
283
-
284
- let Some(config) = config else {
285
- return Ok(cfg);
286
- };
287
-
288
- if config.is_nil() {
289
- return Ok(cfg);
290
- }
291
-
292
- let hash = RHash::from_value(config).ok_or_else(|| arg_error("inline image config must be provided as a Hash"))?;
293
-
294
- if let Some(limit) = get_kw(ruby, hash, "max_decoded_size_bytes") {
295
- cfg.max_decoded_size_bytes = u64::try_convert(limit)?;
296
- }
297
-
298
- if let Some(prefix) = get_kw(ruby, hash, "filename_prefix") {
299
- cfg.filename_prefix = if prefix.is_nil() {
300
- None
301
- } else {
302
- Some(String::try_convert(prefix)?)
303
- };
304
- }
305
-
306
- if let Some(capture_svg) = get_kw(ruby, hash, "capture_svg") {
307
- cfg.capture_svg = bool::try_convert(capture_svg)?;
308
- }
309
-
310
- if let Some(infer_dimensions) = get_kw(ruby, hash, "infer_dimensions") {
311
- cfg.infer_dimensions = bool::try_convert(infer_dimensions)?;
312
- }
313
-
314
- Ok(cfg)
315
- }
316
-
317
- fn inline_image_to_value(ruby: &Ruby, image: InlineImage) -> Result<Value, Error> {
318
- let InlineImage {
319
- data,
320
- format,
321
- filename,
322
- description,
323
- dimensions,
324
- source,
325
- attributes,
326
- } = image;
327
-
328
- let hash = ruby.hash_new();
329
- let data_value = ruby.str_from_slice(&data);
330
- hash.aset(ruby.intern("data"), data_value)?;
331
-
332
- let format_value = match format {
333
- InlineImageFormat::Png => "png".to_string(),
334
- InlineImageFormat::Jpeg => "jpeg".to_string(),
335
- InlineImageFormat::Gif => "gif".to_string(),
336
- InlineImageFormat::Bmp => "bmp".to_string(),
337
- InlineImageFormat::Webp => "webp".to_string(),
338
- InlineImageFormat::Svg => "svg".to_string(),
339
- InlineImageFormat::Other(other) => other,
340
- };
341
- hash.aset(ruby.intern("format"), format_value)?;
342
-
343
- match filename {
344
- Some(name) => hash.aset(ruby.intern("filename"), name)?,
345
- None => hash.aset(ruby.intern("filename"), ruby.qnil())?,
346
- }
347
-
348
- match description {
349
- Some(desc) => hash.aset(ruby.intern("description"), desc)?,
350
- None => hash.aset(ruby.intern("description"), ruby.qnil())?,
351
- }
352
-
353
- if let Some((width, height)) = dimensions {
354
- let dims = ruby.ary_new();
355
- dims.push(width as i64)?;
356
- dims.push(height as i64)?;
357
- hash.aset(ruby.intern("dimensions"), dims)?;
358
- } else {
359
- hash.aset(ruby.intern("dimensions"), ruby.qnil())?;
360
- }
361
-
362
- let source_value = match source {
363
- InlineImageSource::ImgDataUri => "img_data_uri",
364
- InlineImageSource::SvgElement => "svg_element",
365
- };
366
- hash.aset(ruby.intern("source"), source_value)?;
367
-
368
- let attrs = ruby.hash_new();
369
- for (key, value) in attributes {
370
- attrs.aset(key, value)?;
371
- }
372
- hash.aset(ruby.intern("attributes"), attrs)?;
373
-
374
- Ok(hash.as_value())
375
- }
376
-
377
- fn warning_to_value(ruby: &Ruby, warning: InlineImageWarning) -> Result<Value, Error> {
378
- let hash = ruby.hash_new();
379
- hash.aset(ruby.intern("index"), warning.index as i64)?;
380
- hash.aset(ruby.intern("message"), warning.message)?;
381
- Ok(hash.as_value())
382
- }
383
-
384
- fn extraction_to_value(ruby: &Ruby, extraction: HtmlExtraction) -> Result<Value, Error> {
385
- let hash = ruby.hash_new();
386
- hash.aset(ruby.intern("markdown"), extraction.markdown)?;
387
-
388
- let inline_images = ruby.ary_new();
389
- for image in extraction.inline_images {
390
- inline_images.push(inline_image_to_value(ruby, image)?)?;
391
- }
392
- hash.aset(ruby.intern("inline_images"), inline_images)?;
393
-
394
- let warnings = ruby.ary_new();
395
- for warning in extraction.warnings {
396
- warnings.push(warning_to_value(ruby, warning)?)?;
397
- }
398
- hash.aset(ruby.intern("warnings"), warnings)?;
399
-
400
- Ok(hash.as_value())
401
- }
402
-
403
- fn convert_fn(ruby: &Ruby, args: &[Value]) -> Result<String, Error> {
404
- let parsed = scan_args::<(String,), (Option<Value>,), (), (), (), ()>(args)?;
405
- let html = parsed.required.0;
406
- let options = build_conversion_options(ruby, parsed.optional.0)?;
407
-
408
- convert_inner(&html, Some(options)).map_err(conversion_error)
409
- }
410
-
411
- fn convert_with_inline_images_fn(ruby: &Ruby, args: &[Value]) -> Result<Value, Error> {
412
- let parsed = scan_args::<(String,), (Option<Value>, Option<Value>), (), (), (), ()>(args)?;
413
- let html = parsed.required.0;
414
- let options = build_conversion_options(ruby, parsed.optional.0)?;
415
- let config = build_inline_image_config(ruby, parsed.optional.1)?;
416
-
417
- let extraction = convert_with_inline_images_inner(&html, Some(options), config).map_err(conversion_error)?;
418
-
419
- extraction_to_value(ruby, extraction)
420
- }
421
-
422
- #[magnus::init]
423
- fn init(ruby: &Ruby) -> Result<(), Error> {
424
- let module = ruby.define_module("HtmlToMarkdown")?;
425
- module.define_singleton_method("convert", function!(convert_fn, -1))?;
426
- module.define_singleton_method(
427
- "convert_with_inline_images",
428
- function!(convert_with_inline_images_fn, -1),
429
- )?;
430
-
431
- Ok(())
432
- }