html-to-markdown 2.6.3 → 2.6.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f64511afc3484dfb2deb8fd65e0d6b52a60e1db075437281fad2773bf936b4fb
4
- data.tar.gz: 8e03e85a5f741b798a4a7e4d86f43847e72dc75801c4e4f3bc50de2ec6f7ed14
3
+ metadata.gz: ce9f59ee100c4d4b3de28c61803fd2940681d3bc8bd46e1730598269e6283e1b
4
+ data.tar.gz: 52619ee971d3f51f040bd05a516517fd6082ea0ff10395dcb239f7081055234c
5
5
  SHA512:
6
- metadata.gz: 61a31a784899e72b4d33f7afe2762b5493325e2db9e8c3bebc7d029098ce9e8a30d1b9d1133b986fb68094c543bf402550396a9b86c2e10ab069737452756a68
7
- data.tar.gz: 528b7d9a5906a582e4a131896634789e36426ee7f4da65d8f0b28bb37dac2ca8afeff869f6228132c62e3b41507db49dd81c32b0c7536099f6677907679186de
6
+ metadata.gz: ee2c1b632ba2ec4edb5b449b6f9b9050229d15858b6434bb76ea327ca4b129f02a629d4e10daf1d36c86c2725a3aa431484aa92ecff7503b7adfc73b83697bb5
7
+ data.tar.gz: b2e48f4f60f8bd05bc32621ca8fb43e27886ae220a98a7ee378a1ac0796d4dc7ed43fc70ffe6391c53da20e2b85324e4dbe0bc98922cbffe64471b70f784a5a6
data/.rubocop.yml CHANGED
@@ -1,29 +1,29 @@
1
- plugins:
2
- - rubocop-rspec
3
-
4
- AllCops:
5
- NewCops: enable
6
- TargetRubyVersion: 3.2
7
- Exclude:
8
- - "tmp/**/*"
9
- - "vendor/**/*"
10
-
11
- Style/Documentation:
12
- Enabled: false
13
-
14
- Metrics/BlockLength:
15
- Exclude:
16
- - "spec/**/*"
17
- - "*.gemspec"
18
-
19
- Metrics/MethodLength:
20
- Max: 15
21
-
22
- RSpec/MultipleExpectations:
23
- Enabled: false
24
-
25
- RSpec/ExampleLength:
26
- Enabled: false
27
-
28
- RSpec/SpecFilePathFormat:
29
- Enabled: false
1
+ plugins:
2
+ - rubocop-rspec
3
+
4
+ AllCops:
5
+ NewCops: enable
6
+ TargetRubyVersion: 3.2
7
+ Exclude:
8
+ - "tmp/**/*"
9
+ - "vendor/**/*"
10
+
11
+ Style/Documentation:
12
+ Enabled: false
13
+
14
+ Metrics/BlockLength:
15
+ Exclude:
16
+ - "spec/**/*"
17
+ - "*.gemspec"
18
+
19
+ Metrics/MethodLength:
20
+ Max: 15
21
+
22
+ RSpec/MultipleExpectations:
23
+ Enabled: false
24
+
25
+ RSpec/ExampleLength:
26
+ Enabled: false
27
+
28
+ RSpec/SpecFilePathFormat:
29
+ Enabled: false
data/Gemfile CHANGED
@@ -1,15 +1,15 @@
1
- # frozen_string_literal: true
2
-
3
- source 'https://rubygems.org'
4
-
5
- ruby '>= 3.2'
6
-
7
- gemspec
8
-
9
- group :development, :test do
10
- gem 'rake-compiler'
11
- gem 'rb_sys' # provides build tooling when developing locally
12
- gem 'rspec'
13
- gem 'rubocop', require: false
14
- gem 'rubocop-rspec', require: false
15
- end
1
+ # frozen_string_literal: true
2
+
3
+ source 'https://rubygems.org'
4
+
5
+ ruby '>= 3.2'
6
+
7
+ gemspec
8
+
9
+ group :development, :test do
10
+ gem 'rake-compiler'
11
+ gem 'rb_sys' # provides build tooling when developing locally
12
+ gem 'rspec'
13
+ gem 'rubocop', require: false
14
+ gem 'rubocop-rspec', require: false
15
+ end
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- html-to-markdown (2.6.3)
4
+ html-to-markdown (2.6.4)
5
5
  rb_sys (>= 0.9, < 1.0)
6
6
 
7
7
  GEM
@@ -64,6 +64,7 @@ GEM
64
64
  PLATFORMS
65
65
  arm64-darwin-24
66
66
  ruby
67
+ x64-mingw-ucrt
67
68
 
68
69
  DEPENDENCIES
69
70
  html-to-markdown!
data/README.md CHANGED
@@ -1,210 +1,210 @@
1
- # html-to-markdown-rb
2
-
3
- Blazing-fast HTML → Markdown conversion for Ruby, powered by the same Rust engine used by our Python, Node.js, and WebAssembly packages. Ship identical Markdown across every runtime while enjoying native extension performance.
4
-
5
- [![Crates.io](https://img.shields.io/crates/v/html-to-markdown-rs.svg)](https://crates.io/crates/html-to-markdown-rs)
6
- [![npm (node)](https://badge.fury.io/js/html-to-markdown-node.svg)](https://www.npmjs.com/package/html-to-markdown-node)
7
- [![npm (wasm)](https://badge.fury.io/js/html-to-markdown-wasm.svg)](https://www.npmjs.com/package/html-to-markdown-wasm)
8
- [![npm (typescript)](https://badge.fury.io/js/html-to-markdown.svg)](https://www.npmjs.com/package/html-to-markdown)
9
- [![PyPI](https://badge.fury.io/py/html-to-markdown.svg)](https://pypi.org/project/html-to-markdown/)
10
- [![Packagist](https://img.shields.io/packagist/v/goldziher/html-to-markdown.svg)](https://packagist.org/packages/goldziher/html-to-markdown)
11
- [![RubyGems](https://badge.fury.io/rb/html-to-markdown.svg)](https://rubygems.org/gems/html-to-markdown)
12
- [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/Goldziher/html-to-markdown/blob/main/LICENSE)
13
-
14
- ## Features
15
-
16
- - ⚡ **Rust-fast**: Ruby bindings around a highly optimised Rust core (60‑80× faster than BeautifulSoup-based converters).
17
- - 🔁 **Identical output**: Shares logic with the Python wheels, npm bindings, WASM package, and CLI — consistent Markdown everywhere.
18
- - ⚙️ **Rich configuration**: Control heading styles, list indentation, whitespace handling, HTML preprocessing, and more.
19
- - 🖼️ **Inline image extraction**: Pull out embedded images (PNG/JPEG/SVG/data URIs) alongside Markdown.
20
- - 🧰 **Bundled CLI proxy**: Call the Rust CLI straight from Ruby or shell scripts.
21
- - 🛠️ **First-class Rails support**: Works with `Gem.win_platform?` builds, supports Trusted Publishing, and compiles on install if no native gem matches.
22
-
23
- ## Documentation & Support
24
-
25
- - [GitHub repository](https://github.com/Goldziher/html-to-markdown)
26
- - [Issue tracker](https://github.com/Goldziher/html-to-markdown/issues)
27
- - [Changelog](https://github.com/Goldziher/html-to-markdown/blob/main/CHANGELOG.md)
28
- - [Live demo (WASM)](https://goldziher.github.io/html-to-markdown/)
29
-
30
- ## Installation
31
-
32
- ```bash
33
- bundle add html-to-markdown
34
- # or
35
- gem install html-to-markdown
36
- ```
37
-
38
- Add the gem to your project and Bundler will compile the native Rust extension on first install.
39
-
40
- ### Requirements
41
-
42
- - Ruby **3.2+** (Magnus relies on the fiber scheduler APIs added in 3.2)
43
- - Rust toolchain **1.85+** with Cargo available on your `$PATH`
44
- - Ruby development headers (`ruby-dev`, `ruby-devel`, or the platform equivalent)
45
-
46
- **Windows**: install [RubyInstaller with MSYS2](https://rubyinstaller.org/) (UCRT64). Run once:
47
-
48
- ```powershell
49
- ridk exec pacman -S --needed --noconfirm base-devel mingw-w64-ucrt-x86_64-toolchain
50
- ```
51
-
52
- This provides the standard headers (including `strings.h`) required for the bindgen step.
53
-
54
- ## Performance Snapshot
55
-
56
- Apple M4 • Real Wikipedia documents • `HtmlToMarkdown.convert` (Ruby)
57
-
58
- | Document | Size | Latency | Throughput | Docs/sec |
59
- | ------------------- | ----- | ------- | ---------- | -------- |
60
- | Lists (Timeline) | 129KB | 0.69ms | 187 MB/s | 1,450 |
61
- | Tables (Countries) | 360KB | 2.19ms | 164 MB/s | 456 |
62
- | Mixed (Python wiki) | 656KB | 4.88ms | 134 MB/s | 205 |
63
-
64
- > Same core, same benchmarks: the Ruby extension stays within single-digit % of the Rust CLI and mirrors the Python/Node numbers.
65
-
66
- ## Quick Start
67
-
68
- ```ruby
69
- require 'html_to_markdown'
70
-
71
- html = <<~HTML
72
- <h1>Welcome</h1>
73
- <p>This is <strong>Rust-fast</strong> conversion!</p>
74
- <ul>
75
- <li>Native extension</li>
76
- <li>Identical output across languages</li>
77
- </ul>
78
- HTML
79
-
80
- markdown = HtmlToMarkdown.convert(html)
81
- puts markdown
82
- # # Welcome
83
- #
84
- # This is **Rust-fast** conversion!
85
- #
86
- # - Native extension
87
- # - Identical output across languages
88
- ```
89
-
90
- ## API
91
-
92
- ### Conversion Options
93
-
94
- Pass a Ruby hash (string or symbol keys) to tweak rendering. Every option maps one-for-one with the Rust/Python/Node APIs.
95
-
96
- ```ruby
97
- require 'html_to_markdown'
98
-
99
- markdown = HtmlToMarkdown.convert(
100
- '<pre><code class="language-ruby">puts "hi"</code></pre>',
101
- heading_style: :atx,
102
- code_block_style: :fenced,
103
- bullets: '*+-',
104
- list_indent_type: :spaces,
105
- list_indent_width: 2,
106
- whitespace_mode: :normalized,
107
- highlight_style: :double_equal
108
- )
109
-
110
- puts markdown
111
- ```
112
-
113
- ### HTML Preprocessing
114
-
115
- Clean up scraped HTML (navigation, forms, malformed markup) before conversion:
116
-
117
- ```ruby
118
- require 'html_to_markdown'
119
-
120
- markdown = HtmlToMarkdown.convert(
121
- html,
122
- preprocessing: {
123
- enabled: true,
124
- preset: :aggressive, # :minimal, :standard, :aggressive
125
- remove_navigation: true,
126
- remove_forms: true
127
- }
128
- )
129
- ```
130
-
131
- ### Inline Images
132
-
133
- Extract inline binary data (data URIs, SVG) together with the converted Markdown.
134
-
135
- ```ruby
136
- require 'html_to_markdown'
137
-
138
- result = HtmlToMarkdown.convert_with_inline_images(
139
- '<img src="..." alt="Pixel">',
140
- image_config: {
141
- max_decoded_size_bytes: 1 * 1024 * 1024,
142
- infer_dimensions: true,
143
- filename_prefix: 'img_',
144
- capture_svg: true
145
- }
146
- )
147
-
148
- puts result.markdown
149
- result.inline_images.each do |img|
150
- puts "#{img.filename} -> #{img.format} (#{img.data.bytesize} bytes)"
151
- end
152
- ```
153
-
154
- ## CLI
155
-
156
- The gem bundles a small proxy for the Rust CLI binary. Use it when you need parity with the standalone `html-to-markdown` executable.
157
-
158
- ```ruby
159
- require 'html_to_markdown/cli'
160
-
161
- HtmlToMarkdown::CLI.run(%w[--heading-style atx input.html], stdout: $stdout)
162
- # => writes converted Markdown to STDOUT
163
- ```
164
-
165
- You can also call the CLI binary directly for scripting:
166
-
167
- ```ruby
168
- HtmlToMarkdown::CLIProxy.call(['--version'])
169
- # => "html-to-markdown 2.5.7"
170
- ```
171
-
172
- Rebuild the CLI locally if you see `CLI binary not built` during tests:
173
-
174
- ```bash
175
- bundle exec rake compile # builds the extension
176
- bundle exec ruby scripts/prepare_ruby_gem.rb # copies the CLI into lib/bin/
177
- ```
178
-
179
- ## Error Handling
180
-
181
- Conversion errors raise `HtmlToMarkdown::Error` (wrapping the Rust error context). CLI invocations use specialised subclasses:
182
-
183
- - `HtmlToMarkdown::CLIProxy::MissingBinaryError`
184
- - `HtmlToMarkdown::CLIProxy::CLIExecutionError`
185
-
186
- Rescue them to provide clearer feedback in your application.
187
-
188
- ## Consistent Across Languages
189
-
190
- The Ruby gem shares the exact Rust core with:
191
-
192
- - [Python wheels](https://pypi.org/project/html-to-markdown/)
193
- - [Node.js / Bun bindings](https://www.npmjs.com/package/html-to-markdown-node)
194
- - [WebAssembly package](https://www.npmjs.com/package/html-to-markdown-wasm)
195
- - The Rust crate and CLI
196
-
197
- Use whichever runtime fits your stack while keeping formatting behaviour identical.
198
-
199
- ## Development
200
-
201
- ```bash
202
- bundle exec rake compile # build the native extension
203
- bundle exec rspec # run test suite
204
- ```
205
-
206
- The extension uses [Magnus](https://github.com/matsadler/magnus) plus `rb-sys` for bindgen. When editing the Rust code under `src/`, rerun `rake compile`.
207
-
208
- ## License
209
-
210
- MIT © Na'aman Hirschfeld
1
+ # html-to-markdown-rb
2
+
3
+ Blazing-fast HTML → Markdown conversion for Ruby, powered by the same Rust engine used by our Python, Node.js, and WebAssembly packages. Ship identical Markdown across every runtime while enjoying native extension performance.
4
+
5
+ [![Crates.io](https://img.shields.io/crates/v/html-to-markdown-rs.svg)](https://crates.io/crates/html-to-markdown-rs)
6
+ [![npm (node)](https://badge.fury.io/js/html-to-markdown-node.svg)](https://www.npmjs.com/package/html-to-markdown-node)
7
+ [![npm (wasm)](https://badge.fury.io/js/html-to-markdown-wasm.svg)](https://www.npmjs.com/package/html-to-markdown-wasm)
8
+ [![npm (typescript)](https://badge.fury.io/js/html-to-markdown.svg)](https://www.npmjs.com/package/html-to-markdown)
9
+ [![PyPI](https://badge.fury.io/py/html-to-markdown.svg)](https://pypi.org/project/html-to-markdown/)
10
+ [![Packagist](https://img.shields.io/packagist/v/goldziher/html-to-markdown.svg)](https://packagist.org/packages/goldziher/html-to-markdown)
11
+ [![RubyGems](https://badge.fury.io/rb/html-to-markdown.svg)](https://rubygems.org/gems/html-to-markdown)
12
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/Goldziher/html-to-markdown/blob/main/LICENSE)
13
+
14
+ ## Features
15
+
16
+ - ⚡ **Rust-fast**: Ruby bindings around a highly optimised Rust core (60‑80× faster than BeautifulSoup-based converters).
17
+ - 🔁 **Identical output**: Shares logic with the Python wheels, npm bindings, WASM package, and CLI — consistent Markdown everywhere.
18
+ - ⚙️ **Rich configuration**: Control heading styles, list indentation, whitespace handling, HTML preprocessing, and more.
19
+ - 🖼️ **Inline image extraction**: Pull out embedded images (PNG/JPEG/SVG/data URIs) alongside Markdown.
20
+ - 🧰 **Bundled CLI proxy**: Call the Rust CLI straight from Ruby or shell scripts.
21
+ - 🛠️ **First-class Rails support**: Works with `Gem.win_platform?` builds, supports Trusted Publishing, and compiles on install if no native gem matches.
22
+
23
+ ## Documentation & Support
24
+
25
+ - [GitHub repository](https://github.com/Goldziher/html-to-markdown)
26
+ - [Issue tracker](https://github.com/Goldziher/html-to-markdown/issues)
27
+ - [Changelog](https://github.com/Goldziher/html-to-markdown/blob/main/CHANGELOG.md)
28
+ - [Live demo (WASM)](https://goldziher.github.io/html-to-markdown/)
29
+
30
+ ## Installation
31
+
32
+ ```bash
33
+ bundle add html-to-markdown
34
+ # or
35
+ gem install html-to-markdown
36
+ ```
37
+
38
+ Add the gem to your project and Bundler will compile the native Rust extension on first install.
39
+
40
+ ### Requirements
41
+
42
+ - Ruby **3.2+** (Magnus relies on the fiber scheduler APIs added in 3.2)
43
+ - Rust toolchain **1.85+** with Cargo available on your `$PATH`
44
+ - Ruby development headers (`ruby-dev`, `ruby-devel`, or the platform equivalent)
45
+
46
+ **Windows**: install [RubyInstaller with MSYS2](https://rubyinstaller.org/) (UCRT64). Run once:
47
+
48
+ ```powershell
49
+ ridk exec pacman -S --needed --noconfirm base-devel mingw-w64-ucrt-x86_64-toolchain
50
+ ```
51
+
52
+ This provides the standard headers (including `strings.h`) required for the bindgen step.
53
+
54
+ ## Performance Snapshot
55
+
56
+ Apple M4 • Real Wikipedia documents • `HtmlToMarkdown.convert` (Ruby)
57
+
58
+ | Document | Size | Latency | Throughput | Docs/sec |
59
+ | ------------------- | ----- | ------- | ---------- | -------- |
60
+ | Lists (Timeline) | 129KB | 0.69ms | 187 MB/s | 1,450 |
61
+ | Tables (Countries) | 360KB | 2.19ms | 164 MB/s | 456 |
62
+ | Mixed (Python wiki) | 656KB | 4.88ms | 134 MB/s | 205 |
63
+
64
+ > Same core, same benchmarks: the Ruby extension stays within single-digit % of the Rust CLI and mirrors the Python/Node numbers.
65
+
66
+ ## Quick Start
67
+
68
+ ```ruby
69
+ require 'html_to_markdown'
70
+
71
+ html = <<~HTML
72
+ <h1>Welcome</h1>
73
+ <p>This is <strong>Rust-fast</strong> conversion!</p>
74
+ <ul>
75
+ <li>Native extension</li>
76
+ <li>Identical output across languages</li>
77
+ </ul>
78
+ HTML
79
+
80
+ markdown = HtmlToMarkdown.convert(html)
81
+ puts markdown
82
+ # # Welcome
83
+ #
84
+ # This is **Rust-fast** conversion!
85
+ #
86
+ # - Native extension
87
+ # - Identical output across languages
88
+ ```
89
+
90
+ ## API
91
+
92
+ ### Conversion Options
93
+
94
+ Pass a Ruby hash (string or symbol keys) to tweak rendering. Every option maps one-for-one with the Rust/Python/Node APIs.
95
+
96
+ ```ruby
97
+ require 'html_to_markdown'
98
+
99
+ markdown = HtmlToMarkdown.convert(
100
+ '<pre><code class="language-ruby">puts "hi"</code></pre>',
101
+ heading_style: :atx,
102
+ code_block_style: :fenced,
103
+ bullets: '*+-',
104
+ list_indent_type: :spaces,
105
+ list_indent_width: 2,
106
+ whitespace_mode: :normalized,
107
+ highlight_style: :double_equal
108
+ )
109
+
110
+ puts markdown
111
+ ```
112
+
113
+ ### HTML Preprocessing
114
+
115
+ Clean up scraped HTML (navigation, forms, malformed markup) before conversion:
116
+
117
+ ```ruby
118
+ require 'html_to_markdown'
119
+
120
+ markdown = HtmlToMarkdown.convert(
121
+ html,
122
+ preprocessing: {
123
+ enabled: true,
124
+ preset: :aggressive, # :minimal, :standard, :aggressive
125
+ remove_navigation: true,
126
+ remove_forms: true
127
+ }
128
+ )
129
+ ```
130
+
131
+ ### Inline Images
132
+
133
+ Extract inline binary data (data URIs, SVG) together with the converted Markdown.
134
+
135
+ ```ruby
136
+ require 'html_to_markdown'
137
+
138
+ result = HtmlToMarkdown.convert_with_inline_images(
139
+ '<img src="..." alt="Pixel">',
140
+ image_config: {
141
+ max_decoded_size_bytes: 1 * 1024 * 1024,
142
+ infer_dimensions: true,
143
+ filename_prefix: 'img_',
144
+ capture_svg: true
145
+ }
146
+ )
147
+
148
+ puts result.markdown
149
+ result.inline_images.each do |img|
150
+ puts "#{img.filename} -> #{img.format} (#{img.data.bytesize} bytes)"
151
+ end
152
+ ```
153
+
154
+ ## CLI
155
+
156
+ The gem bundles a small proxy for the Rust CLI binary. Use it when you need parity with the standalone `html-to-markdown` executable.
157
+
158
+ ```ruby
159
+ require 'html_to_markdown/cli'
160
+
161
+ HtmlToMarkdown::CLI.run(%w[--heading-style atx input.html], stdout: $stdout)
162
+ # => writes converted Markdown to STDOUT
163
+ ```
164
+
165
+ You can also call the CLI binary directly for scripting:
166
+
167
+ ```ruby
168
+ HtmlToMarkdown::CLIProxy.call(['--version'])
169
+ # => "html-to-markdown 2.5.7"
170
+ ```
171
+
172
+ Rebuild the CLI locally if you see `CLI binary not built` during tests:
173
+
174
+ ```bash
175
+ bundle exec rake compile # builds the extension
176
+ bundle exec ruby scripts/prepare_ruby_gem.rb # copies the CLI into lib/bin/
177
+ ```
178
+
179
+ ## Error Handling
180
+
181
+ Conversion errors raise `HtmlToMarkdown::Error` (wrapping the Rust error context). CLI invocations use specialised subclasses:
182
+
183
+ - `HtmlToMarkdown::CLIProxy::MissingBinaryError`
184
+ - `HtmlToMarkdown::CLIProxy::CLIExecutionError`
185
+
186
+ Rescue them to provide clearer feedback in your application.
187
+
188
+ ## Consistent Across Languages
189
+
190
+ The Ruby gem shares the exact Rust core with:
191
+
192
+ - [Python wheels](https://pypi.org/project/html-to-markdown/)
193
+ - [Node.js / Bun bindings](https://www.npmjs.com/package/html-to-markdown-node)
194
+ - [WebAssembly package](https://www.npmjs.com/package/html-to-markdown-wasm)
195
+ - The Rust crate and CLI
196
+
197
+ Use whichever runtime fits your stack while keeping formatting behaviour identical.
198
+
199
+ ## Development
200
+
201
+ ```bash
202
+ bundle exec rake compile # build the native extension
203
+ bundle exec rspec # run test suite
204
+ ```
205
+
206
+ The extension uses [Magnus](https://github.com/matsadler/magnus) plus `rb-sys` for bindgen. When editing the Rust code under `src/`, rerun `rake compile`.
207
+
208
+ ## License
209
+
210
+ MIT © Na'aman Hirschfeld
data/Rakefile CHANGED
@@ -1,24 +1,24 @@
1
- # frozen_string_literal: true
2
-
3
- require 'bundler/gem_tasks'
4
- require 'rb_sys/extensiontask'
5
- require 'rspec/core/rake_task'
6
-
7
- GEMSPEC = Gem::Specification.load(File.expand_path('html-to-markdown-rb.gemspec', __dir__))
8
-
9
- RbSys::ExtensionTask.new('html-to-markdown-rb', GEMSPEC) do |ext|
10
- ext.lib_dir = 'lib'
11
- ext.ext_dir = 'ext/html-to-markdown-rb'
12
- ext.cross_compile = true
13
- ext.cross_platform = %w[
14
- x86_64-linux
15
- x86_64-darwin
16
- arm64-darwin
17
- x64-mingw32
18
- ]
19
- end
20
-
21
- RSpec::Core::RakeTask.new(:spec)
22
-
23
- task spec: :compile
24
- task default: :spec
1
+ # frozen_string_literal: true
2
+
3
+ require 'bundler/gem_tasks'
4
+ require 'rb_sys/extensiontask'
5
+ require 'rspec/core/rake_task'
6
+
7
+ GEMSPEC = Gem::Specification.load(File.expand_path('html-to-markdown-rb.gemspec', __dir__))
8
+
9
+ RbSys::ExtensionTask.new('html-to-markdown-rb', GEMSPEC) do |ext|
10
+ ext.lib_dir = 'lib'
11
+ ext.ext_dir = 'ext/html-to-markdown-rb'
12
+ ext.cross_compile = true
13
+ ext.cross_platform = %w[
14
+ x86_64-linux
15
+ x86_64-darwin
16
+ arm64-darwin
17
+ x64-mingw32
18
+ ]
19
+ end
20
+
21
+ RSpec::Core::RakeTask.new(:spec)
22
+
23
+ task spec: :compile
24
+ task default: :spec
data/exe/html-to-markdown CHANGED
@@ -1,6 +1,6 @@
1
- #!/usr/bin/env ruby
2
- # frozen_string_literal: true
3
-
4
- require 'html_to_markdown/cli'
5
-
6
- exit HtmlToMarkdown::CLI.run(ARGV)
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require 'html_to_markdown/cli'
5
+
6
+ exit HtmlToMarkdown::CLI.run(ARGV)
@@ -1,28 +1,28 @@
1
- # frozen_string_literal: true
2
-
3
- require 'mkmf'
4
- require 'rb_sys/mkmf'
5
- require 'rbconfig'
6
-
7
- if RbConfig::CONFIG['host_os'] =~ /mswin|mingw/
8
- devkit = ENV.fetch('RI_DEVKIT', nil)
9
- prefix = ENV['MSYSTEM_PREFIX'] || '/ucrt64'
10
-
11
- if devkit
12
- sysroot = "#{devkit}#{prefix}".tr('\\\\', '/')
13
- extra_args = [
14
- '--target=x86_64-pc-windows-gnu',
15
- "--sysroot=#{sysroot}"
16
- ]
17
-
18
- existing = ENV['BINDGEN_EXTRA_CLANG_ARGS'].to_s.split(/\s+/)
19
- ENV['BINDGEN_EXTRA_CLANG_ARGS'] = (existing + extra_args).uniq.join(' ')
20
- end
21
- end
22
-
23
- default_profile = ENV.fetch('CARGO_PROFILE', 'release')
24
-
25
- create_rust_makefile('html_to_markdown_rb') do |config|
26
- config.profile = default_profile.to_sym
27
- config.ext_dir = '../../../../crates/html-to-markdown-rb'
28
- end
1
+ # frozen_string_literal: true
2
+
3
+ require 'mkmf'
4
+ require 'rb_sys/mkmf'
5
+ require 'rbconfig'
6
+
7
+ if RbConfig::CONFIG['host_os'] =~ /mswin|mingw/
8
+ devkit = ENV.fetch('RI_DEVKIT', nil)
9
+ prefix = ENV['MSYSTEM_PREFIX'] || '/ucrt64'
10
+
11
+ if devkit
12
+ sysroot = "#{devkit}#{prefix}".tr('\\\\', '/')
13
+ extra_args = [
14
+ '--target=x86_64-pc-windows-gnu',
15
+ "--sysroot=#{sysroot}"
16
+ ]
17
+
18
+ existing = ENV['BINDGEN_EXTRA_CLANG_ARGS'].to_s.split(/\s+/)
19
+ ENV['BINDGEN_EXTRA_CLANG_ARGS'] = (existing + extra_args).uniq.join(' ')
20
+ end
21
+ end
22
+
23
+ default_profile = ENV.fetch('CARGO_PROFILE', 'release')
24
+
25
+ create_rust_makefile('html_to_markdown_rb') do |config|
26
+ config.profile = default_profile.to_sym
27
+ config.ext_dir = '../../../../crates/html-to-markdown-rb'
28
+ end
@@ -1,59 +1,59 @@
1
- # frozen_string_literal: true
2
-
3
- require_relative 'lib/html_to_markdown/version'
4
-
5
- repo_root = File.expand_path('../..', __dir__)
6
- crate_prefix = 'packages/ruby/'
7
- git_cmd = %(git -C "#{repo_root}" ls-files -z #{crate_prefix})
8
- git_files =
9
- `#{git_cmd}`.split("\x0")
10
- .select { |path| path.start_with?(crate_prefix) }
11
- .map { |path| path.delete_prefix(crate_prefix) }
12
- fallback_files = Dir.chdir(__dir__) do
13
- Dir.glob(
14
- %w[
15
- README.md
16
- ext/html-to-markdown-rb/extconf.rb
17
- exe/*
18
- lib/**/*.rb
19
- lib/bin/*
20
- src/**/*.rs
21
- spec/**/*.rb
22
- ]
23
- )
24
- end
25
- files = git_files.empty? ? fallback_files : git_files
26
-
27
- Gem::Specification.new do |spec|
28
- spec.name = 'html-to-markdown'
29
- spec.version = HtmlToMarkdown::VERSION
30
- spec.authors = ["Na'aman Hirschfeld"]
31
- spec.email = ['nhirschfeld@gmail.com']
32
-
33
- spec.summary = 'Blazing-fast HTML to Markdown conversion for Ruby, powered by Rust.'
34
- spec.description = <<~DESC.strip
35
- html-to-markdown is a native Ruby extension built on the shared Rust engine that powers the html-to-markdown project.
36
- It delivers identical HTML-to-Markdown output across languages, exposes inline image extraction, and ships with a CLI for automation workflows.
37
- DESC
38
- spec.homepage = 'https://github.com/Goldziher/html-to-markdown'
39
- spec.license = 'MIT'
40
-
41
- spec.required_ruby_version = Gem::Requirement.new('>= 3.2')
42
-
43
- spec.bindir = 'exe'
44
- spec.executables = ['html-to-markdown']
45
- spec.require_paths = ['lib']
46
-
47
- spec.files = files
48
- spec.extra_rdoc_files = ['README.md']
49
-
50
- spec.extensions = ['ext/html-to-markdown-rb/extconf.rb']
51
-
52
- spec.add_dependency 'rb_sys', '>= 0.9', '< 1.0'
53
- spec.metadata['rubygems_mfa_required'] = 'true'
54
- spec.metadata['homepage_uri'] = 'https://github.com/Goldziher/html-to-markdown'
55
- spec.metadata['source_code_uri'] = 'https://github.com/Goldziher/html-to-markdown'
56
- spec.metadata['bug_tracker_uri'] = 'https://github.com/Goldziher/html-to-markdown/issues'
57
- spec.metadata['changelog_uri'] = 'https://github.com/Goldziher/html-to-markdown/releases'
58
- spec.metadata['documentation_uri'] = 'https://github.com/Goldziher/html-to-markdown/blob/main/packages/ruby/README.md'
59
- end
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'lib/html_to_markdown/version'
4
+
5
+ repo_root = File.expand_path('../..', __dir__)
6
+ crate_prefix = 'packages/ruby/'
7
+ git_cmd = %(git -C "#{repo_root}" ls-files -z #{crate_prefix})
8
+ git_files =
9
+ `#{git_cmd}`.split("\x0")
10
+ .select { |path| path.start_with?(crate_prefix) }
11
+ .map { |path| path.delete_prefix(crate_prefix) }
12
+ fallback_files = Dir.chdir(__dir__) do
13
+ Dir.glob(
14
+ %w[
15
+ README.md
16
+ ext/html-to-markdown-rb/extconf.rb
17
+ exe/*
18
+ lib/**/*.rb
19
+ lib/bin/*
20
+ src/**/*.rs
21
+ spec/**/*.rb
22
+ ]
23
+ )
24
+ end
25
+ files = git_files.empty? ? fallback_files : git_files
26
+
27
+ Gem::Specification.new do |spec|
28
+ spec.name = 'html-to-markdown'
29
+ spec.version = HtmlToMarkdown::VERSION
30
+ spec.authors = ["Na'aman Hirschfeld"]
31
+ spec.email = ['nhirschfeld@gmail.com']
32
+
33
+ spec.summary = 'Blazing-fast HTML to Markdown conversion for Ruby, powered by Rust.'
34
+ spec.description = <<~DESC.strip
35
+ html-to-markdown is a native Ruby extension built on the shared Rust engine that powers the html-to-markdown project.
36
+ It delivers identical HTML-to-Markdown output across languages, exposes inline image extraction, and ships with a CLI for automation workflows.
37
+ DESC
38
+ spec.homepage = 'https://github.com/Goldziher/html-to-markdown'
39
+ spec.license = 'MIT'
40
+
41
+ spec.required_ruby_version = Gem::Requirement.new('>= 3.2')
42
+
43
+ spec.bindir = 'exe'
44
+ spec.executables = ['html-to-markdown']
45
+ spec.require_paths = ['lib']
46
+
47
+ spec.files = files
48
+ spec.extra_rdoc_files = ['README.md']
49
+
50
+ spec.extensions = ['ext/html-to-markdown-rb/extconf.rb']
51
+
52
+ spec.add_dependency 'rb_sys', '>= 0.9', '< 1.0'
53
+ spec.metadata['rubygems_mfa_required'] = 'true'
54
+ spec.metadata['homepage_uri'] = 'https://github.com/Goldziher/html-to-markdown'
55
+ spec.metadata['source_code_uri'] = 'https://github.com/Goldziher/html-to-markdown'
56
+ spec.metadata['bug_tracker_uri'] = 'https://github.com/Goldziher/html-to-markdown/issues'
57
+ spec.metadata['changelog_uri'] = 'https://github.com/Goldziher/html-to-markdown/releases'
58
+ spec.metadata['documentation_uri'] = 'https://github.com/Goldziher/html-to-markdown/blob/main/packages/ruby/README.md'
59
+ end
@@ -1,21 +1,21 @@
1
- # frozen_string_literal: true
2
-
3
- require 'html_to_markdown/cli_proxy'
4
-
5
- module HtmlToMarkdown
6
- module CLI
7
- module_function
8
-
9
- def run(argv = ARGV, stdout: $stdout, stderr: $stderr)
10
- output = CLIProxy.call(argv)
11
- stdout.print(output)
12
- 0
13
- rescue CLIProxy::CLIExecutionError => e
14
- stderr.print(e.stderr)
15
- e.status || 1
16
- rescue CLIProxy::MissingBinaryError, CLIProxy::Error => e
17
- stderr.puts(e.message)
18
- 1
19
- end
20
- end
21
- end
1
+ # frozen_string_literal: true
2
+
3
+ require 'html_to_markdown/cli_proxy'
4
+
5
+ module HtmlToMarkdown
6
+ module CLI
7
+ module_function
8
+
9
+ def run(argv = ARGV, stdout: $stdout, stderr: $stderr)
10
+ output = CLIProxy.call(argv)
11
+ stdout.print(output)
12
+ 0
13
+ rescue CLIProxy::CLIExecutionError => e
14
+ stderr.print(e.stderr)
15
+ e.status || 1
16
+ rescue CLIProxy::MissingBinaryError, CLIProxy::Error => e
17
+ stderr.puts(e.message)
18
+ 1
19
+ end
20
+ end
21
+ end
@@ -1,71 +1,71 @@
1
- # frozen_string_literal: true
2
-
3
- require 'open3'
4
- require 'pathname'
5
-
6
- module HtmlToMarkdown
7
- module CLIProxy
8
- Error = Class.new(StandardError)
9
- MissingBinaryError = Class.new(Error)
10
-
11
- class CLIExecutionError < Error
12
- attr_reader :stderr, :status
13
-
14
- def initialize(message, stderr:, status:)
15
- super(message)
16
- @stderr = stderr
17
- @status = status
18
- end
19
- end
20
-
21
- module_function
22
-
23
- def call(argv)
24
- binary = find_cli_binary
25
- args = Array(argv).map(&:to_s)
26
- stdout, stderr, status = Open3.capture3(binary.to_s, *args)
27
- return stdout if status.success?
28
-
29
- raise CLIExecutionError.new(
30
- "html-to-markdown CLI exited with status #{status.exitstatus}",
31
- stderr: stderr,
32
- status: status.exitstatus
33
- )
34
- end
35
-
36
- def find_cli_binary
37
- binary_name = Gem.win_platform? ? 'html-to-markdown.exe' : 'html-to-markdown'
38
- found = search_paths(binary_name).find(&:file?)
39
- return found if found
40
-
41
- raise MissingBinaryError, missing_binary_message
42
- end
43
-
44
- def root_path
45
- @root_path ||= Pathname(__dir__).join('../..').expand_path
46
- end
47
-
48
- def lib_path
49
- @lib_path ||= Pathname(__dir__).join('..').expand_path
50
- end
51
-
52
- def search_paths(binary_name)
53
- paths = [
54
- root_path.join('target', 'release', binary_name),
55
- lib_path.join('bin', binary_name),
56
- lib_path.join(binary_name)
57
- ]
58
-
59
- workspace_root = root_path.parent&.parent
60
- paths << workspace_root.join('target', 'release', binary_name) if workspace_root
61
- paths
62
- end
63
-
64
- def missing_binary_message
65
- <<~MSG.strip
66
- html-to-markdown CLI binary not found. Build it with
67
- `cargo build --release --package html-to-markdown-cli`.
68
- MSG
69
- end
70
- end
71
- end
1
+ # frozen_string_literal: true
2
+
3
+ require 'open3'
4
+ require 'pathname'
5
+
6
+ module HtmlToMarkdown
7
+ module CLIProxy
8
+ Error = Class.new(StandardError)
9
+ MissingBinaryError = Class.new(Error)
10
+
11
+ class CLIExecutionError < Error
12
+ attr_reader :stderr, :status
13
+
14
+ def initialize(message, stderr:, status:)
15
+ super(message)
16
+ @stderr = stderr
17
+ @status = status
18
+ end
19
+ end
20
+
21
+ module_function
22
+
23
+ def call(argv)
24
+ binary = find_cli_binary
25
+ args = Array(argv).map(&:to_s)
26
+ stdout, stderr, status = Open3.capture3(binary.to_s, *args)
27
+ return stdout if status.success?
28
+
29
+ raise CLIExecutionError.new(
30
+ "html-to-markdown CLI exited with status #{status.exitstatus}",
31
+ stderr: stderr,
32
+ status: status.exitstatus
33
+ )
34
+ end
35
+
36
+ def find_cli_binary
37
+ binary_name = Gem.win_platform? ? 'html-to-markdown.exe' : 'html-to-markdown'
38
+ found = search_paths(binary_name).find(&:file?)
39
+ return found if found
40
+
41
+ raise MissingBinaryError, missing_binary_message
42
+ end
43
+
44
+ def root_path
45
+ @root_path ||= Pathname(__dir__).join('../..').expand_path
46
+ end
47
+
48
+ def lib_path
49
+ @lib_path ||= Pathname(__dir__).join('..').expand_path
50
+ end
51
+
52
+ def search_paths(binary_name)
53
+ paths = [
54
+ root_path.join('target', 'release', binary_name),
55
+ lib_path.join('bin', binary_name),
56
+ lib_path.join(binary_name)
57
+ ]
58
+
59
+ workspace_root = root_path.parent&.parent
60
+ paths << workspace_root.join('target', 'release', binary_name) if workspace_root
61
+ paths
62
+ end
63
+
64
+ def missing_binary_message
65
+ <<~MSG.strip
66
+ html-to-markdown CLI binary not found. Build it with
67
+ `cargo build --release --package html-to-markdown-cli`.
68
+ MSG
69
+ end
70
+ end
71
+ end
@@ -1,5 +1,5 @@
1
- # frozen_string_literal: true
2
-
3
- module HtmlToMarkdown
4
- VERSION = '2.6.3'
5
- end
1
+ # frozen_string_literal: true
2
+
3
+ module HtmlToMarkdown
4
+ VERSION = '2.6.4'
5
+ end
@@ -1,24 +1,24 @@
1
- # frozen_string_literal: true
2
-
3
- require_relative 'html_to_markdown/version'
4
- require 'html_to_markdown_rb'
5
-
6
- module HtmlToMarkdown
7
- autoload :CLI, 'html_to_markdown/cli'
8
- autoload :CLIProxy, 'html_to_markdown/cli_proxy'
9
-
10
- class << self
11
- alias native_convert convert
12
- alias native_convert_with_inline_images convert_with_inline_images
13
- end
14
-
15
- module_function
16
-
17
- def convert(html, options = nil)
18
- native_convert(html.to_s, options)
19
- end
20
-
21
- def convert_with_inline_images(html, options = nil, image_config = nil)
22
- native_convert_with_inline_images(html.to_s, options, image_config)
23
- end
24
- end
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'html_to_markdown/version'
4
+ require 'html_to_markdown_rb'
5
+
6
+ module HtmlToMarkdown
7
+ autoload :CLI, 'html_to_markdown/cli'
8
+ autoload :CLIProxy, 'html_to_markdown/cli_proxy'
9
+
10
+ class << self
11
+ alias native_convert convert
12
+ alias native_convert_with_inline_images convert_with_inline_images
13
+ end
14
+
15
+ module_function
16
+
17
+ def convert(html, options = nil)
18
+ native_convert(html.to_s, options)
19
+ end
20
+
21
+ def convert_with_inline_images(html, options = nil, image_config = nil)
22
+ native_convert_with_inline_images(html.to_s, options, image_config)
23
+ end
24
+ end
@@ -1,42 +1,42 @@
1
- # frozen_string_literal: true
2
-
3
- require 'spec_helper'
4
- require 'html_to_markdown/cli_proxy'
5
- require 'html_to_markdown/cli'
6
- require 'stringio'
7
-
8
- RSpec.describe HtmlToMarkdown::CLIProxy do
9
- describe '.call' do
10
- it 'executes the CLI binary' do
11
- begin
12
- binary = described_class.find_cli_binary
13
- rescue HtmlToMarkdown::CLIProxy::MissingBinaryError
14
- skip 'CLI binary not built'
15
- end
16
-
17
- expect(binary).to be_file
18
-
19
- output = described_class.call(['--version'])
20
- expect(output).to include(HtmlToMarkdown::VERSION)
21
- end
22
- end
23
-
24
- describe HtmlToMarkdown::CLI do
25
- it 'writes CLI output to stdout' do
26
- begin
27
- HtmlToMarkdown::CLIProxy.find_cli_binary
28
- rescue HtmlToMarkdown::CLIProxy::MissingBinaryError
29
- skip 'CLI binary not built'
30
- end
31
-
32
- stdout = StringIO.new
33
- stderr = StringIO.new
34
-
35
- exit_code = described_class.run(['--version'], stdout: stdout, stderr: stderr)
36
-
37
- expect(exit_code).to eq(0)
38
- expect(stdout.string).to include(HtmlToMarkdown::VERSION)
39
- expect(stderr.string).to be_empty
40
- end
41
- end
42
- end
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+ require 'html_to_markdown/cli_proxy'
5
+ require 'html_to_markdown/cli'
6
+ require 'stringio'
7
+
8
+ RSpec.describe HtmlToMarkdown::CLIProxy do
9
+ describe '.call' do
10
+ it 'executes the CLI binary' do
11
+ begin
12
+ binary = described_class.find_cli_binary
13
+ rescue HtmlToMarkdown::CLIProxy::MissingBinaryError
14
+ skip 'CLI binary not built'
15
+ end
16
+
17
+ expect(binary).to be_file
18
+
19
+ output = described_class.call(['--version'])
20
+ expect(output).to include(HtmlToMarkdown::VERSION)
21
+ end
22
+ end
23
+
24
+ describe HtmlToMarkdown::CLI do
25
+ it 'writes CLI output to stdout' do
26
+ begin
27
+ HtmlToMarkdown::CLIProxy.find_cli_binary
28
+ rescue HtmlToMarkdown::CLIProxy::MissingBinaryError
29
+ skip 'CLI binary not built'
30
+ end
31
+
32
+ stdout = StringIO.new
33
+ stderr = StringIO.new
34
+
35
+ exit_code = described_class.run(['--version'], stdout: stdout, stderr: stderr)
36
+
37
+ expect(exit_code).to eq(0)
38
+ expect(stdout.string).to include(HtmlToMarkdown::VERSION)
39
+ expect(stderr.string).to be_empty
40
+ end
41
+ end
42
+ end
data/spec/convert_spec.rb CHANGED
@@ -1,29 +1,29 @@
1
- # frozen_string_literal: true
2
-
3
- require 'spec_helper'
4
-
5
- RSpec.describe HtmlToMarkdown do
6
- describe '.convert' do
7
- it 'converts simple headings' do
8
- expect(described_class.convert('<h1>Hello</h1>')).to eq("# Hello\n")
9
- end
10
-
11
- it 'accepts options hash' do
12
- result = described_class.convert(
13
- '<h1>Hello</h1>',
14
- heading_style: :atx_closed,
15
- default_title: true
16
- )
17
- expect(result).to include('Hello')
18
- end
19
- end
20
-
21
- describe '.convert_with_inline_images' do
22
- it 'returns inline images metadata' do
23
- html = '<p><img src="" alt="fake"></p>'
24
- extraction = described_class.convert_with_inline_images(html)
25
- expect(extraction).to include(:markdown, :inline_images, :warnings)
26
- expect(extraction[:inline_images].first[:description]).to eq('fake')
27
- end
28
- end
29
- end
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+
5
+ RSpec.describe HtmlToMarkdown do
6
+ describe '.convert' do
7
+ it 'converts simple headings' do
8
+ expect(described_class.convert('<h1>Hello</h1>')).to eq("# Hello\n")
9
+ end
10
+
11
+ it 'accepts options hash' do
12
+ result = described_class.convert(
13
+ '<h1>Hello</h1>',
14
+ heading_style: :atx_closed,
15
+ default_title: true
16
+ )
17
+ expect(result).to include('Hello')
18
+ end
19
+ end
20
+
21
+ describe '.convert_with_inline_images' do
22
+ it 'returns inline images metadata' do
23
+ html = '<p><img src="" alt="fake"></p>'
24
+ extraction = described_class.convert_with_inline_images(html)
25
+ expect(extraction).to include(:markdown, :inline_images, :warnings)
26
+ expect(extraction[:inline_images].first[:description]).to eq('fake')
27
+ end
28
+ end
29
+ end
data/spec/spec_helper.rb CHANGED
@@ -1,10 +1,10 @@
1
- # frozen_string_literal: true
2
-
3
- require 'bundler/setup'
4
- require 'html_to_markdown'
5
-
6
- RSpec.configure do |config|
7
- config.expect_with :rspec do |c|
8
- c.syntax = :expect
9
- end
10
- end
1
+ # frozen_string_literal: true
2
+
3
+ require 'bundler/setup'
4
+ require 'html_to_markdown'
5
+
6
+ RSpec.configure do |config|
7
+ config.expect_with :rspec do |c|
8
+ c.syntax = :expect
9
+ end
10
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html-to-markdown
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.6.3
4
+ version: 2.6.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld