html-to-markdown 2.6.2 → 2.6.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +29 -29
- data/Gemfile +15 -15
- data/Gemfile.lock +2 -1
- data/README.md +210 -210
- data/Rakefile +24 -24
- data/exe/html-to-markdown +6 -6
- data/ext/html-to-markdown-rb/extconf.rb +28 -28
- data/html-to-markdown-rb.gemspec +59 -59
- data/lib/html_to_markdown/cli.rb +21 -21
- data/lib/html_to_markdown/cli_proxy.rb +71 -71
- data/lib/html_to_markdown/version.rb +5 -5
- data/lib/html_to_markdown.rb +24 -24
- data/spec/cli_proxy_spec.rb +42 -42
- data/spec/convert_spec.rb +29 -29
- data/spec/spec_helper.rb +10 -10
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ce9f59ee100c4d4b3de28c61803fd2940681d3bc8bd46e1730598269e6283e1b
|
|
4
|
+
data.tar.gz: 52619ee971d3f51f040bd05a516517fd6082ea0ff10395dcb239f7081055234c
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: ee2c1b632ba2ec4edb5b449b6f9b9050229d15858b6434bb76ea327ca4b129f02a629d4e10daf1d36c86c2725a3aa431484aa92ecff7503b7adfc73b83697bb5
|
|
7
|
+
data.tar.gz: b2e48f4f60f8bd05bc32621ca8fb43e27886ae220a98a7ee378a1ac0796d4dc7ed43fc70ffe6391c53da20e2b85324e4dbe0bc98922cbffe64471b70f784a5a6
|
data/.rubocop.yml
CHANGED
|
@@ -1,29 +1,29 @@
|
|
|
1
|
-
plugins:
|
|
2
|
-
- rubocop-rspec
|
|
3
|
-
|
|
4
|
-
AllCops:
|
|
5
|
-
NewCops: enable
|
|
6
|
-
TargetRubyVersion: 3.2
|
|
7
|
-
Exclude:
|
|
8
|
-
- "tmp/**/*"
|
|
9
|
-
- "vendor/**/*"
|
|
10
|
-
|
|
11
|
-
Style/Documentation:
|
|
12
|
-
Enabled: false
|
|
13
|
-
|
|
14
|
-
Metrics/BlockLength:
|
|
15
|
-
Exclude:
|
|
16
|
-
- "spec/**/*"
|
|
17
|
-
- "*.gemspec"
|
|
18
|
-
|
|
19
|
-
Metrics/MethodLength:
|
|
20
|
-
Max: 15
|
|
21
|
-
|
|
22
|
-
RSpec/MultipleExpectations:
|
|
23
|
-
Enabled: false
|
|
24
|
-
|
|
25
|
-
RSpec/ExampleLength:
|
|
26
|
-
Enabled: false
|
|
27
|
-
|
|
28
|
-
RSpec/SpecFilePathFormat:
|
|
29
|
-
Enabled: false
|
|
1
|
+
plugins:
|
|
2
|
+
- rubocop-rspec
|
|
3
|
+
|
|
4
|
+
AllCops:
|
|
5
|
+
NewCops: enable
|
|
6
|
+
TargetRubyVersion: 3.2
|
|
7
|
+
Exclude:
|
|
8
|
+
- "tmp/**/*"
|
|
9
|
+
- "vendor/**/*"
|
|
10
|
+
|
|
11
|
+
Style/Documentation:
|
|
12
|
+
Enabled: false
|
|
13
|
+
|
|
14
|
+
Metrics/BlockLength:
|
|
15
|
+
Exclude:
|
|
16
|
+
- "spec/**/*"
|
|
17
|
+
- "*.gemspec"
|
|
18
|
+
|
|
19
|
+
Metrics/MethodLength:
|
|
20
|
+
Max: 15
|
|
21
|
+
|
|
22
|
+
RSpec/MultipleExpectations:
|
|
23
|
+
Enabled: false
|
|
24
|
+
|
|
25
|
+
RSpec/ExampleLength:
|
|
26
|
+
Enabled: false
|
|
27
|
+
|
|
28
|
+
RSpec/SpecFilePathFormat:
|
|
29
|
+
Enabled: false
|
data/Gemfile
CHANGED
|
@@ -1,15 +1,15 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
source 'https://rubygems.org'
|
|
4
|
-
|
|
5
|
-
ruby '>= 3.2'
|
|
6
|
-
|
|
7
|
-
gemspec
|
|
8
|
-
|
|
9
|
-
group :development, :test do
|
|
10
|
-
gem 'rake-compiler'
|
|
11
|
-
gem 'rb_sys' # provides build tooling when developing locally
|
|
12
|
-
gem 'rspec'
|
|
13
|
-
gem 'rubocop', require: false
|
|
14
|
-
gem 'rubocop-rspec', require: false
|
|
15
|
-
end
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
source 'https://rubygems.org'
|
|
4
|
+
|
|
5
|
+
ruby '>= 3.2'
|
|
6
|
+
|
|
7
|
+
gemspec
|
|
8
|
+
|
|
9
|
+
group :development, :test do
|
|
10
|
+
gem 'rake-compiler'
|
|
11
|
+
gem 'rb_sys' # provides build tooling when developing locally
|
|
12
|
+
gem 'rspec'
|
|
13
|
+
gem 'rubocop', require: false
|
|
14
|
+
gem 'rubocop-rspec', require: false
|
|
15
|
+
end
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
|
@@ -1,210 +1,210 @@
|
|
|
1
|
-
# html-to-markdown-rb
|
|
2
|
-
|
|
3
|
-
Blazing-fast HTML → Markdown conversion for Ruby, powered by the same Rust engine used by our Python, Node.js, and WebAssembly packages. Ship identical Markdown across every runtime while enjoying native extension performance.
|
|
4
|
-
|
|
5
|
-
[](https://crates.io/crates/html-to-markdown-rs)
|
|
6
|
-
[](https://www.npmjs.com/package/html-to-markdown-node)
|
|
7
|
-
[](https://www.npmjs.com/package/html-to-markdown-wasm)
|
|
8
|
-
[](https://www.npmjs.com/package/html-to-markdown)
|
|
9
|
-
[](https://pypi.org/project/html-to-markdown/)
|
|
10
|
-
[](https://packagist.org/packages/goldziher/html-to-markdown)
|
|
11
|
-
[](https://rubygems.org/gems/html-to-markdown)
|
|
12
|
-
[](https://github.com/Goldziher/html-to-markdown/blob/main/LICENSE)
|
|
13
|
-
|
|
14
|
-
## Features
|
|
15
|
-
|
|
16
|
-
- ⚡ **Rust-fast**: Ruby bindings around a highly optimised Rust core (60‑80× faster than BeautifulSoup-based converters).
|
|
17
|
-
- 🔁 **Identical output**: Shares logic with the Python wheels, npm bindings, WASM package, and CLI — consistent Markdown everywhere.
|
|
18
|
-
- ⚙️ **Rich configuration**: Control heading styles, list indentation, whitespace handling, HTML preprocessing, and more.
|
|
19
|
-
- 🖼️ **Inline image extraction**: Pull out embedded images (PNG/JPEG/SVG/data URIs) alongside Markdown.
|
|
20
|
-
- 🧰 **Bundled CLI proxy**: Call the Rust CLI straight from Ruby or shell scripts.
|
|
21
|
-
- 🛠️ **First-class Rails support**: Works with `Gem.win_platform?` builds, supports Trusted Publishing, and compiles on install if no native gem matches.
|
|
22
|
-
|
|
23
|
-
## Documentation & Support
|
|
24
|
-
|
|
25
|
-
- [GitHub repository](https://github.com/Goldziher/html-to-markdown)
|
|
26
|
-
- [Issue tracker](https://github.com/Goldziher/html-to-markdown/issues)
|
|
27
|
-
- [Changelog](https://github.com/Goldziher/html-to-markdown/blob/main/CHANGELOG.md)
|
|
28
|
-
- [Live demo (WASM)](https://goldziher.github.io/html-to-markdown/)
|
|
29
|
-
|
|
30
|
-
## Installation
|
|
31
|
-
|
|
32
|
-
```bash
|
|
33
|
-
bundle add html-to-markdown
|
|
34
|
-
# or
|
|
35
|
-
gem install html-to-markdown
|
|
36
|
-
```
|
|
37
|
-
|
|
38
|
-
Add the gem to your project and Bundler will compile the native Rust extension on first install.
|
|
39
|
-
|
|
40
|
-
### Requirements
|
|
41
|
-
|
|
42
|
-
- Ruby **3.2+** (Magnus relies on the fiber scheduler APIs added in 3.2)
|
|
43
|
-
- Rust toolchain **1.85+** with Cargo available on your `$PATH`
|
|
44
|
-
- Ruby development headers (`ruby-dev`, `ruby-devel`, or the platform equivalent)
|
|
45
|
-
|
|
46
|
-
**Windows**: install [RubyInstaller with MSYS2](https://rubyinstaller.org/) (UCRT64). Run once:
|
|
47
|
-
|
|
48
|
-
```powershell
|
|
49
|
-
ridk exec pacman -S --needed --noconfirm base-devel mingw-w64-ucrt-x86_64-toolchain
|
|
50
|
-
```
|
|
51
|
-
|
|
52
|
-
This provides the standard headers (including `strings.h`) required for the bindgen step.
|
|
53
|
-
|
|
54
|
-
## Performance Snapshot
|
|
55
|
-
|
|
56
|
-
Apple M4 • Real Wikipedia documents • `HtmlToMarkdown.convert` (Ruby)
|
|
57
|
-
|
|
58
|
-
| Document | Size | Latency | Throughput | Docs/sec |
|
|
59
|
-
| ------------------- | ----- | ------- | ---------- | -------- |
|
|
60
|
-
| Lists (Timeline) | 129KB | 0.69ms | 187 MB/s | 1,450 |
|
|
61
|
-
| Tables (Countries) | 360KB | 2.19ms | 164 MB/s | 456 |
|
|
62
|
-
| Mixed (Python wiki) | 656KB | 4.88ms | 134 MB/s | 205 |
|
|
63
|
-
|
|
64
|
-
> Same core, same benchmarks: the Ruby extension stays within single-digit % of the Rust CLI and mirrors the Python/Node numbers.
|
|
65
|
-
|
|
66
|
-
## Quick Start
|
|
67
|
-
|
|
68
|
-
```ruby
|
|
69
|
-
require 'html_to_markdown'
|
|
70
|
-
|
|
71
|
-
html = <<~HTML
|
|
72
|
-
<h1>Welcome</h1>
|
|
73
|
-
<p>This is <strong>Rust-fast</strong> conversion!</p>
|
|
74
|
-
<ul>
|
|
75
|
-
<li>Native extension</li>
|
|
76
|
-
<li>Identical output across languages</li>
|
|
77
|
-
</ul>
|
|
78
|
-
HTML
|
|
79
|
-
|
|
80
|
-
markdown = HtmlToMarkdown.convert(html)
|
|
81
|
-
puts markdown
|
|
82
|
-
# # Welcome
|
|
83
|
-
#
|
|
84
|
-
# This is **Rust-fast** conversion!
|
|
85
|
-
#
|
|
86
|
-
# - Native extension
|
|
87
|
-
# - Identical output across languages
|
|
88
|
-
```
|
|
89
|
-
|
|
90
|
-
## API
|
|
91
|
-
|
|
92
|
-
### Conversion Options
|
|
93
|
-
|
|
94
|
-
Pass a Ruby hash (string or symbol keys) to tweak rendering. Every option maps one-for-one with the Rust/Python/Node APIs.
|
|
95
|
-
|
|
96
|
-
```ruby
|
|
97
|
-
require 'html_to_markdown'
|
|
98
|
-
|
|
99
|
-
markdown = HtmlToMarkdown.convert(
|
|
100
|
-
'<pre><code class="language-ruby">puts "hi"</code></pre>',
|
|
101
|
-
heading_style: :atx,
|
|
102
|
-
code_block_style: :fenced,
|
|
103
|
-
bullets: '*+-',
|
|
104
|
-
list_indent_type: :spaces,
|
|
105
|
-
list_indent_width: 2,
|
|
106
|
-
whitespace_mode: :normalized,
|
|
107
|
-
highlight_style: :double_equal
|
|
108
|
-
)
|
|
109
|
-
|
|
110
|
-
puts markdown
|
|
111
|
-
```
|
|
112
|
-
|
|
113
|
-
### HTML Preprocessing
|
|
114
|
-
|
|
115
|
-
Clean up scraped HTML (navigation, forms, malformed markup) before conversion:
|
|
116
|
-
|
|
117
|
-
```ruby
|
|
118
|
-
require 'html_to_markdown'
|
|
119
|
-
|
|
120
|
-
markdown = HtmlToMarkdown.convert(
|
|
121
|
-
html,
|
|
122
|
-
preprocessing: {
|
|
123
|
-
enabled: true,
|
|
124
|
-
preset: :aggressive, # :minimal, :standard, :aggressive
|
|
125
|
-
remove_navigation: true,
|
|
126
|
-
remove_forms: true
|
|
127
|
-
}
|
|
128
|
-
)
|
|
129
|
-
```
|
|
130
|
-
|
|
131
|
-
### Inline Images
|
|
132
|
-
|
|
133
|
-
Extract inline binary data (data URIs, SVG) together with the converted Markdown.
|
|
134
|
-
|
|
135
|
-
```ruby
|
|
136
|
-
require 'html_to_markdown'
|
|
137
|
-
|
|
138
|
-
result = HtmlToMarkdown.convert_with_inline_images(
|
|
139
|
-
'<img src="data:image/png;base64,iVBORw0..." alt="Pixel">',
|
|
140
|
-
image_config: {
|
|
141
|
-
max_decoded_size_bytes: 1 * 1024 * 1024,
|
|
142
|
-
infer_dimensions: true,
|
|
143
|
-
filename_prefix: 'img_',
|
|
144
|
-
capture_svg: true
|
|
145
|
-
}
|
|
146
|
-
)
|
|
147
|
-
|
|
148
|
-
puts result.markdown
|
|
149
|
-
result.inline_images.each do |img|
|
|
150
|
-
puts "#{img.filename} -> #{img.format} (#{img.data.bytesize} bytes)"
|
|
151
|
-
end
|
|
152
|
-
```
|
|
153
|
-
|
|
154
|
-
## CLI
|
|
155
|
-
|
|
156
|
-
The gem bundles a small proxy for the Rust CLI binary. Use it when you need parity with the standalone `html-to-markdown` executable.
|
|
157
|
-
|
|
158
|
-
```ruby
|
|
159
|
-
require 'html_to_markdown/cli'
|
|
160
|
-
|
|
161
|
-
HtmlToMarkdown::CLI.run(%w[--heading-style atx input.html], stdout: $stdout)
|
|
162
|
-
# => writes converted Markdown to STDOUT
|
|
163
|
-
```
|
|
164
|
-
|
|
165
|
-
You can also call the CLI binary directly for scripting:
|
|
166
|
-
|
|
167
|
-
```ruby
|
|
168
|
-
HtmlToMarkdown::CLIProxy.call(['--version'])
|
|
169
|
-
# => "html-to-markdown 2.5.7"
|
|
170
|
-
```
|
|
171
|
-
|
|
172
|
-
Rebuild the CLI locally if you see `CLI binary not built` during tests:
|
|
173
|
-
|
|
174
|
-
```bash
|
|
175
|
-
bundle exec rake compile # builds the extension
|
|
176
|
-
bundle exec ruby scripts/prepare_ruby_gem.rb # copies the CLI into lib/bin/
|
|
177
|
-
```
|
|
178
|
-
|
|
179
|
-
## Error Handling
|
|
180
|
-
|
|
181
|
-
Conversion errors raise `HtmlToMarkdown::Error` (wrapping the Rust error context). CLI invocations use specialised subclasses:
|
|
182
|
-
|
|
183
|
-
- `HtmlToMarkdown::CLIProxy::MissingBinaryError`
|
|
184
|
-
- `HtmlToMarkdown::CLIProxy::CLIExecutionError`
|
|
185
|
-
|
|
186
|
-
Rescue them to provide clearer feedback in your application.
|
|
187
|
-
|
|
188
|
-
## Consistent Across Languages
|
|
189
|
-
|
|
190
|
-
The Ruby gem shares the exact Rust core with:
|
|
191
|
-
|
|
192
|
-
- [Python wheels](https://pypi.org/project/html-to-markdown/)
|
|
193
|
-
- [Node.js / Bun bindings](https://www.npmjs.com/package/html-to-markdown-node)
|
|
194
|
-
- [WebAssembly package](https://www.npmjs.com/package/html-to-markdown-wasm)
|
|
195
|
-
- The Rust crate and CLI
|
|
196
|
-
|
|
197
|
-
Use whichever runtime fits your stack while keeping formatting behaviour identical.
|
|
198
|
-
|
|
199
|
-
## Development
|
|
200
|
-
|
|
201
|
-
```bash
|
|
202
|
-
bundle exec rake compile # build the native extension
|
|
203
|
-
bundle exec rspec # run test suite
|
|
204
|
-
```
|
|
205
|
-
|
|
206
|
-
The extension uses [Magnus](https://github.com/matsadler/magnus) plus `rb-sys` for bindgen. When editing the Rust code under `src/`, rerun `rake compile`.
|
|
207
|
-
|
|
208
|
-
## License
|
|
209
|
-
|
|
210
|
-
MIT © Na'aman Hirschfeld
|
|
1
|
+
# html-to-markdown-rb
|
|
2
|
+
|
|
3
|
+
Blazing-fast HTML → Markdown conversion for Ruby, powered by the same Rust engine used by our Python, Node.js, and WebAssembly packages. Ship identical Markdown across every runtime while enjoying native extension performance.
|
|
4
|
+
|
|
5
|
+
[](https://crates.io/crates/html-to-markdown-rs)
|
|
6
|
+
[](https://www.npmjs.com/package/html-to-markdown-node)
|
|
7
|
+
[](https://www.npmjs.com/package/html-to-markdown-wasm)
|
|
8
|
+
[](https://www.npmjs.com/package/html-to-markdown)
|
|
9
|
+
[](https://pypi.org/project/html-to-markdown/)
|
|
10
|
+
[](https://packagist.org/packages/goldziher/html-to-markdown)
|
|
11
|
+
[](https://rubygems.org/gems/html-to-markdown)
|
|
12
|
+
[](https://github.com/Goldziher/html-to-markdown/blob/main/LICENSE)
|
|
13
|
+
|
|
14
|
+
## Features
|
|
15
|
+
|
|
16
|
+
- ⚡ **Rust-fast**: Ruby bindings around a highly optimised Rust core (60‑80× faster than BeautifulSoup-based converters).
|
|
17
|
+
- 🔁 **Identical output**: Shares logic with the Python wheels, npm bindings, WASM package, and CLI — consistent Markdown everywhere.
|
|
18
|
+
- ⚙️ **Rich configuration**: Control heading styles, list indentation, whitespace handling, HTML preprocessing, and more.
|
|
19
|
+
- 🖼️ **Inline image extraction**: Pull out embedded images (PNG/JPEG/SVG/data URIs) alongside Markdown.
|
|
20
|
+
- 🧰 **Bundled CLI proxy**: Call the Rust CLI straight from Ruby or shell scripts.
|
|
21
|
+
- 🛠️ **First-class Rails support**: Works with `Gem.win_platform?` builds, supports Trusted Publishing, and compiles on install if no native gem matches.
|
|
22
|
+
|
|
23
|
+
## Documentation & Support
|
|
24
|
+
|
|
25
|
+
- [GitHub repository](https://github.com/Goldziher/html-to-markdown)
|
|
26
|
+
- [Issue tracker](https://github.com/Goldziher/html-to-markdown/issues)
|
|
27
|
+
- [Changelog](https://github.com/Goldziher/html-to-markdown/blob/main/CHANGELOG.md)
|
|
28
|
+
- [Live demo (WASM)](https://goldziher.github.io/html-to-markdown/)
|
|
29
|
+
|
|
30
|
+
## Installation
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
bundle add html-to-markdown
|
|
34
|
+
# or
|
|
35
|
+
gem install html-to-markdown
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Add the gem to your project and Bundler will compile the native Rust extension on first install.
|
|
39
|
+
|
|
40
|
+
### Requirements
|
|
41
|
+
|
|
42
|
+
- Ruby **3.2+** (Magnus relies on the fiber scheduler APIs added in 3.2)
|
|
43
|
+
- Rust toolchain **1.85+** with Cargo available on your `$PATH`
|
|
44
|
+
- Ruby development headers (`ruby-dev`, `ruby-devel`, or the platform equivalent)
|
|
45
|
+
|
|
46
|
+
**Windows**: install [RubyInstaller with MSYS2](https://rubyinstaller.org/) (UCRT64). Run once:
|
|
47
|
+
|
|
48
|
+
```powershell
|
|
49
|
+
ridk exec pacman -S --needed --noconfirm base-devel mingw-w64-ucrt-x86_64-toolchain
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
This provides the standard headers (including `strings.h`) required for the bindgen step.
|
|
53
|
+
|
|
54
|
+
## Performance Snapshot
|
|
55
|
+
|
|
56
|
+
Apple M4 • Real Wikipedia documents • `HtmlToMarkdown.convert` (Ruby)
|
|
57
|
+
|
|
58
|
+
| Document | Size | Latency | Throughput | Docs/sec |
|
|
59
|
+
| ------------------- | ----- | ------- | ---------- | -------- |
|
|
60
|
+
| Lists (Timeline) | 129KB | 0.69ms | 187 MB/s | 1,450 |
|
|
61
|
+
| Tables (Countries) | 360KB | 2.19ms | 164 MB/s | 456 |
|
|
62
|
+
| Mixed (Python wiki) | 656KB | 4.88ms | 134 MB/s | 205 |
|
|
63
|
+
|
|
64
|
+
> Same core, same benchmarks: the Ruby extension stays within single-digit % of the Rust CLI and mirrors the Python/Node numbers.
|
|
65
|
+
|
|
66
|
+
## Quick Start
|
|
67
|
+
|
|
68
|
+
```ruby
|
|
69
|
+
require 'html_to_markdown'
|
|
70
|
+
|
|
71
|
+
html = <<~HTML
|
|
72
|
+
<h1>Welcome</h1>
|
|
73
|
+
<p>This is <strong>Rust-fast</strong> conversion!</p>
|
|
74
|
+
<ul>
|
|
75
|
+
<li>Native extension</li>
|
|
76
|
+
<li>Identical output across languages</li>
|
|
77
|
+
</ul>
|
|
78
|
+
HTML
|
|
79
|
+
|
|
80
|
+
markdown = HtmlToMarkdown.convert(html)
|
|
81
|
+
puts markdown
|
|
82
|
+
# # Welcome
|
|
83
|
+
#
|
|
84
|
+
# This is **Rust-fast** conversion!
|
|
85
|
+
#
|
|
86
|
+
# - Native extension
|
|
87
|
+
# - Identical output across languages
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## API
|
|
91
|
+
|
|
92
|
+
### Conversion Options
|
|
93
|
+
|
|
94
|
+
Pass a Ruby hash (string or symbol keys) to tweak rendering. Every option maps one-for-one with the Rust/Python/Node APIs.
|
|
95
|
+
|
|
96
|
+
```ruby
|
|
97
|
+
require 'html_to_markdown'
|
|
98
|
+
|
|
99
|
+
markdown = HtmlToMarkdown.convert(
|
|
100
|
+
'<pre><code class="language-ruby">puts "hi"</code></pre>',
|
|
101
|
+
heading_style: :atx,
|
|
102
|
+
code_block_style: :fenced,
|
|
103
|
+
bullets: '*+-',
|
|
104
|
+
list_indent_type: :spaces,
|
|
105
|
+
list_indent_width: 2,
|
|
106
|
+
whitespace_mode: :normalized,
|
|
107
|
+
highlight_style: :double_equal
|
|
108
|
+
)
|
|
109
|
+
|
|
110
|
+
puts markdown
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
### HTML Preprocessing
|
|
114
|
+
|
|
115
|
+
Clean up scraped HTML (navigation, forms, malformed markup) before conversion:
|
|
116
|
+
|
|
117
|
+
```ruby
|
|
118
|
+
require 'html_to_markdown'
|
|
119
|
+
|
|
120
|
+
markdown = HtmlToMarkdown.convert(
|
|
121
|
+
html,
|
|
122
|
+
preprocessing: {
|
|
123
|
+
enabled: true,
|
|
124
|
+
preset: :aggressive, # :minimal, :standard, :aggressive
|
|
125
|
+
remove_navigation: true,
|
|
126
|
+
remove_forms: true
|
|
127
|
+
}
|
|
128
|
+
)
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### Inline Images
|
|
132
|
+
|
|
133
|
+
Extract inline binary data (data URIs, SVG) together with the converted Markdown.
|
|
134
|
+
|
|
135
|
+
```ruby
|
|
136
|
+
require 'html_to_markdown'
|
|
137
|
+
|
|
138
|
+
result = HtmlToMarkdown.convert_with_inline_images(
|
|
139
|
+
'<img src="data:image/png;base64,iVBORw0..." alt="Pixel">',
|
|
140
|
+
image_config: {
|
|
141
|
+
max_decoded_size_bytes: 1 * 1024 * 1024,
|
|
142
|
+
infer_dimensions: true,
|
|
143
|
+
filename_prefix: 'img_',
|
|
144
|
+
capture_svg: true
|
|
145
|
+
}
|
|
146
|
+
)
|
|
147
|
+
|
|
148
|
+
puts result.markdown
|
|
149
|
+
result.inline_images.each do |img|
|
|
150
|
+
puts "#{img.filename} -> #{img.format} (#{img.data.bytesize} bytes)"
|
|
151
|
+
end
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
## CLI
|
|
155
|
+
|
|
156
|
+
The gem bundles a small proxy for the Rust CLI binary. Use it when you need parity with the standalone `html-to-markdown` executable.
|
|
157
|
+
|
|
158
|
+
```ruby
|
|
159
|
+
require 'html_to_markdown/cli'
|
|
160
|
+
|
|
161
|
+
HtmlToMarkdown::CLI.run(%w[--heading-style atx input.html], stdout: $stdout)
|
|
162
|
+
# => writes converted Markdown to STDOUT
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
You can also call the CLI binary directly for scripting:
|
|
166
|
+
|
|
167
|
+
```ruby
|
|
168
|
+
HtmlToMarkdown::CLIProxy.call(['--version'])
|
|
169
|
+
# => "html-to-markdown 2.5.7"
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
Rebuild the CLI locally if you see `CLI binary not built` during tests:
|
|
173
|
+
|
|
174
|
+
```bash
|
|
175
|
+
bundle exec rake compile # builds the extension
|
|
176
|
+
bundle exec ruby scripts/prepare_ruby_gem.rb # copies the CLI into lib/bin/
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
## Error Handling
|
|
180
|
+
|
|
181
|
+
Conversion errors raise `HtmlToMarkdown::Error` (wrapping the Rust error context). CLI invocations use specialised subclasses:
|
|
182
|
+
|
|
183
|
+
- `HtmlToMarkdown::CLIProxy::MissingBinaryError`
|
|
184
|
+
- `HtmlToMarkdown::CLIProxy::CLIExecutionError`
|
|
185
|
+
|
|
186
|
+
Rescue them to provide clearer feedback in your application.
|
|
187
|
+
|
|
188
|
+
## Consistent Across Languages
|
|
189
|
+
|
|
190
|
+
The Ruby gem shares the exact Rust core with:
|
|
191
|
+
|
|
192
|
+
- [Python wheels](https://pypi.org/project/html-to-markdown/)
|
|
193
|
+
- [Node.js / Bun bindings](https://www.npmjs.com/package/html-to-markdown-node)
|
|
194
|
+
- [WebAssembly package](https://www.npmjs.com/package/html-to-markdown-wasm)
|
|
195
|
+
- The Rust crate and CLI
|
|
196
|
+
|
|
197
|
+
Use whichever runtime fits your stack while keeping formatting behaviour identical.
|
|
198
|
+
|
|
199
|
+
## Development
|
|
200
|
+
|
|
201
|
+
```bash
|
|
202
|
+
bundle exec rake compile # build the native extension
|
|
203
|
+
bundle exec rspec # run test suite
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
The extension uses [Magnus](https://github.com/matsadler/magnus) plus `rb-sys` for bindgen. When editing the Rust code under `src/`, rerun `rake compile`.
|
|
207
|
+
|
|
208
|
+
## License
|
|
209
|
+
|
|
210
|
+
MIT © Na'aman Hirschfeld
|
data/Rakefile
CHANGED
|
@@ -1,24 +1,24 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'bundler/gem_tasks'
|
|
4
|
-
require 'rb_sys/extensiontask'
|
|
5
|
-
require 'rspec/core/rake_task'
|
|
6
|
-
|
|
7
|
-
GEMSPEC = Gem::Specification.load(File.expand_path('html-to-markdown-rb.gemspec', __dir__))
|
|
8
|
-
|
|
9
|
-
RbSys::ExtensionTask.new('html-to-markdown-rb', GEMSPEC) do |ext|
|
|
10
|
-
ext.lib_dir = 'lib'
|
|
11
|
-
ext.ext_dir = 'ext/html-to-markdown-rb'
|
|
12
|
-
ext.cross_compile = true
|
|
13
|
-
ext.cross_platform = %w[
|
|
14
|
-
x86_64-linux
|
|
15
|
-
x86_64-darwin
|
|
16
|
-
arm64-darwin
|
|
17
|
-
x64-mingw32
|
|
18
|
-
]
|
|
19
|
-
end
|
|
20
|
-
|
|
21
|
-
RSpec::Core::RakeTask.new(:spec)
|
|
22
|
-
|
|
23
|
-
task spec: :compile
|
|
24
|
-
task default: :spec
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'bundler/gem_tasks'
|
|
4
|
+
require 'rb_sys/extensiontask'
|
|
5
|
+
require 'rspec/core/rake_task'
|
|
6
|
+
|
|
7
|
+
GEMSPEC = Gem::Specification.load(File.expand_path('html-to-markdown-rb.gemspec', __dir__))
|
|
8
|
+
|
|
9
|
+
RbSys::ExtensionTask.new('html-to-markdown-rb', GEMSPEC) do |ext|
|
|
10
|
+
ext.lib_dir = 'lib'
|
|
11
|
+
ext.ext_dir = 'ext/html-to-markdown-rb'
|
|
12
|
+
ext.cross_compile = true
|
|
13
|
+
ext.cross_platform = %w[
|
|
14
|
+
x86_64-linux
|
|
15
|
+
x86_64-darwin
|
|
16
|
+
arm64-darwin
|
|
17
|
+
x64-mingw32
|
|
18
|
+
]
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
RSpec::Core::RakeTask.new(:spec)
|
|
22
|
+
|
|
23
|
+
task spec: :compile
|
|
24
|
+
task default: :spec
|
data/exe/html-to-markdown
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
#!/usr/bin/env ruby
|
|
2
|
-
# frozen_string_literal: true
|
|
3
|
-
|
|
4
|
-
require 'html_to_markdown/cli'
|
|
5
|
-
|
|
6
|
-
exit HtmlToMarkdown::CLI.run(ARGV)
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
require 'html_to_markdown/cli'
|
|
5
|
+
|
|
6
|
+
exit HtmlToMarkdown::CLI.run(ARGV)
|
data/ext/html-to-markdown-rb/extconf.rb
CHANGED
|
@@ -1,28 +1,28 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'mkmf'
|
|
4
|
-
require 'rb_sys/mkmf'
|
|
5
|
-
require 'rbconfig'
|
|
6
|
-
|
|
7
|
-
if RbConfig::CONFIG['host_os'] =~ /mswin|mingw/
|
|
8
|
-
devkit = ENV.fetch('RI_DEVKIT', nil)
|
|
9
|
-
prefix = ENV['MSYSTEM_PREFIX'] || '/ucrt64'
|
|
10
|
-
|
|
11
|
-
if devkit
|
|
12
|
-
sysroot = "#{devkit}#{prefix}".tr('\\\\', '/')
|
|
13
|
-
extra_args = [
|
|
14
|
-
'--target=x86_64-pc-windows-gnu',
|
|
15
|
-
"--sysroot=#{sysroot}"
|
|
16
|
-
]
|
|
17
|
-
|
|
18
|
-
existing = ENV['BINDGEN_EXTRA_CLANG_ARGS'].to_s.split(/\s+/)
|
|
19
|
-
ENV['BINDGEN_EXTRA_CLANG_ARGS'] = (existing + extra_args).uniq.join(' ')
|
|
20
|
-
end
|
|
21
|
-
end
|
|
22
|
-
|
|
23
|
-
default_profile = ENV.fetch('CARGO_PROFILE', 'release')
|
|
24
|
-
|
|
25
|
-
create_rust_makefile('html_to_markdown_rb') do |config|
|
|
26
|
-
config.profile = default_profile.to_sym
|
|
27
|
-
config.ext_dir = '../../../../crates/html-to-markdown-rb'
|
|
28
|
-
end
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'mkmf'
|
|
4
|
+
require 'rb_sys/mkmf'
|
|
5
|
+
require 'rbconfig'
|
|
6
|
+
|
|
7
|
+
if RbConfig::CONFIG['host_os'] =~ /mswin|mingw/
|
|
8
|
+
devkit = ENV.fetch('RI_DEVKIT', nil)
|
|
9
|
+
prefix = ENV['MSYSTEM_PREFIX'] || '/ucrt64'
|
|
10
|
+
|
|
11
|
+
if devkit
|
|
12
|
+
sysroot = "#{devkit}#{prefix}".tr('\\\\', '/')
|
|
13
|
+
extra_args = [
|
|
14
|
+
'--target=x86_64-pc-windows-gnu',
|
|
15
|
+
"--sysroot=#{sysroot}"
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
existing = ENV['BINDGEN_EXTRA_CLANG_ARGS'].to_s.split(/\s+/)
|
|
19
|
+
ENV['BINDGEN_EXTRA_CLANG_ARGS'] = (existing + extra_args).uniq.join(' ')
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
default_profile = ENV.fetch('CARGO_PROFILE', 'release')
|
|
24
|
+
|
|
25
|
+
create_rust_makefile('html_to_markdown_rb') do |config|
|
|
26
|
+
config.profile = default_profile.to_sym
|
|
27
|
+
config.ext_dir = '../../../../crates/html-to-markdown-rb'
|
|
28
|
+
end
|
data/html-to-markdown-rb.gemspec
CHANGED
|
@@ -1,59 +1,59 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require_relative 'lib/html_to_markdown/version'
|
|
4
|
-
|
|
5
|
-
repo_root = File.expand_path('../..', __dir__)
|
|
6
|
-
crate_prefix = 'packages/ruby/'
|
|
7
|
-
git_cmd = %(git -C "#{repo_root}" ls-files -z #{crate_prefix})
|
|
8
|
-
git_files =
|
|
9
|
-
`#{git_cmd}`.split("\x0")
|
|
10
|
-
.select { |path| path.start_with?(crate_prefix) }
|
|
11
|
-
.map { |path| path.delete_prefix(crate_prefix) }
|
|
12
|
-
fallback_files = Dir.chdir(__dir__) do
|
|
13
|
-
Dir.glob(
|
|
14
|
-
%w[
|
|
15
|
-
README.md
|
|
16
|
-
ext/html-to-markdown-rb/extconf.rb
|
|
17
|
-
exe/*
|
|
18
|
-
lib/**/*.rb
|
|
19
|
-
lib/bin/*
|
|
20
|
-
src/**/*.rs
|
|
21
|
-
spec/**/*.rb
|
|
22
|
-
]
|
|
23
|
-
)
|
|
24
|
-
end
|
|
25
|
-
files = git_files.empty? ? fallback_files : git_files
|
|
26
|
-
|
|
27
|
-
Gem::Specification.new do |spec|
|
|
28
|
-
spec.name = 'html-to-markdown'
|
|
29
|
-
spec.version = HtmlToMarkdown::VERSION
|
|
30
|
-
spec.authors = ["Na'aman Hirschfeld"]
|
|
31
|
-
spec.email = ['nhirschfeld@gmail.com']
|
|
32
|
-
|
|
33
|
-
spec.summary = 'Blazing-fast HTML to Markdown conversion for Ruby, powered by Rust.'
|
|
34
|
-
spec.description = <<~DESC.strip
|
|
35
|
-
html-to-markdown is a native Ruby extension built on the shared Rust engine that powers the html-to-markdown project.
|
|
36
|
-
It delivers identical HTML-to-Markdown output across languages, exposes inline image extraction, and ships with a CLI for automation workflows.
|
|
37
|
-
DESC
|
|
38
|
-
spec.homepage = 'https://github.com/Goldziher/html-to-markdown'
|
|
39
|
-
spec.license = 'MIT'
|
|
40
|
-
|
|
41
|
-
spec.required_ruby_version = Gem::Requirement.new('>= 3.2')
|
|
42
|
-
|
|
43
|
-
spec.bindir = 'exe'
|
|
44
|
-
spec.executables = ['html-to-markdown']
|
|
45
|
-
spec.require_paths = ['lib']
|
|
46
|
-
|
|
47
|
-
spec.files = files
|
|
48
|
-
spec.extra_rdoc_files = ['README.md']
|
|
49
|
-
|
|
50
|
-
spec.extensions = ['ext/html-to-markdown-rb/extconf.rb']
|
|
51
|
-
|
|
52
|
-
spec.add_dependency 'rb_sys', '>= 0.9', '< 1.0'
|
|
53
|
-
spec.metadata['rubygems_mfa_required'] = 'true'
|
|
54
|
-
spec.metadata['homepage_uri'] = 'https://github.com/Goldziher/html-to-markdown'
|
|
55
|
-
spec.metadata['source_code_uri'] = 'https://github.com/Goldziher/html-to-markdown'
|
|
56
|
-
spec.metadata['bug_tracker_uri'] = 'https://github.com/Goldziher/html-to-markdown/issues'
|
|
57
|
-
spec.metadata['changelog_uri'] = 'https://github.com/Goldziher/html-to-markdown/releases'
|
|
58
|
-
spec.metadata['documentation_uri'] = 'https://github.com/Goldziher/html-to-markdown/blob/main/packages/ruby/README.md'
|
|
59
|
-
end
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'lib/html_to_markdown/version'
|
|
4
|
+
|
|
5
|
+
repo_root = File.expand_path('../..', __dir__)
|
|
6
|
+
crate_prefix = 'packages/ruby/'
|
|
7
|
+
git_cmd = %(git -C "#{repo_root}" ls-files -z #{crate_prefix})
|
|
8
|
+
git_files =
|
|
9
|
+
`#{git_cmd}`.split("\x0")
|
|
10
|
+
.select { |path| path.start_with?(crate_prefix) }
|
|
11
|
+
.map { |path| path.delete_prefix(crate_prefix) }
|
|
12
|
+
fallback_files = Dir.chdir(__dir__) do
|
|
13
|
+
Dir.glob(
|
|
14
|
+
%w[
|
|
15
|
+
README.md
|
|
16
|
+
ext/html-to-markdown-rb/extconf.rb
|
|
17
|
+
exe/*
|
|
18
|
+
lib/**/*.rb
|
|
19
|
+
lib/bin/*
|
|
20
|
+
src/**/*.rs
|
|
21
|
+
spec/**/*.rb
|
|
22
|
+
]
|
|
23
|
+
)
|
|
24
|
+
end
|
|
25
|
+
files = git_files.empty? ? fallback_files : git_files
|
|
26
|
+
|
|
27
|
+
Gem::Specification.new do |spec|
|
|
28
|
+
spec.name = 'html-to-markdown'
|
|
29
|
+
spec.version = HtmlToMarkdown::VERSION
|
|
30
|
+
spec.authors = ["Na'aman Hirschfeld"]
|
|
31
|
+
spec.email = ['nhirschfeld@gmail.com']
|
|
32
|
+
|
|
33
|
+
spec.summary = 'Blazing-fast HTML to Markdown conversion for Ruby, powered by Rust.'
|
|
34
|
+
spec.description = <<~DESC.strip
|
|
35
|
+
html-to-markdown is a native Ruby extension built on the shared Rust engine that powers the html-to-markdown project.
|
|
36
|
+
It delivers identical HTML-to-Markdown output across languages, exposes inline image extraction, and ships with a CLI for automation workflows.
|
|
37
|
+
DESC
|
|
38
|
+
spec.homepage = 'https://github.com/Goldziher/html-to-markdown'
|
|
39
|
+
spec.license = 'MIT'
|
|
40
|
+
|
|
41
|
+
spec.required_ruby_version = Gem::Requirement.new('>= 3.2')
|
|
42
|
+
|
|
43
|
+
spec.bindir = 'exe'
|
|
44
|
+
spec.executables = ['html-to-markdown']
|
|
45
|
+
spec.require_paths = ['lib']
|
|
46
|
+
|
|
47
|
+
spec.files = files
|
|
48
|
+
spec.extra_rdoc_files = ['README.md']
|
|
49
|
+
|
|
50
|
+
spec.extensions = ['ext/html-to-markdown-rb/extconf.rb']
|
|
51
|
+
|
|
52
|
+
spec.add_dependency 'rb_sys', '>= 0.9', '< 1.0'
|
|
53
|
+
spec.metadata['rubygems_mfa_required'] = 'true'
|
|
54
|
+
spec.metadata['homepage_uri'] = 'https://github.com/Goldziher/html-to-markdown'
|
|
55
|
+
spec.metadata['source_code_uri'] = 'https://github.com/Goldziher/html-to-markdown'
|
|
56
|
+
spec.metadata['bug_tracker_uri'] = 'https://github.com/Goldziher/html-to-markdown/issues'
|
|
57
|
+
spec.metadata['changelog_uri'] = 'https://github.com/Goldziher/html-to-markdown/releases'
|
|
58
|
+
spec.metadata['documentation_uri'] = 'https://github.com/Goldziher/html-to-markdown/blob/main/packages/ruby/README.md'
|
|
59
|
+
end
|
data/lib/html_to_markdown/cli.rb
CHANGED
|
@@ -1,21 +1,21 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'html_to_markdown/cli_proxy'
|
|
4
|
-
|
|
5
|
-
module HtmlToMarkdown
|
|
6
|
-
module CLI
|
|
7
|
-
module_function
|
|
8
|
-
|
|
9
|
-
def run(argv = ARGV, stdout: $stdout, stderr: $stderr)
|
|
10
|
-
output = CLIProxy.call(argv)
|
|
11
|
-
stdout.print(output)
|
|
12
|
-
0
|
|
13
|
-
rescue CLIProxy::CLIExecutionError => e
|
|
14
|
-
stderr.print(e.stderr)
|
|
15
|
-
e.status || 1
|
|
16
|
-
rescue CLIProxy::MissingBinaryError, CLIProxy::Error => e
|
|
17
|
-
stderr.puts(e.message)
|
|
18
|
-
1
|
|
19
|
-
end
|
|
20
|
-
end
|
|
21
|
-
end
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'html_to_markdown/cli_proxy'
|
|
4
|
+
|
|
5
|
+
module HtmlToMarkdown
|
|
6
|
+
module CLI
|
|
7
|
+
module_function
|
|
8
|
+
|
|
9
|
+
def run(argv = ARGV, stdout: $stdout, stderr: $stderr)
|
|
10
|
+
output = CLIProxy.call(argv)
|
|
11
|
+
stdout.print(output)
|
|
12
|
+
0
|
|
13
|
+
rescue CLIProxy::CLIExecutionError => e
|
|
14
|
+
stderr.print(e.stderr)
|
|
15
|
+
e.status || 1
|
|
16
|
+
rescue CLIProxy::MissingBinaryError, CLIProxy::Error => e
|
|
17
|
+
stderr.puts(e.message)
|
|
18
|
+
1
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
end
|
data/lib/html_to_markdown/cli_proxy.rb
CHANGED
|
@@ -1,71 +1,71 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'open3'
|
|
4
|
-
require 'pathname'
|
|
5
|
-
|
|
6
|
-
module HtmlToMarkdown
|
|
7
|
-
module CLIProxy
|
|
8
|
-
Error = Class.new(StandardError)
|
|
9
|
-
MissingBinaryError = Class.new(Error)
|
|
10
|
-
|
|
11
|
-
class CLIExecutionError < Error
|
|
12
|
-
attr_reader :stderr, :status
|
|
13
|
-
|
|
14
|
-
def initialize(message, stderr:, status:)
|
|
15
|
-
super(message)
|
|
16
|
-
@stderr = stderr
|
|
17
|
-
@status = status
|
|
18
|
-
end
|
|
19
|
-
end
|
|
20
|
-
|
|
21
|
-
module_function
|
|
22
|
-
|
|
23
|
-
def call(argv)
|
|
24
|
-
binary = find_cli_binary
|
|
25
|
-
args = Array(argv).map(&:to_s)
|
|
26
|
-
stdout, stderr, status = Open3.capture3(binary.to_s, *args)
|
|
27
|
-
return stdout if status.success?
|
|
28
|
-
|
|
29
|
-
raise CLIExecutionError.new(
|
|
30
|
-
"html-to-markdown CLI exited with status #{status.exitstatus}",
|
|
31
|
-
stderr: stderr,
|
|
32
|
-
status: status.exitstatus
|
|
33
|
-
)
|
|
34
|
-
end
|
|
35
|
-
|
|
36
|
-
def find_cli_binary
|
|
37
|
-
binary_name = Gem.win_platform? ? 'html-to-markdown.exe' : 'html-to-markdown'
|
|
38
|
-
found = search_paths(binary_name).find(&:file?)
|
|
39
|
-
return found if found
|
|
40
|
-
|
|
41
|
-
raise MissingBinaryError, missing_binary_message
|
|
42
|
-
end
|
|
43
|
-
|
|
44
|
-
def root_path
|
|
45
|
-
@root_path ||= Pathname(__dir__).join('../..').expand_path
|
|
46
|
-
end
|
|
47
|
-
|
|
48
|
-
def lib_path
|
|
49
|
-
@lib_path ||= Pathname(__dir__).join('..').expand_path
|
|
50
|
-
end
|
|
51
|
-
|
|
52
|
-
def search_paths(binary_name)
|
|
53
|
-
paths = [
|
|
54
|
-
root_path.join('target', 'release', binary_name),
|
|
55
|
-
lib_path.join('bin', binary_name),
|
|
56
|
-
lib_path.join(binary_name)
|
|
57
|
-
]
|
|
58
|
-
|
|
59
|
-
workspace_root = root_path.parent&.parent
|
|
60
|
-
paths << workspace_root.join('target', 'release', binary_name) if workspace_root
|
|
61
|
-
paths
|
|
62
|
-
end
|
|
63
|
-
|
|
64
|
-
def missing_binary_message
|
|
65
|
-
<<~MSG.strip
|
|
66
|
-
html-to-markdown CLI binary not found. Build it with
|
|
67
|
-
`cargo build --release --package html-to-markdown-cli`.
|
|
68
|
-
MSG
|
|
69
|
-
end
|
|
70
|
-
end
|
|
71
|
-
end
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'open3'
|
|
4
|
+
require 'pathname'
|
|
5
|
+
|
|
6
|
+
module HtmlToMarkdown
|
|
7
|
+
module CLIProxy
|
|
8
|
+
Error = Class.new(StandardError)
|
|
9
|
+
MissingBinaryError = Class.new(Error)
|
|
10
|
+
|
|
11
|
+
class CLIExecutionError < Error
|
|
12
|
+
attr_reader :stderr, :status
|
|
13
|
+
|
|
14
|
+
def initialize(message, stderr:, status:)
|
|
15
|
+
super(message)
|
|
16
|
+
@stderr = stderr
|
|
17
|
+
@status = status
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
module_function
|
|
22
|
+
|
|
23
|
+
def call(argv)
|
|
24
|
+
binary = find_cli_binary
|
|
25
|
+
args = Array(argv).map(&:to_s)
|
|
26
|
+
stdout, stderr, status = Open3.capture3(binary.to_s, *args)
|
|
27
|
+
return stdout if status.success?
|
|
28
|
+
|
|
29
|
+
raise CLIExecutionError.new(
|
|
30
|
+
"html-to-markdown CLI exited with status #{status.exitstatus}",
|
|
31
|
+
stderr: stderr,
|
|
32
|
+
status: status.exitstatus
|
|
33
|
+
)
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
def find_cli_binary
|
|
37
|
+
binary_name = Gem.win_platform? ? 'html-to-markdown.exe' : 'html-to-markdown'
|
|
38
|
+
found = search_paths(binary_name).find(&:file?)
|
|
39
|
+
return found if found
|
|
40
|
+
|
|
41
|
+
raise MissingBinaryError, missing_binary_message
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
def root_path
|
|
45
|
+
@root_path ||= Pathname(__dir__).join('../..').expand_path
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
def lib_path
|
|
49
|
+
@lib_path ||= Pathname(__dir__).join('..').expand_path
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
def search_paths(binary_name)
|
|
53
|
+
paths = [
|
|
54
|
+
root_path.join('target', 'release', binary_name),
|
|
55
|
+
lib_path.join('bin', binary_name),
|
|
56
|
+
lib_path.join(binary_name)
|
|
57
|
+
]
|
|
58
|
+
|
|
59
|
+
workspace_root = root_path.parent&.parent
|
|
60
|
+
paths << workspace_root.join('target', 'release', binary_name) if workspace_root
|
|
61
|
+
paths
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
def missing_binary_message
|
|
65
|
+
<<~MSG.strip
|
|
66
|
+
html-to-markdown CLI binary not found. Build it with
|
|
67
|
+
`cargo build --release --package html-to-markdown-cli`.
|
|
68
|
+
MSG
|
|
69
|
+
end
|
|
70
|
+
end
|
|
71
|
+
end
|
data/lib/html_to_markdown/version.rb
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module HtmlToMarkdown
|
|
4
|
-
VERSION = '2.6.2'
|
|
5
|
-
end
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module HtmlToMarkdown
|
|
4
|
+
VERSION = '2.6.4'
|
|
5
|
+
end
|
data/lib/html_to_markdown.rb
CHANGED
|
@@ -1,24 +1,24 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require_relative 'html_to_markdown/version'
|
|
4
|
-
require 'html_to_markdown_rb'
|
|
5
|
-
|
|
6
|
-
module HtmlToMarkdown
|
|
7
|
-
autoload :CLI, 'html_to_markdown/cli'
|
|
8
|
-
autoload :CLIProxy, 'html_to_markdown/cli_proxy'
|
|
9
|
-
|
|
10
|
-
class << self
|
|
11
|
-
alias native_convert convert
|
|
12
|
-
alias native_convert_with_inline_images convert_with_inline_images
|
|
13
|
-
end
|
|
14
|
-
|
|
15
|
-
module_function
|
|
16
|
-
|
|
17
|
-
def convert(html, options = nil)
|
|
18
|
-
native_convert(html.to_s, options)
|
|
19
|
-
end
|
|
20
|
-
|
|
21
|
-
def convert_with_inline_images(html, options = nil, image_config = nil)
|
|
22
|
-
native_convert_with_inline_images(html.to_s, options, image_config)
|
|
23
|
-
end
|
|
24
|
-
end
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'html_to_markdown/version'
|
|
4
|
+
require 'html_to_markdown_rb'
|
|
5
|
+
|
|
6
|
+
module HtmlToMarkdown
|
|
7
|
+
autoload :CLI, 'html_to_markdown/cli'
|
|
8
|
+
autoload :CLIProxy, 'html_to_markdown/cli_proxy'
|
|
9
|
+
|
|
10
|
+
class << self
|
|
11
|
+
alias native_convert convert
|
|
12
|
+
alias native_convert_with_inline_images convert_with_inline_images
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
module_function
|
|
16
|
+
|
|
17
|
+
def convert(html, options = nil)
|
|
18
|
+
native_convert(html.to_s, options)
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
def convert_with_inline_images(html, options = nil, image_config = nil)
|
|
22
|
+
native_convert_with_inline_images(html.to_s, options, image_config)
|
|
23
|
+
end
|
|
24
|
+
end
|
data/spec/cli_proxy_spec.rb
CHANGED
|
@@ -1,42 +1,42 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'spec_helper'
|
|
4
|
-
require 'html_to_markdown/cli_proxy'
|
|
5
|
-
require 'html_to_markdown/cli'
|
|
6
|
-
require 'stringio'
|
|
7
|
-
|
|
8
|
-
RSpec.describe HtmlToMarkdown::CLIProxy do
|
|
9
|
-
describe '.call' do
|
|
10
|
-
it 'executes the CLI binary' do
|
|
11
|
-
begin
|
|
12
|
-
binary = described_class.find_cli_binary
|
|
13
|
-
rescue HtmlToMarkdown::CLIProxy::MissingBinaryError
|
|
14
|
-
skip 'CLI binary not built'
|
|
15
|
-
end
|
|
16
|
-
|
|
17
|
-
expect(binary).to be_file
|
|
18
|
-
|
|
19
|
-
output = described_class.call(['--version'])
|
|
20
|
-
expect(output).to include(HtmlToMarkdown::VERSION)
|
|
21
|
-
end
|
|
22
|
-
end
|
|
23
|
-
|
|
24
|
-
describe HtmlToMarkdown::CLI do
|
|
25
|
-
it 'writes CLI output to stdout' do
|
|
26
|
-
begin
|
|
27
|
-
HtmlToMarkdown::CLIProxy.find_cli_binary
|
|
28
|
-
rescue HtmlToMarkdown::CLIProxy::MissingBinaryError
|
|
29
|
-
skip 'CLI binary not built'
|
|
30
|
-
end
|
|
31
|
-
|
|
32
|
-
stdout = StringIO.new
|
|
33
|
-
stderr = StringIO.new
|
|
34
|
-
|
|
35
|
-
exit_code = described_class.run(['--version'], stdout: stdout, stderr: stderr)
|
|
36
|
-
|
|
37
|
-
expect(exit_code).to eq(0)
|
|
38
|
-
expect(stdout.string).to include(HtmlToMarkdown::VERSION)
|
|
39
|
-
expect(stderr.string).to be_empty
|
|
40
|
-
end
|
|
41
|
-
end
|
|
42
|
-
end
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'spec_helper'
|
|
4
|
+
require 'html_to_markdown/cli_proxy'
|
|
5
|
+
require 'html_to_markdown/cli'
|
|
6
|
+
require 'stringio'
|
|
7
|
+
|
|
8
|
+
RSpec.describe HtmlToMarkdown::CLIProxy do
|
|
9
|
+
describe '.call' do
|
|
10
|
+
it 'executes the CLI binary' do
|
|
11
|
+
begin
|
|
12
|
+
binary = described_class.find_cli_binary
|
|
13
|
+
rescue HtmlToMarkdown::CLIProxy::MissingBinaryError
|
|
14
|
+
skip 'CLI binary not built'
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
expect(binary).to be_file
|
|
18
|
+
|
|
19
|
+
output = described_class.call(['--version'])
|
|
20
|
+
expect(output).to include(HtmlToMarkdown::VERSION)
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
describe HtmlToMarkdown::CLI do
|
|
25
|
+
it 'writes CLI output to stdout' do
|
|
26
|
+
begin
|
|
27
|
+
HtmlToMarkdown::CLIProxy.find_cli_binary
|
|
28
|
+
rescue HtmlToMarkdown::CLIProxy::MissingBinaryError
|
|
29
|
+
skip 'CLI binary not built'
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
stdout = StringIO.new
|
|
33
|
+
stderr = StringIO.new
|
|
34
|
+
|
|
35
|
+
exit_code = described_class.run(['--version'], stdout: stdout, stderr: stderr)
|
|
36
|
+
|
|
37
|
+
expect(exit_code).to eq(0)
|
|
38
|
+
expect(stdout.string).to include(HtmlToMarkdown::VERSION)
|
|
39
|
+
expect(stderr.string).to be_empty
|
|
40
|
+
end
|
|
41
|
+
end
|
|
42
|
+
end
|
data/spec/convert_spec.rb
CHANGED
|
@@ -1,29 +1,29 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'spec_helper'
|
|
4
|
-
|
|
5
|
-
RSpec.describe HtmlToMarkdown do
|
|
6
|
-
describe '.convert' do
|
|
7
|
-
it 'converts simple headings' do
|
|
8
|
-
expect(described_class.convert('<h1>Hello</h1>')).to eq("# Hello\n")
|
|
9
|
-
end
|
|
10
|
-
|
|
11
|
-
it 'accepts options hash' do
|
|
12
|
-
result = described_class.convert(
|
|
13
|
-
'<h1>Hello</h1>',
|
|
14
|
-
heading_style: :atx_closed,
|
|
15
|
-
default_title: true
|
|
16
|
-
)
|
|
17
|
-
expect(result).to include('Hello')
|
|
18
|
-
end
|
|
19
|
-
end
|
|
20
|
-
|
|
21
|
-
describe '.convert_with_inline_images' do
|
|
22
|
-
it 'returns inline images metadata' do
|
|
23
|
-
html = '<p><img src="data:image/png;base64,ZmFrZQ==" alt="fake"></p>'
|
|
24
|
-
extraction = described_class.convert_with_inline_images(html)
|
|
25
|
-
expect(extraction).to include(:markdown, :inline_images, :warnings)
|
|
26
|
-
expect(extraction[:inline_images].first[:description]).to eq('fake')
|
|
27
|
-
end
|
|
28
|
-
end
|
|
29
|
-
end
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'spec_helper'
|
|
4
|
+
|
|
5
|
+
RSpec.describe HtmlToMarkdown do
|
|
6
|
+
describe '.convert' do
|
|
7
|
+
it 'converts simple headings' do
|
|
8
|
+
expect(described_class.convert('<h1>Hello</h1>')).to eq("# Hello\n")
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
it 'accepts options hash' do
|
|
12
|
+
result = described_class.convert(
|
|
13
|
+
'<h1>Hello</h1>',
|
|
14
|
+
heading_style: :atx_closed,
|
|
15
|
+
default_title: true
|
|
16
|
+
)
|
|
17
|
+
expect(result).to include('Hello')
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
describe '.convert_with_inline_images' do
|
|
22
|
+
it 'returns inline images metadata' do
|
|
23
|
+
html = '<p><img src="data:image/png;base64,ZmFrZQ==" alt="fake"></p>'
|
|
24
|
+
extraction = described_class.convert_with_inline_images(html)
|
|
25
|
+
expect(extraction).to include(:markdown, :inline_images, :warnings)
|
|
26
|
+
expect(extraction[:inline_images].first[:description]).to eq('fake')
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
end
|
data/spec/spec_helper.rb
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'bundler/setup'
|
|
4
|
-
require 'html_to_markdown'
|
|
5
|
-
|
|
6
|
-
RSpec.configure do |config|
|
|
7
|
-
config.expect_with :rspec do |c|
|
|
8
|
-
c.syntax = :expect
|
|
9
|
-
end
|
|
10
|
-
end
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'bundler/setup'
|
|
4
|
+
require 'html_to_markdown'
|
|
5
|
+
|
|
6
|
+
RSpec.configure do |config|
|
|
7
|
+
config.expect_with :rspec do |c|
|
|
8
|
+
c.syntax = :expect
|
|
9
|
+
end
|
|
10
|
+
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html-to-markdown
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.6.2
|
|
4
|
+
version: 2.6.4
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2025-11-
|
|
11
|
+
date: 2025-11-08 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|