html-to-markdown 2.5.3 → 2.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.toml +2 -2
- data/README.md +61 -10
- data/lib/html_to_markdown/version.rb +1 -1
- metadata +5 -4
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 0f6d03456f76e7f7b1157f052c51dc333e1bd68e72a6bb7664f4701f9a592ed2
|
|
4
|
+
data.tar.gz: e4a09afb52aba7580f9805b91981d96007f04c37fa7ba6059867d0e818e260e0
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: d37723f521d0765ff47578457ba24d6636fe5102af2672cc3f8485bef323770cdb8155512378db4025ea47aa4c31146291da960a69ccdacb93a461e4c8e64593
|
|
7
|
+
data.tar.gz: 31523b5389627bfb1476133f1daa39ae2273d32dfdd72dac0f7927cc45267e41def028ff07bec8a833c567d07d02babda7e8e4e53c7dc63f7490c8ad7dc00481
|
data/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "html-to-markdown-rb"
|
|
3
|
-
version = "2.5.3"
|
|
3
|
+
version = "2.5.4"
|
|
4
4
|
edition = "2021"
|
|
5
5
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
6
6
|
license = "MIT"
|
|
@@ -21,7 +21,7 @@ crate-type = ["cdylib", "rlib"]
|
|
|
21
21
|
default = []
|
|
22
22
|
|
|
23
23
|
[dependencies]
|
|
24
|
-
html-to-markdown-rs = { version = "2.5.3", features = ["inline-images"] }
|
|
24
|
+
html-to-markdown-rs = { version = "2.5.4", features = ["inline-images"] }
|
|
25
25
|
magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = ["rb-sys"] }
|
|
26
26
|
|
|
27
27
|
[dev-dependencies]
|
data/README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# html-to-markdown-rb
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Blazing-fast HTML → Markdown conversion for Ruby, powered by the same Rust engine used by our Python, Node.js, and WebAssembly packages. Ship identical Markdown across every runtime while enjoying native extension performance.
|
|
4
4
|
|
|
5
5
|
[](https://crates.io/crates/html-to-markdown-rs)
|
|
6
6
|
[](https://www.npmjs.com/package/html-to-markdown-node)
|
|
@@ -8,6 +8,15 @@ Ruby bindings for the `html-to-markdown` Rust engine – the same core that powe
|
|
|
8
8
|
[](https://rubygems.org/gems/html-to-markdown)
|
|
9
9
|
[](https://github.com/Goldziher/html-to-markdown/blob/main/LICENSE)
|
|
10
10
|
|
|
11
|
+
## Features
|
|
12
|
+
|
|
13
|
+
- ⚡ **Rust-fast**: Ruby bindings around a highly optimised Rust core (60‑80× faster than BeautifulSoup-based converters).
|
|
14
|
+
- 🔁 **Identical output**: Shares logic with the Python wheels, npm bindings, WASM package, and CLI — consistent Markdown everywhere.
|
|
15
|
+
- ⚙️ **Rich configuration**: Control heading styles, list indentation, whitespace handling, HTML preprocessing, and more.
|
|
16
|
+
- 🖼️ **Inline image extraction**: Pull out embedded images (PNG/JPEG/SVG/data URIs) alongside Markdown.
|
|
17
|
+
- 🧰 **Bundled CLI proxy**: Call the Rust CLI straight from Ruby or shell scripts.
|
|
18
|
+
- 🛠️ **First-class Rails support**: Works with `Gem.win_platform?` builds, supports Trusted Publishing, and compiles on install if no native gem matches.
|
|
19
|
+
|
|
11
20
|
## Installation
|
|
12
21
|
|
|
13
22
|
```bash
|
|
@@ -32,6 +41,18 @@ ridk exec pacman -S --needed --noconfirm base-devel mingw-w64-ucrt-x86_64-toolch
|
|
|
32
41
|
|
|
33
42
|
This provides the standard headers (including `strings.h`) required for the bindgen step.
|
|
34
43
|
|
|
44
|
+
## Performance Snapshot
|
|
45
|
+
|
|
46
|
+
Apple M4 • Real Wikipedia documents • `HtmlToMarkdown.convert` (Ruby)
|
|
47
|
+
|
|
48
|
+
| Document | Size | Latency | Throughput | Docs/sec |
|
|
49
|
+
| ------------------- | ----- | ------- | ---------- | -------- |
|
|
50
|
+
| Lists (Timeline) | 129KB | 0.69ms | 187 MB/s | 1,450 |
|
|
51
|
+
| Tables (Countries) | 360KB | 2.19ms | 164 MB/s | 456 |
|
|
52
|
+
| Mixed (Python wiki) | 656KB | 4.88ms | 134 MB/s | 205 |
|
|
53
|
+
|
|
54
|
+
> Same core, same benchmarks: the Ruby extension stays within single-digit % of the Rust CLI and mirrors the Python/Node numbers.
|
|
55
|
+
|
|
35
56
|
## Quick Start
|
|
36
57
|
|
|
37
58
|
```ruby
|
|
@@ -56,9 +77,11 @@ puts markdown
|
|
|
56
77
|
# - Identical output across languages
|
|
57
78
|
```
|
|
58
79
|
|
|
59
|
-
|
|
80
|
+
## API
|
|
60
81
|
|
|
61
|
-
|
|
82
|
+
### Conversion Options
|
|
83
|
+
|
|
84
|
+
Pass a Ruby hash (string or symbol keys) to tweak rendering. Every option maps one-for-one with the Rust/Python/Node APIs.
|
|
62
85
|
|
|
63
86
|
```ruby
|
|
64
87
|
require 'html_to_markdown'
|
|
@@ -67,10 +90,31 @@ markdown = HtmlToMarkdown.convert(
|
|
|
67
90
|
'<pre><code class="language-ruby">puts "hi"</code></pre>',
|
|
68
91
|
heading_style: :atx,
|
|
69
92
|
code_block_style: :fenced,
|
|
70
|
-
bullets:
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
93
|
+
bullets: '*+-',
|
|
94
|
+
list_indent_type: :spaces,
|
|
95
|
+
list_indent_width: 2,
|
|
96
|
+
whitespace_mode: :normalized,
|
|
97
|
+
highlight_style: :double_equal
|
|
98
|
+
)
|
|
99
|
+
|
|
100
|
+
puts markdown
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### HTML Preprocessing
|
|
104
|
+
|
|
105
|
+
Clean up scraped HTML (navigation, forms, malformed markup) before conversion:
|
|
106
|
+
|
|
107
|
+
```ruby
|
|
108
|
+
require 'html_to_markdown'
|
|
109
|
+
|
|
110
|
+
markdown = HtmlToMarkdown.convert(
|
|
111
|
+
html,
|
|
112
|
+
preprocessing: {
|
|
113
|
+
enabled: true,
|
|
114
|
+
preset: :aggressive, # :minimal, :standard, :aggressive
|
|
115
|
+
remove_navigation: true,
|
|
116
|
+
remove_forms: true
|
|
117
|
+
}
|
|
74
118
|
)
|
|
75
119
|
```
|
|
76
120
|
|
|
@@ -97,7 +141,7 @@ result.inline_images.each do |img|
|
|
|
97
141
|
end
|
|
98
142
|
```
|
|
99
143
|
|
|
100
|
-
|
|
144
|
+
## CLI
|
|
101
145
|
|
|
102
146
|
The gem bundles a small proxy for the Rust CLI binary. Use it when you need parity with the standalone `html-to-markdown` executable.
|
|
103
147
|
|
|
@@ -112,10 +156,17 @@ You can also call the CLI binary directly for scripting:
|
|
|
112
156
|
|
|
113
157
|
```ruby
|
|
114
158
|
HtmlToMarkdown::CLIProxy.call(['--version'])
|
|
115
|
-
# => "html-to-markdown 2.5.3"
|
|
159
|
+
# => "html-to-markdown 2.5.4"
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
Rebuild the CLI locally if you see `CLI binary not built` during tests:
|
|
163
|
+
|
|
164
|
+
```bash
|
|
165
|
+
bundle exec rake compile # builds the extension
|
|
166
|
+
bundle exec ruby scripts/prepare_ruby_gem.rb # copies the CLI into lib/bin/
|
|
116
167
|
```
|
|
117
168
|
|
|
118
|
-
|
|
169
|
+
## Error Handling
|
|
119
170
|
|
|
120
171
|
Conversion errors raise `HtmlToMarkdown::Error` (wrapping the Rust error context). CLI invocations use specialised subclasses:
|
|
121
172
|
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html-to-markdown
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.5.3
|
|
4
|
+
version: 2.5.4
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
@@ -30,8 +30,9 @@ dependencies:
|
|
|
30
30
|
- - "<"
|
|
31
31
|
- !ruby/object:Gem::Version
|
|
32
32
|
version: '1.0'
|
|
33
|
-
description:
|
|
34
|
-
rb-sys.
|
|
33
|
+
description: |-
|
|
34
|
+
html-to-markdown wraps our ultra-fast Rust converter with a Ruby-native API via Magnus and rb-sys.
|
|
35
|
+
Enjoy identical output to the Python, Node, and WASM bindings, a bundled CLI proxy, and seamless cross-platform installs.
|
|
35
36
|
email:
|
|
36
37
|
- nhirschfeld@gmail.com
|
|
37
38
|
executables:
|
|
@@ -80,5 +81,5 @@ requirements: []
|
|
|
80
81
|
rubygems_version: 3.5.22
|
|
81
82
|
signing_key:
|
|
82
83
|
specification_version: 4
|
|
83
|
-
summary:
|
|
84
|
+
summary: Blazing-fast HTML to Markdown conversion for Ruby, powered by Rust.
|
|
84
85
|
test_files: []
|