html-to-markdown 3.1.0 → 3.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +42 -12
  3. data/Gemfile +1 -0
  4. data/Gemfile.lock +27 -55
  5. data/README.md +9 -10
  6. data/Rakefile +4 -10
  7. data/ext/html-to-markdown_rb/Cargo.toml +14 -0
  8. data/ext/html_to_markdown_rb/Cargo.toml +16 -0
  9. data/ext/html_to_markdown_rb/extconf.rb +10 -0
  10. data/ext/html_to_markdown_rb/src/html_to_markdown_rs/version.rb +6 -0
  11. data/ext/html_to_markdown_rb/src/html_to_markdown_rs.rb +9 -0
  12. data/ext/html_to_markdown_rb/src/lib.rs +3941 -0
  13. data/html-to-markdown-rb.gemspec +1 -1
  14. data/lib/html_to_markdown/version.rb +1 -1
  15. data/lib/html_to_markdown.rb +31 -21
  16. data/{ext/html-to-markdown-rb/native/extconf.rb → lib/html_to_markdown_rs.rb} +1 -1
  17. data/sig/html_to_markdown.rbs +17 -5
  18. data/vendor/Cargo.toml +4 -4
  19. data/vendor/html-to-markdown-rs/Cargo.toml +2 -2
  20. data/vendor/html-to-markdown-rs/examples/test_deser.rs +12 -0
  21. data/vendor/html-to-markdown-rs/src/converter/block/mod.rs +1 -1
  22. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +1 -1
  23. data/vendor/html-to-markdown-rs/src/converter/form/mod.rs +1 -1
  24. data/vendor/html-to-markdown-rs/src/converter/inline/mod.rs +1 -1
  25. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +10 -2
  26. data/vendor/html-to-markdown-rs/src/converter/mod.rs +2 -2
  27. data/vendor/html-to-markdown-rs/src/converter/semantic/mod.rs +1 -1
  28. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +1 -1
  29. data/vendor/html-to-markdown-rs/src/exports.rs +3 -3
  30. data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
  31. data/vendor/html-to-markdown-rs/src/lib.rs +1 -2
  32. data/vendor/html-to-markdown-rs/src/metadata/config.rs +1 -1
  33. data/vendor/html-to-markdown-rs/src/metadata/mod.rs +5 -5
  34. data/vendor/html-to-markdown-rs/src/options/conversion.rs +6 -12
  35. data/vendor/html-to-markdown-rs/src/options/mod.rs +1 -1
  36. data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +3 -9
  37. data/vendor/html-to-markdown-rs/src/options/validation.rs +3 -3
  38. data/vendor/html-to-markdown-rs/src/types/document.rs +11 -0
  39. data/vendor/html-to-markdown-rs/src/types/result.rs +5 -2
  40. data/vendor/html-to-markdown-rs/src/types/tables.rs +1 -1
  41. data/vendor/html-to-markdown-rs/src/visitor/mod.rs +1 -1
  42. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/state.rs +1 -1
  43. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/traversal.rs +1 -1
  44. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +8 -8
  45. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +6 -0
  46. data/vendor/html-to-markdown-rs/tests/integration_test.rs +3 -3
  47. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -2
  48. data/vendor/html-to-markdown-rs/tests/lists_test.rs +4 -4
  49. metadata +11 -18
  50. data/ext/html-to-markdown-rb/extconf.rb +0 -41
  51. data/ext/html-to-markdown-rb/native/Cargo.lock +0 -934
  52. data/ext/html-to-markdown-rb/native/Cargo.toml +0 -48
  53. data/ext/html-to-markdown-rb/native/README.md +0 -215
  54. data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +0 -54
  55. data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +0 -158
  56. data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -11
  57. data/ext/html-to-markdown-rb/native/src/lib.rs +0 -128
  58. data/ext/html-to-markdown-rb/native/src/options.rs +0 -238
  59. data/ext/html-to-markdown-rb/native/src/types.rs +0 -24
  60. data/lib/html_to_markdown/cli.rb +0 -21
  61. data/lib/html_to_markdown/cli_proxy.rb +0 -74
  62. data/spec/cli_proxy_spec.rb +0 -42
  63. data/spec/spec_helper.rb +0 -10
@@ -1,48 +0,0 @@
1
- [package]
2
- name = "html-to-markdown-rb"
3
- version = "3.1.0"
4
- edition = "2024"
5
- authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
6
- license = "MIT"
7
- repository = "https://github.com/kreuzberg-dev/html-to-markdown"
8
- homepage = "https://github.com/kreuzberg-dev/html-to-markdown"
9
- documentation = "https://docs.rs/html-to-markdown-rs"
10
- readme = "README.md"
11
- rust-version = "1.85"
12
- description = "Ruby bindings (Magnus) for html-to-markdown - high-performance HTML to Markdown converter"
13
- keywords = ["html", "markdown", "ruby", "magnus", "bindings"]
14
- categories = ["api-bindings"]
15
-
16
- [lib]
17
- name = "html_to_markdown_rb"
18
- crate-type = ["cdylib", "rlib"]
19
-
20
- [dependencies]
21
- html-to-markdown-rs = { path = "../../../vendor/html-to-markdown-rs", features = [
22
- "inline-images",
23
- "visitor",
24
- "metadata",
25
- ] }
26
- magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = [
27
- "rb-sys",
28
- ] }
29
- [dev-dependencies]
30
- pretty_assertions = "1.4"
31
-
32
- [features]
33
- default = ["inline-images", "metadata", "visitor"]
34
- inline-images = ["html-to-markdown-rs/inline-images"]
35
- metadata = ["html-to-markdown-rs/metadata"]
36
- visitor = ["html-to-markdown-rs/visitor"]
37
-
38
- [lints.rust]
39
- unsafe_code = "forbid"
40
- missing_docs = "warn"
41
- unused_must_use = "deny"
42
-
43
- [lints.clippy]
44
- all = { level = "deny", priority = -1 }
45
- cargo = { level = "deny", priority = -1 }
46
- pedantic = { level = "warn", priority = -1 }
47
- nursery = { level = "warn", priority = -1 }
48
- multiple_crate_versions = "allow"
@@ -1,215 +0,0 @@
1
- # html-to-markdown-rb
2
-
3
- Blazing-fast HTML → Markdown conversion for Ruby, powered by the same Rust engine used by our Python, Node.js, WebAssembly, and PHP packages. Ship identical Markdown across every runtime while enjoying native extension performance.
4
-
5
- [![Crates.io](https://img.shields.io/crates/v/html-to-markdown-rs.svg)](https://crates.io/crates/html-to-markdown-rs)
6
- [![npm (node)](https://img.shields.io/npm/v/html-to-markdown-node.svg?logo=npm)](https://www.npmjs.com/package/html-to-markdown-node)
7
- [![npm (wasm)](https://img.shields.io/npm/v/html-to-markdown-wasm.svg?logo=npm)](https://www.npmjs.com/package/html-to-markdown-wasm)
8
- [![PyPI](https://img.shields.io/pypi/v/html-to-markdown.svg?logo=pypi)](https://pypi.org/project/html-to-markdown/)
9
- [![Packagist](https://img.shields.io/packagist/v/goldziher/html-to-markdown.svg)](https://packagist.org/packages/goldziher/html-to-markdown)
10
- [![RubyGems](https://badge.fury.io/rb/html-to-markdown.svg)](https://rubygems.org/gems/html-to-markdown)
11
- [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/kreuzberg-dev/html-to-markdown/blob/main/LICENSE)
12
-
13
- ## Features
14
-
15
- - ⚡ **Rust-fast**: Ruby bindings around a highly optimised Rust core (60‑80× faster than BeautifulSoup-based converters).
16
- - 🔁 **Identical output**: Shares logic with the Python wheels, npm bindings, PHP extension, WASM package, and CLI — consistent Markdown everywhere.
17
- - ⚙️ **Rich configuration**: Control heading styles, list indentation, whitespace handling, HTML preprocessing, and more.
18
- - 🖼️ **Inline image extraction**: Pull out embedded images (PNG/JPEG/SVG/data URIs) alongside Markdown.
19
- - 🧰 **Bundled CLI proxy**: Call the Rust CLI straight from Ruby or shell scripts.
20
- - 🛠️ **First-class Rails support**: Works with `Gem.win_platform?` builds, supports Trusted Publishing, and compiles on install if no native gem matches.
21
-
22
- ## Documentation & Support
23
-
24
- - [GitHub repository](https://github.com/kreuzberg-dev/html-to-markdown)
25
- - [Issue tracker](https://github.com/kreuzberg-dev/html-to-markdown/issues)
26
- - [Changelog](https://github.com/kreuzberg-dev/html-to-markdown/blob/main/CHANGELOG.md)
27
- - [Live demo (WASM)](https://kreuzberg-dev.github.io/html-to-markdown/)
28
-
29
- ## Installation
30
-
31
- ```bash
32
- bundle add html-to-markdown
33
- # or
34
- gem install html-to-markdown
35
- ```
36
-
37
- Add the gem to your project and Bundler will compile the native Rust extension on first install.
38
-
39
- ### Requirements
40
-
41
- - Ruby **3.2+** (Magnus relies on the fiber scheduler APIs added in 3.2)
42
- - Rust toolchain **1.85+** with Cargo available on your `$PATH`
43
- - Ruby development headers (`ruby-dev`, `ruby-devel`, or the platform equivalent)
44
-
45
- **Windows**: install [RubyInstaller with MSYS2](https://rubyinstaller.org/) (UCRT64). Run once:
46
-
47
- ```powershell
48
- ridk exec pacman -S --needed --noconfirm base-devel mingw-w64-ucrt-x86_64-toolchain
49
- ```
50
-
51
- This provides the standard headers (including `strings.h`) required for the bindgen step.
52
-
53
- ## Performance Snapshot
54
-
55
- Apple M4 • Real Wikipedia documents • `HtmlToMarkdown.convert` (Ruby)
56
-
57
- | Document | Size | Latency | Throughput | Docs/sec |
58
- | ------------------- | ----- | ------- | ---------- | -------- |
59
- | Lists (Timeline) | 129KB | 0.69ms | 187 MB/s | 1,450 |
60
- | Tables (Countries) | 360KB | 2.19ms | 164 MB/s | 456 |
61
- | Mixed (Python wiki) | 656KB | 4.88ms | 134 MB/s | 205 |
62
-
63
- > Same core, same benchmarks: the Ruby extension stays within single-digit % of the Rust CLI and mirrors the Python/Node numbers.
64
-
65
- ### Benchmark Fixtures (Apple M4)
66
-
67
- Measured via `task bench:harness` with the shared Wikipedia + hOCR suite:
68
-
69
- | Document | Size | ops/sec (Ruby) |
70
- | ---------------------- | ------ | -------------- |
71
- | Lists (Timeline) | 129 KB | 3,156 |
72
- | Tables (Countries) | 360 KB | 921 |
73
- | Medium (Python) | 657 KB | 469 |
74
- | Large (Rust) | 567 KB | 534 |
75
- | Small (Intro) | 463 KB | 629 |
76
- | hOCR German PDF | 44 KB | 7,250 |
77
- | hOCR Invoice | 4 KB | 83,883 |
78
- | hOCR Embedded Tables | 37 KB | 7,890 |
79
-
80
- ## Quick Start
81
-
82
- ```ruby
83
- require 'html_to_markdown'
84
-
85
- html = <<~HTML
86
- <h1>Welcome</h1>
87
- <p>This is <strong>Rust-fast</strong> conversion!</p>
88
- <ul>
89
- <li>Native extension</li>
90
- <li>Identical output across languages</li>
91
- </ul>
92
- HTML
93
-
94
- markdown = HtmlToMarkdown.convert(html)
95
- puts markdown
96
- # # Welcome
97
- #
98
- # This is **Rust-fast** conversion!
99
- #
100
- # - Native extension
101
- # - Identical output across languages
102
- ```
103
-
104
- ## API
105
-
106
- ### Conversion Options
107
-
108
- Pass a Ruby hash (string or symbol keys) to tweak rendering. Every option maps one-for-one with the Rust/Python/Node APIs.
109
-
110
- ```ruby
111
- require 'html_to_markdown'
112
-
113
- markdown = HtmlToMarkdown.convert(
114
- '<pre><code class="language-ruby">puts "hi"</code></pre>',
115
- heading_style: :atx,
116
- code_block_style: :fenced,
117
- bullets: '*+-',
118
- list_indent_type: :spaces,
119
- list_indent_width: 2,
120
- whitespace_mode: :normalized,
121
- highlight_style: :double_equal
122
- )
123
-
124
- puts markdown
125
- ```
126
-
127
- ### HTML Preprocessing
128
-
129
- Clean up scraped HTML (navigation, forms, malformed markup) before conversion:
130
-
131
- ```ruby
132
- require 'html_to_markdown'
133
-
134
- markdown = HtmlToMarkdown.convert(
135
- html,
136
- preprocessing: {
137
- enabled: true,
138
- preset: :aggressive, # :minimal, :standard, :aggressive
139
- remove_navigation: true,
140
- remove_forms: true
141
- }
142
- )
143
- ```
144
-
145
- ### Inline Images
146
-
147
- Convert HTML with inline images (data URIs, SVG) to Markdown.
148
-
149
- ```ruby
150
- require 'html_to_markdown'
151
-
152
- markdown = HtmlToMarkdown.convert(
153
- '<img src="data:image/png;base64,iVBORw0..." alt="Pixel">'
154
- )
155
-
156
- puts markdown
157
- ```
158
-
159
- ## CLI
160
-
161
- The gem bundles a small proxy for the Rust CLI binary. Use it when you need parity with the standalone `html-to-markdown` executable.
162
-
163
- ```ruby
164
- require 'html_to_markdown/cli'
165
-
166
- HtmlToMarkdown::CLI.run(%w[--heading-style atx input.html], stdout: $stdout)
167
- # => writes converted Markdown to STDOUT
168
- ```
169
-
170
- You can also call the CLI binary directly for scripting:
171
-
172
- ```ruby
173
- HtmlToMarkdown::CLIProxy.call(['--version'])
174
- # => "html-to-markdown 2.5.7"
175
- ```
176
-
177
- Rebuild the CLI locally if you see `CLI binary not built` during tests:
178
-
179
- ```bash
180
- bundle exec rake compile # builds the extension
181
- bundle exec ruby scripts/prepare_ruby_gem.rb # copies the CLI into lib/bin/
182
- ```
183
-
184
- ## Error Handling
185
-
186
- Conversion errors raise `HtmlToMarkdown::Error` (wrapping the Rust error context). CLI invocations use specialised subclasses:
187
-
188
- - `HtmlToMarkdown::CLIProxy::MissingBinaryError`
189
- - `HtmlToMarkdown::CLIProxy::CLIExecutionError`
190
-
191
- Rescue them to provide clearer feedback in your application.
192
-
193
- ## Consistent Across Languages
194
-
195
- The Ruby gem shares the exact Rust core with:
196
-
197
- - [Python wheels](https://pypi.org/project/html-to-markdown/)
198
- - [Node.js / Bun bindings](https://www.npmjs.com/package/html-to-markdown-node)
199
- - [WebAssembly package](https://www.npmjs.com/package/html-to-markdown-wasm)
200
- - The Rust crate and CLI
201
-
202
- Use whichever runtime fits your stack while keeping formatting behaviour identical.
203
-
204
- ## Development
205
-
206
- ```bash
207
- bundle exec rake compile # build the native extension
208
- bundle exec rspec # run test suite
209
- ```
210
-
211
- The extension uses [Magnus](https://github.com/matsadler/magnus) plus `rb-sys` for bindgen. When editing the Rust code under `src/`, rerun `rake compile`.
212
-
213
- ## License
214
-
215
- MIT © Na'aman Hirschfeld
@@ -1,54 +0,0 @@
1
- //! Inline image configuration and conversion functions.
2
-
3
- use html_to_markdown_rs::InlineImage;
4
- use magnus::prelude::*;
5
- use magnus::{Error, Ruby, Value};
6
-
7
- pub fn inline_image_to_value(ruby: &Ruby, image: InlineImage) -> Result<Value, Error> {
8
- let InlineImage {
9
- data,
10
- format,
11
- filename,
12
- description,
13
- dimensions,
14
- source,
15
- attributes,
16
- } = image;
17
-
18
- let hash = ruby.hash_new();
19
- let data_value = ruby.str_from_slice(&data);
20
- hash.aset(ruby.intern("data"), data_value)?;
21
-
22
- let format_value = format.to_string();
23
- hash.aset(ruby.intern("format"), format_value)?;
24
-
25
- match filename {
26
- Some(name) => hash.aset(ruby.intern("filename"), name)?,
27
- None => hash.aset(ruby.intern("filename"), ruby.qnil())?,
28
- }
29
-
30
- match description {
31
- Some(desc) => hash.aset(ruby.intern("description"), desc)?,
32
- None => hash.aset(ruby.intern("description"), ruby.qnil())?,
33
- }
34
-
35
- if let Some((width, height)) = dimensions {
36
- let dims = ruby.ary_new();
37
- dims.push(i64::from(width))?;
38
- dims.push(i64::from(height))?;
39
- hash.aset(ruby.intern("dimensions"), dims)?;
40
- } else {
41
- hash.aset(ruby.intern("dimensions"), ruby.qnil())?;
42
- }
43
-
44
- let source_value = source.to_string();
45
- hash.aset(ruby.intern("source"), source_value)?;
46
-
47
- let attrs = ruby.hash_new();
48
- for (key, value) in attributes {
49
- attrs.aset(key, value)?;
50
- }
51
- hash.aset(ruby.intern("attributes"), attrs)?;
52
-
53
- Ok(hash.as_value())
54
- }
@@ -1,158 +0,0 @@
1
- //! Metadata configuration and conversion functions.
2
-
3
- use html_to_markdown_rs::metadata::{
4
- DocumentMetadata as RustDocumentMetadata, HeaderMetadata as RustHeaderMetadata,
5
- HtmlMetadata as RustHtmlMetadata, ImageMetadata as RustImageMetadata, LinkMetadata as RustLinkMetadata,
6
- StructuredData as RustStructuredData, TextDirection as RustTextDirection,
7
- };
8
- use magnus::prelude::*;
9
- use magnus::{Error, Ruby, Value};
10
-
11
- fn opt_string_to_ruby(ruby: &Ruby, opt: Option<String>) -> Result<Value, Error> {
12
- match opt {
13
- Some(val) => Ok(ruby.str_from_slice(val.as_bytes()).as_value()),
14
- None => Ok(ruby.qnil().as_value()),
15
- }
16
- }
17
-
18
- fn btreemap_to_ruby_hash(ruby: &Ruby, map: std::collections::BTreeMap<String, String>) -> Result<Value, Error> {
19
- let hash = ruby.hash_new();
20
- for (k, v) in map {
21
- hash.aset(k, v)?;
22
- }
23
- Ok(hash.as_value())
24
- }
25
-
26
- fn text_direction_to_string(text_direction: Option<RustTextDirection>) -> Option<String> {
27
- text_direction.map(|direction| direction.to_string())
28
- }
29
-
30
- fn document_metadata_to_ruby(ruby: &Ruby, doc: RustDocumentMetadata) -> Result<Value, Error> {
31
- let hash = ruby.hash_new();
32
-
33
- hash.aset(ruby.intern("title"), opt_string_to_ruby(ruby, doc.title)?)?;
34
- hash.aset(ruby.intern("description"), opt_string_to_ruby(ruby, doc.description)?)?;
35
-
36
- let keywords = ruby.ary_new();
37
- for keyword in doc.keywords {
38
- keywords.push(keyword)?;
39
- }
40
- hash.aset(ruby.intern("keywords"), keywords)?;
41
-
42
- hash.aset(ruby.intern("author"), opt_string_to_ruby(ruby, doc.author)?)?;
43
- hash.aset(
44
- ruby.intern("canonical_url"),
45
- opt_string_to_ruby(ruby, doc.canonical_url)?,
46
- )?;
47
- hash.aset(ruby.intern("base_href"), opt_string_to_ruby(ruby, doc.base_href)?)?;
48
- hash.aset(ruby.intern("language"), opt_string_to_ruby(ruby, doc.language)?)?;
49
-
50
- match text_direction_to_string(doc.text_direction) {
51
- Some(dir) => hash.aset(ruby.intern("text_direction"), dir)?,
52
- None => hash.aset(ruby.intern("text_direction"), ruby.qnil())?,
53
- }
54
-
55
- hash.aset(ruby.intern("open_graph"), btreemap_to_ruby_hash(ruby, doc.open_graph)?)?;
56
- hash.aset(
57
- ruby.intern("twitter_card"),
58
- btreemap_to_ruby_hash(ruby, doc.twitter_card)?,
59
- )?;
60
- hash.aset(ruby.intern("meta_tags"), btreemap_to_ruby_hash(ruby, doc.meta_tags)?)?;
61
-
62
- Ok(hash.as_value())
63
- }
64
-
65
- fn headers_to_ruby(ruby: &Ruby, headers: Vec<RustHeaderMetadata>) -> Result<Value, Error> {
66
- let array = ruby.ary_new();
67
- for header in headers {
68
- let hash = ruby.hash_new();
69
- hash.aset(ruby.intern("level"), header.level)?;
70
- hash.aset(ruby.intern("text"), header.text)?;
71
- hash.aset(ruby.intern("id"), opt_string_to_ruby(ruby, header.id)?)?;
72
- hash.aset(ruby.intern("depth"), header.depth as i64)?;
73
- hash.aset(ruby.intern("html_offset"), header.html_offset as i64)?;
74
- array.push(hash)?;
75
- }
76
- Ok(array.as_value())
77
- }
78
-
79
- fn links_to_ruby(ruby: &Ruby, links: Vec<RustLinkMetadata>) -> Result<Value, Error> {
80
- let array = ruby.ary_new();
81
- for link in links {
82
- let hash = ruby.hash_new();
83
- hash.aset(ruby.intern("href"), link.href)?;
84
- hash.aset(ruby.intern("text"), link.text)?;
85
- hash.aset(ruby.intern("title"), opt_string_to_ruby(ruby, link.title)?)?;
86
- hash.aset(ruby.intern("link_type"), link.link_type.to_string())?;
87
-
88
- let rel_array = ruby.ary_new();
89
- for r in link.rel {
90
- rel_array.push(r)?;
91
- }
92
- hash.aset(ruby.intern("rel"), rel_array)?;
93
-
94
- hash.aset(ruby.intern("attributes"), btreemap_to_ruby_hash(ruby, link.attributes)?)?;
95
- array.push(hash)?;
96
- }
97
- Ok(array.as_value())
98
- }
99
-
100
- fn images_to_ruby(ruby: &Ruby, images: Vec<RustImageMetadata>) -> Result<Value, Error> {
101
- let array = ruby.ary_new();
102
- for image in images {
103
- let hash = ruby.hash_new();
104
- hash.aset(ruby.intern("src"), image.src)?;
105
- hash.aset(ruby.intern("alt"), opt_string_to_ruby(ruby, image.alt)?)?;
106
- hash.aset(ruby.intern("title"), opt_string_to_ruby(ruby, image.title)?)?;
107
-
108
- match image.dimensions {
109
- Some((width, height)) => {
110
- let dims = ruby.ary_new();
111
- dims.push(i64::from(width))?;
112
- dims.push(i64::from(height))?;
113
- hash.aset(ruby.intern("dimensions"), dims)?;
114
- }
115
- None => {
116
- hash.aset(ruby.intern("dimensions"), ruby.qnil())?;
117
- }
118
- }
119
-
120
- hash.aset(ruby.intern("image_type"), image.image_type.to_string())?;
121
- hash.aset(
122
- ruby.intern("attributes"),
123
- btreemap_to_ruby_hash(ruby, image.attributes)?,
124
- )?;
125
- array.push(hash)?;
126
- }
127
- Ok(array.as_value())
128
- }
129
-
130
- fn structured_data_to_ruby(ruby: &Ruby, data: Vec<RustStructuredData>) -> Result<Value, Error> {
131
- let array = ruby.ary_new();
132
- for item in data {
133
- let hash = ruby.hash_new();
134
- hash.aset(ruby.intern("data_type"), item.data_type.to_string())?;
135
- hash.aset(ruby.intern("raw_json"), item.raw_json)?;
136
- hash.aset(ruby.intern("schema_type"), opt_string_to_ruby(ruby, item.schema_type)?)?;
137
- array.push(hash)?;
138
- }
139
- Ok(array.as_value())
140
- }
141
-
142
- pub fn extended_metadata_to_ruby(ruby: &Ruby, metadata: RustHtmlMetadata) -> Result<Value, Error> {
143
- let hash = ruby.hash_new();
144
-
145
- hash.aset(
146
- ruby.intern("document"),
147
- document_metadata_to_ruby(ruby, metadata.document)?,
148
- )?;
149
- hash.aset(ruby.intern("headers"), headers_to_ruby(ruby, metadata.headers)?)?;
150
- hash.aset(ruby.intern("links"), links_to_ruby(ruby, metadata.links)?)?;
151
- hash.aset(ruby.intern("images"), images_to_ruby(ruby, metadata.images)?)?;
152
- hash.aset(
153
- ruby.intern("structured_data"),
154
- structured_data_to_ruby(ruby, metadata.structured_data)?,
155
- )?;
156
-
157
- Ok(hash.as_value())
158
- }
@@ -1,11 +0,0 @@
1
- //! Conversion functions for Ruby bindings.
2
-
3
- pub mod inline_images;
4
-
5
- #[cfg(feature = "metadata")]
6
- pub mod metadata;
7
-
8
- pub use inline_images::*;
9
-
10
- #[cfg(feature = "metadata")]
11
- pub use metadata::*;
@@ -1,128 +0,0 @@
1
- #![allow(clippy::all, clippy::pedantic, clippy::nursery, missing_docs)]
2
-
3
- use html_to_markdown_rs::{error::ConversionError, safety::guard_panic};
4
-
5
- mod conversion;
6
- mod options;
7
- mod types;
8
-
9
- use options::build_conversion_options;
10
- use types::{arg_error, runtime_error};
11
-
12
- #[cfg(feature = "metadata")]
13
- use conversion::extended_metadata_to_ruby;
14
-
15
- use magnus::prelude::*;
16
- use magnus::{Error, Ruby, Value, function, scan_args::scan_args};
17
-
18
- fn conversion_error(err: ConversionError) -> Error {
19
- match err {
20
- ConversionError::ConfigError(msg) => arg_error(msg),
21
- ConversionError::Panic(message) => {
22
- runtime_error(format!("html-to-markdown panic during conversion: {message}"))
23
- }
24
- other => runtime_error(other.to_string()),
25
- }
26
- }
27
-
28
- fn convert_full_fn(ruby: &Ruby, args: &[Value]) -> Result<Value, Error> {
29
- let parsed = scan_args::<(String,), (Option<Value>,), (), (), (), ()>(args)?;
30
- let html = parsed.required.0;
31
- let options = build_conversion_options(ruby, parsed.optional.0)?;
32
-
33
- let result = guard_panic(|| html_to_markdown_rs::convert(&html, Some(options.clone())))
34
- .map_err(conversion_error)?;
35
-
36
- let hash = ruby.hash_new();
37
-
38
- // content: Option<String>
39
- match result.content {
40
- Some(ref s) => hash.aset(ruby.intern("content"), s.as_str())?,
41
- None => hash.aset(ruby.intern("content"), ruby.qnil())?,
42
- }
43
-
44
- // document: not yet exposed
45
- hash.aset(ruby.intern("document"), ruby.qnil())?;
46
-
47
- // metadata
48
- #[cfg(feature = "metadata")]
49
- {
50
- let metadata_value = extended_metadata_to_ruby(ruby, result.metadata)?;
51
- hash.aset(ruby.intern("metadata"), metadata_value)?;
52
- }
53
- #[cfg(not(feature = "metadata"))]
54
- hash.aset(ruby.intern("metadata"), ruby.qnil())?;
55
-
56
- // tables: Vec<TableData> with grid and markdown
57
- {
58
- let tables_array = ruby.ary_new();
59
- for table in &result.tables {
60
- let table_hash = ruby.hash_new();
61
- let grid_hash = ruby.hash_new();
62
- grid_hash.aset(ruby.intern("rows"), table.grid.rows as i64)?;
63
- grid_hash.aset(ruby.intern("cols"), table.grid.cols as i64)?;
64
- let cells_array = ruby.ary_new();
65
- for cell in &table.grid.cells {
66
- let cell_hash = ruby.hash_new();
67
- cell_hash.aset(ruby.intern("content"), cell.content.as_str())?;
68
- cell_hash.aset(ruby.intern("row"), cell.row as i64)?;
69
- cell_hash.aset(ruby.intern("col"), cell.col as i64)?;
70
- cell_hash.aset(ruby.intern("row_span"), cell.row_span as i64)?;
71
- cell_hash.aset(ruby.intern("col_span"), cell.col_span as i64)?;
72
- cell_hash.aset(ruby.intern("is_header"), cell.is_header)?;
73
- cells_array.push(cell_hash)?;
74
- }
75
- grid_hash.aset(ruby.intern("cells"), cells_array)?;
76
- table_hash.aset(ruby.intern("grid"), grid_hash)?;
77
- table_hash.aset(ruby.intern("markdown"), table.markdown.as_str())?;
78
- tables_array.push(table_hash)?;
79
- }
80
- hash.aset(ruby.intern("tables"), tables_array)?;
81
- }
82
-
83
- // images
84
- #[cfg(feature = "inline-images")]
85
- {
86
- use conversion::inline_image_to_value;
87
- let images_array = ruby.ary_new();
88
- for image in result.images {
89
- let image_value = inline_image_to_value(ruby, image)?;
90
- images_array.push(image_value)?;
91
- }
92
- hash.aset(ruby.intern("images"), images_array)?;
93
- }
94
- #[cfg(not(feature = "inline-images"))]
95
- {
96
- let empty = ruby.ary_new();
97
- hash.aset(ruby.intern("images"), empty)?;
98
- }
99
-
100
- // warnings
101
- {
102
- let warnings_array = ruby.ary_new();
103
- for warning in &result.warnings {
104
- let w_hash = ruby.hash_new();
105
- w_hash.aset(ruby.intern("message"), warning.message.as_str())?;
106
- let kind_str = match warning.kind {
107
- html_to_markdown_rs::WarningKind::ImageExtractionFailed => "image_extraction_failed",
108
- html_to_markdown_rs::WarningKind::EncodingFallback => "encoding_fallback",
109
- html_to_markdown_rs::WarningKind::TruncatedInput => "truncated_input",
110
- html_to_markdown_rs::WarningKind::MalformedHtml => "malformed_html",
111
- html_to_markdown_rs::WarningKind::SanitizationApplied => "sanitization_applied",
112
- };
113
- w_hash.aset(ruby.intern("kind"), kind_str)?;
114
- warnings_array.push(w_hash)?;
115
- }
116
- hash.aset(ruby.intern("warnings"), warnings_array)?;
117
- }
118
-
119
- Ok(hash.as_value())
120
- }
121
-
122
- #[magnus::init]
123
- fn init(ruby: &Ruby) -> Result<(), Error> {
124
- let module = ruby.define_module("HtmlToMarkdown")?;
125
- module.define_singleton_method("convert", function!(convert_full_fn, -1))?;
126
-
127
- Ok(())
128
- }