html-to-markdown 2.5.6 → 2.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1829c785edaf223adaa67b69c7350264e7bc55de02e1e9b40451f4b61222ae42
4
- data.tar.gz: 303ef1c08e294512de540e896d5b7d0f652a507ffb307544d847b1b1277adc24
3
+ metadata.gz: 71fe440f4811019fdf9c1d88b72b092c97d8223dab27a83f6b23e2675d98317a
4
+ data.tar.gz: 1bbe81a26da2c50e16e5207e48bda25e309afde2f3e22600889b85855624de2c
5
5
  SHA512:
6
- metadata.gz: b4df160116b63a80814855deaa288bd6456b47745567844347a8b99bb1ba6c4251e9fdfd176d9c45c3ed2fe77434d6c3d39bcb725683abf03c9c1abecf071899
7
- data.tar.gz: 4c3fb6133b606408d9907b51b49bf71ec6b8dee8f02fffe6ea89eaa2b0d95152e37fd5cafa2c19e2a42298a0eebbd10bb9fde4b3b67e824b6ff0729280490305
6
+ metadata.gz: 49c83e6230f9df3a96088e7f0e4eeb885da4b9601ac7bad407675a1393d228eb4a9357ee5a036e328dd13d1b28aa3d74a50e216030acd6a359296ca5a16888d3
7
+ data.tar.gz: ee85846cd5a92e271719632edba3c2415e2263803a4bd74948e6bae8acd93176c6a4daec380d5423ce4670472219a5ed06bb61f047b47a53bf4778e85b9ed459
data/.rubocop.yml ADDED
@@ -0,0 +1,29 @@
1
+ plugins:
2
+ - rubocop-rspec
3
+
4
+ AllCops:
5
+ NewCops: enable
6
+ TargetRubyVersion: 3.2
7
+ Exclude:
8
+ - "tmp/**/*"
9
+ - "vendor/**/*"
10
+
11
+ Style/Documentation:
12
+ Enabled: false
13
+
14
+ Metrics/BlockLength:
15
+ Exclude:
16
+ - "spec/**/*"
17
+ - "*.gemspec"
18
+
19
+ Metrics/MethodLength:
20
+ Max: 15
21
+
22
+ RSpec/MultipleExpectations:
23
+ Enabled: false
24
+
25
+ RSpec/ExampleLength:
26
+ Enabled: false
27
+
28
+ RSpec/SpecFilePathFormat:
29
+ Enabled: false
data/Gemfile ADDED
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
+ source 'https://rubygems.org'
4
+
5
+ ruby '>= 3.2'
6
+
7
+ gemspec
8
+
9
+ group :development, :test do
10
+ gem 'rake-compiler'
11
+ gem 'rb_sys' # provides build tooling when developing locally
12
+ gem 'rspec'
13
+ gem 'rubocop', require: false
14
+ gem 'rubocop-rspec', require: false
15
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,80 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ html-to-markdown (2.6.0)
5
+ rb_sys (>= 0.9, < 1.0)
6
+
7
+ GEM
8
+ remote: https://rubygems.org/
9
+ specs:
10
+ ast (2.4.3)
11
+ diff-lcs (1.6.2)
12
+ json (2.15.2)
13
+ language_server-protocol (3.17.0.5)
14
+ lint_roller (1.1.0)
15
+ parallel (1.27.0)
16
+ parser (3.3.10.0)
17
+ ast (~> 2.4.1)
18
+ racc
19
+ prism (1.6.0)
20
+ racc (1.8.1)
21
+ rainbow (3.1.1)
22
+ rake (13.3.1)
23
+ rake-compiler (1.3.0)
24
+ rake
25
+ rake-compiler-dock (1.9.1)
26
+ rb_sys (0.9.117)
27
+ rake-compiler-dock (= 1.9.1)
28
+ regexp_parser (2.11.3)
29
+ rspec (3.13.2)
30
+ rspec-core (~> 3.13.0)
31
+ rspec-expectations (~> 3.13.0)
32
+ rspec-mocks (~> 3.13.0)
33
+ rspec-core (3.13.6)
34
+ rspec-support (~> 3.13.0)
35
+ rspec-expectations (3.13.5)
36
+ diff-lcs (>= 1.2.0, < 2.0)
37
+ rspec-support (~> 3.13.0)
38
+ rspec-mocks (3.13.7)
39
+ diff-lcs (>= 1.2.0, < 2.0)
40
+ rspec-support (~> 3.13.0)
41
+ rspec-support (3.13.6)
42
+ rubocop (1.81.7)
43
+ json (~> 2.3)
44
+ language_server-protocol (~> 3.17.0.2)
45
+ lint_roller (~> 1.1.0)
46
+ parallel (~> 1.10)
47
+ parser (>= 3.3.0.2)
48
+ rainbow (>= 2.2.2, < 4.0)
49
+ regexp_parser (>= 2.9.3, < 3.0)
50
+ rubocop-ast (>= 1.47.1, < 2.0)
51
+ ruby-progressbar (~> 1.7)
52
+ unicode-display_width (>= 2.4.0, < 4.0)
53
+ rubocop-ast (1.47.1)
54
+ parser (>= 3.3.7.2)
55
+ prism (~> 1.4)
56
+ rubocop-rspec (3.7.0)
57
+ lint_roller (~> 1.1)
58
+ rubocop (~> 1.72, >= 1.72.1)
59
+ ruby-progressbar (1.13.0)
60
+ unicode-display_width (3.2.0)
61
+ unicode-emoji (~> 4.1)
62
+ unicode-emoji (4.1.0)
63
+
64
+ PLATFORMS
65
+ arm64-darwin-24
66
+ ruby
67
+
68
+ DEPENDENCIES
69
+ html-to-markdown!
70
+ rake-compiler
71
+ rb_sys
72
+ rspec
73
+ rubocop
74
+ rubocop-rspec
75
+
76
+ RUBY VERSION
77
+ ruby 3.2.9p248
78
+
79
+ BUNDLED WITH
80
+ 2.5.12
data/README.md CHANGED
@@ -3,9 +3,12 @@
3
3
  Blazing-fast HTML → Markdown conversion for Ruby, powered by the same Rust engine used by our Python, Node.js, and WebAssembly packages. Ship identical Markdown across every runtime while enjoying native extension performance.
4
4
 
5
5
  [![Crates.io](https://img.shields.io/crates/v/html-to-markdown-rs.svg)](https://crates.io/crates/html-to-markdown-rs)
6
- [![npm version](https://badge.fury.io/js/html-to-markdown-node.svg)](https://www.npmjs.com/package/html-to-markdown-node)
7
- [![PyPI version](https://badge.fury.io/py/html-to-markdown.svg)](https://pypi.org/project/html-to-markdown/)
8
- [![Gem Version](https://badge.fury.io/rb/html-to-markdown.svg)](https://rubygems.org/gems/html-to-markdown)
6
+ [![npm (node)](https://badge.fury.io/js/html-to-markdown-node.svg)](https://www.npmjs.com/package/html-to-markdown-node)
7
+ [![npm (wasm)](https://badge.fury.io/js/html-to-markdown-wasm.svg)](https://www.npmjs.com/package/html-to-markdown-wasm)
8
+ [![npm (typescript)](https://badge.fury.io/js/html-to-markdown.svg)](https://www.npmjs.com/package/html-to-markdown)
9
+ [![PyPI](https://badge.fury.io/py/html-to-markdown.svg)](https://pypi.org/project/html-to-markdown/)
10
+ [![Packagist](https://img.shields.io/packagist/v/goldziher/html-to-markdown.svg)](https://packagist.org/packages/goldziher/html-to-markdown)
11
+ [![RubyGems](https://badge.fury.io/rb/html-to-markdown.svg)](https://rubygems.org/gems/html-to-markdown)
9
12
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/Goldziher/html-to-markdown/blob/main/LICENSE)
10
13
 
11
14
  ## Features
@@ -163,7 +166,7 @@ You can also call the CLI binary directly for scripting:
163
166
 
164
167
  ```ruby
165
168
  HtmlToMarkdown::CLIProxy.call(['--version'])
166
- # => "html-to-markdown 2.5.6"
169
+ # => "html-to-markdown 2.5.7"
167
170
  ```
168
171
 
169
172
  Rebuild the CLI locally if you see `CLI binary not built` during tests:
data/Rakefile ADDED
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'bundler/gem_tasks'
4
+ require 'rb_sys/extensiontask'
5
+ require 'rspec/core/rake_task'
6
+
7
+ GEMSPEC = Gem::Specification.load(File.expand_path('html-to-markdown-rb.gemspec', __dir__))
8
+
9
+ RbSys::ExtensionTask.new('html-to-markdown-rb', GEMSPEC) do |ext|
10
+ ext.lib_dir = 'lib'
11
+ ext.ext_dir = 'ext/html-to-markdown-rb'
12
+ ext.cross_compile = true
13
+ ext.cross_platform = %w[
14
+ x86_64-linux
15
+ x86_64-darwin
16
+ arm64-darwin
17
+ x64-mingw32
18
+ ]
19
+ end
20
+
21
+ RSpec::Core::RakeTask.new(:spec)
22
+
23
+ task spec: :compile
24
+ task default: :spec
@@ -24,4 +24,5 @@ default_profile = ENV.fetch('CARGO_PROFILE', 'release')
24
24
 
25
25
  create_rust_makefile('html_to_markdown_rb') do |config|
26
26
  config.profile = default_profile.to_sym
27
+ config.ext_dir = '../../../../crates/html-to-markdown-rb'
27
28
  end
@@ -0,0 +1,59 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'lib/html_to_markdown/version'
4
+
5
+ repo_root = File.expand_path('../..', __dir__)
6
+ crate_prefix = 'packages/ruby/'
7
+ git_cmd = %(git -C "#{repo_root}" ls-files -z #{crate_prefix})
8
+ git_files =
9
+ `#{git_cmd}`.split("\x0")
10
+ .select { |path| path.start_with?(crate_prefix) }
11
+ .map { |path| path.delete_prefix(crate_prefix) }
12
+ fallback_files = Dir.chdir(__dir__) do
13
+ Dir.glob(
14
+ %w[
15
+ README.md
16
+ ext/html-to-markdown-rb/extconf.rb
17
+ exe/*
18
+ lib/**/*.rb
19
+ lib/bin/*
20
+ src/**/*.rs
21
+ spec/**/*.rb
22
+ ]
23
+ )
24
+ end
25
+ files = git_files.empty? ? fallback_files : git_files
26
+
27
+ Gem::Specification.new do |spec|
28
+ spec.name = 'html-to-markdown'
29
+ spec.version = HtmlToMarkdown::VERSION
30
+ spec.authors = ["Na'aman Hirschfeld"]
31
+ spec.email = ['nhirschfeld@gmail.com']
32
+
33
+ spec.summary = 'Blazing-fast HTML to Markdown conversion for Ruby, powered by Rust.'
34
+ spec.description = <<~DESC.strip
35
+ html-to-markdown is a native Ruby extension built on the shared Rust engine that powers the html-to-markdown project.
36
+ It delivers identical HTML-to-Markdown output across languages, exposes inline image extraction, and ships with a CLI for automation workflows.
37
+ DESC
38
+ spec.homepage = 'https://github.com/Goldziher/html-to-markdown'
39
+ spec.license = 'MIT'
40
+
41
+ spec.required_ruby_version = Gem::Requirement.new('>= 3.2')
42
+
43
+ spec.bindir = 'exe'
44
+ spec.executables = ['html-to-markdown']
45
+ spec.require_paths = ['lib']
46
+
47
+ spec.files = files
48
+ spec.extra_rdoc_files = ['README.md']
49
+
50
+ spec.extensions = ['ext/html-to-markdown-rb/extconf.rb']
51
+
52
+ spec.add_dependency 'rb_sys', '>= 0.9', '< 1.0'
53
+ spec.metadata['rubygems_mfa_required'] = 'true'
54
+ spec.metadata['homepage_uri'] = 'https://github.com/Goldziher/html-to-markdown'
55
+ spec.metadata['source_code_uri'] = 'https://github.com/Goldziher/html-to-markdown'
56
+ spec.metadata['bug_tracker_uri'] = 'https://github.com/Goldziher/html-to-markdown/issues'
57
+ spec.metadata['changelog_uri'] = 'https://github.com/Goldziher/html-to-markdown/releases'
58
+ spec.metadata['documentation_uri'] = 'https://github.com/Goldziher/html-to-markdown/blob/main/packages/ruby/README.md'
59
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HtmlToMarkdown
4
- VERSION = '2.5.6'
4
+ VERSION = '2.6.0'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html-to-markdown
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.5.6
4
+ version: 2.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-10-29 00:00:00.000000000 Z
11
+ date: 2025-11-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -30,95 +30,26 @@ dependencies:
30
30
  - - "<"
31
31
  - !ruby/object:Gem::Version
32
32
  version: '1.0'
33
- description: "# html-to-markdown-rb\n\nBlazing-fast HTML → Markdown conversion for
34
- Ruby, powered by the same Rust engine used by our Python, Node.js, and WebAssembly
35
- packages. Ship identical Markdown across every runtime while enjoying native extension
36
- performance.\n\n[![Crates.io](https://img.shields.io/crates/v/html-to-markdown-rs.svg)](https://crates.io/crates/html-to-markdown-rs)\n[![npm
37
- version](https://badge.fury.io/js/html-to-markdown-node.svg)](https://www.npmjs.com/package/html-to-markdown-node)\n[![PyPI
38
- version](https://badge.fury.io/py/html-to-markdown.svg)](https://pypi.org/project/html-to-markdown/)\n[![Gem
39
- Version](https://badge.fury.io/rb/html-to-markdown.svg)](https://rubygems.org/gems/html-to-markdown)\n[![License:
40
- MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/Goldziher/html-to-markdown/blob/main/LICENSE)\n\n##
41
- Features\n\n- ⚡ **Rust-fast**: Ruby bindings around a highly optimised Rust core
42
- (60‑80× faster than BeautifulSoup-based converters).\n- \U0001F501 **Identical output**:
43
- Shares logic with the Python wheels, npm bindings, WASM package, and CLI — consistent
44
- Markdown everywhere.\n- ⚙️ **Rich configuration**: Control heading styles, list
45
- indentation, whitespace handling, HTML preprocessing, and more.\n- \U0001F5BC️ **Inline
46
- image extraction**: Pull out embedded images (PNG/JPEG/SVG/data URIs) alongside
47
- Markdown.\n- \U0001F9F0 **Bundled CLI proxy**: Call the Rust CLI straight from Ruby
48
- or shell scripts.\n- \U0001F6E0️ **First-class Rails support**: Works with `Gem.win_platform?`
49
- builds, supports Trusted Publishing, and compiles on install if no native gem matches.\n\n##
50
- Documentation & Support\n\n- [GitHub repository](https://github.com/Goldziher/html-to-markdown)\n-
51
- [Issue tracker](https://github.com/Goldziher/html-to-markdown/issues)\n- [Changelog](https://github.com/Goldziher/html-to-markdown/blob/main/CHANGELOG.md)\n-
52
- [Live demo (WASM)](https://goldziher.github.io/html-to-markdown/)\n\n## Installation\n\n```bash\nbundle
53
- add html-to-markdown\n# or\ngem install html-to-markdown\n```\n\nAdd the gem to
54
- your project and Bundler will compile the native Rust extension on first install.\n\n###
55
- Requirements\n\n- Ruby **3.2+** (Magnus relies on the fiber scheduler APIs added
56
- in 3.2)\n- Rust toolchain **1.85+** with Cargo available on your `$PATH`\n- Ruby
57
- development headers (`ruby-dev`, `ruby-devel`, or the platform equivalent)\n\n**Windows**:
58
- install [RubyInstaller with MSYS2](https://rubyinstaller.org/) (UCRT64). Run once:\n\n```powershell\nridk
59
- exec pacman -S --needed --noconfirm base-devel mingw-w64-ucrt-x86_64-toolchain\n```\n\nThis
60
- provides the standard headers (including `strings.h`) required for the bindgen step.\n\n##
61
- Performance Snapshot\n\nApple M4 • Real Wikipedia documents • `HtmlToMarkdown.convert`
62
- (Ruby)\n\n| Document | Size | Latency | Throughput | Docs/sec |\n| -------------------
63
- | ----- | ------- | ---------- | -------- |\n| Lists (Timeline) | 129KB | 0.69ms
64
- \ | 187 MB/s | 1,450 |\n| Tables (Countries) | 360KB | 2.19ms | 164 MB/s
65
- \ | 456 |\n| Mixed (Python wiki) | 656KB | 4.88ms | 134 MB/s | 205 |\n\n>
66
- Same core, same benchmarks: the Ruby extension stays within single-digit % of the
67
- Rust CLI and mirrors the Python/Node numbers.\n\n## Quick Start\n\n```ruby\nrequire
68
- 'html_to_markdown'\n\nhtml = <<~HTML\n <h1>Welcome</h1>\n <p>This is <strong>Rust-fast</strong>
69
- conversion!</p>\n <ul>\n <li>Native extension</li>\n <li>Identical output
70
- across languages</li>\n </ul>\nHTML\n\nmarkdown = HtmlToMarkdown.convert(html)\nputs
71
- markdown\n# # Welcome\n#\n# This is **Rust-fast** conversion!\n#\n# - Native extension\n#
72
- - Identical output across languages\n```\n\n## API\n\n### Conversion Options\n\nPass
73
- a Ruby hash (string or symbol keys) to tweak rendering. Every option maps one-for-one
74
- with the Rust/Python/Node APIs.\n\n```ruby\nrequire 'html_to_markdown'\n\nmarkdown
75
- = HtmlToMarkdown.convert(\n '<pre><code class=\"language-ruby\">puts \"hi\"</code></pre>',\n
76
- \ heading_style: :atx,\n code_block_style: :fenced,\n bullets: '*+-',\n list_indent_type:
77
- :spaces,\n list_indent_width: 2,\n whitespace_mode: :normalized,\n highlight_style:
78
- :double_equal\n)\n\nputs markdown\n```\n\n### HTML Preprocessing\n\nClean up scraped
79
- HTML (navigation, forms, malformed markup) before conversion:\n\n```ruby\nrequire
80
- 'html_to_markdown'\n\nmarkdown = HtmlToMarkdown.convert(\n html,\n preprocessing:
81
- {\n enabled: true,\n preset: :aggressive, # :minimal, :standard, :aggressive\n
82
- \ remove_navigation: true,\n remove_forms: true\n }\n)\n```\n\n### Inline
83
- Images\n\nExtract inline binary data (data URIs, SVG) together with the converted
84
- Markdown.\n\n```ruby\nrequire 'html_to_markdown'\n\nresult = HtmlToMarkdown.convert_with_inline_images(\n
85
- \ '<img src=\"...\" alt=\"Pixel\">',\n image_config:
86
- {\n max_decoded_size_bytes: 1 * 1024 * 1024,\n infer_dimensions: true,\n filename_prefix:
87
- 'img_',\n capture_svg: true\n }\n)\n\nputs result.markdown\nresult.inline_images.each
88
- do |img|\n puts \"#{img.filename} -> #{img.format} (#{img.data.bytesize} bytes)\"\nend\n```\n\n##
89
- CLI\n\nThe gem bundles a small proxy for the Rust CLI binary. Use it when you need
90
- parity with the standalone `html-to-markdown` executable.\n\n```ruby\nrequire 'html_to_markdown/cli'\n\nHtmlToMarkdown::CLI.run(%w[--heading-style
91
- atx input.html], stdout: $stdout)\n# => writes converted Markdown to STDOUT\n```\n\nYou
92
- can also call the CLI binary directly for scripting:\n\n```ruby\nHtmlToMarkdown::CLIProxy.call(['--version'])\n#
93
- => \"html-to-markdown 2.5.6\"\n```\n\nRebuild the CLI locally if you see `CLI binary
94
- not built` during tests:\n\n```bash\nbundle exec rake compile # builds
95
- the extension\nbundle exec ruby scripts/prepare_ruby_gem.rb # copies the CLI into
96
- lib/bin/\n```\n\n## Error Handling\n\nConversion errors raise `HtmlToMarkdown::Error`
97
- (wrapping the Rust error context). CLI invocations use specialised subclasses:\n\n-
98
- `HtmlToMarkdown::CLIProxy::MissingBinaryError`\n- `HtmlToMarkdown::CLIProxy::CLIExecutionError`\n\nRescue
99
- them to provide clearer feedback in your application.\n\n## Consistent Across Languages\n\nThe
100
- Ruby gem shares the exact Rust core with:\n\n- [Python wheels](https://pypi.org/project/html-to-markdown/)\n-
101
- [Node.js / Bun bindings](https://www.npmjs.com/package/html-to-markdown-node)\n-
102
- [WebAssembly package](https://www.npmjs.com/package/html-to-markdown-wasm)\n- The
103
- Rust crate and CLI\n\nUse whichever runtime fits your stack while keeping formatting
104
- behaviour identical.\n\n## Development\n\n```bash\nbundle exec rake compile #
105
- build the native extension\nbundle exec rspec # run test suite\n```\n\nThe
106
- extension uses [Magnus](https://github.com/matsadler/magnus) plus `rb-sys` for bindgen.
107
- When editing the Rust code under `src/`, rerun `rake compile`.\n\n## License\n\nMIT
108
- © Na'aman Hirschfeld\n"
33
+ description: |-
34
+ html-to-markdown is a native Ruby extension built on the shared Rust engine that powers the html-to-markdown project.
35
+ It delivers identical HTML-to-Markdown output across languages, exposes inline image extraction, and ships with a CLI for automation workflows.
109
36
  email:
110
37
  - nhirschfeld@gmail.com
111
38
  executables:
112
39
  - html-to-markdown
113
40
  extensions:
114
- - extconf.rb
41
+ - ext/html-to-markdown-rb/extconf.rb
115
42
  extra_rdoc_files:
116
43
  - README.md
117
44
  files:
118
- - Cargo.toml
45
+ - ".rubocop.yml"
46
+ - Gemfile
47
+ - Gemfile.lock
119
48
  - README.md
49
+ - Rakefile
120
50
  - exe/html-to-markdown
121
- - extconf.rb
51
+ - ext/html-to-markdown-rb/extconf.rb
52
+ - html-to-markdown-rb.gemspec
122
53
  - lib/html_to_markdown.rb
123
54
  - lib/html_to_markdown/cli.rb
124
55
  - lib/html_to_markdown/cli_proxy.rb
@@ -126,7 +57,6 @@ files:
126
57
  - spec/cli_proxy_spec.rb
127
58
  - spec/convert_spec.rb
128
59
  - spec/spec_helper.rb
129
- - src/lib.rs
130
60
  homepage: https://github.com/Goldziher/html-to-markdown
131
61
  licenses:
132
62
  - MIT
@@ -136,7 +66,7 @@ metadata:
136
66
  source_code_uri: https://github.com/Goldziher/html-to-markdown
137
67
  bug_tracker_uri: https://github.com/Goldziher/html-to-markdown/issues
138
68
  changelog_uri: https://github.com/Goldziher/html-to-markdown/releases
139
- documentation_uri: https://github.com/Goldziher/html-to-markdown/blob/main/crates/html-to-markdown-rb/README.md
69
+ documentation_uri: https://github.com/Goldziher/html-to-markdown/blob/main/packages/ruby/README.md
140
70
  post_install_message:
141
71
  rdoc_options: []
142
72
  require_paths:
data/Cargo.toml DELETED
@@ -1,28 +0,0 @@
1
- [package]
2
- name = "html-to-markdown-rb"
3
- version = "2.5.6"
4
- edition = "2021"
5
- authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
6
- license = "MIT"
7
- repository = "https://github.com/Goldziher/html-to-markdown"
8
- homepage = "https://github.com/Goldziher/html-to-markdown"
9
- documentation = "https://docs.rs/html-to-markdown-rs"
10
- readme = "README.md"
11
- rust-version = "1.80"
12
- description = "Ruby bindings (Magnus) for html-to-markdown - high-performance HTML to Markdown converter"
13
- keywords = ["html", "markdown", "ruby", "magnus", "bindings"]
14
- categories = ["api-bindings"]
15
-
16
- [lib]
17
- name = "html_to_markdown_rb"
18
- crate-type = ["cdylib", "rlib"]
19
-
20
- [features]
21
- default = []
22
-
23
- [dependencies]
24
- html-to-markdown-rs = { version = "2.5.6", features = ["inline-images"] }
25
- magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = ["rb-sys"] }
26
-
27
- [dev-dependencies]
28
- pretty_assertions = "1.4"
data/src/lib.rs DELETED
@@ -1,432 +0,0 @@
1
- use html_to_markdown_rs::{
2
- convert as convert_inner, convert_with_inline_images as convert_with_inline_images_inner, error::ConversionError,
3
- CodeBlockStyle, ConversionOptions, HeadingStyle, HighlightStyle, HtmlExtraction, InlineImage, InlineImageConfig,
4
- InlineImageFormat, InlineImageSource, InlineImageWarning, ListIndentType, NewlineStyle, PreprocessingOptions,
5
- PreprocessingPreset, WhitespaceMode,
6
- };
7
- use magnus::prelude::*;
8
- use magnus::{function, scan_args::scan_args, Error, RArray, RHash, Ruby, Symbol, TryConvert, Value};
9
-
10
- const DEFAULT_INLINE_IMAGE_LIMIT: u64 = 5 * 1024 * 1024;
11
-
12
- fn conversion_error(err: ConversionError) -> Error {
13
- match err {
14
- ConversionError::ConfigError(msg) => arg_error(msg),
15
- other => runtime_error(other.to_string()),
16
- }
17
- }
18
-
19
- fn arg_error(message: impl Into<String>) -> Error {
20
- let ruby = Ruby::get().expect("Ruby not initialised");
21
- Error::new(ruby.exception_arg_error(), message.into())
22
- }
23
-
24
- fn runtime_error(message: impl Into<String>) -> Error {
25
- let ruby = Ruby::get().expect("Ruby not initialised");
26
- Error::new(ruby.exception_runtime_error(), message.into())
27
- }
28
-
29
- fn symbol_to_string(value: Value) -> Result<String, Error> {
30
- if let Some(symbol) = Symbol::from_value(value) {
31
- Ok(symbol.name()?.to_string())
32
- } else {
33
- String::try_convert(value)
34
- }
35
- }
36
-
37
- fn get_kw(ruby: &Ruby, hash: RHash, name: &str) -> Option<Value> {
38
- let sym = ruby.intern(name);
39
- hash.get(sym).or_else(|| hash.get(name))
40
- }
41
-
42
- fn parse_heading_style(value: Value) -> Result<HeadingStyle, Error> {
43
- match symbol_to_string(value)?.as_str() {
44
- "underlined" => Ok(HeadingStyle::Underlined),
45
- "atx" => Ok(HeadingStyle::Atx),
46
- "atx_closed" => Ok(HeadingStyle::AtxClosed),
47
- other => Err(arg_error(format!("invalid heading_style: {other}"))),
48
- }
49
- }
50
-
51
- fn parse_list_indent_type(value: Value) -> Result<ListIndentType, Error> {
52
- match symbol_to_string(value)?.as_str() {
53
- "spaces" => Ok(ListIndentType::Spaces),
54
- "tabs" => Ok(ListIndentType::Tabs),
55
- other => Err(arg_error(format!("invalid list_indent_type: {other}"))),
56
- }
57
- }
58
-
59
- fn parse_highlight_style(value: Value) -> Result<HighlightStyle, Error> {
60
- match symbol_to_string(value)?.as_str() {
61
- "double_equal" => Ok(HighlightStyle::DoubleEqual),
62
- "html" => Ok(HighlightStyle::Html),
63
- "bold" => Ok(HighlightStyle::Bold),
64
- "none" => Ok(HighlightStyle::None),
65
- other => Err(arg_error(format!("invalid highlight_style: {other}"))),
66
- }
67
- }
68
-
69
- fn parse_whitespace_mode(value: Value) -> Result<WhitespaceMode, Error> {
70
- match symbol_to_string(value)?.as_str() {
71
- "normalized" => Ok(WhitespaceMode::Normalized),
72
- "strict" => Ok(WhitespaceMode::Strict),
73
- other => Err(arg_error(format!("invalid whitespace_mode: {other}"))),
74
- }
75
- }
76
-
77
- fn parse_newline_style(value: Value) -> Result<NewlineStyle, Error> {
78
- match symbol_to_string(value)?.as_str() {
79
- "spaces" => Ok(NewlineStyle::Spaces),
80
- "backslash" => Ok(NewlineStyle::Backslash),
81
- other => Err(arg_error(format!("invalid newline_style: {other}"))),
82
- }
83
- }
84
-
85
- fn parse_code_block_style(value: Value) -> Result<CodeBlockStyle, Error> {
86
- match symbol_to_string(value)?.as_str() {
87
- "indented" => Ok(CodeBlockStyle::Indented),
88
- "backticks" => Ok(CodeBlockStyle::Backticks),
89
- "tildes" => Ok(CodeBlockStyle::Tildes),
90
- other => Err(arg_error(format!("invalid code_block_style: {other}"))),
91
- }
92
- }
93
-
94
- fn parse_preset(value: Value) -> Result<PreprocessingPreset, Error> {
95
- match symbol_to_string(value)?.as_str() {
96
- "minimal" => Ok(PreprocessingPreset::Minimal),
97
- "standard" => Ok(PreprocessingPreset::Standard),
98
- "aggressive" => Ok(PreprocessingPreset::Aggressive),
99
- other => Err(arg_error(format!("invalid preprocessing preset: {other}"))),
100
- }
101
- }
102
-
103
- fn parse_vec_of_strings(value: Value) -> Result<Vec<String>, Error> {
104
- let array = RArray::from_value(value).ok_or_else(|| arg_error("expected an Array of strings"))?;
105
-
106
- array.to_vec::<String>()
107
- }
108
-
109
- fn parse_preprocessing_options(ruby: &Ruby, value: Value) -> Result<PreprocessingOptions, Error> {
110
- let hash = RHash::from_value(value).ok_or_else(|| arg_error("expected preprocessing to be a Hash"))?;
111
-
112
- let mut opts = PreprocessingOptions::default();
113
-
114
- if let Some(enabled) = get_kw(ruby, hash, "enabled") {
115
- opts.enabled = bool::try_convert(enabled)?;
116
- }
117
-
118
- if let Some(preset) = get_kw(ruby, hash, "preset") {
119
- opts.preset = parse_preset(preset)?;
120
- }
121
-
122
- if let Some(remove_navigation) = get_kw(ruby, hash, "remove_navigation") {
123
- opts.remove_navigation = bool::try_convert(remove_navigation)?;
124
- }
125
-
126
- if let Some(remove_forms) = get_kw(ruby, hash, "remove_forms") {
127
- opts.remove_forms = bool::try_convert(remove_forms)?;
128
- }
129
-
130
- Ok(opts)
131
- }
132
-
133
- fn build_conversion_options(ruby: &Ruby, options: Option<Value>) -> Result<ConversionOptions, Error> {
134
- let mut opts = ConversionOptions::default();
135
-
136
- let Some(options) = options else {
137
- return Ok(opts);
138
- };
139
-
140
- if options.is_nil() {
141
- return Ok(opts);
142
- }
143
-
144
- let hash = RHash::from_value(options).ok_or_else(|| arg_error("options must be provided as a Hash"))?;
145
-
146
- if let Some(heading_style) = get_kw(ruby, hash, "heading_style") {
147
- opts.heading_style = parse_heading_style(heading_style)?;
148
- }
149
-
150
- if let Some(list_indent_type) = get_kw(ruby, hash, "list_indent_type") {
151
- opts.list_indent_type = parse_list_indent_type(list_indent_type)?;
152
- }
153
-
154
- if let Some(list_indent_width) = get_kw(ruby, hash, "list_indent_width") {
155
- opts.list_indent_width = usize::try_convert(list_indent_width)?;
156
- }
157
-
158
- if let Some(bullets) = get_kw(ruby, hash, "bullets") {
159
- opts.bullets = String::try_convert(bullets)?;
160
- }
161
-
162
- if let Some(strong_em_symbol) = get_kw(ruby, hash, "strong_em_symbol") {
163
- let value = String::try_convert(strong_em_symbol)?;
164
- let mut chars = value.chars();
165
- let ch = chars
166
- .next()
167
- .ok_or_else(|| arg_error("strong_em_symbol must not be empty"))?;
168
- if chars.next().is_some() {
169
- return Err(arg_error("strong_em_symbol must be a single character"));
170
- }
171
- opts.strong_em_symbol = ch;
172
- }
173
-
174
- if let Some(escape_asterisks) = get_kw(ruby, hash, "escape_asterisks") {
175
- opts.escape_asterisks = bool::try_convert(escape_asterisks)?;
176
- }
177
-
178
- if let Some(escape_underscores) = get_kw(ruby, hash, "escape_underscores") {
179
- opts.escape_underscores = bool::try_convert(escape_underscores)?;
180
- }
181
-
182
- if let Some(escape_misc) = get_kw(ruby, hash, "escape_misc") {
183
- opts.escape_misc = bool::try_convert(escape_misc)?;
184
- }
185
-
186
- if let Some(escape_ascii) = get_kw(ruby, hash, "escape_ascii") {
187
- opts.escape_ascii = bool::try_convert(escape_ascii)?;
188
- }
189
-
190
- if let Some(code_language) = get_kw(ruby, hash, "code_language") {
191
- opts.code_language = String::try_convert(code_language)?;
192
- }
193
-
194
- if let Some(autolinks) = get_kw(ruby, hash, "autolinks") {
195
- opts.autolinks = bool::try_convert(autolinks)?;
196
- }
197
-
198
- if let Some(default_title) = get_kw(ruby, hash, "default_title") {
199
- opts.default_title = bool::try_convert(default_title)?;
200
- }
201
-
202
- if let Some(br_in_tables) = get_kw(ruby, hash, "br_in_tables") {
203
- opts.br_in_tables = bool::try_convert(br_in_tables)?;
204
- }
205
-
206
- if let Some(hocr_spatial_tables) = get_kw(ruby, hash, "hocr_spatial_tables") {
207
- opts.hocr_spatial_tables = bool::try_convert(hocr_spatial_tables)?;
208
- }
209
-
210
- if let Some(highlight_style) = get_kw(ruby, hash, "highlight_style") {
211
- opts.highlight_style = parse_highlight_style(highlight_style)?;
212
- }
213
-
214
- if let Some(extract_metadata) = get_kw(ruby, hash, "extract_metadata") {
215
- opts.extract_metadata = bool::try_convert(extract_metadata)?;
216
- }
217
-
218
- if let Some(whitespace_mode) = get_kw(ruby, hash, "whitespace_mode") {
219
- opts.whitespace_mode = parse_whitespace_mode(whitespace_mode)?;
220
- }
221
-
222
- if let Some(strip_newlines) = get_kw(ruby, hash, "strip_newlines") {
223
- opts.strip_newlines = bool::try_convert(strip_newlines)?;
224
- }
225
-
226
- if let Some(wrap) = get_kw(ruby, hash, "wrap") {
227
- opts.wrap = bool::try_convert(wrap)?;
228
- }
229
-
230
- if let Some(wrap_width) = get_kw(ruby, hash, "wrap_width") {
231
- opts.wrap_width = usize::try_convert(wrap_width)?;
232
- }
233
-
234
- if let Some(convert_as_inline) = get_kw(ruby, hash, "convert_as_inline") {
235
- opts.convert_as_inline = bool::try_convert(convert_as_inline)?;
236
- }
237
-
238
- if let Some(sub_symbol) = get_kw(ruby, hash, "sub_symbol") {
239
- opts.sub_symbol = String::try_convert(sub_symbol)?;
240
- }
241
-
242
- if let Some(sup_symbol) = get_kw(ruby, hash, "sup_symbol") {
243
- opts.sup_symbol = String::try_convert(sup_symbol)?;
244
- }
245
-
246
- if let Some(newline_style) = get_kw(ruby, hash, "newline_style") {
247
- opts.newline_style = parse_newline_style(newline_style)?;
248
- }
249
-
250
- if let Some(code_block_style) = get_kw(ruby, hash, "code_block_style") {
251
- opts.code_block_style = parse_code_block_style(code_block_style)?;
252
- }
253
-
254
- if let Some(keep_inline_images_in) = get_kw(ruby, hash, "keep_inline_images_in") {
255
- opts.keep_inline_images_in = parse_vec_of_strings(keep_inline_images_in)?;
256
- }
257
-
258
- if let Some(preprocessing) = get_kw(ruby, hash, "preprocessing") {
259
- opts.preprocessing = parse_preprocessing_options(ruby, preprocessing)?;
260
- }
261
-
262
- if let Some(encoding) = get_kw(ruby, hash, "encoding") {
263
- opts.encoding = String::try_convert(encoding)?;
264
- }
265
-
266
- if let Some(debug) = get_kw(ruby, hash, "debug") {
267
- opts.debug = bool::try_convert(debug)?;
268
- }
269
-
270
- if let Some(strip_tags) = get_kw(ruby, hash, "strip_tags") {
271
- opts.strip_tags = parse_vec_of_strings(strip_tags)?;
272
- }
273
-
274
- if let Some(preserve_tags) = get_kw(ruby, hash, "preserve_tags") {
275
- opts.preserve_tags = parse_vec_of_strings(preserve_tags)?;
276
- }
277
-
278
- Ok(opts)
279
- }
280
-
281
- fn build_inline_image_config(ruby: &Ruby, config: Option<Value>) -> Result<InlineImageConfig, Error> {
282
- let mut cfg = InlineImageConfig::new(DEFAULT_INLINE_IMAGE_LIMIT);
283
-
284
- let Some(config) = config else {
285
- return Ok(cfg);
286
- };
287
-
288
- if config.is_nil() {
289
- return Ok(cfg);
290
- }
291
-
292
- let hash = RHash::from_value(config).ok_or_else(|| arg_error("inline image config must be provided as a Hash"))?;
293
-
294
- if let Some(limit) = get_kw(ruby, hash, "max_decoded_size_bytes") {
295
- cfg.max_decoded_size_bytes = u64::try_convert(limit)?;
296
- }
297
-
298
- if let Some(prefix) = get_kw(ruby, hash, "filename_prefix") {
299
- cfg.filename_prefix = if prefix.is_nil() {
300
- None
301
- } else {
302
- Some(String::try_convert(prefix)?)
303
- };
304
- }
305
-
306
- if let Some(capture_svg) = get_kw(ruby, hash, "capture_svg") {
307
- cfg.capture_svg = bool::try_convert(capture_svg)?;
308
- }
309
-
310
- if let Some(infer_dimensions) = get_kw(ruby, hash, "infer_dimensions") {
311
- cfg.infer_dimensions = bool::try_convert(infer_dimensions)?;
312
- }
313
-
314
- Ok(cfg)
315
- }
316
-
317
- fn inline_image_to_value(ruby: &Ruby, image: InlineImage) -> Result<Value, Error> {
318
- let InlineImage {
319
- data,
320
- format,
321
- filename,
322
- description,
323
- dimensions,
324
- source,
325
- attributes,
326
- } = image;
327
-
328
- let hash = ruby.hash_new();
329
- let data_value = ruby.str_from_slice(&data);
330
- hash.aset(ruby.intern("data"), data_value)?;
331
-
332
- let format_value = match format {
333
- InlineImageFormat::Png => "png".to_string(),
334
- InlineImageFormat::Jpeg => "jpeg".to_string(),
335
- InlineImageFormat::Gif => "gif".to_string(),
336
- InlineImageFormat::Bmp => "bmp".to_string(),
337
- InlineImageFormat::Webp => "webp".to_string(),
338
- InlineImageFormat::Svg => "svg".to_string(),
339
- InlineImageFormat::Other(other) => other,
340
- };
341
- hash.aset(ruby.intern("format"), format_value)?;
342
-
343
- match filename {
344
- Some(name) => hash.aset(ruby.intern("filename"), name)?,
345
- None => hash.aset(ruby.intern("filename"), ruby.qnil())?,
346
- }
347
-
348
- match description {
349
- Some(desc) => hash.aset(ruby.intern("description"), desc)?,
350
- None => hash.aset(ruby.intern("description"), ruby.qnil())?,
351
- }
352
-
353
- if let Some((width, height)) = dimensions {
354
- let dims = ruby.ary_new();
355
- dims.push(width as i64)?;
356
- dims.push(height as i64)?;
357
- hash.aset(ruby.intern("dimensions"), dims)?;
358
- } else {
359
- hash.aset(ruby.intern("dimensions"), ruby.qnil())?;
360
- }
361
-
362
- let source_value = match source {
363
- InlineImageSource::ImgDataUri => "img_data_uri",
364
- InlineImageSource::SvgElement => "svg_element",
365
- };
366
- hash.aset(ruby.intern("source"), source_value)?;
367
-
368
- let attrs = ruby.hash_new();
369
- for (key, value) in attributes {
370
- attrs.aset(key, value)?;
371
- }
372
- hash.aset(ruby.intern("attributes"), attrs)?;
373
-
374
- Ok(hash.as_value())
375
- }
376
-
377
- fn warning_to_value(ruby: &Ruby, warning: InlineImageWarning) -> Result<Value, Error> {
378
- let hash = ruby.hash_new();
379
- hash.aset(ruby.intern("index"), warning.index as i64)?;
380
- hash.aset(ruby.intern("message"), warning.message)?;
381
- Ok(hash.as_value())
382
- }
383
-
384
- fn extraction_to_value(ruby: &Ruby, extraction: HtmlExtraction) -> Result<Value, Error> {
385
- let hash = ruby.hash_new();
386
- hash.aset(ruby.intern("markdown"), extraction.markdown)?;
387
-
388
- let inline_images = ruby.ary_new();
389
- for image in extraction.inline_images {
390
- inline_images.push(inline_image_to_value(ruby, image)?)?;
391
- }
392
- hash.aset(ruby.intern("inline_images"), inline_images)?;
393
-
394
- let warnings = ruby.ary_new();
395
- for warning in extraction.warnings {
396
- warnings.push(warning_to_value(ruby, warning)?)?;
397
- }
398
- hash.aset(ruby.intern("warnings"), warnings)?;
399
-
400
- Ok(hash.as_value())
401
- }
402
-
403
- fn convert_fn(ruby: &Ruby, args: &[Value]) -> Result<String, Error> {
404
- let parsed = scan_args::<(String,), (Option<Value>,), (), (), (), ()>(args)?;
405
- let html = parsed.required.0;
406
- let options = build_conversion_options(ruby, parsed.optional.0)?;
407
-
408
- convert_inner(&html, Some(options)).map_err(conversion_error)
409
- }
410
-
411
- fn convert_with_inline_images_fn(ruby: &Ruby, args: &[Value]) -> Result<Value, Error> {
412
- let parsed = scan_args::<(String,), (Option<Value>, Option<Value>), (), (), (), ()>(args)?;
413
- let html = parsed.required.0;
414
- let options = build_conversion_options(ruby, parsed.optional.0)?;
415
- let config = build_inline_image_config(ruby, parsed.optional.1)?;
416
-
417
- let extraction = convert_with_inline_images_inner(&html, Some(options), config).map_err(conversion_error)?;
418
-
419
- extraction_to_value(ruby, extraction)
420
- }
421
-
422
- #[magnus::init]
423
- fn init(ruby: &Ruby) -> Result<(), Error> {
424
- let module = ruby.define_module("HtmlToMarkdown")?;
425
- module.define_singleton_method("convert", function!(convert_fn, -1))?;
426
- module.define_singleton_method(
427
- "convert_with_inline_images",
428
- function!(convert_with_inline_images_fn, -1),
429
- )?;
430
-
431
- Ok(())
432
- }