html-to-markdown 3.1.0 → 3.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +42 -12
  3. data/Gemfile +1 -0
  4. data/Gemfile.lock +27 -55
  5. data/README.md +9 -10
  6. data/Rakefile +4 -10
  7. data/ext/html-to-markdown_rb/Cargo.toml +14 -0
  8. data/ext/html_to_markdown_rb/Cargo.toml +16 -0
  9. data/ext/html_to_markdown_rb/extconf.rb +10 -0
  10. data/ext/html_to_markdown_rb/src/html_to_markdown_rs/version.rb +6 -0
  11. data/ext/html_to_markdown_rb/src/html_to_markdown_rs.rb +9 -0
  12. data/ext/html_to_markdown_rb/src/lib.rs +3941 -0
  13. data/html-to-markdown-rb.gemspec +1 -1
  14. data/lib/html_to_markdown/version.rb +1 -1
  15. data/lib/html_to_markdown.rb +31 -21
  16. data/{ext/html-to-markdown-rb/native/extconf.rb → lib/html_to_markdown_rs.rb} +1 -1
  17. data/sig/html_to_markdown.rbs +17 -5
  18. data/vendor/Cargo.toml +4 -4
  19. data/vendor/html-to-markdown-rs/Cargo.toml +2 -2
  20. data/vendor/html-to-markdown-rs/examples/test_deser.rs +12 -0
  21. data/vendor/html-to-markdown-rs/src/converter/block/mod.rs +1 -1
  22. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +1 -1
  23. data/vendor/html-to-markdown-rs/src/converter/form/mod.rs +1 -1
  24. data/vendor/html-to-markdown-rs/src/converter/inline/mod.rs +1 -1
  25. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +10 -2
  26. data/vendor/html-to-markdown-rs/src/converter/mod.rs +2 -2
  27. data/vendor/html-to-markdown-rs/src/converter/semantic/mod.rs +1 -1
  28. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +1 -1
  29. data/vendor/html-to-markdown-rs/src/exports.rs +3 -3
  30. data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
  31. data/vendor/html-to-markdown-rs/src/lib.rs +1 -2
  32. data/vendor/html-to-markdown-rs/src/metadata/config.rs +1 -1
  33. data/vendor/html-to-markdown-rs/src/metadata/mod.rs +5 -5
  34. data/vendor/html-to-markdown-rs/src/options/conversion.rs +6 -12
  35. data/vendor/html-to-markdown-rs/src/options/mod.rs +1 -1
  36. data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +3 -9
  37. data/vendor/html-to-markdown-rs/src/options/validation.rs +3 -3
  38. data/vendor/html-to-markdown-rs/src/types/document.rs +11 -0
  39. data/vendor/html-to-markdown-rs/src/types/result.rs +5 -2
  40. data/vendor/html-to-markdown-rs/src/types/tables.rs +1 -1
  41. data/vendor/html-to-markdown-rs/src/visitor/mod.rs +1 -1
  42. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/state.rs +1 -1
  43. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/traversal.rs +1 -1
  44. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +8 -8
  45. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +6 -0
  46. data/vendor/html-to-markdown-rs/tests/integration_test.rs +3 -3
  47. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -2
  48. data/vendor/html-to-markdown-rs/tests/lists_test.rs +4 -4
  49. metadata +11 -18
  50. data/ext/html-to-markdown-rb/extconf.rb +0 -41
  51. data/ext/html-to-markdown-rb/native/Cargo.lock +0 -934
  52. data/ext/html-to-markdown-rb/native/Cargo.toml +0 -48
  53. data/ext/html-to-markdown-rb/native/README.md +0 -215
  54. data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +0 -54
  55. data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +0 -158
  56. data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -11
  57. data/ext/html-to-markdown-rb/native/src/lib.rs +0 -128
  58. data/ext/html-to-markdown-rb/native/src/options.rs +0 -238
  59. data/ext/html-to-markdown-rb/native/src/types.rs +0 -24
  60. data/lib/html_to_markdown/cli.rb +0 -21
  61. data/lib/html_to_markdown/cli_proxy.rb +0 -74
  62. data/spec/cli_proxy_spec.rb +0 -42
  63. data/spec/spec_helper.rb +0 -10
data/ext/html-to-markdown-rb/native/src/options.rs DELETED
@@ -1,238 +0,0 @@
1
- //! Option parsing and building for Ruby bindings.
2
-
3
- use crate::types::{arg_error, symbol_to_string};
4
- use html_to_markdown_rs::{
5
- CodeBlockStyle, ConversionOptions, ConversionOptionsUpdate, HeadingStyle, HighlightStyle, ListIndentType,
6
- NewlineStyle, OutputFormat, PreprocessingOptionsUpdate, PreprocessingPreset, WhitespaceMode,
7
- };
8
- use magnus::prelude::*;
9
- use magnus::r_hash::ForEach;
10
- use magnus::{Error, RArray, RHash, Ruby, TryConvert, Value};
11
-
12
- pub fn parse_heading_style(value: Value) -> Result<HeadingStyle, Error> {
13
- match symbol_to_string(value)?.as_str() {
14
- "underlined" => Ok(HeadingStyle::Underlined),
15
- "atx" => Ok(HeadingStyle::Atx),
16
- "atx_closed" => Ok(HeadingStyle::AtxClosed),
17
- other => Err(arg_error(format!("invalid heading_style: {other}"))),
18
- }
19
- }
20
-
21
- pub fn parse_list_indent_type(value: Value) -> Result<ListIndentType, Error> {
22
- match symbol_to_string(value)?.as_str() {
23
- "spaces" => Ok(ListIndentType::Spaces),
24
- "tabs" => Ok(ListIndentType::Tabs),
25
- other => Err(arg_error(format!("invalid list_indent_type: {other}"))),
26
- }
27
- }
28
-
29
- pub fn parse_highlight_style(value: Value) -> Result<HighlightStyle, Error> {
30
- match symbol_to_string(value)?.as_str() {
31
- "double_equal" => Ok(HighlightStyle::DoubleEqual),
32
- "html" => Ok(HighlightStyle::Html),
33
- "bold" => Ok(HighlightStyle::Bold),
34
- "none" => Ok(HighlightStyle::None),
35
- other => Err(arg_error(format!("invalid highlight_style: {other}"))),
36
- }
37
- }
38
-
39
- pub fn parse_whitespace_mode(value: Value) -> Result<WhitespaceMode, Error> {
40
- match symbol_to_string(value)?.as_str() {
41
- "normalized" => Ok(WhitespaceMode::Normalized),
42
- "strict" => Ok(WhitespaceMode::Strict),
43
- other => Err(arg_error(format!("invalid whitespace_mode: {other}"))),
44
- }
45
- }
46
-
47
- pub fn parse_newline_style(value: Value) -> Result<NewlineStyle, Error> {
48
- match symbol_to_string(value)?.as_str() {
49
- "spaces" => Ok(NewlineStyle::Spaces),
50
- "backslash" => Ok(NewlineStyle::Backslash),
51
- other => Err(arg_error(format!("invalid newline_style: {other}"))),
52
- }
53
- }
54
-
55
- pub fn parse_code_block_style(value: Value) -> Result<CodeBlockStyle, Error> {
56
- match symbol_to_string(value)?.as_str() {
57
- "indented" => Ok(CodeBlockStyle::Indented),
58
- "backticks" => Ok(CodeBlockStyle::Backticks),
59
- "tildes" => Ok(CodeBlockStyle::Tildes),
60
- other => Err(arg_error(format!("invalid code_block_style: {other}"))),
61
- }
62
- }
63
-
64
- pub fn parse_output_format(value: Value) -> Result<OutputFormat, Error> {
65
- match symbol_to_string(value)?.as_str() {
66
- "markdown" => Ok(OutputFormat::Markdown),
67
- "djot" => Ok(OutputFormat::Djot),
68
- "plain" => Ok(OutputFormat::Plain),
69
- other => Err(arg_error(format!("invalid output_format: {other}"))),
70
- }
71
- }
72
-
73
- pub fn parse_preset(value: Value) -> Result<PreprocessingPreset, Error> {
74
- match symbol_to_string(value)?.as_str() {
75
- "minimal" => Ok(PreprocessingPreset::Minimal),
76
- "standard" => Ok(PreprocessingPreset::Standard),
77
- "aggressive" => Ok(PreprocessingPreset::Aggressive),
78
- other => Err(arg_error(format!("invalid preprocessing preset: {other}"))),
79
- }
80
- }
81
-
82
- pub fn parse_vec_of_strings(value: Value) -> Result<Vec<String>, Error> {
83
- let array = RArray::from_value(value).ok_or_else(|| arg_error("expected an Array of strings"))?;
84
- array.to_vec::<String>()
85
- }
86
-
87
- pub fn parse_preprocessing_options(_ruby: &Ruby, value: Value) -> Result<PreprocessingOptionsUpdate, Error> {
88
- let hash = RHash::from_value(value).ok_or_else(|| arg_error("expected preprocessing to be a Hash"))?;
89
-
90
- let mut update = PreprocessingOptionsUpdate::default();
91
-
92
- hash.foreach(|key: Value, val: Value| {
93
- let key_name = symbol_to_string(key)?;
94
- match key_name.as_str() {
95
- "enabled" => {
96
- update.enabled = Some(bool::try_convert(val)?);
97
- }
98
- "preset" => {
99
- update.preset = Some(parse_preset(val)?);
100
- }
101
- "remove_navigation" => {
102
- update.remove_navigation = Some(bool::try_convert(val)?);
103
- }
104
- "remove_forms" => {
105
- update.remove_forms = Some(bool::try_convert(val)?);
106
- }
107
- _ => {}
108
- }
109
- Ok(ForEach::Continue)
110
- })?;
111
-
112
- Ok(update)
113
- }
114
-
115
- pub fn build_conversion_options(ruby: &Ruby, options: Option<Value>) -> Result<ConversionOptions, Error> {
116
- let mut update = ConversionOptionsUpdate::default();
117
-
118
- let Some(options) = options else {
119
- return Ok(ConversionOptions::default());
120
- };
121
-
122
- if options.is_nil() {
123
- return Ok(ConversionOptions::default());
124
- }
125
-
126
- let hash = RHash::from_value(options).ok_or_else(|| arg_error("options must be provided as a Hash"))?;
127
-
128
- hash.foreach(|key: Value, val: Value| {
129
- let key_name = symbol_to_string(key)?;
130
- match key_name.as_str() {
131
- "heading_style" => {
132
- update.heading_style = Some(parse_heading_style(val)?);
133
- }
134
- "list_indent_type" => {
135
- update.list_indent_type = Some(parse_list_indent_type(val)?);
136
- }
137
- "list_indent_width" => {
138
- update.list_indent_width = Some(usize::try_convert(val)?);
139
- }
140
- "bullets" => {
141
- update.bullets = Some(String::try_convert(val)?);
142
- }
143
- "strong_em_symbol" => {
144
- let value = String::try_convert(val)?;
145
- let mut chars = value.chars();
146
- let ch = chars
147
- .next()
148
- .ok_or_else(|| arg_error("strong_em_symbol must not be empty"))?;
149
- if chars.next().is_some() {
150
- return Err(arg_error("strong_em_symbol must be a single character"));
151
- }
152
- update.strong_em_symbol = Some(ch);
153
- }
154
- "escape_asterisks" => {
155
- update.escape_asterisks = Some(bool::try_convert(val)?);
156
- }
157
- "escape_underscores" => {
158
- update.escape_underscores = Some(bool::try_convert(val)?);
159
- }
160
- "escape_misc" => {
161
- update.escape_misc = Some(bool::try_convert(val)?);
162
- }
163
- "escape_ascii" => {
164
- update.escape_ascii = Some(bool::try_convert(val)?);
165
- }
166
- "code_language" => {
167
- update.code_language = Some(String::try_convert(val)?);
168
- }
169
- "autolinks" => {
170
- update.autolinks = Some(bool::try_convert(val)?);
171
- }
172
- "default_title" => {
173
- update.default_title = Some(bool::try_convert(val)?);
174
- }
175
- "br_in_tables" => {
176
- update.br_in_tables = Some(bool::try_convert(val)?);
177
- }
178
- "highlight_style" => {
179
- update.highlight_style = Some(parse_highlight_style(val)?);
180
- }
181
- "extract_metadata" => {
182
- update.extract_metadata = Some(bool::try_convert(val)?);
183
- }
184
- "whitespace_mode" => {
185
- update.whitespace_mode = Some(parse_whitespace_mode(val)?);
186
- }
187
- "strip_newlines" => {
188
- update.strip_newlines = Some(bool::try_convert(val)?);
189
- }
190
- "wrap" => {
191
- update.wrap = Some(bool::try_convert(val)?);
192
- }
193
- "wrap_width" => {
194
- update.wrap_width = Some(usize::try_convert(val)?);
195
- }
196
- "convert_as_inline" => {
197
- update.convert_as_inline = Some(bool::try_convert(val)?);
198
- }
199
- "sub_symbol" => {
200
- update.sub_symbol = Some(String::try_convert(val)?);
201
- }
202
- "sup_symbol" => {
203
- update.sup_symbol = Some(String::try_convert(val)?);
204
- }
205
- "newline_style" => {
206
- update.newline_style = Some(parse_newline_style(val)?);
207
- }
208
- "code_block_style" => {
209
- update.code_block_style = Some(parse_code_block_style(val)?);
210
- }
211
- "keep_inline_images_in" => {
212
- update.keep_inline_images_in = Some(parse_vec_of_strings(val)?);
213
- }
214
- "preprocessing" => {
215
- update.preprocessing = Some(parse_preprocessing_options(ruby, val)?);
216
- }
217
- "encoding" => {
218
- update.encoding = Some(String::try_convert(val)?);
219
- }
220
- "debug" => {
221
- update.debug = Some(bool::try_convert(val)?);
222
- }
223
- "strip_tags" => {
224
- update.strip_tags = Some(parse_vec_of_strings(val)?);
225
- }
226
- "preserve_tags" => {
227
- update.preserve_tags = Some(parse_vec_of_strings(val)?);
228
- }
229
- "output_format" => {
230
- update.output_format = Some(parse_output_format(val)?);
231
- }
232
- _ => {}
233
- }
234
- Ok(ForEach::Continue)
235
- })?;
236
-
237
- Ok(ConversionOptions::from(update))
238
- }
data/ext/html-to-markdown-rb/native/src/types.rs DELETED
@@ -1,24 +0,0 @@
1
- //! Type helpers and error utilities for Ruby bindings.
2
-
3
- use magnus::{Error, Ruby, Symbol, TryConvert, Value};
4
-
5
- /// Create an ArgumentError.
6
- pub fn arg_error(message: impl Into<String>) -> Error {
7
- let ruby = Ruby::get().expect("Ruby not initialised");
8
- Error::new(ruby.exception_arg_error(), message.into())
9
- }
10
-
11
- /// Create a RuntimeError.
12
- pub fn runtime_error(message: impl Into<String>) -> Error {
13
- let ruby = Ruby::get().expect("Ruby not initialised");
14
- Error::new(ruby.exception_runtime_error(), message.into())
15
- }
16
-
17
- /// Convert a Ruby Symbol or String to a Rust String.
18
- pub fn symbol_to_string(value: Value) -> Result<String, Error> {
19
- if let Some(symbol) = Symbol::from_value(value) {
20
- Ok(symbol.name()?.to_string())
21
- } else {
22
- String::try_convert(value)
23
- }
24
- }
data/lib/html_to_markdown/cli.rb DELETED
@@ -1,21 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'html_to_markdown/cli_proxy'
4
-
5
- module HtmlToMarkdown
6
- module CLI
7
- module_function
8
-
9
- def run(argv = ARGV, stdout: $stdout, stderr: $stderr)
10
- output = CLIProxy.call(argv)
11
- stdout.print(output)
12
- 0
13
- rescue CLIProxy::CLIExecutionError => e
14
- stderr.print(e.stderr)
15
- e.status || 1
16
- rescue CLIProxy::MissingBinaryError, CLIProxy::Error => e
17
- stderr.puts(e.message)
18
- 1
19
- end
20
- end
21
- end
data/lib/html_to_markdown/cli_proxy.rb DELETED
@@ -1,74 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'open3'
4
- require 'pathname'
5
-
6
- module HtmlToMarkdown
7
- module CLIProxy
8
- class Error < StandardError
9
- end
10
-
11
- class MissingBinaryError < Error
12
- end
13
-
14
- class CLIExecutionError < Error
15
- attr_reader :stderr, :status
16
-
17
- def initialize(message, stderr:, status:)
18
- super(message)
19
- @stderr = stderr
20
- @status = status
21
- end
22
- end
23
-
24
- module_function
25
-
26
- def call(argv)
27
- binary = find_cli_binary
28
- args = Array(argv).map(&:to_s)
29
- stdout, stderr, status = Open3.capture3(binary.to_s, *args)
30
- return stdout if status.success?
31
-
32
- raise CLIExecutionError.new(
33
- "html-to-markdown CLI exited with status #{status.exitstatus}",
34
- stderr: stderr,
35
- status: status.exitstatus
36
- )
37
- end
38
-
39
- def find_cli_binary
40
- binary_name = Gem.win_platform? ? 'html-to-markdown.exe' : 'html-to-markdown'
41
- found = search_paths(binary_name).find(&:file?)
42
- return found if found
43
-
44
- raise MissingBinaryError, missing_binary_message
45
- end
46
-
47
- def root_path
48
- @root_path ||= Pathname(__dir__.to_s).join('../..').expand_path
49
- end
50
-
51
- def lib_path
52
- @lib_path ||= Pathname(__dir__.to_s).join('..').expand_path
53
- end
54
-
55
- def search_paths(binary_name)
56
- paths = [
57
- root_path.join('target', 'release', binary_name),
58
- lib_path.join('bin', binary_name),
59
- lib_path.join(binary_name)
60
- ]
61
-
62
- workspace_root = root_path.parent&.parent
63
- paths << workspace_root.join('target', 'release', binary_name) if workspace_root
64
- paths
65
- end
66
-
67
- def missing_binary_message
68
- <<~MSG.strip
69
- html-to-markdown CLI binary not found. Build it with
70
- `cargo build --release --package html-to-markdown-cli`.
71
- MSG
72
- end
73
- end
74
- end
data/spec/cli_proxy_spec.rb DELETED
@@ -1,42 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'spec_helper'
4
- require 'html_to_markdown/cli_proxy'
5
- require 'html_to_markdown/cli'
6
- require 'stringio'
7
-
8
- RSpec.describe HtmlToMarkdown::CLIProxy do
9
- describe '.call' do
10
- it 'executes the CLI binary' do
11
- begin
12
- binary = described_class.find_cli_binary
13
- rescue HtmlToMarkdown::CLIProxy::MissingBinaryError
14
- skip 'CLI binary not built'
15
- end
16
-
17
- expect(binary).to be_file
18
-
19
- output = described_class.call(['--version'])
20
- expect(output).to include(HtmlToMarkdown::VERSION)
21
- end
22
- end
23
-
24
- describe HtmlToMarkdown::CLI do
25
- it 'writes CLI output to stdout' do
26
- begin
27
- HtmlToMarkdown::CLIProxy.find_cli_binary
28
- rescue HtmlToMarkdown::CLIProxy::MissingBinaryError
29
- skip 'CLI binary not built'
30
- end
31
-
32
- stdout = StringIO.new
33
- stderr = StringIO.new
34
-
35
- exit_code = described_class.run(['--version'], stdout: stdout, stderr: stderr)
36
-
37
- expect(exit_code).to eq(0)
38
- expect(stdout.string).to include(HtmlToMarkdown::VERSION)
39
- expect(stderr.string).to be_empty
40
- end
41
- end
42
- end
data/spec/spec_helper.rb DELETED
@@ -1,10 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'bundler/setup'
4
- require 'html_to_markdown'
5
-
6
- RSpec.configure do |config|
7
- config.expect_with :rspec do |c|
8
- c.syntax = :expect
9
- end
10
- end