html-to-markdown 2.9.2 → 2.11.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.bundle/config +2 -2
- data/.rubocop.yml +29 -29
- data/Gemfile +17 -17
- data/Gemfile.lock +3 -1
- data/README.md +243 -243
- data/Rakefile +24 -24
- data/Steepfile +26 -26
- data/bin/benchmark.rb +94 -94
- data/exe/html-to-markdown +6 -6
- data/ext/html-to-markdown-rb/extconf.rb +38 -38
- data/ext/html-to-markdown-rb/native/Cargo.toml +28 -28
- data/ext/html-to-markdown-rb/native/README.md +209 -209
- data/ext/html-to-markdown-rb/native/extconf.rb +3 -3
- data/ext/html-to-markdown-rb/native/src/lib.rs +438 -432
- data/html-to-markdown-rb.gemspec +59 -59
- data/lib/html_to_markdown/cli.rb +21 -21
- data/lib/html_to_markdown/cli_proxy.rb +71 -71
- data/lib/html_to_markdown/version.rb +5 -5
- data/lib/html_to_markdown.rb +36 -36
- data/sig/html_to_markdown/cli.rbs +24 -24
- data/sig/html_to_markdown/cli_proxy.rbs +48 -48
- data/sig/html_to_markdown.rbs +139 -139
- data/sig/open3.rbs +12 -12
- data/spec/cli_proxy_spec.rb +42 -42
- data/spec/convert_spec.rb +77 -38
- data/spec/spec_helper.rb +10 -10
- metadata +2 -2
data/sig/html_to_markdown.rbs
CHANGED
|
@@ -1,139 +1,139 @@
|
|
|
1
|
-
# Type definitions for HtmlToMarkdown Ruby gem
|
|
2
|
-
module HtmlToMarkdown
|
|
3
|
-
VERSION: String
|
|
4
|
-
|
|
5
|
-
# Opaque handle for reusable conversion options
|
|
6
|
-
class Options
|
|
7
|
-
end
|
|
8
|
-
|
|
9
|
-
type heading_style = :underlined | :atx | :atx_closed
|
|
10
|
-
type list_indent_type = :spaces | :tabs
|
|
11
|
-
type highlight_style = :double_equal | :html | :bold | :none
|
|
12
|
-
type whitespace_mode = :normalized | :strict
|
|
13
|
-
type newline_style = :spaces | :backslash
|
|
14
|
-
type code_block_style = :indented | :backticks | :tildes
|
|
15
|
-
type preprocessing_preset = :minimal | :standard | :aggressive
|
|
16
|
-
|
|
17
|
-
type preprocessing_options = {
|
|
18
|
-
enabled: bool,
|
|
19
|
-
preset: preprocessing_preset,
|
|
20
|
-
remove_navigation: bool,
|
|
21
|
-
remove_forms: bool
|
|
22
|
-
}
|
|
23
|
-
|
|
24
|
-
type conversion_options = {
|
|
25
|
-
heading_style: heading_style,
|
|
26
|
-
list_indent_type: list_indent_type,
|
|
27
|
-
list_indent_width: Integer,
|
|
28
|
-
bullets: String,
|
|
29
|
-
strong_em_symbol: String,
|
|
30
|
-
escape_asterisks: bool,
|
|
31
|
-
escape_underscores: bool,
|
|
32
|
-
escape_misc: bool,
|
|
33
|
-
escape_ascii: bool,
|
|
34
|
-
code_language: String,
|
|
35
|
-
autolinks: bool,
|
|
36
|
-
default_title: bool,
|
|
37
|
-
br_in_tables: bool,
|
|
38
|
-
hocr_spatial_tables: bool,
|
|
39
|
-
highlight_style: highlight_style,
|
|
40
|
-
extract_metadata: bool,
|
|
41
|
-
whitespace_mode: whitespace_mode,
|
|
42
|
-
strip_newlines: bool,
|
|
43
|
-
wrap: bool,
|
|
44
|
-
wrap_width: Integer,
|
|
45
|
-
convert_as_inline: bool,
|
|
46
|
-
sub_symbol: String,
|
|
47
|
-
sup_symbol: String,
|
|
48
|
-
newline_style: newline_style,
|
|
49
|
-
code_block_style: code_block_style,
|
|
50
|
-
keep_inline_images_in: Array[String],
|
|
51
|
-
preprocessing: preprocessing_options,
|
|
52
|
-
encoding: String,
|
|
53
|
-
debug: bool,
|
|
54
|
-
strip_tags: Array[String],
|
|
55
|
-
preserve_tags: Array[String]
|
|
56
|
-
}
|
|
57
|
-
|
|
58
|
-
type inline_image_config = {
|
|
59
|
-
max_decoded_size_bytes: Integer,
|
|
60
|
-
filename_prefix: String?,
|
|
61
|
-
capture_svg: bool,
|
|
62
|
-
infer_dimensions: bool
|
|
63
|
-
}
|
|
64
|
-
|
|
65
|
-
type inline_image_format = "png" | "jpeg" | "gif" | "bmp" | "webp" | "svg" | String
|
|
66
|
-
|
|
67
|
-
type inline_image_source = "img_data_uri" | "svg_element"
|
|
68
|
-
|
|
69
|
-
type inline_image = {
|
|
70
|
-
data: String,
|
|
71
|
-
format: inline_image_format,
|
|
72
|
-
filename: String?,
|
|
73
|
-
description: String?,
|
|
74
|
-
dimensions: [Integer, Integer]?,
|
|
75
|
-
source: inline_image_source,
|
|
76
|
-
attributes: Hash[String, String]
|
|
77
|
-
}
|
|
78
|
-
|
|
79
|
-
type inline_image_warning = {
|
|
80
|
-
index: Integer,
|
|
81
|
-
message: String
|
|
82
|
-
}
|
|
83
|
-
|
|
84
|
-
type html_extraction = {
|
|
85
|
-
markdown: String,
|
|
86
|
-
inline_images: Array[inline_image],
|
|
87
|
-
warnings: Array[inline_image_warning]
|
|
88
|
-
}
|
|
89
|
-
|
|
90
|
-
# Native methods (implemented in Rust via Magnus/rb-sys)
|
|
91
|
-
# These are aliased from the Rust extension and available as both module and instance methods
|
|
92
|
-
private
|
|
93
|
-
|
|
94
|
-
def self.native_convert: (String html, conversion_options? options) -> String
|
|
95
|
-
def self.native_options: (conversion_options? options_hash) -> Options
|
|
96
|
-
def self.native_convert_with_options: (String html, Options options_handle) -> String
|
|
97
|
-
def self.native_convert_with_inline_images: (
|
|
98
|
-
String html,
|
|
99
|
-
conversion_options? options,
|
|
100
|
-
inline_image_config? image_config
|
|
101
|
-
) -> html_extraction
|
|
102
|
-
|
|
103
|
-
def native_convert: (String html, conversion_options? options) -> String
|
|
104
|
-
def native_options: (conversion_options? options_hash) -> Options
|
|
105
|
-
def native_convert_with_options: (String html, Options options_handle) -> String
|
|
106
|
-
def native_convert_with_inline_images: (
|
|
107
|
-
String html,
|
|
108
|
-
conversion_options? options,
|
|
109
|
-
inline_image_config? image_config
|
|
110
|
-
) -> html_extraction
|
|
111
|
-
|
|
112
|
-
public
|
|
113
|
-
|
|
114
|
-
# Convert HTML to Markdown with optional configuration
|
|
115
|
-
def self.convert: (String html, ?conversion_options? options) -> String
|
|
116
|
-
|
|
117
|
-
# Create a reusable options handle for performance
|
|
118
|
-
def self.options: (?conversion_options? options_hash) -> Options
|
|
119
|
-
|
|
120
|
-
# Convert HTML using a pre-built options handle
|
|
121
|
-
def self.convert_with_options: (String html, Options options_handle) -> String
|
|
122
|
-
|
|
123
|
-
# Convert HTML with inline image extraction
|
|
124
|
-
def self.convert_with_inline_images: (
|
|
125
|
-
String html,
|
|
126
|
-
?conversion_options? options,
|
|
127
|
-
?inline_image_config? image_config
|
|
128
|
-
) -> html_extraction
|
|
129
|
-
|
|
130
|
-
# Instance method versions (created by module_function)
|
|
131
|
-
def convert: (String html, ?conversion_options? options) -> String
|
|
132
|
-
def options: (?conversion_options? options_hash) -> Options
|
|
133
|
-
def convert_with_options: (String html, Options options_handle) -> String
|
|
134
|
-
def convert_with_inline_images: (
|
|
135
|
-
String html,
|
|
136
|
-
?conversion_options? options,
|
|
137
|
-
?inline_image_config? image_config
|
|
138
|
-
) -> html_extraction
|
|
139
|
-
end
|
|
1
|
+
# Type definitions for HtmlToMarkdown Ruby gem
|
|
2
|
+
module HtmlToMarkdown
|
|
3
|
+
VERSION: String
|
|
4
|
+
|
|
5
|
+
# Opaque handle for reusable conversion options
|
|
6
|
+
class Options
|
|
7
|
+
end
|
|
8
|
+
|
|
9
|
+
type heading_style = :underlined | :atx | :atx_closed
|
|
10
|
+
type list_indent_type = :spaces | :tabs
|
|
11
|
+
type highlight_style = :double_equal | :html | :bold | :none
|
|
12
|
+
type whitespace_mode = :normalized | :strict
|
|
13
|
+
type newline_style = :spaces | :backslash
|
|
14
|
+
type code_block_style = :indented | :backticks | :tildes
|
|
15
|
+
type preprocessing_preset = :minimal | :standard | :aggressive
|
|
16
|
+
|
|
17
|
+
type preprocessing_options = {
|
|
18
|
+
enabled: bool,
|
|
19
|
+
preset: preprocessing_preset,
|
|
20
|
+
remove_navigation: bool,
|
|
21
|
+
remove_forms: bool
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
type conversion_options = {
|
|
25
|
+
heading_style: heading_style,
|
|
26
|
+
list_indent_type: list_indent_type,
|
|
27
|
+
list_indent_width: Integer,
|
|
28
|
+
bullets: String,
|
|
29
|
+
strong_em_symbol: String,
|
|
30
|
+
escape_asterisks: bool,
|
|
31
|
+
escape_underscores: bool,
|
|
32
|
+
escape_misc: bool,
|
|
33
|
+
escape_ascii: bool,
|
|
34
|
+
code_language: String,
|
|
35
|
+
autolinks: bool,
|
|
36
|
+
default_title: bool,
|
|
37
|
+
br_in_tables: bool,
|
|
38
|
+
hocr_spatial_tables: bool,
|
|
39
|
+
highlight_style: highlight_style,
|
|
40
|
+
extract_metadata: bool,
|
|
41
|
+
whitespace_mode: whitespace_mode,
|
|
42
|
+
strip_newlines: bool,
|
|
43
|
+
wrap: bool,
|
|
44
|
+
wrap_width: Integer,
|
|
45
|
+
convert_as_inline: bool,
|
|
46
|
+
sub_symbol: String,
|
|
47
|
+
sup_symbol: String,
|
|
48
|
+
newline_style: newline_style,
|
|
49
|
+
code_block_style: code_block_style,
|
|
50
|
+
keep_inline_images_in: Array[String],
|
|
51
|
+
preprocessing: preprocessing_options,
|
|
52
|
+
encoding: String,
|
|
53
|
+
debug: bool,
|
|
54
|
+
strip_tags: Array[String],
|
|
55
|
+
preserve_tags: Array[String]
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
type inline_image_config = {
|
|
59
|
+
max_decoded_size_bytes: Integer,
|
|
60
|
+
filename_prefix: String?,
|
|
61
|
+
capture_svg: bool,
|
|
62
|
+
infer_dimensions: bool
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
type inline_image_format = "png" | "jpeg" | "gif" | "bmp" | "webp" | "svg" | String
|
|
66
|
+
|
|
67
|
+
type inline_image_source = "img_data_uri" | "svg_element"
|
|
68
|
+
|
|
69
|
+
type inline_image = {
|
|
70
|
+
data: String,
|
|
71
|
+
format: inline_image_format,
|
|
72
|
+
filename: String?,
|
|
73
|
+
description: String?,
|
|
74
|
+
dimensions: [Integer, Integer]?,
|
|
75
|
+
source: inline_image_source,
|
|
76
|
+
attributes: Hash[String, String]
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
type inline_image_warning = {
|
|
80
|
+
index: Integer,
|
|
81
|
+
message: String
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
type html_extraction = {
|
|
85
|
+
markdown: String,
|
|
86
|
+
inline_images: Array[inline_image],
|
|
87
|
+
warnings: Array[inline_image_warning]
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
# Native methods (implemented in Rust via Magnus/rb-sys)
|
|
91
|
+
# These are aliased from the Rust extension and available as both module and instance methods
|
|
92
|
+
private
|
|
93
|
+
|
|
94
|
+
def self.native_convert: (String html, conversion_options? options) -> String
|
|
95
|
+
def self.native_options: (conversion_options? options_hash) -> Options
|
|
96
|
+
def self.native_convert_with_options: (String html, Options options_handle) -> String
|
|
97
|
+
def self.native_convert_with_inline_images: (
|
|
98
|
+
String html,
|
|
99
|
+
conversion_options? options,
|
|
100
|
+
inline_image_config? image_config
|
|
101
|
+
) -> html_extraction
|
|
102
|
+
|
|
103
|
+
def native_convert: (String html, conversion_options? options) -> String
|
|
104
|
+
def native_options: (conversion_options? options_hash) -> Options
|
|
105
|
+
def native_convert_with_options: (String html, Options options_handle) -> String
|
|
106
|
+
def native_convert_with_inline_images: (
|
|
107
|
+
String html,
|
|
108
|
+
conversion_options? options,
|
|
109
|
+
inline_image_config? image_config
|
|
110
|
+
) -> html_extraction
|
|
111
|
+
|
|
112
|
+
public
|
|
113
|
+
|
|
114
|
+
# Convert HTML to Markdown with optional configuration
|
|
115
|
+
def self.convert: (String html, ?conversion_options? options) -> String
|
|
116
|
+
|
|
117
|
+
# Create a reusable options handle for performance
|
|
118
|
+
def self.options: (?conversion_options? options_hash) -> Options
|
|
119
|
+
|
|
120
|
+
# Convert HTML using a pre-built options handle
|
|
121
|
+
def self.convert_with_options: (String html, Options options_handle) -> String
|
|
122
|
+
|
|
123
|
+
# Convert HTML with inline image extraction
|
|
124
|
+
def self.convert_with_inline_images: (
|
|
125
|
+
String html,
|
|
126
|
+
?conversion_options? options,
|
|
127
|
+
?inline_image_config? image_config
|
|
128
|
+
) -> html_extraction
|
|
129
|
+
|
|
130
|
+
# Instance method versions (created by module_function)
|
|
131
|
+
def convert: (String html, ?conversion_options? options) -> String
|
|
132
|
+
def options: (?conversion_options? options_hash) -> Options
|
|
133
|
+
def convert_with_options: (String html, Options options_handle) -> String
|
|
134
|
+
def convert_with_inline_images: (
|
|
135
|
+
String html,
|
|
136
|
+
?conversion_options? options,
|
|
137
|
+
?inline_image_config? image_config
|
|
138
|
+
) -> html_extraction
|
|
139
|
+
end
|
data/sig/open3.rbs
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
|
-
# Type signature for Open3 standard library
|
|
2
|
-
module Open3
|
|
3
|
-
# Execute command and capture stdout, stderr, and status
|
|
4
|
-
#
|
|
5
|
-
# @param cmd Command to execute
|
|
6
|
-
# @param args Command arguments
|
|
7
|
-
# @return Array containing stdout (String), stderr (String), and status (Process::Status)
|
|
8
|
-
def self.capture3: (
|
|
9
|
-
String cmd,
|
|
10
|
-
*String args
|
|
11
|
-
) -> [String, String, Process::Status]
|
|
12
|
-
end
|
|
1
|
+
# Type signature for Open3 standard library
|
|
2
|
+
module Open3
|
|
3
|
+
# Execute command and capture stdout, stderr, and status
|
|
4
|
+
#
|
|
5
|
+
# @param cmd Command to execute
|
|
6
|
+
# @param args Command arguments
|
|
7
|
+
# @return Array containing stdout (String), stderr (String), and status (Process::Status)
|
|
8
|
+
def self.capture3: (
|
|
9
|
+
String cmd,
|
|
10
|
+
*String args
|
|
11
|
+
) -> [String, String, Process::Status]
|
|
12
|
+
end
|
data/spec/cli_proxy_spec.rb
CHANGED
|
@@ -1,42 +1,42 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'spec_helper'
|
|
4
|
-
require 'html_to_markdown/cli_proxy'
|
|
5
|
-
require 'html_to_markdown/cli'
|
|
6
|
-
require 'stringio'
|
|
7
|
-
|
|
8
|
-
RSpec.describe HtmlToMarkdown::CLIProxy do
|
|
9
|
-
describe '.call' do
|
|
10
|
-
it 'executes the CLI binary' do
|
|
11
|
-
begin
|
|
12
|
-
binary = described_class.find_cli_binary
|
|
13
|
-
rescue HtmlToMarkdown::CLIProxy::MissingBinaryError
|
|
14
|
-
skip 'CLI binary not built'
|
|
15
|
-
end
|
|
16
|
-
|
|
17
|
-
expect(binary).to be_file
|
|
18
|
-
|
|
19
|
-
output = described_class.call(['--version'])
|
|
20
|
-
expect(output).to include(HtmlToMarkdown::VERSION)
|
|
21
|
-
end
|
|
22
|
-
end
|
|
23
|
-
|
|
24
|
-
describe HtmlToMarkdown::CLI do
|
|
25
|
-
it 'writes CLI output to stdout' do
|
|
26
|
-
begin
|
|
27
|
-
HtmlToMarkdown::CLIProxy.find_cli_binary
|
|
28
|
-
rescue HtmlToMarkdown::CLIProxy::MissingBinaryError
|
|
29
|
-
skip 'CLI binary not built'
|
|
30
|
-
end
|
|
31
|
-
|
|
32
|
-
stdout = StringIO.new
|
|
33
|
-
stderr = StringIO.new
|
|
34
|
-
|
|
35
|
-
exit_code = described_class.run(['--version'], stdout: stdout, stderr: stderr)
|
|
36
|
-
|
|
37
|
-
expect(exit_code).to eq(0)
|
|
38
|
-
expect(stdout.string).to include(HtmlToMarkdown::VERSION)
|
|
39
|
-
expect(stderr.string).to be_empty
|
|
40
|
-
end
|
|
41
|
-
end
|
|
42
|
-
end
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'spec_helper'
|
|
4
|
+
require 'html_to_markdown/cli_proxy'
|
|
5
|
+
require 'html_to_markdown/cli'
|
|
6
|
+
require 'stringio'
|
|
7
|
+
|
|
8
|
+
RSpec.describe HtmlToMarkdown::CLIProxy do
|
|
9
|
+
describe '.call' do
|
|
10
|
+
it 'executes the CLI binary' do
|
|
11
|
+
begin
|
|
12
|
+
binary = described_class.find_cli_binary
|
|
13
|
+
rescue HtmlToMarkdown::CLIProxy::MissingBinaryError
|
|
14
|
+
skip 'CLI binary not built'
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
expect(binary).to be_file
|
|
18
|
+
|
|
19
|
+
output = described_class.call(['--version'])
|
|
20
|
+
expect(output).to include(HtmlToMarkdown::VERSION)
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
describe HtmlToMarkdown::CLI do
|
|
25
|
+
it 'writes CLI output to stdout' do
|
|
26
|
+
begin
|
|
27
|
+
HtmlToMarkdown::CLIProxy.find_cli_binary
|
|
28
|
+
rescue HtmlToMarkdown::CLIProxy::MissingBinaryError
|
|
29
|
+
skip 'CLI binary not built'
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
stdout = StringIO.new
|
|
33
|
+
stderr = StringIO.new
|
|
34
|
+
|
|
35
|
+
exit_code = described_class.run(['--version'], stdout: stdout, stderr: stderr)
|
|
36
|
+
|
|
37
|
+
expect(exit_code).to eq(0)
|
|
38
|
+
expect(stdout.string).to include(HtmlToMarkdown::VERSION)
|
|
39
|
+
expect(stderr.string).to be_empty
|
|
40
|
+
end
|
|
41
|
+
end
|
|
42
|
+
end
|
data/spec/convert_spec.rb
CHANGED
|
@@ -1,38 +1,77 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'spec_helper'
|
|
4
|
-
|
|
5
|
-
RSpec.describe HtmlToMarkdown do
|
|
6
|
-
describe '.convert' do
|
|
7
|
-
it 'converts simple headings' do
|
|
8
|
-
expect(described_class.convert('<h1>Hello</h1>')).to eq("# Hello\n")
|
|
9
|
-
end
|
|
10
|
-
|
|
11
|
-
it 'accepts options hash' do
|
|
12
|
-
result = described_class.convert(
|
|
13
|
-
'<h1>Hello</h1>',
|
|
14
|
-
heading_style: :atx_closed,
|
|
15
|
-
default_title: true
|
|
16
|
-
)
|
|
17
|
-
expect(result).to include('Hello')
|
|
18
|
-
end
|
|
19
|
-
end
|
|
20
|
-
|
|
21
|
-
describe '.convert_with_inline_images' do
|
|
22
|
-
it 'returns inline images metadata' do
|
|
23
|
-
html = '<p><img src="" alt="fake"></p>'
|
|
24
|
-
extraction = described_class.convert_with_inline_images(html)
|
|
25
|
-
expect(extraction).to include(:markdown, :inline_images, :warnings)
|
|
26
|
-
expect(extraction[:inline_images].first[:description]).to eq('fake')
|
|
27
|
-
end
|
|
28
|
-
end
|
|
29
|
-
|
|
30
|
-
describe '.options' do
|
|
31
|
-
it 'returns a reusable options handle' do
|
|
32
|
-
handle = described_class.options(heading_style: :atx_closed)
|
|
33
|
-
expect(handle).to be_a(HtmlToMarkdown::Options)
|
|
34
|
-
result = described_class.convert_with_options('<h1>Hello</h1>', handle)
|
|
35
|
-
expect(result).to include('# Hello #')
|
|
36
|
-
end
|
|
37
|
-
end
|
|
38
|
-
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'spec_helper'
|
|
4
|
+
|
|
5
|
+
RSpec.describe HtmlToMarkdown do
|
|
6
|
+
describe '.convert' do
|
|
7
|
+
it 'converts simple headings' do
|
|
8
|
+
expect(described_class.convert('<h1>Hello</h1>')).to eq("# Hello\n")
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
it 'accepts options hash' do
|
|
12
|
+
result = described_class.convert(
|
|
13
|
+
'<h1>Hello</h1>',
|
|
14
|
+
heading_style: :atx_closed,
|
|
15
|
+
default_title: true
|
|
16
|
+
)
|
|
17
|
+
expect(result).to include('Hello')
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
describe '.convert_with_inline_images' do
|
|
22
|
+
it 'returns inline images metadata' do
|
|
23
|
+
html = '<p><img src="" alt="fake"></p>'
|
|
24
|
+
extraction = described_class.convert_with_inline_images(html)
|
|
25
|
+
expect(extraction).to include(:markdown, :inline_images, :warnings)
|
|
26
|
+
expect(extraction[:inline_images].first[:description]).to eq('fake')
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
describe '.options' do
|
|
31
|
+
it 'returns a reusable options handle' do
|
|
32
|
+
handle = described_class.options(heading_style: :atx_closed)
|
|
33
|
+
expect(handle).to be_a(HtmlToMarkdown::Options)
|
|
34
|
+
result = described_class.convert_with_options('<h1>Hello</h1>', handle)
|
|
35
|
+
expect(result).to include('# Hello #')
|
|
36
|
+
end
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
describe 'panic handling' do
|
|
40
|
+
context 'when a Rust panic would occur' do
|
|
41
|
+
it 'catches panics in convert method' do
|
|
42
|
+
malformed_html = "#{'<' * 100_000}div#{'>' * 100_000}"
|
|
43
|
+
|
|
44
|
+
begin
|
|
45
|
+
result = described_class.convert(malformed_html)
|
|
46
|
+
expect(result).to be_a(String)
|
|
47
|
+
rescue RuntimeError => e
|
|
48
|
+
expect(e.message).to match(/html-to-markdown panic during conversion/)
|
|
49
|
+
end
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
it 'catches panics in convert_with_options method' do
|
|
53
|
+
malformed_html = "#{'<' * 100_000}div#{'>' * 100_000}"
|
|
54
|
+
handle = described_class.options(heading_style: :atx)
|
|
55
|
+
|
|
56
|
+
begin
|
|
57
|
+
result = described_class.convert_with_options(malformed_html, handle)
|
|
58
|
+
expect(result).to be_a(String)
|
|
59
|
+
rescue RuntimeError => e
|
|
60
|
+
expect(e.message).to match(/html-to-markdown panic during conversion/)
|
|
61
|
+
end
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
it 'catches panics in convert_with_inline_images method' do
|
|
65
|
+
malformed_html = "#{'<' * 100_000}div#{'>' * 100_000}"
|
|
66
|
+
|
|
67
|
+
begin
|
|
68
|
+
result = described_class.convert_with_inline_images(malformed_html)
|
|
69
|
+
expect(result).to be_a(Hash)
|
|
70
|
+
expect(result).to include(:markdown, :inline_images, :warnings)
|
|
71
|
+
rescue RuntimeError => e
|
|
72
|
+
expect(e.message).to match(/html-to-markdown panic during conversion/)
|
|
73
|
+
end
|
|
74
|
+
end
|
|
75
|
+
end
|
|
76
|
+
end
|
|
77
|
+
end
|
data/spec/spec_helper.rb
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'bundler/setup'
|
|
4
|
-
require 'html_to_markdown'
|
|
5
|
-
|
|
6
|
-
RSpec.configure do |config|
|
|
7
|
-
config.expect_with :rspec do |c|
|
|
8
|
-
c.syntax = :expect
|
|
9
|
-
end
|
|
10
|
-
end
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'bundler/setup'
|
|
4
|
+
require 'html_to_markdown'
|
|
5
|
+
|
|
6
|
+
RSpec.configure do |config|
|
|
7
|
+
config.expect_with :rspec do |c|
|
|
8
|
+
c.syntax = :expect
|
|
9
|
+
end
|
|
10
|
+
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html-to-markdown
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.9.2
|
|
4
|
+
version: 2.11.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2025-
|
|
11
|
+
date: 2025-12-05 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|