html-to-markdown 2.9.2 → 2.11.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,139 +1,139 @@
1
- # Type definitions for HtmlToMarkdown Ruby gem
2
- module HtmlToMarkdown
3
- VERSION: String
4
-
5
- # Opaque handle for reusable conversion options
6
- class Options
7
- end
8
-
9
- type heading_style = :underlined | :atx | :atx_closed
10
- type list_indent_type = :spaces | :tabs
11
- type highlight_style = :double_equal | :html | :bold | :none
12
- type whitespace_mode = :normalized | :strict
13
- type newline_style = :spaces | :backslash
14
- type code_block_style = :indented | :backticks | :tildes
15
- type preprocessing_preset = :minimal | :standard | :aggressive
16
-
17
- type preprocessing_options = {
18
- enabled: bool,
19
- preset: preprocessing_preset,
20
- remove_navigation: bool,
21
- remove_forms: bool
22
- }
23
-
24
- type conversion_options = {
25
- heading_style: heading_style,
26
- list_indent_type: list_indent_type,
27
- list_indent_width: Integer,
28
- bullets: String,
29
- strong_em_symbol: String,
30
- escape_asterisks: bool,
31
- escape_underscores: bool,
32
- escape_misc: bool,
33
- escape_ascii: bool,
34
- code_language: String,
35
- autolinks: bool,
36
- default_title: bool,
37
- br_in_tables: bool,
38
- hocr_spatial_tables: bool,
39
- highlight_style: highlight_style,
40
- extract_metadata: bool,
41
- whitespace_mode: whitespace_mode,
42
- strip_newlines: bool,
43
- wrap: bool,
44
- wrap_width: Integer,
45
- convert_as_inline: bool,
46
- sub_symbol: String,
47
- sup_symbol: String,
48
- newline_style: newline_style,
49
- code_block_style: code_block_style,
50
- keep_inline_images_in: Array[String],
51
- preprocessing: preprocessing_options,
52
- encoding: String,
53
- debug: bool,
54
- strip_tags: Array[String],
55
- preserve_tags: Array[String]
56
- }
57
-
58
- type inline_image_config = {
59
- max_decoded_size_bytes: Integer,
60
- filename_prefix: String?,
61
- capture_svg: bool,
62
- infer_dimensions: bool
63
- }
64
-
65
- type inline_image_format = "png" | "jpeg" | "gif" | "bmp" | "webp" | "svg" | String
66
-
67
- type inline_image_source = "img_data_uri" | "svg_element"
68
-
69
- type inline_image = {
70
- data: String,
71
- format: inline_image_format,
72
- filename: String?,
73
- description: String?,
74
- dimensions: [Integer, Integer]?,
75
- source: inline_image_source,
76
- attributes: Hash[String, String]
77
- }
78
-
79
- type inline_image_warning = {
80
- index: Integer,
81
- message: String
82
- }
83
-
84
- type html_extraction = {
85
- markdown: String,
86
- inline_images: Array[inline_image],
87
- warnings: Array[inline_image_warning]
88
- }
89
-
90
- # Native methods (implemented in Rust via Magnus/rb-sys)
91
- # These are aliased from the Rust extension and available as both module and instance methods
92
- private
93
-
94
- def self.native_convert: (String html, conversion_options? options) -> String
95
- def self.native_options: (conversion_options? options_hash) -> Options
96
- def self.native_convert_with_options: (String html, Options options_handle) -> String
97
- def self.native_convert_with_inline_images: (
98
- String html,
99
- conversion_options? options,
100
- inline_image_config? image_config
101
- ) -> html_extraction
102
-
103
- def native_convert: (String html, conversion_options? options) -> String
104
- def native_options: (conversion_options? options_hash) -> Options
105
- def native_convert_with_options: (String html, Options options_handle) -> String
106
- def native_convert_with_inline_images: (
107
- String html,
108
- conversion_options? options,
109
- inline_image_config? image_config
110
- ) -> html_extraction
111
-
112
- public
113
-
114
- # Convert HTML to Markdown with optional configuration
115
- def self.convert: (String html, ?conversion_options? options) -> String
116
-
117
- # Create a reusable options handle for performance
118
- def self.options: (?conversion_options? options_hash) -> Options
119
-
120
- # Convert HTML using a pre-built options handle
121
- def self.convert_with_options: (String html, Options options_handle) -> String
122
-
123
- # Convert HTML with inline image extraction
124
- def self.convert_with_inline_images: (
125
- String html,
126
- ?conversion_options? options,
127
- ?inline_image_config? image_config
128
- ) -> html_extraction
129
-
130
- # Instance method versions (created by module_function)
131
- def convert: (String html, ?conversion_options? options) -> String
132
- def options: (?conversion_options? options_hash) -> Options
133
- def convert_with_options: (String html, Options options_handle) -> String
134
- def convert_with_inline_images: (
135
- String html,
136
- ?conversion_options? options,
137
- ?inline_image_config? image_config
138
- ) -> html_extraction
139
- end
1
+ # Type definitions for HtmlToMarkdown Ruby gem
2
+ module HtmlToMarkdown
3
+ VERSION: String
4
+
5
+ # Opaque handle for reusable conversion options
6
+ class Options
7
+ end
8
+
9
+ type heading_style = :underlined | :atx | :atx_closed
10
+ type list_indent_type = :spaces | :tabs
11
+ type highlight_style = :double_equal | :html | :bold | :none
12
+ type whitespace_mode = :normalized | :strict
13
+ type newline_style = :spaces | :backslash
14
+ type code_block_style = :indented | :backticks | :tildes
15
+ type preprocessing_preset = :minimal | :standard | :aggressive
16
+
17
+ type preprocessing_options = {
18
+ enabled: bool,
19
+ preset: preprocessing_preset,
20
+ remove_navigation: bool,
21
+ remove_forms: bool
22
+ }
23
+
24
+ type conversion_options = {
25
+ heading_style: heading_style,
26
+ list_indent_type: list_indent_type,
27
+ list_indent_width: Integer,
28
+ bullets: String,
29
+ strong_em_symbol: String,
30
+ escape_asterisks: bool,
31
+ escape_underscores: bool,
32
+ escape_misc: bool,
33
+ escape_ascii: bool,
34
+ code_language: String,
35
+ autolinks: bool,
36
+ default_title: bool,
37
+ br_in_tables: bool,
38
+ hocr_spatial_tables: bool,
39
+ highlight_style: highlight_style,
40
+ extract_metadata: bool,
41
+ whitespace_mode: whitespace_mode,
42
+ strip_newlines: bool,
43
+ wrap: bool,
44
+ wrap_width: Integer,
45
+ convert_as_inline: bool,
46
+ sub_symbol: String,
47
+ sup_symbol: String,
48
+ newline_style: newline_style,
49
+ code_block_style: code_block_style,
50
+ keep_inline_images_in: Array[String],
51
+ preprocessing: preprocessing_options,
52
+ encoding: String,
53
+ debug: bool,
54
+ strip_tags: Array[String],
55
+ preserve_tags: Array[String]
56
+ }
57
+
58
+ type inline_image_config = {
59
+ max_decoded_size_bytes: Integer,
60
+ filename_prefix: String?,
61
+ capture_svg: bool,
62
+ infer_dimensions: bool
63
+ }
64
+
65
+ type inline_image_format = "png" | "jpeg" | "gif" | "bmp" | "webp" | "svg" | String
66
+
67
+ type inline_image_source = "img_data_uri" | "svg_element"
68
+
69
+ type inline_image = {
70
+ data: String,
71
+ format: inline_image_format,
72
+ filename: String?,
73
+ description: String?,
74
+ dimensions: [Integer, Integer]?,
75
+ source: inline_image_source,
76
+ attributes: Hash[String, String]
77
+ }
78
+
79
+ type inline_image_warning = {
80
+ index: Integer,
81
+ message: String
82
+ }
83
+
84
+ type html_extraction = {
85
+ markdown: String,
86
+ inline_images: Array[inline_image],
87
+ warnings: Array[inline_image_warning]
88
+ }
89
+
90
+ # Native methods (implemented in Rust via Magnus/rb-sys)
91
+ # These are aliased from the Rust extension and available as both module and instance methods
92
+ private
93
+
94
+ def self.native_convert: (String html, conversion_options? options) -> String
95
+ def self.native_options: (conversion_options? options_hash) -> Options
96
+ def self.native_convert_with_options: (String html, Options options_handle) -> String
97
+ def self.native_convert_with_inline_images: (
98
+ String html,
99
+ conversion_options? options,
100
+ inline_image_config? image_config
101
+ ) -> html_extraction
102
+
103
+ def native_convert: (String html, conversion_options? options) -> String
104
+ def native_options: (conversion_options? options_hash) -> Options
105
+ def native_convert_with_options: (String html, Options options_handle) -> String
106
+ def native_convert_with_inline_images: (
107
+ String html,
108
+ conversion_options? options,
109
+ inline_image_config? image_config
110
+ ) -> html_extraction
111
+
112
+ public
113
+
114
+ # Convert HTML to Markdown with optional configuration
115
+ def self.convert: (String html, ?conversion_options? options) -> String
116
+
117
+ # Create a reusable options handle for performance
118
+ def self.options: (?conversion_options? options_hash) -> Options
119
+
120
+ # Convert HTML using a pre-built options handle
121
+ def self.convert_with_options: (String html, Options options_handle) -> String
122
+
123
+ # Convert HTML with inline image extraction
124
+ def self.convert_with_inline_images: (
125
+ String html,
126
+ ?conversion_options? options,
127
+ ?inline_image_config? image_config
128
+ ) -> html_extraction
129
+
130
+ # Instance method versions (created by module_function)
131
+ def convert: (String html, ?conversion_options? options) -> String
132
+ def options: (?conversion_options? options_hash) -> Options
133
+ def convert_with_options: (String html, Options options_handle) -> String
134
+ def convert_with_inline_images: (
135
+ String html,
136
+ ?conversion_options? options,
137
+ ?inline_image_config? image_config
138
+ ) -> html_extraction
139
+ end
data/sig/open3.rbs CHANGED
@@ -1,12 +1,12 @@
1
- # Type signature for Open3 standard library
2
- module Open3
3
- # Execute command and capture stdout, stderr, and status
4
- #
5
- # @param cmd Command to execute
6
- # @param args Command arguments
7
- # @return Array containing stdout (String), stderr (String), and status (Process::Status)
8
- def self.capture3: (
9
- String cmd,
10
- *String args
11
- ) -> [String, String, Process::Status]
12
- end
1
+ # Type signature for Open3 standard library
2
+ module Open3
3
+ # Execute command and capture stdout, stderr, and status
4
+ #
5
+ # @param cmd Command to execute
6
+ # @param args Command arguments
7
+ # @return Array containing stdout (String), stderr (String), and status (Process::Status)
8
+ def self.capture3: (
9
+ String cmd,
10
+ *String args
11
+ ) -> [String, String, Process::Status]
12
+ end
@@ -1,42 +1,42 @@
1
- # frozen_string_literal: true
2
-
3
- require 'spec_helper'
4
- require 'html_to_markdown/cli_proxy'
5
- require 'html_to_markdown/cli'
6
- require 'stringio'
7
-
8
- RSpec.describe HtmlToMarkdown::CLIProxy do
9
- describe '.call' do
10
- it 'executes the CLI binary' do
11
- begin
12
- binary = described_class.find_cli_binary
13
- rescue HtmlToMarkdown::CLIProxy::MissingBinaryError
14
- skip 'CLI binary not built'
15
- end
16
-
17
- expect(binary).to be_file
18
-
19
- output = described_class.call(['--version'])
20
- expect(output).to include(HtmlToMarkdown::VERSION)
21
- end
22
- end
23
-
24
- describe HtmlToMarkdown::CLI do
25
- it 'writes CLI output to stdout' do
26
- begin
27
- HtmlToMarkdown::CLIProxy.find_cli_binary
28
- rescue HtmlToMarkdown::CLIProxy::MissingBinaryError
29
- skip 'CLI binary not built'
30
- end
31
-
32
- stdout = StringIO.new
33
- stderr = StringIO.new
34
-
35
- exit_code = described_class.run(['--version'], stdout: stdout, stderr: stderr)
36
-
37
- expect(exit_code).to eq(0)
38
- expect(stdout.string).to include(HtmlToMarkdown::VERSION)
39
- expect(stderr.string).to be_empty
40
- end
41
- end
42
- end
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+ require 'html_to_markdown/cli_proxy'
5
+ require 'html_to_markdown/cli'
6
+ require 'stringio'
7
+
8
+ RSpec.describe HtmlToMarkdown::CLIProxy do
9
+ describe '.call' do
10
+ it 'executes the CLI binary' do
11
+ begin
12
+ binary = described_class.find_cli_binary
13
+ rescue HtmlToMarkdown::CLIProxy::MissingBinaryError
14
+ skip 'CLI binary not built'
15
+ end
16
+
17
+ expect(binary).to be_file
18
+
19
+ output = described_class.call(['--version'])
20
+ expect(output).to include(HtmlToMarkdown::VERSION)
21
+ end
22
+ end
23
+
24
+ describe HtmlToMarkdown::CLI do
25
+ it 'writes CLI output to stdout' do
26
+ begin
27
+ HtmlToMarkdown::CLIProxy.find_cli_binary
28
+ rescue HtmlToMarkdown::CLIProxy::MissingBinaryError
29
+ skip 'CLI binary not built'
30
+ end
31
+
32
+ stdout = StringIO.new
33
+ stderr = StringIO.new
34
+
35
+ exit_code = described_class.run(['--version'], stdout: stdout, stderr: stderr)
36
+
37
+ expect(exit_code).to eq(0)
38
+ expect(stdout.string).to include(HtmlToMarkdown::VERSION)
39
+ expect(stderr.string).to be_empty
40
+ end
41
+ end
42
+ end
data/spec/convert_spec.rb CHANGED
@@ -1,38 +1,77 @@
1
- # frozen_string_literal: true
2
-
3
- require 'spec_helper'
4
-
5
- RSpec.describe HtmlToMarkdown do
6
- describe '.convert' do
7
- it 'converts simple headings' do
8
- expect(described_class.convert('<h1>Hello</h1>')).to eq("# Hello\n")
9
- end
10
-
11
- it 'accepts options hash' do
12
- result = described_class.convert(
13
- '<h1>Hello</h1>',
14
- heading_style: :atx_closed,
15
- default_title: true
16
- )
17
- expect(result).to include('Hello')
18
- end
19
- end
20
-
21
- describe '.convert_with_inline_images' do
22
- it 'returns inline images metadata' do
23
- html = '<p><img src="data:image/png;base64,ZmFrZQ==" alt="fake"></p>'
24
- extraction = described_class.convert_with_inline_images(html)
25
- expect(extraction).to include(:markdown, :inline_images, :warnings)
26
- expect(extraction[:inline_images].first[:description]).to eq('fake')
27
- end
28
- end
29
-
30
- describe '.options' do
31
- it 'returns a reusable options handle' do
32
- handle = described_class.options(heading_style: :atx_closed)
33
- expect(handle).to be_a(HtmlToMarkdown::Options)
34
- result = described_class.convert_with_options('<h1>Hello</h1>', handle)
35
- expect(result).to include('# Hello #')
36
- end
37
- end
38
- end
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+
5
+ RSpec.describe HtmlToMarkdown do
6
+ describe '.convert' do
7
+ it 'converts simple headings' do
8
+ expect(described_class.convert('<h1>Hello</h1>')).to eq("# Hello\n")
9
+ end
10
+
11
+ it 'accepts options hash' do
12
+ result = described_class.convert(
13
+ '<h1>Hello</h1>',
14
+ heading_style: :atx_closed,
15
+ default_title: true
16
+ )
17
+ expect(result).to include('Hello')
18
+ end
19
+ end
20
+
21
+ describe '.convert_with_inline_images' do
22
+ it 'returns inline images metadata' do
23
+ html = '<p><img src="data:image/png;base64,ZmFrZQ==" alt="fake"></p>'
24
+ extraction = described_class.convert_with_inline_images(html)
25
+ expect(extraction).to include(:markdown, :inline_images, :warnings)
26
+ expect(extraction[:inline_images].first[:description]).to eq('fake')
27
+ end
28
+ end
29
+
30
+ describe '.options' do
31
+ it 'returns a reusable options handle' do
32
+ handle = described_class.options(heading_style: :atx_closed)
33
+ expect(handle).to be_a(HtmlToMarkdown::Options)
34
+ result = described_class.convert_with_options('<h1>Hello</h1>', handle)
35
+ expect(result).to include('# Hello #')
36
+ end
37
+ end
38
+
39
+ describe 'panic handling' do
40
+ context 'when a Rust panic would occur' do
41
+ it 'catches panics in convert method' do
42
+ malformed_html = "#{'<' * 100_000}div#{'>' * 100_000}"
43
+
44
+ begin
45
+ result = described_class.convert(malformed_html)
46
+ expect(result).to be_a(String)
47
+ rescue RuntimeError => e
48
+ expect(e.message).to match(/html-to-markdown panic during conversion/)
49
+ end
50
+ end
51
+
52
+ it 'catches panics in convert_with_options method' do
53
+ malformed_html = "#{'<' * 100_000}div#{'>' * 100_000}"
54
+ handle = described_class.options(heading_style: :atx)
55
+
56
+ begin
57
+ result = described_class.convert_with_options(malformed_html, handle)
58
+ expect(result).to be_a(String)
59
+ rescue RuntimeError => e
60
+ expect(e.message).to match(/html-to-markdown panic during conversion/)
61
+ end
62
+ end
63
+
64
+ it 'catches panics in convert_with_inline_images method' do
65
+ malformed_html = "#{'<' * 100_000}div#{'>' * 100_000}"
66
+
67
+ begin
68
+ result = described_class.convert_with_inline_images(malformed_html)
69
+ expect(result).to be_a(Hash)
70
+ expect(result).to include(:markdown, :inline_images, :warnings)
71
+ rescue RuntimeError => e
72
+ expect(e.message).to match(/html-to-markdown panic during conversion/)
73
+ end
74
+ end
75
+ end
76
+ end
77
+ end
data/spec/spec_helper.rb CHANGED
@@ -1,10 +1,10 @@
1
- # frozen_string_literal: true
2
-
3
- require 'bundler/setup'
4
- require 'html_to_markdown'
5
-
6
- RSpec.configure do |config|
7
- config.expect_with :rspec do |c|
8
- c.syntax = :expect
9
- end
10
- end
1
+ # frozen_string_literal: true
2
+
3
+ require 'bundler/setup'
4
+ require 'html_to_markdown'
5
+
6
+ RSpec.configure do |config|
7
+ config.expect_with :rspec do |c|
8
+ c.syntax = :expect
9
+ end
10
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html-to-markdown
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.9.2
4
+ version: 2.11.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-11-28 00:00:00.000000000 Z
11
+ date: 2025-12-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys