html-to-markdown 2.7.1 → 2.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 92b1acd7c60d7aa288f3a73ee5d5f67e0397719f61879edd0d8fa4d8e3b09601
4
- data.tar.gz: 687dabe472299a6007d1cc0462acb8a5103b6a41d63c4987788c8d915bdfe8c1
3
+ metadata.gz: 00b01de61bb11f04e93d91e1b76d39eba48fc92f86eade0d4161e26cc057d145
4
+ data.tar.gz: fd4eb68b80988e643bcd5ff26a8090a2f5857d622583e9018d98ff5294c331bb
5
5
  SHA512:
6
- metadata.gz: c98e25f2a37a2cedec0fa611e0460aaa6f26e7be19b3fba461f9a5a4fa6ebcc8bd76e0698489e1ba0c5e8a8a172596e67cdae9ba8dc0409c7466fb34329adb93
7
- data.tar.gz: 8cea9bc49e6156ce2242c155959793bd778f8747aaa6f343a8eec8d14285d503b2ef659d3cc3304b955f614dbc8fe123c7e2b7a878d6ddc965ae0a4350fab443
6
+ metadata.gz: 25c7819d959377c33d53dc8c2e98a83901f9eed243861ab8f46920cca6461d669cd7aa0069b409f6d9926c5d1cf3c4902a2c03b046bc8ab149efc876f8d22903
7
+ data.tar.gz: e9af104c04a4ec024c39f7de01a57b4981a43163a9b0e0855e78c897f663a00c78ab0e4bcb0676188e8224407c20257b58a1d1371b005621167147cea905ebcf
data/Gemfile CHANGED
@@ -9,7 +9,9 @@ gemspec
9
9
  group :development, :test do
10
10
  gem 'rake-compiler'
11
11
  gem 'rb_sys' # provides build tooling when developing locally
12
+ gem 'rbs', require: false
12
13
  gem 'rspec'
13
14
  gem 'rubocop', require: false
14
15
  gem 'rubocop-rspec', require: false
16
+ gem 'steep', require: false
15
17
  end
data/Gemfile.lock CHANGED
@@ -1,17 +1,47 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- html-to-markdown (2.7.1)
4
+ html-to-markdown (2.7.2)
5
5
  rb_sys (>= 0.9, < 1.0)
6
6
 
7
7
  GEM
8
8
  remote: https://rubygems.org/
9
9
  specs:
10
+ activesupport (8.1.1)
11
+ base64
12
+ bigdecimal
13
+ concurrent-ruby (~> 1.0, >= 1.3.1)
14
+ connection_pool (>= 2.2.5)
15
+ drb
16
+ i18n (>= 1.6, < 2)
17
+ json
18
+ logger (>= 1.4.2)
19
+ minitest (>= 5.1)
20
+ securerandom (>= 0.3)
21
+ tzinfo (~> 2.0, >= 2.0.5)
22
+ uri (>= 0.13.1)
10
23
  ast (2.4.3)
24
+ base64 (0.3.0)
25
+ bigdecimal (3.3.1)
26
+ concurrent-ruby (1.3.5)
27
+ connection_pool (2.5.4)
28
+ csv (3.3.5)
11
29
  diff-lcs (1.6.2)
30
+ drb (2.2.3)
31
+ ffi (1.17.2)
32
+ ffi (1.17.2-arm64-darwin)
33
+ fileutils (1.8.0)
34
+ i18n (1.14.7)
35
+ concurrent-ruby (~> 1.0)
12
36
  json (2.16.0)
13
37
  language_server-protocol (3.17.0.5)
14
38
  lint_roller (1.1.0)
39
+ listen (3.9.0)
40
+ rb-fsevent (~> 0.10, >= 0.10.3)
41
+ rb-inotify (~> 0.9, >= 0.9.10)
42
+ logger (1.7.0)
43
+ minitest (5.26.1)
44
+ mutex_m (0.3.0)
15
45
  parallel (1.27.0)
16
46
  parser (3.3.10.0)
17
47
  ast (~> 2.4.1)
@@ -23,8 +53,13 @@ GEM
23
53
  rake-compiler (1.3.0)
24
54
  rake
25
55
  rake-compiler-dock (1.9.1)
56
+ rb-fsevent (0.11.2)
57
+ rb-inotify (0.11.1)
58
+ ffi (~> 1.0)
26
59
  rb_sys (0.9.117)
27
60
  rake-compiler-dock (= 1.9.1)
61
+ rbs (3.9.5)
62
+ logger
28
63
  regexp_parser (2.11.3)
29
64
  rspec (3.13.2)
30
65
  rspec-core (~> 3.13.0)
@@ -53,13 +88,37 @@ GEM
53
88
  rubocop-ast (1.48.0)
54
89
  parser (>= 3.3.7.2)
55
90
  prism (~> 1.4)
56
- rubocop-rspec (3.7.0)
91
+ rubocop-rspec (3.8.0)
57
92
  lint_roller (~> 1.1)
58
- rubocop (~> 1.72, >= 1.72.1)
93
+ rubocop (~> 1.81)
59
94
  ruby-progressbar (1.13.0)
95
+ securerandom (0.4.1)
96
+ steep (1.10.0)
97
+ activesupport (>= 5.1)
98
+ concurrent-ruby (>= 1.1.10)
99
+ csv (>= 3.0.9)
100
+ fileutils (>= 1.1.0)
101
+ json (>= 2.1.0)
102
+ language_server-protocol (>= 3.17.0.4, < 4.0)
103
+ listen (~> 3.0)
104
+ logger (>= 1.3.0)
105
+ mutex_m (>= 0.3.0)
106
+ parser (>= 3.1)
107
+ rainbow (>= 2.2.2, < 4.0)
108
+ rbs (~> 3.9)
109
+ securerandom (>= 0.1)
110
+ strscan (>= 1.0.0)
111
+ terminal-table (>= 2, < 5)
112
+ uri (>= 0.12.0)
113
+ strscan (3.1.5)
114
+ terminal-table (4.0.0)
115
+ unicode-display_width (>= 1.1.1, < 4)
116
+ tzinfo (2.0.6)
117
+ concurrent-ruby (~> 1.0)
60
118
  unicode-display_width (3.2.0)
61
119
  unicode-emoji (~> 4.1)
62
120
  unicode-emoji (4.1.0)
121
+ uri (1.1.1)
63
122
 
64
123
  PLATFORMS
65
124
  arm64-darwin-24
@@ -69,9 +128,11 @@ DEPENDENCIES
69
128
  html-to-markdown!
70
129
  rake-compiler
71
130
  rb_sys
131
+ rbs
72
132
  rspec
73
133
  rubocop
74
134
  rubocop-rspec
135
+ steep
75
136
 
76
137
  RUBY VERSION
77
138
  ruby 3.2.9p248
data/Steepfile ADDED
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Steepfile for type checking html-to-markdown Ruby gem
4
+
5
+ target :lib do
6
+ signature "sig"
7
+
8
+ check "lib"
9
+
10
+ # Configure libraries
11
+ library "pathname"
12
+ library "open3"
13
+
14
+ # Ignore vendor directory
15
+ ignore "vendor"
16
+
17
+ # Ignore spec directory
18
+ ignore "spec"
19
+
20
+ # Ignore bin directory
21
+ ignore "bin"
22
+
23
+ # Ignore internal implementation modules (not public API)
24
+ ignore "lib/html_to_markdown/cli.rb"
25
+ ignore "lib/html_to_markdown/cli_proxy.rb"
26
+ end
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "html-to-markdown-rb"
3
- version = "2.7.1"
3
+ version = "2.7.2"
4
4
  edition = "2024"
5
5
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
6
6
  license = "MIT"
@@ -42,11 +42,11 @@ module HtmlToMarkdown
42
42
  end
43
43
 
44
44
  def root_path
45
- @root_path ||= Pathname(__dir__).join('../..').expand_path
45
+ @root_path ||= Pathname(__dir__.to_s).join('../..').expand_path
46
46
  end
47
47
 
48
48
  def lib_path
49
- @lib_path ||= Pathname(__dir__).join('..').expand_path
49
+ @lib_path ||= Pathname(__dir__.to_s).join('..').expand_path
50
50
  end
51
51
 
52
52
  def search_paths(binary_name)
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HtmlToMarkdown
4
- VERSION = '2.7.1'
4
+ VERSION = '2.7.2'
5
5
  end
@@ -0,0 +1,24 @@
1
+ module HtmlToMarkdown
2
+ module CLI
3
+ # Module method (module_function creates both module and instance methods)
4
+ #
5
+ # Run the CLI with the given arguments
6
+ #
7
+ # @param argv Command-line arguments (defaults to ARGV)
8
+ # @param stdout Output stream for standard output
9
+ # @param stderr Output stream for standard error
10
+ # @return Exit code (0 for success, non-zero for failure)
11
+ def self.run: (
12
+ ?Array[String] argv,
13
+ ?stdout: IO,
14
+ ?stderr: IO
15
+ ) -> Integer
16
+
17
+ # Instance method version (created by module_function)
18
+ def run: (
19
+ ?Array[String] argv,
20
+ ?stdout: IO,
21
+ ?stderr: IO
22
+ ) -> Integer
23
+ end
24
+ end
@@ -0,0 +1,48 @@
1
+ module HtmlToMarkdown
2
+ module CLIProxy
3
+ # Base error class
4
+ class Error < StandardError
5
+ end
6
+
7
+ # Error when CLI binary is not found
8
+ class MissingBinaryError < Error
9
+ end
10
+
11
+ # Error when CLI execution fails
12
+ class CLIExecutionError < Error
13
+ attr_reader stderr: String
14
+ attr_reader status: Integer?
15
+
16
+ def initialize: (String message, stderr: String, status: Integer?) -> void
17
+ end
18
+
19
+ # Module methods (module_function creates both module and instance methods)
20
+
21
+ # Execute CLI with given arguments
22
+ def self.call: (Array[String] argv) -> String
23
+
24
+ # Find the CLI binary in search paths
25
+ def self.find_cli_binary: () -> Pathname
26
+
27
+ # Get root path of the gem
28
+ def self.root_path: () -> Pathname
29
+
30
+ # Get lib path of the gem
31
+ def self.lib_path: () -> Pathname
32
+
33
+ # Get search paths for CLI binary
34
+ def self.search_paths: (String binary_name) -> Array[Pathname]
35
+
36
+ # Get error message for missing binary
37
+ def self.missing_binary_message: () -> String
38
+
39
+ # Instance method versions (created by module_function)
40
+
41
+ def call: (Array[String] argv) -> String
42
+ def find_cli_binary: () -> Pathname
43
+ def root_path: () -> Pathname
44
+ def lib_path: () -> Pathname
45
+ def search_paths: (String binary_name) -> Array[Pathname]
46
+ def missing_binary_message: () -> String
47
+ end
48
+ end
@@ -0,0 +1,139 @@
1
+ # Type definitions for HtmlToMarkdown Ruby gem
2
+ module HtmlToMarkdown
3
+ VERSION: String
4
+
5
+ # Opaque handle for reusable conversion options
6
+ class Options
7
+ end
8
+
9
+ type heading_style = :underlined | :atx | :atx_closed
10
+ type list_indent_type = :spaces | :tabs
11
+ type highlight_style = :double_equal | :html | :bold | :none
12
+ type whitespace_mode = :normalized | :strict
13
+ type newline_style = :spaces | :backslash
14
+ type code_block_style = :indented | :backticks | :tildes
15
+ type preprocessing_preset = :minimal | :standard | :aggressive
16
+
17
+ type preprocessing_options = {
18
+ enabled: bool,
19
+ preset: preprocessing_preset,
20
+ remove_navigation: bool,
21
+ remove_forms: bool
22
+ }
23
+
24
+ type conversion_options = {
25
+ heading_style: heading_style,
26
+ list_indent_type: list_indent_type,
27
+ list_indent_width: Integer,
28
+ bullets: String,
29
+ strong_em_symbol: String,
30
+ escape_asterisks: bool,
31
+ escape_underscores: bool,
32
+ escape_misc: bool,
33
+ escape_ascii: bool,
34
+ code_language: String,
35
+ autolinks: bool,
36
+ default_title: bool,
37
+ br_in_tables: bool,
38
+ hocr_spatial_tables: bool,
39
+ highlight_style: highlight_style,
40
+ extract_metadata: bool,
41
+ whitespace_mode: whitespace_mode,
42
+ strip_newlines: bool,
43
+ wrap: bool,
44
+ wrap_width: Integer,
45
+ convert_as_inline: bool,
46
+ sub_symbol: String,
47
+ sup_symbol: String,
48
+ newline_style: newline_style,
49
+ code_block_style: code_block_style,
50
+ keep_inline_images_in: Array[String],
51
+ preprocessing: preprocessing_options,
52
+ encoding: String,
53
+ debug: bool,
54
+ strip_tags: Array[String],
55
+ preserve_tags: Array[String]
56
+ }
57
+
58
+ type inline_image_config = {
59
+ max_decoded_size_bytes: Integer,
60
+ filename_prefix: String?,
61
+ capture_svg: bool,
62
+ infer_dimensions: bool
63
+ }
64
+
65
+ type inline_image_format = "png" | "jpeg" | "gif" | "bmp" | "webp" | "svg" | String
66
+
67
+ type inline_image_source = "img_data_uri" | "svg_element"
68
+
69
+ type inline_image = {
70
+ data: String,
71
+ format: inline_image_format,
72
+ filename: String?,
73
+ description: String?,
74
+ dimensions: [Integer, Integer]?,
75
+ source: inline_image_source,
76
+ attributes: Hash[String, String]
77
+ }
78
+
79
+ type inline_image_warning = {
80
+ index: Integer,
81
+ message: String
82
+ }
83
+
84
+ type html_extraction = {
85
+ markdown: String,
86
+ inline_images: Array[inline_image],
87
+ warnings: Array[inline_image_warning]
88
+ }
89
+
90
+ # Native methods (implemented in Rust via Magnus/rb-sys)
91
+ # These are aliased from the Rust extension and available as both module and instance methods
92
+ private
93
+
94
+ def self.native_convert: (String html, conversion_options? options) -> String
95
+ def self.native_options: (conversion_options? options_hash) -> Options
96
+ def self.native_convert_with_options: (String html, Options options_handle) -> String
97
+ def self.native_convert_with_inline_images: (
98
+ String html,
99
+ conversion_options? options,
100
+ inline_image_config? image_config
101
+ ) -> html_extraction
102
+
103
+ def native_convert: (String html, conversion_options? options) -> String
104
+ def native_options: (conversion_options? options_hash) -> Options
105
+ def native_convert_with_options: (String html, Options options_handle) -> String
106
+ def native_convert_with_inline_images: (
107
+ String html,
108
+ conversion_options? options,
109
+ inline_image_config? image_config
110
+ ) -> html_extraction
111
+
112
+ public
113
+
114
+ # Convert HTML to Markdown with optional configuration
115
+ def self.convert: (String html, ?conversion_options? options) -> String
116
+
117
+ # Create a reusable options handle for performance
118
+ def self.options: (?conversion_options? options_hash) -> Options
119
+
120
+ # Convert HTML using a pre-built options handle
121
+ def self.convert_with_options: (String html, Options options_handle) -> String
122
+
123
+ # Convert HTML with inline image extraction
124
+ def self.convert_with_inline_images: (
125
+ String html,
126
+ ?conversion_options? options,
127
+ ?inline_image_config? image_config
128
+ ) -> html_extraction
129
+
130
+ # Instance method versions (created by module_function)
131
+ def convert: (String html, ?conversion_options? options) -> String
132
+ def options: (?conversion_options? options_hash) -> Options
133
+ def convert_with_options: (String html, Options options_handle) -> String
134
+ def convert_with_inline_images: (
135
+ String html,
136
+ ?conversion_options? options,
137
+ ?inline_image_config? image_config
138
+ ) -> html_extraction
139
+ end
data/sig/open3.rbs ADDED
@@ -0,0 +1,12 @@
1
+ # Type signature for Open3 standard library
2
+ module Open3
3
+ # Execute command and capture stdout, stderr, and status
4
+ #
5
+ # @param cmd Command to execute
6
+ # @param args Command arguments
7
+ # @return Array containing stdout (String), stderr (String), and status (Process::Status)
8
+ def self.capture3: (
9
+ String cmd,
10
+ *String args
11
+ ) -> [String, String, Process::Status]
12
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html-to-markdown
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.7.1
4
+ version: 2.7.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-11-12 00:00:00.000000000 Z
11
+ date: 2025-11-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -47,6 +47,7 @@ files:
47
47
  - Gemfile.lock
48
48
  - README.md
49
49
  - Rakefile
50
+ - Steepfile
50
51
  - bin/benchmark.rb
51
52
  - exe/html-to-markdown
52
53
  - ext/html-to-markdown-rb/extconf.rb
@@ -59,6 +60,10 @@ files:
59
60
  - lib/html_to_markdown/cli.rb
60
61
  - lib/html_to_markdown/cli_proxy.rb
61
62
  - lib/html_to_markdown/version.rb
63
+ - sig/html_to_markdown.rbs
64
+ - sig/html_to_markdown/cli.rbs
65
+ - sig/html_to_markdown/cli_proxy.rbs
66
+ - sig/open3.rbs
62
67
  - spec/cli_proxy_spec.rb
63
68
  - spec/convert_spec.rb
64
69
  - spec/spec_helper.rb