html-to-markdown 2.7.1 → 2.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/Gemfile.lock +64 -3
- data/Steepfile +26 -0
- data/ext/html-to-markdown-rb/native/Cargo.toml +1 -1
- data/lib/html_to_markdown/cli_proxy.rb +2 -2
- data/lib/html_to_markdown/version.rb +1 -1
- data/sig/html_to_markdown/cli.rbs +24 -0
- data/sig/html_to_markdown/cli_proxy.rbs +48 -0
- data/sig/html_to_markdown.rbs +139 -0
- data/sig/open3.rbs +12 -0
- metadata +7 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 00b01de61bb11f04e93d91e1b76d39eba48fc92f86eade0d4161e26cc057d145
|
|
4
|
+
data.tar.gz: fd4eb68b80988e643bcd5ff26a8090a2f5857d622583e9018d98ff5294c331bb
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 25c7819d959377c33d53dc8c2e98a83901f9eed243861ab8f46920cca6461d669cd7aa0069b409f6d9926c5d1cf3c4902a2c03b046bc8ab149efc876f8d22903
|
|
7
|
+
data.tar.gz: e9af104c04a4ec024c39f7de01a57b4981a43163a9b0e0855e78c897f663a00c78ab0e4bcb0676188e8224407c20257b58a1d1371b005621167147cea905ebcf
|
data/Gemfile
CHANGED
|
@@ -9,7 +9,9 @@ gemspec
|
|
|
9
9
|
group :development, :test do
|
|
10
10
|
gem 'rake-compiler'
|
|
11
11
|
gem 'rb_sys' # provides build tooling when developing locally
|
|
12
|
+
gem 'rbs', require: false
|
|
12
13
|
gem 'rspec'
|
|
13
14
|
gem 'rubocop', require: false
|
|
14
15
|
gem 'rubocop-rspec', require: false
|
|
16
|
+
gem 'steep', require: false
|
|
15
17
|
end
|
data/Gemfile.lock
CHANGED
|
@@ -1,17 +1,47 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
html-to-markdown (2.7.1)
|
|
4
|
+
html-to-markdown (2.7.2)
|
|
5
5
|
rb_sys (>= 0.9, < 1.0)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
8
8
|
remote: https://rubygems.org/
|
|
9
9
|
specs:
|
|
10
|
+
activesupport (8.1.1)
|
|
11
|
+
base64
|
|
12
|
+
bigdecimal
|
|
13
|
+
concurrent-ruby (~> 1.0, >= 1.3.1)
|
|
14
|
+
connection_pool (>= 2.2.5)
|
|
15
|
+
drb
|
|
16
|
+
i18n (>= 1.6, < 2)
|
|
17
|
+
json
|
|
18
|
+
logger (>= 1.4.2)
|
|
19
|
+
minitest (>= 5.1)
|
|
20
|
+
securerandom (>= 0.3)
|
|
21
|
+
tzinfo (~> 2.0, >= 2.0.5)
|
|
22
|
+
uri (>= 0.13.1)
|
|
10
23
|
ast (2.4.3)
|
|
24
|
+
base64 (0.3.0)
|
|
25
|
+
bigdecimal (3.3.1)
|
|
26
|
+
concurrent-ruby (1.3.5)
|
|
27
|
+
connection_pool (2.5.4)
|
|
28
|
+
csv (3.3.5)
|
|
11
29
|
diff-lcs (1.6.2)
|
|
30
|
+
drb (2.2.3)
|
|
31
|
+
ffi (1.17.2)
|
|
32
|
+
ffi (1.17.2-arm64-darwin)
|
|
33
|
+
fileutils (1.8.0)
|
|
34
|
+
i18n (1.14.7)
|
|
35
|
+
concurrent-ruby (~> 1.0)
|
|
12
36
|
json (2.16.0)
|
|
13
37
|
language_server-protocol (3.17.0.5)
|
|
14
38
|
lint_roller (1.1.0)
|
|
39
|
+
listen (3.9.0)
|
|
40
|
+
rb-fsevent (~> 0.10, >= 0.10.3)
|
|
41
|
+
rb-inotify (~> 0.9, >= 0.9.10)
|
|
42
|
+
logger (1.7.0)
|
|
43
|
+
minitest (5.26.1)
|
|
44
|
+
mutex_m (0.3.0)
|
|
15
45
|
parallel (1.27.0)
|
|
16
46
|
parser (3.3.10.0)
|
|
17
47
|
ast (~> 2.4.1)
|
|
@@ -23,8 +53,13 @@ GEM
|
|
|
23
53
|
rake-compiler (1.3.0)
|
|
24
54
|
rake
|
|
25
55
|
rake-compiler-dock (1.9.1)
|
|
56
|
+
rb-fsevent (0.11.2)
|
|
57
|
+
rb-inotify (0.11.1)
|
|
58
|
+
ffi (~> 1.0)
|
|
26
59
|
rb_sys (0.9.117)
|
|
27
60
|
rake-compiler-dock (= 1.9.1)
|
|
61
|
+
rbs (3.9.5)
|
|
62
|
+
logger
|
|
28
63
|
regexp_parser (2.11.3)
|
|
29
64
|
rspec (3.13.2)
|
|
30
65
|
rspec-core (~> 3.13.0)
|
|
@@ -53,13 +88,37 @@ GEM
|
|
|
53
88
|
rubocop-ast (1.48.0)
|
|
54
89
|
parser (>= 3.3.7.2)
|
|
55
90
|
prism (~> 1.4)
|
|
56
|
-
rubocop-rspec (3.
|
|
91
|
+
rubocop-rspec (3.8.0)
|
|
57
92
|
lint_roller (~> 1.1)
|
|
58
|
-
rubocop (~> 1.
|
|
93
|
+
rubocop (~> 1.81)
|
|
59
94
|
ruby-progressbar (1.13.0)
|
|
95
|
+
securerandom (0.4.1)
|
|
96
|
+
steep (1.10.0)
|
|
97
|
+
activesupport (>= 5.1)
|
|
98
|
+
concurrent-ruby (>= 1.1.10)
|
|
99
|
+
csv (>= 3.0.9)
|
|
100
|
+
fileutils (>= 1.1.0)
|
|
101
|
+
json (>= 2.1.0)
|
|
102
|
+
language_server-protocol (>= 3.17.0.4, < 4.0)
|
|
103
|
+
listen (~> 3.0)
|
|
104
|
+
logger (>= 1.3.0)
|
|
105
|
+
mutex_m (>= 0.3.0)
|
|
106
|
+
parser (>= 3.1)
|
|
107
|
+
rainbow (>= 2.2.2, < 4.0)
|
|
108
|
+
rbs (~> 3.9)
|
|
109
|
+
securerandom (>= 0.1)
|
|
110
|
+
strscan (>= 1.0.0)
|
|
111
|
+
terminal-table (>= 2, < 5)
|
|
112
|
+
uri (>= 0.12.0)
|
|
113
|
+
strscan (3.1.5)
|
|
114
|
+
terminal-table (4.0.0)
|
|
115
|
+
unicode-display_width (>= 1.1.1, < 4)
|
|
116
|
+
tzinfo (2.0.6)
|
|
117
|
+
concurrent-ruby (~> 1.0)
|
|
60
118
|
unicode-display_width (3.2.0)
|
|
61
119
|
unicode-emoji (~> 4.1)
|
|
62
120
|
unicode-emoji (4.1.0)
|
|
121
|
+
uri (1.1.1)
|
|
63
122
|
|
|
64
123
|
PLATFORMS
|
|
65
124
|
arm64-darwin-24
|
|
@@ -69,9 +128,11 @@ DEPENDENCIES
|
|
|
69
128
|
html-to-markdown!
|
|
70
129
|
rake-compiler
|
|
71
130
|
rb_sys
|
|
131
|
+
rbs
|
|
72
132
|
rspec
|
|
73
133
|
rubocop
|
|
74
134
|
rubocop-rspec
|
|
135
|
+
steep
|
|
75
136
|
|
|
76
137
|
RUBY VERSION
|
|
77
138
|
ruby 3.2.9p248
|
data/Steepfile
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Steepfile for type checking html-to-markdown Ruby gem
|
|
4
|
+
|
|
5
|
+
target :lib do
|
|
6
|
+
signature "sig"
|
|
7
|
+
|
|
8
|
+
check "lib"
|
|
9
|
+
|
|
10
|
+
# Configure libraries
|
|
11
|
+
library "pathname"
|
|
12
|
+
library "open3"
|
|
13
|
+
|
|
14
|
+
# Ignore vendor directory
|
|
15
|
+
ignore "vendor"
|
|
16
|
+
|
|
17
|
+
# Ignore spec directory
|
|
18
|
+
ignore "spec"
|
|
19
|
+
|
|
20
|
+
# Ignore bin directory
|
|
21
|
+
ignore "bin"
|
|
22
|
+
|
|
23
|
+
# Ignore internal implementation modules (not public API)
|
|
24
|
+
ignore "lib/html_to_markdown/cli.rb"
|
|
25
|
+
ignore "lib/html_to_markdown/cli_proxy.rb"
|
|
26
|
+
end
|
|
@@ -42,11 +42,11 @@ module HtmlToMarkdown
|
|
|
42
42
|
end
|
|
43
43
|
|
|
44
44
|
def root_path
|
|
45
|
-
@root_path ||= Pathname(__dir__).join('../..').expand_path
|
|
45
|
+
@root_path ||= Pathname(__dir__.to_s).join('../..').expand_path
|
|
46
46
|
end
|
|
47
47
|
|
|
48
48
|
def lib_path
|
|
49
|
-
@lib_path ||= Pathname(__dir__).join('..').expand_path
|
|
49
|
+
@lib_path ||= Pathname(__dir__.to_s).join('..').expand_path
|
|
50
50
|
end
|
|
51
51
|
|
|
52
52
|
def search_paths(binary_name)
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
module HtmlToMarkdown
|
|
2
|
+
module CLI
|
|
3
|
+
# Module method (module_function creates both module and instance methods)
|
|
4
|
+
#
|
|
5
|
+
# Run the CLI with the given arguments
|
|
6
|
+
#
|
|
7
|
+
# @param argv Command-line arguments (defaults to ARGV)
|
|
8
|
+
# @param stdout Output stream for standard output
|
|
9
|
+
# @param stderr Output stream for standard error
|
|
10
|
+
# @return Exit code (0 for success, non-zero for failure)
|
|
11
|
+
def self.run: (
|
|
12
|
+
?Array[String] argv,
|
|
13
|
+
?stdout: IO,
|
|
14
|
+
?stderr: IO
|
|
15
|
+
) -> Integer
|
|
16
|
+
|
|
17
|
+
# Instance method version (created by module_function)
|
|
18
|
+
def run: (
|
|
19
|
+
?Array[String] argv,
|
|
20
|
+
?stdout: IO,
|
|
21
|
+
?stderr: IO
|
|
22
|
+
) -> Integer
|
|
23
|
+
end
|
|
24
|
+
end
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
module HtmlToMarkdown
|
|
2
|
+
module CLIProxy
|
|
3
|
+
# Base error class
|
|
4
|
+
class Error < StandardError
|
|
5
|
+
end
|
|
6
|
+
|
|
7
|
+
# Error when CLI binary is not found
|
|
8
|
+
class MissingBinaryError < Error
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
# Error when CLI execution fails
|
|
12
|
+
class CLIExecutionError < Error
|
|
13
|
+
attr_reader stderr: String
|
|
14
|
+
attr_reader status: Integer?
|
|
15
|
+
|
|
16
|
+
def initialize: (String message, stderr: String, status: Integer?) -> void
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
# Module methods (module_function creates both module and instance methods)
|
|
20
|
+
|
|
21
|
+
# Execute CLI with given arguments
|
|
22
|
+
def self.call: (Array[String] argv) -> String
|
|
23
|
+
|
|
24
|
+
# Find the CLI binary in search paths
|
|
25
|
+
def self.find_cli_binary: () -> Pathname
|
|
26
|
+
|
|
27
|
+
# Get root path of the gem
|
|
28
|
+
def self.root_path: () -> Pathname
|
|
29
|
+
|
|
30
|
+
# Get lib path of the gem
|
|
31
|
+
def self.lib_path: () -> Pathname
|
|
32
|
+
|
|
33
|
+
# Get search paths for CLI binary
|
|
34
|
+
def self.search_paths: (String binary_name) -> Array[Pathname]
|
|
35
|
+
|
|
36
|
+
# Get error message for missing binary
|
|
37
|
+
def self.missing_binary_message: () -> String
|
|
38
|
+
|
|
39
|
+
# Instance method versions (created by module_function)
|
|
40
|
+
|
|
41
|
+
def call: (Array[String] argv) -> String
|
|
42
|
+
def find_cli_binary: () -> Pathname
|
|
43
|
+
def root_path: () -> Pathname
|
|
44
|
+
def lib_path: () -> Pathname
|
|
45
|
+
def search_paths: (String binary_name) -> Array[Pathname]
|
|
46
|
+
def missing_binary_message: () -> String
|
|
47
|
+
end
|
|
48
|
+
end
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
# Type definitions for HtmlToMarkdown Ruby gem
|
|
2
|
+
module HtmlToMarkdown
|
|
3
|
+
VERSION: String
|
|
4
|
+
|
|
5
|
+
# Opaque handle for reusable conversion options
|
|
6
|
+
class Options
|
|
7
|
+
end
|
|
8
|
+
|
|
9
|
+
type heading_style = :underlined | :atx | :atx_closed
|
|
10
|
+
type list_indent_type = :spaces | :tabs
|
|
11
|
+
type highlight_style = :double_equal | :html | :bold | :none
|
|
12
|
+
type whitespace_mode = :normalized | :strict
|
|
13
|
+
type newline_style = :spaces | :backslash
|
|
14
|
+
type code_block_style = :indented | :backticks | :tildes
|
|
15
|
+
type preprocessing_preset = :minimal | :standard | :aggressive
|
|
16
|
+
|
|
17
|
+
type preprocessing_options = {
|
|
18
|
+
enabled: bool,
|
|
19
|
+
preset: preprocessing_preset,
|
|
20
|
+
remove_navigation: bool,
|
|
21
|
+
remove_forms: bool
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
type conversion_options = {
|
|
25
|
+
heading_style: heading_style,
|
|
26
|
+
list_indent_type: list_indent_type,
|
|
27
|
+
list_indent_width: Integer,
|
|
28
|
+
bullets: String,
|
|
29
|
+
strong_em_symbol: String,
|
|
30
|
+
escape_asterisks: bool,
|
|
31
|
+
escape_underscores: bool,
|
|
32
|
+
escape_misc: bool,
|
|
33
|
+
escape_ascii: bool,
|
|
34
|
+
code_language: String,
|
|
35
|
+
autolinks: bool,
|
|
36
|
+
default_title: bool,
|
|
37
|
+
br_in_tables: bool,
|
|
38
|
+
hocr_spatial_tables: bool,
|
|
39
|
+
highlight_style: highlight_style,
|
|
40
|
+
extract_metadata: bool,
|
|
41
|
+
whitespace_mode: whitespace_mode,
|
|
42
|
+
strip_newlines: bool,
|
|
43
|
+
wrap: bool,
|
|
44
|
+
wrap_width: Integer,
|
|
45
|
+
convert_as_inline: bool,
|
|
46
|
+
sub_symbol: String,
|
|
47
|
+
sup_symbol: String,
|
|
48
|
+
newline_style: newline_style,
|
|
49
|
+
code_block_style: code_block_style,
|
|
50
|
+
keep_inline_images_in: Array[String],
|
|
51
|
+
preprocessing: preprocessing_options,
|
|
52
|
+
encoding: String,
|
|
53
|
+
debug: bool,
|
|
54
|
+
strip_tags: Array[String],
|
|
55
|
+
preserve_tags: Array[String]
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
type inline_image_config = {
|
|
59
|
+
max_decoded_size_bytes: Integer,
|
|
60
|
+
filename_prefix: String?,
|
|
61
|
+
capture_svg: bool,
|
|
62
|
+
infer_dimensions: bool
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
type inline_image_format = "png" | "jpeg" | "gif" | "bmp" | "webp" | "svg" | String
|
|
66
|
+
|
|
67
|
+
type inline_image_source = "img_data_uri" | "svg_element"
|
|
68
|
+
|
|
69
|
+
type inline_image = {
|
|
70
|
+
data: String,
|
|
71
|
+
format: inline_image_format,
|
|
72
|
+
filename: String?,
|
|
73
|
+
description: String?,
|
|
74
|
+
dimensions: [Integer, Integer]?,
|
|
75
|
+
source: inline_image_source,
|
|
76
|
+
attributes: Hash[String, String]
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
type inline_image_warning = {
|
|
80
|
+
index: Integer,
|
|
81
|
+
message: String
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
type html_extraction = {
|
|
85
|
+
markdown: String,
|
|
86
|
+
inline_images: Array[inline_image],
|
|
87
|
+
warnings: Array[inline_image_warning]
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
# Native methods (implemented in Rust via Magnus/rb-sys)
|
|
91
|
+
# These are aliased from the Rust extension and available as both module and instance methods
|
|
92
|
+
private
|
|
93
|
+
|
|
94
|
+
def self.native_convert: (String html, conversion_options? options) -> String
|
|
95
|
+
def self.native_options: (conversion_options? options_hash) -> Options
|
|
96
|
+
def self.native_convert_with_options: (String html, Options options_handle) -> String
|
|
97
|
+
def self.native_convert_with_inline_images: (
|
|
98
|
+
String html,
|
|
99
|
+
conversion_options? options,
|
|
100
|
+
inline_image_config? image_config
|
|
101
|
+
) -> html_extraction
|
|
102
|
+
|
|
103
|
+
def native_convert: (String html, conversion_options? options) -> String
|
|
104
|
+
def native_options: (conversion_options? options_hash) -> Options
|
|
105
|
+
def native_convert_with_options: (String html, Options options_handle) -> String
|
|
106
|
+
def native_convert_with_inline_images: (
|
|
107
|
+
String html,
|
|
108
|
+
conversion_options? options,
|
|
109
|
+
inline_image_config? image_config
|
|
110
|
+
) -> html_extraction
|
|
111
|
+
|
|
112
|
+
public
|
|
113
|
+
|
|
114
|
+
# Convert HTML to Markdown with optional configuration
|
|
115
|
+
def self.convert: (String html, ?conversion_options? options) -> String
|
|
116
|
+
|
|
117
|
+
# Create a reusable options handle for performance
|
|
118
|
+
def self.options: (?conversion_options? options_hash) -> Options
|
|
119
|
+
|
|
120
|
+
# Convert HTML using a pre-built options handle
|
|
121
|
+
def self.convert_with_options: (String html, Options options_handle) -> String
|
|
122
|
+
|
|
123
|
+
# Convert HTML with inline image extraction
|
|
124
|
+
def self.convert_with_inline_images: (
|
|
125
|
+
String html,
|
|
126
|
+
?conversion_options? options,
|
|
127
|
+
?inline_image_config? image_config
|
|
128
|
+
) -> html_extraction
|
|
129
|
+
|
|
130
|
+
# Instance method versions (created by module_function)
|
|
131
|
+
def convert: (String html, ?conversion_options? options) -> String
|
|
132
|
+
def options: (?conversion_options? options_hash) -> Options
|
|
133
|
+
def convert_with_options: (String html, Options options_handle) -> String
|
|
134
|
+
def convert_with_inline_images: (
|
|
135
|
+
String html,
|
|
136
|
+
?conversion_options? options,
|
|
137
|
+
?inline_image_config? image_config
|
|
138
|
+
) -> html_extraction
|
|
139
|
+
end
|
data/sig/open3.rbs
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
# Type signature for Open3 standard library
|
|
2
|
+
module Open3
|
|
3
|
+
# Execute command and capture stdout, stderr, and status
|
|
4
|
+
#
|
|
5
|
+
# @param cmd Command to execute
|
|
6
|
+
# @param args Command arguments
|
|
7
|
+
# @return Array containing stdout (String), stderr (String), and status (Process::Status)
|
|
8
|
+
def self.capture3: (
|
|
9
|
+
String cmd,
|
|
10
|
+
*String args
|
|
11
|
+
) -> [String, String, Process::Status]
|
|
12
|
+
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html-to-markdown
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.7.1
|
|
4
|
+
version: 2.7.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2025-11-
|
|
11
|
+
date: 2025-11-14 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|
|
@@ -47,6 +47,7 @@ files:
|
|
|
47
47
|
- Gemfile.lock
|
|
48
48
|
- README.md
|
|
49
49
|
- Rakefile
|
|
50
|
+
- Steepfile
|
|
50
51
|
- bin/benchmark.rb
|
|
51
52
|
- exe/html-to-markdown
|
|
52
53
|
- ext/html-to-markdown-rb/extconf.rb
|
|
@@ -59,6 +60,10 @@ files:
|
|
|
59
60
|
- lib/html_to_markdown/cli.rb
|
|
60
61
|
- lib/html_to_markdown/cli_proxy.rb
|
|
61
62
|
- lib/html_to_markdown/version.rb
|
|
63
|
+
- sig/html_to_markdown.rbs
|
|
64
|
+
- sig/html_to_markdown/cli.rbs
|
|
65
|
+
- sig/html_to_markdown/cli_proxy.rbs
|
|
66
|
+
- sig/open3.rbs
|
|
62
67
|
- spec/cli_proxy_spec.rb
|
|
63
68
|
- spec/convert_spec.rb
|
|
64
69
|
- spec/spec_helper.rb
|