html-to-markdown 3.1.0-arm64-darwin → 3.2.0-arm64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +42 -12
- data/Gemfile +1 -0
- data/Gemfile.lock +27 -55
- data/README.md +9 -10
- data/Rakefile +4 -10
- data/html-to-markdown-rb.gemspec +1 -1
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +31 -21
- data/lib/html_to_markdown_rb.bundle +0 -0
- data/lib/html_to_markdown_rs.rb +3 -0
- data/sig/html_to_markdown.rbs +17 -5
- metadata +3 -6
- data/lib/html_to_markdown/cli.rb +0 -21
- data/lib/html_to_markdown/cli_proxy.rb +0 -74
- data/spec/cli_proxy_spec.rb +0 -42
- data/spec/spec_helper.rb +0 -10
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: e431d86a5ba9ddfaa1a293dcd7b08ea85bf7e450ba4999634cf6a5ff08972374
|
|
4
|
+
data.tar.gz: 6cd48dd5c759daa03d63c09276989de143209c0851651bf7a24426efe436d982
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: aebbdedaf61dbafc6228fee22631db3cc0dfa52439a4dbebb8e260bec7f71f29d638ea6dbb9b9a9eb4b5fc6f0220e972ce4e1d818fa5a33018bccc76397cb894
|
|
7
|
+
data.tar.gz: e3ff57fd3c7c5129f9ff99a25656b28e4cc74fa4b98981cddeeb54958fd72f7527df0048f18812338444ec8c92e3a5454f18965b63512784b263aa67b49ceb08
|
data/.rubocop.yml
CHANGED
|
@@ -1,29 +1,59 @@
|
|
|
1
1
|
plugins:
|
|
2
|
+
- rubocop-performance
|
|
2
3
|
- rubocop-rspec
|
|
3
4
|
|
|
4
5
|
AllCops:
|
|
5
|
-
NewCops: enable
|
|
6
6
|
TargetRubyVersion: 3.2
|
|
7
|
+
NewCops: enable
|
|
8
|
+
SuggestExtensions: false
|
|
7
9
|
Exclude:
|
|
8
|
-
-
|
|
9
|
-
-
|
|
10
|
+
- 'vendor/**/*'
|
|
11
|
+
- 'tmp/**/*'
|
|
12
|
+
- 'lib/**/*.bundle'
|
|
13
|
+
- 'ext/**/*'
|
|
14
|
+
|
|
15
|
+
Style/FrozenStringLiteralComment:
|
|
16
|
+
Enabled: true
|
|
17
|
+
EnforcedStyle: always
|
|
18
|
+
|
|
19
|
+
Style/StringLiterals:
|
|
20
|
+
Enabled: true
|
|
21
|
+
EnforcedStyle: single_quotes
|
|
22
|
+
|
|
23
|
+
Style/StringLiteralsInInterpolation:
|
|
24
|
+
Enabled: true
|
|
25
|
+
EnforcedStyle: single_quotes
|
|
10
26
|
|
|
11
27
|
Style/Documentation:
|
|
12
28
|
Enabled: false
|
|
13
29
|
|
|
14
|
-
|
|
30
|
+
Layout/LineLength:
|
|
31
|
+
Max: 120
|
|
32
|
+
AllowedPatterns:
|
|
33
|
+
- '\A\s*#'
|
|
15
34
|
Exclude:
|
|
16
|
-
-
|
|
17
|
-
- "*.gemspec"
|
|
35
|
+
- 'spec/**/*'
|
|
18
36
|
|
|
19
37
|
Metrics/MethodLength:
|
|
20
|
-
Max:
|
|
38
|
+
Max: 20
|
|
39
|
+
Exclude:
|
|
40
|
+
- 'spec/**/*'
|
|
21
41
|
|
|
22
|
-
|
|
23
|
-
Enabled:
|
|
42
|
+
Metrics/BlockLength:
|
|
43
|
+
Enabled: true
|
|
44
|
+
Max: 350
|
|
45
|
+
CountComments: false
|
|
46
|
+
|
|
47
|
+
Metrics/AbcSize:
|
|
48
|
+
Max: 20
|
|
49
|
+
Exclude:
|
|
50
|
+
- 'spec/**/*'
|
|
24
51
|
|
|
25
52
|
RSpec/ExampleLength:
|
|
26
|
-
|
|
53
|
+
Max: 50
|
|
27
54
|
|
|
28
|
-
RSpec/
|
|
29
|
-
|
|
55
|
+
RSpec/MultipleExpectations:
|
|
56
|
+
Max: 25
|
|
57
|
+
|
|
58
|
+
RSpec/NestedGroups:
|
|
59
|
+
Max: 6
|
data/Gemfile
CHANGED
|
@@ -12,6 +12,7 @@ group :development, :test do
|
|
|
12
12
|
gem 'rb_sys' # provides build tooling when developing locally
|
|
13
13
|
gem 'rspec'
|
|
14
14
|
gem 'rubocop', require: false
|
|
15
|
+
gem 'rubocop-performance', require: false
|
|
15
16
|
gem 'rubocop-rspec', require: false
|
|
16
17
|
gem 'steep', require: false
|
|
17
18
|
end
|
data/Gemfile.lock
CHANGED
|
@@ -1,37 +1,18 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
html-to-markdown (3.
|
|
4
|
+
html-to-markdown (3.2.0)
|
|
5
5
|
rb_sys (>= 0.9, < 1.0)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
8
8
|
remote: https://rubygems.org/
|
|
9
9
|
specs:
|
|
10
|
-
activesupport (8.1.3)
|
|
11
|
-
base64
|
|
12
|
-
bigdecimal
|
|
13
|
-
concurrent-ruby (~> 1.0, >= 1.3.1)
|
|
14
|
-
connection_pool (>= 2.2.5)
|
|
15
|
-
drb
|
|
16
|
-
i18n (>= 1.6, < 2)
|
|
17
|
-
json
|
|
18
|
-
logger (>= 1.4.2)
|
|
19
|
-
minitest (>= 5.1)
|
|
20
|
-
securerandom (>= 0.3)
|
|
21
|
-
tzinfo (~> 2.0, >= 2.0.5)
|
|
22
|
-
uri (>= 0.13.1)
|
|
23
10
|
ast (2.4.3)
|
|
24
|
-
base64 (0.3.0)
|
|
25
|
-
bigdecimal (4.1.0)
|
|
26
11
|
concurrent-ruby (1.3.6)
|
|
27
|
-
connection_pool (3.0.2)
|
|
28
12
|
csv (3.3.5)
|
|
29
13
|
diff-lcs (1.6.2)
|
|
30
|
-
drb (2.2.3)
|
|
31
14
|
ffi (1.17.4-arm64-darwin)
|
|
32
15
|
fileutils (1.8.0)
|
|
33
|
-
i18n (1.14.8)
|
|
34
|
-
concurrent-ruby (~> 1.0)
|
|
35
16
|
json (2.19.3)
|
|
36
17
|
language_server-protocol (3.17.0.5)
|
|
37
18
|
lint_roller (1.1.0)
|
|
@@ -40,31 +21,28 @@ GEM
|
|
|
40
21
|
rb-fsevent (~> 0.10, >= 0.10.3)
|
|
41
22
|
rb-inotify (~> 0.9, >= 0.9.10)
|
|
42
23
|
logger (1.7.0)
|
|
43
|
-
|
|
44
|
-
drb (~> 2.0)
|
|
45
|
-
prism (~> 1.5)
|
|
46
|
-
mutex_m (0.3.0)
|
|
47
|
-
parallel (1.27.0)
|
|
24
|
+
parallel (2.0.1)
|
|
48
25
|
parser (3.3.11.1)
|
|
49
26
|
ast (~> 2.4.1)
|
|
50
27
|
racc
|
|
51
28
|
prism (1.9.0)
|
|
52
29
|
racc (1.8.1)
|
|
53
30
|
rainbow (3.1.1)
|
|
54
|
-
rake (13.
|
|
31
|
+
rake (13.4.1)
|
|
55
32
|
rake-compiler (1.3.1)
|
|
56
33
|
rake
|
|
57
34
|
rake-compiler-dock (1.11.0)
|
|
58
35
|
rb-fsevent (0.11.2)
|
|
59
36
|
rb-inotify (0.11.1)
|
|
60
37
|
ffi (~> 1.0)
|
|
61
|
-
rb_sys (0.9.
|
|
38
|
+
rb_sys (0.9.126)
|
|
62
39
|
json (>= 2)
|
|
63
40
|
rake-compiler-dock (= 1.11.0)
|
|
64
|
-
rbs (
|
|
41
|
+
rbs (4.0.2)
|
|
65
42
|
logger
|
|
43
|
+
prism (>= 1.6.0)
|
|
66
44
|
tsort
|
|
67
|
-
regexp_parser (2.
|
|
45
|
+
regexp_parser (2.12.0)
|
|
68
46
|
rspec (3.13.2)
|
|
69
47
|
rspec-core (~> 3.13.0)
|
|
70
48
|
rspec-expectations (~> 3.13.0)
|
|
@@ -78,11 +56,11 @@ GEM
|
|
|
78
56
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
79
57
|
rspec-support (~> 3.13.0)
|
|
80
58
|
rspec-support (3.13.7)
|
|
81
|
-
rubocop (1.86.
|
|
59
|
+
rubocop (1.86.1)
|
|
82
60
|
json (~> 2.3)
|
|
83
61
|
language_server-protocol (~> 3.17.0.2)
|
|
84
62
|
lint_roller (~> 1.1.0)
|
|
85
|
-
parallel (
|
|
63
|
+
parallel (>= 1.10)
|
|
86
64
|
parser (>= 3.3.0.2)
|
|
87
65
|
rainbow (>= 2.2.2, < 4.0)
|
|
88
66
|
regexp_parser (>= 2.9.3, < 3.0)
|
|
@@ -92,13 +70,16 @@ GEM
|
|
|
92
70
|
rubocop-ast (1.49.1)
|
|
93
71
|
parser (>= 3.3.7.2)
|
|
94
72
|
prism (~> 1.7)
|
|
73
|
+
rubocop-performance (1.26.1)
|
|
74
|
+
lint_roller (~> 1.1)
|
|
75
|
+
rubocop (>= 1.75.0, < 2.0)
|
|
76
|
+
rubocop-ast (>= 1.47.1, < 2.0)
|
|
95
77
|
rubocop-rspec (3.9.0)
|
|
96
78
|
lint_roller (~> 1.1)
|
|
97
79
|
rubocop (~> 1.81)
|
|
98
80
|
ruby-progressbar (1.13.0)
|
|
99
81
|
securerandom (0.4.1)
|
|
100
|
-
steep (
|
|
101
|
-
activesupport (>= 5.1)
|
|
82
|
+
steep (2.0.0)
|
|
102
83
|
concurrent-ruby (>= 1.1.10)
|
|
103
84
|
csv (>= 3.0.9)
|
|
104
85
|
fileutils (>= 1.1.0)
|
|
@@ -106,10 +87,10 @@ GEM
|
|
|
106
87
|
language_server-protocol (>= 3.17.0.4, < 4.0)
|
|
107
88
|
listen (~> 3.0)
|
|
108
89
|
logger (>= 1.3.0)
|
|
109
|
-
|
|
110
|
-
|
|
90
|
+
parser (>= 3.2)
|
|
91
|
+
prism (>= 0.25.0)
|
|
111
92
|
rainbow (>= 2.2.2, < 4.0)
|
|
112
|
-
rbs (~>
|
|
93
|
+
rbs (~> 4.0)
|
|
113
94
|
securerandom (>= 0.1)
|
|
114
95
|
strscan (>= 1.0.0)
|
|
115
96
|
terminal-table (>= 2, < 5)
|
|
@@ -118,8 +99,6 @@ GEM
|
|
|
118
99
|
terminal-table (4.0.0)
|
|
119
100
|
unicode-display_width (>= 1.1.1, < 4)
|
|
120
101
|
tsort (0.2.0)
|
|
121
|
-
tzinfo (2.0.6)
|
|
122
|
-
concurrent-ruby (~> 1.0)
|
|
123
102
|
unicode-display_width (3.2.0)
|
|
124
103
|
unicode-emoji (~> 4.1)
|
|
125
104
|
unicode-emoji (4.2.0)
|
|
@@ -135,58 +114,51 @@ DEPENDENCIES
|
|
|
135
114
|
rbs
|
|
136
115
|
rspec
|
|
137
116
|
rubocop
|
|
117
|
+
rubocop-performance
|
|
138
118
|
rubocop-rspec
|
|
139
119
|
steep
|
|
140
120
|
|
|
141
121
|
CHECKSUMS
|
|
142
|
-
activesupport (8.1.3) sha256=21a5e0dfbd4c3ddd9e1317ec6a4d782fa226e7867dc70b0743acda81a1dca20e
|
|
143
122
|
ast (2.4.3) sha256=954615157c1d6a382bc27d690d973195e79db7f55e9765ac7c481c60bdb4d383
|
|
144
|
-
base64 (0.3.0) sha256=27337aeabad6ffae05c265c450490628ef3ebd4b67be58257393227588f5a97b
|
|
145
|
-
bigdecimal (4.1.0) sha256=6dc07767aa3dc456ccd48e7ae70a07b474e9afd7c5bc576f80bd6da5c8dd6cae
|
|
146
123
|
concurrent-ruby (1.3.6) sha256=6b56837e1e7e5292f9864f34b69c5a2cbc75c0cf5338f1ce9903d10fa762d5ab
|
|
147
|
-
connection_pool (3.0.2) sha256=33fff5ba71a12d2aa26cb72b1db8bba2a1a01823559fb01d29eb74c286e62e0a
|
|
148
124
|
csv (3.3.5) sha256=6e5134ac3383ef728b7f02725d9872934f523cb40b961479f69cf3afa6c8e73f
|
|
149
125
|
diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962
|
|
150
|
-
drb (2.2.3) sha256=0b00d6fdb50995fe4a45dea13663493c841112e4068656854646f418fda13373
|
|
151
126
|
ffi (1.17.4-arm64-darwin) sha256=19071aaf1419251b0a46852abf960e77330a3b334d13a4ab51d58b31a937001b
|
|
152
127
|
fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
|
|
153
|
-
html-to-markdown (3.
|
|
154
|
-
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
128
|
+
html-to-markdown (3.2.0)
|
|
155
129
|
json (2.19.3) sha256=289b0bb53052a1fa8c34ab33cc750b659ba14a5c45f3fcf4b18762dc67c78646
|
|
156
130
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
157
131
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
158
132
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
|
159
133
|
logger (1.7.0) sha256=196edec7cc44b66cfb40f9755ce11b392f21f7967696af15d274dde7edff0203
|
|
160
|
-
|
|
161
|
-
mutex_m (0.3.0) sha256=cfcb04ac16b69c4813777022fdceda24e9f798e48092a2b817eb4c0a782b0751
|
|
162
|
-
parallel (1.27.0) sha256=4ac151e1806b755fb4e2dc2332cbf0e54f2e24ba821ff2d3dcf86bf6dc4ae130
|
|
134
|
+
parallel (2.0.1) sha256=337782d3e39f4121e67563bf91dd8ece67f48923d90698614773a0ec9a5b2c7d
|
|
163
135
|
parser (3.3.11.1) sha256=d17ace7aabe3e72c3cc94043714be27cc6f852f104d81aa284c2281aecc65d54
|
|
164
136
|
prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85
|
|
165
137
|
racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f
|
|
166
138
|
rainbow (3.1.1) sha256=039491aa3a89f42efa1d6dec2fc4e62ede96eb6acd95e52f1ad581182b79bc6a
|
|
167
|
-
rake (13.
|
|
139
|
+
rake (13.4.1) sha256=b4e81bd6a748308a6799619d824ec6a23cd1acd07d9ec41e5f2ebfb2294447c8
|
|
168
140
|
rake-compiler (1.3.1) sha256=6b351612b6e2d73ddd5563ee799bb58685176e05363db6758504bd11573d670a
|
|
169
141
|
rake-compiler-dock (1.11.0) sha256=eab51f2cd533eb35cea6b624a75281f047123e70a64c58b607471bb49428f8c2
|
|
170
142
|
rb-fsevent (0.11.2) sha256=43900b972e7301d6570f64b850a5aa67833ee7d87b458ee92805d56b7318aefe
|
|
171
143
|
rb-inotify (0.11.1) sha256=a0a700441239b0ff18eb65e3866236cd78613d6b9f78fea1f9ac47a85e47be6e
|
|
172
|
-
rb_sys (0.9.
|
|
173
|
-
rbs (
|
|
174
|
-
regexp_parser (2.
|
|
144
|
+
rb_sys (0.9.126) sha256=ba958e0b8b4b89eeae0b3d24b64c809eb2c37e0ab0773a49e9b1c2e22c95aef8
|
|
145
|
+
rbs (4.0.2) sha256=af75671e66cd03434cc546622741ebf83f6197ec4328375805306330bf78ef25
|
|
146
|
+
regexp_parser (2.12.0) sha256=35a916a1d63190ab5c9009457136ae5f3c0c7512d60291d0d1378ba18ce08ebb
|
|
175
147
|
rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
|
|
176
148
|
rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d
|
|
177
149
|
rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
|
|
178
150
|
rspec-mocks (3.13.8) sha256=086ad3d3d17533f4237643de0b5c42f04b66348c28bf6b9c2d3f4a3b01af1d47
|
|
179
151
|
rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c
|
|
180
|
-
rubocop (1.86.
|
|
152
|
+
rubocop (1.86.1) sha256=44415f3f01d01a21e01132248d2fd0867572475b566ca188a0a42133a08d4531
|
|
181
153
|
rubocop-ast (1.49.1) sha256=4412f3ee70f6fe4546cc489548e0f6fcf76cafcfa80fa03af67098ffed755035
|
|
154
|
+
rubocop-performance (1.26.1) sha256=cd19b936ff196df85829d264b522fd4f98b6c89ad271fa52744a8c11b8f71834
|
|
182
155
|
rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
|
|
183
156
|
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
|
|
184
157
|
securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
|
|
185
|
-
steep (
|
|
158
|
+
steep (2.0.0) sha256=6eb0ecc09637bbb54f0a5f2cf63daea6d3208ccace64b4f1107d976333605c30
|
|
186
159
|
strscan (3.1.8) sha256=aae2db611a225559f21ffbb71765c9a4e60fd262534a9ea84f4f11c7f32f679e
|
|
187
160
|
terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
|
|
188
161
|
tsort (0.2.0) sha256=9650a793f6859a43b6641671278f79cfead60ac714148aabe4e3f0060480089f
|
|
189
|
-
tzinfo (2.0.6) sha256=8daf828cc77bcf7d63b0e3bdb6caa47e2272dcfaf4fbfe46f8c3a9df087a829b
|
|
190
162
|
unicode-display_width (3.2.0) sha256=0cdd96b5681a5949cdbc2c55e7b420facae74c4aaf9a9815eee1087cb1853c42
|
|
191
163
|
unicode-emoji (4.2.0) sha256=519e69150f75652e40bf736106cfbc8f0f73aa3fb6a65afe62fefa7f80b0f80f
|
|
192
164
|
uri (1.1.1) sha256=379fa58d27ffb1387eaada68c749d1426738bd0f654d812fcc07e7568f5c57c6
|
data/README.md
CHANGED
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/html-to-markdown?label=Java&color=007ec6" alt="Java">
|
|
19
19
|
</a>
|
|
20
20
|
<a href="https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/v3/htmltomarkdown">
|
|
21
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/html-to-markdown?label=Go&color=007ec6&filter=v3.
|
|
21
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/html-to-markdown?label=Go&color=007ec6&filter=v3.2.0" alt="Go">
|
|
22
22
|
</a>
|
|
23
23
|
<a href="https://www.nuget.org/packages/KreuzbergDev.HtmlToMarkdown/">
|
|
24
24
|
<img src="https://img.shields.io/nuget/v/KreuzbergDev.HtmlToMarkdown?label=C%23&color=007ec6" alt="C#">
|
|
@@ -56,7 +56,6 @@
|
|
|
56
56
|
</a>
|
|
57
57
|
</div>
|
|
58
58
|
|
|
59
|
-
|
|
60
59
|
Blazing-fast HTML to Markdown conversion for Ruby, powered by the same Rust engine used by our Python, Node.js, WebAssembly, and PHP packages.
|
|
61
60
|
Ship identical Markdown across every runtime while enjoying native extension performance with Magnus bindings.
|
|
62
61
|
|
|
@@ -78,10 +77,10 @@ Requires Ruby 3.2+ with Magnus native extension bindings. Published for Linux, m
|
|
|
78
77
|
|
|
79
78
|
## Performance Snapshot
|
|
80
79
|
|
|
81
|
-
Apple M4
|
|
80
|
+
**Apple M4** · `convert()` · Real Wikipedia documents
|
|
82
81
|
|
|
83
82
|
| Document | Size | Latency | Throughput |
|
|
84
|
-
|
|
83
|
+
|----------|------|---------|------------|
|
|
85
84
|
| Lists (Timeline) | 129KB | 0.71ms | 182 MB/s |
|
|
86
85
|
| Tables (Countries) | 360KB | 2.15ms | 167 MB/s |
|
|
87
86
|
| Mixed (Python wiki) | 656KB | 4.89ms | 134 MB/s |
|
|
@@ -102,7 +101,6 @@ markdown = result[:content]
|
|
|
102
101
|
```
|
|
103
102
|
|
|
104
103
|
|
|
105
|
-
|
|
106
104
|
With conversion options:
|
|
107
105
|
|
|
108
106
|
```ruby
|
|
@@ -114,8 +112,6 @@ markdown = result[:content]
|
|
|
114
112
|
```
|
|
115
113
|
|
|
116
114
|
|
|
117
|
-
|
|
118
|
-
|
|
119
115
|
## API Reference
|
|
120
116
|
|
|
121
117
|
### Core Function
|
|
@@ -153,7 +149,6 @@ warnings = result[:warnings] # Any conversion warnings
|
|
|
153
149
|
- `extract_tables`: Enable structured table extraction into `result.tables` — default: `false`
|
|
154
150
|
- `output_format`: Output markup format (`"markdown"` | `"djot"` | `"plain"`) — default: `"markdown"`
|
|
155
151
|
|
|
156
|
-
|
|
157
152
|
## Djot Output Format
|
|
158
153
|
|
|
159
154
|
The library supports converting HTML to [Djot](https://djot.net/), a lightweight markup language similar to Markdown but with a different syntax for some elements. Set `output_format` to `"djot"` to use this format.
|
|
@@ -173,6 +168,7 @@ The library supports converting HTML to [Djot](https://djot.net/), a lightweight
|
|
|
173
168
|
### Example Usage
|
|
174
169
|
|
|
175
170
|
|
|
171
|
+
|
|
176
172
|
```ruby
|
|
177
173
|
require 'html_to_markdown'
|
|
178
174
|
|
|
@@ -188,14 +184,15 @@ djot = HtmlToMarkdown.convert(html, output_format: 'djot')
|
|
|
188
184
|
```
|
|
189
185
|
|
|
190
186
|
|
|
191
|
-
Djot's extended syntax allows you to express more semantic meaning in lightweight text, making it useful for documents that require strikethrough, insertion tracking, or mathematical notation.
|
|
192
187
|
|
|
188
|
+
Djot's extended syntax allows you to express more semantic meaning in lightweight text, making it useful for documents that require strikethrough, insertion tracking, or mathematical notation.
|
|
193
189
|
|
|
194
190
|
## Plain Text Output
|
|
195
191
|
|
|
196
192
|
Set `output_format` to `"plain"` to strip all markup and return only visible text. This bypasses the Markdown conversion pipeline entirely for maximum speed.
|
|
197
193
|
|
|
198
194
|
|
|
195
|
+
|
|
199
196
|
```ruby
|
|
200
197
|
require 'html_to_markdown'
|
|
201
198
|
|
|
@@ -206,6 +203,7 @@ plain = HtmlToMarkdown.convert(html, output_format: 'plain')
|
|
|
206
203
|
```
|
|
207
204
|
|
|
208
205
|
|
|
206
|
+
|
|
209
207
|
Plain text mode is useful for search indexing, text extraction, and feeding content to LLMs.
|
|
210
208
|
|
|
211
209
|
|
|
@@ -227,6 +225,7 @@ The metadata extraction feature enables comprehensive document analysis during c
|
|
|
227
225
|
### Example: Quick Start
|
|
228
226
|
|
|
229
227
|
|
|
228
|
+
|
|
230
229
|
```ruby
|
|
231
230
|
require 'html_to_markdown'
|
|
232
231
|
|
|
@@ -263,6 +262,7 @@ The visitor pattern enables custom HTML→Markdown conversion logic by providing
|
|
|
263
262
|
### Example: Quick Start
|
|
264
263
|
|
|
265
264
|
|
|
265
|
+
|
|
266
266
|
```ruby
|
|
267
267
|
require 'html_to_markdown'
|
|
268
268
|
|
|
@@ -289,7 +289,6 @@ markdown = result[:content]
|
|
|
289
289
|
|
|
290
290
|
|
|
291
291
|
|
|
292
|
-
|
|
293
292
|
## Examples
|
|
294
293
|
|
|
295
294
|
|
data/Rakefile
CHANGED
|
@@ -4,29 +4,23 @@ require 'bundler/gem_tasks'
|
|
|
4
4
|
require 'rake/extensiontask'
|
|
5
5
|
require 'rspec/core/rake_task'
|
|
6
6
|
|
|
7
|
-
GEMSPEC = Gem::Specification.load(File.expand_path('
|
|
8
|
-
|
|
9
|
-
# Vendor html-to-markdown-rs core crate before compilation
|
|
10
|
-
task :vendor do
|
|
11
|
-
vendor_script = File.expand_path('../../scripts/ci/ruby/vendor-core-crate.py', __dir__)
|
|
12
|
-
puts 'Vendoring html-to-markdown-rs core crate...'
|
|
13
|
-
sh "python3 #{vendor_script}"
|
|
14
|
-
end
|
|
7
|
+
GEMSPEC = Gem::Specification.load(File.expand_path('html_to_markdown_rs.gemspec', __dir__))
|
|
15
8
|
|
|
16
9
|
Rake::ExtensionTask.new('html_to_markdown_rb', GEMSPEC) do |ext|
|
|
17
10
|
ext.lib_dir = 'lib'
|
|
18
|
-
ext.ext_dir = 'ext/
|
|
11
|
+
ext.ext_dir = 'ext/html_to_markdown_rb'
|
|
19
12
|
ext.cross_compile = true
|
|
20
13
|
ext.cross_platform = %w[
|
|
21
14
|
x86_64-linux
|
|
15
|
+
aarch64-linux
|
|
22
16
|
x86_64-darwin
|
|
23
17
|
arm64-darwin
|
|
24
18
|
x64-mingw32
|
|
19
|
+
x64-mingw-ucrt
|
|
25
20
|
]
|
|
26
21
|
end
|
|
27
22
|
|
|
28
23
|
RSpec::Core::RakeTask.new(:spec)
|
|
29
24
|
|
|
30
|
-
task compile: :vendor
|
|
31
25
|
task spec: :compile
|
|
32
26
|
task default: :spec
|
data/html-to-markdown-rb.gemspec
CHANGED
|
@@ -87,7 +87,7 @@ Gem::Specification.new do |spec|
|
|
|
87
87
|
spec.files = files
|
|
88
88
|
spec.extra_rdoc_files = ['README.md']
|
|
89
89
|
|
|
90
|
-
spec.extensions = ['ext/
|
|
90
|
+
spec.extensions = ['ext/html_to_markdown_rb/extconf.rb']
|
|
91
91
|
|
|
92
92
|
spec.add_dependency 'rb_sys', '>= 0.9', '< 1.0'
|
|
93
93
|
spec.metadata['rubygems_mfa_required'] = 'true'
|
data/lib/html_to_markdown.rb
CHANGED
|
@@ -3,28 +3,38 @@
|
|
|
3
3
|
require_relative 'html_to_markdown/version'
|
|
4
4
|
require 'html_to_markdown_rb'
|
|
5
5
|
|
|
6
|
+
# High-performance HTML to Markdown conversion.
|
|
7
|
+
#
|
|
8
|
+
# @example Simple conversion
|
|
9
|
+
# HtmlToMarkdown.convert('<h1>Hello</h1>') # => "# Hello\n\n"
|
|
10
|
+
#
|
|
11
|
+
# @example With options
|
|
12
|
+
# HtmlToMarkdown.convert('<h1>Hello</h1>', heading_style: 'atx')
|
|
6
13
|
module HtmlToMarkdown
|
|
7
|
-
|
|
8
|
-
autoload :CLIProxy, 'html_to_markdown/cli_proxy'
|
|
9
|
-
|
|
10
|
-
class << self
|
|
11
|
-
alias native_convert convert
|
|
12
|
-
end
|
|
13
|
-
|
|
14
|
-
module_function
|
|
15
|
-
|
|
16
|
-
# Convert HTML to Markdown, returning a Hash with:
|
|
17
|
-
# - :content [String, nil] the converted Markdown output
|
|
18
|
-
# - :document [nil] document structure (not yet exposed)
|
|
19
|
-
# - :metadata [Hash, nil] extracted HTML metadata
|
|
20
|
-
# - :tables [Array<Hash>] extracted tables with :grid and :markdown
|
|
21
|
-
# - :images [Array<Hash>] extracted inline images
|
|
22
|
-
# - :warnings [Array<Hash>] processing warnings
|
|
14
|
+
# Convert HTML to Markdown.
|
|
23
15
|
#
|
|
24
|
-
# @param html [String] HTML
|
|
25
|
-
# @param options [Hash
|
|
26
|
-
#
|
|
27
|
-
|
|
28
|
-
|
|
16
|
+
# @param html [String] The HTML content to convert.
|
|
17
|
+
# @param options [Hash] Optional conversion options.
|
|
18
|
+
# Supported keys (all optional):
|
|
19
|
+
# - :heading_style - 'atx', 'atx_closed', 'setext', 'underlined'
|
|
20
|
+
# - :code_block_style - 'backticks', 'tildes', 'indented'
|
|
21
|
+
# - :escape_asterisks - Boolean
|
|
22
|
+
# - :escape_underscores - Boolean
|
|
23
|
+
# - :escape_misc - Boolean
|
|
24
|
+
# - :escape_ascii - Boolean
|
|
25
|
+
# - :strip_newlines - Boolean
|
|
26
|
+
# - :keep_inline_images_in - Array of tag names
|
|
27
|
+
# - :strip_tags - Array of tag names to strip
|
|
28
|
+
# - :preserve_tags - Array of tag names to preserve verbatim
|
|
29
|
+
# (and more, matching ConversionOptions fields)
|
|
30
|
+
# @return [String] The converted Markdown content.
|
|
31
|
+
def self.convert(html, options = {})
|
|
32
|
+
opts = if options.nil? || options.empty?
|
|
33
|
+
nil
|
|
34
|
+
else
|
|
35
|
+
HtmlToMarkdownRs::ConversionOptions.new(options)
|
|
36
|
+
end
|
|
37
|
+
result = HtmlToMarkdownRs.convert(html, opts)
|
|
38
|
+
result.content || ''
|
|
29
39
|
end
|
|
30
40
|
end
|
|
Binary file
|
data/sig/html_to_markdown.rbs
CHANGED
|
@@ -1,3 +1,16 @@
|
|
|
1
|
+
# Native extension module (Magnus/rb-sys)
|
|
2
|
+
module HtmlToMarkdownRs
|
|
3
|
+
class ConversionOptions
|
|
4
|
+
def initialize: (Hash[Symbol, untyped]) -> void
|
|
5
|
+
end
|
|
6
|
+
|
|
7
|
+
class ConversionResult
|
|
8
|
+
def content: () -> String?
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
def self.convert: (String html, ConversionOptions? options) -> ConversionResult
|
|
12
|
+
end
|
|
13
|
+
|
|
1
14
|
# Type definitions for HtmlToMarkdown Ruby gem
|
|
2
15
|
module HtmlToMarkdown
|
|
3
16
|
VERSION: String
|
|
@@ -8,6 +21,7 @@ module HtmlToMarkdown
|
|
|
8
21
|
type whitespace_mode = :normalized | :strict
|
|
9
22
|
type newline_style = :spaces | :backslash
|
|
10
23
|
type code_block_style = :indented | :backticks | :tildes
|
|
24
|
+
type link_style = :inline | :reference
|
|
11
25
|
type output_format = :markdown | :djot
|
|
12
26
|
type preprocessing_preset = :minimal | :standard | :aggressive
|
|
13
27
|
|
|
@@ -49,6 +63,7 @@ module HtmlToMarkdown
|
|
|
49
63
|
debug?: bool,
|
|
50
64
|
strip_tags?: Array[String],
|
|
51
65
|
preserve_tags?: Array[String],
|
|
66
|
+
link_style?: link_style,
|
|
52
67
|
output_format?: output_format,
|
|
53
68
|
skip_images?: bool,
|
|
54
69
|
include_document_structure?: bool,
|
|
@@ -126,12 +141,9 @@ module HtmlToMarkdown
|
|
|
126
141
|
|
|
127
142
|
public
|
|
128
143
|
|
|
129
|
-
# Convert HTML to Markdown, returning
|
|
144
|
+
# Convert HTML to Markdown, returning the markdown content string.
|
|
130
145
|
#
|
|
131
146
|
# Example:
|
|
132
147
|
# result = HtmlToMarkdown.convert(html)
|
|
133
|
-
def self.convert: (String html, ?conversion_options options) ->
|
|
134
|
-
|
|
135
|
-
# Instance method version (created by module_function)
|
|
136
|
-
def convert: (String html, ?conversion_options options) -> Hash[String, untyped]
|
|
148
|
+
def self.convert: (String html, ?conversion_options options) -> String
|
|
137
149
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html-to-markdown
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 3.1.0
|
|
4
|
+
version: 3.2.0
|
|
5
5
|
platform: arm64-darwin
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-04-
|
|
11
|
+
date: 2026-04-16 00:00:00.000000000 Z
|
|
12
12
|
dependencies: []
|
|
13
13
|
description: |-
|
|
14
14
|
html-to-markdown is a native Ruby extension built on the shared Rust engine that powers the html-to-markdown project.
|
|
@@ -32,16 +32,13 @@ files:
|
|
|
32
32
|
- exe/html-to-markdown
|
|
33
33
|
- html-to-markdown-rb.gemspec
|
|
34
34
|
- lib/html_to_markdown.rb
|
|
35
|
-
- lib/html_to_markdown/cli.rb
|
|
36
|
-
- lib/html_to_markdown/cli_proxy.rb
|
|
37
35
|
- lib/html_to_markdown/version.rb
|
|
38
36
|
- lib/html_to_markdown_rb.bundle
|
|
37
|
+
- lib/html_to_markdown_rs.rb
|
|
39
38
|
- sig/html_to_markdown.rbs
|
|
40
39
|
- sig/html_to_markdown/cli.rbs
|
|
41
40
|
- sig/html_to_markdown/cli_proxy.rbs
|
|
42
41
|
- sig/open3.rbs
|
|
43
|
-
- spec/cli_proxy_spec.rb
|
|
44
|
-
- spec/spec_helper.rb
|
|
45
42
|
homepage: https://github.com/kreuzberg-dev/html-to-markdown
|
|
46
43
|
licenses:
|
|
47
44
|
- MIT
|
data/lib/html_to_markdown/cli.rb
DELETED
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'html_to_markdown/cli_proxy'
|
|
4
|
-
|
|
5
|
-
module HtmlToMarkdown
|
|
6
|
-
module CLI
|
|
7
|
-
module_function
|
|
8
|
-
|
|
9
|
-
def run(argv = ARGV, stdout: $stdout, stderr: $stderr)
|
|
10
|
-
output = CLIProxy.call(argv)
|
|
11
|
-
stdout.print(output)
|
|
12
|
-
0
|
|
13
|
-
rescue CLIProxy::CLIExecutionError => e
|
|
14
|
-
stderr.print(e.stderr)
|
|
15
|
-
e.status || 1
|
|
16
|
-
rescue CLIProxy::MissingBinaryError, CLIProxy::Error => e
|
|
17
|
-
stderr.puts(e.message)
|
|
18
|
-
1
|
|
19
|
-
end
|
|
20
|
-
end
|
|
21
|
-
end
|
|
@@ -1,74 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'open3'
|
|
4
|
-
require 'pathname'
|
|
5
|
-
|
|
6
|
-
module HtmlToMarkdown
|
|
7
|
-
module CLIProxy
|
|
8
|
-
class Error < StandardError
|
|
9
|
-
end
|
|
10
|
-
|
|
11
|
-
class MissingBinaryError < Error
|
|
12
|
-
end
|
|
13
|
-
|
|
14
|
-
class CLIExecutionError < Error
|
|
15
|
-
attr_reader :stderr, :status
|
|
16
|
-
|
|
17
|
-
def initialize(message, stderr:, status:)
|
|
18
|
-
super(message)
|
|
19
|
-
@stderr = stderr
|
|
20
|
-
@status = status
|
|
21
|
-
end
|
|
22
|
-
end
|
|
23
|
-
|
|
24
|
-
module_function
|
|
25
|
-
|
|
26
|
-
def call(argv)
|
|
27
|
-
binary = find_cli_binary
|
|
28
|
-
args = Array(argv).map(&:to_s)
|
|
29
|
-
stdout, stderr, status = Open3.capture3(binary.to_s, *args)
|
|
30
|
-
return stdout if status.success?
|
|
31
|
-
|
|
32
|
-
raise CLIExecutionError.new(
|
|
33
|
-
"html-to-markdown CLI exited with status #{status.exitstatus}",
|
|
34
|
-
stderr: stderr,
|
|
35
|
-
status: status.exitstatus
|
|
36
|
-
)
|
|
37
|
-
end
|
|
38
|
-
|
|
39
|
-
def find_cli_binary
|
|
40
|
-
binary_name = Gem.win_platform? ? 'html-to-markdown.exe' : 'html-to-markdown'
|
|
41
|
-
found = search_paths(binary_name).find(&:file?)
|
|
42
|
-
return found if found
|
|
43
|
-
|
|
44
|
-
raise MissingBinaryError, missing_binary_message
|
|
45
|
-
end
|
|
46
|
-
|
|
47
|
-
def root_path
|
|
48
|
-
@root_path ||= Pathname(__dir__.to_s).join('../..').expand_path
|
|
49
|
-
end
|
|
50
|
-
|
|
51
|
-
def lib_path
|
|
52
|
-
@lib_path ||= Pathname(__dir__.to_s).join('..').expand_path
|
|
53
|
-
end
|
|
54
|
-
|
|
55
|
-
def search_paths(binary_name)
|
|
56
|
-
paths = [
|
|
57
|
-
root_path.join('target', 'release', binary_name),
|
|
58
|
-
lib_path.join('bin', binary_name),
|
|
59
|
-
lib_path.join(binary_name)
|
|
60
|
-
]
|
|
61
|
-
|
|
62
|
-
workspace_root = root_path.parent&.parent
|
|
63
|
-
paths << workspace_root.join('target', 'release', binary_name) if workspace_root
|
|
64
|
-
paths
|
|
65
|
-
end
|
|
66
|
-
|
|
67
|
-
def missing_binary_message
|
|
68
|
-
<<~MSG.strip
|
|
69
|
-
html-to-markdown CLI binary not found. Build it with
|
|
70
|
-
`cargo build --release --package html-to-markdown-cli`.
|
|
71
|
-
MSG
|
|
72
|
-
end
|
|
73
|
-
end
|
|
74
|
-
end
|
data/spec/cli_proxy_spec.rb
DELETED
|
@@ -1,42 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'spec_helper'
|
|
4
|
-
require 'html_to_markdown/cli_proxy'
|
|
5
|
-
require 'html_to_markdown/cli'
|
|
6
|
-
require 'stringio'
|
|
7
|
-
|
|
8
|
-
RSpec.describe HtmlToMarkdown::CLIProxy do
|
|
9
|
-
describe '.call' do
|
|
10
|
-
it 'executes the CLI binary' do
|
|
11
|
-
begin
|
|
12
|
-
binary = described_class.find_cli_binary
|
|
13
|
-
rescue HtmlToMarkdown::CLIProxy::MissingBinaryError
|
|
14
|
-
skip 'CLI binary not built'
|
|
15
|
-
end
|
|
16
|
-
|
|
17
|
-
expect(binary).to be_file
|
|
18
|
-
|
|
19
|
-
output = described_class.call(['--version'])
|
|
20
|
-
expect(output).to include(HtmlToMarkdown::VERSION)
|
|
21
|
-
end
|
|
22
|
-
end
|
|
23
|
-
|
|
24
|
-
describe HtmlToMarkdown::CLI do
|
|
25
|
-
it 'writes CLI output to stdout' do
|
|
26
|
-
begin
|
|
27
|
-
HtmlToMarkdown::CLIProxy.find_cli_binary
|
|
28
|
-
rescue HtmlToMarkdown::CLIProxy::MissingBinaryError
|
|
29
|
-
skip 'CLI binary not built'
|
|
30
|
-
end
|
|
31
|
-
|
|
32
|
-
stdout = StringIO.new
|
|
33
|
-
stderr = StringIO.new
|
|
34
|
-
|
|
35
|
-
exit_code = described_class.run(['--version'], stdout: stdout, stderr: stderr)
|
|
36
|
-
|
|
37
|
-
expect(exit_code).to eq(0)
|
|
38
|
-
expect(stdout.string).to include(HtmlToMarkdown::VERSION)
|
|
39
|
-
expect(stderr.string).to be_empty
|
|
40
|
-
end
|
|
41
|
-
end
|
|
42
|
-
end
|