html-to-markdown 3.1.0-x86_64-linux → 3.2.1-x86_64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +42 -12
- data/Gemfile +1 -0
- data/Gemfile.lock +27 -55
- data/README.md +9 -10
- data/Rakefile +4 -10
- data/html-to-markdown-rb.gemspec +1 -1
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +31 -21
- data/lib/html_to_markdown_rb.so +0 -0
- data/lib/html_to_markdown_rs.rb +3 -0
- data/sig/html_to_markdown.rbs +17 -5
- metadata +3 -6
- data/lib/html_to_markdown/cli.rb +0 -21
- data/lib/html_to_markdown/cli_proxy.rb +0 -74
- data/spec/cli_proxy_spec.rb +0 -42
- data/spec/spec_helper.rb +0 -10
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: fabc0f62f4b2e3c874f86e6e0720a26bd38a9dafc32035ac027f3be7941b6b3a
|
|
4
|
+
data.tar.gz: ea5ec1a33a97380f985b6644d50262bba66b944471d77596e67005d337de15f2
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: ecfd68be5475850f405803d6dca4ac7eaf5dac0f26f04d2f94a871ac054497c1aab5feaf9a10efa68dce6a85238d054154af266ba0f59b36e0d41b9e9786df80
|
|
7
|
+
data.tar.gz: fe1341fd1c451a728a57fd1398947a52a86bcbbf60aee94ae2df7b80e624b138834eafd121639adf8a11e81c250e844a6524bac388e60a8c52ee866883b96afb
|
data/.rubocop.yml
CHANGED
|
@@ -1,29 +1,59 @@
|
|
|
1
1
|
plugins:
|
|
2
|
+
- rubocop-performance
|
|
2
3
|
- rubocop-rspec
|
|
3
4
|
|
|
4
5
|
AllCops:
|
|
5
|
-
NewCops: enable
|
|
6
6
|
TargetRubyVersion: 3.2
|
|
7
|
+
NewCops: enable
|
|
8
|
+
SuggestExtensions: false
|
|
7
9
|
Exclude:
|
|
8
|
-
-
|
|
9
|
-
-
|
|
10
|
+
- 'vendor/**/*'
|
|
11
|
+
- 'tmp/**/*'
|
|
12
|
+
- 'lib/**/*.bundle'
|
|
13
|
+
- 'ext/**/*'
|
|
14
|
+
|
|
15
|
+
Style/FrozenStringLiteralComment:
|
|
16
|
+
Enabled: true
|
|
17
|
+
EnforcedStyle: always
|
|
18
|
+
|
|
19
|
+
Style/StringLiterals:
|
|
20
|
+
Enabled: true
|
|
21
|
+
EnforcedStyle: single_quotes
|
|
22
|
+
|
|
23
|
+
Style/StringLiteralsInInterpolation:
|
|
24
|
+
Enabled: true
|
|
25
|
+
EnforcedStyle: single_quotes
|
|
10
26
|
|
|
11
27
|
Style/Documentation:
|
|
12
28
|
Enabled: false
|
|
13
29
|
|
|
14
|
-
|
|
30
|
+
Layout/LineLength:
|
|
31
|
+
Max: 120
|
|
32
|
+
AllowedPatterns:
|
|
33
|
+
- '\A\s*#'
|
|
15
34
|
Exclude:
|
|
16
|
-
-
|
|
17
|
-
- "*.gemspec"
|
|
35
|
+
- 'spec/**/*'
|
|
18
36
|
|
|
19
37
|
Metrics/MethodLength:
|
|
20
|
-
Max:
|
|
38
|
+
Max: 20
|
|
39
|
+
Exclude:
|
|
40
|
+
- 'spec/**/*'
|
|
21
41
|
|
|
22
|
-
|
|
23
|
-
Enabled:
|
|
42
|
+
Metrics/BlockLength:
|
|
43
|
+
Enabled: true
|
|
44
|
+
Max: 350
|
|
45
|
+
CountComments: false
|
|
46
|
+
|
|
47
|
+
Metrics/AbcSize:
|
|
48
|
+
Max: 20
|
|
49
|
+
Exclude:
|
|
50
|
+
- 'spec/**/*'
|
|
24
51
|
|
|
25
52
|
RSpec/ExampleLength:
|
|
26
|
-
|
|
53
|
+
Max: 50
|
|
27
54
|
|
|
28
|
-
RSpec/
|
|
29
|
-
|
|
55
|
+
RSpec/MultipleExpectations:
|
|
56
|
+
Max: 25
|
|
57
|
+
|
|
58
|
+
RSpec/NestedGroups:
|
|
59
|
+
Max: 6
|
data/Gemfile
CHANGED
|
@@ -12,6 +12,7 @@ group :development, :test do
|
|
|
12
12
|
gem 'rb_sys' # provides build tooling when developing locally
|
|
13
13
|
gem 'rspec'
|
|
14
14
|
gem 'rubocop', require: false
|
|
15
|
+
gem 'rubocop-performance', require: false
|
|
15
16
|
gem 'rubocop-rspec', require: false
|
|
16
17
|
gem 'steep', require: false
|
|
17
18
|
end
|
data/Gemfile.lock
CHANGED
|
@@ -1,38 +1,19 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
html-to-markdown (3.1.0)
|
|
4
|
+
html-to-markdown (3.2.1)
|
|
5
5
|
rb_sys (>= 0.9, < 1.0)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
8
8
|
remote: https://rubygems.org/
|
|
9
9
|
specs:
|
|
10
|
-
activesupport (8.1.3)
|
|
11
|
-
base64
|
|
12
|
-
bigdecimal
|
|
13
|
-
concurrent-ruby (~> 1.0, >= 1.3.1)
|
|
14
|
-
connection_pool (>= 2.2.5)
|
|
15
|
-
drb
|
|
16
|
-
i18n (>= 1.6, < 2)
|
|
17
|
-
json
|
|
18
|
-
logger (>= 1.4.2)
|
|
19
|
-
minitest (>= 5.1)
|
|
20
|
-
securerandom (>= 0.3)
|
|
21
|
-
tzinfo (~> 2.0, >= 2.0.5)
|
|
22
|
-
uri (>= 0.13.1)
|
|
23
10
|
ast (2.4.3)
|
|
24
|
-
base64 (0.3.0)
|
|
25
|
-
bigdecimal (4.1.0)
|
|
26
11
|
concurrent-ruby (1.3.6)
|
|
27
|
-
connection_pool (3.0.2)
|
|
28
12
|
csv (3.3.5)
|
|
29
13
|
diff-lcs (1.6.2)
|
|
30
|
-
drb (2.2.3)
|
|
31
14
|
ffi (1.17.4-arm64-darwin)
|
|
32
15
|
ffi (1.17.4-x86_64-linux-gnu)
|
|
33
16
|
fileutils (1.8.0)
|
|
34
|
-
i18n (1.14.8)
|
|
35
|
-
concurrent-ruby (~> 1.0)
|
|
36
17
|
json (2.19.3)
|
|
37
18
|
language_server-protocol (3.17.0.5)
|
|
38
19
|
lint_roller (1.1.0)
|
|
@@ -41,31 +22,28 @@ GEM
|
|
|
41
22
|
rb-fsevent (~> 0.10, >= 0.10.3)
|
|
42
23
|
rb-inotify (~> 0.9, >= 0.9.10)
|
|
43
24
|
logger (1.7.0)
|
|
44
|
-
|
|
45
|
-
drb (~> 2.0)
|
|
46
|
-
prism (~> 1.5)
|
|
47
|
-
mutex_m (0.3.0)
|
|
48
|
-
parallel (1.27.0)
|
|
25
|
+
parallel (2.0.1)
|
|
49
26
|
parser (3.3.11.1)
|
|
50
27
|
ast (~> 2.4.1)
|
|
51
28
|
racc
|
|
52
29
|
prism (1.9.0)
|
|
53
30
|
racc (1.8.1)
|
|
54
31
|
rainbow (3.1.1)
|
|
55
|
-
rake (13.
|
|
32
|
+
rake (13.4.2)
|
|
56
33
|
rake-compiler (1.3.1)
|
|
57
34
|
rake
|
|
58
35
|
rake-compiler-dock (1.11.0)
|
|
59
36
|
rb-fsevent (0.11.2)
|
|
60
37
|
rb-inotify (0.11.1)
|
|
61
38
|
ffi (~> 1.0)
|
|
62
|
-
rb_sys (0.9.
|
|
39
|
+
rb_sys (0.9.126)
|
|
63
40
|
json (>= 2)
|
|
64
41
|
rake-compiler-dock (= 1.11.0)
|
|
65
|
-
rbs (
|
|
42
|
+
rbs (4.0.2)
|
|
66
43
|
logger
|
|
44
|
+
prism (>= 1.6.0)
|
|
67
45
|
tsort
|
|
68
|
-
regexp_parser (2.
|
|
46
|
+
regexp_parser (2.12.0)
|
|
69
47
|
rspec (3.13.2)
|
|
70
48
|
rspec-core (~> 3.13.0)
|
|
71
49
|
rspec-expectations (~> 3.13.0)
|
|
@@ -79,11 +57,11 @@ GEM
|
|
|
79
57
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
80
58
|
rspec-support (~> 3.13.0)
|
|
81
59
|
rspec-support (3.13.7)
|
|
82
|
-
rubocop (1.86.
|
|
60
|
+
rubocop (1.86.1)
|
|
83
61
|
json (~> 2.3)
|
|
84
62
|
language_server-protocol (~> 3.17.0.2)
|
|
85
63
|
lint_roller (~> 1.1.0)
|
|
86
|
-
parallel (
|
|
64
|
+
parallel (>= 1.10)
|
|
87
65
|
parser (>= 3.3.0.2)
|
|
88
66
|
rainbow (>= 2.2.2, < 4.0)
|
|
89
67
|
regexp_parser (>= 2.9.3, < 3.0)
|
|
@@ -93,13 +71,16 @@ GEM
|
|
|
93
71
|
rubocop-ast (1.49.1)
|
|
94
72
|
parser (>= 3.3.7.2)
|
|
95
73
|
prism (~> 1.7)
|
|
74
|
+
rubocop-performance (1.26.1)
|
|
75
|
+
lint_roller (~> 1.1)
|
|
76
|
+
rubocop (>= 1.75.0, < 2.0)
|
|
77
|
+
rubocop-ast (>= 1.47.1, < 2.0)
|
|
96
78
|
rubocop-rspec (3.9.0)
|
|
97
79
|
lint_roller (~> 1.1)
|
|
98
80
|
rubocop (~> 1.81)
|
|
99
81
|
ruby-progressbar (1.13.0)
|
|
100
82
|
securerandom (0.4.1)
|
|
101
|
-
steep (
|
|
102
|
-
activesupport (>= 5.1)
|
|
83
|
+
steep (2.0.0)
|
|
103
84
|
concurrent-ruby (>= 1.1.10)
|
|
104
85
|
csv (>= 3.0.9)
|
|
105
86
|
fileutils (>= 1.1.0)
|
|
@@ -107,10 +88,10 @@ GEM
|
|
|
107
88
|
language_server-protocol (>= 3.17.0.4, < 4.0)
|
|
108
89
|
listen (~> 3.0)
|
|
109
90
|
logger (>= 1.3.0)
|
|
110
|
-
|
|
111
|
-
|
|
91
|
+
parser (>= 3.2)
|
|
92
|
+
prism (>= 0.25.0)
|
|
112
93
|
rainbow (>= 2.2.2, < 4.0)
|
|
113
|
-
rbs (~>
|
|
94
|
+
rbs (~> 4.0)
|
|
114
95
|
securerandom (>= 0.1)
|
|
115
96
|
strscan (>= 1.0.0)
|
|
116
97
|
terminal-table (>= 2, < 5)
|
|
@@ -119,8 +100,6 @@ GEM
|
|
|
119
100
|
terminal-table (4.0.0)
|
|
120
101
|
unicode-display_width (>= 1.1.1, < 4)
|
|
121
102
|
tsort (0.2.0)
|
|
122
|
-
tzinfo (2.0.6)
|
|
123
|
-
concurrent-ruby (~> 1.0)
|
|
124
103
|
unicode-display_width (3.2.0)
|
|
125
104
|
unicode-emoji (~> 4.1)
|
|
126
105
|
unicode-emoji (4.2.0)
|
|
@@ -137,59 +116,52 @@ DEPENDENCIES
|
|
|
137
116
|
rbs
|
|
138
117
|
rspec
|
|
139
118
|
rubocop
|
|
119
|
+
rubocop-performance
|
|
140
120
|
rubocop-rspec
|
|
141
121
|
steep
|
|
142
122
|
|
|
143
123
|
CHECKSUMS
|
|
144
|
-
activesupport (8.1.3) sha256=21a5e0dfbd4c3ddd9e1317ec6a4d782fa226e7867dc70b0743acda81a1dca20e
|
|
145
124
|
ast (2.4.3) sha256=954615157c1d6a382bc27d690d973195e79db7f55e9765ac7c481c60bdb4d383
|
|
146
|
-
base64 (0.3.0) sha256=27337aeabad6ffae05c265c450490628ef3ebd4b67be58257393227588f5a97b
|
|
147
|
-
bigdecimal (4.1.0) sha256=6dc07767aa3dc456ccd48e7ae70a07b474e9afd7c5bc576f80bd6da5c8dd6cae
|
|
148
125
|
concurrent-ruby (1.3.6) sha256=6b56837e1e7e5292f9864f34b69c5a2cbc75c0cf5338f1ce9903d10fa762d5ab
|
|
149
|
-
connection_pool (3.0.2) sha256=33fff5ba71a12d2aa26cb72b1db8bba2a1a01823559fb01d29eb74c286e62e0a
|
|
150
126
|
csv (3.3.5) sha256=6e5134ac3383ef728b7f02725d9872934f523cb40b961479f69cf3afa6c8e73f
|
|
151
127
|
diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962
|
|
152
|
-
drb (2.2.3) sha256=0b00d6fdb50995fe4a45dea13663493c841112e4068656854646f418fda13373
|
|
153
128
|
ffi (1.17.4-arm64-darwin) sha256=19071aaf1419251b0a46852abf960e77330a3b334d13a4ab51d58b31a937001b
|
|
154
129
|
ffi (1.17.4-x86_64-linux-gnu) sha256=9d3db14c2eae074b382fa9c083fe95aec6e0a1451da249eab096c34002bc752d
|
|
155
130
|
fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
|
|
156
|
-
html-to-markdown (3.1.0)
|
|
157
|
-
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
131
|
+
html-to-markdown (3.2.1)
|
|
158
132
|
json (2.19.3) sha256=289b0bb53052a1fa8c34ab33cc750b659ba14a5c45f3fcf4b18762dc67c78646
|
|
159
133
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
160
134
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
161
135
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
|
162
136
|
logger (1.7.0) sha256=196edec7cc44b66cfb40f9755ce11b392f21f7967696af15d274dde7edff0203
|
|
163
|
-
|
|
164
|
-
mutex_m (0.3.0) sha256=cfcb04ac16b69c4813777022fdceda24e9f798e48092a2b817eb4c0a782b0751
|
|
165
|
-
parallel (1.27.0) sha256=4ac151e1806b755fb4e2dc2332cbf0e54f2e24ba821ff2d3dcf86bf6dc4ae130
|
|
137
|
+
parallel (2.0.1) sha256=337782d3e39f4121e67563bf91dd8ece67f48923d90698614773a0ec9a5b2c7d
|
|
166
138
|
parser (3.3.11.1) sha256=d17ace7aabe3e72c3cc94043714be27cc6f852f104d81aa284c2281aecc65d54
|
|
167
139
|
prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85
|
|
168
140
|
racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f
|
|
169
141
|
rainbow (3.1.1) sha256=039491aa3a89f42efa1d6dec2fc4e62ede96eb6acd95e52f1ad581182b79bc6a
|
|
170
|
-
rake (13.
|
|
142
|
+
rake (13.4.2) sha256=cb825b2bd5f1f8e91ca37bddb4b9aaf345551b4731da62949be002fa89283701
|
|
171
143
|
rake-compiler (1.3.1) sha256=6b351612b6e2d73ddd5563ee799bb58685176e05363db6758504bd11573d670a
|
|
172
144
|
rake-compiler-dock (1.11.0) sha256=eab51f2cd533eb35cea6b624a75281f047123e70a64c58b607471bb49428f8c2
|
|
173
145
|
rb-fsevent (0.11.2) sha256=43900b972e7301d6570f64b850a5aa67833ee7d87b458ee92805d56b7318aefe
|
|
174
146
|
rb-inotify (0.11.1) sha256=a0a700441239b0ff18eb65e3866236cd78613d6b9f78fea1f9ac47a85e47be6e
|
|
175
|
-
rb_sys (0.9.
|
|
176
|
-
rbs (
|
|
177
|
-
regexp_parser (2.
|
|
147
|
+
rb_sys (0.9.126) sha256=ba958e0b8b4b89eeae0b3d24b64c809eb2c37e0ab0773a49e9b1c2e22c95aef8
|
|
148
|
+
rbs (4.0.2) sha256=af75671e66cd03434cc546622741ebf83f6197ec4328375805306330bf78ef25
|
|
149
|
+
regexp_parser (2.12.0) sha256=35a916a1d63190ab5c9009457136ae5f3c0c7512d60291d0d1378ba18ce08ebb
|
|
178
150
|
rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
|
|
179
151
|
rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d
|
|
180
152
|
rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
|
|
181
153
|
rspec-mocks (3.13.8) sha256=086ad3d3d17533f4237643de0b5c42f04b66348c28bf6b9c2d3f4a3b01af1d47
|
|
182
154
|
rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c
|
|
183
|
-
rubocop (1.86.
|
|
155
|
+
rubocop (1.86.1) sha256=44415f3f01d01a21e01132248d2fd0867572475b566ca188a0a42133a08d4531
|
|
184
156
|
rubocop-ast (1.49.1) sha256=4412f3ee70f6fe4546cc489548e0f6fcf76cafcfa80fa03af67098ffed755035
|
|
157
|
+
rubocop-performance (1.26.1) sha256=cd19b936ff196df85829d264b522fd4f98b6c89ad271fa52744a8c11b8f71834
|
|
185
158
|
rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
|
|
186
159
|
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
|
|
187
160
|
securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
|
|
188
|
-
steep (
|
|
161
|
+
steep (2.0.0) sha256=6eb0ecc09637bbb54f0a5f2cf63daea6d3208ccace64b4f1107d976333605c30
|
|
189
162
|
strscan (3.1.8) sha256=aae2db611a225559f21ffbb71765c9a4e60fd262534a9ea84f4f11c7f32f679e
|
|
190
163
|
terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
|
|
191
164
|
tsort (0.2.0) sha256=9650a793f6859a43b6641671278f79cfead60ac714148aabe4e3f0060480089f
|
|
192
|
-
tzinfo (2.0.6) sha256=8daf828cc77bcf7d63b0e3bdb6caa47e2272dcfaf4fbfe46f8c3a9df087a829b
|
|
193
165
|
unicode-display_width (3.2.0) sha256=0cdd96b5681a5949cdbc2c55e7b420facae74c4aaf9a9815eee1087cb1853c42
|
|
194
166
|
unicode-emoji (4.2.0) sha256=519e69150f75652e40bf736106cfbc8f0f73aa3fb6a65afe62fefa7f80b0f80f
|
|
195
167
|
uri (1.1.1) sha256=379fa58d27ffb1387eaada68c749d1426738bd0f654d812fcc07e7568f5c57c6
|
data/README.md
CHANGED
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/html-to-markdown?label=Java&color=007ec6" alt="Java">
|
|
19
19
|
</a>
|
|
20
20
|
<a href="https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/v3/htmltomarkdown">
|
|
21
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/html-to-markdown?label=Go&color=007ec6&filter=v3.
|
|
21
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/html-to-markdown?label=Go&color=007ec6&filter=v3.2.0" alt="Go">
|
|
22
22
|
</a>
|
|
23
23
|
<a href="https://www.nuget.org/packages/KreuzbergDev.HtmlToMarkdown/">
|
|
24
24
|
<img src="https://img.shields.io/nuget/v/KreuzbergDev.HtmlToMarkdown?label=C%23&color=007ec6" alt="C#">
|
|
@@ -56,7 +56,6 @@
|
|
|
56
56
|
</a>
|
|
57
57
|
</div>
|
|
58
58
|
|
|
59
|
-
|
|
60
59
|
Blazing-fast HTML to Markdown conversion for Ruby, powered by the same Rust engine used by our Python, Node.js, WebAssembly, and PHP packages.
|
|
61
60
|
Ship identical Markdown across every runtime while enjoying native extension performance with Magnus bindings.
|
|
62
61
|
|
|
@@ -78,10 +77,10 @@ Requires Ruby 3.2+ with Magnus native extension bindings. Published for Linux, m
|
|
|
78
77
|
|
|
79
78
|
## Performance Snapshot
|
|
80
79
|
|
|
81
|
-
Apple M4
|
|
80
|
+
**Apple M4** · `convert()` · Real Wikipedia documents
|
|
82
81
|
|
|
83
82
|
| Document | Size | Latency | Throughput |
|
|
84
|
-
|
|
83
|
+
|----------|------|---------|------------|
|
|
85
84
|
| Lists (Timeline) | 129KB | 0.71ms | 182 MB/s |
|
|
86
85
|
| Tables (Countries) | 360KB | 2.15ms | 167 MB/s |
|
|
87
86
|
| Mixed (Python wiki) | 656KB | 4.89ms | 134 MB/s |
|
|
@@ -102,7 +101,6 @@ markdown = result[:content]
|
|
|
102
101
|
```
|
|
103
102
|
|
|
104
103
|
|
|
105
|
-
|
|
106
104
|
With conversion options:
|
|
107
105
|
|
|
108
106
|
```ruby
|
|
@@ -114,8 +112,6 @@ markdown = result[:content]
|
|
|
114
112
|
```
|
|
115
113
|
|
|
116
114
|
|
|
117
|
-
|
|
118
|
-
|
|
119
115
|
## API Reference
|
|
120
116
|
|
|
121
117
|
### Core Function
|
|
@@ -153,7 +149,6 @@ warnings = result[:warnings] # Any conversion warnings
|
|
|
153
149
|
- `extract_tables`: Enable structured table extraction into `result.tables` — default: `false`
|
|
154
150
|
- `output_format`: Output markup format (`"markdown"` | `"djot"` | `"plain"`) — default: `"markdown"`
|
|
155
151
|
|
|
156
|
-
|
|
157
152
|
## Djot Output Format
|
|
158
153
|
|
|
159
154
|
The library supports converting HTML to [Djot](https://djot.net/), a lightweight markup language similar to Markdown but with a different syntax for some elements. Set `output_format` to `"djot"` to use this format.
|
|
@@ -173,6 +168,7 @@ The library supports converting HTML to [Djot](https://djot.net/), a lightweight
|
|
|
173
168
|
### Example Usage
|
|
174
169
|
|
|
175
170
|
|
|
171
|
+
|
|
176
172
|
```ruby
|
|
177
173
|
require 'html_to_markdown'
|
|
178
174
|
|
|
@@ -188,14 +184,15 @@ djot = HtmlToMarkdown.convert(html, output_format: 'djot')
|
|
|
188
184
|
```
|
|
189
185
|
|
|
190
186
|
|
|
191
|
-
Djot's extended syntax allows you to express more semantic meaning in lightweight text, making it useful for documents that require strikethrough, insertion tracking, or mathematical notation.
|
|
192
187
|
|
|
188
|
+
Djot's extended syntax allows you to express more semantic meaning in lightweight text, making it useful for documents that require strikethrough, insertion tracking, or mathematical notation.
|
|
193
189
|
|
|
194
190
|
## Plain Text Output
|
|
195
191
|
|
|
196
192
|
Set `output_format` to `"plain"` to strip all markup and return only visible text. This bypasses the Markdown conversion pipeline entirely for maximum speed.
|
|
197
193
|
|
|
198
194
|
|
|
195
|
+
|
|
199
196
|
```ruby
|
|
200
197
|
require 'html_to_markdown'
|
|
201
198
|
|
|
@@ -206,6 +203,7 @@ plain = HtmlToMarkdown.convert(html, output_format: 'plain')
|
|
|
206
203
|
```
|
|
207
204
|
|
|
208
205
|
|
|
206
|
+
|
|
209
207
|
Plain text mode is useful for search indexing, text extraction, and feeding content to LLMs.
|
|
210
208
|
|
|
211
209
|
|
|
@@ -227,6 +225,7 @@ The metadata extraction feature enables comprehensive document analysis during c
|
|
|
227
225
|
### Example: Quick Start
|
|
228
226
|
|
|
229
227
|
|
|
228
|
+
|
|
230
229
|
```ruby
|
|
231
230
|
require 'html_to_markdown'
|
|
232
231
|
|
|
@@ -263,6 +262,7 @@ The visitor pattern enables custom HTML→Markdown conversion logic by providing
|
|
|
263
262
|
### Example: Quick Start
|
|
264
263
|
|
|
265
264
|
|
|
265
|
+
|
|
266
266
|
```ruby
|
|
267
267
|
require 'html_to_markdown'
|
|
268
268
|
|
|
@@ -289,7 +289,6 @@ markdown = result[:content]
|
|
|
289
289
|
|
|
290
290
|
|
|
291
291
|
|
|
292
|
-
|
|
293
292
|
## Examples
|
|
294
293
|
|
|
295
294
|
|
data/Rakefile
CHANGED
|
@@ -4,29 +4,23 @@ require 'bundler/gem_tasks'
|
|
|
4
4
|
require 'rake/extensiontask'
|
|
5
5
|
require 'rspec/core/rake_task'
|
|
6
6
|
|
|
7
|
-
GEMSPEC = Gem::Specification.load(File.expand_path('
|
|
8
|
-
|
|
9
|
-
# Vendor html-to-markdown-rs core crate before compilation
|
|
10
|
-
task :vendor do
|
|
11
|
-
vendor_script = File.expand_path('../../scripts/ci/ruby/vendor-core-crate.py', __dir__)
|
|
12
|
-
puts 'Vendoring html-to-markdown-rs core crate...'
|
|
13
|
-
sh "python3 #{vendor_script}"
|
|
14
|
-
end
|
|
7
|
+
GEMSPEC = Gem::Specification.load(File.expand_path('html_to_markdown_rs.gemspec', __dir__))
|
|
15
8
|
|
|
16
9
|
Rake::ExtensionTask.new('html_to_markdown_rb', GEMSPEC) do |ext|
|
|
17
10
|
ext.lib_dir = 'lib'
|
|
18
|
-
ext.ext_dir = 'ext/
|
|
11
|
+
ext.ext_dir = 'ext/html_to_markdown_rb'
|
|
19
12
|
ext.cross_compile = true
|
|
20
13
|
ext.cross_platform = %w[
|
|
21
14
|
x86_64-linux
|
|
15
|
+
aarch64-linux
|
|
22
16
|
x86_64-darwin
|
|
23
17
|
arm64-darwin
|
|
24
18
|
x64-mingw32
|
|
19
|
+
x64-mingw-ucrt
|
|
25
20
|
]
|
|
26
21
|
end
|
|
27
22
|
|
|
28
23
|
RSpec::Core::RakeTask.new(:spec)
|
|
29
24
|
|
|
30
|
-
task compile: :vendor
|
|
31
25
|
task spec: :compile
|
|
32
26
|
task default: :spec
|
data/html-to-markdown-rb.gemspec
CHANGED
|
@@ -87,7 +87,7 @@ Gem::Specification.new do |spec|
|
|
|
87
87
|
spec.files = files
|
|
88
88
|
spec.extra_rdoc_files = ['README.md']
|
|
89
89
|
|
|
90
|
-
spec.extensions = ['ext/
|
|
90
|
+
spec.extensions = ['ext/html_to_markdown_rb/extconf.rb']
|
|
91
91
|
|
|
92
92
|
spec.add_dependency 'rb_sys', '>= 0.9', '< 1.0'
|
|
93
93
|
spec.metadata['rubygems_mfa_required'] = 'true'
|
data/lib/html_to_markdown.rb
CHANGED
|
@@ -3,28 +3,38 @@
|
|
|
3
3
|
require_relative 'html_to_markdown/version'
|
|
4
4
|
require 'html_to_markdown_rb'
|
|
5
5
|
|
|
6
|
+
# High-performance HTML to Markdown conversion.
|
|
7
|
+
#
|
|
8
|
+
# @example Simple conversion
|
|
9
|
+
# HtmlToMarkdown.convert('<h1>Hello</h1>') # => "# Hello\n\n"
|
|
10
|
+
#
|
|
11
|
+
# @example With options
|
|
12
|
+
# HtmlToMarkdown.convert('<h1>Hello</h1>', heading_style: 'atx')
|
|
6
13
|
module HtmlToMarkdown
|
|
7
|
-
|
|
8
|
-
autoload :CLIProxy, 'html_to_markdown/cli_proxy'
|
|
9
|
-
|
|
10
|
-
class << self
|
|
11
|
-
alias native_convert convert
|
|
12
|
-
end
|
|
13
|
-
|
|
14
|
-
module_function
|
|
15
|
-
|
|
16
|
-
# Convert HTML to Markdown, returning a Hash with:
|
|
17
|
-
# - :content [String, nil] the converted Markdown output
|
|
18
|
-
# - :document [nil] document structure (not yet exposed)
|
|
19
|
-
# - :metadata [Hash, nil] extracted HTML metadata
|
|
20
|
-
# - :tables [Array<Hash>] extracted tables with :grid and :markdown
|
|
21
|
-
# - :images [Array<Hash>] extracted inline images
|
|
22
|
-
# - :warnings [Array<Hash>] processing warnings
|
|
14
|
+
# Convert HTML to Markdown.
|
|
23
15
|
#
|
|
24
|
-
# @param html [String] HTML
|
|
25
|
-
# @param options [Hash
|
|
26
|
-
#
|
|
27
|
-
|
|
28
|
-
|
|
16
|
+
# @param html [String] The HTML content to convert.
|
|
17
|
+
# @param options [Hash] Optional conversion options.
|
|
18
|
+
# Supported keys (all optional):
|
|
19
|
+
# - :heading_style - 'atx', 'atx_closed', 'setext', 'underlined'
|
|
20
|
+
# - :code_block_style - 'backticks', 'tildes', 'indented'
|
|
21
|
+
# - :escape_asterisks - Boolean
|
|
22
|
+
# - :escape_underscores - Boolean
|
|
23
|
+
# - :escape_misc - Boolean
|
|
24
|
+
# - :escape_ascii - Boolean
|
|
25
|
+
# - :strip_newlines - Boolean
|
|
26
|
+
# - :keep_inline_images_in - Array of tag names
|
|
27
|
+
# - :strip_tags - Array of tag names to strip
|
|
28
|
+
# - :preserve_tags - Array of tag names to preserve verbatim
|
|
29
|
+
# (and more, matching ConversionOptions fields)
|
|
30
|
+
# @return [String] The converted Markdown content.
|
|
31
|
+
def self.convert(html, options = {})
|
|
32
|
+
opts = if options.nil? || options.empty?
|
|
33
|
+
nil
|
|
34
|
+
else
|
|
35
|
+
HtmlToMarkdownRs::ConversionOptions.new(options)
|
|
36
|
+
end
|
|
37
|
+
result = HtmlToMarkdownRs.convert(html, opts)
|
|
38
|
+
result.content || ''
|
|
29
39
|
end
|
|
30
40
|
end
|
data/lib/html_to_markdown_rb.so
CHANGED
|
Binary file
|
data/sig/html_to_markdown.rbs
CHANGED
|
@@ -1,3 +1,16 @@
|
|
|
1
|
+
# Native extension module (Magnus/rb-sys)
|
|
2
|
+
module HtmlToMarkdownRs
|
|
3
|
+
class ConversionOptions
|
|
4
|
+
def initialize: (Hash[Symbol, untyped]) -> void
|
|
5
|
+
end
|
|
6
|
+
|
|
7
|
+
class ConversionResult
|
|
8
|
+
def content: () -> String?
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
def self.convert: (String html, ConversionOptions? options) -> ConversionResult
|
|
12
|
+
end
|
|
13
|
+
|
|
1
14
|
# Type definitions for HtmlToMarkdown Ruby gem
|
|
2
15
|
module HtmlToMarkdown
|
|
3
16
|
VERSION: String
|
|
@@ -8,6 +21,7 @@ module HtmlToMarkdown
|
|
|
8
21
|
type whitespace_mode = :normalized | :strict
|
|
9
22
|
type newline_style = :spaces | :backslash
|
|
10
23
|
type code_block_style = :indented | :backticks | :tildes
|
|
24
|
+
type link_style = :inline | :reference
|
|
11
25
|
type output_format = :markdown | :djot
|
|
12
26
|
type preprocessing_preset = :minimal | :standard | :aggressive
|
|
13
27
|
|
|
@@ -49,6 +63,7 @@ module HtmlToMarkdown
|
|
|
49
63
|
debug?: bool,
|
|
50
64
|
strip_tags?: Array[String],
|
|
51
65
|
preserve_tags?: Array[String],
|
|
66
|
+
link_style?: link_style,
|
|
52
67
|
output_format?: output_format,
|
|
53
68
|
skip_images?: bool,
|
|
54
69
|
include_document_structure?: bool,
|
|
@@ -126,12 +141,9 @@ module HtmlToMarkdown
|
|
|
126
141
|
|
|
127
142
|
public
|
|
128
143
|
|
|
129
|
-
# Convert HTML to Markdown, returning
|
|
144
|
+
# Convert HTML to Markdown, returning the markdown content string.
|
|
130
145
|
#
|
|
131
146
|
# Example:
|
|
132
147
|
# result = HtmlToMarkdown.convert(html)
|
|
133
|
-
def self.convert: (String html, ?conversion_options options) ->
|
|
134
|
-
|
|
135
|
-
# Instance method version (created by module_function)
|
|
136
|
-
def convert: (String html, ?conversion_options options) -> Hash[String, untyped]
|
|
148
|
+
def self.convert: (String html, ?conversion_options options) -> String
|
|
137
149
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html-to-markdown
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 3.1.0
|
|
4
|
+
version: 3.2.1
|
|
5
5
|
platform: x86_64-linux
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-04-
|
|
11
|
+
date: 2026-04-16 00:00:00.000000000 Z
|
|
12
12
|
dependencies: []
|
|
13
13
|
description: |-
|
|
14
14
|
html-to-markdown is a native Ruby extension built on the shared Rust engine that powers the html-to-markdown project.
|
|
@@ -32,16 +32,13 @@ files:
|
|
|
32
32
|
- exe/html-to-markdown
|
|
33
33
|
- html-to-markdown-rb.gemspec
|
|
34
34
|
- lib/html_to_markdown.rb
|
|
35
|
-
- lib/html_to_markdown/cli.rb
|
|
36
|
-
- lib/html_to_markdown/cli_proxy.rb
|
|
37
35
|
- lib/html_to_markdown/version.rb
|
|
38
36
|
- lib/html_to_markdown_rb.so
|
|
37
|
+
- lib/html_to_markdown_rs.rb
|
|
39
38
|
- sig/html_to_markdown.rbs
|
|
40
39
|
- sig/html_to_markdown/cli.rbs
|
|
41
40
|
- sig/html_to_markdown/cli_proxy.rbs
|
|
42
41
|
- sig/open3.rbs
|
|
43
|
-
- spec/cli_proxy_spec.rb
|
|
44
|
-
- spec/spec_helper.rb
|
|
45
42
|
homepage: https://github.com/kreuzberg-dev/html-to-markdown
|
|
46
43
|
licenses:
|
|
47
44
|
- MIT
|
data/lib/html_to_markdown/cli.rb
DELETED
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'html_to_markdown/cli_proxy'
|
|
4
|
-
|
|
5
|
-
module HtmlToMarkdown
|
|
6
|
-
module CLI
|
|
7
|
-
module_function
|
|
8
|
-
|
|
9
|
-
def run(argv = ARGV, stdout: $stdout, stderr: $stderr)
|
|
10
|
-
output = CLIProxy.call(argv)
|
|
11
|
-
stdout.print(output)
|
|
12
|
-
0
|
|
13
|
-
rescue CLIProxy::CLIExecutionError => e
|
|
14
|
-
stderr.print(e.stderr)
|
|
15
|
-
e.status || 1
|
|
16
|
-
rescue CLIProxy::MissingBinaryError, CLIProxy::Error => e
|
|
17
|
-
stderr.puts(e.message)
|
|
18
|
-
1
|
|
19
|
-
end
|
|
20
|
-
end
|
|
21
|
-
end
|
|
@@ -1,74 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'open3'
|
|
4
|
-
require 'pathname'
|
|
5
|
-
|
|
6
|
-
module HtmlToMarkdown
|
|
7
|
-
module CLIProxy
|
|
8
|
-
class Error < StandardError
|
|
9
|
-
end
|
|
10
|
-
|
|
11
|
-
class MissingBinaryError < Error
|
|
12
|
-
end
|
|
13
|
-
|
|
14
|
-
class CLIExecutionError < Error
|
|
15
|
-
attr_reader :stderr, :status
|
|
16
|
-
|
|
17
|
-
def initialize(message, stderr:, status:)
|
|
18
|
-
super(message)
|
|
19
|
-
@stderr = stderr
|
|
20
|
-
@status = status
|
|
21
|
-
end
|
|
22
|
-
end
|
|
23
|
-
|
|
24
|
-
module_function
|
|
25
|
-
|
|
26
|
-
def call(argv)
|
|
27
|
-
binary = find_cli_binary
|
|
28
|
-
args = Array(argv).map(&:to_s)
|
|
29
|
-
stdout, stderr, status = Open3.capture3(binary.to_s, *args)
|
|
30
|
-
return stdout if status.success?
|
|
31
|
-
|
|
32
|
-
raise CLIExecutionError.new(
|
|
33
|
-
"html-to-markdown CLI exited with status #{status.exitstatus}",
|
|
34
|
-
stderr: stderr,
|
|
35
|
-
status: status.exitstatus
|
|
36
|
-
)
|
|
37
|
-
end
|
|
38
|
-
|
|
39
|
-
def find_cli_binary
|
|
40
|
-
binary_name = Gem.win_platform? ? 'html-to-markdown.exe' : 'html-to-markdown'
|
|
41
|
-
found = search_paths(binary_name).find(&:file?)
|
|
42
|
-
return found if found
|
|
43
|
-
|
|
44
|
-
raise MissingBinaryError, missing_binary_message
|
|
45
|
-
end
|
|
46
|
-
|
|
47
|
-
def root_path
|
|
48
|
-
@root_path ||= Pathname(__dir__.to_s).join('../..').expand_path
|
|
49
|
-
end
|
|
50
|
-
|
|
51
|
-
def lib_path
|
|
52
|
-
@lib_path ||= Pathname(__dir__.to_s).join('..').expand_path
|
|
53
|
-
end
|
|
54
|
-
|
|
55
|
-
def search_paths(binary_name)
|
|
56
|
-
paths = [
|
|
57
|
-
root_path.join('target', 'release', binary_name),
|
|
58
|
-
lib_path.join('bin', binary_name),
|
|
59
|
-
lib_path.join(binary_name)
|
|
60
|
-
]
|
|
61
|
-
|
|
62
|
-
workspace_root = root_path.parent&.parent
|
|
63
|
-
paths << workspace_root.join('target', 'release', binary_name) if workspace_root
|
|
64
|
-
paths
|
|
65
|
-
end
|
|
66
|
-
|
|
67
|
-
def missing_binary_message
|
|
68
|
-
<<~MSG.strip
|
|
69
|
-
html-to-markdown CLI binary not found. Build it with
|
|
70
|
-
`cargo build --release --package html-to-markdown-cli`.
|
|
71
|
-
MSG
|
|
72
|
-
end
|
|
73
|
-
end
|
|
74
|
-
end
|
data/spec/cli_proxy_spec.rb
DELETED
|
@@ -1,42 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'spec_helper'
|
|
4
|
-
require 'html_to_markdown/cli_proxy'
|
|
5
|
-
require 'html_to_markdown/cli'
|
|
6
|
-
require 'stringio'
|
|
7
|
-
|
|
8
|
-
RSpec.describe HtmlToMarkdown::CLIProxy do
|
|
9
|
-
describe '.call' do
|
|
10
|
-
it 'executes the CLI binary' do
|
|
11
|
-
begin
|
|
12
|
-
binary = described_class.find_cli_binary
|
|
13
|
-
rescue HtmlToMarkdown::CLIProxy::MissingBinaryError
|
|
14
|
-
skip 'CLI binary not built'
|
|
15
|
-
end
|
|
16
|
-
|
|
17
|
-
expect(binary).to be_file
|
|
18
|
-
|
|
19
|
-
output = described_class.call(['--version'])
|
|
20
|
-
expect(output).to include(HtmlToMarkdown::VERSION)
|
|
21
|
-
end
|
|
22
|
-
end
|
|
23
|
-
|
|
24
|
-
describe HtmlToMarkdown::CLI do
|
|
25
|
-
it 'writes CLI output to stdout' do
|
|
26
|
-
begin
|
|
27
|
-
HtmlToMarkdown::CLIProxy.find_cli_binary
|
|
28
|
-
rescue HtmlToMarkdown::CLIProxy::MissingBinaryError
|
|
29
|
-
skip 'CLI binary not built'
|
|
30
|
-
end
|
|
31
|
-
|
|
32
|
-
stdout = StringIO.new
|
|
33
|
-
stderr = StringIO.new
|
|
34
|
-
|
|
35
|
-
exit_code = described_class.run(['--version'], stdout: stdout, stderr: stderr)
|
|
36
|
-
|
|
37
|
-
expect(exit_code).to eq(0)
|
|
38
|
-
expect(stdout.string).to include(HtmlToMarkdown::VERSION)
|
|
39
|
-
expect(stderr.string).to be_empty
|
|
40
|
-
end
|
|
41
|
-
end
|
|
42
|
-
end
|