selma 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +8 -15
- data/README.md +108 -24
- data/ext/selma/Cargo.toml +1 -1
- data/ext/selma/src/sanitizer.rs +20 -13
- data/lib/selma/sanitizer/config/default.rb +1 -1
- data/lib/selma/sanitizer/config/relaxed.rb +1 -0
- data/lib/selma/sanitizer.rb +6 -1
- data/lib/selma/version.rb +1 -1
- metadata +7 -7
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 69919cfb80c694b6f77e4e29e29c66d7b482d96ec83a9f47d3a3711d0b636924
|
|
4
|
+
data.tar.gz: 66f9f62677e3f25391180eb9e952852a351cae1d165b2427fe9c537f05884121
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: fb26ee26a928c9aa30c6f71e78914317213552c21729c6ffb46712859eac21b4ddc67709584e4af9ac2a0e4e5b98acce5e5fdbdfd921afdd124faac647b0db05
|
|
7
|
+
data.tar.gz: 9ed49314dc03d3d8f7c6a3b67da8e77536a13a8efd04a8d519796751832527339d291722ac7cb3e44dab9224c2294e70378e0c37d79c97aa3f924506567a990d
|
data/Cargo.lock
CHANGED
|
@@ -165,18 +165,18 @@ checksum = "b5320ae4c3782150d900b79807611a59a99fc9a1d61d686faafc24b93fc8d7ca"
|
|
|
165
165
|
|
|
166
166
|
[[package]]
|
|
167
167
|
name = "enum-iterator"
|
|
168
|
-
version = "1.
|
|
168
|
+
version = "2.1.0"
|
|
169
169
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
170
|
-
checksum = "
|
|
170
|
+
checksum = "c280b9e6b3ae19e152d8e31cf47f18389781e119d4013a2a2bb0180e5facc635"
|
|
171
171
|
dependencies = [
|
|
172
172
|
"enum-iterator-derive",
|
|
173
173
|
]
|
|
174
174
|
|
|
175
175
|
[[package]]
|
|
176
176
|
name = "enum-iterator-derive"
|
|
177
|
-
version = "1.
|
|
177
|
+
version = "1.4.0"
|
|
178
178
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
179
|
-
checksum = "
|
|
179
|
+
checksum = "a1ab991c1362ac86c61ab6f556cff143daa22e5a15e4e189df818b2fd19fe65b"
|
|
180
180
|
dependencies = [
|
|
181
181
|
"proc-macro2",
|
|
182
182
|
"quote",
|
|
@@ -269,9 +269,9 @@ checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
|
|
|
269
269
|
|
|
270
270
|
[[package]]
|
|
271
271
|
name = "lol_html"
|
|
272
|
-
version = "1.2.
|
|
272
|
+
version = "1.2.1"
|
|
273
273
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
274
|
-
checksum = "
|
|
274
|
+
checksum = "a4629ff9c2deeb7aad9b2d0f379fc41937a02f3b739f007732c46af40339dee5"
|
|
275
275
|
dependencies = [
|
|
276
276
|
"bitflags 2.4.1",
|
|
277
277
|
"cfg-if",
|
|
@@ -282,16 +282,15 @@ dependencies = [
|
|
|
282
282
|
"lazycell",
|
|
283
283
|
"memchr",
|
|
284
284
|
"mime",
|
|
285
|
-
"safemem",
|
|
286
285
|
"selectors",
|
|
287
286
|
"thiserror",
|
|
288
287
|
]
|
|
289
288
|
|
|
290
289
|
[[package]]
|
|
291
290
|
name = "magnus"
|
|
292
|
-
version = "0.6.
|
|
291
|
+
version = "0.6.4"
|
|
293
292
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
294
|
-
checksum = "
|
|
293
|
+
checksum = "b1597ef40aa8c36be098249e82c9a20cf7199278ac1c1a1a995eeead6a184479"
|
|
295
294
|
dependencies = [
|
|
296
295
|
"magnus-macros",
|
|
297
296
|
"rb-sys",
|
|
@@ -577,12 +576,6 @@ dependencies = [
|
|
|
577
576
|
"semver",
|
|
578
577
|
]
|
|
579
578
|
|
|
580
|
-
[[package]]
|
|
581
|
-
name = "safemem"
|
|
582
|
-
version = "0.3.3"
|
|
583
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
584
|
-
checksum = "ef703b7cb59335eae2eb93ceb664c0eb7ea6bf567079d843e09420219668e072"
|
|
585
|
-
|
|
586
579
|
[[package]]
|
|
587
580
|
name = "selectors"
|
|
588
581
|
version = "0.22.0"
|
data/README.md
CHANGED
|
@@ -76,7 +76,7 @@ attributes: {
|
|
|
76
76
|
|
|
77
77
|
# URL handling protocols to allow in specific attributes. By default, no
|
|
78
78
|
# protocols are allowed. Use :relative in place of a protocol if you want
|
|
79
|
-
# to allow relative URLs sans protocol.
|
|
79
|
+
# to allow relative URLs sans protocol. Set to `:all` to allow any protocol.
|
|
80
80
|
protocols: {
|
|
81
81
|
"a" => { "href" => ["http", "https", "mailto", :relative] },
|
|
82
82
|
"img" => { "href" => ["http", "https"] },
|
|
@@ -103,7 +103,11 @@ Here's an example which rewrites the `href` attribute on `a` and the `src` attri
|
|
|
103
103
|
|
|
104
104
|
```ruby
|
|
105
105
|
class MatchAttribute
|
|
106
|
-
SELECTOR = Selma::Selector(match_element: %(a[href^="http:"], img[src^="http:"]"))
|
|
106
|
+
SELECTOR = Selma::Selector.new(match_element: %(a[href^="http:"], img[src^="http:"]))
|
|
107
|
+
|
|
108
|
+
def selector
|
|
109
|
+
SELECTOR
|
|
110
|
+
end
|
|
107
111
|
|
|
108
112
|
def handle_element(element)
|
|
109
113
|
if element.tag_name == "a"
|
|
@@ -178,38 +182,118 @@ The `element` argument in `handle_element` has the following methods:
|
|
|
178
182
|
|
|
179
183
|
## Benchmarks
|
|
180
184
|
|
|
185
|
+
When running `bundle exec rake benchmark`, two different benchmarks are calculated. Here are the results on my machine.
|
|
186
|
+
|
|
187
|
+
### Benchmarks for just the sanitization process
|
|
188
|
+
|
|
189
|
+
Comparing Selma against popular Ruby sanitization gems:
|
|
190
|
+
|
|
191
|
+
<!-- prettier-ignore-start -->
|
|
181
192
|
<details>
|
|
182
193
|
<pre>
|
|
183
|
-
ruby test/benchmark.rb
|
|
184
|
-
ruby test/benchmark.rb
|
|
185
194
|
Warming up --------------------------------------
|
|
186
|
-
sanitize-
|
|
187
|
-
|
|
188
|
-
|
|
195
|
+
sanitize-sm 15.000 i/100ms
|
|
196
|
+
selma-sm 126.000 i/100ms
|
|
197
|
+
Calculating -------------------------------------
|
|
198
|
+
sanitize-sm 155.074 (± 1.9%) i/s - 4.665k in 30.092214s
|
|
199
|
+
selma-sm 1.290k (± 1.3%) i/s - 38.808k in 30.085333s
|
|
200
|
+
|
|
201
|
+
Comparison:
|
|
202
|
+
selma-sm: 1290.1 i/s
|
|
203
|
+
sanitize-sm: 155.1 i/s - 8.32x slower
|
|
204
|
+
|
|
205
|
+
input size = 86686 bytes, 0.09 MB
|
|
206
|
+
|
|
207
|
+
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
|
|
208
|
+
Warming up --------------------------------------
|
|
209
|
+
sanitize-md 3.000 i/100ms
|
|
210
|
+
selma-md 33.000 i/100ms
|
|
189
211
|
Calculating -------------------------------------
|
|
190
|
-
sanitize-
|
|
191
|
-
|
|
192
|
-
|
|
212
|
+
sanitize-md 40.321 (± 5.0%) i/s - 1.206k in 30.004711s
|
|
213
|
+
selma-md 337.417 (± 1.5%) i/s - 10.131k in 30.032772s
|
|
214
|
+
|
|
215
|
+
Comparison:
|
|
216
|
+
selma-md: 337.4 i/s
|
|
217
|
+
sanitize-md: 40.3 i/s - 8.37x slower
|
|
218
|
+
|
|
219
|
+
input size = 7172510 bytes, 7.17 MB
|
|
220
|
+
|
|
221
|
+
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
|
|
193
222
|
Warming up --------------------------------------
|
|
194
|
-
sanitize-
|
|
195
|
-
|
|
196
|
-
selma-document-medium
|
|
197
|
-
22.000 i/100ms
|
|
223
|
+
sanitize-lg 1.000 i/100ms
|
|
224
|
+
selma-lg 1.000 i/100ms
|
|
198
225
|
Calculating -------------------------------------
|
|
199
|
-
sanitize-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
226
|
+
sanitize-lg 0.144 (± 0.0%) i/s - 5.000 in 34.772526s
|
|
227
|
+
selma-lg 4.026 (± 0.0%) i/s - 121.000 in 30.067415s
|
|
228
|
+
|
|
229
|
+
Comparison:
|
|
230
|
+
selma-lg: 4.0 i/s
|
|
231
|
+
sanitize-lg: 0.1 i/s - 27.99x slower
|
|
232
|
+
</pre>
|
|
233
|
+
</details>
|
|
234
|
+
<!-- prettier-ignore-end -->
|
|
235
|
+
|
|
236
|
+
### Benchmarks for just the rewriting process
|
|
237
|
+
|
|
238
|
+
Comparing Selma against popular Ruby HTML parsing gems:
|
|
239
|
+
|
|
240
|
+
<!-- prettier-ignore-start -->
|
|
241
|
+
<details>
|
|
242
|
+
<pre>
|
|
243
|
+
|
|
244
|
+
input size = 25309 bytes, 0.03 MB
|
|
245
|
+
|
|
246
|
+
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
|
|
247
|
+
Warming up --------------------------------------
|
|
248
|
+
nokogiri-sm 79.000 i/100ms
|
|
249
|
+
nokolexbor-sm 285.000 i/100ms
|
|
250
|
+
selma-sm 244.000 i/100ms
|
|
251
|
+
Calculating -------------------------------------
|
|
252
|
+
nokogiri-sm 807.790 (± 3.1%) i/s - 24.253k in 30.056301s
|
|
253
|
+
nokolexbor-sm 2.880k (± 6.4%) i/s - 86.070k in 30.044766s
|
|
254
|
+
selma-sm 2.508k (± 1.2%) i/s - 75.396k in 30.068792s
|
|
255
|
+
|
|
256
|
+
Comparison:
|
|
257
|
+
nokolexbor-sm: 2880.3 i/s
|
|
258
|
+
selma-sm: 2507.8 i/s - 1.15x slower
|
|
259
|
+
nokogiri-sm: 807.8 i/s - 3.57x slower
|
|
260
|
+
|
|
261
|
+
input size = 86686 bytes, 0.09 MB
|
|
262
|
+
|
|
263
|
+
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
|
|
264
|
+
Warming up --------------------------------------
|
|
265
|
+
nokogiri-md 8.000 i/100ms
|
|
266
|
+
nokolexbor-md 43.000 i/100ms
|
|
267
|
+
selma-md 39.000 i/100ms
|
|
268
|
+
Calculating -------------------------------------
|
|
269
|
+
nokogiri-md 87.367 (± 3.4%) i/s - 2.624k in 30.061642s
|
|
270
|
+
nokolexbor-md 438.782 (± 3.9%) i/s - 13.158k in 30.031163s
|
|
271
|
+
selma-md 392.591 (± 3.1%) i/s - 11.778k in 30.031391s
|
|
272
|
+
|
|
273
|
+
Comparison:
|
|
274
|
+
nokolexbor-md: 438.8 i/s
|
|
275
|
+
selma-md: 392.6 i/s - 1.12x slower
|
|
276
|
+
nokogiri-md: 87.4 i/s - 5.02x slower
|
|
277
|
+
|
|
278
|
+
input size = 7172510 bytes, 7.17 MB
|
|
279
|
+
|
|
280
|
+
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
|
|
203
281
|
Warming up --------------------------------------
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
selma-
|
|
282
|
+
nokogiri-lg 1.000 i/100ms
|
|
283
|
+
nokolexbor-lg 1.000 i/100ms
|
|
284
|
+
selma-lg 1.000 i/100ms
|
|
207
285
|
Calculating -------------------------------------
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
selma-
|
|
286
|
+
nokogiri-lg 0.895 (± 0.0%) i/s - 27.000 in 30.300832s
|
|
287
|
+
nokolexbor-lg 2.163 (± 0.0%) i/s - 65.000 in 30.085656s
|
|
288
|
+
selma-lg 5.867 (± 0.0%) i/s - 176.000 in 30.006240s
|
|
289
|
+
|
|
290
|
+
Comparison:
|
|
291
|
+
selma-lg: 5.9 i/s
|
|
292
|
+
nokolexbor-lg: 2.2 i/s - 2.71x slower
|
|
293
|
+
nokogiri-lg: 0.9 i/s - 6.55x slower
|
|
211
294
|
</pre>
|
|
212
295
|
</details>
|
|
296
|
+
<!-- prettier-ignore-end -->
|
|
213
297
|
|
|
214
298
|
## Contributing
|
|
215
299
|
|
data/ext/selma/Cargo.toml
CHANGED
data/ext/selma/src/sanitizer.rs
CHANGED
|
@@ -211,20 +211,23 @@ impl SelmaSanitizer {
|
|
|
211
211
|
}
|
|
212
212
|
Some(protocol_list) => protocol_list.push(allowed_protocol.to_string()),
|
|
213
213
|
}
|
|
214
|
-
} else if allowed_protocol.is_kind_of(class::symbol())
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
214
|
+
} else if allowed_protocol.is_kind_of(class::symbol()) {
|
|
215
|
+
let protocol_config = allowed_protocol.inspect();
|
|
216
|
+
if protocol_config == ":relative" {
|
|
217
|
+
match protocol_list {
|
|
218
|
+
None => {
|
|
219
|
+
protocol_sanitizers.insert(
|
|
220
|
+
attr_name.to_string(),
|
|
221
|
+
vec!["#".to_string(), "/".to_string()],
|
|
222
|
+
);
|
|
223
|
+
}
|
|
224
|
+
Some(protocol_list) => {
|
|
225
|
+
protocol_list.push("#".to_string());
|
|
226
|
+
protocol_list.push("/".to_string());
|
|
227
|
+
}
|
|
227
228
|
}
|
|
229
|
+
} else if protocol_config == ":all" {
|
|
230
|
+
protocol_sanitizers.insert(attr_name.to_string(), vec!["all".to_string()]);
|
|
228
231
|
}
|
|
229
232
|
}
|
|
230
233
|
}
|
|
@@ -388,6 +391,10 @@ impl SelmaSanitizer {
|
|
|
388
391
|
}
|
|
389
392
|
|
|
390
393
|
fn has_allowed_protocol(protocols_allowed: &[String], attr_val: &String) -> bool {
|
|
394
|
+
if protocols_allowed.contains(&"all".to_string()) {
|
|
395
|
+
return true;
|
|
396
|
+
}
|
|
397
|
+
|
|
391
398
|
// FIXME: is there a more idiomatic way to do this?
|
|
392
399
|
let mut pos: usize = 0;
|
|
393
400
|
let mut chars = attr_val.chars();
|
|
@@ -28,7 +28,7 @@ module Selma
|
|
|
28
28
|
|
|
29
29
|
# URL handling protocols to allow in specific attributes. By default, no
|
|
30
30
|
# protocols are allowed. Use :relative in place of a protocol if you want
|
|
31
|
-
# to allow relative URLs sans protocol.
|
|
31
|
+
# to allow relative URLs sans protocol. Set to `:all` to allow any protocol.
|
|
32
32
|
protocols: {},
|
|
33
33
|
|
|
34
34
|
# An Array of element names whose contents will be removed. The contents
|
data/lib/selma/sanitizer.rb
CHANGED
|
@@ -66,7 +66,12 @@ module Selma
|
|
|
66
66
|
end
|
|
67
67
|
|
|
68
68
|
def allow_protocol(element, attr, protos)
|
|
69
|
-
|
|
69
|
+
if protos.is_a?(Array)
|
|
70
|
+
raise ArgumentError, "`:all` must be passed outside of an array" if protos.include?(:all)
|
|
71
|
+
else
|
|
72
|
+
protos = [protos]
|
|
73
|
+
end
|
|
74
|
+
|
|
70
75
|
set_allowed_protocols(element, attr, protos)
|
|
71
76
|
end
|
|
72
77
|
|
data/lib/selma/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: selma
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.3.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Garen J. Torikian
|
|
8
|
-
autorequire:
|
|
8
|
+
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2024-
|
|
11
|
+
date: 2024-06-07 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|
|
@@ -52,7 +52,7 @@ dependencies:
|
|
|
52
52
|
- - "~>"
|
|
53
53
|
- !ruby/object:Gem::Version
|
|
54
54
|
version: '1.2'
|
|
55
|
-
description:
|
|
55
|
+
description:
|
|
56
56
|
email:
|
|
57
57
|
- gjtorikian@gmail.com
|
|
58
58
|
executables: []
|
|
@@ -88,7 +88,7 @@ files:
|
|
|
88
88
|
- lib/selma/sanitizer/config/restricted.rb
|
|
89
89
|
- lib/selma/selector.rb
|
|
90
90
|
- lib/selma/version.rb
|
|
91
|
-
homepage:
|
|
91
|
+
homepage:
|
|
92
92
|
licenses:
|
|
93
93
|
- MIT
|
|
94
94
|
metadata:
|
|
@@ -96,7 +96,7 @@ metadata:
|
|
|
96
96
|
funding_uri: https://github.com/sponsors/gjtorikian/
|
|
97
97
|
source_code_uri: https://github.com/gjtorikian/selma
|
|
98
98
|
rubygems_mfa_required: 'true'
|
|
99
|
-
post_install_message:
|
|
99
|
+
post_install_message:
|
|
100
100
|
rdoc_options: []
|
|
101
101
|
require_paths:
|
|
102
102
|
- lib
|
|
@@ -112,7 +112,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
112
112
|
version: 3.3.22
|
|
113
113
|
requirements: []
|
|
114
114
|
rubygems_version: 3.5.3
|
|
115
|
-
signing_key:
|
|
115
|
+
signing_key:
|
|
116
116
|
specification_version: 4
|
|
117
117
|
summary: Selma selects and matches HTML nodes using CSS rules. Backed by Rust's lol_html
|
|
118
118
|
parser.
|