selma 0.2.2 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 50b438a2bdc8e13515ccc76c7e57a6fffb492c3ac7ff62781226ca5288f22913
4
- data.tar.gz: 66b31905df161bf62a2dd88df9a5ead5e9f0a0f3a2d1464b669634dc966bc9a3
3
+ metadata.gz: 69919cfb80c694b6f77e4e29e29c66d7b482d96ec83a9f47d3a3711d0b636924
4
+ data.tar.gz: 66f9f62677e3f25391180eb9e952852a351cae1d165b2427fe9c537f05884121
5
5
  SHA512:
6
- metadata.gz: f37389220da9e5b2ea7193cf1a3b7fa0198351233d5afa871c66679b06f274b161caab8b13222996345008899a5334a37150708533f3fd295ca266a5d1e35d1f
7
- data.tar.gz: 2dc168d6bf3d1032fd97dd21f126bfb1028a9b62693b7bc4af254953510f259e96aa7d4c541b23a9a0160c9f5c616b40eebec05b873f0df6fc3d03dc0114f018
6
+ metadata.gz: fb26ee26a928c9aa30c6f71e78914317213552c21729c6ffb46712859eac21b4ddc67709584e4af9ac2a0e4e5b98acce5e5fdbdfd921afdd124faac647b0db05
7
+ data.tar.gz: 9ed49314dc03d3d8f7c6a3b67da8e77536a13a8efd04a8d519796751832527339d291722ac7cb3e44dab9224c2294e70378e0c37d79c97aa3f924506567a990d
data/Cargo.lock CHANGED
@@ -165,18 +165,18 @@ checksum = "b5320ae4c3782150d900b79807611a59a99fc9a1d61d686faafc24b93fc8d7ca"
165
165
 
166
166
  [[package]]
167
167
  name = "enum-iterator"
168
- version = "1.4.1"
168
+ version = "2.1.0"
169
169
  source = "registry+https://github.com/rust-lang/crates.io-index"
170
- checksum = "7add3873b5dd076766ee79c8e406ad1a472c385476b9e38849f8eec24f1be689"
170
+ checksum = "c280b9e6b3ae19e152d8e31cf47f18389781e119d4013a2a2bb0180e5facc635"
171
171
  dependencies = [
172
172
  "enum-iterator-derive",
173
173
  ]
174
174
 
175
175
  [[package]]
176
176
  name = "enum-iterator-derive"
177
- version = "1.2.1"
177
+ version = "1.4.0"
178
178
  source = "registry+https://github.com/rust-lang/crates.io-index"
179
- checksum = "eecf8589574ce9b895052fa12d69af7a233f99e6107f5cb8dd1044f2a17bfdcb"
179
+ checksum = "a1ab991c1362ac86c61ab6f556cff143daa22e5a15e4e189df818b2fd19fe65b"
180
180
  dependencies = [
181
181
  "proc-macro2",
182
182
  "quote",
@@ -269,9 +269,9 @@ checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
269
269
 
270
270
  [[package]]
271
271
  name = "lol_html"
272
- version = "1.2.0"
272
+ version = "1.2.1"
273
273
  source = "registry+https://github.com/rust-lang/crates.io-index"
274
- checksum = "10662f7aad081ec900fd735be33076da75e0389400277dc3734e2b0aa02bb115"
274
+ checksum = "a4629ff9c2deeb7aad9b2d0f379fc41937a02f3b739f007732c46af40339dee5"
275
275
  dependencies = [
276
276
  "bitflags 2.4.1",
277
277
  "cfg-if",
@@ -282,16 +282,15 @@ dependencies = [
282
282
  "lazycell",
283
283
  "memchr",
284
284
  "mime",
285
- "safemem",
286
285
  "selectors",
287
286
  "thiserror",
288
287
  ]
289
288
 
290
289
  [[package]]
291
290
  name = "magnus"
292
- version = "0.6.2"
291
+ version = "0.6.4"
293
292
  source = "registry+https://github.com/rust-lang/crates.io-index"
294
- checksum = "4778544796676e8428e9c622460ebf284bea52d8b10db3aeb449d8b5e61b3a13"
293
+ checksum = "b1597ef40aa8c36be098249e82c9a20cf7199278ac1c1a1a995eeead6a184479"
295
294
  dependencies = [
296
295
  "magnus-macros",
297
296
  "rb-sys",
@@ -577,12 +576,6 @@ dependencies = [
577
576
  "semver",
578
577
  ]
579
578
 
580
- [[package]]
581
- name = "safemem"
582
- version = "0.3.3"
583
- source = "registry+https://github.com/rust-lang/crates.io-index"
584
- checksum = "ef703b7cb59335eae2eb93ceb664c0eb7ea6bf567079d843e09420219668e072"
585
-
586
579
  [[package]]
587
580
  name = "selectors"
588
581
  version = "0.22.0"
data/README.md CHANGED
@@ -76,7 +76,7 @@ attributes: {
76
76
 
77
77
  # URL handling protocols to allow in specific attributes. By default, no
78
78
  # protocols are allowed. Use :relative in place of a protocol if you want
79
- # to allow relative URLs sans protocol.
79
+ # to allow relative URLs sans protocol. Set to `:all` to allow any protocol.
80
80
  protocols: {
81
81
  "a" => { "href" => ["http", "https", "mailto", :relative] },
82
82
  "img" => { "href" => ["http", "https"] },
@@ -103,7 +103,11 @@ Here's an example which rewrites the `href` attribute on `a` and the `src` attri
103
103
 
104
104
  ```ruby
105
105
  class MatchAttribute
106
- SELECTOR = Selma::Selector(match_element: %(a[href^="http:"], img[src^="http:"]"))
106
+ SELECTOR = Selma::Selector.new(match_element: %(a[href^="http:"], img[src^="http:"]"))
107
+
108
+ def selector
109
+ SELECTOR
110
+ end
107
111
 
108
112
  def handle_element(element)
109
113
  if element.tag_name == "a"
@@ -178,38 +182,118 @@ The `element` argument in `handle_element` has the following methods:
178
182
 
179
183
  ## Benchmarks
180
184
 
185
+ When running `bundle exec rake benchmark`, two different benchmarks are calculated. Here are those results on my machine.
186
+
187
+ ### Benchmarks for just the sanitization process
188
+
189
+ Comparing Selma against popular Ruby sanitization gems:
190
+
191
+ <!-- prettier-ignore-start -->
181
192
  <details>
182
193
  <pre>
183
- ruby test/benchmark.rb
184
- ruby test/benchmark.rb
185
194
  Warming up --------------------------------------
186
- sanitize-document-huge
187
- 1.000 i/100ms
188
- selma-document-huge 1.000 i/100ms
195
+ sanitize-sm 15.000 i/100ms
196
+ selma-sm 126.000 i/100ms
197
+ Calculating -------------------------------------
198
+ sanitize-sm 155.074 (± 1.9%) i/s - 4.665k in 30.092214s
199
+ selma-sm 1.290k (± 1.3%) i/s - 38.808k in 30.085333s
200
+
201
+ Comparison:
202
+ selma-sm: 1290.1 i/s
203
+ sanitize-sm: 155.1 i/s - 8.32x slower
204
+
205
+ input size = 86686 bytes, 0.09 MB
206
+
207
+ ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
208
+ Warming up --------------------------------------
209
+ sanitize-md 3.000 i/100ms
210
+ selma-md 33.000 i/100ms
189
211
  Calculating -------------------------------------
190
- sanitize-document-huge
191
- 0.257 (± 0.0%) i/s - 2.000 in 7.783398s
192
- selma-document-huge 4.602 (± 0.0%) i/s - 23.000 in 5.002870s
212
+ sanitize-md 40.321 (± 5.0%) i/s - 1.206k in 30.004711s
213
+ selma-md 337.417 (± 1.5%) i/s - 10.131k in 30.032772s
214
+
215
+ Comparison:
216
+ selma-md: 337.4 i/s
217
+ sanitize-md: 40.3 i/s - 8.37x slower
218
+
219
+ input size = 7172510 bytes, 7.17 MB
220
+
221
+ ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
193
222
  Warming up --------------------------------------
194
- sanitize-document-medium
195
- 2.000 i/100ms
196
- selma-document-medium
197
- 22.000 i/100ms
223
+ sanitize-lg 1.000 i/100ms
224
+ selma-lg 1.000 i/100ms
198
225
  Calculating -------------------------------------
199
- sanitize-document-medium
200
- 28.676 (± 3.5%) i/s - 144.000 in 5.024669s
201
- selma-document-medium
202
- 121.500 (±22.2%) i/s - 594.000 in 5.135410s
226
+ sanitize-lg 0.144 (± 0.0%) i/s - 5.000 in 34.772526s
227
+ selma-lg 4.026 (± 0.0%) i/s - 121.000 in 30.067415s
228
+
229
+ Comparison:
230
+ selma-lg: 4.0 i/s
231
+ sanitize-lg: 0.1 i/s - 27.99x slower
232
+ </pre>
233
+ </details>
234
+ <!-- prettier-ignore-end -->
235
+
236
+ ### Benchmarks for just the rewriting process
237
+
238
+ Comparing Selma against popular Ruby HTML parsing gems:
239
+
240
+ <!-- prettier-ignore-start -->
241
+ <details>
242
+ <pre>
243
+
244
+ input size = 25309 bytes, 0.03 MB
245
+
246
+ ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
247
+ Warming up --------------------------------------
248
+ nokogiri-sm 79.000 i/100ms
249
+ nokolexbor-sm 285.000 i/100ms
250
+ selma-sm 244.000 i/100ms
251
+ Calculating -------------------------------------
252
+ nokogiri-sm 807.790 (± 3.1%) i/s - 24.253k in 30.056301s
253
+ nokolexbor-sm 2.880k (± 6.4%) i/s - 86.070k in 30.044766s
254
+ selma-sm 2.508k (± 1.2%) i/s - 75.396k in 30.068792s
255
+
256
+ Comparison:
257
+ nokolexbor-sm: 2880.3 i/s
258
+ selma-sm: 2507.8 i/s - 1.15x slower
259
+ nokogiri-sm: 807.8 i/s - 3.57x slower
260
+
261
+ input size = 86686 bytes, 0.09 MB
262
+
263
+ ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
264
+ Warming up --------------------------------------
265
+ nokogiri-md 8.000 i/100ms
266
+ nokolexbor-md 43.000 i/100ms
267
+ selma-md 39.000 i/100ms
268
+ Calculating -------------------------------------
269
+ nokogiri-md 87.367 (± 3.4%) i/s - 2.624k in 30.061642s
270
+ nokolexbor-md 438.782 (± 3.9%) i/s - 13.158k in 30.031163s
271
+ selma-md 392.591 (± 3.1%) i/s - 11.778k in 30.031391s
272
+
273
+ Comparison:
274
+ nokolexbor-md: 438.8 i/s
275
+ selma-md: 392.6 i/s - 1.12x slower
276
+ nokogiri-md: 87.4 i/s - 5.02x slower
277
+
278
+ input size = 7172510 bytes, 7.17 MB
279
+
280
+ ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
203
281
  Warming up --------------------------------------
204
- sanitize-document-small
205
- 10.000 i/100ms
206
- selma-document-small 20.000 i/100ms
282
+ nokogiri-lg 1.000 i/100ms
283
+ nokolexbor-lg 1.000 i/100ms
284
+ selma-lg 1.000 i/100ms
207
285
  Calculating -------------------------------------
208
- sanitize-document-small
209
- 107.280 (± 0.9%) i/s - 540.000 in 5.033850s
210
- selma-document-small 118.867 (±31.1%) i/s - 540.000 in 5.080726s
286
+ nokogiri-lg 0.895 (± 0.0%) i/s - 27.000 in 30.300832s
287
+ nokolexbor-lg 2.163 (± 0.0%) i/s - 65.000 in 30.085656s
288
+ selma-lg 5.867 (± 0.0%) i/s - 176.000 in 30.006240s
289
+
290
+ Comparison:
291
+ selma-lg: 5.9 i/s
292
+ nokolexbor-lg: 2.2 i/s - 2.71x slower
293
+ nokogiri-lg: 0.9 i/s - 6.55x slower
211
294
  </pre>
212
295
  </details>
296
+ <!-- prettier-ignore-end -->
213
297
 
214
298
  ## Contributing
215
299
 
data/ext/selma/Cargo.toml CHANGED
@@ -6,7 +6,7 @@ rust-version = "1.75.0"
6
6
  publish = false
7
7
 
8
8
  [dependencies]
9
- enum-iterator = "1.4"
9
+ enum-iterator = "2.1"
10
10
  escapist = "0.0.2"
11
11
  magnus = "0.6"
12
12
  lol_html = "1.2"
@@ -211,20 +211,23 @@ impl SelmaSanitizer {
211
211
  }
212
212
  Some(protocol_list) => protocol_list.push(allowed_protocol.to_string()),
213
213
  }
214
- } else if allowed_protocol.is_kind_of(class::symbol())
215
- && allowed_protocol.inspect() == ":relative"
216
- {
217
- match protocol_list {
218
- None => {
219
- protocol_sanitizers.insert(
220
- attr_name.to_string(),
221
- vec!["#".to_string(), "/".to_string()],
222
- );
223
- }
224
- Some(protocol_list) => {
225
- protocol_list.push("#".to_string());
226
- protocol_list.push("/".to_string());
214
+ } else if allowed_protocol.is_kind_of(class::symbol()) {
215
+ let protocol_config = allowed_protocol.inspect();
216
+ if protocol_config == ":relative" {
217
+ match protocol_list {
218
+ None => {
219
+ protocol_sanitizers.insert(
220
+ attr_name.to_string(),
221
+ vec!["#".to_string(), "/".to_string()],
222
+ );
223
+ }
224
+ Some(protocol_list) => {
225
+ protocol_list.push("#".to_string());
226
+ protocol_list.push("/".to_string());
227
+ }
227
228
  }
229
+ } else if protocol_config == ":all" {
230
+ protocol_sanitizers.insert(attr_name.to_string(), vec!["all".to_string()]);
228
231
  }
229
232
  }
230
233
  }
@@ -388,6 +391,10 @@ impl SelmaSanitizer {
388
391
  }
389
392
 
390
393
  fn has_allowed_protocol(protocols_allowed: &[String], attr_val: &String) -> bool {
394
+ if protocols_allowed.contains(&"all".to_string()) {
395
+ return true;
396
+ }
397
+
391
398
  // FIXME: is there a more idiomatic way to do this?
392
399
  let mut pos: usize = 0;
393
400
  let mut chars = attr_val.chars();
@@ -28,7 +28,7 @@ module Selma
28
28
 
29
29
  # URL handling protocols to allow in specific attributes. By default, no
30
30
  # protocols are allowed. Use :relative in place of a protocol if you want
31
- # to allow relative URLs sans protocol.
31
+ # to allow relative URLs sans protocol. Set to `:all` to allow any protocol.
32
32
  protocols: {},
33
33
 
34
34
  # An Array of element names whose contents will be removed. The contents
@@ -16,6 +16,7 @@ module Selma
16
16
  "colgroup",
17
17
  "data",
18
18
  "del",
19
+ "details",
19
20
  "div",
20
21
  "figcaption",
21
22
  "figure",
@@ -66,7 +66,12 @@ module Selma
66
66
  end
67
67
 
68
68
  def allow_protocol(element, attr, protos)
69
- protos = [protos] unless protos.is_a?(Array)
69
+ if protos.is_a?(Array)
70
+ raise ArgumentError, "`:all` must be passed outside of an array" if protos.include?(:all)
71
+ else
72
+ protos = [protos]
73
+ end
74
+
70
75
  set_allowed_protocols(element, attr, protos)
71
76
  end
72
77
 
data/lib/selma/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Selma
4
- VERSION = "0.2.2"
4
+ VERSION = "0.3.0"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: selma
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Garen J. Torikian
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-01-11 00:00:00.000000000 Z
11
+ date: 2024-06-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -52,7 +52,7 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '1.2'
55
- description:
55
+ description:
56
56
  email:
57
57
  - gjtorikian@gmail.com
58
58
  executables: []
@@ -88,7 +88,7 @@ files:
88
88
  - lib/selma/sanitizer/config/restricted.rb
89
89
  - lib/selma/selector.rb
90
90
  - lib/selma/version.rb
91
- homepage:
91
+ homepage:
92
92
  licenses:
93
93
  - MIT
94
94
  metadata:
@@ -96,7 +96,7 @@ metadata:
96
96
  funding_uri: https://github.com/sponsors/gjtorikian/
97
97
  source_code_uri: https://github.com/gjtorikian/selma
98
98
  rubygems_mfa_required: 'true'
99
- post_install_message:
99
+ post_install_message:
100
100
  rdoc_options: []
101
101
  require_paths:
102
102
  - lib
@@ -112,7 +112,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
112
112
  version: 3.3.22
113
113
  requirements: []
114
114
  rubygems_version: 3.5.3
115
- signing_key:
115
+ signing_key:
116
116
  specification_version: 4
117
117
  summary: Selma selects and matches HTML nodes using CSS rules. Backed by Rust's lol_html
118
118
  parser.