selma 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 50b438a2bdc8e13515ccc76c7e57a6fffb492c3ac7ff62781226ca5288f22913
4
- data.tar.gz: 66b31905df161bf62a2dd88df9a5ead5e9f0a0f3a2d1464b669634dc966bc9a3
3
+ metadata.gz: 69919cfb80c694b6f77e4e29e29c66d7b482d96ec83a9f47d3a3711d0b636924
4
+ data.tar.gz: 66f9f62677e3f25391180eb9e952852a351cae1d165b2427fe9c537f05884121
5
5
  SHA512:
6
- metadata.gz: f37389220da9e5b2ea7193cf1a3b7fa0198351233d5afa871c66679b06f274b161caab8b13222996345008899a5334a37150708533f3fd295ca266a5d1e35d1f
7
- data.tar.gz: 2dc168d6bf3d1032fd97dd21f126bfb1028a9b62693b7bc4af254953510f259e96aa7d4c541b23a9a0160c9f5c616b40eebec05b873f0df6fc3d03dc0114f018
6
+ metadata.gz: fb26ee26a928c9aa30c6f71e78914317213552c21729c6ffb46712859eac21b4ddc67709584e4af9ac2a0e4e5b98acce5e5fdbdfd921afdd124faac647b0db05
7
+ data.tar.gz: 9ed49314dc03d3d8f7c6a3b67da8e77536a13a8efd04a8d519796751832527339d291722ac7cb3e44dab9224c2294e70378e0c37d79c97aa3f924506567a990d
data/Cargo.lock CHANGED
@@ -165,18 +165,18 @@ checksum = "b5320ae4c3782150d900b79807611a59a99fc9a1d61d686faafc24b93fc8d7ca"
165
165
 
166
166
  [[package]]
167
167
  name = "enum-iterator"
168
- version = "1.4.1"
168
+ version = "2.1.0"
169
169
  source = "registry+https://github.com/rust-lang/crates.io-index"
170
- checksum = "7add3873b5dd076766ee79c8e406ad1a472c385476b9e38849f8eec24f1be689"
170
+ checksum = "c280b9e6b3ae19e152d8e31cf47f18389781e119d4013a2a2bb0180e5facc635"
171
171
  dependencies = [
172
172
  "enum-iterator-derive",
173
173
  ]
174
174
 
175
175
  [[package]]
176
176
  name = "enum-iterator-derive"
177
- version = "1.2.1"
177
+ version = "1.4.0"
178
178
  source = "registry+https://github.com/rust-lang/crates.io-index"
179
- checksum = "eecf8589574ce9b895052fa12d69af7a233f99e6107f5cb8dd1044f2a17bfdcb"
179
+ checksum = "a1ab991c1362ac86c61ab6f556cff143daa22e5a15e4e189df818b2fd19fe65b"
180
180
  dependencies = [
181
181
  "proc-macro2",
182
182
  "quote",
@@ -269,9 +269,9 @@ checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
269
269
 
270
270
  [[package]]
271
271
  name = "lol_html"
272
- version = "1.2.0"
272
+ version = "1.2.1"
273
273
  source = "registry+https://github.com/rust-lang/crates.io-index"
274
- checksum = "10662f7aad081ec900fd735be33076da75e0389400277dc3734e2b0aa02bb115"
274
+ checksum = "a4629ff9c2deeb7aad9b2d0f379fc41937a02f3b739f007732c46af40339dee5"
275
275
  dependencies = [
276
276
  "bitflags 2.4.1",
277
277
  "cfg-if",
@@ -282,16 +282,15 @@ dependencies = [
282
282
  "lazycell",
283
283
  "memchr",
284
284
  "mime",
285
- "safemem",
286
285
  "selectors",
287
286
  "thiserror",
288
287
  ]
289
288
 
290
289
  [[package]]
291
290
  name = "magnus"
292
- version = "0.6.2"
291
+ version = "0.6.4"
293
292
  source = "registry+https://github.com/rust-lang/crates.io-index"
294
- checksum = "4778544796676e8428e9c622460ebf284bea52d8b10db3aeb449d8b5e61b3a13"
293
+ checksum = "b1597ef40aa8c36be098249e82c9a20cf7199278ac1c1a1a995eeead6a184479"
295
294
  dependencies = [
296
295
  "magnus-macros",
297
296
  "rb-sys",
@@ -577,12 +576,6 @@ dependencies = [
577
576
  "semver",
578
577
  ]
579
578
 
580
- [[package]]
581
- name = "safemem"
582
- version = "0.3.3"
583
- source = "registry+https://github.com/rust-lang/crates.io-index"
584
- checksum = "ef703b7cb59335eae2eb93ceb664c0eb7ea6bf567079d843e09420219668e072"
585
-
586
579
  [[package]]
587
580
  name = "selectors"
588
581
  version = "0.22.0"
data/README.md CHANGED
@@ -76,7 +76,7 @@ attributes: {
76
76
 
77
77
  # URL handling protocols to allow in specific attributes. By default, no
78
78
  # protocols are allowed. Use :relative in place of a protocol if you want
79
- # to allow relative URLs sans protocol.
79
+ # to allow relative URLs sans protocol. Set to `:all` to allow any protocol.
80
80
  protocols: {
81
81
  "a" => { "href" => ["http", "https", "mailto", :relative] },
82
82
  "img" => { "href" => ["http", "https"] },
@@ -103,7 +103,11 @@ Here's an example which rewrites the `href` attribute on `a` and the `src` attri
103
103
 
104
104
  ```ruby
105
105
  class MatchAttribute
106
- SELECTOR = Selma::Selector(match_element: %(a[href^="http:"], img[src^="http:"]"))
106
+ SELECTOR = Selma::Selector.new(match_element: %(a[href^="http:"], img[src^="http:"]"))
107
+
108
+ def selector
109
+ SELECTOR
110
+ end
107
111
 
108
112
  def handle_element(element)
109
113
  if element.tag_name == "a"
@@ -178,38 +182,118 @@ The `element` argument in `handle_element` has the following methods:
178
182
 
179
183
  ## Benchmarks
180
184
 
185
+ When running `bundle exec rake benchmark`, two different benchmarks are calculated. Here are those results on my machine.
186
+
187
+ ### Benchmarks for just the sanitization process
188
+
189
+ Comparing Selma against popular Ruby sanitization gems:
190
+
191
+ <!-- prettier-ignore-start -->
181
192
  <details>
182
193
  <pre>
183
- ruby test/benchmark.rb
184
- ruby test/benchmark.rb
185
194
  Warming up --------------------------------------
186
- sanitize-document-huge
187
- 1.000 i/100ms
188
- selma-document-huge 1.000 i/100ms
195
+ sanitize-sm 15.000 i/100ms
196
+ selma-sm 126.000 i/100ms
197
+ Calculating -------------------------------------
198
+ sanitize-sm 155.074 (± 1.9%) i/s - 4.665k in 30.092214s
199
+ selma-sm 1.290k (± 1.3%) i/s - 38.808k in 30.085333s
200
+
201
+ Comparison:
202
+ selma-sm: 1290.1 i/s
203
+ sanitize-sm: 155.1 i/s - 8.32x slower
204
+
205
+ input size = 86686 bytes, 0.09 MB
206
+
207
+ ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
208
+ Warming up --------------------------------------
209
+ sanitize-md 3.000 i/100ms
210
+ selma-md 33.000 i/100ms
189
211
  Calculating -------------------------------------
190
- sanitize-document-huge
191
- 0.257 (± 0.0%) i/s - 2.000 in 7.783398s
192
- selma-document-huge 4.602 (± 0.0%) i/s - 23.000 in 5.002870s
212
+ sanitize-md 40.321 (± 5.0%) i/s - 1.206k in 30.004711s
213
+ selma-md 337.417 (± 1.5%) i/s - 10.131k in 30.032772s
214
+
215
+ Comparison:
216
+ selma-md: 337.4 i/s
217
+ sanitize-md: 40.3 i/s - 8.37x slower
218
+
219
+ input size = 7172510 bytes, 7.17 MB
220
+
221
+ ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
193
222
  Warming up --------------------------------------
194
- sanitize-document-medium
195
- 2.000 i/100ms
196
- selma-document-medium
197
- 22.000 i/100ms
223
+ sanitize-lg 1.000 i/100ms
224
+ selma-lg 1.000 i/100ms
198
225
  Calculating -------------------------------------
199
- sanitize-document-medium
200
- 28.676 (± 3.5%) i/s - 144.000 in 5.024669s
201
- selma-document-medium
202
- 121.500 (±22.2%) i/s - 594.000 in 5.135410s
226
+ sanitize-lg 0.144 (± 0.0%) i/s - 5.000 in 34.772526s
227
+ selma-lg 4.026 (± 0.0%) i/s - 121.000 in 30.067415s
228
+
229
+ Comparison:
230
+ selma-lg: 4.0 i/s
231
+ sanitize-lg: 0.1 i/s - 27.99x slower
232
+ </pre>
233
+ </details>
234
+ <!-- prettier-ignore-end -->
235
+
236
+ ### Benchmarks for just the rewriting process
237
+
238
+ Comparing Selma against popular Ruby HTML parsing gems:
239
+
240
+ <!-- prettier-ignore-start -->
241
+ <details>
242
+ <pre>
243
+
244
+ input size = 25309 bytes, 0.03 MB
245
+
246
+ ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
247
+ Warming up --------------------------------------
248
+ nokogiri-sm 79.000 i/100ms
249
+ nokolexbor-sm 285.000 i/100ms
250
+ selma-sm 244.000 i/100ms
251
+ Calculating -------------------------------------
252
+ nokogiri-sm 807.790 (± 3.1%) i/s - 24.253k in 30.056301s
253
+ nokolexbor-sm 2.880k (± 6.4%) i/s - 86.070k in 30.044766s
254
+ selma-sm 2.508k (± 1.2%) i/s - 75.396k in 30.068792s
255
+
256
+ Comparison:
257
+ nokolexbor-sm: 2880.3 i/s
258
+ selma-sm: 2507.8 i/s - 1.15x slower
259
+ nokogiri-sm: 807.8 i/s - 3.57x slower
260
+
261
+ input size = 86686 bytes, 0.09 MB
262
+
263
+ ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
264
+ Warming up --------------------------------------
265
+ nokogiri-md 8.000 i/100ms
266
+ nokolexbor-md 43.000 i/100ms
267
+ selma-md 39.000 i/100ms
268
+ Calculating -------------------------------------
269
+ nokogiri-md 87.367 (± 3.4%) i/s - 2.624k in 30.061642s
270
+ nokolexbor-md 438.782 (± 3.9%) i/s - 13.158k in 30.031163s
271
+ selma-md 392.591 (± 3.1%) i/s - 11.778k in 30.031391s
272
+
273
+ Comparison:
274
+ nokolexbor-md: 438.8 i/s
275
+ selma-md: 392.6 i/s - 1.12x slower
276
+ nokogiri-md: 87.4 i/s - 5.02x slower
277
+
278
+ input size = 7172510 bytes, 7.17 MB
279
+
280
+ ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
203
281
  Warming up --------------------------------------
204
- sanitize-document-small
205
- 10.000 i/100ms
206
- selma-document-small 20.000 i/100ms
282
+ nokogiri-lg 1.000 i/100ms
283
+ nokolexbor-lg 1.000 i/100ms
284
+ selma-lg 1.000 i/100ms
207
285
  Calculating -------------------------------------
208
- sanitize-document-small
209
- 107.280 (± 0.9%) i/s - 540.000 in 5.033850s
210
- selma-document-small 118.867 (±31.1%) i/s - 540.000 in 5.080726s
286
+ nokogiri-lg 0.895 (± 0.0%) i/s - 27.000 in 30.300832s
287
+ nokolexbor-lg 2.163 (± 0.0%) i/s - 65.000 in 30.085656s
288
+ selma-lg 5.867 (± 0.0%) i/s - 176.000 in 30.006240s
289
+
290
+ Comparison:
291
+ selma-lg: 5.9 i/s
292
+ nokolexbor-lg: 2.2 i/s - 2.71x slower
293
+ nokogiri-lg: 0.9 i/s - 6.55x slower
211
294
  </pre>
212
295
  </details>
296
+ <!-- prettier-ignore-end -->
213
297
 
214
298
  ## Contributing
215
299
 
data/ext/selma/Cargo.toml CHANGED
@@ -6,7 +6,7 @@ rust-version = "1.75.0"
6
6
  publish = false
7
7
 
8
8
  [dependencies]
9
- enum-iterator = "1.4"
9
+ enum-iterator = "2.1"
10
10
  escapist = "0.0.2"
11
11
  magnus = "0.6"
12
12
  lol_html = "1.2"
@@ -211,20 +211,23 @@ impl SelmaSanitizer {
211
211
  }
212
212
  Some(protocol_list) => protocol_list.push(allowed_protocol.to_string()),
213
213
  }
214
- } else if allowed_protocol.is_kind_of(class::symbol())
215
- && allowed_protocol.inspect() == ":relative"
216
- {
217
- match protocol_list {
218
- None => {
219
- protocol_sanitizers.insert(
220
- attr_name.to_string(),
221
- vec!["#".to_string(), "/".to_string()],
222
- );
223
- }
224
- Some(protocol_list) => {
225
- protocol_list.push("#".to_string());
226
- protocol_list.push("/".to_string());
214
+ } else if allowed_protocol.is_kind_of(class::symbol()) {
215
+ let protocol_config = allowed_protocol.inspect();
216
+ if protocol_config == ":relative" {
217
+ match protocol_list {
218
+ None => {
219
+ protocol_sanitizers.insert(
220
+ attr_name.to_string(),
221
+ vec!["#".to_string(), "/".to_string()],
222
+ );
223
+ }
224
+ Some(protocol_list) => {
225
+ protocol_list.push("#".to_string());
226
+ protocol_list.push("/".to_string());
227
+ }
227
228
  }
229
+ } else if protocol_config == ":all" {
230
+ protocol_sanitizers.insert(attr_name.to_string(), vec!["all".to_string()]);
228
231
  }
229
232
  }
230
233
  }
@@ -388,6 +391,10 @@ impl SelmaSanitizer {
388
391
  }
389
392
 
390
393
  fn has_allowed_protocol(protocols_allowed: &[String], attr_val: &String) -> bool {
394
+ if protocols_allowed.contains(&"all".to_string()) {
395
+ return true;
396
+ }
397
+
391
398
  // FIXME: is there a more idiomatic way to do this?
392
399
  let mut pos: usize = 0;
393
400
  let mut chars = attr_val.chars();
@@ -28,7 +28,7 @@ module Selma
28
28
 
29
29
  # URL handling protocols to allow in specific attributes. By default, no
30
30
  # protocols are allowed. Use :relative in place of a protocol if you want
31
- # to allow relative URLs sans protocol.
31
+ # to allow relative URLs sans protocol. Set to `:all` to allow any protocol.
32
32
  protocols: {},
33
33
 
34
34
  # An Array of element names whose contents will be removed. The contents
@@ -16,6 +16,7 @@ module Selma
16
16
  "colgroup",
17
17
  "data",
18
18
  "del",
19
+ "details",
19
20
  "div",
20
21
  "figcaption",
21
22
  "figure",
@@ -66,7 +66,12 @@ module Selma
66
66
  end
67
67
 
68
68
  def allow_protocol(element, attr, protos)
69
- protos = [protos] unless protos.is_a?(Array)
69
+ if protos.is_a?(Array)
70
+ raise ArgumentError, "`:all` must be passed outside of an array" if protos.include?(:all)
71
+ else
72
+ protos = [protos]
73
+ end
74
+
70
75
  set_allowed_protocols(element, attr, protos)
71
76
  end
72
77
 
data/lib/selma/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Selma
4
- VERSION = "0.2.2"
4
+ VERSION = "0.3.0"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: selma
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Garen J. Torikian
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-01-11 00:00:00.000000000 Z
11
+ date: 2024-06-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -52,7 +52,7 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '1.2'
55
- description:
55
+ description:
56
56
  email:
57
57
  - gjtorikian@gmail.com
58
58
  executables: []
@@ -88,7 +88,7 @@ files:
88
88
  - lib/selma/sanitizer/config/restricted.rb
89
89
  - lib/selma/selector.rb
90
90
  - lib/selma/version.rb
91
- homepage:
91
+ homepage:
92
92
  licenses:
93
93
  - MIT
94
94
  metadata:
@@ -96,7 +96,7 @@ metadata:
96
96
  funding_uri: https://github.com/sponsors/gjtorikian/
97
97
  source_code_uri: https://github.com/gjtorikian/selma
98
98
  rubygems_mfa_required: 'true'
99
- post_install_message:
99
+ post_install_message:
100
100
  rdoc_options: []
101
101
  require_paths:
102
102
  - lib
@@ -112,7 +112,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
112
112
  version: 3.3.22
113
113
  requirements: []
114
114
  rubygems_version: 3.5.3
115
- signing_key:
115
+ signing_key:
116
116
  specification_version: 4
117
117
  summary: Selma selects and matches HTML nodes using CSS rules. Backed by Rust's lol_html
118
118
  parser.