selma 0.2.2-x86_64-darwin → 0.4.1-x86_64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +130 -24
- data/lib/selma/3.1/selma.bundle +0 -0
- data/lib/selma/3.2/selma.bundle +0 -0
- data/lib/selma/3.3/selma.bundle +0 -0
- data/lib/selma/config.rb +12 -0
- data/lib/selma/sanitizer/config/default.rb +1 -1
- data/lib/selma/sanitizer/config/relaxed.rb +1 -0
- data/lib/selma/sanitizer.rb +6 -1
- data/lib/selma/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 61454a1a26fdd81111379af2a15a22d58219c705e7d8af31dcfd495cbba7bb15
|
4
|
+
data.tar.gz: 8b37a785ccab776ba5c5a278fd8a548d3d72846b561fef5c1bb807d5d748eab4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 235dc124ab1faf1c3713d2f67b3a6ccc7c134791b4ca8613be8deaa7da9c7953042d8f2bceab5157bbc4893c0a7c4455a1b4c594e001d41b13049a671ad0bf28
|
7
|
+
data.tar.gz: 47c5f830c2ed780afd21e222e6a521b84e1deb84309e75c6a7a8d477d1a0f80684894403bff6bd83f517fef126a5900434e7cd98d450ec190d39f11cc8064450
|
data/README.md
CHANGED
@@ -76,7 +76,7 @@ attributes: {
|
|
76
76
|
|
77
77
|
# URL handling protocols to allow in specific attributes. By default, no
|
78
78
|
# protocols are allowed. Use :relative in place of a protocol if you want
|
79
|
-
# to allow relative URLs sans protocol.
|
79
|
+
# to allow relative URLs sans protocol. Set to `:all` to allow any protocol.
|
80
80
|
protocols: {
|
81
81
|
"a" => { "href" => ["http", "https", "mailto", :relative] },
|
82
82
|
"img" => { "href" => ["http", "https"] },
|
@@ -103,7 +103,11 @@ Here's an example which rewrites the `href` attribute on `a` and the `src` attri
|
|
103
103
|
|
104
104
|
```ruby
|
105
105
|
class MatchAttribute
|
106
|
-
SELECTOR = Selma::Selector(match_element: %(a[href^="http:"], img[src^="http:"]"))
|
106
|
+
SELECTOR = Selma::Selector.new(match_element: %(a[href^="http:"], img[src^="http:"]))
|
107
|
+
|
108
|
+
def selector
|
109
|
+
SELECTOR
|
110
|
+
end
|
107
111
|
|
108
112
|
def handle_element(element)
|
109
113
|
if element.tag_name == "a"
|
@@ -176,40 +180,142 @@ The `element` argument in `handle_element` has the following methods:
|
|
176
180
|
- `after(content, as: content_type)`: Inserts `content` after the text. `content_type` is either `:text` or `:html` and determines how the content will be applied.
|
177
181
|
- `replace(content, as: content_type)`: Replaces the text node with `content`. `content_type` is either `:text` or `:html` and determines how the content will be applied.
|
178
182
|
|
183
|
+
## Security
|
184
|
+
|
185
|
+
Theoretically, a malicious user can provide a very large document for processing, which can exhaust the memory of the host machine. To set a limit on how much string content is processed at once, you can provide `memory` options:
|
186
|
+
|
187
|
+
```ruby
|
188
|
+
Selma::Rewriter.new(options: { memory: { max_allowed_memory_usage: 1_000_000 } }) # ~1MB
|
189
|
+
```
|
190
|
+
|
191
|
+
The structure of the `memory` options looks like this:
|
192
|
+
```ruby
|
193
|
+
{
|
194
|
+
memory: {
|
195
|
+
max_allowed_memory_usage: 1000,
|
196
|
+
preallocated_parsing_buffer_size: 100,
|
197
|
+
}
|
198
|
+
}
|
199
|
+
```
|
200
|
+
|
201
|
+
Note that `preallocated_parsing_buffer_size` must always be less than `max_allowed_memory_usage`. See [the `lol_html` project documentation](https://docs.rs/lol_html/1.2.1/lol_html/struct.MemorySettings.html) to learn more about the default values.
|
202
|
+
|
179
203
|
## Benchmarks
|
180
204
|
|
205
|
+
When you run `bundle exec rake benchmark`, two different benchmarks are calculated. Here are those results on my machine.
|
206
|
+
|
207
|
+
### Benchmarks for just the sanitization process
|
208
|
+
|
209
|
+
Comparing Selma against popular Ruby sanitization gems:
|
210
|
+
|
211
|
+
<!-- prettier-ignore-start -->
|
181
212
|
<details>
|
182
213
|
<pre>
|
183
|
-
|
184
|
-
|
214
|
+
input size = 25309 bytes, 0.03 MB
|
215
|
+
|
216
|
+
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
|
185
217
|
Warming up --------------------------------------
|
186
|
-
sanitize-
|
187
|
-
|
188
|
-
selma-document-huge 1.000 i/100ms
|
218
|
+
sanitize-sm 15.000 i/100ms
|
219
|
+
selma-sm 127.000 i/100ms
|
189
220
|
Calculating -------------------------------------
|
190
|
-
sanitize-
|
191
|
-
|
192
|
-
|
221
|
+
sanitize-sm 157.643 (± 1.9%) i/s - 4.740k in 30.077172s
|
222
|
+
selma-sm 1.278k (± 1.5%) i/s - 38.354k in 30.019722s
|
223
|
+
|
224
|
+
Comparison:
|
225
|
+
selma-sm: 1277.9 i/s
|
226
|
+
sanitize-sm: 157.6 i/s - 8.11x slower
|
227
|
+
|
228
|
+
input size = 86686 bytes, 0.09 MB
|
229
|
+
|
230
|
+
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
|
193
231
|
Warming up --------------------------------------
|
194
|
-
sanitize-
|
195
|
-
|
196
|
-
selma-document-medium
|
197
|
-
22.000 i/100ms
|
232
|
+
sanitize-md 4.000 i/100ms
|
233
|
+
selma-md 33.000 i/100ms
|
198
234
|
Calculating -------------------------------------
|
199
|
-
sanitize-
|
200
|
-
|
201
|
-
|
202
|
-
|
235
|
+
sanitize-md 40.034 (± 5.0%) i/s - 1.200k in 30.043322s
|
236
|
+
selma-md 332.959 (± 2.1%) i/s - 9.999k in 30.045733s
|
237
|
+
|
238
|
+
Comparison:
|
239
|
+
selma-md: 333.0 i/s
|
240
|
+
sanitize-md: 40.0 i/s - 8.32x slower
|
241
|
+
|
242
|
+
input size = 7172510 bytes, 7.17 MB
|
243
|
+
|
244
|
+
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
|
245
|
+
Warming up --------------------------------------
|
246
|
+
sanitize-lg 1.000 i/100ms
|
247
|
+
selma-lg 1.000 i/100ms
|
248
|
+
Calculating -------------------------------------
|
249
|
+
sanitize-lg 0.141 (± 0.0%) i/s - 5.000 in 35.426127s
|
250
|
+
selma-lg 3.963 (± 0.0%) i/s - 119.000 in 30.037386s
|
251
|
+
|
252
|
+
Comparison:
|
253
|
+
selma-lg: 4.0 i/s
|
254
|
+
sanitize-lg: 0.1 i/s - 28.03x slower
|
255
|
+
</pre>
|
256
|
+
</details>
|
257
|
+
<!-- prettier-ignore-end -->
|
258
|
+
|
259
|
+
### Benchmarks for just the rewriting process
|
260
|
+
|
261
|
+
Comparing Selma against popular Ruby HTML parsing gems:
|
262
|
+
|
263
|
+
<!-- prettier-ignore-start -->
|
264
|
+
<details>
|
265
|
+
<pre>
|
266
|
+
input size = 25309 bytes, 0.03 MB
|
267
|
+
|
268
|
+
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
|
269
|
+
Warming up --------------------------------------
|
270
|
+
nokogiri-sm 79.000 i/100ms
|
271
|
+
nokolexbor-sm 295.000 i/100ms
|
272
|
+
selma-sm 237.000 i/100ms
|
273
|
+
Calculating -------------------------------------
|
274
|
+
nokogiri-sm 800.531 (± 2.2%) i/s - 24.016k in 30.016056s
|
275
|
+
nokolexbor-sm 3.033k (± 3.6%) i/s - 91.155k in 30.094884s
|
276
|
+
selma-sm 2.386k (± 1.6%) i/s - 71.574k in 30.001701s
|
277
|
+
|
278
|
+
Comparison:
|
279
|
+
nokolexbor-sm: 3033.1 i/s
|
280
|
+
selma-sm: 2386.3 i/s - 1.27x slower
|
281
|
+
nokogiri-sm: 800.5 i/s - 3.79x slower
|
282
|
+
|
283
|
+
input size = 86686 bytes, 0.09 MB
|
284
|
+
|
285
|
+
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
|
286
|
+
Warming up --------------------------------------
|
287
|
+
nokogiri-md 8.000 i/100ms
|
288
|
+
nokolexbor-md 43.000 i/100ms
|
289
|
+
selma-md 38.000 i/100ms
|
290
|
+
Calculating -------------------------------------
|
291
|
+
nokogiri-md 85.013 (± 8.2%) i/s - 2.024k in 52.257472s
|
292
|
+
nokolexbor-md 416.074 (±11.1%) i/s - 12.341k in 30.111613s
|
293
|
+
selma-md 361.471 (± 4.7%) i/s - 10.830k in 30.033997s
|
294
|
+
|
295
|
+
Comparison:
|
296
|
+
nokolexbor-md: 416.1 i/s
|
297
|
+
selma-md: 361.5 i/s - same-ish: difference falls within error
|
298
|
+
nokogiri-md: 85.0 i/s - 4.89x slower
|
299
|
+
|
300
|
+
input size = 7172510 bytes, 7.17 MB
|
301
|
+
|
302
|
+
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
|
203
303
|
Warming up --------------------------------------
|
204
|
-
|
205
|
-
|
206
|
-
selma-
|
304
|
+
nokogiri-lg 1.000 i/100ms
|
305
|
+
nokolexbor-lg 1.000 i/100ms
|
306
|
+
selma-lg 1.000 i/100ms
|
207
307
|
Calculating -------------------------------------
|
208
|
-
|
209
|
-
|
210
|
-
selma-
|
308
|
+
nokogiri-lg 0.805 (± 0.0%) i/s - 25.000 in 31.148730s
|
309
|
+
nokolexbor-lg 2.194 (± 0.0%) i/s - 66.000 in 30.278108s
|
310
|
+
selma-lg 5.541 (± 0.0%) i/s - 166.000 in 30.037197s
|
311
|
+
|
312
|
+
Comparison:
|
313
|
+
selma-lg: 5.5 i/s
|
314
|
+
nokolexbor-lg: 2.2 i/s - 2.53x slower
|
315
|
+
nokogiri-lg: 0.8 i/s - 6.88x slower
|
211
316
|
</pre>
|
212
317
|
</details>
|
318
|
+
<!-- prettier-ignore-end -->
|
213
319
|
|
214
320
|
## Contributing
|
215
321
|
|
data/lib/selma/3.1/selma.bundle
CHANGED
Binary file
|
data/lib/selma/3.2/selma.bundle
CHANGED
Binary file
|
data/lib/selma/3.3/selma.bundle
CHANGED
Binary file
|
data/lib/selma/config.rb
ADDED
@@ -28,7 +28,7 @@ module Selma
|
|
28
28
|
|
29
29
|
# URL handling protocols to allow in specific attributes. By default, no
|
30
30
|
# protocols are allowed. Use :relative in place of a protocol if you want
|
31
|
-
# to allow relative URLs sans protocol.
|
31
|
+
# to allow relative URLs sans protocol. Set to `:all` to allow any protocol.
|
32
32
|
protocols: {},
|
33
33
|
|
34
34
|
# An Array of element names whose contents will be removed. The contents
|
data/lib/selma/sanitizer.rb
CHANGED
@@ -66,7 +66,12 @@ module Selma
|
|
66
66
|
end
|
67
67
|
|
68
68
|
def allow_protocol(element, attr, protos)
|
69
|
-
|
69
|
+
if protos.is_a?(Array)
|
70
|
+
raise ArgumentError, "`:all` must be passed outside of an array" if protos.include?(:all)
|
71
|
+
else
|
72
|
+
protos = [protos]
|
73
|
+
end
|
74
|
+
|
70
75
|
set_allowed_protocols(element, attr, protos)
|
71
76
|
end
|
72
77
|
|
data/lib/selma/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: selma
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.1
|
5
5
|
platform: x86_64-darwin
|
6
6
|
authors:
|
7
7
|
- Garen J. Torikian
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-07-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -51,6 +51,7 @@ files:
|
|
51
51
|
- lib/selma/3.1/selma.bundle
|
52
52
|
- lib/selma/3.2/selma.bundle
|
53
53
|
- lib/selma/3.3/selma.bundle
|
54
|
+
- lib/selma/config.rb
|
54
55
|
- lib/selma/extension.rb
|
55
56
|
- lib/selma/html.rb
|
56
57
|
- lib/selma/rewriter.rb
|