selma 0.2.2 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Cargo.lock +115 -114
- data/README.md +122 -24
- data/ext/selma/Cargo.toml +5 -2
- data/ext/selma/src/html/element.rs +11 -6
- data/ext/selma/src/native_ref_wrap.rs +15 -12
- data/ext/selma/src/rewriter.rs +257 -106
- data/ext/selma/src/sanitizer.rs +23 -16
- data/lib/selma/config.rb +12 -0
- data/lib/selma/sanitizer/config/default.rb +1 -1
- data/lib/selma/sanitizer/config/relaxed.rb +1 -0
- data/lib/selma/sanitizer.rb +6 -1
- data/lib/selma/version.rb +1 -1
- metadata +8 -7
data/README.md
CHANGED
@@ -76,7 +76,7 @@ attributes: {
|
|
76
76
|
|
77
77
|
# URL handling protocols to allow in specific attributes. By default, no
|
78
78
|
# protocols are allowed. Use :relative in place of a protocol if you want
|
79
|
-
# to allow relative URLs sans protocol.
|
79
|
+
# to allow relative URLs sans protocol. Set to `:all` to allow any protocol.
|
80
80
|
protocols: {
|
81
81
|
"a" => { "href" => ["http", "https", "mailto", :relative] },
|
82
82
|
"img" => { "href" => ["http", "https"] },
|
@@ -103,7 +103,11 @@ Here's an example which rewrites the `href` attribute on `a` and the `src` attri
|
|
103
103
|
|
104
104
|
```ruby
|
105
105
|
class MatchAttribute
|
106
|
-
SELECTOR = Selma::Selector(match_element: %(a[href^="http:"], img[src^="http:"]"))
|
106
|
+
SELECTOR = Selma::Selector.new(match_element: %(a[href^="http:"], img[src^="http:"]"))
|
107
|
+
|
108
|
+
def selector
|
109
|
+
SELECTOR
|
110
|
+
end
|
107
111
|
|
108
112
|
def handle_element(element)
|
109
113
|
if element.tag_name == "a"
|
@@ -176,40 +180,134 @@ The `element` argument in `handle_element` has the following methods:
|
|
176
180
|
- `after(content, as: content_type)`: Inserts `content` after the text. `content_type` is either `:text` or `:html` and determines how the content will be applied.
|
177
181
|
- `replace(content, as: content_type)`: Replaces the text node with `content`. `content_type` is either `:text` or `:html` and determines how the content will be applied.
|
178
182
|
|
183
|
+
## Security
|
184
|
+
|
185
|
+
Theoretically, a malicious user can provide a very large document for processing, which can exhaust the memory of the host machine. To set a limit on how much string content is processed at once, you can provide two options under the `memory` namespace:
|
186
|
+
|
187
|
+
```ruby
|
188
|
+
memory: {
|
189
|
+
max_allowed_memory_usage: 1000,
|
190
|
+
preallocated_parsing_buffer_size: 100,
|
191
|
+
},
|
192
|
+
```
|
193
|
+
|
194
|
+
Note that `preallocated_parsing_buffer_size` must always be less than `max_allowed_memory_usage`. See [the `lol_html` project documentation](https://docs.rs/lol_html/1.2.1/lol_html/struct.MemorySettings.html) to learn more about the default values.
|
195
|
+
|
179
196
|
## Benchmarks
|
180
197
|
|
198
|
+
When you run `bundle exec rake benchmark`, two different benchmarks are calculated. Here are the results on my machine.
|
199
|
+
|
200
|
+
### Benchmarks for just the sanitization process
|
201
|
+
|
202
|
+
Comparing Selma against popular Ruby sanitization gems:
|
203
|
+
|
204
|
+
<!-- prettier-ignore-start -->
|
181
205
|
<details>
|
182
206
|
<pre>
|
183
|
-
|
184
|
-
|
207
|
+
input size = 25309 bytes, 0.03 MB
|
208
|
+
|
209
|
+
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
|
185
210
|
Warming up --------------------------------------
|
186
|
-
sanitize-
|
187
|
-
|
188
|
-
selma-document-huge 1.000 i/100ms
|
211
|
+
sanitize-sm 16.000 i/100ms
|
212
|
+
selma-sm 214.000 i/100ms
|
189
213
|
Calculating -------------------------------------
|
190
|
-
sanitize-
|
191
|
-
|
192
|
-
|
214
|
+
sanitize-sm 171.670 (± 1.2%) i/s - 5.152k in 30.017081s
|
215
|
+
selma-sm 2.146k (± 3.0%) i/s - 64.414k in 30.058470s
|
216
|
+
|
217
|
+
Comparison:
|
218
|
+
selma-sm: 2145.8 i/s
|
219
|
+
sanitize-sm: 171.7 i/s - 12.50x slower
|
220
|
+
|
221
|
+
input size = 86686 bytes, 0.09 MB
|
222
|
+
|
223
|
+
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
|
193
224
|
Warming up --------------------------------------
|
194
|
-
sanitize-
|
195
|
-
|
196
|
-
selma-document-medium
|
197
|
-
22.000 i/100ms
|
225
|
+
sanitize-md 4.000 i/100ms
|
226
|
+
selma-md 56.000 i/100ms
|
198
227
|
Calculating -------------------------------------
|
199
|
-
sanitize-
|
200
|
-
|
201
|
-
|
202
|
-
|
228
|
+
sanitize-md 44.397 (± 2.3%) i/s - 1.332k in 30.022430s
|
229
|
+
selma-md 558.448 (± 1.4%) i/s - 16.800k in 30.089196s
|
230
|
+
|
231
|
+
Comparison:
|
232
|
+
selma-md: 558.4 i/s
|
233
|
+
sanitize-md: 44.4 i/s - 12.58x slower
|
234
|
+
|
235
|
+
input size = 7172510 bytes, 7.17 MB
|
236
|
+
|
237
|
+
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
|
238
|
+
Warming up --------------------------------------
|
239
|
+
sanitize-lg 1.000 i/100ms
|
240
|
+
selma-lg 1.000 i/100ms
|
241
|
+
Calculating -------------------------------------
|
242
|
+
sanitize-lg 0.163 (± 0.0%) i/s - 6.000 in 37.375628s
|
243
|
+
selma-lg 6.750 (± 0.0%) i/s - 203.000 in 30.080976s
|
244
|
+
|
245
|
+
Comparison:
|
246
|
+
selma-lg: 6.7 i/s
|
247
|
+
sanitize-lg: 0.2 i/s - 41.32x slower
|
248
|
+
</pre>
|
249
|
+
</details>
|
250
|
+
<!-- prettier-ignore-end -->
|
251
|
+
|
252
|
+
### Benchmarks for just the rewriting process
|
253
|
+
|
254
|
+
Comparing Selma against popular Ruby HTML parsing gems:
|
255
|
+
|
256
|
+
<!-- prettier-ignore-start -->
|
257
|
+
<details>
|
258
|
+
<pre>input size = 25309 bytes, 0.03 MB
|
259
|
+
|
260
|
+
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
|
261
|
+
Warming up --------------------------------------
|
262
|
+
nokogiri-sm 107.000 i/100ms
|
263
|
+
nokolexbor-sm 340.000 i/100ms
|
264
|
+
selma-sm 380.000 i/100ms
|
265
|
+
Calculating -------------------------------------
|
266
|
+
nokogiri-sm 1.073k (± 2.1%) i/s - 32.207k in 30.025474s
|
267
|
+
nokolexbor-sm 3.300k (±13.2%) i/s - 27.540k in 36.788212s
|
268
|
+
selma-sm 3.779k (± 3.4%) i/s - 113.240k in 30.013908s
|
269
|
+
|
270
|
+
Comparison:
|
271
|
+
selma-sm: 3779.4 i/s
|
272
|
+
nokolexbor-sm: 3300.1 i/s - same-ish: difference falls within error
|
273
|
+
nokogiri-sm: 1073.1 i/s - 3.52x slower
|
274
|
+
|
275
|
+
input size = 86686 bytes, 0.09 MB
|
276
|
+
|
277
|
+
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
|
278
|
+
Warming up --------------------------------------
|
279
|
+
nokogiri-md 11.000 i/100ms
|
280
|
+
nokolexbor-md 48.000 i/100ms
|
281
|
+
selma-md 53.000 i/100ms
|
282
|
+
Calculating -------------------------------------
|
283
|
+
nokogiri-md 103.998 (± 5.8%) i/s - 3.113k in 30.029932s
|
284
|
+
nokolexbor-md 428.928 (± 7.9%) i/s - 12.816k in 30.066662s
|
285
|
+
selma-md 492.190 (± 6.9%) i/s - 14.734k in 30.082943s
|
286
|
+
|
287
|
+
Comparison:
|
288
|
+
selma-md: 492.2 i/s
|
289
|
+
nokolexbor-md: 428.9 i/s - same-ish: difference falls within error
|
290
|
+
nokogiri-md: 104.0 i/s - 4.73x slower
|
291
|
+
|
292
|
+
input size = 7172510 bytes, 7.17 MB
|
293
|
+
|
294
|
+
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
|
203
295
|
Warming up --------------------------------------
|
204
|
-
|
205
|
-
|
206
|
-
selma-
|
296
|
+
nokogiri-lg 1.000 i/100ms
|
297
|
+
nokolexbor-lg 1.000 i/100ms
|
298
|
+
selma-lg 1.000 i/100ms
|
207
299
|
Calculating -------------------------------------
|
208
|
-
|
209
|
-
|
210
|
-
selma-
|
300
|
+
nokogiri-lg 0.874 (± 0.0%) i/s - 27.000 in 30.921090s
|
301
|
+
nokolexbor-lg 2.227 (± 0.0%) i/s - 67.000 in 30.137903s
|
302
|
+
selma-lg 8.354 (± 0.0%) i/s - 251.000 in 30.075227s
|
303
|
+
|
304
|
+
Comparison:
|
305
|
+
selma-lg: 8.4 i/s
|
306
|
+
nokolexbor-lg: 2.2 i/s - 3.75x slower
|
307
|
+
nokogiri-lg: 0.9 i/s - 9.56x slower
|
211
308
|
</pre>
|
212
309
|
</details>
|
310
|
+
<!-- prettier-ignore-end -->
|
213
311
|
|
214
312
|
## Contributing
|
215
313
|
|
data/ext/selma/Cargo.toml
CHANGED
@@ -6,9 +6,12 @@ rust-version = "1.75.0"
|
|
6
6
|
publish = false
|
7
7
|
|
8
8
|
[dependencies]
|
9
|
-
enum-iterator = "1
|
9
|
+
enum-iterator = "2.1"
|
10
10
|
escapist = "0.0.2"
|
11
|
-
magnus = "0.6"
|
11
|
+
magnus = { version = "0.6", features = ["rb-sys"] }
|
12
|
+
rb-sys = { version = "*", default-features = false, features = [
|
13
|
+
"stable-api-compiled-fallback",
|
14
|
+
] }
|
12
15
|
lol_html = "1.2"
|
13
16
|
|
14
17
|
[lib]
|
@@ -119,11 +119,13 @@ impl SelmaHTMLElement {
|
|
119
119
|
.iter()
|
120
120
|
.for_each(|attr| match hash.aset(attr.name(), attr.value()) {
|
121
121
|
Ok(_) => {}
|
122
|
-
Err(err) =>
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
122
|
+
Err(err) => panic!(
|
123
|
+
"{:?}",
|
124
|
+
Error::new(
|
125
|
+
exception::runtime_error(),
|
126
|
+
format!("AttributeNameError: {err:?}"),
|
127
|
+
)
|
128
|
+
),
|
127
129
|
});
|
128
130
|
}
|
129
131
|
Ok(hash)
|
@@ -139,7 +141,10 @@ impl SelmaHTMLElement {
|
|
139
141
|
.for_each(|ancestor| match array.push(RString::new(ancestor)) {
|
140
142
|
Ok(_) => {}
|
141
143
|
Err(err) => {
|
142
|
-
|
144
|
+
panic!(
|
145
|
+
"{:?}",
|
146
|
+
Error::new(exception::runtime_error(), format!("{err:?}"))
|
147
|
+
)
|
143
148
|
}
|
144
149
|
});
|
145
150
|
|
@@ -1,15 +1,17 @@
|
|
1
|
-
use std::{
|
1
|
+
use std::{
|
2
|
+
marker::PhantomData,
|
3
|
+
sync::{Arc, Mutex},
|
4
|
+
};
|
2
5
|
|
3
|
-
// NOTE:
|
4
|
-
// but it was taken from https://github.com/cloudflare/lol-html/blob/1a1ab2e2bf896f815fe8888ed78ccdf46d7c6b85/js-api/src/lib.rs#LL38
|
6
|
+
// NOTE: this was taken from https://github.com/cloudflare/lol-html/blob/1a1ab2e2bf896f815fe8888ed78ccdf46d7c6b85/js-api/src/lib.rs#LL38
|
5
7
|
|
6
8
|
pub struct Anchor<'r> {
|
7
|
-
poisoned:
|
9
|
+
poisoned: Arc<Mutex<bool>>,
|
8
10
|
lifetime: PhantomData<&'r mut ()>,
|
9
11
|
}
|
10
12
|
|
11
13
|
impl<'r> Anchor<'r> {
|
12
|
-
pub fn new(poisoned:
|
14
|
+
pub fn new(poisoned: Arc<Mutex<bool>>) -> Self {
|
13
15
|
Anchor {
|
14
16
|
poisoned,
|
15
17
|
lifetime: PhantomData,
|
@@ -19,7 +21,7 @@ impl<'r> Anchor<'r> {
|
|
19
21
|
|
20
22
|
// impl Drop for Anchor<'_> {
|
21
23
|
// fn drop(&mut self) {
|
22
|
-
// self.poisoned.
|
24
|
+
// *self.poisoned.lock().unwrap() = true;
|
23
25
|
// }
|
24
26
|
// }
|
25
27
|
|
@@ -31,17 +33,17 @@ impl<'r> Anchor<'r> {
|
|
31
33
|
// object results in exception.
|
32
34
|
pub struct NativeRefWrap<R> {
|
33
35
|
inner_ptr: *mut R,
|
34
|
-
poisoned:
|
36
|
+
poisoned: Arc<Mutex<bool>>,
|
35
37
|
}
|
36
38
|
|
37
39
|
impl<R> NativeRefWrap<R> {
|
38
40
|
pub fn wrap<I>(inner: &I) -> (Self, Anchor) {
|
39
41
|
let wrap = NativeRefWrap {
|
40
42
|
inner_ptr: inner as *const I as *mut R,
|
41
|
-
poisoned:
|
43
|
+
poisoned: Arc::new(Mutex::new(false)),
|
42
44
|
};
|
43
45
|
|
44
|
-
let anchor = Anchor::new(
|
46
|
+
let anchor = Anchor::new(Arc::clone(&wrap.poisoned));
|
45
47
|
|
46
48
|
(wrap, anchor)
|
47
49
|
}
|
@@ -49,10 +51,10 @@ impl<R> NativeRefWrap<R> {
|
|
49
51
|
pub fn wrap_mut<I>(inner: &mut I) -> (Self, Anchor) {
|
50
52
|
let wrap = NativeRefWrap {
|
51
53
|
inner_ptr: inner as *mut I as *mut R,
|
52
|
-
poisoned:
|
54
|
+
poisoned: Arc::new(Mutex::new(false)),
|
53
55
|
};
|
54
56
|
|
55
|
-
let anchor = Anchor::new(
|
57
|
+
let anchor = Anchor::new(Arc::clone(&wrap.poisoned));
|
56
58
|
|
57
59
|
(wrap, anchor)
|
58
60
|
}
|
@@ -70,7 +72,8 @@ impl<R> NativeRefWrap<R> {
|
|
70
72
|
}
|
71
73
|
|
72
74
|
fn assert_not_poisoned(&self) -> Result<(), &'static str> {
|
73
|
-
|
75
|
+
let lock = self.poisoned.lock().unwrap();
|
76
|
+
if *lock {
|
74
77
|
Err("The object has been freed and can't be used anymore.")
|
75
78
|
} else {
|
76
79
|
Ok(())
|