selma 0.3.0 → 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -180,6 +180,26 @@ The `element` argument in `handle_element` has the following methods:
180
180
  - `after(content, as: content_type)`: Inserts `content` after the text. `content_type` is either `:text` or `:html` and determines how the content will be applied.
181
181
  - `replace(content, as: content_type)`: Replaces the text node with `content`. `content_type` is either `:text` or `:html` and determines how the content will be applied.
182
182
 
183
+ ## Security
184
+
185
+ Theoretically, a malicious user can provide a very large document for processing, which can exhaust the memory of the host machine. To set a limit on how much string content is processed at once, you can provide `memory` options:
186
+
187
+ ```ruby
188
+ Selma::Rewriter.new(options: { memory: { max_allowed_memory_usage: 1_000_000 } }) # ~1MB
189
+ ```
190
+
191
+ The structure of the `memory` options looks like this:
192
+ ```ruby
193
+ {
194
+ memory: {
195
+ max_allowed_memory_usage: 1000,
196
+ preallocated_parsing_buffer_size: 100,
197
+ }
198
+ }
199
+ ```
200
+
201
+ Note that `preallocated_parsing_buffer_size` must always be less than `max_allowed_memory_usage`. See [the `lol_html` project documentation](https://docs.rs/lol_html/1.2.1/lol_html/struct.MemorySettings.html) to learn more about the default values.
202
+
183
203
  ## Benchmarks
184
204
 
185
205
  When you run `bundle exec rake benchmark`, two different benchmarks are calculated. Here are those results on my machine.
@@ -191,30 +211,33 @@ Comparing Selma against popular Ruby sanitization gems:
191
211
  <!-- prettier-ignore-start -->
192
212
  <details>
193
213
  <pre>
214
+ input size = 25309 bytes, 0.03 MB
215
+
216
+ ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
194
217
  Warming up --------------------------------------
195
218
  sanitize-sm 15.000 i/100ms
196
- selma-sm 126.000 i/100ms
219
+ selma-sm 127.000 i/100ms
197
220
  Calculating -------------------------------------
198
- sanitize-sm 155.074 (± 1.9%) i/s - 4.665k in 30.092214s
199
- selma-sm 1.290k (± 1.3%) i/s - 38.808k in 30.085333s
221
+ sanitize-sm 157.643 (± 1.9%) i/s - 4.740k in 30.077172s
222
+ selma-sm 1.278k (± 1.5%) i/s - 38.354k in 30.019722s
200
223
 
201
224
  Comparison:
202
- selma-sm: 1290.1 i/s
203
- sanitize-sm: 155.1 i/s - 8.32x slower
225
+ selma-sm: 1277.9 i/s
226
+ sanitize-sm: 157.6 i/s - 8.11x slower
204
227
 
205
228
  input size = 86686 bytes, 0.09 MB
206
229
 
207
230
  ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
208
231
  Warming up --------------------------------------
209
- sanitize-md 3.000 i/100ms
232
+ sanitize-md 4.000 i/100ms
210
233
  selma-md 33.000 i/100ms
211
234
  Calculating -------------------------------------
212
- sanitize-md 40.321 (± 5.0%) i/s - 1.206k in 30.004711s
213
- selma-md 337.417 (± 1.5%) i/s - 10.131k in 30.032772s
235
+ sanitize-md 40.034 (± 5.0%) i/s - 1.200k in 30.043322s
236
+ selma-md 332.959 (± 2.1%) i/s - 9.999k in 30.045733s
214
237
 
215
238
  Comparison:
216
- selma-md: 337.4 i/s
217
- sanitize-md: 40.3 i/s - 8.37x slower
239
+ selma-md: 333.0 i/s
240
+ sanitize-md: 40.0 i/s - 8.32x slower
218
241
 
219
242
  input size = 7172510 bytes, 7.17 MB
220
243
 
@@ -223,12 +246,12 @@ Warming up --------------------------------------
223
246
  sanitize-lg 1.000 i/100ms
224
247
  selma-lg 1.000 i/100ms
225
248
  Calculating -------------------------------------
226
- sanitize-lg 0.144 (± 0.0%) i/s - 5.000 in 34.772526s
227
- selma-lg 4.026 (± 0.0%) i/s - 121.000 in 30.067415s
249
+ sanitize-lg 0.141 (± 0.0%) i/s - 5.000 in 35.426127s
250
+ selma-lg 3.963 (± 0.0%) i/s - 119.000 in 30.037386s
228
251
 
229
252
  Comparison:
230
253
  selma-lg: 4.0 i/s
231
- sanitize-lg: 0.1 i/s - 27.99x slower
254
+ sanitize-lg: 0.1 i/s - 28.03x slower
232
255
  </pre>
233
256
  </details>
234
257
  <!-- prettier-ignore-end -->
@@ -240,23 +263,22 @@ Comparing Selma against popular Ruby HTML parsing gems:
240
263
  <!-- prettier-ignore-start -->
241
264
  <details>
242
265
  <pre>
243
-
244
266
  input size = 25309 bytes, 0.03 MB
245
267
 
246
268
  ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
247
269
  Warming up --------------------------------------
248
270
  nokogiri-sm 79.000 i/100ms
249
- nokolexbor-sm 285.000 i/100ms
250
- selma-sm 244.000 i/100ms
271
+ nokolexbor-sm 295.000 i/100ms
272
+ selma-sm 237.000 i/100ms
251
273
  Calculating -------------------------------------
252
- nokogiri-sm 807.790 (± 3.1%) i/s - 24.253k in 30.056301s
253
- nokolexbor-sm 2.880k (± 6.4%) i/s - 86.070k in 30.044766s
254
- selma-sm 2.508k (± 1.2%) i/s - 75.396k in 30.068792s
274
+ nokogiri-sm 800.531 (± 2.2%) i/s - 24.016k in 30.016056s
275
+ nokolexbor-sm 3.033k (± 3.6%) i/s - 91.155k in 30.094884s
276
+ selma-sm 2.386k (± 1.6%) i/s - 71.574k in 30.001701s
255
277
 
256
278
  Comparison:
257
- nokolexbor-sm: 2880.3 i/s
258
- selma-sm: 2507.8 i/s - 1.15x slower
259
- nokogiri-sm: 807.8 i/s - 3.57x slower
279
+ nokolexbor-sm: 3033.1 i/s
280
+ selma-sm: 2386.3 i/s - 1.27x slower
281
+ nokogiri-sm: 800.5 i/s - 3.79x slower
260
282
 
261
283
  input size = 86686 bytes, 0.09 MB
262
284
 
@@ -264,16 +286,16 @@ ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
264
286
  Warming up --------------------------------------
265
287
  nokogiri-md 8.000 i/100ms
266
288
  nokolexbor-md 43.000 i/100ms
267
- selma-md 39.000 i/100ms
289
+ selma-md 38.000 i/100ms
268
290
  Calculating -------------------------------------
269
- nokogiri-md 87.367 (± 3.4%) i/s - 2.624k in 30.061642s
270
- nokolexbor-md 438.782 (± 3.9%) i/s - 13.158k in 30.031163s
271
- selma-md 392.591 (± 3.1%) i/s - 11.778k in 30.031391s
291
+ nokogiri-md 85.013 (± 8.2%) i/s - 2.024k in 52.257472s
292
+ nokolexbor-md 416.074 (±11.1%) i/s - 12.341k in 30.111613s
293
+ selma-md 361.471 (± 4.7%) i/s - 10.830k in 30.033997s
272
294
 
273
295
  Comparison:
274
- nokolexbor-md: 438.8 i/s
275
- selma-md: 392.6 i/s - 1.12x slower
276
- nokogiri-md: 87.4 i/s - 5.02x slower
296
+ nokolexbor-md: 416.1 i/s
297
+ selma-md: 361.5 i/s - same-ish: difference falls within error
298
+ nokogiri-md: 85.0 i/s - 4.89x slower
277
299
 
278
300
  input size = 7172510 bytes, 7.17 MB
279
301
 
@@ -283,14 +305,14 @@ Warming up --------------------------------------
283
305
  nokolexbor-lg 1.000 i/100ms
284
306
  selma-lg 1.000 i/100ms
285
307
  Calculating -------------------------------------
286
- nokogiri-lg 0.895 (± 0.0%) i/s - 27.000 in 30.300832s
287
- nokolexbor-lg 2.163 (± 0.0%) i/s - 65.000 in 30.085656s
288
- selma-lg 5.867 (± 0.0%) i/s - 176.000 in 30.006240s
308
+ nokogiri-lg 0.805 (± 0.0%) i/s - 25.000 in 31.148730s
309
+ nokolexbor-lg 2.194 (± 0.0%) i/s - 66.000 in 30.278108s
310
+ selma-lg 5.541 (± 0.0%) i/s - 166.000 in 30.037197s
289
311
 
290
312
  Comparison:
291
- selma-lg: 5.9 i/s
292
- nokolexbor-lg: 2.2 i/s - 2.71x slower
293
- nokogiri-lg: 0.9 i/s - 6.55x slower
313
+ selma-lg: 5.5 i/s
314
+ nokolexbor-lg: 2.2 i/s - 2.53x slower
315
+ nokogiri-lg: 0.8 i/s - 6.88x slower
294
316
  </pre>
295
317
  </details>
296
318
  <!-- prettier-ignore-end -->
data/ext/selma/Cargo.toml CHANGED
@@ -8,7 +8,10 @@ publish = false
8
8
  [dependencies]
9
9
  enum-iterator = "2.1"
10
10
  escapist = "0.0.2"
11
- magnus = "0.6"
11
+ magnus = { version = "0.6", features = ["rb-sys"] }
12
+ rb-sys = { version = "*", default-features = false, features = [
13
+ "stable-api-compiled-fallback",
14
+ ] }
12
15
  lol_html = "1.2"
13
16
 
14
17
  [lib]
@@ -119,11 +119,13 @@ impl SelmaHTMLElement {
119
119
  .iter()
120
120
  .for_each(|attr| match hash.aset(attr.name(), attr.value()) {
121
121
  Ok(_) => {}
122
- Err(err) => Err(Error::new(
123
- exception::runtime_error(),
124
- format!("AttributeNameError: {err:?}"),
125
- ))
126
- .unwrap(),
122
+ Err(err) => panic!(
123
+ "{:?}",
124
+ Error::new(
125
+ exception::runtime_error(),
126
+ format!("AttributeNameError: {err:?}"),
127
+ )
128
+ ),
127
129
  });
128
130
  }
129
131
  Ok(hash)
@@ -139,7 +141,10 @@ impl SelmaHTMLElement {
139
141
  .for_each(|ancestor| match array.push(RString::new(ancestor)) {
140
142
  Ok(_) => {}
141
143
  Err(err) => {
142
- Err(Error::new(exception::runtime_error(), format!("{err:?}"))).unwrap()
144
+ panic!(
145
+ "{:?}",
146
+ Error::new(exception::runtime_error(), format!("{err:?}"))
147
+ )
143
148
  }
144
149
  });
145
150
 
@@ -1,15 +1,17 @@
1
- use std::{cell::Cell, marker::PhantomData, rc::Rc};
1
+ use std::{
2
+ marker::PhantomData,
3
+ sync::{Arc, Mutex},
4
+ };
2
5
 
3
- // NOTE: My Rust isn't good enough to know what any of this does,
4
- // but it was taken from https://github.com/cloudflare/lol-html/blob/1a1ab2e2bf896f815fe8888ed78ccdf46d7c6b85/js-api/src/lib.rs#LL38
6
+ // NOTE: this was taken from https://github.com/cloudflare/lol-html/blob/1a1ab2e2bf896f815fe8888ed78ccdf46d7c6b85/js-api/src/lib.rs#LL38
5
7
 
6
8
  pub struct Anchor<'r> {
7
- poisoned: Rc<Cell<bool>>,
9
+ poisoned: Arc<Mutex<bool>>,
8
10
  lifetime: PhantomData<&'r mut ()>,
9
11
  }
10
12
 
11
13
  impl<'r> Anchor<'r> {
12
- pub fn new(poisoned: Rc<Cell<bool>>) -> Self {
14
+ pub fn new(poisoned: Arc<Mutex<bool>>) -> Self {
13
15
  Anchor {
14
16
  poisoned,
15
17
  lifetime: PhantomData,
@@ -19,7 +21,7 @@ impl<'r> Anchor<'r> {
19
21
 
20
22
  // impl Drop for Anchor<'_> {
21
23
  // fn drop(&mut self) {
22
- // self.poisoned.replace(true);
24
+ // *self.poisoned.lock().unwrap() = true;
23
25
  // }
24
26
  // }
25
27
 
@@ -31,17 +33,17 @@ impl<'r> Anchor<'r> {
31
33
  // object results in exception.
32
34
  pub struct NativeRefWrap<R> {
33
35
  inner_ptr: *mut R,
34
- poisoned: Rc<Cell<bool>>,
36
+ poisoned: Arc<Mutex<bool>>,
35
37
  }
36
38
 
37
39
  impl<R> NativeRefWrap<R> {
38
40
  pub fn wrap<I>(inner: &I) -> (Self, Anchor) {
39
41
  let wrap = NativeRefWrap {
40
42
  inner_ptr: inner as *const I as *mut R,
41
- poisoned: Rc::new(Cell::new(false)),
43
+ poisoned: Arc::new(Mutex::new(false)),
42
44
  };
43
45
 
44
- let anchor = Anchor::new(Rc::clone(&wrap.poisoned));
46
+ let anchor = Anchor::new(Arc::clone(&wrap.poisoned));
45
47
 
46
48
  (wrap, anchor)
47
49
  }
@@ -49,10 +51,10 @@ impl<R> NativeRefWrap<R> {
49
51
  pub fn wrap_mut<I>(inner: &mut I) -> (Self, Anchor) {
50
52
  let wrap = NativeRefWrap {
51
53
  inner_ptr: inner as *mut I as *mut R,
52
- poisoned: Rc::new(Cell::new(false)),
54
+ poisoned: Arc::new(Mutex::new(false)),
53
55
  };
54
56
 
55
- let anchor = Anchor::new(Rc::clone(&wrap.poisoned));
57
+ let anchor = Anchor::new(Arc::clone(&wrap.poisoned));
56
58
 
57
59
  (wrap, anchor)
58
60
  }
@@ -70,7 +72,8 @@ impl<R> NativeRefWrap<R> {
70
72
  }
71
73
 
72
74
  fn assert_not_poisoned(&self) -> Result<(), &'static str> {
73
- if self.poisoned.get() {
75
+ let lock = self.poisoned.lock().unwrap();
76
+ if *lock {
74
77
  Err("The object has been freed and can't be used anymore.")
75
78
  } else {
76
79
  Ok(())