selma 0.4.0 → 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 22f8e5128ea3eea52860c70f3fbd9d3f4d2b1139cccab700611e2aedf032fa99
4
- data.tar.gz: 07e888bed12f8e2e7bbf7ab92ebe691464c017e6dbe43faafd18e10e50bfc92c
3
+ metadata.gz: e5b5e70ce5e954bbd53e8df94b58facb21a78174562f001ce328f9d3ba4b09a3
4
+ data.tar.gz: 4360c58ce183593b8b5ec8c2bc2a15090aa5ed86f3e8a69b5975111274e01749
5
5
  SHA512:
6
- metadata.gz: f92066c8d7a4bd1357f852c34f4f050a7c863e250e0fe9cf742186aa82bba410738d0c25d209d0a13dbbcd9cb354012e3d10762193b9a40f690cad4c5293aa74
7
- data.tar.gz: 6773e0af5e365fafc6b76a3bdae555f02d87a06a488abbe8cb75c964cce9846b0f46e0acfeb6bf628db2f9c56ed236afcf58d49c405f4cff13ba5bd5f2b87a63
6
+ metadata.gz: 9528b3320b2bbe90bf17ee1cffe17e6ab65f58ebae1c8edda77f61d2c4b8e8c08782b4bf11cbbaae50610c44aa0dc69c58ca70c4c943a96af84285db167929bd
7
+ data.tar.gz: 8cf5ebd4259fa09b07b7908a2de795746a45cf7fc25eb3243affe373e0c90a04760cd2d8798367ffc3abe343e5b499a9253b2f01ac195843a067c8cbc88701a5
data/README.md CHANGED
@@ -182,13 +182,20 @@ The `element` argument in `handle_element` has the following methods:
182
182
 
183
183
  ## Security
184
184
 
185
- Theoretically, a malicious user can provide a very large document for processing, which can exhaust the memory of the host machine. To set a limit on how much string content is processed at once, you can provide two options into the `memory` namespace:
185
+ Theoretically, a malicious user can provide a very large document for processing, which can exhaust the memory of the host machine. To set a limit on how much string content is processed at once, you can provide `memory` options:
186
186
 
187
187
  ```ruby
188
- memory: {
189
- max_allowed_memory_usage: 1000,
190
- preallocated_parsing_buffer_size: 100,
191
- },
188
+ Selma::Rewriter.new(options: { memory: { max_allowed_memory_usage: 1_000_000 } }) # ~1MB
189
+ ```
190
+
191
+ The structure of the `memory` options looks like this:
192
+ ```ruby
193
+ {
194
+ memory: {
195
+ max_allowed_memory_usage: 1000,
196
+ preallocated_parsing_buffer_size: 100,
197
+ }
198
+ }
192
199
  ```
193
200
 
194
201
  Note that `preallocated_parsing_buffer_size` must always be less than `max_allowed_memory_usage`. See [the `lol_html` project documentation](https://docs.rs/lol_html/1.2.1/lol_html/struct.MemorySettings.html) to learn more about the default values.
@@ -208,29 +215,29 @@ input size = 25309 bytes, 0.03 MB
208
215
 
209
216
  ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
210
217
  Warming up --------------------------------------
211
- sanitize-sm 16.000 i/100ms
212
- selma-sm 214.000 i/100ms
218
+ sanitize-sm 15.000 i/100ms
219
+ selma-sm 127.000 i/100ms
213
220
  Calculating -------------------------------------
214
- sanitize-sm 171.670 (± 1.2%) i/s - 5.152k in 30.017081s
215
- selma-sm 2.146k (± 3.0%) i/s - 64.414k in 30.058470s
221
+ sanitize-sm 157.643 (± 1.9%) i/s - 4.740k in 30.077172s
222
+ selma-sm 1.278k (± 1.5%) i/s - 38.354k in 30.019722s
216
223
 
217
224
  Comparison:
218
- selma-sm: 2145.8 i/s
219
- sanitize-sm: 171.7 i/s - 12.50x slower
225
+ selma-sm: 1277.9 i/s
226
+ sanitize-sm: 157.6 i/s - 8.11x slower
220
227
 
221
228
  input size = 86686 bytes, 0.09 MB
222
229
 
223
230
  ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
224
231
  Warming up --------------------------------------
225
232
  sanitize-md 4.000 i/100ms
226
- selma-md 56.000 i/100ms
233
+ selma-md 33.000 i/100ms
227
234
  Calculating -------------------------------------
228
- sanitize-md 44.397 (± 2.3%) i/s - 1.332k in 30.022430s
229
- selma-md 558.448 (± 1.4%) i/s - 16.800k in 30.089196s
235
+ sanitize-md 40.034 (± 5.0%) i/s - 1.200k in 30.043322s
236
+ selma-md 332.959 (± 2.1%) i/s - 9.999k in 30.045733s
230
237
 
231
238
  Comparison:
232
- selma-md: 558.4 i/s
233
- sanitize-md: 44.4 i/s - 12.58x slower
239
+ selma-md: 333.0 i/s
240
+ sanitize-md: 40.0 i/s - 8.32x slower
234
241
 
235
242
  input size = 7172510 bytes, 7.17 MB
236
243
 
@@ -239,12 +246,12 @@ Warming up --------------------------------------
239
246
  sanitize-lg 1.000 i/100ms
240
247
  selma-lg 1.000 i/100ms
241
248
  Calculating -------------------------------------
242
- sanitize-lg 0.163 (± 0.0%) i/s - 6.000 in 37.375628s
243
- selma-lg 6.750 (± 0.0%) i/s - 203.000 in 30.080976s
249
+ sanitize-lg 0.141 (± 0.0%) i/s - 5.000 in 35.426127s
250
+ selma-lg 3.963 (± 0.0%) i/s - 119.000 in 30.037386s
244
251
 
245
252
  Comparison:
246
- selma-lg: 6.7 i/s
247
- sanitize-lg: 0.2 i/s - 41.32x slower
253
+ selma-lg: 4.0 i/s
254
+ sanitize-lg: 0.1 i/s - 28.03x slower
248
255
  </pre>
249
256
  </details>
250
257
  <!-- prettier-ignore-end -->
@@ -255,39 +262,40 @@ Comparing Selma against popular Ruby HTML parsing gems:
255
262
 
256
263
  <!-- prettier-ignore-start -->
257
264
  <details>
258
- <pre>input size = 25309 bytes, 0.03 MB
265
+ <pre>
266
+ input size = 25309 bytes, 0.03 MB
259
267
 
260
268
  ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
261
269
  Warming up --------------------------------------
262
- nokogiri-sm 107.000 i/100ms
263
- nokolexbor-sm 340.000 i/100ms
264
- selma-sm 380.000 i/100ms
270
+ nokogiri-sm 79.000 i/100ms
271
+ nokolexbor-sm 295.000 i/100ms
272
+ selma-sm 237.000 i/100ms
265
273
  Calculating -------------------------------------
266
- nokogiri-sm 1.073k (± 2.1%) i/s - 32.207k in 30.025474s
267
- nokolexbor-sm 3.300k (±13.2%) i/s - 27.540k in 36.788212s
268
- selma-sm 3.779k (± 3.4%) i/s - 113.240k in 30.013908s
274
+ nokogiri-sm 800.531 (± 2.2%) i/s - 24.016k in 30.016056s
275
+ nokolexbor-sm 3.033k (± 3.6%) i/s - 91.155k in 30.094884s
276
+ selma-sm 2.386k (± 1.6%) i/s - 71.574k in 30.001701s
269
277
 
270
278
  Comparison:
271
- selma-sm: 3779.4 i/s
272
- nokolexbor-sm: 3300.1 i/s - same-ish: difference falls within error
273
- nokogiri-sm: 1073.1 i/s - 3.52x slower
279
+ nokolexbor-sm: 3033.1 i/s
280
+ selma-sm: 2386.3 i/s - 1.27x slower
281
+ nokogiri-sm: 800.5 i/s - 3.79x slower
274
282
 
275
283
  input size = 86686 bytes, 0.09 MB
276
284
 
277
285
  ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
278
286
  Warming up --------------------------------------
279
- nokogiri-md 11.000 i/100ms
280
- nokolexbor-md 48.000 i/100ms
281
- selma-md 53.000 i/100ms
287
+ nokogiri-md 8.000 i/100ms
288
+ nokolexbor-md 43.000 i/100ms
289
+ selma-md 38.000 i/100ms
282
290
  Calculating -------------------------------------
283
- nokogiri-md 103.998 (± 5.8%) i/s - 3.113k in 30.029932s
284
- nokolexbor-md 428.928 (± 7.9%) i/s - 12.816k in 30.066662s
285
- selma-md 492.190 (± 6.9%) i/s - 14.734k in 30.082943s
291
+ nokogiri-md 85.013 (± 8.2%) i/s - 2.024k in 52.257472s
292
+ nokolexbor-md 416.074 (±11.1%) i/s - 12.341k in 30.111613s
293
+ selma-md 361.471 (± 4.7%) i/s - 10.830k in 30.033997s
286
294
 
287
295
  Comparison:
288
- selma-md: 492.2 i/s
289
- nokolexbor-md: 428.9 i/s - same-ish: difference falls within error
290
- nokogiri-md: 104.0 i/s - 4.73x slower
296
+ nokolexbor-md: 416.1 i/s
297
+ selma-md: 361.5 i/s - same-ish: difference falls within error
298
+ nokogiri-md: 85.0 i/s - 4.89x slower
291
299
 
292
300
  input size = 7172510 bytes, 7.17 MB
293
301
 
@@ -297,14 +305,14 @@ Warming up --------------------------------------
297
305
  nokolexbor-lg 1.000 i/100ms
298
306
  selma-lg 1.000 i/100ms
299
307
  Calculating -------------------------------------
300
- nokogiri-lg 0.874 (± 0.0%) i/s - 27.000 in 30.921090s
301
- nokolexbor-lg 2.227 (± 0.0%) i/s - 67.000 in 30.137903s
302
- selma-lg 8.354 (± 0.0%) i/s - 251.000 in 30.075227s
308
+ nokogiri-lg 0.805 (± 0.0%) i/s - 25.000 in 31.148730s
309
+ nokolexbor-lg 2.194 (± 0.0%) i/s - 66.000 in 30.278108s
310
+ selma-lg 5.541 (± 0.0%) i/s - 166.000 in 30.037197s
303
311
 
304
312
  Comparison:
305
- selma-lg: 8.4 i/s
306
- nokolexbor-lg: 2.2 i/s - 3.75x slower
307
- nokogiri-lg: 0.9 i/s - 9.56x slower
313
+ selma-lg: 5.5 i/s
314
+ nokolexbor-lg: 2.2 i/s - 2.53x slower
315
+ nokogiri-lg: 0.8 i/s - 6.88x slower
308
316
  </pre>
309
317
  </details>
310
318
  <!-- prettier-ignore-end -->
@@ -273,10 +273,8 @@ impl SelmaRewriter {
273
273
  let binding = self.0.borrow();
274
274
 
275
275
  let mut sanitizer_document_content_handlers: Vec<DocumentContentHandlers> = vec![];
276
- let mut sanitizer_initial_element_content_handlers: Vec<(
277
- Cow<Selector>,
278
- ElementContentHandlers,
279
- )> = vec![];
276
+ let mut sanitizer_element_content_handlers: Vec<(Cow<Selector>, ElementContentHandlers)> =
277
+ vec![];
280
278
 
281
279
  match &binding.sanitizer {
282
280
  None => (),
@@ -293,7 +291,7 @@ impl SelmaRewriter {
293
291
  Ok(())
294
292
  }));
295
293
  }
296
- sanitizer_initial_element_content_handlers.push(element!("*", |el| {
294
+ sanitizer_element_content_handlers.push(element!("*", |el| {
297
295
  sanitizer.try_remove_element(el);
298
296
  if el.removed() {
299
297
  return Ok(());
@@ -311,7 +309,7 @@ impl SelmaRewriter {
311
309
  match Self::perform_handler_rewrite(
312
310
  self,
313
311
  sanitizer_document_content_handlers,
314
- sanitizer_initial_element_content_handlers,
312
+ sanitizer_element_content_handlers,
315
313
  handlers,
316
314
  html,
317
315
  ) {
@@ -367,7 +365,7 @@ impl SelmaRewriter {
367
365
  pub fn perform_handler_rewrite(
368
366
  &self,
369
367
  sanitizer_document_content_handlers: Vec<DocumentContentHandlers>,
370
- sanitizer_initial_element_content_handlers: Vec<(Cow<Selector>, ElementContentHandlers)>,
368
+ sanitizer_element_content_handlers: Vec<(Cow<Selector>, ElementContentHandlers)>,
371
369
  handlers: &[Handler],
372
370
  html: String,
373
371
  ) -> Result<Vec<u8>, magnus::Error> {
@@ -453,14 +451,25 @@ impl SelmaRewriter {
453
451
  }));
454
452
  });
455
453
 
456
- element_content_handlers.extend(sanitizer_initial_element_content_handlers);
457
-
458
- Self::run_rewrite(
454
+ let rewritten_html = Self::run_rewrite(
459
455
  self,
460
456
  sanitizer_document_content_handlers,
461
457
  element_content_handlers,
462
458
  html.as_bytes(),
463
- )
459
+ );
460
+
461
+ // sanitization must happen separately, because text chunks
462
+ // could potentially have rewritten the html. ideally we'd
463
+ // be able to sanitize around the `process_text_handlers` call
464
+ match rewritten_html {
465
+ Ok(rewritten_html) => Self::run_rewrite(
466
+ self,
467
+ vec![],
468
+ sanitizer_element_content_handlers,
469
+ rewritten_html.as_slice(),
470
+ ),
471
+ Err(err) => Err(err),
472
+ }
464
473
  }
465
474
 
466
475
  fn run_rewrite(
data/lib/selma/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Selma
4
- VERSION = "0.4.0"
4
+ VERSION = "0.4.1"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: selma
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Garen J. Torikian