selma 0.4.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 22f8e5128ea3eea52860c70f3fbd9d3f4d2b1139cccab700611e2aedf032fa99
4
- data.tar.gz: 07e888bed12f8e2e7bbf7ab92ebe691464c017e6dbe43faafd18e10e50bfc92c
3
+ metadata.gz: e5b5e70ce5e954bbd53e8df94b58facb21a78174562f001ce328f9d3ba4b09a3
4
+ data.tar.gz: 4360c58ce183593b8b5ec8c2bc2a15090aa5ed86f3e8a69b5975111274e01749
5
5
  SHA512:
6
- metadata.gz: f92066c8d7a4bd1357f852c34f4f050a7c863e250e0fe9cf742186aa82bba410738d0c25d209d0a13dbbcd9cb354012e3d10762193b9a40f690cad4c5293aa74
7
- data.tar.gz: 6773e0af5e365fafc6b76a3bdae555f02d87a06a488abbe8cb75c964cce9846b0f46e0acfeb6bf628db2f9c56ed236afcf58d49c405f4cff13ba5bd5f2b87a63
6
+ metadata.gz: 9528b3320b2bbe90bf17ee1cffe17e6ab65f58ebae1c8edda77f61d2c4b8e8c08782b4bf11cbbaae50610c44aa0dc69c58ca70c4c943a96af84285db167929bd
7
+ data.tar.gz: 8cf5ebd4259fa09b07b7908a2de795746a45cf7fc25eb3243affe373e0c90a04760cd2d8798367ffc3abe343e5b499a9253b2f01ac195843a067c8cbc88701a5
data/README.md CHANGED
@@ -182,13 +182,20 @@ The `element` argument in `handle_element` has the following methods:
182
182
 
183
183
  ## Security
184
184
 
185
- Theoretically, a malicious user can provide a very large document for processing, which can exhaust the memory of the host machine. To set a limit on how much string content is processed at once, you can provide two options into the `memory` namespace:
185
+ Theoretically, a malicious user can provide a very large document for processing, which can exhaust the memory of the host machine. To set a limit on how much string content is processed at once, you can provide `memory` options:
186
186
 
187
187
  ```ruby
188
- memory: {
189
- max_allowed_memory_usage: 1000,
190
- preallocated_parsing_buffer_size: 100,
191
- },
188
+ Selma::Rewriter.new(options: { memory: { max_allowed_memory_usage: 1_000_000 } }) # ~1MB
189
+ ```
190
+
191
+ The structure of the `memory` options looks like this:
192
+ ```ruby
193
+ {
194
+ memory: {
195
+ max_allowed_memory_usage: 1000,
196
+ preallocated_parsing_buffer_size: 100,
197
+ }
198
+ }
192
199
  ```
193
200
 
194
201
  Note that `preallocated_parsing_buffer_size` must always be less than `max_allowed_memory_usage`. See [the `lol_html` project documentation](https://docs.rs/lol_html/1.2.1/lol_html/struct.MemorySettings.html) to learn more about the default values.
@@ -208,29 +215,29 @@ input size = 25309 bytes, 0.03 MB
208
215
 
209
216
  ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
210
217
  Warming up --------------------------------------
211
- sanitize-sm 16.000 i/100ms
212
- selma-sm 214.000 i/100ms
218
+ sanitize-sm 15.000 i/100ms
219
+ selma-sm 127.000 i/100ms
213
220
  Calculating -------------------------------------
214
- sanitize-sm 171.670 (± 1.2%) i/s - 5.152k in 30.017081s
215
- selma-sm 2.146k (± 3.0%) i/s - 64.414k in 30.058470s
221
+ sanitize-sm 157.643 (± 1.9%) i/s - 4.740k in 30.077172s
222
+ selma-sm 1.278k (± 1.5%) i/s - 38.354k in 30.019722s
216
223
 
217
224
  Comparison:
218
- selma-sm: 2145.8 i/s
219
- sanitize-sm: 171.7 i/s - 12.50x slower
225
+ selma-sm: 1277.9 i/s
226
+ sanitize-sm: 157.6 i/s - 8.11x slower
220
227
 
221
228
  input size = 86686 bytes, 0.09 MB
222
229
 
223
230
  ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
224
231
  Warming up --------------------------------------
225
232
  sanitize-md 4.000 i/100ms
226
- selma-md 56.000 i/100ms
233
+ selma-md 33.000 i/100ms
227
234
  Calculating -------------------------------------
228
- sanitize-md 44.397 (± 2.3%) i/s - 1.332k in 30.022430s
229
- selma-md 558.448 (± 1.4%) i/s - 16.800k in 30.089196s
235
+ sanitize-md 40.034 (± 5.0%) i/s - 1.200k in 30.043322s
236
+ selma-md 332.959 (± 2.1%) i/s - 9.999k in 30.045733s
230
237
 
231
238
  Comparison:
232
- selma-md: 558.4 i/s
233
- sanitize-md: 44.4 i/s - 12.58x slower
239
+ selma-md: 333.0 i/s
240
+ sanitize-md: 40.0 i/s - 8.32x slower
234
241
 
235
242
  input size = 7172510 bytes, 7.17 MB
236
243
 
@@ -239,12 +246,12 @@ Warming up --------------------------------------
239
246
  sanitize-lg 1.000 i/100ms
240
247
  selma-lg 1.000 i/100ms
241
248
  Calculating -------------------------------------
242
- sanitize-lg 0.163 (± 0.0%) i/s - 6.000 in 37.375628s
243
- selma-lg 6.750 (± 0.0%) i/s - 203.000 in 30.080976s
249
+ sanitize-lg 0.141 (± 0.0%) i/s - 5.000 in 35.426127s
250
+ selma-lg 3.963 (± 0.0%) i/s - 119.000 in 30.037386s
244
251
 
245
252
  Comparison:
246
- selma-lg: 6.7 i/s
247
- sanitize-lg: 0.2 i/s - 41.32x slower
253
+ selma-lg: 4.0 i/s
254
+ sanitize-lg: 0.1 i/s - 28.03x slower
248
255
  </pre>
249
256
  </details>
250
257
  <!-- prettier-ignore-end -->
@@ -255,39 +262,40 @@ Comparing Selma against popular Ruby HTML parsing gems:
255
262
 
256
263
  <!-- prettier-ignore-start -->
257
264
  <details>
258
- <pre>input size = 25309 bytes, 0.03 MB
265
+ <pre>
266
+ input size = 25309 bytes, 0.03 MB
259
267
 
260
268
  ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
261
269
  Warming up --------------------------------------
262
- nokogiri-sm 107.000 i/100ms
263
- nokolexbor-sm 340.000 i/100ms
264
- selma-sm 380.000 i/100ms
270
+ nokogiri-sm 79.000 i/100ms
271
+ nokolexbor-sm 295.000 i/100ms
272
+ selma-sm 237.000 i/100ms
265
273
  Calculating -------------------------------------
266
- nokogiri-sm 1.073k (± 2.1%) i/s - 32.207k in 30.025474s
267
- nokolexbor-sm 3.300k (±13.2%) i/s - 27.540k in 36.788212s
268
- selma-sm 3.779k (± 3.4%) i/s - 113.240k in 30.013908s
274
+ nokogiri-sm 800.531 (± 2.2%) i/s - 24.016k in 30.016056s
275
+ nokolexbor-sm 3.033k (± 3.6%) i/s - 91.155k in 30.094884s
276
+ selma-sm 2.386k (± 1.6%) i/s - 71.574k in 30.001701s
269
277
 
270
278
  Comparison:
271
- selma-sm: 3779.4 i/s
272
- nokolexbor-sm: 3300.1 i/s - same-ish: difference falls within error
273
- nokogiri-sm: 1073.1 i/s - 3.52x slower
279
+ nokolexbor-sm: 3033.1 i/s
280
+ selma-sm: 2386.3 i/s - 1.27x slower
281
+ nokogiri-sm: 800.5 i/s - 3.79x slower
274
282
 
275
283
  input size = 86686 bytes, 0.09 MB
276
284
 
277
285
  ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
278
286
  Warming up --------------------------------------
279
- nokogiri-md 11.000 i/100ms
280
- nokolexbor-md 48.000 i/100ms
281
- selma-md 53.000 i/100ms
287
+ nokogiri-md 8.000 i/100ms
288
+ nokolexbor-md 43.000 i/100ms
289
+ selma-md 38.000 i/100ms
282
290
  Calculating -------------------------------------
283
- nokogiri-md 103.998 (± 5.8%) i/s - 3.113k in 30.029932s
284
- nokolexbor-md 428.928 (± 7.9%) i/s - 12.816k in 30.066662s
285
- selma-md 492.190 (± 6.9%) i/s - 14.734k in 30.082943s
291
+ nokogiri-md 85.013 (± 8.2%) i/s - 2.024k in 52.257472s
292
+ nokolexbor-md 416.074 (±11.1%) i/s - 12.341k in 30.111613s
293
+ selma-md 361.471 (± 4.7%) i/s - 10.830k in 30.033997s
286
294
 
287
295
  Comparison:
288
- selma-md: 492.2 i/s
289
- nokolexbor-md: 428.9 i/s - same-ish: difference falls within error
290
- nokogiri-md: 104.0 i/s - 4.73x slower
296
+ nokolexbor-md: 416.1 i/s
297
+ selma-md: 361.5 i/s - same-ish: difference falls within error
298
+ nokogiri-md: 85.0 i/s - 4.89x slower
291
299
 
292
300
  input size = 7172510 bytes, 7.17 MB
293
301
 
@@ -297,14 +305,14 @@ Warming up --------------------------------------
297
305
  nokolexbor-lg 1.000 i/100ms
298
306
  selma-lg 1.000 i/100ms
299
307
  Calculating -------------------------------------
300
- nokogiri-lg 0.874 (± 0.0%) i/s - 27.000 in 30.921090s
301
- nokolexbor-lg 2.227 (± 0.0%) i/s - 67.000 in 30.137903s
302
- selma-lg 8.354 (± 0.0%) i/s - 251.000 in 30.075227s
308
+ nokogiri-lg 0.805 (± 0.0%) i/s - 25.000 in 31.148730s
309
+ nokolexbor-lg 2.194 (± 0.0%) i/s - 66.000 in 30.278108s
310
+ selma-lg 5.541 (± 0.0%) i/s - 166.000 in 30.037197s
303
311
 
304
312
  Comparison:
305
- selma-lg: 8.4 i/s
306
- nokolexbor-lg: 2.2 i/s - 3.75x slower
307
- nokogiri-lg: 0.9 i/s - 9.56x slower
313
+ selma-lg: 5.5 i/s
314
+ nokolexbor-lg: 2.2 i/s - 2.53x slower
315
+ nokogiri-lg: 0.8 i/s - 6.88x slower
308
316
  </pre>
309
317
  </details>
310
318
  <!-- prettier-ignore-end -->
@@ -273,10 +273,8 @@ impl SelmaRewriter {
273
273
  let binding = self.0.borrow();
274
274
 
275
275
  let mut sanitizer_document_content_handlers: Vec<DocumentContentHandlers> = vec![];
276
- let mut sanitizer_initial_element_content_handlers: Vec<(
277
- Cow<Selector>,
278
- ElementContentHandlers,
279
- )> = vec![];
276
+ let mut sanitizer_element_content_handlers: Vec<(Cow<Selector>, ElementContentHandlers)> =
277
+ vec![];
280
278
 
281
279
  match &binding.sanitizer {
282
280
  None => (),
@@ -293,7 +291,7 @@ impl SelmaRewriter {
293
291
  Ok(())
294
292
  }));
295
293
  }
296
- sanitizer_initial_element_content_handlers.push(element!("*", |el| {
294
+ sanitizer_element_content_handlers.push(element!("*", |el| {
297
295
  sanitizer.try_remove_element(el);
298
296
  if el.removed() {
299
297
  return Ok(());
@@ -311,7 +309,7 @@ impl SelmaRewriter {
311
309
  match Self::perform_handler_rewrite(
312
310
  self,
313
311
  sanitizer_document_content_handlers,
314
- sanitizer_initial_element_content_handlers,
312
+ sanitizer_element_content_handlers,
315
313
  handlers,
316
314
  html,
317
315
  ) {
@@ -367,7 +365,7 @@ impl SelmaRewriter {
367
365
  pub fn perform_handler_rewrite(
368
366
  &self,
369
367
  sanitizer_document_content_handlers: Vec<DocumentContentHandlers>,
370
- sanitizer_initial_element_content_handlers: Vec<(Cow<Selector>, ElementContentHandlers)>,
368
+ sanitizer_element_content_handlers: Vec<(Cow<Selector>, ElementContentHandlers)>,
371
369
  handlers: &[Handler],
372
370
  html: String,
373
371
  ) -> Result<Vec<u8>, magnus::Error> {
@@ -453,14 +451,25 @@ impl SelmaRewriter {
453
451
  }));
454
452
  });
455
453
 
456
- element_content_handlers.extend(sanitizer_initial_element_content_handlers);
457
-
458
- Self::run_rewrite(
454
+ let rewritten_html = Self::run_rewrite(
459
455
  self,
460
456
  sanitizer_document_content_handlers,
461
457
  element_content_handlers,
462
458
  html.as_bytes(),
463
- )
459
+ );
460
+
461
+ // sanitization must happen separately, because text chunks
462
+ // could potentially have rewritten the html. ideally we'd
463
+ // be able to sanitize around the `process_text_handlers` call
464
+ match rewritten_html {
465
+ Ok(rewritten_html) => Self::run_rewrite(
466
+ self,
467
+ vec![],
468
+ sanitizer_element_content_handlers,
469
+ rewritten_html.as_slice(),
470
+ ),
471
+ Err(err) => Err(err),
472
+ }
464
473
  }
465
474
 
466
475
  fn run_rewrite(
data/lib/selma/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Selma
4
- VERSION = "0.4.0"
4
+ VERSION = "0.4.1"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: selma
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Garen J. Torikian