selma 0.4.0 → 0.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +53 -45
- data/ext/selma/src/rewriter.rs +20 -11
- data/lib/selma/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e5b5e70ce5e954bbd53e8df94b58facb21a78174562f001ce328f9d3ba4b09a3
|
4
|
+
data.tar.gz: 4360c58ce183593b8b5ec8c2bc2a15090aa5ed86f3e8a69b5975111274e01749
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9528b3320b2bbe90bf17ee1cffe17e6ab65f58ebae1c8edda77f61d2c4b8e8c08782b4bf11cbbaae50610c44aa0dc69c58ca70c4c943a96af84285db167929bd
|
7
|
+
data.tar.gz: 8cf5ebd4259fa09b07b7908a2de795746a45cf7fc25eb3243affe373e0c90a04760cd2d8798367ffc3abe343e5b499a9253b2f01ac195843a067c8cbc88701a5
|
data/README.md
CHANGED
@@ -182,13 +182,20 @@ The `element` argument in `handle_element` has the following methods:
|
|
182
182
|
|
183
183
|
## Security
|
184
184
|
|
185
|
-
Theoretically, a malicious user can provide a very large document for processing, which can exhaust the memory of the host machine. To set a limit on how much string content is processed at once, you can provide
|
185
|
+
Theoretically, a malicious user can provide a very large document for processing, which can exhaust the memory of the host machine. To set a limit on how much string content is processed at once, you can provide `memory` options:
|
186
186
|
|
187
187
|
```ruby
|
188
|
-
memory: {
|
189
|
-
|
190
|
-
|
191
|
-
|
188
|
+
Selma::Rewriter.new(options: { memory: { max_allowed_memory_usage: 1_000_000 } }) # ~1MB
|
189
|
+
```
|
190
|
+
|
191
|
+
The structure of the `memory` options looks like this:
|
192
|
+
```ruby
|
193
|
+
{
|
194
|
+
memory: {
|
195
|
+
max_allowed_memory_usage: 1000,
|
196
|
+
preallocated_parsing_buffer_size: 100,
|
197
|
+
}
|
198
|
+
}
|
192
199
|
```
|
193
200
|
|
194
201
|
Note that `preallocated_parsing_buffer_size` must always be less than `max_allowed_memory_usage`. See [the `lol_html` project documentation](https://docs.rs/lol_html/1.2.1/lol_html/struct.MemorySettings.html) to learn more about the default values.
|
@@ -208,29 +215,29 @@ input size = 25309 bytes, 0.03 MB
|
|
208
215
|
|
209
216
|
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
|
210
217
|
Warming up --------------------------------------
|
211
|
-
sanitize-sm
|
212
|
-
selma-sm
|
218
|
+
sanitize-sm 15.000 i/100ms
|
219
|
+
selma-sm 127.000 i/100ms
|
213
220
|
Calculating -------------------------------------
|
214
|
-
sanitize-sm
|
215
|
-
selma-sm
|
221
|
+
sanitize-sm 157.643 (± 1.9%) i/s - 4.740k in 30.077172s
|
222
|
+
selma-sm 1.278k (± 1.5%) i/s - 38.354k in 30.019722s
|
216
223
|
|
217
224
|
Comparison:
|
218
|
-
selma-sm:
|
219
|
-
sanitize-sm:
|
225
|
+
selma-sm: 1277.9 i/s
|
226
|
+
sanitize-sm: 157.6 i/s - 8.11x slower
|
220
227
|
|
221
228
|
input size = 86686 bytes, 0.09 MB
|
222
229
|
|
223
230
|
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
|
224
231
|
Warming up --------------------------------------
|
225
232
|
sanitize-md 4.000 i/100ms
|
226
|
-
selma-md
|
233
|
+
selma-md 33.000 i/100ms
|
227
234
|
Calculating -------------------------------------
|
228
|
-
sanitize-md
|
229
|
-
selma-md
|
235
|
+
sanitize-md 40.034 (± 5.0%) i/s - 1.200k in 30.043322s
|
236
|
+
selma-md 332.959 (± 2.1%) i/s - 9.999k in 30.045733s
|
230
237
|
|
231
238
|
Comparison:
|
232
|
-
selma-md:
|
233
|
-
sanitize-md:
|
239
|
+
selma-md: 333.0 i/s
|
240
|
+
sanitize-md: 40.0 i/s - 8.32x slower
|
234
241
|
|
235
242
|
input size = 7172510 bytes, 7.17 MB
|
236
243
|
|
@@ -239,12 +246,12 @@ Warming up --------------------------------------
|
|
239
246
|
sanitize-lg 1.000 i/100ms
|
240
247
|
selma-lg 1.000 i/100ms
|
241
248
|
Calculating -------------------------------------
|
242
|
-
sanitize-lg 0.
|
243
|
-
selma-lg
|
249
|
+
sanitize-lg 0.141 (± 0.0%) i/s - 5.000 in 35.426127s
|
250
|
+
selma-lg 3.963 (± 0.0%) i/s - 119.000 in 30.037386s
|
244
251
|
|
245
252
|
Comparison:
|
246
|
-
selma-lg:
|
247
|
-
sanitize-lg: 0.
|
253
|
+
selma-lg: 4.0 i/s
|
254
|
+
sanitize-lg: 0.1 i/s - 28.03x slower
|
248
255
|
</pre>
|
249
256
|
</details>
|
250
257
|
<!-- prettier-ignore-end -->
|
@@ -255,39 +262,40 @@ Comparing Selma against popular Ruby HTML parsing gems:
|
|
255
262
|
|
256
263
|
<!-- prettier-ignore-start -->
|
257
264
|
<details>
|
258
|
-
<pre>
|
265
|
+
<pre>
|
266
|
+
input size = 25309 bytes, 0.03 MB
|
259
267
|
|
260
268
|
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
|
261
269
|
Warming up --------------------------------------
|
262
|
-
nokogiri-sm
|
263
|
-
nokolexbor-sm
|
264
|
-
selma-sm
|
270
|
+
nokogiri-sm 79.000 i/100ms
|
271
|
+
nokolexbor-sm 295.000 i/100ms
|
272
|
+
selma-sm 237.000 i/100ms
|
265
273
|
Calculating -------------------------------------
|
266
|
-
nokogiri-sm
|
267
|
-
nokolexbor-sm 3.
|
268
|
-
selma-sm
|
274
|
+
nokogiri-sm 800.531 (± 2.2%) i/s - 24.016k in 30.016056s
|
275
|
+
nokolexbor-sm 3.033k (± 3.6%) i/s - 91.155k in 30.094884s
|
276
|
+
selma-sm 2.386k (± 1.6%) i/s - 71.574k in 30.001701s
|
269
277
|
|
270
278
|
Comparison:
|
271
|
-
|
272
|
-
|
273
|
-
nokogiri-sm:
|
279
|
+
nokolexbor-sm: 3033.1 i/s
|
280
|
+
selma-sm: 2386.3 i/s - 1.27x slower
|
281
|
+
nokogiri-sm: 800.5 i/s - 3.79x slower
|
274
282
|
|
275
283
|
input size = 86686 bytes, 0.09 MB
|
276
284
|
|
277
285
|
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
|
278
286
|
Warming up --------------------------------------
|
279
|
-
nokogiri-md
|
280
|
-
nokolexbor-md
|
281
|
-
selma-md
|
287
|
+
nokogiri-md 8.000 i/100ms
|
288
|
+
nokolexbor-md 43.000 i/100ms
|
289
|
+
selma-md 38.000 i/100ms
|
282
290
|
Calculating -------------------------------------
|
283
|
-
nokogiri-md
|
284
|
-
nokolexbor-md
|
285
|
-
selma-md
|
291
|
+
nokogiri-md 85.013 (± 8.2%) i/s - 2.024k in 52.257472s
|
292
|
+
nokolexbor-md 416.074 (±11.1%) i/s - 12.341k in 30.111613s
|
293
|
+
selma-md 361.471 (± 4.7%) i/s - 10.830k in 30.033997s
|
286
294
|
|
287
295
|
Comparison:
|
288
|
-
|
289
|
-
|
290
|
-
nokogiri-md:
|
296
|
+
nokolexbor-md: 416.1 i/s
|
297
|
+
selma-md: 361.5 i/s - same-ish: difference falls within error
|
298
|
+
nokogiri-md: 85.0 i/s - 4.89x slower
|
291
299
|
|
292
300
|
input size = 7172510 bytes, 7.17 MB
|
293
301
|
|
@@ -297,14 +305,14 @@ Warming up --------------------------------------
|
|
297
305
|
nokolexbor-lg 1.000 i/100ms
|
298
306
|
selma-lg 1.000 i/100ms
|
299
307
|
Calculating -------------------------------------
|
300
|
-
nokogiri-lg 0.
|
301
|
-
nokolexbor-lg 2.
|
302
|
-
selma-lg
|
308
|
+
nokogiri-lg 0.805 (± 0.0%) i/s - 25.000 in 31.148730s
|
309
|
+
nokolexbor-lg 2.194 (± 0.0%) i/s - 66.000 in 30.278108s
|
310
|
+
selma-lg 5.541 (± 0.0%) i/s - 166.000 in 30.037197s
|
303
311
|
|
304
312
|
Comparison:
|
305
|
-
selma-lg:
|
306
|
-
nokolexbor-lg: 2.2 i/s -
|
307
|
-
nokogiri-lg: 0.
|
313
|
+
selma-lg: 5.5 i/s
|
314
|
+
nokolexbor-lg: 2.2 i/s - 2.53x slower
|
315
|
+
nokogiri-lg: 0.8 i/s - 6.88x slower
|
308
316
|
</pre>
|
309
317
|
</details>
|
310
318
|
<!-- prettier-ignore-end -->
|
data/ext/selma/src/rewriter.rs
CHANGED
@@ -273,10 +273,8 @@ impl SelmaRewriter {
|
|
273
273
|
let binding = self.0.borrow();
|
274
274
|
|
275
275
|
let mut sanitizer_document_content_handlers: Vec<DocumentContentHandlers> = vec![];
|
276
|
-
let mut
|
277
|
-
|
278
|
-
ElementContentHandlers,
|
279
|
-
)> = vec![];
|
276
|
+
let mut sanitizer_element_content_handlers: Vec<(Cow<Selector>, ElementContentHandlers)> =
|
277
|
+
vec![];
|
280
278
|
|
281
279
|
match &binding.sanitizer {
|
282
280
|
None => (),
|
@@ -293,7 +291,7 @@ impl SelmaRewriter {
|
|
293
291
|
Ok(())
|
294
292
|
}));
|
295
293
|
}
|
296
|
-
|
294
|
+
sanitizer_element_content_handlers.push(element!("*", |el| {
|
297
295
|
sanitizer.try_remove_element(el);
|
298
296
|
if el.removed() {
|
299
297
|
return Ok(());
|
@@ -311,7 +309,7 @@ impl SelmaRewriter {
|
|
311
309
|
match Self::perform_handler_rewrite(
|
312
310
|
self,
|
313
311
|
sanitizer_document_content_handlers,
|
314
|
-
|
312
|
+
sanitizer_element_content_handlers,
|
315
313
|
handlers,
|
316
314
|
html,
|
317
315
|
) {
|
@@ -367,7 +365,7 @@ impl SelmaRewriter {
|
|
367
365
|
pub fn perform_handler_rewrite(
|
368
366
|
&self,
|
369
367
|
sanitizer_document_content_handlers: Vec<DocumentContentHandlers>,
|
370
|
-
|
368
|
+
sanitizer_element_content_handlers: Vec<(Cow<Selector>, ElementContentHandlers)>,
|
371
369
|
handlers: &[Handler],
|
372
370
|
html: String,
|
373
371
|
) -> Result<Vec<u8>, magnus::Error> {
|
@@ -453,14 +451,25 @@ impl SelmaRewriter {
|
|
453
451
|
}));
|
454
452
|
});
|
455
453
|
|
456
|
-
|
457
|
-
|
458
|
-
Self::run_rewrite(
|
454
|
+
let rewritten_html = Self::run_rewrite(
|
459
455
|
self,
|
460
456
|
sanitizer_document_content_handlers,
|
461
457
|
element_content_handlers,
|
462
458
|
html.as_bytes(),
|
463
|
-
)
|
459
|
+
);
|
460
|
+
|
461
|
+
// sanitization must happen separately, because text chunks
|
462
|
+
// could potentially have rewritten the html. ideally we'd
|
463
|
+
// be able to sanitize around the `process_text_handlers` call
|
464
|
+
match rewritten_html {
|
465
|
+
Ok(rewritten_html) => Self::run_rewrite(
|
466
|
+
self,
|
467
|
+
vec![],
|
468
|
+
sanitizer_element_content_handlers,
|
469
|
+
rewritten_html.as_slice(),
|
470
|
+
),
|
471
|
+
Err(err) => Err(err),
|
472
|
+
}
|
464
473
|
}
|
465
474
|
|
466
475
|
fn run_rewrite(
|
data/lib/selma/version.rb
CHANGED