selma 0.4.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +53 -45
- data/ext/selma/src/rewriter.rs +20 -11
- data/lib/selma/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e5b5e70ce5e954bbd53e8df94b58facb21a78174562f001ce328f9d3ba4b09a3
|
4
|
+
data.tar.gz: 4360c58ce183593b8b5ec8c2bc2a15090aa5ed86f3e8a69b5975111274e01749
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9528b3320b2bbe90bf17ee1cffe17e6ab65f58ebae1c8edda77f61d2c4b8e8c08782b4bf11cbbaae50610c44aa0dc69c58ca70c4c943a96af84285db167929bd
|
7
|
+
data.tar.gz: 8cf5ebd4259fa09b07b7908a2de795746a45cf7fc25eb3243affe373e0c90a04760cd2d8798367ffc3abe343e5b499a9253b2f01ac195843a067c8cbc88701a5
|
data/README.md
CHANGED
@@ -182,13 +182,20 @@ The `element` argument in `handle_element` has the following methods:
|
|
182
182
|
|
183
183
|
## Security
|
184
184
|
|
185
|
-
Theoretically, a malicious user can provide a very large document for processing, which can exhaust the memory of the host machine. To set a limit on how much string content is processed at once, you can provide
|
185
|
+
Theoretically, a malicious user can provide a very large document for processing, which can exhaust the memory of the host machine. To set a limit on how much string content is processed at once, you can provide `memory` options:
|
186
186
|
|
187
187
|
```ruby
|
188
|
-
memory: {
|
189
|
-
|
190
|
-
|
191
|
-
|
188
|
+
Selma::Rewriter.new(options: { memory: { max_allowed_memory_usage: 1_000_000 } }) # ~1MB
|
189
|
+
```
|
190
|
+
|
191
|
+
The structure of the `memory` options looks like this:
|
192
|
+
```ruby
|
193
|
+
{
|
194
|
+
memory: {
|
195
|
+
max_allowed_memory_usage: 1000,
|
196
|
+
preallocated_parsing_buffer_size: 100,
|
197
|
+
}
|
198
|
+
}
|
192
199
|
```
|
193
200
|
|
194
201
|
Note that `preallocated_parsing_buffer_size` must always be less than `max_allowed_memory_usage`. See [the `lol_html` project documentation](https://docs.rs/lol_html/1.2.1/lol_html/struct.MemorySettings.html) to learn more about the default values.
|
@@ -208,29 +215,29 @@ input size = 25309 bytes, 0.03 MB
|
|
208
215
|
|
209
216
|
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
|
210
217
|
Warming up --------------------------------------
|
211
|
-
sanitize-sm
|
212
|
-
selma-sm
|
218
|
+
sanitize-sm 15.000 i/100ms
|
219
|
+
selma-sm 127.000 i/100ms
|
213
220
|
Calculating -------------------------------------
|
214
|
-
sanitize-sm
|
215
|
-
selma-sm
|
221
|
+
sanitize-sm 157.643 (± 1.9%) i/s - 4.740k in 30.077172s
|
222
|
+
selma-sm 1.278k (± 1.5%) i/s - 38.354k in 30.019722s
|
216
223
|
|
217
224
|
Comparison:
|
218
|
-
selma-sm:
|
219
|
-
sanitize-sm:
|
225
|
+
selma-sm: 1277.9 i/s
|
226
|
+
sanitize-sm: 157.6 i/s - 8.11x slower
|
220
227
|
|
221
228
|
input size = 86686 bytes, 0.09 MB
|
222
229
|
|
223
230
|
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
|
224
231
|
Warming up --------------------------------------
|
225
232
|
sanitize-md 4.000 i/100ms
|
226
|
-
selma-md
|
233
|
+
selma-md 33.000 i/100ms
|
227
234
|
Calculating -------------------------------------
|
228
|
-
sanitize-md
|
229
|
-
selma-md
|
235
|
+
sanitize-md 40.034 (± 5.0%) i/s - 1.200k in 30.043322s
|
236
|
+
selma-md 332.959 (± 2.1%) i/s - 9.999k in 30.045733s
|
230
237
|
|
231
238
|
Comparison:
|
232
|
-
selma-md:
|
233
|
-
sanitize-md:
|
239
|
+
selma-md: 333.0 i/s
|
240
|
+
sanitize-md: 40.0 i/s - 8.32x slower
|
234
241
|
|
235
242
|
input size = 7172510 bytes, 7.17 MB
|
236
243
|
|
@@ -239,12 +246,12 @@ Warming up --------------------------------------
|
|
239
246
|
sanitize-lg 1.000 i/100ms
|
240
247
|
selma-lg 1.000 i/100ms
|
241
248
|
Calculating -------------------------------------
|
242
|
-
sanitize-lg 0.
|
243
|
-
selma-lg
|
249
|
+
sanitize-lg 0.141 (± 0.0%) i/s - 5.000 in 35.426127s
|
250
|
+
selma-lg 3.963 (± 0.0%) i/s - 119.000 in 30.037386s
|
244
251
|
|
245
252
|
Comparison:
|
246
|
-
selma-lg:
|
247
|
-
sanitize-lg: 0.
|
253
|
+
selma-lg: 4.0 i/s
|
254
|
+
sanitize-lg: 0.1 i/s - 28.03x slower
|
248
255
|
</pre>
|
249
256
|
</details>
|
250
257
|
<!-- prettier-ignore-end -->
|
@@ -255,39 +262,40 @@ Comparing Selma against popular Ruby HTML parsing gems:
|
|
255
262
|
|
256
263
|
<!-- prettier-ignore-start -->
|
257
264
|
<details>
|
258
|
-
<pre>
|
265
|
+
<pre>
|
266
|
+
input size = 25309 bytes, 0.03 MB
|
259
267
|
|
260
268
|
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
|
261
269
|
Warming up --------------------------------------
|
262
|
-
nokogiri-sm
|
263
|
-
nokolexbor-sm
|
264
|
-
selma-sm
|
270
|
+
nokogiri-sm 79.000 i/100ms
|
271
|
+
nokolexbor-sm 295.000 i/100ms
|
272
|
+
selma-sm 237.000 i/100ms
|
265
273
|
Calculating -------------------------------------
|
266
|
-
nokogiri-sm
|
267
|
-
nokolexbor-sm 3.
|
268
|
-
selma-sm
|
274
|
+
nokogiri-sm 800.531 (± 2.2%) i/s - 24.016k in 30.016056s
|
275
|
+
nokolexbor-sm 3.033k (± 3.6%) i/s - 91.155k in 30.094884s
|
276
|
+
selma-sm 2.386k (± 1.6%) i/s - 71.574k in 30.001701s
|
269
277
|
|
270
278
|
Comparison:
|
271
|
-
|
272
|
-
|
273
|
-
nokogiri-sm:
|
279
|
+
nokolexbor-sm: 3033.1 i/s
|
280
|
+
selma-sm: 2386.3 i/s - 1.27x slower
|
281
|
+
nokogiri-sm: 800.5 i/s - 3.79x slower
|
274
282
|
|
275
283
|
input size = 86686 bytes, 0.09 MB
|
276
284
|
|
277
285
|
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
|
278
286
|
Warming up --------------------------------------
|
279
|
-
nokogiri-md
|
280
|
-
nokolexbor-md
|
281
|
-
selma-md
|
287
|
+
nokogiri-md 8.000 i/100ms
|
288
|
+
nokolexbor-md 43.000 i/100ms
|
289
|
+
selma-md 38.000 i/100ms
|
282
290
|
Calculating -------------------------------------
|
283
|
-
nokogiri-md
|
284
|
-
nokolexbor-md
|
285
|
-
selma-md
|
291
|
+
nokogiri-md 85.013 (± 8.2%) i/s - 2.024k in 52.257472s
|
292
|
+
nokolexbor-md 416.074 (±11.1%) i/s - 12.341k in 30.111613s
|
293
|
+
selma-md 361.471 (± 4.7%) i/s - 10.830k in 30.033997s
|
286
294
|
|
287
295
|
Comparison:
|
288
|
-
|
289
|
-
|
290
|
-
nokogiri-md:
|
296
|
+
nokolexbor-md: 416.1 i/s
|
297
|
+
selma-md: 361.5 i/s - same-ish: difference falls within error
|
298
|
+
nokogiri-md: 85.0 i/s - 4.89x slower
|
291
299
|
|
292
300
|
input size = 7172510 bytes, 7.17 MB
|
293
301
|
|
@@ -297,14 +305,14 @@ Warming up --------------------------------------
|
|
297
305
|
nokolexbor-lg 1.000 i/100ms
|
298
306
|
selma-lg 1.000 i/100ms
|
299
307
|
Calculating -------------------------------------
|
300
|
-
nokogiri-lg 0.
|
301
|
-
nokolexbor-lg 2.
|
302
|
-
selma-lg
|
308
|
+
nokogiri-lg 0.805 (± 0.0%) i/s - 25.000 in 31.148730s
|
309
|
+
nokolexbor-lg 2.194 (± 0.0%) i/s - 66.000 in 30.278108s
|
310
|
+
selma-lg 5.541 (± 0.0%) i/s - 166.000 in 30.037197s
|
303
311
|
|
304
312
|
Comparison:
|
305
|
-
selma-lg:
|
306
|
-
nokolexbor-lg: 2.2 i/s -
|
307
|
-
nokogiri-lg: 0.
|
313
|
+
selma-lg: 5.5 i/s
|
314
|
+
nokolexbor-lg: 2.2 i/s - 2.53x slower
|
315
|
+
nokogiri-lg: 0.8 i/s - 6.88x slower
|
308
316
|
</pre>
|
309
317
|
</details>
|
310
318
|
<!-- prettier-ignore-end -->
|
data/ext/selma/src/rewriter.rs
CHANGED
@@ -273,10 +273,8 @@ impl SelmaRewriter {
|
|
273
273
|
let binding = self.0.borrow();
|
274
274
|
|
275
275
|
let mut sanitizer_document_content_handlers: Vec<DocumentContentHandlers> = vec![];
|
276
|
-
let mut
|
277
|
-
|
278
|
-
ElementContentHandlers,
|
279
|
-
)> = vec![];
|
276
|
+
let mut sanitizer_element_content_handlers: Vec<(Cow<Selector>, ElementContentHandlers)> =
|
277
|
+
vec![];
|
280
278
|
|
281
279
|
match &binding.sanitizer {
|
282
280
|
None => (),
|
@@ -293,7 +291,7 @@ impl SelmaRewriter {
|
|
293
291
|
Ok(())
|
294
292
|
}));
|
295
293
|
}
|
296
|
-
|
294
|
+
sanitizer_element_content_handlers.push(element!("*", |el| {
|
297
295
|
sanitizer.try_remove_element(el);
|
298
296
|
if el.removed() {
|
299
297
|
return Ok(());
|
@@ -311,7 +309,7 @@ impl SelmaRewriter {
|
|
311
309
|
match Self::perform_handler_rewrite(
|
312
310
|
self,
|
313
311
|
sanitizer_document_content_handlers,
|
314
|
-
|
312
|
+
sanitizer_element_content_handlers,
|
315
313
|
handlers,
|
316
314
|
html,
|
317
315
|
) {
|
@@ -367,7 +365,7 @@ impl SelmaRewriter {
|
|
367
365
|
pub fn perform_handler_rewrite(
|
368
366
|
&self,
|
369
367
|
sanitizer_document_content_handlers: Vec<DocumentContentHandlers>,
|
370
|
-
|
368
|
+
sanitizer_element_content_handlers: Vec<(Cow<Selector>, ElementContentHandlers)>,
|
371
369
|
handlers: &[Handler],
|
372
370
|
html: String,
|
373
371
|
) -> Result<Vec<u8>, magnus::Error> {
|
@@ -453,14 +451,25 @@ impl SelmaRewriter {
|
|
453
451
|
}));
|
454
452
|
});
|
455
453
|
|
456
|
-
|
457
|
-
|
458
|
-
Self::run_rewrite(
|
454
|
+
let rewritten_html = Self::run_rewrite(
|
459
455
|
self,
|
460
456
|
sanitizer_document_content_handlers,
|
461
457
|
element_content_handlers,
|
462
458
|
html.as_bytes(),
|
463
|
-
)
|
459
|
+
);
|
460
|
+
|
461
|
+
// sanitization must happen separately, because text chunks
|
462
|
+
// could potentially have rewritten the html. ideally we'd
|
463
|
+
// be able to sanitize around the `process_text_handlers` call
|
464
|
+
match rewritten_html {
|
465
|
+
Ok(rewritten_html) => Self::run_rewrite(
|
466
|
+
self,
|
467
|
+
vec![],
|
468
|
+
sanitizer_element_content_handlers,
|
469
|
+
rewritten_html.as_slice(),
|
470
|
+
),
|
471
|
+
Err(err) => Err(err),
|
472
|
+
}
|
464
473
|
}
|
465
474
|
|
466
475
|
fn run_rewrite(
|
data/lib/selma/version.rb
CHANGED