RubyGems - selma - Versions diffs - 0.3.0 → 0.4.0 - Mend

selma 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

checksums.yaml +4 -4
data/Cargo.lock +107 -99
data/README.md +57 -43
data/ext/selma/Cargo.toml +4 -1
data/ext/selma/src/html/element.rs +11 -6
data/ext/selma/src/native_ref_wrap.rs +15 -12
data/ext/selma/src/rewriter.rs +257 -106
data/ext/selma/src/sanitizer.rs +3 -3
data/lib/selma/config.rb +12 -0
data/lib/selma/version.rb +1 -1
metadata +3 -2

data/README.md CHANGED Viewed

@@ -180,6 +180,19 @@ The `element` argument in `handle_element` has the following methods:
 - `after(content, as: content_type)`: Inserts `content` after the text. `content_type` is either `:text` or `:html` and determines how the content will be applied.
 - `replace(content, as: content_type)`: Replaces the text node with `content`. `content_type` is either `:text` or `:html` and determines how the content will be applied.
+## Security
+Theoretically, a malicious user can provide a very large document for processing, which can exhaust the memory of the host machine. To set a limit on how much string content is processed at once, you can provide two options under the `memory` namespace:
+```ruby
+memory: {
+  max_allowed_memory_usage: 1000,
+  preallocated_parsing_buffer_size: 100,
+},
+```
+Note that `preallocated_parsing_buffer_size` must always be less than `max_allowed_memory_usage`. See [the `lol_html` project documentation](https://docs.rs/lol_html/1.2.1/lol_html/struct.MemorySettings.html) to learn more about the default values.
 ## Benchmarks
 When running `bundle exec rake benchmark`, two different benchmarks are calculated. Here are those results on my machine.
@@ -191,30 +204,33 @@ Comparing Selma against popular Ruby sanitization gems:
 <!-- prettier-ignore-start -->
 <details>
 <pre>
+input size = 25309 bytes, 0.03 MB
+ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
 Warming up --------------------------------------
-         sanitize-sm    15.000 i/100ms
-            selma-sm   126.000 i/100ms
+         sanitize-sm    16.000 i/100ms
+            selma-sm   214.000 i/100ms
 Calculating -------------------------------------
-         sanitize-sm    155.074 (± 1.9%) i/s -      4.665k in  30.092214s
-            selma-sm      1.290k (± 1.3%) i/s -     38.808k in  30.085333s
+         sanitize-sm    171.670 (± 1.2%) i/s -      5.152k in  30.017081s
+            selma-sm      2.146k (± 3.0%) i/s -     64.414k in  30.058470s
 Comparison:
-            selma-sm:     1290.1 i/s
-         sanitize-sm:      155.1 i/s - 8.32x  slower
+            selma-sm:     2145.8 i/s
+         sanitize-sm:      171.7 i/s - 12.50x  slower
 input size = 86686 bytes, 0.09 MB
 ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
 Warming up --------------------------------------
-         sanitize-md     3.000 i/100ms
-            selma-md    33.000 i/100ms
+         sanitize-md     4.000 i/100ms
+            selma-md    56.000 i/100ms
 Calculating -------------------------------------
-         sanitize-md     40.321 (± 5.0%) i/s -      1.206k in  30.004711s
-            selma-md    337.417 (± 1.5%) i/s -     10.131k in  30.032772s
+         sanitize-md     44.397 (± 2.3%) i/s -      1.332k in  30.022430s
+            selma-md    558.448 (± 1.4%) i/s -     16.800k in  30.089196s
 Comparison:
-            selma-md:      337.4 i/s
-         sanitize-md:       40.3 i/s - 8.37x  slower
+            selma-md:      558.4 i/s
+         sanitize-md:       44.4 i/s - 12.58x  slower
 input size = 7172510 bytes, 7.17 MB
@@ -223,12 +239,12 @@ Warming up --------------------------------------
          sanitize-lg     1.000 i/100ms
             selma-lg     1.000 i/100ms
 Calculating -------------------------------------
-         sanitize-lg      0.144 (± 0.0%) i/s -      5.000 in  34.772526s
-            selma-lg      4.026 (± 0.0%) i/s -    121.000 in  30.067415s
+         sanitize-lg      0.163 (± 0.0%) i/s -      6.000 in  37.375628s
+            selma-lg      6.750 (± 0.0%) i/s -    203.000 in  30.080976s
 Comparison:
-            selma-lg:        4.0 i/s
-         sanitize-lg:        0.1 i/s - 27.99x  slower
+            selma-lg:        6.7 i/s
+         sanitize-lg:        0.2 i/s - 41.32x  slower
 </pre>
 </details>
 <!-- prettier-ignore-end -->
@@ -239,41 +255,39 @@ Comparing Selma against popular Ruby HTML parsing gems:
 <!-- prettier-ignore-start -->
 <details>
-<pre>
-input size = 25309 bytes, 0.03 MB
+<pre>input size = 25309 bytes, 0.03 MB
 ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
 Warming up --------------------------------------
-         nokogiri-sm    79.000 i/100ms
-       nokolexbor-sm   285.000 i/100ms
-            selma-sm   244.000 i/100ms
+         nokogiri-sm   107.000 i/100ms
+       nokolexbor-sm   340.000 i/100ms
+            selma-sm   380.000 i/100ms
 Calculating -------------------------------------
-         nokogiri-sm    807.790 (± 3.1%) i/s -     24.253k in  30.056301s
-       nokolexbor-sm      2.880k (± 6.4%) i/s -     86.070k in  30.044766s
-            selma-sm      2.508k (± 1.2%) i/s -     75.396k in  30.068792s
+         nokogiri-sm      1.073k (± 2.1%) i/s -     32.207k in  30.025474s
+       nokolexbor-sm      3.300k (±13.2%) i/s -     27.540k in  36.788212s
+            selma-sm      3.779k (± 3.4%) i/s -    113.240k in  30.013908s
 Comparison:
-       nokolexbor-sm:     2880.3 i/s
-            selma-sm:     2507.8 i/s - 1.15x  slower
-         nokogiri-sm:      807.8 i/s - 3.57x  slower
+            selma-sm:     3779.4 i/s
+       nokolexbor-sm:     3300.1 i/s - same-ish: difference falls within error
+         nokogiri-sm:     1073.1 i/s - 3.52x  slower
 input size = 86686 bytes, 0.09 MB
 ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
 Warming up --------------------------------------
-         nokogiri-md     8.000 i/100ms
-       nokolexbor-md    43.000 i/100ms
-            selma-md    39.000 i/100ms
+         nokogiri-md    11.000 i/100ms
+       nokolexbor-md    48.000 i/100ms
+            selma-md    53.000 i/100ms
 Calculating -------------------------------------
-         nokogiri-md     87.367 (± 3.4%) i/s -      2.624k in  30.061642s
-       nokolexbor-md    438.782 (± 3.9%) i/s -     13.158k in  30.031163s
-            selma-md    392.591 (± 3.1%) i/s -     11.778k in  30.031391s
+         nokogiri-md    103.998 (± 5.8%) i/s -      3.113k in  30.029932s
+       nokolexbor-md    428.928 (± 7.9%) i/s -     12.816k in  30.066662s
+            selma-md    492.190 (± 6.9%) i/s -     14.734k in  30.082943s
 Comparison:
-       nokolexbor-md:      438.8 i/s
-            selma-md:      392.6 i/s - 1.12x  slower
-         nokogiri-md:       87.4 i/s - 5.02x  slower
+            selma-md:      492.2 i/s
+       nokolexbor-md:      428.9 i/s - same-ish: difference falls within error
+         nokogiri-md:      104.0 i/s - 4.73x  slower
 input size = 7172510 bytes, 7.17 MB
@@ -283,14 +297,14 @@ Warming up --------------------------------------
        nokolexbor-lg     1.000 i/100ms
             selma-lg     1.000 i/100ms
 Calculating -------------------------------------
-         nokogiri-lg      0.895 (± 0.0%) i/s -     27.000 in  30.300832s
-       nokolexbor-lg      2.163 (± 0.0%) i/s -     65.000 in  30.085656s
-            selma-lg      5.867 (± 0.0%) i/s -    176.000 in  30.006240s
+         nokogiri-lg      0.874 (± 0.0%) i/s -     27.000 in  30.921090s
+       nokolexbor-lg      2.227 (± 0.0%) i/s -     67.000 in  30.137903s
+            selma-lg      8.354 (± 0.0%) i/s -    251.000 in  30.075227s
 Comparison:
-            selma-lg:        5.9 i/s
-       nokolexbor-lg:        2.2 i/s - 2.71x  slower
-         nokogiri-lg:        0.9 i/s - 6.55x  slower
+            selma-lg:        8.4 i/s
+       nokolexbor-lg:        2.2 i/s - 3.75x  slower
+         nokogiri-lg:        0.9 i/s - 9.56x  slower
 </pre>
 </details>
 <!-- prettier-ignore-end -->

data/ext/selma/Cargo.toml CHANGED Viewed

@@ -8,7 +8,10 @@ publish = false
 [dependencies]
 enum-iterator = "2.1"
 escapist = "0.0.2"
-magnus = "0.6"
+magnus = { version = "0.6", features = ["rb-sys"] }
+rb-sys = { version = "*", default-features = false, features = [
+    "stable-api-compiled-fallback",
+] }
 lol_html = "1.2"
 [lib]

data/ext/selma/src/html/element.rs CHANGED Viewed

@@ -119,11 +119,13 @@ impl SelmaHTMLElement {
                 .iter()
                 .for_each(|attr| match hash.aset(attr.name(), attr.value()) {
                     Ok(_) => {}
-                    Err(err) => Err(Error::new(
-                        exception::runtime_error(),
-                        format!("AttributeNameError: {err:?}"),
-                    ))
-                    .unwrap(),
+                    Err(err) => panic!(
+                        "{:?}",
+                        Error::new(
+                            exception::runtime_error(),
+                            format!("AttributeNameError: {err:?}"),
+                        )
+                    ),
                 });
         }
         Ok(hash)
@@ -139,7 +141,10 @@ impl SelmaHTMLElement {
             .for_each(|ancestor| match array.push(RString::new(ancestor)) {
                 Ok(_) => {}
                 Err(err) => {
-                    Err(Error::new(exception::runtime_error(), format!("{err:?}"))).unwrap()
+                    panic!(
+                        "{:?}",
+                        Error::new(exception::runtime_error(), format!("{err:?}"))
+                    )
                 }
             });

data/ext/selma/src/native_ref_wrap.rs CHANGED Viewed

@@ -1,15 +1,17 @@
-use std::{cell::Cell, marker::PhantomData, rc::Rc};
+use std::{
+    marker::PhantomData,
+    sync::{Arc, Mutex},
+};
-// NOTE: My Rust isn't good enough to know what any of this does,
-// but it was taken from https://github.com/cloudflare/lol-html/blob/1a1ab2e2bf896f815fe8888ed78ccdf46d7c6b85/js-api/src/lib.rs#LL38
+// NOTE: this was taken from https://github.com/cloudflare/lol-html/blob/1a1ab2e2bf896f815fe8888ed78ccdf46d7c6b85/js-api/src/lib.rs#LL38
 pub struct Anchor<'r> {
-    poisoned: Rc<Cell<bool>>,
+    poisoned: Arc<Mutex<bool>>,
     lifetime: PhantomData<&'r mut ()>,
 }
 impl<'r> Anchor<'r> {
-    pub fn new(poisoned: Rc<Cell<bool>>) -> Self {
+    pub fn new(poisoned: Arc<Mutex<bool>>) -> Self {
         Anchor {
             poisoned,
             lifetime: PhantomData,
@@ -19,7 +21,7 @@ impl<'r> Anchor<'r> {
 // impl Drop for Anchor<'_> {
 //     fn drop(&mut self) {
-//         self.poisoned.replace(true);
+//         *self.poisoned.lock().unwrap() = true;
 //     }
 // }
@@ -31,17 +33,17 @@ impl<'r> Anchor<'r> {
 // object results in exception.
 pub struct NativeRefWrap<R> {
     inner_ptr: *mut R,
-    poisoned: Rc<Cell<bool>>,
+    poisoned: Arc<Mutex<bool>>,
 }
 impl<R> NativeRefWrap<R> {
     pub fn wrap<I>(inner: &I) -> (Self, Anchor) {
         let wrap = NativeRefWrap {
             inner_ptr: inner as *const I as *mut R,
-            poisoned: Rc::new(Cell::new(false)),
+            poisoned: Arc::new(Mutex::new(false)),
         };
-        let anchor = Anchor::new(Rc::clone(&wrap.poisoned));
+        let anchor = Anchor::new(Arc::clone(&wrap.poisoned));
         (wrap, anchor)
     }
@@ -49,10 +51,10 @@ impl<R> NativeRefWrap<R> {
     pub fn wrap_mut<I>(inner: &mut I) -> (Self, Anchor) {
         let wrap = NativeRefWrap {
             inner_ptr: inner as *mut I as *mut R,
-            poisoned: Rc::new(Cell::new(false)),
+            poisoned: Arc::new(Mutex::new(false)),
         };
-        let anchor = Anchor::new(Rc::clone(&wrap.poisoned));
+        let anchor = Anchor::new(Arc::clone(&wrap.poisoned));
         (wrap, anchor)
     }
@@ -70,7 +72,8 @@ impl<R> NativeRefWrap<R> {
     }
     fn assert_not_poisoned(&self) -> Result<(), &'static str> {
-        if self.poisoned.get() {
+        let lock = self.poisoned.lock().unwrap();
+        if *lock {
             Err("The object has been freed and can't be used anymore.")
         } else {
             Ok(())