selma 0.2.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +115 -114
- data/README.md +122 -24
- data/ext/selma/Cargo.toml +5 -2
- data/ext/selma/src/html/element.rs +11 -6
- data/ext/selma/src/native_ref_wrap.rs +15 -12
- data/ext/selma/src/rewriter.rs +257 -106
- data/ext/selma/src/sanitizer.rs +23 -16
- data/lib/selma/config.rb +12 -0
- data/lib/selma/sanitizer/config/default.rb +1 -1
- data/lib/selma/sanitizer/config/relaxed.rb +1 -0
- data/lib/selma/sanitizer.rb +6 -1
- data/lib/selma/version.rb +1 -1
- metadata +8 -7
    
        data/README.md
    CHANGED
    
    | @@ -76,7 +76,7 @@ attributes: { | |
| 76 76 |  | 
| 77 77 | 
             
            # URL handling protocols to allow in specific attributes. By default, no
         | 
| 78 78 | 
             
            # protocols are allowed. Use :relative in place of a protocol if you want
         | 
| 79 | 
            -
            # to allow relative URLs sans protocol.
         | 
| 79 | 
            +
            # to allow relative URLs sans protocol. Set to `:all` to allow any protocol.
         | 
| 80 80 | 
             
            protocols: {
         | 
| 81 81 | 
             
                "a" => { "href" => ["http", "https", "mailto", :relative] },
         | 
| 82 82 | 
             
                "img" => { "href" => ["http", "https"] },
         | 
| @@ -103,7 +103,11 @@ Here's an example which rewrites the `href` attribute on `a` and the `src` attri | |
| 103 103 |  | 
| 104 104 | 
             
            ```ruby
         | 
| 105 105 | 
             
            class MatchAttribute
         | 
| 106 | 
            -
              SELECTOR = Selma::Selector(match_element: %(a[href^="http:"], img[src^="http:"]"))
         | 
| 106 | 
            +
              SELECTOR = Selma::Selector.new(match_element: %(a[href^="http:"], img[src^="http:"]"))
         | 
| 107 | 
            +
             | 
| 108 | 
            +
              def selector
         | 
| 109 | 
            +
                SELECTOR
         | 
| 110 | 
            +
              end
         | 
| 107 111 |  | 
| 108 112 | 
             
              def handle_element(element)
         | 
| 109 113 | 
             
                if element.tag_name == "a"
         | 
| @@ -176,40 +180,134 @@ The `element` argument in `handle_element` has the following methods: | |
| 176 180 | 
             
            - `after(content, as: content_type)`: Inserts `content` after the text. `content_type` is either `:text` or `:html` and determines how the content will be applied.
         | 
| 177 181 | 
             
            - `replace(content, as: content_type)`: Replaces the text node with `content`. `content_type` is either `:text` or `:html` and determines how the content will be applied.
         | 
| 178 182 |  | 
| 183 | 
            +
            ## Security
         | 
| 184 | 
            +
             | 
| 185 | 
            +
            Theoretically, a malicious user can provide a very large document for processing, which can exhaust the memory of the host machine. To set a limit on how much string content is processed at once, you can provide two options under the `memory` namespace:
         | 
| 186 | 
            +
             | 
| 187 | 
            +
            ```ruby
         | 
| 188 | 
            +
            memory: {
         | 
| 189 | 
            +
              max_allowed_memory_usage: 1000,
         | 
| 190 | 
            +
              preallocated_parsing_buffer_size: 100,
         | 
| 191 | 
            +
            },
         | 
| 192 | 
            +
            ```
         | 
| 193 | 
            +
             | 
| 194 | 
            +
            Note that `preallocated_parsing_buffer_size` must always be less than `max_allowed_memory_usage`. See [the `lol_html` project documentation](https://docs.rs/lol_html/1.2.1/lol_html/struct.MemorySettings.html) to learn more about the default values.
         | 
| 195 | 
            +
             | 
| 179 196 | 
             
            ## Benchmarks
         | 
| 180 197 |  | 
| 198 | 
            +
            When you run `bundle exec rake benchmark`, two different benchmarks are calculated. Here are those results on my machine.
         | 
| 199 | 
            +
             | 
| 200 | 
            +
            ### Benchmarks for just the sanitization process
         | 
| 201 | 
            +
             | 
| 202 | 
            +
            Comparing Selma against popular Ruby sanitization gems:
         | 
| 203 | 
            +
             | 
| 204 | 
            +
            <!-- prettier-ignore-start -->
         | 
| 181 205 | 
             
            <details>
         | 
| 182 206 | 
             
            <pre>
         | 
| 183 | 
            -
             | 
| 184 | 
            -
             | 
| 207 | 
            +
            input size = 25309 bytes, 0.03 MB
         | 
| 208 | 
            +
             | 
| 209 | 
            +
            ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
         | 
| 185 210 | 
             
            Warming up --------------------------------------
         | 
| 186 | 
            -
            sanitize- | 
| 187 | 
            -
             | 
| 188 | 
            -
             selma-document-huge     1.000  i/100ms
         | 
| 211 | 
            +
                     sanitize-sm    16.000 i/100ms
         | 
| 212 | 
            +
                        selma-sm   214.000 i/100ms
         | 
| 189 213 | 
             
            Calculating -------------------------------------
         | 
| 190 | 
            -
            sanitize- | 
| 191 | 
            -
             | 
| 192 | 
            -
             | 
| 214 | 
            +
                     sanitize-sm    171.670 (± 1.2%) i/s -      5.152k in  30.017081s
         | 
| 215 | 
            +
                        selma-sm      2.146k (± 3.0%) i/s -     64.414k in  30.058470s
         | 
| 216 | 
            +
             | 
| 217 | 
            +
            Comparison:
         | 
| 218 | 
            +
                        selma-sm:     2145.8 i/s
         | 
| 219 | 
            +
                     sanitize-sm:      171.7 i/s - 12.50x  slower
         | 
| 220 | 
            +
             | 
| 221 | 
            +
            input size = 86686 bytes, 0.09 MB
         | 
| 222 | 
            +
             | 
| 223 | 
            +
            ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
         | 
| 193 224 | 
             
            Warming up --------------------------------------
         | 
| 194 | 
            -
            sanitize- | 
| 195 | 
            -
             | 
| 196 | 
            -
            selma-document-medium
         | 
| 197 | 
            -
                                    22.000  i/100ms
         | 
| 225 | 
            +
                     sanitize-md     4.000 i/100ms
         | 
| 226 | 
            +
                        selma-md    56.000 i/100ms
         | 
| 198 227 | 
             
            Calculating -------------------------------------
         | 
| 199 | 
            -
            sanitize- | 
| 200 | 
            -
             | 
| 201 | 
            -
             | 
| 202 | 
            -
             | 
| 228 | 
            +
                     sanitize-md     44.397 (± 2.3%) i/s -      1.332k in  30.022430s
         | 
| 229 | 
            +
                        selma-md    558.448 (± 1.4%) i/s -     16.800k in  30.089196s
         | 
| 230 | 
            +
             | 
| 231 | 
            +
            Comparison:
         | 
| 232 | 
            +
                        selma-md:      558.4 i/s
         | 
| 233 | 
            +
                     sanitize-md:       44.4 i/s - 12.58x  slower
         | 
| 234 | 
            +
             | 
| 235 | 
            +
            input size = 7172510 bytes, 7.17 MB
         | 
| 236 | 
            +
             | 
| 237 | 
            +
            ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
         | 
| 238 | 
            +
            Warming up --------------------------------------
         | 
| 239 | 
            +
                     sanitize-lg     1.000 i/100ms
         | 
| 240 | 
            +
                        selma-lg     1.000 i/100ms
         | 
| 241 | 
            +
            Calculating -------------------------------------
         | 
| 242 | 
            +
                     sanitize-lg      0.163 (± 0.0%) i/s -      6.000 in  37.375628s
         | 
| 243 | 
            +
                        selma-lg      6.750 (± 0.0%) i/s -    203.000 in  30.080976s
         | 
| 244 | 
            +
             | 
| 245 | 
            +
            Comparison:
         | 
| 246 | 
            +
                        selma-lg:        6.7 i/s
         | 
| 247 | 
            +
                     sanitize-lg:        0.2 i/s - 41.32x  slower
         | 
| 248 | 
            +
            </pre>
         | 
| 249 | 
            +
            </details>
         | 
| 250 | 
            +
            <!-- prettier-ignore-end -->
         | 
| 251 | 
            +
             | 
| 252 | 
            +
            ### Benchmarks for just the rewriting process
         | 
| 253 | 
            +
             | 
| 254 | 
            +
            Comparing Selma against popular Ruby HTML parsing gems:
         | 
| 255 | 
            +
             | 
| 256 | 
            +
            <!-- prettier-ignore-start -->
         | 
| 257 | 
            +
            <details>
         | 
| 258 | 
            +
            <pre>input size = 25309 bytes, 0.03 MB
         | 
| 259 | 
            +
             | 
| 260 | 
            +
            ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
         | 
| 261 | 
            +
            Warming up --------------------------------------
         | 
| 262 | 
            +
                     nokogiri-sm   107.000 i/100ms
         | 
| 263 | 
            +
                   nokolexbor-sm   340.000 i/100ms
         | 
| 264 | 
            +
                        selma-sm   380.000 i/100ms
         | 
| 265 | 
            +
            Calculating -------------------------------------
         | 
| 266 | 
            +
                     nokogiri-sm      1.073k (± 2.1%) i/s -     32.207k in  30.025474s
         | 
| 267 | 
            +
                   nokolexbor-sm      3.300k (±13.2%) i/s -     27.540k in  36.788212s
         | 
| 268 | 
            +
                        selma-sm      3.779k (± 3.4%) i/s -    113.240k in  30.013908s
         | 
| 269 | 
            +
             | 
| 270 | 
            +
            Comparison:
         | 
| 271 | 
            +
                        selma-sm:     3779.4 i/s
         | 
| 272 | 
            +
                   nokolexbor-sm:     3300.1 i/s - same-ish: difference falls within error
         | 
| 273 | 
            +
                     nokogiri-sm:     1073.1 i/s - 3.52x  slower
         | 
| 274 | 
            +
             | 
| 275 | 
            +
            input size = 86686 bytes, 0.09 MB
         | 
| 276 | 
            +
             | 
| 277 | 
            +
            ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
         | 
| 278 | 
            +
            Warming up --------------------------------------
         | 
| 279 | 
            +
                     nokogiri-md    11.000 i/100ms
         | 
| 280 | 
            +
                   nokolexbor-md    48.000 i/100ms
         | 
| 281 | 
            +
                        selma-md    53.000 i/100ms
         | 
| 282 | 
            +
            Calculating -------------------------------------
         | 
| 283 | 
            +
                     nokogiri-md    103.998 (± 5.8%) i/s -      3.113k in  30.029932s
         | 
| 284 | 
            +
                   nokolexbor-md    428.928 (± 7.9%) i/s -     12.816k in  30.066662s
         | 
| 285 | 
            +
                        selma-md    492.190 (± 6.9%) i/s -     14.734k in  30.082943s
         | 
| 286 | 
            +
             | 
| 287 | 
            +
            Comparison:
         | 
| 288 | 
            +
                        selma-md:      492.2 i/s
         | 
| 289 | 
            +
                   nokolexbor-md:      428.9 i/s - same-ish: difference falls within error
         | 
| 290 | 
            +
                     nokogiri-md:      104.0 i/s - 4.73x  slower
         | 
| 291 | 
            +
             | 
| 292 | 
            +
            input size = 7172510 bytes, 7.17 MB
         | 
| 293 | 
            +
             | 
| 294 | 
            +
            ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
         | 
| 203 295 | 
             
            Warming up --------------------------------------
         | 
| 204 | 
            -
             | 
| 205 | 
            -
             | 
| 206 | 
            -
            selma- | 
| 296 | 
            +
                     nokogiri-lg     1.000 i/100ms
         | 
| 297 | 
            +
                   nokolexbor-lg     1.000 i/100ms
         | 
| 298 | 
            +
                        selma-lg     1.000 i/100ms
         | 
| 207 299 | 
             
            Calculating -------------------------------------
         | 
| 208 | 
            -
             | 
| 209 | 
            -
             | 
| 210 | 
            -
            selma- | 
| 300 | 
            +
                     nokogiri-lg      0.874 (± 0.0%) i/s -     27.000 in  30.921090s
         | 
| 301 | 
            +
                   nokolexbor-lg      2.227 (± 0.0%) i/s -     67.000 in  30.137903s
         | 
| 302 | 
            +
                        selma-lg      8.354 (± 0.0%) i/s -    251.000 in  30.075227s
         | 
| 303 | 
            +
             | 
| 304 | 
            +
            Comparison:
         | 
| 305 | 
            +
                        selma-lg:        8.4 i/s
         | 
| 306 | 
            +
                   nokolexbor-lg:        2.2 i/s - 3.75x  slower
         | 
| 307 | 
            +
                     nokogiri-lg:        0.9 i/s - 9.56x  slower
         | 
| 211 308 | 
             
            </pre>
         | 
| 212 309 | 
             
            </details>
         | 
| 310 | 
            +
            <!-- prettier-ignore-end -->
         | 
| 213 311 |  | 
| 214 312 | 
             
            ## Contributing
         | 
| 215 313 |  | 
    
        data/ext/selma/Cargo.toml
    CHANGED
    
    | @@ -6,9 +6,12 @@ rust-version = "1.75.0" | |
| 6 6 | 
             
            publish = false
         | 
| 7 7 |  | 
| 8 8 | 
             
            [dependencies]
         | 
| 9 | 
            -
            enum-iterator = "1 | 
| 9 | 
            +
            enum-iterator = "2.1"
         | 
| 10 10 | 
             
            escapist = "0.0.2"
         | 
| 11 | 
            -
            magnus = "0.6"
         | 
| 11 | 
            +
            magnus = { version = "0.6", features = ["rb-sys"] }
         | 
| 12 | 
            +
            rb-sys = { version = "*", default-features = false, features = [
         | 
| 13 | 
            +
                "stable-api-compiled-fallback",
         | 
| 14 | 
            +
            ] }
         | 
| 12 15 | 
             
            lol_html = "1.2"
         | 
| 13 16 |  | 
| 14 17 | 
             
            [lib]
         | 
| @@ -119,11 +119,13 @@ impl SelmaHTMLElement { | |
| 119 119 | 
             
                            .iter()
         | 
| 120 120 | 
             
                            .for_each(|attr| match hash.aset(attr.name(), attr.value()) {
         | 
| 121 121 | 
             
                                Ok(_) => {}
         | 
| 122 | 
            -
                                Err(err) =>  | 
| 123 | 
            -
                                     | 
| 124 | 
            -
                                     | 
| 125 | 
            -
             | 
| 126 | 
            -
             | 
| 122 | 
            +
                                Err(err) => panic!(
         | 
| 123 | 
            +
                                    "{:?}",
         | 
| 124 | 
            +
                                    Error::new(
         | 
| 125 | 
            +
                                        exception::runtime_error(),
         | 
| 126 | 
            +
                                        format!("AttributeNameError: {err:?}"),
         | 
| 127 | 
            +
                                    )
         | 
| 128 | 
            +
                                ),
         | 
| 127 129 | 
             
                            });
         | 
| 128 130 | 
             
                    }
         | 
| 129 131 | 
             
                    Ok(hash)
         | 
| @@ -139,7 +141,10 @@ impl SelmaHTMLElement { | |
| 139 141 | 
             
                        .for_each(|ancestor| match array.push(RString::new(ancestor)) {
         | 
| 140 142 | 
             
                            Ok(_) => {}
         | 
| 141 143 | 
             
                            Err(err) => {
         | 
| 142 | 
            -
                                 | 
| 144 | 
            +
                                panic!(
         | 
| 145 | 
            +
                                    "{:?}",
         | 
| 146 | 
            +
                                    Error::new(exception::runtime_error(), format!("{err:?}"))
         | 
| 147 | 
            +
                                )
         | 
| 143 148 | 
             
                            }
         | 
| 144 149 | 
             
                        });
         | 
| 145 150 |  | 
| @@ -1,15 +1,17 @@ | |
| 1 | 
            -
            use std::{ | 
| 1 | 
            +
            use std::{
         | 
| 2 | 
            +
                marker::PhantomData,
         | 
| 3 | 
            +
                sync::{Arc, Mutex},
         | 
| 4 | 
            +
            };
         | 
| 2 5 |  | 
| 3 | 
            -
            // NOTE:  | 
| 4 | 
            -
            // but it was taken from https://github.com/cloudflare/lol-html/blob/1a1ab2e2bf896f815fe8888ed78ccdf46d7c6b85/js-api/src/lib.rs#LL38
         | 
| 6 | 
            +
            // NOTE: this was taken from https://github.com/cloudflare/lol-html/blob/1a1ab2e2bf896f815fe8888ed78ccdf46d7c6b85/js-api/src/lib.rs#LL38
         | 
| 5 7 |  | 
| 6 8 | 
             
            pub struct Anchor<'r> {
         | 
| 7 | 
            -
                poisoned:  | 
| 9 | 
            +
                poisoned: Arc<Mutex<bool>>,
         | 
| 8 10 | 
             
                lifetime: PhantomData<&'r mut ()>,
         | 
| 9 11 | 
             
            }
         | 
| 10 12 |  | 
| 11 13 | 
             
            impl<'r> Anchor<'r> {
         | 
| 12 | 
            -
                pub fn new(poisoned:  | 
| 14 | 
            +
                pub fn new(poisoned: Arc<Mutex<bool>>) -> Self {
         | 
| 13 15 | 
             
                    Anchor {
         | 
| 14 16 | 
             
                        poisoned,
         | 
| 15 17 | 
             
                        lifetime: PhantomData,
         | 
| @@ -19,7 +21,7 @@ impl<'r> Anchor<'r> { | |
| 19 21 |  | 
| 20 22 | 
             
            // impl Drop for Anchor<'_> {
         | 
| 21 23 | 
             
            //     fn drop(&mut self) {
         | 
| 22 | 
            -
            //         self.poisoned. | 
| 24 | 
            +
            //         *self.poisoned.lock().unwrap() = true;
         | 
| 23 25 | 
             
            //     }
         | 
| 24 26 | 
             
            // }
         | 
| 25 27 |  | 
| @@ -31,17 +33,17 @@ impl<'r> Anchor<'r> { | |
| 31 33 | 
             
            // object results in exception.
         | 
| 32 34 | 
             
            pub struct NativeRefWrap<R> {
         | 
| 33 35 | 
             
                inner_ptr: *mut R,
         | 
| 34 | 
            -
                poisoned:  | 
| 36 | 
            +
                poisoned: Arc<Mutex<bool>>,
         | 
| 35 37 | 
             
            }
         | 
| 36 38 |  | 
| 37 39 | 
             
            impl<R> NativeRefWrap<R> {
         | 
| 38 40 | 
             
                pub fn wrap<I>(inner: &I) -> (Self, Anchor) {
         | 
| 39 41 | 
             
                    let wrap = NativeRefWrap {
         | 
| 40 42 | 
             
                        inner_ptr: inner as *const I as *mut R,
         | 
| 41 | 
            -
                        poisoned:  | 
| 43 | 
            +
                        poisoned: Arc::new(Mutex::new(false)),
         | 
| 42 44 | 
             
                    };
         | 
| 43 45 |  | 
| 44 | 
            -
                    let anchor = Anchor::new( | 
| 46 | 
            +
                    let anchor = Anchor::new(Arc::clone(&wrap.poisoned));
         | 
| 45 47 |  | 
| 46 48 | 
             
                    (wrap, anchor)
         | 
| 47 49 | 
             
                }
         | 
| @@ -49,10 +51,10 @@ impl<R> NativeRefWrap<R> { | |
| 49 51 | 
             
                pub fn wrap_mut<I>(inner: &mut I) -> (Self, Anchor) {
         | 
| 50 52 | 
             
                    let wrap = NativeRefWrap {
         | 
| 51 53 | 
             
                        inner_ptr: inner as *mut I as *mut R,
         | 
| 52 | 
            -
                        poisoned:  | 
| 54 | 
            +
                        poisoned: Arc::new(Mutex::new(false)),
         | 
| 53 55 | 
             
                    };
         | 
| 54 56 |  | 
| 55 | 
            -
                    let anchor = Anchor::new( | 
| 57 | 
            +
                    let anchor = Anchor::new(Arc::clone(&wrap.poisoned));
         | 
| 56 58 |  | 
| 57 59 | 
             
                    (wrap, anchor)
         | 
| 58 60 | 
             
                }
         | 
| @@ -70,7 +72,8 @@ impl<R> NativeRefWrap<R> { | |
| 70 72 | 
             
                }
         | 
| 71 73 |  | 
| 72 74 | 
             
                fn assert_not_poisoned(&self) -> Result<(), &'static str> {
         | 
| 73 | 
            -
                     | 
| 75 | 
            +
                    let lock = self.poisoned.lock().unwrap();
         | 
| 76 | 
            +
                    if *lock {
         | 
| 74 77 | 
             
                        Err("The object has been freed and can't be used anymore.")
         | 
| 75 78 | 
             
                    } else {
         | 
| 76 79 | 
             
                        Ok(())
         |