selma 0.2.2 → 0.4.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +129 -124
- data/README.md +133 -25
- data/ext/selma/Cargo.toml +6 -3
- data/ext/selma/src/html/element.rs +32 -27
- data/ext/selma/src/html/end_tag.rs +5 -5
- data/ext/selma/src/html/text_chunk.rs +55 -12
- data/ext/selma/src/native_ref_wrap.rs +30 -33
- data/ext/selma/src/rewriter.rs +299 -139
- data/ext/selma/src/sanitizer.rs +256 -138
- data/lib/selma/config.rb +12 -0
- data/lib/selma/html/element.rb +11 -0
- data/lib/selma/html.rb +2 -0
- data/lib/selma/sanitizer/config/default.rb +1 -1
- data/lib/selma/sanitizer/config/relaxed.rb +1 -0
- data/lib/selma/sanitizer/config.rb +2 -2
- data/lib/selma/sanitizer.rb +0 -77
- data/lib/selma/version.rb +1 -1
- metadata +9 -7
    
        data/README.md
    CHANGED
    
    | @@ -76,7 +76,7 @@ attributes: { | |
| 76 76 |  | 
| 77 77 | 
             
            # URL handling protocols to allow in specific attributes. By default, no
         | 
| 78 78 | 
             
            # protocols are allowed. Use :relative in place of a protocol if you want
         | 
| 79 | 
            -
            # to allow relative URLs sans protocol.
         | 
| 79 | 
            +
            # to allow relative URLs sans protocol. Set to `:all` to allow any protocol.
         | 
| 80 80 | 
             
            protocols: {
         | 
| 81 81 | 
             
                "a" => { "href" => ["http", "https", "mailto", :relative] },
         | 
| 82 82 | 
             
                "img" => { "href" => ["http", "https"] },
         | 
| @@ -103,7 +103,11 @@ Here's an example which rewrites the `href` attribute on `a` and the `src` attri | |
| 103 103 |  | 
| 104 104 | 
             
            ```ruby
         | 
| 105 105 | 
             
            class MatchAttribute
         | 
| 106 | 
            -
              SELECTOR = Selma::Selector(match_element: %(a[href^="http:"], img[src^="http:"]"))
         | 
| 106 | 
            +
              SELECTOR = Selma::Selector.new(match_element: %(a[href^="http:"], img[src^="http:"]))
         | 
| 107 | 
            +
             | 
| 108 | 
            +
              def selector
         | 
| 109 | 
            +
                SELECTOR
         | 
| 110 | 
            +
              end
         | 
| 107 111 |  | 
| 108 112 | 
             
              def handle_element(element)
         | 
| 109 113 | 
             
                if element.tag_name == "a"
         | 
| @@ -130,7 +134,6 @@ The `Selma::Selector` object has three possible kwargs: | |
| 130 134 | 
             
            Here's an example for `handle_text_chunk` which changes strings in various elements which are _not_ `pre` or `code`:
         | 
| 131 135 |  | 
| 132 136 | 
             
            ```ruby
         | 
| 133 | 
            -
             | 
| 134 137 | 
             
            class MatchText
         | 
| 135 138 | 
             
              SELECTOR = Selma::Selector.new(match_text_within: "*", ignore_text_within: ["pre", "code"])
         | 
| 136 139 |  | 
| @@ -176,40 +179,145 @@ The `element` argument in `handle_element` has the following methods: | |
| 176 179 | 
             
            - `after(content, as: content_type)`: Inserts `content` after the text. `content_type` is either `:text` or `:html` and determines how the content will be applied.
         | 
| 177 180 | 
             
            - `replace(content, as: content_type)`: Replaces the text node with `content`. `content_type` is either `:text` or `:html` and determines how the content will be applied.
         | 
| 178 181 |  | 
| 182 | 
            +
            ## Security
         | 
| 183 | 
            +
             | 
| 184 | 
            +
            Theoretically, a malicious user can provide a very large document for processing, which can exhaust the memory of the host machine. To set a limit on how much string content is processed at once, you can provide `memory` options:
         | 
| 185 | 
            +
             | 
| 186 | 
            +
            ```ruby
         | 
| 187 | 
            +
            Selma::Rewriter.new(options: { memory: { max_allowed_memory_usage: 1_000_000 } }) # ~1MB
         | 
| 188 | 
            +
            ```
         | 
| 189 | 
            +
             | 
| 190 | 
            +
            The structure of the `memory` options looks like this:
         | 
| 191 | 
            +
             | 
| 192 | 
            +
            ```ruby
         | 
| 193 | 
            +
            {
         | 
| 194 | 
            +
              memory: {
         | 
| 195 | 
            +
                max_allowed_memory_usage: 1000,
         | 
| 196 | 
            +
                preallocated_parsing_buffer_size: 100,
         | 
| 197 | 
            +
              }
         | 
| 198 | 
            +
            }
         | 
| 199 | 
            +
            ```
         | 
| 200 | 
            +
             | 
| 201 | 
            +
            Note that `preallocated_parsing_buffer_size` must always be less than `max_allowed_memory_usage`. See [the `lol_html` project documentation](https://docs.rs/lol_html/1.2.1/lol_html/struct.MemorySettings.html) to learn more about the default values.
         | 
| 202 | 
            +
             | 
| 179 203 | 
             
            ## Benchmarks
         | 
| 180 204 |  | 
| 205 | 
            +
            When you run `bundle exec rake benchmark`, two different benchmarks are calculated. Here are those results on my machine.
         | 
| 206 | 
            +
             | 
| 207 | 
            +
            ### Benchmarks for just the sanitization process
         | 
| 208 | 
            +
             | 
| 209 | 
            +
            Comparing Selma against popular Ruby sanitization gems:
         | 
| 210 | 
            +
             | 
| 211 | 
            +
            <!-- prettier-ignore-start -->
         | 
| 212 | 
            +
            <details>
         | 
| 213 | 
            +
            <pre>
         | 
| 214 | 
            +
            input size = 25309 bytes, 0.03 MB
         | 
| 215 | 
            +
             | 
| 216 | 
            +
            ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
         | 
| 217 | 
            +
            Warming up --------------------------------------
         | 
| 218 | 
            +
            sanitize-sm 15.000 i/100ms
         | 
| 219 | 
            +
            selma-sm 127.000 i/100ms
         | 
| 220 | 
            +
            Calculating -------------------------------------
         | 
| 221 | 
            +
            sanitize-sm 157.643 (± 1.9%) i/s - 4.740k in 30.077172s
         | 
| 222 | 
            +
            selma-sm 1.278k (± 1.5%) i/s - 38.354k in 30.019722s
         | 
| 223 | 
            +
             | 
| 224 | 
            +
            Comparison:
         | 
| 225 | 
            +
            selma-sm: 1277.9 i/s
         | 
| 226 | 
            +
            sanitize-sm: 157.6 i/s - 8.11x slower
         | 
| 227 | 
            +
             | 
| 228 | 
            +
            input size = 86686 bytes, 0.09 MB
         | 
| 229 | 
            +
             | 
| 230 | 
            +
            ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
         | 
| 231 | 
            +
            Warming up --------------------------------------
         | 
| 232 | 
            +
            sanitize-md 4.000 i/100ms
         | 
| 233 | 
            +
            selma-md 33.000 i/100ms
         | 
| 234 | 
            +
            Calculating -------------------------------------
         | 
| 235 | 
            +
            sanitize-md 40.034 (± 5.0%) i/s - 1.200k in 30.043322s
         | 
| 236 | 
            +
            selma-md 332.959 (± 2.1%) i/s - 9.999k in 30.045733s
         | 
| 237 | 
            +
             | 
| 238 | 
            +
            Comparison:
         | 
| 239 | 
            +
            selma-md: 333.0 i/s
         | 
| 240 | 
            +
            sanitize-md: 40.0 i/s - 8.32x slower
         | 
| 241 | 
            +
             | 
| 242 | 
            +
            input size = 7172510 bytes, 7.17 MB
         | 
| 243 | 
            +
             | 
| 244 | 
            +
            ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
         | 
| 245 | 
            +
            Warming up --------------------------------------
         | 
| 246 | 
            +
            sanitize-lg 1.000 i/100ms
         | 
| 247 | 
            +
            selma-lg 1.000 i/100ms
         | 
| 248 | 
            +
            Calculating -------------------------------------
         | 
| 249 | 
            +
            sanitize-lg 0.141 (± 0.0%) i/s - 5.000 in 35.426127s
         | 
| 250 | 
            +
            selma-lg 3.963 (± 0.0%) i/s - 119.000 in 30.037386s
         | 
| 251 | 
            +
             | 
| 252 | 
            +
            Comparison:
         | 
| 253 | 
            +
            selma-lg: 4.0 i/s
         | 
| 254 | 
            +
            sanitize-lg: 0.1 i/s - 28.03x slower
         | 
| 255 | 
            +
             | 
| 256 | 
            +
            </pre>
         | 
| 257 | 
            +
            </details>
         | 
| 258 | 
            +
            <!-- prettier-ignore-end -->
         | 
| 259 | 
            +
             | 
| 260 | 
            +
            ### Benchmarks for just the rewriting process
         | 
| 261 | 
            +
             | 
| 262 | 
            +
            Comparing Selma against popular Ruby HTML parsing gems:
         | 
| 263 | 
            +
             | 
| 264 | 
            +
            <!-- prettier-ignore-start -->
         | 
| 181 265 | 
             
            <details>
         | 
| 182 266 | 
             
            <pre>
         | 
| 183 | 
            -
             | 
| 184 | 
            -
             | 
| 267 | 
            +
            input size = 25309 bytes, 0.03 MB
         | 
| 268 | 
            +
             | 
| 269 | 
            +
            ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
         | 
| 185 270 | 
             
            Warming up --------------------------------------
         | 
| 186 | 
            -
             | 
| 187 | 
            -
             | 
| 188 | 
            -
             | 
| 271 | 
            +
            nokogiri-sm 79.000 i/100ms
         | 
| 272 | 
            +
            nokolexbor-sm 295.000 i/100ms
         | 
| 273 | 
            +
            selma-sm 237.000 i/100ms
         | 
| 189 274 | 
             
            Calculating -------------------------------------
         | 
| 190 | 
            -
             | 
| 191 | 
            -
             | 
| 192 | 
            -
             | 
| 275 | 
            +
            nokogiri-sm 800.531 (± 2.2%) i/s - 24.016k in 30.016056s
         | 
| 276 | 
            +
            nokolexbor-sm 3.033k (± 3.6%) i/s - 91.155k in 30.094884s
         | 
| 277 | 
            +
            selma-sm 2.386k (± 1.6%) i/s - 71.574k in 30.001701s
         | 
| 278 | 
            +
             | 
| 279 | 
            +
            Comparison:
         | 
| 280 | 
            +
            nokolexbor-sm: 3033.1 i/s
         | 
| 281 | 
            +
            selma-sm: 2386.3 i/s - 1.27x slower
         | 
| 282 | 
            +
            nokogiri-sm: 800.5 i/s - 3.79x slower
         | 
| 283 | 
            +
             | 
| 284 | 
            +
            input size = 86686 bytes, 0.09 MB
         | 
| 285 | 
            +
             | 
| 286 | 
            +
            ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
         | 
| 193 287 | 
             
            Warming up --------------------------------------
         | 
| 194 | 
            -
             | 
| 195 | 
            -
             | 
| 196 | 
            -
            selma- | 
| 197 | 
            -
                                    22.000  i/100ms
         | 
| 288 | 
            +
            nokogiri-md 8.000 i/100ms
         | 
| 289 | 
            +
            nokolexbor-md 43.000 i/100ms
         | 
| 290 | 
            +
            selma-md 38.000 i/100ms
         | 
| 198 291 | 
             
            Calculating -------------------------------------
         | 
| 199 | 
            -
             | 
| 200 | 
            -
             | 
| 201 | 
            -
            selma- | 
| 202 | 
            -
             | 
| 292 | 
            +
            nokogiri-md 85.013 (± 8.2%) i/s - 2.024k in 52.257472s
         | 
| 293 | 
            +
            nokolexbor-md 416.074 (±11.1%) i/s - 12.341k in 30.111613s
         | 
| 294 | 
            +
            selma-md 361.471 (± 4.7%) i/s - 10.830k in 30.033997s
         | 
| 295 | 
            +
             | 
| 296 | 
            +
            Comparison:
         | 
| 297 | 
            +
            nokolexbor-md: 416.1 i/s
         | 
| 298 | 
            +
            selma-md: 361.5 i/s - same-ish: difference falls within error
         | 
| 299 | 
            +
            nokogiri-md: 85.0 i/s - 4.89x slower
         | 
| 300 | 
            +
             | 
| 301 | 
            +
            input size = 7172510 bytes, 7.17 MB
         | 
| 302 | 
            +
             | 
| 303 | 
            +
            ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
         | 
| 203 304 | 
             
            Warming up --------------------------------------
         | 
| 204 | 
            -
             | 
| 205 | 
            -
             | 
| 206 | 
            -
            selma- | 
| 305 | 
            +
            nokogiri-lg 1.000 i/100ms
         | 
| 306 | 
            +
            nokolexbor-lg 1.000 i/100ms
         | 
| 307 | 
            +
            selma-lg 1.000 i/100ms
         | 
| 207 308 | 
             
            Calculating -------------------------------------
         | 
| 208 | 
            -
             | 
| 209 | 
            -
             | 
| 210 | 
            -
            selma- | 
| 309 | 
            +
            nokogiri-lg 0.805 (± 0.0%) i/s - 25.000 in 31.148730s
         | 
| 310 | 
            +
            nokolexbor-lg 2.194 (± 0.0%) i/s - 66.000 in 30.278108s
         | 
| 311 | 
            +
            selma-lg 5.541 (± 0.0%) i/s - 166.000 in 30.037197s
         | 
| 312 | 
            +
             | 
| 313 | 
            +
            Comparison:
         | 
| 314 | 
            +
            selma-lg: 5.5 i/s
         | 
| 315 | 
            +
            nokolexbor-lg: 2.2 i/s - 2.53x slower
         | 
| 316 | 
            +
            nokogiri-lg: 0.8 i/s - 6.88x slower
         | 
| 317 | 
            +
             | 
| 211 318 | 
             
            </pre>
         | 
| 212 319 | 
             
            </details>
         | 
| 320 | 
            +
            <!-- prettier-ignore-end -->
         | 
| 213 321 |  | 
| 214 322 | 
             
            ## Contributing
         | 
| 215 323 |  | 
    
        data/ext/selma/Cargo.toml
    CHANGED
    
    | @@ -6,10 +6,13 @@ rust-version = "1.75.0" | |
| 6 6 | 
             
            publish = false
         | 
| 7 7 |  | 
| 8 8 | 
             
            [dependencies]
         | 
| 9 | 
            -
            enum-iterator = "1 | 
| 9 | 
            +
            enum-iterator = "2.1"
         | 
| 10 10 | 
             
            escapist = "0.0.2"
         | 
| 11 | 
            -
            magnus = "0. | 
| 12 | 
            -
             | 
| 11 | 
            +
            magnus = { version = "0.7", features = ["rb-sys"] }
         | 
| 12 | 
            +
            rb-sys = { version = "*", default-features = false, features = [
         | 
| 13 | 
            +
                "stable-api-compiled-fallback",
         | 
| 14 | 
            +
            ] }
         | 
| 15 | 
            +
            lol_html = "2.0"
         | 
| 13 16 |  | 
| 14 17 | 
             
            [lib]
         | 
| 15 18 | 
             
            name = "selma"
         | 
| @@ -1,3 +1,5 @@ | |
| 1 | 
            +
            use std::cell::RefCell;
         | 
| 2 | 
            +
             | 
| 1 3 | 
             
            use crate::native_ref_wrap::NativeRefWrap;
         | 
| 2 4 | 
             
            use lol_html::html_content::Element;
         | 
| 3 5 | 
             
            use magnus::{exception, method, Error, Module, RArray, RClass, RHash, RString, Value};
         | 
| @@ -8,16 +10,14 @@ struct HTMLElement { | |
| 8 10 | 
             
            }
         | 
| 9 11 |  | 
| 10 12 | 
             
            #[magnus::wrap(class = "Selma::HTML::Element")]
         | 
| 11 | 
            -
            pub struct SelmaHTMLElement( | 
| 13 | 
            +
            pub struct SelmaHTMLElement(RefCell<HTMLElement>);
         | 
| 12 14 |  | 
| 13 15 | 
             
            /// SAFETY: This is safe because we only access this data when the GVL is held.
         | 
| 14 16 | 
             
            unsafe impl Send for SelmaHTMLElement {}
         | 
| 15 17 |  | 
| 16 18 | 
             
            impl SelmaHTMLElement {
         | 
| 17 | 
            -
                pub fn new( | 
| 18 | 
            -
                     | 
| 19 | 
            -
             | 
| 20 | 
            -
                    Self(std::cell::RefCell::new(HTMLElement {
         | 
| 19 | 
            +
                pub fn new(ref_wrap: NativeRefWrap<Element<'static, 'static>>, ancestors: &[String]) -> Self {
         | 
| 20 | 
            +
                    Self(RefCell::new(HTMLElement {
         | 
| 21 21 | 
             
                        element: ref_wrap,
         | 
| 22 22 | 
             
                        ancestors: ancestors.to_owned(),
         | 
| 23 23 | 
             
                    }))
         | 
| @@ -26,13 +26,12 @@ impl SelmaHTMLElement { | |
| 26 26 | 
             
                fn tag_name(&self) -> Result<String, Error> {
         | 
| 27 27 | 
             
                    let binding = self.0.borrow();
         | 
| 28 28 |  | 
| 29 | 
            -
                     | 
| 30 | 
            -
                        Ok(e.tag_name())
         | 
| 31 | 
            -
             | 
| 32 | 
            -
                        Err(Error::new(
         | 
| 29 | 
            +
                    match binding.element.get() {
         | 
| 30 | 
            +
                        Ok(e) => Ok(e.tag_name().to_string()),
         | 
| 31 | 
            +
                        Err(_) => Err(Error::new(
         | 
| 33 32 | 
             
                            exception::runtime_error(),
         | 
| 34 33 | 
             
                            "`tag_name` is not available",
         | 
| 35 | 
            -
                        ))
         | 
| 34 | 
            +
                        )),
         | 
| 36 35 | 
             
                    }
         | 
| 37 36 | 
             
                }
         | 
| 38 37 |  | 
| @@ -119,11 +118,13 @@ impl SelmaHTMLElement { | |
| 119 118 | 
             
                            .iter()
         | 
| 120 119 | 
             
                            .for_each(|attr| match hash.aset(attr.name(), attr.value()) {
         | 
| 121 120 | 
             
                                Ok(_) => {}
         | 
| 122 | 
            -
                                Err(err) =>  | 
| 123 | 
            -
                                     | 
| 124 | 
            -
                                     | 
| 125 | 
            -
             | 
| 126 | 
            -
             | 
| 121 | 
            +
                                Err(err) => panic!(
         | 
| 122 | 
            +
                                    "{:?}",
         | 
| 123 | 
            +
                                    Error::new(
         | 
| 124 | 
            +
                                        exception::runtime_error(),
         | 
| 125 | 
            +
                                        format!("AttributeNameError: {err:?}"),
         | 
| 126 | 
            +
                                    )
         | 
| 127 | 
            +
                                ),
         | 
| 127 128 | 
             
                            });
         | 
| 128 129 | 
             
                    }
         | 
| 129 130 | 
             
                    Ok(hash)
         | 
| @@ -139,7 +140,10 @@ impl SelmaHTMLElement { | |
| 139 140 | 
             
                        .for_each(|ancestor| match array.push(RString::new(ancestor)) {
         | 
| 140 141 | 
             
                            Ok(_) => {}
         | 
| 141 142 | 
             
                            Err(err) => {
         | 
| 142 | 
            -
                                 | 
| 143 | 
            +
                                panic!(
         | 
| 144 | 
            +
                                    "{:?}",
         | 
| 145 | 
            +
                                    Error::new(exception::runtime_error(), format!("{err:?}"))
         | 
| 146 | 
            +
                                )
         | 
| 143 147 | 
             
                            }
         | 
| 144 148 | 
             
                        });
         | 
| 145 149 |  | 
| @@ -224,24 +228,25 @@ impl SelmaHTMLElement { | |
| 224 228 | 
             
                    }
         | 
| 225 229 | 
             
                }
         | 
| 226 230 |  | 
| 227 | 
            -
                fn remove_and_keep_content(&self) {
         | 
| 228 | 
            -
                     | 
| 229 | 
            -
             | 
| 230 | 
            -
             | 
| 231 | 
            -
                         | 
| 232 | 
            -
             | 
| 231 | 
            +
                fn remove_and_keep_content(&self) -> Result<(), Error> {
         | 
| 232 | 
            +
                    self.0
         | 
| 233 | 
            +
                        .borrow_mut()
         | 
| 234 | 
            +
                        .element
         | 
| 235 | 
            +
                        .get_mut()
         | 
| 236 | 
            +
                        .unwrap()
         | 
| 237 | 
            +
                        .remove_and_keep_content();
         | 
| 238 | 
            +
                    Ok(())
         | 
| 233 239 | 
             
                }
         | 
| 234 240 |  | 
| 235 241 | 
             
                fn is_removed(&self) -> Result<bool, Error> {
         | 
| 236 242 | 
             
                    let binding = self.0.borrow();
         | 
| 237 243 |  | 
| 238 | 
            -
                     | 
| 239 | 
            -
                        Ok(e.removed())
         | 
| 240 | 
            -
             | 
| 241 | 
            -
                        Err(Error::new(
         | 
| 244 | 
            +
                    match binding.element.get() {
         | 
| 245 | 
            +
                        Ok(e) => Ok(e.removed()),
         | 
| 246 | 
            +
                        Err(_) => Err(Error::new(
         | 
| 242 247 | 
             
                            exception::runtime_error(),
         | 
| 243 248 | 
             
                            "`is_removed` is not available",
         | 
| 244 | 
            -
                        ))
         | 
| 249 | 
            +
                        )),
         | 
| 245 250 | 
             
                    }
         | 
| 246 251 | 
             
                }
         | 
| 247 252 | 
             
            }
         | 
| @@ -1,3 +1,5 @@ | |
| 1 | 
            +
            use std::cell::RefCell;
         | 
| 2 | 
            +
             | 
| 1 3 | 
             
            use crate::native_ref_wrap::NativeRefWrap;
         | 
| 2 4 | 
             
            use lol_html::html_content::EndTag;
         | 
| 3 5 | 
             
            use magnus::{method, Error, Module, RClass};
         | 
| @@ -7,16 +9,14 @@ struct HTMLEndTag { | |
| 7 9 | 
             
            }
         | 
| 8 10 |  | 
| 9 11 | 
             
            #[magnus::wrap(class = "Selma::HTML::EndTag")]
         | 
| 10 | 
            -
            pub struct SelmaHTMLEndTag( | 
| 12 | 
            +
            pub struct SelmaHTMLEndTag(RefCell<HTMLEndTag>);
         | 
| 11 13 |  | 
| 12 14 | 
             
            /// SAFETY: This is safe because we only access this data when the GVL is held.
         | 
| 13 15 | 
             
            unsafe impl Send for SelmaHTMLEndTag {}
         | 
| 14 16 |  | 
| 15 17 | 
             
            impl SelmaHTMLEndTag {
         | 
| 16 | 
            -
                pub fn new( | 
| 17 | 
            -
                     | 
| 18 | 
            -
             | 
| 19 | 
            -
                    Self(std::cell::RefCell::new(HTMLEndTag { end_tag: ref_wrap }))
         | 
| 18 | 
            +
                pub fn new(ref_wrap: NativeRefWrap<EndTag<'static>>) -> Self {
         | 
| 19 | 
            +
                    Self(RefCell::new(HTMLEndTag { end_tag: ref_wrap }))
         | 
| 20 20 | 
             
                }
         | 
| 21 21 |  | 
| 22 22 | 
             
                fn tag_name(&self) -> String {
         | 
| @@ -1,23 +1,44 @@ | |
| 1 | 
            +
            use std::cell::RefCell;
         | 
| 2 | 
            +
             | 
| 1 3 | 
             
            use crate::native_ref_wrap::NativeRefWrap;
         | 
| 2 4 | 
             
            use lol_html::html_content::{TextChunk, TextType};
         | 
| 3 5 | 
             
            use magnus::{exception, method, Error, Module, RClass, Symbol, Value};
         | 
| 4 6 |  | 
| 5 7 | 
             
            struct HTMLTextChunk {
         | 
| 6 8 | 
             
                text_chunk: NativeRefWrap<TextChunk<'static>>,
         | 
| 9 | 
            +
                buffer: String,
         | 
| 10 | 
            +
            }
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            macro_rules! clone_buffer_if_not_empty {
         | 
| 13 | 
            +
                ($binding:expr, $buffer:expr) => {
         | 
| 14 | 
            +
                    if !$binding.buffer.is_empty() {
         | 
| 15 | 
            +
                        $buffer.clone_from(&$binding.buffer);
         | 
| 16 | 
            +
                    }
         | 
| 17 | 
            +
                };
         | 
| 18 | 
            +
            }
         | 
| 19 | 
            +
             | 
| 20 | 
            +
            // if this is the first time we're processing this text chunk (buffer is empty),
         | 
| 21 | 
            +
            // we carry on. Otherwise, we need to use the buffer text, not the text chunk,
         | 
| 22 | 
            +
            // because lol-html is not designed in such a way to keep track of text chunks.
         | 
| 23 | 
            +
            macro_rules! set_text_chunk_to_buffer {
         | 
| 24 | 
            +
                ($text_chunk:expr, $buffer:expr) => {
         | 
| 25 | 
            +
                    if !$buffer.is_empty() {
         | 
| 26 | 
            +
                        $text_chunk.set_str($buffer);
         | 
| 27 | 
            +
                    }
         | 
| 28 | 
            +
                };
         | 
| 7 29 | 
             
            }
         | 
| 8 30 |  | 
| 9 31 | 
             
            #[magnus::wrap(class = "Selma::HTML::TextChunk")]
         | 
| 10 | 
            -
            pub struct SelmaHTMLTextChunk( | 
| 32 | 
            +
            pub struct SelmaHTMLTextChunk(RefCell<HTMLTextChunk>);
         | 
| 11 33 |  | 
| 12 34 | 
             
            /// SAFETY: This is safe because we only access this data when the GVL is held.
         | 
| 13 35 | 
             
            unsafe impl Send for SelmaHTMLTextChunk {}
         | 
| 14 36 |  | 
| 15 37 | 
             
            impl SelmaHTMLTextChunk {
         | 
| 16 | 
            -
                pub fn new( | 
| 17 | 
            -
                     | 
| 18 | 
            -
             | 
| 19 | 
            -
                    Self(std::cell::RefCell::new(HTMLTextChunk {
         | 
| 38 | 
            +
                pub fn new(ref_wrap: NativeRefWrap<TextChunk<'static>>) -> Self {
         | 
| 39 | 
            +
                    Self(RefCell::new(HTMLTextChunk {
         | 
| 20 40 | 
             
                        text_chunk: ref_wrap,
         | 
| 41 | 
            +
                        buffer: String::new(),
         | 
| 21 42 | 
             
                    }))
         | 
| 22 43 | 
             
                }
         | 
| 23 44 |  | 
| @@ -54,7 +75,19 @@ impl SelmaHTMLTextChunk { | |
| 54 75 | 
             
                    }
         | 
| 55 76 | 
             
                }
         | 
| 56 77 |  | 
| 57 | 
            -
                fn  | 
| 78 | 
            +
                fn is_removed(&self) -> Result<bool, Error> {
         | 
| 79 | 
            +
                    let binding = self.0.borrow();
         | 
| 80 | 
            +
             | 
| 81 | 
            +
                    match binding.text_chunk.get() {
         | 
| 82 | 
            +
                        Ok(tc) => Ok(tc.removed()),
         | 
| 83 | 
            +
                        Err(_) => Err(Error::new(
         | 
| 84 | 
            +
                            exception::runtime_error(),
         | 
| 85 | 
            +
                            "`is_removed` is not available",
         | 
| 86 | 
            +
                        )),
         | 
| 87 | 
            +
                    }
         | 
| 88 | 
            +
                }
         | 
| 89 | 
            +
             | 
| 90 | 
            +
                fn before(&self, args: &[Value]) -> Result<String, Error> {
         | 
| 58 91 | 
             
                    let mut binding = self.0.borrow_mut();
         | 
| 59 92 | 
             
                    let text_chunk = binding.text_chunk.get_mut().unwrap();
         | 
| 60 93 |  | 
| @@ -65,10 +98,10 @@ impl SelmaHTMLTextChunk { | |
| 65 98 |  | 
| 66 99 | 
             
                    text_chunk.before(&text_str, content_type);
         | 
| 67 100 |  | 
| 68 | 
            -
                    Ok(())
         | 
| 101 | 
            +
                    Ok(text_chunk.as_str().to_string())
         | 
| 69 102 | 
             
                }
         | 
| 70 103 |  | 
| 71 | 
            -
                fn after(&self, args: &[Value]) -> Result< | 
| 104 | 
            +
                fn after(&self, args: &[Value]) -> Result<String, Error> {
         | 
| 72 105 | 
             
                    let mut binding = self.0.borrow_mut();
         | 
| 73 106 | 
             
                    let text_chunk = binding.text_chunk.get_mut().unwrap();
         | 
| 74 107 |  | 
| @@ -79,21 +112,30 @@ impl SelmaHTMLTextChunk { | |
| 79 112 |  | 
| 80 113 | 
             
                    text_chunk.after(&text_str, content_type);
         | 
| 81 114 |  | 
| 82 | 
            -
                    Ok(())
         | 
| 115 | 
            +
                    Ok(text_chunk.as_str().to_string())
         | 
| 83 116 | 
             
                }
         | 
| 84 117 |  | 
| 85 | 
            -
                fn replace(&self, args: &[Value]) -> Result<(), Error> {
         | 
| 118 | 
            +
                fn replace(&self, args: &[Value]) -> Result<String, Error> {
         | 
| 86 119 | 
             
                    let mut binding = self.0.borrow_mut();
         | 
| 120 | 
            +
                    let mut buffer = String::new();
         | 
| 121 | 
            +
             | 
| 122 | 
            +
                    clone_buffer_if_not_empty!(binding, buffer);
         | 
| 123 | 
            +
             | 
| 87 124 | 
             
                    let text_chunk = binding.text_chunk.get_mut().unwrap();
         | 
| 88 125 |  | 
| 126 | 
            +
                    set_text_chunk_to_buffer!(text_chunk, buffer);
         | 
| 127 | 
            +
             | 
| 89 128 | 
             
                    let (text_str, content_type) = match crate::scan_text_args(args) {
         | 
| 90 129 | 
             
                        Ok((text_str, content_type)) => (text_str, content_type),
         | 
| 91 130 | 
             
                        Err(err) => return Err(err),
         | 
| 92 131 | 
             
                    };
         | 
| 93 | 
            -
             | 
| 94 132 | 
             
                    text_chunk.replace(&text_str, content_type);
         | 
| 95 133 |  | 
| 96 | 
            -
                    Ok(())
         | 
| 134 | 
            +
                    text_chunk.set_str(text_str.clone());
         | 
| 135 | 
            +
             | 
| 136 | 
            +
                    binding.buffer = text_chunk.as_str().to_string();
         | 
| 137 | 
            +
             | 
| 138 | 
            +
                    Ok(text_str)
         | 
| 97 139 | 
             
                }
         | 
| 98 140 | 
             
            }
         | 
| 99 141 |  | 
| @@ -108,6 +150,7 @@ pub fn init(c_html: RClass) -> Result<(), Error> { | |
| 108 150 | 
             
                c_text_chunk.define_method("before", method!(SelmaHTMLTextChunk::before, -1))?;
         | 
| 109 151 | 
             
                c_text_chunk.define_method("after", method!(SelmaHTMLTextChunk::after, -1))?;
         | 
| 110 152 | 
             
                c_text_chunk.define_method("replace", method!(SelmaHTMLTextChunk::replace, -1))?;
         | 
| 153 | 
            +
                c_text_chunk.define_method("removed?", method!(SelmaHTMLTextChunk::is_removed, 0))?;
         | 
| 111 154 |  | 
| 112 155 | 
             
                Ok(())
         | 
| 113 156 | 
             
            }
         | 
| @@ -1,15 +1,18 @@ | |
| 1 | 
            -
            use std::{ | 
| 1 | 
            +
            use std::{
         | 
| 2 | 
            +
                marker::PhantomData,
         | 
| 3 | 
            +
                sync::{Arc, Mutex},
         | 
| 4 | 
            +
            };
         | 
| 2 5 |  | 
| 3 | 
            -
            // NOTE:  | 
| 4 | 
            -
            //  | 
| 6 | 
            +
            // NOTE: this was inspired from
         | 
| 7 | 
            +
            // https://github.com/worker-tools/html-rewriter-wasm/blob/92bafdfa34c809c37036f57cb282184cada3bbc9/src/handlers.rs
         | 
| 5 8 |  | 
| 6 9 | 
             
            pub struct Anchor<'r> {
         | 
| 7 | 
            -
                poisoned:  | 
| 10 | 
            +
                poisoned: Arc<Mutex<bool>>,
         | 
| 8 11 | 
             
                lifetime: PhantomData<&'r mut ()>,
         | 
| 9 12 | 
             
            }
         | 
| 10 13 |  | 
| 11 14 | 
             
            impl<'r> Anchor<'r> {
         | 
| 12 | 
            -
                pub fn new(poisoned:  | 
| 15 | 
            +
                pub fn new(poisoned: Arc<Mutex<bool>>) -> Self {
         | 
| 13 16 | 
             
                    Anchor {
         | 
| 14 17 | 
             
                        poisoned,
         | 
| 15 18 | 
             
                        lifetime: PhantomData,
         | 
| @@ -17,44 +20,46 @@ impl<'r> Anchor<'r> { | |
| 17 20 | 
             
                }
         | 
| 18 21 | 
             
            }
         | 
| 19 22 |  | 
| 20 | 
            -
             | 
| 21 | 
            -
             | 
| 22 | 
            -
             | 
| 23 | 
            -
             | 
| 24 | 
            -
             | 
| 23 | 
            +
            impl Drop for Anchor<'_> {
         | 
| 24 | 
            +
                fn drop(&mut self) {
         | 
| 25 | 
            +
                    *self.poisoned.lock().unwrap() = true;
         | 
| 26 | 
            +
                }
         | 
| 27 | 
            +
            }
         | 
| 25 28 |  | 
| 26 | 
            -
            // NOTE:  | 
| 27 | 
            -
            // we create a wrapper that erases all the lifetime information from the inner reference
         | 
| 29 | 
            +
            // NOTE: So far as I understand it, there's no great way to work between lol_html's lifetimes and FFI.
         | 
| 30 | 
            +
            // To work around that, we create a wrapper that erases all the lifetime information from the inner reference
         | 
| 28 31 | 
             
            // and provides an anchor object that keeps track of the lifetime in the runtime.
         | 
| 29 32 | 
             
            //
         | 
| 30 33 | 
             
            // When anchor goes out of scope, wrapper becomes poisoned and any attempt to get inner
         | 
| 31 34 | 
             
            // object results in exception.
         | 
| 35 | 
            +
            #[derive(Clone)]
         | 
| 32 36 | 
             
            pub struct NativeRefWrap<R> {
         | 
| 33 37 | 
             
                inner_ptr: *mut R,
         | 
| 34 | 
            -
                poisoned:  | 
| 38 | 
            +
                poisoned: Arc<Mutex<bool>>,
         | 
| 35 39 | 
             
            }
         | 
| 36 40 |  | 
| 37 41 | 
             
            impl<R> NativeRefWrap<R> {
         | 
| 38 | 
            -
                pub fn wrap<I>(inner: &I) -> (Self, Anchor) {
         | 
| 42 | 
            +
                pub fn wrap<I>(inner: &mut I) -> (Self, Anchor) {
         | 
| 39 43 | 
             
                    let wrap = NativeRefWrap {
         | 
| 40 | 
            -
                        inner_ptr: inner as * | 
| 41 | 
            -
                        poisoned:  | 
| 44 | 
            +
                        inner_ptr: inner as *mut I as *mut R,
         | 
| 45 | 
            +
                        poisoned: Arc::new(Mutex::new(false)),
         | 
| 42 46 | 
             
                    };
         | 
| 43 47 |  | 
| 44 | 
            -
                    let anchor = Anchor::new( | 
| 48 | 
            +
                    let anchor = Anchor::new(Arc::clone(&wrap.poisoned));
         | 
| 45 49 |  | 
| 46 50 | 
             
                    (wrap, anchor)
         | 
| 47 51 | 
             
                }
         | 
| 48 52 |  | 
| 49 | 
            -
                 | 
| 50 | 
            -
                     | 
| 51 | 
            -
                         | 
| 52 | 
            -
             | 
| 53 | 
            -
             | 
| 54 | 
            -
             | 
| 55 | 
            -
             | 
| 53 | 
            +
                fn assert_not_poisoned(&self) -> Result<(), &'static str> {
         | 
| 54 | 
            +
                    if self.is_poisoned() {
         | 
| 55 | 
            +
                        Err("The object has been freed and can't be used anymore.")
         | 
| 56 | 
            +
                    } else {
         | 
| 57 | 
            +
                        Ok(())
         | 
| 58 | 
            +
                    }
         | 
| 59 | 
            +
                }
         | 
| 56 60 |  | 
| 57 | 
            -
             | 
| 61 | 
            +
                pub fn is_poisoned(&self) -> bool {
         | 
| 62 | 
            +
                    *self.poisoned.lock().unwrap()
         | 
| 58 63 | 
             
                }
         | 
| 59 64 |  | 
| 60 65 | 
             
                pub fn get(&self) -> Result<&R, &'static str> {
         | 
| @@ -68,12 +73,4 @@ impl<R> NativeRefWrap<R> { | |
| 68 73 |  | 
| 69 74 | 
             
                    Ok(unsafe { self.inner_ptr.as_mut() }.unwrap())
         | 
| 70 75 | 
             
                }
         | 
| 71 | 
            -
             | 
| 72 | 
            -
                fn assert_not_poisoned(&self) -> Result<(), &'static str> {
         | 
| 73 | 
            -
                    if self.poisoned.get() {
         | 
| 74 | 
            -
                        Err("The object has been freed and can't be used anymore.")
         | 
| 75 | 
            -
                    } else {
         | 
| 76 | 
            -
                        Ok(())
         | 
| 77 | 
            -
                    }
         | 
| 78 | 
            -
                }
         | 
| 79 76 | 
             
            }
         |