selma 0.0.4-arm64-darwin → 0.0.6-arm64-darwin

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f899592f41387386f5d3710bed4ce837d90ef562f8c4d1be388ffc7d12d611a6
4
- data.tar.gz: e7b29fc1c6ffa7a8543a041c64b12511e1423e6ed966e4da34429a43923b76a2
3
+ metadata.gz: 9efde5c7c8e6fcb8474c6240bb5ff279168346d102a99cff0d46b40a3c5b2581
4
+ data.tar.gz: 175ba5ec1b7364be0021830e26b3463d94dcedad94fc42711384c33535ba4a58
5
5
  SHA512:
6
- metadata.gz: 256f9015d9ccdca3fb3e788c3cbc6d6e7bb174de08b9cfc009b3ee3ec49f33999834d8d83467ffff24e8a6a5a0efa55d70309f6505403cccd7a58cd7591198d4
7
- data.tar.gz: 64ba49bb6934213db43cc34de2886df48209a3097c9cdc540c0f5cc8a79592befe49e7065fe606c6a163193e7829daf2fbf99cb0181a9cefb721d9bdbbbd65a1
6
+ metadata.gz: ba9fc3c87b940dd08a432b3a415e6f0c764575b626f308c52b7827de61fab067c814ca737b4ca7d42a395da4a5fa27539e2c10921bbd75334b5d43d6dda3ba8e
7
+ data.tar.gz: f49a373bb5d0f0758a4473f6a39fcd6b4b1a8e569378cbb119f8d0bfc12bb036e7d96b0c9089e22732c84f1a2d7ee903ed0322105db7668342b84632e5b6034c
data/README.md CHANGED
@@ -24,23 +24,27 @@ Or install it yourself as:
24
24
 
25
25
  ## Usage
26
26
 
27
- Selma can perform two different actions:
27
+ Selma can perform two different actions, either independently or together:
28
28
 
29
29
  - Sanitize HTML, through a [Sanitize](https://github.com/rgrove/sanitize)-like allowlist syntax; and
30
- - Select HTML using CSS rules, and manipulate elements and text
30
+ - Select HTML using CSS rules, and manipulate elements and text nodes along the way.
31
31
 
32
- The basic API for Selma looks like this:
32
+ It does this through two kwargs: `sanitizer` and `handlers`. The basic API for Selma looks like this:
33
33
 
34
34
  ```ruby
35
- rewriter = Selma::Rewriter.new(sanitizer: sanitizer_config, handlers: [MatchAttribute.new, TextRewrite.new])
35
+ sanitizer_config = {
36
+ elements: ["b", "em", "i", "strong", "u"],
37
+ }
38
+ sanitizer = Selma::Sanitizer.new(sanitizer_config)
39
+ rewriter = Selma::Rewriter.new(sanitizer: sanitizer, handlers: [MatchElementRewrite.new, MatchTextRewrite.new])
36
40
  rewriter(html)
37
41
  ```
38
42
 
39
- Let's take a look at each part individually.
43
+ Here's a look at each individual part.
40
44
 
41
45
  ### Sanitization config
42
46
 
43
- Selma sanitizes by default. That is, even if the `sanitizer` kwarg is not passed in, sanitization occurs. If you want to disable HTML sanitization (for some reason), pass `nil`:
47
+ Selma sanitizes by default. That is, even if the `sanitizer` kwarg is not passed in, sanitization occurs. If you truly want to disable HTML sanitization (for some reason), pass `nil`:
44
48
 
45
49
  ```ruby
46
50
  Selma::Rewriter.new(sanitizer: nil) # dangerous and ill-advised
@@ -87,22 +91,22 @@ whitespace_elements: ["blockquote", "h1", "h2", "h3", "h4", "h5", "h6", ]
87
91
 
88
92
  ### Defining handlers
89
93
 
90
- The real power in Selma comes in its use of handlers. A handler is simply an object with various methods:
94
+ The real power in Selma comes in its use of handlers. A handler is simply an object with various methods defined:
91
95
 
92
96
  - `selector`, a method which MUST return an instance of `Selma::Selector`, which defines the CSS rules to match
93
97
  - `handle_element`, a method that's called on each matched element
94
- - `handle_text_chunk`, a method that's called on each matched text node; this MUST return a string
98
+ - `handle_text_chunk`, a method that's called on each matched text node
95
99
 
96
100
  Here's an example which rewrites the `href` attribute on `a` and the `src` attribute on `img` to be `https` rather than `http`.
97
101
 
98
102
  ```ruby
99
103
  class MatchAttribute
100
- SELECTOR = Selma::Selector(match_element: "a, img")
104
+ SELECTOR = Selma::Selector(match_element: %(a[href^="http:"], img[src^="http:"]))
101
105
 
102
106
  def handle_element(element)
103
- if element.tag_name == "a" && element["href"] =~ /^http:/
107
+ if element.tag_name == "a"
104
108
  element["href"] = rename_http(element["href"])
105
- elsif element.tag_name == "img" && element["src"] =~ /^http:/
109
+ elsif element.tag_name == "img"
106
110
  element["src"] = rename_http(element["src"])
107
111
  end
108
112
  end
@@ -118,10 +122,10 @@ rewriter = Selma::Rewriter.new(handlers: [MatchAttribute.new])
118
122
  The `Selma::Selector` object has three possible kwargs:
119
123
 
120
124
  - `match_element`: any element which matches this CSS rule will be passed on to `handle_element`
121
- - `match_text_within`: any element which matches this CSS rule will be passed on to `handle_text_chunk`
125
+ - `match_text_within`: any text_chunk which matches this CSS rule will be passed on to `handle_text_chunk`
122
126
  - `ignore_text_within`: this is an array of element names whose text contents will be ignored
123
127
 
124
- You've seen an example of `match_element`; here's one for `match_text` which changes strings in various elements which are _not_ `pre` or `code`:
128
+ Here's an example for `handle_text_chunk` which changes strings in various elements which are _not_ `pre` or `code`:
125
129
 
126
130
  ```ruby
127
131
 
@@ -144,20 +148,63 @@ rewriter = Selma::Rewriter.new(handlers: [MatchText.new])
144
148
 
145
149
  The `element` argument in `handle_element` has the following methods:
146
150
 
147
- - `tag_name`: The element's name
148
- - `[]`: get an attribute
149
- - `[]=`: set an attribute
150
- - `remove_attribute`: remove an attribute
151
- - `attributes`: list all the attributes
152
- - `ancestors`: list all the ancestors
153
- - `append(content, as: content_type)`: appends `content` to the element's inner content, i.e. inserts content right before the element's end tag. `content_type` is either `:text` or `:html` and determines how the content will be applied.
151
+ - `tag_name`: Gets the element's name
152
+ - `tag_name=`: Sets the element's name
153
+ - `self_closing?`: A bool which identifies whether or not the element is self-closing
154
+ - `[]`: Get an attribute
155
+ - `[]=`: Set an attribute
156
+ - `remove_attribute`: Remove an attribute
157
+ - `has_attribute?`: A bool which identifies whether or not the element has an attribute
158
+ - `attributes`: List all the attributes
159
+ - `ancestors`: List all of an element's ancestors as an array of strings
154
160
  - `before(content, as: content_type)`: Inserts `content` before the element. `content_type` is either `:text` or `:html` and determines how the content will be applied.
155
161
  - `after(content, as: content_type)`: Inserts `content` after the element. `content_type` is either `:text` or `:html` and determines how the content will be applied.
156
- - `set_inner_content`: replaces inner content of the element with `content`. `content_type` is either `:text` or `:html` and determines how the content will be applied.
162
+ - `prepend(content, as: content_type)`: prepends `content` to the element's inner content, i.e. inserts content right after the element's start tag. `content_type` is either `:text` or `:html` and determines how the content will be applied.
163
+ - `append(content, as: content_type)`: appends `content` to the element's inner content, i.e. inserts content right before the element's end tag. `content_type` is either `:text` or `:html` and determines how the content will be applied.
164
+ - `set_inner_content(content, as: content_type)`: Replaces the inner content of the element with `content`. `content_type` is either `:text` or `:html` and determines how the content will be applied.
165
+
166
+ #### `text_chunk` methods
167
+
168
+ - `to_s` / `.content`: Gets the text node's content
169
+ - `text_type`: identifies the type of text in the text node
170
+ - `before(content, as: content_type)`: Inserts `content` before the text. `content_type` is either `:text` or `:html` and determines how the content will be applied.
171
+ - `after(content, as: content_type)`: Inserts `content` after the text. `content_type` is either `:text` or `:html` and determines how the content will be applied.
172
+ - `replace(content, as: content_type)`: Replaces the text node with `content`. `content_type` is either `:text` or `:html` and determines how the content will be applied.
157
173
 
158
174
  ## Benchmarks
159
175
 
160
- TBD
176
+ <details>
177
+ <pre>
178
+ ruby test/benchmark.rb
179
+ ruby test/benchmark.rb
180
+ Warming up --------------------------------------
181
+ sanitize-document-huge
182
+ 1.000 i/100ms
183
+ selma-document-huge 1.000 i/100ms
184
+ Calculating -------------------------------------
185
+ sanitize-document-huge
186
+ 0.257 (± 0.0%) i/s - 2.000 in 7.783398s
187
+ selma-document-huge 4.602 (± 0.0%) i/s - 23.000 in 5.002870s
188
+ Warming up --------------------------------------
189
+ sanitize-document-medium
190
+ 2.000 i/100ms
191
+ selma-document-medium
192
+ 22.000 i/100ms
193
+ Calculating -------------------------------------
194
+ sanitize-document-medium
195
+ 28.676 (± 3.5%) i/s - 144.000 in 5.024669s
196
+ selma-document-medium
197
+ 121.500 (±22.2%) i/s - 594.000 in 5.135410s
198
+ Warming up --------------------------------------
199
+ sanitize-document-small
200
+ 10.000 i/100ms
201
+ selma-document-small 20.000 i/100ms
202
+ Calculating -------------------------------------
203
+ sanitize-document-small
204
+ 107.280 (± 0.9%) i/s - 540.000 in 5.033850s
205
+ selma-document-small 118.867 (±31.1%) i/s - 540.000 in 5.080726s
206
+ </pre>
207
+ </details>
161
208
 
162
209
  ## Contributing
163
210
 
@@ -36,6 +36,48 @@ impl SelmaHTMLElement {
36
36
  }
37
37
  }
38
38
 
39
+ fn set_tag_name(&self, name: String) -> Result<(), Error> {
40
+ let mut binding = self.0.borrow_mut();
41
+
42
+ if let Ok(element) = binding.element.get_mut() {
43
+ match element.set_tag_name(&name) {
44
+ Ok(_) => Ok(()),
45
+ Err(err) => Err(Error::new(exception::runtime_error(), format!("{err:?}"))),
46
+ }
47
+ } else {
48
+ Err(Error::new(
49
+ exception::runtime_error(),
50
+ "`set_tag_name` is not available",
51
+ ))
52
+ }
53
+ }
54
+
55
+ fn is_self_closing(&self) -> Result<bool, Error> {
56
+ let binding = self.0.borrow();
57
+
58
+ if let Ok(e) = binding.element.get() {
59
+ Ok(e.is_self_closing())
60
+ } else {
61
+ Err(Error::new(
62
+ exception::runtime_error(),
63
+ "`is_self_closing` is not available",
64
+ ))
65
+ }
66
+ }
67
+
68
+ fn has_attribute(&self, attr: String) -> Result<bool, Error> {
69
+ let binding = self.0.borrow();
70
+
71
+ if let Ok(e) = binding.element.get() {
72
+ Ok(e.has_attribute(&attr))
73
+ } else {
74
+ Err(Error::new(
75
+ exception::runtime_error(),
76
+ "`is_self_closing` is not available",
77
+ ))
78
+ }
79
+ }
80
+
39
81
  fn get_attribute(&self, attr: String) -> Option<String> {
40
82
  let binding = self.0.borrow();
41
83
  let element = binding.element.get();
@@ -132,6 +174,20 @@ impl SelmaHTMLElement {
132
174
  Ok(())
133
175
  }
134
176
 
177
+ fn prepend(&self, args: &[Value]) -> Result<(), Error> {
178
+ let mut binding = self.0.borrow_mut();
179
+ let element = binding.element.get_mut().unwrap();
180
+
181
+ let (text_str, content_type) = match crate::scan_text_args(args) {
182
+ Ok((text_str, content_type)) => (text_str, content_type),
183
+ Err(err) => return Err(err),
184
+ };
185
+
186
+ element.prepend(&text_str, content_type);
187
+
188
+ Ok(())
189
+ }
190
+
135
191
  fn append(&self, args: &[Value]) -> Result<(), Error> {
136
192
  let mut binding = self.0.borrow_mut();
137
193
  let element = binding.element.get_mut().unwrap();
@@ -167,17 +223,27 @@ pub fn init(c_html: RClass) -> Result<(), Error> {
167
223
  .expect("cannot find class Selma::HTML::Element");
168
224
 
169
225
  c_element.define_method("tag_name", method!(SelmaHTMLElement::tag_name, 0))?;
226
+ c_element.define_method("tag_name=", method!(SelmaHTMLElement::set_tag_name, 1))?;
227
+ c_element.define_method(
228
+ "self_closing?",
229
+ method!(SelmaHTMLElement::is_self_closing, 0),
230
+ )?;
170
231
  c_element.define_method("[]", method!(SelmaHTMLElement::get_attribute, 1))?;
171
232
  c_element.define_method("[]=", method!(SelmaHTMLElement::set_attribute, 2))?;
172
233
  c_element.define_method(
173
234
  "remove_attribute",
174
235
  method!(SelmaHTMLElement::remove_attribute, 1),
175
236
  )?;
237
+ c_element.define_method(
238
+ "has_attribute?",
239
+ method!(SelmaHTMLElement::has_attribute, 1),
240
+ )?;
176
241
  c_element.define_method("attributes", method!(SelmaHTMLElement::get_attributes, 0))?;
177
242
  c_element.define_method("ancestors", method!(SelmaHTMLElement::get_ancestors, 0))?;
178
243
 
179
244
  c_element.define_method("before", method!(SelmaHTMLElement::before, -1))?;
180
245
  c_element.define_method("after", method!(SelmaHTMLElement::after, -1))?;
246
+ c_element.define_method("prepend", method!(SelmaHTMLElement::prepend, -1))?;
181
247
  c_element.define_method("append", method!(SelmaHTMLElement::append, -1))?;
182
248
  c_element.define_method(
183
249
  "set_inner_content",
@@ -54,6 +54,34 @@ impl SelmaHTMLTextChunk {
54
54
  }
55
55
  }
56
56
 
57
+ fn before(&self, args: &[Value]) -> Result<(), Error> {
58
+ let mut binding = self.0.borrow_mut();
59
+ let text_chunk = binding.text_chunk.get_mut().unwrap();
60
+
61
+ let (text_str, content_type) = match crate::scan_text_args(args) {
62
+ Ok((text_str, content_type)) => (text_str, content_type),
63
+ Err(err) => return Err(err),
64
+ };
65
+
66
+ text_chunk.before(&text_str, content_type);
67
+
68
+ Ok(())
69
+ }
70
+
71
+ fn after(&self, args: &[Value]) -> Result<(), Error> {
72
+ let mut binding = self.0.borrow_mut();
73
+ let text_chunk = binding.text_chunk.get_mut().unwrap();
74
+
75
+ let (text_str, content_type) = match crate::scan_text_args(args) {
76
+ Ok((text_str, content_type)) => (text_str, content_type),
77
+ Err(err) => return Err(err),
78
+ };
79
+
80
+ text_chunk.after(&text_str, content_type);
81
+
82
+ Ok(())
83
+ }
84
+
57
85
  fn replace(&self, args: &[Value]) -> Result<(), Error> {
58
86
  let mut binding = self.0.borrow_mut();
59
87
  let text_chunk = binding.text_chunk.get_mut().unwrap();
@@ -77,6 +105,8 @@ pub fn init(c_html: RClass) -> Result<(), Error> {
77
105
  c_text_chunk.define_method("to_s", method!(SelmaHTMLTextChunk::to_s, 0))?;
78
106
  c_text_chunk.define_method("content", method!(SelmaHTMLTextChunk::to_s, 0))?;
79
107
  c_text_chunk.define_method("text_type", method!(SelmaHTMLTextChunk::text_type, 0))?;
108
+ c_text_chunk.define_method("before", method!(SelmaHTMLTextChunk::before, -1))?;
109
+ c_text_chunk.define_method("after", method!(SelmaHTMLTextChunk::after, -1))?;
80
110
  c_text_chunk.define_method("replace", method!(SelmaHTMLTextChunk::replace, -1))?;
81
111
 
82
112
  Ok(())
@@ -1,4 +1,4 @@
1
- use std::{borrow::BorrowMut, cell::RefMut, collections::HashMap};
1
+ use std::{borrow::BorrowMut, collections::HashMap};
2
2
 
3
3
  use lol_html::{
4
4
  errors::AttributeNameError,
@@ -14,6 +14,18 @@ struct ElementSanitizer {
14
14
  protocol_sanitizers: HashMap<String, Vec<String>>,
15
15
  }
16
16
 
17
+ impl Default for ElementSanitizer {
18
+ fn default() -> Self {
19
+ ElementSanitizer {
20
+ allowed_attrs: vec![],
21
+ allowed_classes: vec![],
22
+ required_attrs: vec![],
23
+
24
+ protocol_sanitizers: HashMap::new(),
25
+ }
26
+ }
27
+ }
28
+
17
29
  #[derive(Clone, Debug)]
18
30
  pub struct Sanitizer {
19
31
  flags: [u8; crate::tags::Tag::TAG_COUNT],
@@ -49,13 +61,7 @@ impl SelmaSanitizer {
49
61
 
50
62
  let mut element_sanitizers = HashMap::new();
51
63
  crate::tags::Tag::html_tags().iter().for_each(|html_tag| {
52
- let es = ElementSanitizer {
53
- allowed_attrs: vec![],
54
- allowed_classes: vec![],
55
- required_attrs: vec![],
56
-
57
- protocol_sanitizers: HashMap::new(),
58
- };
64
+ let es = ElementSanitizer::default();
59
65
  element_sanitizers.insert(
60
66
  crate::tags::Tag::element_name_from_enum(html_tag).to_string(),
61
67
  es,
@@ -169,7 +175,8 @@ impl SelmaSanitizer {
169
175
  let allowed_attrs = &mut binding.allowed_attrs;
170
176
  Self::set_allowed(allowed_attrs, &attr_name, allow);
171
177
  } else {
172
- let element_sanitizer = Self::get_mut_element_sanitizer(&mut binding, &element_name);
178
+ let element_sanitizers = &mut binding.element_sanitizers;
179
+ let element_sanitizer = Self::get_element_sanitizer(element_sanitizers, &element_name);
173
180
 
174
181
  element_sanitizer.allowed_attrs.push(attr_name);
175
182
  }
@@ -183,7 +190,8 @@ impl SelmaSanitizer {
183
190
  let allowed_classes = &mut binding.allowed_classes;
184
191
  Self::set_allowed(allowed_classes, &class_name, allow);
185
192
  } else {
186
- let element_sanitizer = Self::get_mut_element_sanitizer(&mut binding, &element_name);
193
+ let element_sanitizers = &mut binding.element_sanitizers;
194
+ let element_sanitizer = Self::get_element_sanitizer(element_sanitizers, &element_name);
187
195
 
188
196
  let allowed_classes = element_sanitizer.allowed_classes.borrow_mut();
189
197
  Self::set_allowed(allowed_classes, &class_name, allow)
@@ -194,9 +202,10 @@ impl SelmaSanitizer {
194
202
  fn set_allowed_protocols(&self, element_name: String, attr_name: String, allow_list: RArray) {
195
203
  let mut binding = self.0.borrow_mut();
196
204
 
197
- let element_sanitizer = Self::get_mut_element_sanitizer(&mut binding, &element_name);
205
+ let element_sanitizers = &mut binding.element_sanitizers;
206
+ let element_sanitizer = Self::get_element_sanitizer(element_sanitizers, &element_name);
198
207
 
199
- let protocol_sanitizers = element_sanitizer.protocol_sanitizers.borrow_mut();
208
+ let protocol_sanitizers = &mut element_sanitizer.protocol_sanitizers.borrow_mut();
200
209
 
201
210
  for opt_allowed_protocol in allow_list.each() {
202
211
  let allowed_protocol = opt_allowed_protocol.unwrap();
@@ -237,9 +246,15 @@ impl SelmaSanitizer {
237
246
  }
238
247
 
239
248
  pub fn sanitize_attributes(&self, element: &mut Element) -> Result<(), AttributeNameError> {
240
- let binding = self.0.borrow_mut();
241
249
  let tag = crate::tags::Tag::tag_from_element(element);
242
- let element_sanitizer = Self::get_element_sanitizer(&binding, &element.tag_name());
250
+ let tag_name = &element.tag_name();
251
+ let element_sanitizer = {
252
+ let mut binding = self.0.borrow_mut();
253
+ let element_sanitizers = &mut binding.element_sanitizers;
254
+ Self::get_element_sanitizer(element_sanitizers, tag_name).clone()
255
+ };
256
+
257
+ let binding = self.0.borrow();
243
258
 
244
259
  // FIXME: This is a hack to get around the fact that we can't borrow
245
260
  let attribute_map: HashMap<String, String> = element
@@ -265,7 +280,7 @@ impl SelmaSanitizer {
265
280
  let should_keep_attrubute = match Self::should_keep_attribute(
266
281
  &binding,
267
282
  element,
268
- element_sanitizer,
283
+ &element_sanitizer,
269
284
  attr_name,
270
285
  &unescaped_attr_val,
271
286
  ) {
@@ -323,7 +338,7 @@ impl SelmaSanitizer {
323
338
  }
324
339
 
325
340
  fn should_keep_attribute(
326
- binding: &RefMut<Sanitizer>,
341
+ binding: &Sanitizer,
327
342
  element: &mut Element,
328
343
  element_sanitizer: &ElementSanitizer,
329
344
  attr_name: &String,
@@ -410,7 +425,7 @@ impl SelmaSanitizer {
410
425
  }
411
426
 
412
427
  fn sanitize_class_attribute(
413
- binding: &RefMut<Sanitizer>,
428
+ binding: &Sanitizer,
414
429
  element: &mut Element,
415
430
  element_sanitizer: &ElementSanitizer,
416
431
  attr_name: &str,
@@ -528,17 +543,12 @@ impl SelmaSanitizer {
528
543
  }
529
544
 
530
545
  fn get_element_sanitizer<'a>(
531
- binding: &'a RefMut<Sanitizer>,
532
- element_name: &str,
533
- ) -> &'a ElementSanitizer {
534
- binding.element_sanitizers.get(element_name).unwrap()
535
- }
536
-
537
- fn get_mut_element_sanitizer<'a>(
538
- binding: &'a mut Sanitizer,
546
+ element_sanitizers: &'a mut HashMap<String, ElementSanitizer>,
539
547
  element_name: &str,
540
548
  ) -> &'a mut ElementSanitizer {
541
- binding.element_sanitizers.get_mut(element_name).unwrap()
549
+ element_sanitizers
550
+ .entry(element_name.to_string())
551
+ .or_insert_with(ElementSanitizer::default)
542
552
  }
543
553
  }
544
554
 
Binary file
data/lib/selma/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Selma
4
- VERSION = "0.0.4"
4
+ VERSION = "0.0.6"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: selma
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.6
5
5
  platform: arm64-darwin
6
6
  authors:
7
7
  - Garen J. Torikian
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-12-26 00:00:00.000000000 Z
11
+ date: 2022-12-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys