selma 0.0.4-x86_64-linux → 0.0.6-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e3b9b090cd0f742bd24d8bd1d065b45d0b142b690141cce378258c8bd231501d
4
- data.tar.gz: '08eaf21b2ab2161bb5daadf451175cb3e37a4c4ab9212c294450a73291f06460'
3
+ metadata.gz: be9face4692cbc6653b2085a056679aab5eca490517f3fa17e4f53ac5c9b4028
4
+ data.tar.gz: 47fa7091498f304b8aba324637218b7ca0af09e8370084337f1e588be677ac3e
5
5
  SHA512:
6
- metadata.gz: 8be8197fb86053ae0aa6e957a2694812be266a5d3903d15c59ede3768811b11639002d92f21217428fb89e023022544cfcd930191dec20b3d7d12934d6337a0f
7
- data.tar.gz: 9e059e1e25bc52ad38ea64dd0d8b9654caaf4ce67aaa33bd6e1d1187ac534d92dc4c9631bff7457d76f7d685100d93b69e18e335dc4f6b69d8dcc053c36e0179
6
+ metadata.gz: 9e57f2b3f8a82aa92cbc68c17a465226631af6bc1c89150a65e58a0b87d37698418799f812083de22590fdf11a18e1991d847f6273c7d9d03c66fd4139c10cc7
7
+ data.tar.gz: 66f079b74f387266446c5293b17dd1a534de422c1b7a7149ff562a4a9a05fefd7857216d2e6a782d92eecb7595ec5e1ba7b2e94c2a56350caef63c2d768a3901
data/README.md CHANGED
@@ -24,23 +24,27 @@ Or install it yourself as:
24
24
 
25
25
  ## Usage
26
26
 
27
- Selma can perform two different actions:
27
+ Selma can perform two different actions, either independently or together:
28
28
 
29
29
  - Sanitize HTML, through a [Sanitize](https://github.com/rgrove/sanitize)-like allowlist syntax; and
30
- - Select HTML using CSS rules, and manipulate elements and text
30
+ - Select HTML using CSS rules, and manipulate elements and text nodes along the way.
31
31
 
32
- The basic API for Selma looks like this:
32
+ It does this through two kwargs: `sanitizer` and `handlers`. The basic API for Selma looks like this:
33
33
 
34
34
  ```ruby
35
- rewriter = Selma::Rewriter.new(sanitizer: sanitizer_config, handlers: [MatchAttribute.new, TextRewrite.new])
35
+ sanitizer_config = {
36
+ elements: ["b", "em", "i", "strong", "u"],
37
+ }
38
+ sanitizer = Selma::Sanitizer.new(sanitizer_config)
39
+ rewriter = Selma::Rewriter.new(sanitizer: sanitizer, handlers: [MatchElementRewrite.new, MatchTextRewrite.new])
36
40
  rewriter(html)
37
41
  ```
38
42
 
39
- Let's take a look at each part individually.
43
+ Here's a look at each individual part.
40
44
 
41
45
  ### Sanitization config
42
46
 
43
- Selma sanitizes by default. That is, even if the `sanitizer` kwarg is not passed in, sanitization occurs. If you want to disable HTML sanitization (for some reason), pass `nil`:
47
+ Selma sanitizes by default. That is, even if the `sanitizer` kwarg is not passed in, sanitization occurs. If you truly want to disable HTML sanitization (for some reason), pass `nil`:
44
48
 
45
49
  ```ruby
46
50
  Selma::Rewriter.new(sanitizer: nil) # dangerous and ill-advised
@@ -87,22 +91,22 @@ whitespace_elements: ["blockquote", "h1", "h2", "h3", "h4", "h5", "h6", ]
87
91
 
88
92
  ### Defining handlers
89
93
 
90
- The real power in Selma comes in its use of handlers. A handler is simply an object with various methods:
94
+ The real power in Selma comes in its use of handlers. A handler is simply an object with various methods defined:
91
95
 
92
96
  - `selector`, a method which MUST return an instance of `Selma::Selector` which defines the CSS classes to match
93
97
  - `handle_element`, a method that's called on each matched element
94
- - `handle_text_chunk`, a method that's called on each matched text node; this MUST return a string
98
+ - `handle_text_chunk`, a method that's called on each matched text node
95
99
 
96
100
  Here's an example which rewrites the `href` attribute on `a` and the `src` attribute on `img` to be `https` rather than `http`.
97
101
 
98
102
  ```ruby
99
103
  class MatchAttribute
100
- SELECTOR = Selma::Selector(match_element: "a, img")
104
+ SELECTOR = Selma::Selector(match_element: %(a[href^="http:"], img[src^="http:"]))
101
105
 
102
106
  def handle_element(element)
103
- if element.tag_name == "a" && element["href"] =~ /^http:/
107
+ if element.tag_name == "a"
104
108
  element["href"] = rename_http(element["href"])
105
- elsif element.tag_name == "img" && element["src"] =~ /^http:/
109
+ elsif element.tag_name == "img"
106
110
  element["src"] = rename_http(element["src"])
107
111
  end
108
112
  end
@@ -118,10 +122,10 @@ rewriter = Selma::Rewriter.new(handlers: [MatchAttribute.new])
118
122
  The `Selma::Selector` object has three possible kwargs:
119
123
 
120
124
  - `match_element`: any element which matches this CSS rule will be passed on to `handle_element`
121
- - `match_text_within`: any element which matches this CSS rule will be passed on to `handle_text_chunk`
125
+ - `match_text_within`: any text_chunk which matches this CSS rule will be passed on to `handle_text_chunk`
122
126
  - `ignore_text_within`: this is an array of element names whose text contents will be ignored
123
127
 
124
- You've seen an example of `match_element`; here's one for `match_text` which changes strings in various elements which are _not_ `pre` or `code`:
128
+ Here's an example for `handle_text_chunk` which changes strings in various elements which are _not_ `pre` or `code`:
125
129
 
126
130
  ```ruby
127
131
 
@@ -144,20 +148,63 @@ rewriter = Selma::Rewriter.new(handlers: [MatchText.new])
144
148
 
145
149
  The `element` argument in `handle_element` has the following methods:
146
150
 
147
- - `tag_name`: The element's name
148
- - `[]`: get an attribute
149
- - `[]=`: set an attribute
150
- - `remove_attribute`: remove an attribute
151
- - `attributes`: list all the attributes
152
- - `ancestors`: list all the ancestors
153
- - `append(content, as: content_type)`: appends `content` to the element's inner content, i.e. inserts content right before the element's end tag. `content_type` is either `:text` or `:html` and determines how the content will be applied.
151
+ - `tag_name`: Gets the element's name
152
+ - `tag_name=`: Sets the element's name
153
+ - `self_closing?`: A bool which identifies whether or not the element is self-closing
154
+ - `[]`: Get an attribute
155
+ - `[]=`: Set an attribute
156
+ - `remove_attribute`: Remove an attribute
157
+ - `has_attribute?`: A bool which identifies whether or not the element has an attribute
158
+ - `attributes`: List all the attributes
159
+ - `ancestors`: List all of an element's ancestors as an array of strings
154
160
  - `before(content, as: content_type)`: Inserts `content` before the element. `content_type` is either `:text` or `:html` and determines how the content will be applied.
155
161
  - `after(content, as: content_type)`: Inserts `content` after the element. `content_type` is either `:text` or `:html` and determines how the content will be applied.
156
- - `set_inner_content`: replaces inner content of the element with `content`. `content_type` is either `:text` or `:html` and determines how the content will be applied.
162
+ - `prepend(content, as: content_type)`: prepends `content` to the element's inner content, i.e. inserts content right after the element's start tag. `content_type` is either `:text` or `:html` and determines how the content will be applied.
163
+ - `append(content, as: content_type)`: appends `content` to the element's inner content, i.e. inserts content right before the element's end tag. `content_type` is either `:text` or `:html` and determines how the content will be applied.
164
+ - `set_inner_content`: Replaces inner content of the element with `content`. `content_type` is either `:text` or `:html` and determines how the content will be applied.
165
+
166
+ #### `text_chunk` methods
167
+
168
+ - `to_s` / `.content`: Gets the text node's content
169
+ - `text_type`: identifies the type of text in the text node
170
+ - `before(content, as: content_type)`: Inserts `content` before the text. `content_type` is either `:text` or `:html` and determines how the content will be applied.
171
+ - `after(content, as: content_type)`: Inserts `content` after the text. `content_type` is either `:text` or `:html` and determines how the content will be applied.
172
+ - `replace(content, as: content_type)`: Replaces the text node with `content`. `content_type` is either `:text` or `:html` and determines how the content will be applied.
157
173
 
158
174
  ## Benchmarks
159
175
 
160
- TBD
176
+ <details>
177
+ <pre>
178
+ ruby test/benchmark.rb
179
+ ruby test/benchmark.rb
180
+ Warming up --------------------------------------
181
+ sanitize-document-huge
182
+ 1.000 i/100ms
183
+ selma-document-huge 1.000 i/100ms
184
+ Calculating -------------------------------------
185
+ sanitize-document-huge
186
+ 0.257 (± 0.0%) i/s - 2.000 in 7.783398s
187
+ selma-document-huge 4.602 (± 0.0%) i/s - 23.000 in 5.002870s
188
+ Warming up --------------------------------------
189
+ sanitize-document-medium
190
+ 2.000 i/100ms
191
+ selma-document-medium
192
+ 22.000 i/100ms
193
+ Calculating -------------------------------------
194
+ sanitize-document-medium
195
+ 28.676 (± 3.5%) i/s - 144.000 in 5.024669s
196
+ selma-document-medium
197
+ 121.500 (±22.2%) i/s - 594.000 in 5.135410s
198
+ Warming up --------------------------------------
199
+ sanitize-document-small
200
+ 10.000 i/100ms
201
+ selma-document-small 20.000 i/100ms
202
+ Calculating -------------------------------------
203
+ sanitize-document-small
204
+ 107.280 (± 0.9%) i/s - 540.000 in 5.033850s
205
+ selma-document-small 118.867 (±31.1%) i/s - 540.000 in 5.080726s
206
+ </pre>
207
+ </details>
161
208
 
162
209
  ## Contributing
163
210
 
@@ -36,6 +36,48 @@ impl SelmaHTMLElement {
36
36
  }
37
37
  }
38
38
 
39
+ fn set_tag_name(&self, name: String) -> Result<(), Error> {
40
+ let mut binding = self.0.borrow_mut();
41
+
42
+ if let Ok(element) = binding.element.get_mut() {
43
+ match element.set_tag_name(&name) {
44
+ Ok(_) => Ok(()),
45
+ Err(err) => Err(Error::new(exception::runtime_error(), format!("{err:?}"))),
46
+ }
47
+ } else {
48
+ Err(Error::new(
49
+ exception::runtime_error(),
50
+ "`set_tag_name` is not available",
51
+ ))
52
+ }
53
+ }
54
+
55
+ fn is_self_closing(&self) -> Result<bool, Error> {
56
+ let binding = self.0.borrow();
57
+
58
+ if let Ok(e) = binding.element.get() {
59
+ Ok(e.is_self_closing())
60
+ } else {
61
+ Err(Error::new(
62
+ exception::runtime_error(),
63
+ "`is_self_closing` is not available",
64
+ ))
65
+ }
66
+ }
67
+
68
+ fn has_attribute(&self, attr: String) -> Result<bool, Error> {
69
+ let binding = self.0.borrow();
70
+
71
+ if let Ok(e) = binding.element.get() {
72
+ Ok(e.has_attribute(&attr))
73
+ } else {
74
+ Err(Error::new(
75
+ exception::runtime_error(),
76
+ "`is_self_closing` is not available",
77
+ ))
78
+ }
79
+ }
80
+
39
81
  fn get_attribute(&self, attr: String) -> Option<String> {
40
82
  let binding = self.0.borrow();
41
83
  let element = binding.element.get();
@@ -132,6 +174,20 @@ impl SelmaHTMLElement {
132
174
  Ok(())
133
175
  }
134
176
 
177
+ fn prepend(&self, args: &[Value]) -> Result<(), Error> {
178
+ let mut binding = self.0.borrow_mut();
179
+ let element = binding.element.get_mut().unwrap();
180
+
181
+ let (text_str, content_type) = match crate::scan_text_args(args) {
182
+ Ok((text_str, content_type)) => (text_str, content_type),
183
+ Err(err) => return Err(err),
184
+ };
185
+
186
+ element.prepend(&text_str, content_type);
187
+
188
+ Ok(())
189
+ }
190
+
135
191
  fn append(&self, args: &[Value]) -> Result<(), Error> {
136
192
  let mut binding = self.0.borrow_mut();
137
193
  let element = binding.element.get_mut().unwrap();
@@ -167,17 +223,27 @@ pub fn init(c_html: RClass) -> Result<(), Error> {
167
223
  .expect("cannot find class Selma::HTML::Element");
168
224
 
169
225
  c_element.define_method("tag_name", method!(SelmaHTMLElement::tag_name, 0))?;
226
+ c_element.define_method("tag_name=", method!(SelmaHTMLElement::set_tag_name, 1))?;
227
+ c_element.define_method(
228
+ "self_closing?",
229
+ method!(SelmaHTMLElement::is_self_closing, 0),
230
+ )?;
170
231
  c_element.define_method("[]", method!(SelmaHTMLElement::get_attribute, 1))?;
171
232
  c_element.define_method("[]=", method!(SelmaHTMLElement::set_attribute, 2))?;
172
233
  c_element.define_method(
173
234
  "remove_attribute",
174
235
  method!(SelmaHTMLElement::remove_attribute, 1),
175
236
  )?;
237
+ c_element.define_method(
238
+ "has_attribute?",
239
+ method!(SelmaHTMLElement::has_attribute, 1),
240
+ )?;
176
241
  c_element.define_method("attributes", method!(SelmaHTMLElement::get_attributes, 0))?;
177
242
  c_element.define_method("ancestors", method!(SelmaHTMLElement::get_ancestors, 0))?;
178
243
 
179
244
  c_element.define_method("before", method!(SelmaHTMLElement::before, -1))?;
180
245
  c_element.define_method("after", method!(SelmaHTMLElement::after, -1))?;
246
+ c_element.define_method("prepend", method!(SelmaHTMLElement::prepend, -1))?;
181
247
  c_element.define_method("append", method!(SelmaHTMLElement::append, -1))?;
182
248
  c_element.define_method(
183
249
  "set_inner_content",
@@ -54,6 +54,34 @@ impl SelmaHTMLTextChunk {
54
54
  }
55
55
  }
56
56
 
57
+ fn before(&self, args: &[Value]) -> Result<(), Error> {
58
+ let mut binding = self.0.borrow_mut();
59
+ let text_chunk = binding.text_chunk.get_mut().unwrap();
60
+
61
+ let (text_str, content_type) = match crate::scan_text_args(args) {
62
+ Ok((text_str, content_type)) => (text_str, content_type),
63
+ Err(err) => return Err(err),
64
+ };
65
+
66
+ text_chunk.before(&text_str, content_type);
67
+
68
+ Ok(())
69
+ }
70
+
71
+ fn after(&self, args: &[Value]) -> Result<(), Error> {
72
+ let mut binding = self.0.borrow_mut();
73
+ let text_chunk = binding.text_chunk.get_mut().unwrap();
74
+
75
+ let (text_str, content_type) = match crate::scan_text_args(args) {
76
+ Ok((text_str, content_type)) => (text_str, content_type),
77
+ Err(err) => return Err(err),
78
+ };
79
+
80
+ text_chunk.after(&text_str, content_type);
81
+
82
+ Ok(())
83
+ }
84
+
57
85
  fn replace(&self, args: &[Value]) -> Result<(), Error> {
58
86
  let mut binding = self.0.borrow_mut();
59
87
  let text_chunk = binding.text_chunk.get_mut().unwrap();
@@ -77,6 +105,8 @@ pub fn init(c_html: RClass) -> Result<(), Error> {
77
105
  c_text_chunk.define_method("to_s", method!(SelmaHTMLTextChunk::to_s, 0))?;
78
106
  c_text_chunk.define_method("content", method!(SelmaHTMLTextChunk::to_s, 0))?;
79
107
  c_text_chunk.define_method("text_type", method!(SelmaHTMLTextChunk::text_type, 0))?;
108
+ c_text_chunk.define_method("before", method!(SelmaHTMLTextChunk::before, -1))?;
109
+ c_text_chunk.define_method("after", method!(SelmaHTMLTextChunk::after, -1))?;
80
110
  c_text_chunk.define_method("replace", method!(SelmaHTMLTextChunk::replace, -1))?;
81
111
 
82
112
  Ok(())
@@ -1,4 +1,4 @@
1
- use std::{borrow::BorrowMut, cell::RefMut, collections::HashMap};
1
+ use std::{borrow::BorrowMut, collections::HashMap};
2
2
 
3
3
  use lol_html::{
4
4
  errors::AttributeNameError,
@@ -14,6 +14,18 @@ struct ElementSanitizer {
14
14
  protocol_sanitizers: HashMap<String, Vec<String>>,
15
15
  }
16
16
 
17
+ impl Default for ElementSanitizer {
18
+ fn default() -> Self {
19
+ ElementSanitizer {
20
+ allowed_attrs: vec![],
21
+ allowed_classes: vec![],
22
+ required_attrs: vec![],
23
+
24
+ protocol_sanitizers: HashMap::new(),
25
+ }
26
+ }
27
+ }
28
+
17
29
  #[derive(Clone, Debug)]
18
30
  pub struct Sanitizer {
19
31
  flags: [u8; crate::tags::Tag::TAG_COUNT],
@@ -49,13 +61,7 @@ impl SelmaSanitizer {
49
61
 
50
62
  let mut element_sanitizers = HashMap::new();
51
63
  crate::tags::Tag::html_tags().iter().for_each(|html_tag| {
52
- let es = ElementSanitizer {
53
- allowed_attrs: vec![],
54
- allowed_classes: vec![],
55
- required_attrs: vec![],
56
-
57
- protocol_sanitizers: HashMap::new(),
58
- };
64
+ let es = ElementSanitizer::default();
59
65
  element_sanitizers.insert(
60
66
  crate::tags::Tag::element_name_from_enum(html_tag).to_string(),
61
67
  es,
@@ -169,7 +175,8 @@ impl SelmaSanitizer {
169
175
  let allowed_attrs = &mut binding.allowed_attrs;
170
176
  Self::set_allowed(allowed_attrs, &attr_name, allow);
171
177
  } else {
172
- let element_sanitizer = Self::get_mut_element_sanitizer(&mut binding, &element_name);
178
+ let element_sanitizers = &mut binding.element_sanitizers;
179
+ let element_sanitizer = Self::get_element_sanitizer(element_sanitizers, &element_name);
173
180
 
174
181
  element_sanitizer.allowed_attrs.push(attr_name);
175
182
  }
@@ -183,7 +190,8 @@ impl SelmaSanitizer {
183
190
  let allowed_classes = &mut binding.allowed_classes;
184
191
  Self::set_allowed(allowed_classes, &class_name, allow);
185
192
  } else {
186
- let element_sanitizer = Self::get_mut_element_sanitizer(&mut binding, &element_name);
193
+ let element_sanitizers = &mut binding.element_sanitizers;
194
+ let element_sanitizer = Self::get_element_sanitizer(element_sanitizers, &element_name);
187
195
 
188
196
  let allowed_classes = element_sanitizer.allowed_classes.borrow_mut();
189
197
  Self::set_allowed(allowed_classes, &class_name, allow)
@@ -194,9 +202,10 @@ impl SelmaSanitizer {
194
202
  fn set_allowed_protocols(&self, element_name: String, attr_name: String, allow_list: RArray) {
195
203
  let mut binding = self.0.borrow_mut();
196
204
 
197
- let element_sanitizer = Self::get_mut_element_sanitizer(&mut binding, &element_name);
205
+ let element_sanitizers = &mut binding.element_sanitizers;
206
+ let element_sanitizer = Self::get_element_sanitizer(element_sanitizers, &element_name);
198
207
 
199
- let protocol_sanitizers = element_sanitizer.protocol_sanitizers.borrow_mut();
208
+ let protocol_sanitizers = &mut element_sanitizer.protocol_sanitizers.borrow_mut();
200
209
 
201
210
  for opt_allowed_protocol in allow_list.each() {
202
211
  let allowed_protocol = opt_allowed_protocol.unwrap();
@@ -237,9 +246,15 @@ impl SelmaSanitizer {
237
246
  }
238
247
 
239
248
  pub fn sanitize_attributes(&self, element: &mut Element) -> Result<(), AttributeNameError> {
240
- let binding = self.0.borrow_mut();
241
249
  let tag = crate::tags::Tag::tag_from_element(element);
242
- let element_sanitizer = Self::get_element_sanitizer(&binding, &element.tag_name());
250
+ let tag_name = &element.tag_name();
251
+ let element_sanitizer = {
252
+ let mut binding = self.0.borrow_mut();
253
+ let element_sanitizers = &mut binding.element_sanitizers;
254
+ Self::get_element_sanitizer(element_sanitizers, tag_name).clone()
255
+ };
256
+
257
+ let binding = self.0.borrow();
243
258
 
244
259
  // FIXME: This is a hack to get around the fact that we can't borrow
245
260
  let attribute_map: HashMap<String, String> = element
@@ -265,7 +280,7 @@ impl SelmaSanitizer {
265
280
  let should_keep_attrubute = match Self::should_keep_attribute(
266
281
  &binding,
267
282
  element,
268
- element_sanitizer,
283
+ &element_sanitizer,
269
284
  attr_name,
270
285
  &unescaped_attr_val,
271
286
  ) {
@@ -323,7 +338,7 @@ impl SelmaSanitizer {
323
338
  }
324
339
 
325
340
  fn should_keep_attribute(
326
- binding: &RefMut<Sanitizer>,
341
+ binding: &Sanitizer,
327
342
  element: &mut Element,
328
343
  element_sanitizer: &ElementSanitizer,
329
344
  attr_name: &String,
@@ -410,7 +425,7 @@ impl SelmaSanitizer {
410
425
  }
411
426
 
412
427
  fn sanitize_class_attribute(
413
- binding: &RefMut<Sanitizer>,
428
+ binding: &Sanitizer,
414
429
  element: &mut Element,
415
430
  element_sanitizer: &ElementSanitizer,
416
431
  attr_name: &str,
@@ -528,17 +543,12 @@ impl SelmaSanitizer {
528
543
  }
529
544
 
530
545
  fn get_element_sanitizer<'a>(
531
- binding: &'a RefMut<Sanitizer>,
532
- element_name: &str,
533
- ) -> &'a ElementSanitizer {
534
- binding.element_sanitizers.get(element_name).unwrap()
535
- }
536
-
537
- fn get_mut_element_sanitizer<'a>(
538
- binding: &'a mut Sanitizer,
546
+ element_sanitizers: &'a mut HashMap<String, ElementSanitizer>,
539
547
  element_name: &str,
540
548
  ) -> &'a mut ElementSanitizer {
541
- binding.element_sanitizers.get_mut(element_name).unwrap()
549
+ element_sanitizers
550
+ .entry(element_name.to_string())
551
+ .or_insert_with(ElementSanitizer::default)
542
552
  }
543
553
  }
544
554
 
Binary file
data/lib/selma/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Selma
4
- VERSION = "0.0.4"
4
+ VERSION = "0.0.6"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: selma
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.6
5
5
  platform: x86_64-linux
6
6
  authors:
7
7
  - Garen J. Torikian
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-12-26 00:00:00.000000000 Z
11
+ date: 2022-12-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys