selma 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5f579c1ac9c0e6e24d6c5919f27072940adb9e31c5a0cc4429b6338cc24f6315
4
- data.tar.gz: 90b190ebde12c3a38fb682af3627d83fe3e704ad7b9bc301d70eb81da6d1c63c
3
+ metadata.gz: 39f002b01be55b66f1779d5487adb3b06be2985b628e505607447d94a07baa1c
4
+ data.tar.gz: ae2fe119b992e07894d4087612062833cccb559b4b0a81c0c821a9789c1d4ab8
5
5
  SHA512:
6
- metadata.gz: 4da6d3776c6c4ad04c73bdfa43e3cef4ce1ed71ada3d5653041b388c41cf4b1c77dc65671238ea28f36092d68bd4dcf3b9585ae35f7ea8f280c68a0db6b93fed
7
- data.tar.gz: e29bb6f28f5fb9123946b97948aef0e0857938b4a5869fce451cba3f1fdb95af1033908418f22ad454f5feb64295db40cc8df63fb183f6a628019bbd3618547e
6
+ metadata.gz: 585fa793b4f6bb073bb5d88fce4d5b2e50d040a92f381211e0342419a2bc03b0c84bf47f27a68dd94b5b9b25eb7ec90e80a4e5d6a17f9031a71f1378ae87aa8b
7
+ data.tar.gz: 3b1185963681b4710af5df45daf7e63be04c3f917313218d251b51aa820303d60dc5c8af51d9708d2380de90fc51d9951b5004f1a4cf82a371090928b3d849bb
data/README.md CHANGED
@@ -56,6 +56,10 @@ allow_comments: false,
56
56
  # "<!DOCTYPE html>" when sanitizing a document.
57
57
  allow_doctype: false,
58
58
 
59
+ # HTML elements to allow. By default, no elements are allowed (which means
60
+ # that all HTML will be stripped).
61
+ elements: ["a", "b", "img", ],
62
+
59
63
  # HTML attributes to allow in specific elements. The key is the name of the element,
60
64
  # and the value is an array of allowed attributes. By default, no attributes
61
65
  # are allowed.
@@ -64,14 +68,10 @@ attributes: {
64
68
  "img" => ["src"],
65
69
  },
66
70
 
67
- # HTML elements to allow. By default, no elements are allowed (which means
68
- # that all HTML will be stripped).
69
- elements: ["a", "b", "img", ],
70
-
71
71
  # URL handling protocols to allow in specific attributes. By default, no
72
72
  # protocols are allowed. Use :relative in place of a protocol if you want
73
73
  # to allow relative URLs sans protocol.
74
- protocols: {
74
+ protocols: {
75
75
  "a" => { "href" => ["http", "https", "mailto", :relative] },
76
76
  "img" => { "href" => ["http", "https"] },
77
77
  },
@@ -91,7 +91,7 @@ The real power in Selma comes in its use of handlers. A handler is simply an obj
91
91
 
92
92
  - `selector`, a method which MUST return an instance of `Selma::Selector` which defines the CSS classes to match
93
93
  - `handle_element`, a method that's called on each matched element
94
- - `handle_text`, a method that's called on each matched text node; this MUST return a string
94
+ - `handle_text_chunk`, a method that's called on each matched text node; this MUST return a string
95
95
 
96
96
  Here's an example which rewrites the `href` attribute on `a` and the `src` attribute on `img` to be `https` rather than `http`.
97
97
 
@@ -118,7 +118,7 @@ rewriter = Selma::Rewriter.new(handlers: [MatchAttribute.new])
118
118
  The `Selma::Selector` object has three possible kwargs:
119
119
 
120
120
  - `match_element`: any element which matches this CSS rule will be passed on to `handle_element`
121
- - `match_text_within`: any element which matches this CSS rule will be passed on to `handle_text`
121
+ - `match_text_within`: any element which matches this CSS rule will be passed on to `handle_text_chunk`
122
122
  - `ignore_text_within`: this is an array of element names whose text contents will be ignored
123
123
 
124
124
  You've seen an example of `match_element`; here's one for `match_text_within`, which changes strings in various elements that are _not_ `pre` or `code`:
@@ -132,7 +132,7 @@ class MatchText
132
132
  SELECTOR
133
133
  end
134
134
 
135
- def handle_text(text)
135
+ def handle_text_chunk(text)
136
136
  string.sub(/@.+/, "<a href=\"www.yetto.app/#{Regexp.last_match}\">")
137
137
  end
138
138
  end
@@ -150,8 +150,9 @@ The `element` argument in `handle_element` has the following methods:
150
150
  - `remove_attribute`: remove an attribute
151
151
  - `attributes`: list all the attributes
152
152
  - `ancestors`: list all the ancestors
153
- - `append(content, content_type)`: appends `content` to the element's inner content, i.e. inserts content right before the element's end tag. `content_type` is either `:text` or `:html` and determines how the content will be applied.
154
- - `wrap(start_text, end_text, content_type)`: adds `start_text` before an element and `end_text` after an element. `content_type` is either `:text` or `:html` and determines how the content will be applied.
153
+ - `append(content, as: content_type)`: appends `content` to the element's inner content, i.e. inserts content right before the element's end tag. `content_type` is either `:text` or `:html` and determines how the content will be applied.
154
+ - `before(content, as: content_type)`: Inserts `content` before the element. `content_type` is either `:text` or `:html` and determines how the content will be applied.
155
+ - `after(content, as: content_type)`: Inserts `content` after the element. `content_type` is either `:text` or `:html` and determines how the content will be applied.
155
156
  - `set_inner_content(content, as: content_type)`: replaces the inner content of the element with `content`. `content_type` is either `:text` or `:html` and determines how the content will be applied.
156
157
 
157
158
  ## Benchmarks
data/ext/selma/Cargo.toml CHANGED
@@ -5,7 +5,7 @@ edition = "2021"
5
5
 
6
6
  [dependencies]
7
7
  enum-iterator = "1.2"
8
- escapist = "0.0.1"
8
+ escapist = "0.0.2"
9
9
  magnus = { git = "https://github.com/matsadler/magnus", rev = "23160f7229ac74c42da1b5096a65ccbc40962697" }
10
10
  lol_html = "0.3"
11
11
 
@@ -1,8 +1,6 @@
1
- use std::borrow::Cow;
2
-
3
1
  use crate::native_ref_wrap::NativeRefWrap;
4
- use lol_html::html_content::{ContentType, Element};
5
- use magnus::{exception, method, Error, Module, RArray, RClass, RHash, RString, Symbol};
2
+ use lol_html::html_content::Element;
3
+ use magnus::{exception, method, Error, Module, RArray, RClass, RHash, RString, Value};
6
4
 
7
5
  struct HTMLElement {
8
6
  element: NativeRefWrap<Element<'static, 'static>>,
@@ -106,73 +104,67 @@ impl SelmaHTMLElement {
106
104
  Ok(array)
107
105
  }
108
106
 
109
- fn append(&self, text_to_append: String, content_type: Symbol) -> Result<(), Error> {
107
+ fn before(&self, args: &[Value]) -> Result<(), Error> {
110
108
  let mut binding = self.0.borrow_mut();
111
109
  let element = binding.element.get_mut().unwrap();
112
110
 
113
- let text_str = text_to_append.as_str();
114
-
115
- let content_type = Self::find_content_type(content_type);
111
+ let (text_str, content_type) = match crate::scan_text_args(args) {
112
+ Ok((text_str, content_type)) => (text_str, content_type),
113
+ Err(err) => return Err(err),
114
+ };
116
115
 
117
- element.append(text_str, content_type);
116
+ element.before(&text_str, content_type);
118
117
 
119
118
  Ok(())
120
119
  }
121
120
 
122
- fn wrap(
123
- &self,
124
- start_text: String,
125
- end_text: String,
126
- content_type: Symbol,
127
- ) -> Result<(), Error> {
121
+ fn after(&self, args: &[Value]) -> Result<(), Error> {
128
122
  let mut binding = self.0.borrow_mut();
129
123
  let element = binding.element.get_mut().unwrap();
130
124
 
131
- let before_content_type = Self::find_content_type(content_type);
132
- let after_content_type = Self::find_content_type(content_type);
133
- element.before(&start_text, before_content_type);
134
- element.after(&end_text, after_content_type);
125
+ let (text_str, content_type) = match crate::scan_text_args(args) {
126
+ Ok((text_str, content_type)) => (text_str, content_type),
127
+ Err(err) => return Err(err),
128
+ };
129
+
130
+ element.after(&text_str, content_type);
135
131
 
136
132
  Ok(())
137
133
  }
138
134
 
139
- fn set_inner_content(&self, text_to_set: String, content_type: Symbol) -> Result<(), Error> {
135
+ fn append(&self, args: &[Value]) -> Result<(), Error> {
140
136
  let mut binding = self.0.borrow_mut();
141
137
  let element = binding.element.get_mut().unwrap();
142
138
 
143
- let text_str = text_to_set.as_str();
144
-
145
- let content_type = Self::find_content_type(content_type);
139
+ let (text_str, content_type) = match crate::scan_text_args(args) {
140
+ Ok((text_str, content_type)) => (text_str, content_type),
141
+ Err(err) => return Err(err),
142
+ };
146
143
 
147
- element.set_inner_content(text_str, content_type);
144
+ element.append(&text_str, content_type);
148
145
 
149
146
  Ok(())
150
147
  }
151
148
 
152
- fn find_content_type(content_type: Symbol) -> ContentType {
153
- match content_type.name() {
154
- Ok(name) => match name {
155
- Cow::Borrowed("as_text") => ContentType::Text,
156
- Cow::Borrowed("as_html") => ContentType::Html,
157
- _ => Err(Error::new(
158
- exception::runtime_error(),
159
- format!("unknown symbol `{name:?}`"),
160
- ))
161
- .unwrap(),
162
- },
163
- Err(err) => Err(Error::new(
164
- exception::runtime_error(),
165
- format!("Could not unwrap symbol: {err:?}"),
166
- ))
167
- .unwrap(),
168
- }
149
+ fn set_inner_content(&self, args: &[Value]) -> Result<(), Error> {
150
+ let mut binding = self.0.borrow_mut();
151
+ let element = binding.element.get_mut().unwrap();
152
+
153
+ let (inner_content, content_type) = match crate::scan_text_args(args) {
154
+ Ok((inner_content, content_type)) => (inner_content, content_type),
155
+ Err(err) => return Err(err),
156
+ };
157
+
158
+ element.set_inner_content(&inner_content, content_type);
159
+
160
+ Ok(())
169
161
  }
170
162
  }
171
163
 
172
164
  pub fn init(c_html: RClass) -> Result<(), Error> {
173
165
  let c_element = c_html
174
166
  .define_class("Element", Default::default())
175
- .expect("cannot find class Selma::Element");
167
+ .expect("cannot find class Selma::HTML::Element");
176
168
 
177
169
  c_element.define_method("tag_name", method!(SelmaHTMLElement::tag_name, 0))?;
178
170
  c_element.define_method("[]", method!(SelmaHTMLElement::get_attribute, 1))?;
@@ -184,11 +176,12 @@ pub fn init(c_html: RClass) -> Result<(), Error> {
184
176
  c_element.define_method("attributes", method!(SelmaHTMLElement::get_attributes, 0))?;
185
177
  c_element.define_method("ancestors", method!(SelmaHTMLElement::get_ancestors, 0))?;
186
178
 
187
- c_element.define_method("append", method!(SelmaHTMLElement::append, 2))?;
188
- c_element.define_method("wrap", method!(SelmaHTMLElement::wrap, 3))?;
179
+ c_element.define_method("before", method!(SelmaHTMLElement::before, -1))?;
180
+ c_element.define_method("after", method!(SelmaHTMLElement::after, -1))?;
181
+ c_element.define_method("append", method!(SelmaHTMLElement::append, -1))?;
189
182
  c_element.define_method(
190
183
  "set_inner_content",
191
- method!(SelmaHTMLElement::set_inner_content, 2),
184
+ method!(SelmaHTMLElement::set_inner_content, -1),
192
185
  )?;
193
186
 
194
187
  Ok(())
@@ -6,7 +6,7 @@ struct HTMLEndTag {
6
6
  end_tag: NativeRefWrap<EndTag<'static>>,
7
7
  }
8
8
 
9
- #[magnus::wrap(class = "Selma::HTML::Element")]
9
+ #[magnus::wrap(class = "Selma::HTML::EndTag")]
10
10
  pub struct SelmaHTMLEndTag(std::cell::RefCell<HTMLEndTag>);
11
11
 
12
12
  /// SAFETY: This is safe because we only access this data when the GVL is held.
@@ -27,7 +27,7 @@ impl SelmaHTMLEndTag {
27
27
  pub fn init(c_html: RClass) -> Result<(), Error> {
28
28
  let c_end_tag = c_html
29
29
  .define_class("EndTag", Default::default())
30
- .expect("cannot find class Selma::EndTag");
30
+ .expect("cannot find class Selma::HTML::EndTag");
31
31
 
32
32
  c_end_tag.define_method("tag_name", method!(SelmaHTMLEndTag::tag_name, 0))?;
33
33
 
@@ -0,0 +1,83 @@
1
+ use crate::native_ref_wrap::NativeRefWrap;
2
+ use lol_html::html_content::{TextChunk, TextType};
3
+ use magnus::{exception, method, Error, Module, RClass, Symbol, Value};
4
+
5
+ struct HTMLTextChunk {
6
+ text_chunk: NativeRefWrap<TextChunk<'static>>,
7
+ }
8
+
9
+ #[magnus::wrap(class = "Selma::HTML::TextChunk")]
10
+ pub struct SelmaHTMLTextChunk(std::cell::RefCell<HTMLTextChunk>);
11
+
12
+ /// SAFETY: This is safe because we only access this data when the GVL is held.
13
+ unsafe impl Send for SelmaHTMLTextChunk {}
14
+
15
+ impl SelmaHTMLTextChunk {
16
+ pub fn new(text_chunk: &mut TextChunk) -> Self {
17
+ let (ref_wrap, _anchor) = NativeRefWrap::wrap_mut(text_chunk);
18
+
19
+ Self(std::cell::RefCell::new(HTMLTextChunk {
20
+ text_chunk: ref_wrap,
21
+ }))
22
+ }
23
+
24
+ fn to_s(&self) -> Result<String, Error> {
25
+ let binding = self.0.borrow();
26
+
27
+ if let Ok(tc) = binding.text_chunk.get() {
28
+ Ok(tc.as_str().to_string())
29
+ } else {
30
+ Err(Error::new(
31
+ exception::runtime_error(),
32
+ "`to_s` is not available",
33
+ ))
34
+ }
35
+ }
36
+
37
+ fn text_type(&self) -> Result<Symbol, Error> {
38
+ let binding = self.0.borrow();
39
+
40
+ if let Ok(tc) = binding.text_chunk.get() {
41
+ match tc.text_type() {
42
+ TextType::Data => Ok(Symbol::from("data")),
43
+ TextType::PlainText => Ok(Symbol::from("plain_text")),
44
+ TextType::RawText => Ok(Symbol::from("raw_text")),
45
+ TextType::ScriptData => Ok(Symbol::from("script")),
46
+ TextType::RCData => Ok(Symbol::from("rc_data")),
47
+ TextType::CDataSection => Ok(Symbol::from("cdata_section")),
48
+ }
49
+ } else {
50
+ Err(Error::new(
51
+ exception::runtime_error(),
52
+ "`text_type` is not available",
53
+ ))
54
+ }
55
+ }
56
+
57
+ fn replace(&self, args: &[Value]) -> Result<(), Error> {
58
+ let mut binding = self.0.borrow_mut();
59
+ let text_chunk = binding.text_chunk.get_mut().unwrap();
60
+
61
+ let (text_str, content_type) = match crate::scan_text_args(args) {
62
+ Ok((text_str, content_type)) => (text_str, content_type),
63
+ Err(err) => return Err(err),
64
+ };
65
+
66
+ text_chunk.replace(&text_str, content_type);
67
+
68
+ Ok(())
69
+ }
70
+ }
71
+
72
+ pub fn init(c_html: RClass) -> Result<(), Error> {
73
+ let c_text_chunk = c_html
74
+ .define_class("TextChunk", Default::default())
75
+ .expect("cannot find class Selma::HTML::TextChunk");
76
+
77
+ c_text_chunk.define_method("to_s", method!(SelmaHTMLTextChunk::to_s, 0))?;
78
+ c_text_chunk.define_method("content", method!(SelmaHTMLTextChunk::to_s, 0))?;
79
+ c_text_chunk.define_method("text_type", method!(SelmaHTMLTextChunk::text_type, 0))?;
80
+ c_text_chunk.define_method("replace", method!(SelmaHTMLTextChunk::replace, -1))?;
81
+
82
+ Ok(())
83
+ }
@@ -9,9 +9,11 @@ pub fn init(m_selma: RModule) -> Result<(), Error> {
9
9
 
10
10
  element::init(c_html).expect("cannot define Selma::HTML::Element class");
11
11
  end_tag::init(c_html).expect("cannot define Selma::HTML::EndTag class");
12
+ text_chunk::init(c_html).expect("cannot define Selma::HTML::TextChunk class");
12
13
 
13
14
  Ok(())
14
15
  }
15
16
 
16
17
  pub mod element;
17
18
  pub mod end_tag;
19
+ pub mod text_chunk;
data/ext/selma/src/lib.rs CHANGED
@@ -1,6 +1,7 @@
1
1
  extern crate core;
2
2
 
3
- use magnus::{define_module, Error};
3
+ use lol_html::html_content::ContentType;
4
+ use magnus::{define_module, exception, scan_args, Error, Symbol, Value};
4
5
 
5
6
  pub mod html;
6
7
  pub mod native_ref_wrap;
@@ -10,6 +11,32 @@ pub mod selector;
10
11
  pub mod tags;
11
12
  pub mod wrapped_struct;
12
13
 
14
+ #[allow(clippy::let_unit_value)]
15
+ fn scan_text_args(args: &[Value]) -> Result<(String, ContentType), magnus::Error> {
16
+ let args = scan_args::scan_args(args)?;
17
+ let (text,): (String,) = args.required;
18
+ let _: () = args.optional;
19
+ let _: () = args.splat;
20
+ let _: () = args.trailing;
21
+ let _: () = args.block;
22
+
23
+ let kwargs = scan_args::get_kwargs::<_, (Symbol,), (), ()>(args.keywords, &["as"], &[])?;
24
+ let as_sym = kwargs.required.0;
25
+ let as_sym_str = as_sym.name().unwrap();
26
+ let content_type = if as_sym_str == "text" {
27
+ ContentType::Text
28
+ } else if as_sym_str == "html" {
29
+ ContentType::Html
30
+ } else {
31
+ return Err(Error::new(
32
+ exception::runtime_error(),
33
+ format!("unknown symbol `{as_sym_str:?}`"),
34
+ ));
35
+ };
36
+
37
+ Ok((text, content_type))
38
+ }
39
+
13
40
  #[magnus::init]
14
41
  fn init() -> Result<(), Error> {
15
42
  let m_selma = define_module("Selma").expect("cannot define ::Selma module");
@@ -1,6 +1,6 @@
1
1
  use lol_html::{
2
2
  doc_comments, doctype, element,
3
- html_content::{ContentType, Element, EndTag, TextChunk},
3
+ html_content::{Element, EndTag, TextChunk},
4
4
  text, DocumentContentHandlers, ElementContentHandlers, HtmlRewriter, Selector, Settings,
5
5
  };
6
6
  use magnus::{exception, function, method, scan_args, Module, Object, RArray, RModule, Value};
@@ -8,7 +8,7 @@ use magnus::{exception, function, method, scan_args, Module, Object, RArray, RMo
8
8
  use std::{borrow::Cow, cell::RefCell, primitive::str, rc::Rc};
9
9
 
10
10
  use crate::{
11
- html::{element::SelmaHTMLElement, end_tag::SelmaHTMLEndTag},
11
+ html::{element::SelmaHTMLElement, end_tag::SelmaHTMLEndTag, text_chunk::SelmaHTMLTextChunk},
12
12
  sanitizer::SelmaSanitizer,
13
13
  selector::SelmaSelector,
14
14
  tags::Tag,
@@ -43,7 +43,7 @@ unsafe impl Send for SelmaRewriter {}
43
43
  impl SelmaRewriter {
44
44
  const SELMA_ON_END_TAG: &str = "on_end_tag";
45
45
  const SELMA_HANDLE_ELEMENT: &str = "handle_element";
46
- const SELMA_HANDLE_TEXT: &str = "handle_text";
46
+ const SELMA_HANDLE_TEXT_CHUNK: &str = "handle_text_chunk";
47
47
 
48
48
  /// @yard
49
49
  /// @def new(sanitizer: Selma::Sanitizer.new(Selma::Sanitizer::Config::DEFAULT), handlers: [])
@@ -145,7 +145,7 @@ impl SelmaRewriter {
145
145
  let _: () = args.trailing;
146
146
  let _: () = args.block;
147
147
 
148
- let kw = scan_args::get_kwargs::<
148
+ let kwargs = scan_args::get_kwargs::<
149
149
  _,
150
150
  (),
151
151
  (
@@ -154,7 +154,7 @@ impl SelmaRewriter {
154
154
  ),
155
155
  (),
156
156
  >(args.keywords, &[], &["sanitizer", "handlers"])?;
157
- let (rb_sanitizer, rb_handlers) = kw.optional;
157
+ let (rb_sanitizer, rb_handlers) = kwargs.optional;
158
158
 
159
159
  Ok((rb_sanitizer, rb_handlers))
160
160
  }
@@ -162,26 +162,22 @@ impl SelmaRewriter {
162
162
  /// Perform HTML rewrite sequence.
163
163
  fn rewrite(&self, html: String) -> Result<String, magnus::Error> {
164
164
  let sanitized_html = match &self.0.borrow().sanitizer {
165
- None => html,
165
+ None => Ok(html),
166
166
  Some(sanitizer) => {
167
- // due to malicious html crafting
168
- // (e.g. <<foo>script>...</script>, or <div <!-- comment -->> as in tests),
169
- // we need to run sanitization several times to truly remove unwanted tags,
170
- // because lol-html happily accepts this garbage (by design?)
171
- let sanitized_html = Self::perform_sanitization(sanitizer, &html).unwrap();
167
+ let sanitized_html = match Self::perform_sanitization(sanitizer, &html) {
168
+ Ok(sanitized_html) => sanitized_html,
169
+ Err(err) => return Err(err),
170
+ };
172
171
 
173
- String::from_utf8(sanitized_html).unwrap()
172
+ String::from_utf8(sanitized_html)
174
173
  }
175
174
  };
176
175
  let binding = self.0.borrow_mut();
177
176
  let handlers = &binding.handlers;
178
177
 
179
- match Self::perform_handler_rewrite(self, handlers, sanitized_html) {
178
+ match Self::perform_handler_rewrite(self, handlers, sanitized_html.unwrap()) {
180
179
  Ok(rewritten_html) => Ok(String::from_utf8(rewritten_html).unwrap()),
181
- Err(err) => Err(magnus::Error::new(
182
- exception::runtime_error(),
183
- format!("{err:?}"),
184
- )),
180
+ Err(err) => Err(err),
185
181
  }
186
182
  }
187
183
 
@@ -212,9 +208,10 @@ impl SelmaRewriter {
212
208
  if el.removed() {
213
209
  return Ok(());
214
210
  }
215
- sanitizer.sanitize_attributes(el);
216
-
217
- Ok(())
211
+ match sanitizer.sanitize_attributes(el) {
212
+ Ok(_) => Ok(()),
213
+ Err(err) => Err(err.to_string().into()),
214
+ }
218
215
  })],
219
216
  // TODO: allow for MemorySettings to be defined
220
217
  ..Settings::default()
@@ -341,7 +338,7 @@ impl SelmaRewriter {
341
338
  let mut stack = closure_element_stack.as_ref().borrow_mut();
342
339
  stack.pop();
343
340
  Ok(())
344
- });
341
+ })?;
345
342
  Ok(())
346
343
  }));
347
344
  });
@@ -375,13 +372,14 @@ impl SelmaRewriter {
375
372
  ) -> Result<(), magnus::Error> {
376
373
  // if `on_end_tag` function is defined, call it
377
374
  if rb_handler.respond_to(Self::SELMA_ON_END_TAG, true).unwrap() {
375
+ // TODO: error here is an "EndTagError"
378
376
  element.on_end_tag(move |end_tag| {
379
377
  let rb_end_tag = SelmaHTMLEndTag::new(end_tag);
380
378
 
381
- rb_handler
382
- .funcall::<_, _, Value>(Self::SELMA_ON_END_TAG, (rb_end_tag,))
383
- .unwrap();
384
- Ok(())
379
+ match rb_handler.funcall::<_, _, Value>(Self::SELMA_ON_END_TAG, (rb_end_tag,)) {
380
+ Ok(_) => Ok(()),
381
+ Err(err) => Err(err.to_string().into()),
382
+ }
385
383
  });
386
384
  }
387
385
 
@@ -390,40 +388,30 @@ impl SelmaRewriter {
390
388
  rb_handler.funcall::<_, _, Value>(Self::SELMA_HANDLE_ELEMENT, (rb_element,));
391
389
  match rb_result {
392
390
  Ok(_) => Ok(()),
393
- Err(err) => Err(magnus::Error::new(
394
- exception::runtime_error(),
395
- format!("{err:?}"),
396
- )),
391
+ Err(err) => Err(err),
397
392
  }
398
393
  }
399
394
 
400
- fn process_text_handlers(rb_handler: Value, text: &mut TextChunk) -> Result<(), magnus::Error> {
401
- // prevents missing `handle_text` function
402
- let content = text.as_str();
395
+ fn process_text_handlers(
396
+ rb_handler: Value,
397
+ text_chunk: &mut TextChunk,
398
+ ) -> Result<(), magnus::Error> {
399
+ // prevents missing `handle_text_chunk` function
400
+ let content = text_chunk.as_str();
403
401
 
404
402
  // seems that sometimes lol-html returns blank text / EOLs?
405
403
  if content.is_empty() {
406
404
  return Ok(());
407
405
  }
408
406
 
409
- let rb_result = rb_handler.funcall::<_, _, String>(Self::SELMA_HANDLE_TEXT, (content,));
410
-
411
- if rb_result.is_err() {
412
- return Err(magnus::Error::new(
413
- exception::type_error(),
414
- format!(
415
- "Expected #{:?} to return a string: {:?}",
416
- Self::SELMA_HANDLE_TEXT,
417
- rb_result.err().unwrap()
418
- ),
419
- ));
407
+ let rb_text_chunk = SelmaHTMLTextChunk::new(text_chunk);
408
+ match rb_handler.funcall::<_, _, Value>(Self::SELMA_HANDLE_TEXT_CHUNK, (rb_text_chunk,)) {
409
+ Ok(_) => Ok(()),
410
+ Err(err) => Err(magnus::Error::new(
411
+ exception::runtime_error(),
412
+ format!("{err:?}"),
413
+ )),
420
414
  }
421
-
422
- let new_content = rb_result.unwrap();
423
- // TODO: can this be an option?
424
- text.replace(&new_content, ContentType::Html);
425
-
426
- Ok(())
427
415
  }
428
416
  }
429
417
 
@@ -1,12 +1,10 @@
1
1
  use std::{borrow::BorrowMut, cell::RefMut, collections::HashMap};
2
2
 
3
- use lol_html::html_content::{Comment, ContentType, Doctype, Element, EndTag};
4
- use magnus::{
5
- class, exception, function, method, scan_args, Error, Module, Object, RArray, RHash, RModule,
6
- Value,
3
+ use lol_html::{
4
+ errors::AttributeNameError,
5
+ html_content::{Comment, ContentType, Doctype, Element, EndTag},
7
6
  };
8
-
9
- use crate::tags::Tag;
7
+ use magnus::{class, function, method, scan_args, Module, Object, RArray, RHash, RModule, Value};
10
8
 
11
9
  #[derive(Clone, Debug)]
12
10
  struct ElementSanitizer {
@@ -18,7 +16,7 @@ struct ElementSanitizer {
18
16
 
19
17
  #[derive(Clone, Debug)]
20
18
  pub struct Sanitizer {
21
- flags: [u8; Tag::TAG_COUNT],
19
+ flags: [u8; crate::tags::Tag::TAG_COUNT],
22
20
  allowed_attrs: Vec<String>,
23
21
  allowed_classes: Vec<String>,
24
22
  element_sanitizers: HashMap<String, ElementSanitizer>,
@@ -39,7 +37,7 @@ impl SelmaSanitizer {
39
37
  const SELMA_SANITIZER_REMOVE_CONTENTS: u8 = (1 << 2);
40
38
  const SELMA_SANITIZER_WRAP_WHITESPACE: u8 = (1 << 3);
41
39
 
42
- pub fn new(arguments: &[Value]) -> Result<Self, Error> {
40
+ pub fn new(arguments: &[Value]) -> Result<Self, magnus::Error> {
43
41
  let args = scan_args::scan_args::<(), (Option<RHash>,), (), (), (), ()>(arguments)?;
44
42
  let (opt_config,): (Option<RHash>,) = args.optional;
45
43
 
@@ -50,7 +48,7 @@ impl SelmaSanitizer {
50
48
  };
51
49
 
52
50
  let mut element_sanitizers = HashMap::new();
53
- Tag::html_tags().iter().for_each(|html_tag| {
51
+ crate::tags::Tag::html_tags().iter().for_each(|html_tag| {
54
52
  let es = ElementSanitizer {
55
53
  allowed_attrs: vec![],
56
54
  allowed_classes: vec![],
@@ -58,11 +56,14 @@ impl SelmaSanitizer {
58
56
 
59
57
  protocol_sanitizers: HashMap::new(),
60
58
  };
61
- element_sanitizers.insert(Tag::element_name_from_enum(html_tag).to_string(), es);
59
+ element_sanitizers.insert(
60
+ crate::tags::Tag::element_name_from_enum(html_tag).to_string(),
61
+ es,
62
+ );
62
63
  });
63
64
 
64
65
  Ok(Self(std::cell::RefCell::new(Sanitizer {
65
- flags: [0; Tag::TAG_COUNT],
66
+ flags: [0; crate::tags::Tag::TAG_COUNT],
66
67
  allowed_attrs: vec![],
67
68
  allowed_classes: vec![],
68
69
  element_sanitizers,
@@ -74,7 +75,7 @@ impl SelmaSanitizer {
74
75
  })))
75
76
  }
76
77
 
77
- fn get_config(&self) -> Result<RHash, Error> {
78
+ fn get_config(&self) -> Result<RHash, magnus::Error> {
78
79
  let binding = self.0.borrow();
79
80
 
80
81
  Ok(binding.config)
@@ -82,7 +83,7 @@ impl SelmaSanitizer {
82
83
 
83
84
  /// Toggle a sanitizer option on or off.
84
85
  fn set_flag(&self, tag_name: String, flag: u8, set: bool) {
85
- let tag = Tag::tag_from_tag_name(tag_name.as_str());
86
+ let tag = crate::tags::Tag::tag_from_tag_name(tag_name.as_str());
86
87
  if set {
87
88
  self.0.borrow_mut().flags[tag.index] |= flag;
88
89
  } else {
@@ -93,13 +94,19 @@ impl SelmaSanitizer {
93
94
  /// Toggles all sanitization options on or off.
94
95
  fn set_all_flags(&self, flag: u8, set: bool) {
95
96
  if set {
96
- Tag::html_tags().iter().enumerate().for_each(|(iter, _)| {
97
- self.0.borrow_mut().flags[iter] |= flag;
98
- });
97
+ crate::tags::Tag::html_tags()
98
+ .iter()
99
+ .enumerate()
100
+ .for_each(|(iter, _)| {
101
+ self.0.borrow_mut().flags[iter] |= flag;
102
+ });
99
103
  } else {
100
- Tag::html_tags().iter().enumerate().for_each(|(iter, _)| {
101
- self.0.borrow_mut().flags[iter] &= flag;
102
- });
104
+ crate::tags::Tag::html_tags()
105
+ .iter()
106
+ .enumerate()
107
+ .for_each(|(iter, _)| {
108
+ self.0.borrow_mut().flags[iter] &= flag;
109
+ });
103
110
  }
104
111
  }
105
112
 
@@ -111,8 +118,8 @@ impl SelmaSanitizer {
111
118
 
112
119
  pub fn escape_tagfilter(&self, e: &mut Element) -> bool {
113
120
  if self.0.borrow().escape_tagfilter {
114
- let tag = Tag::tag_from_element(e);
115
- if Tag::is_tag_escapeworthy(tag) {
121
+ let tag = crate::tags::Tag::tag_from_element(e);
122
+ if crate::tags::Tag::is_tag_escapeworthy(tag) {
116
123
  e.remove();
117
124
  return true;
118
125
  }
@@ -229,9 +236,9 @@ impl SelmaSanitizer {
229
236
  }
230
237
  }
231
238
 
232
- pub fn sanitize_attributes(&self, element: &mut Element) -> Result<(), magnus::Error> {
239
+ pub fn sanitize_attributes(&self, element: &mut Element) -> Result<(), AttributeNameError> {
233
240
  let binding = self.0.borrow_mut();
234
- let tag = Tag::tag_from_element(element);
241
+ let tag = crate::tags::Tag::tag_from_element(element);
235
242
  let element_sanitizer = Self::get_element_sanitizer(&binding, &element.tag_name());
236
243
 
237
244
  // FIXME: This is a hack to get around the fact that we can't borrow
@@ -255,26 +262,30 @@ impl SelmaSanitizer {
255
262
  let x = escapist::unescape_html(trimmed.as_bytes());
256
263
  let unescaped_attr_val = String::from_utf8_lossy(&x).to_string();
257
264
 
258
- if !Self::should_keep_attribute(
265
+ let should_keep_attrubute = match Self::should_keep_attribute(
259
266
  &binding,
260
267
  element,
261
268
  element_sanitizer,
262
269
  attr_name,
263
270
  &unescaped_attr_val,
264
271
  ) {
272
+ Ok(should_keep) => should_keep,
273
+ Err(e) => {
274
+ return Err(e);
275
+ }
276
+ };
277
+
278
+ if !should_keep_attrubute {
265
279
  element.remove_attribute(attr_name);
266
280
  } else {
267
281
  // Prevent the use of `<meta>` elements that set a charset other than UTF-8,
268
282
  // since output is always UTF-8.
269
- if Tag::is_meta(tag) {
283
+ if crate::tags::Tag::is_meta(tag) {
270
284
  if attr_name == "charset" && unescaped_attr_val != "utf-8" {
271
285
  match element.set_attribute(attr_name, "utf-8") {
272
286
  Ok(_) => {}
273
- Err(_) => {
274
- return Err(magnus::Error::new(
275
- exception::runtime_error(),
276
- format!("Unable to change {attr_name:?}"),
277
- ));
287
+ Err(err) => {
288
+ return Err(err);
278
289
  }
279
290
  }
280
291
  }
@@ -282,13 +293,17 @@ impl SelmaSanitizer {
282
293
  let mut buf = String::new();
283
294
  // ...then, escape any special characters, for security
284
295
  if attr_name == "href" {
285
- // FIXME: gross--------------vvvv
286
- escapist::escape_href(&mut buf, unescaped_attr_val.to_string().as_str());
296
+ escapist::escape_href(&mut buf, unescaped_attr_val.as_str());
287
297
  } else {
288
- escapist::escape_html(&mut buf, unescaped_attr_val.to_string().as_str());
298
+ escapist::escape_html(&mut buf, unescaped_attr_val.as_str());
289
299
  };
290
300
 
291
- element.set_attribute(attr_name, &buf);
301
+ match element.set_attribute(attr_name, &buf) {
302
+ Ok(_) => {}
303
+ Err(err) => {
304
+ return Err(err);
305
+ }
306
+ }
292
307
  }
293
308
  }
294
309
  }
@@ -313,7 +328,7 @@ impl SelmaSanitizer {
313
328
  element_sanitizer: &ElementSanitizer,
314
329
  attr_name: &String,
315
330
  attr_val: &String,
316
- ) -> bool {
331
+ ) -> Result<bool, AttributeNameError> {
317
332
  let mut allowed: bool = false;
318
333
  let element_allowed_attrs = element_sanitizer.allowed_attrs.contains(attr_name);
319
334
  let sanitizer_allowed_attrs = binding.allowed_attrs.contains(attr_name);
@@ -327,7 +342,7 @@ impl SelmaSanitizer {
327
342
  }
328
343
 
329
344
  if !allowed {
330
- return false;
345
+ return Ok(false);
331
346
  }
332
347
 
333
348
  let protocol_sanitizer_values = element_sanitizer.protocol_sanitizers.get(attr_name);
@@ -335,32 +350,29 @@ impl SelmaSanitizer {
335
350
  None => {
336
351
  // has a protocol, but no sanitization list
337
352
  if !attr_val.is_empty() && Self::has_protocol(attr_val) {
338
- return false;
353
+ return Ok(false);
339
354
  }
340
355
  }
341
356
  Some(protocol_sanitizer_values) => {
342
357
  if !attr_val.is_empty()
343
358
  && !Self::has_allowed_protocol(protocol_sanitizer_values, attr_val)
344
359
  {
345
- return false;
360
+ return Ok(false);
346
361
  }
347
362
  }
348
363
  }
349
364
 
350
- if attr_name == "class"
351
- && !Self::sanitize_class_attribute(
365
+ if attr_name == "class" {
366
+ return Self::sanitize_class_attribute(
352
367
  binding,
353
368
  element,
354
369
  element_sanitizer,
355
370
  attr_name,
356
371
  attr_val,
357
- )
358
- .unwrap()
359
- {
360
- return false;
372
+ );
361
373
  }
362
374
 
363
- true
375
+ Ok(true)
364
376
  }
365
377
 
366
378
  fn has_protocol(attr_val: &str) -> bool {
@@ -403,7 +415,7 @@ impl SelmaSanitizer {
403
415
  element_sanitizer: &ElementSanitizer,
404
416
  attr_name: &str,
405
417
  attr_val: &str,
406
- ) -> Result<bool, Error> {
418
+ ) -> Result<bool, lol_html::errors::AttributeNameError> {
407
419
  let allowed_global = &binding.allowed_classes;
408
420
 
409
421
  let mut valid_classes: Vec<String> = vec![];
@@ -431,28 +443,25 @@ impl SelmaSanitizer {
431
443
 
432
444
  match element.set_attribute(attr_name, valid_classes.join(" ").as_str()) {
433
445
  Ok(_) => Ok(true),
434
- Err(err) => Err(Error::new(
435
- exception::runtime_error(),
436
- format!("AttributeNameError: {err:?}"),
437
- )),
446
+ Err(err) => Err(err),
438
447
  }
439
448
  }
440
449
 
441
450
  pub fn allow_element(&self, element: &mut Element) -> bool {
442
- let tag = Tag::tag_from_element(element);
451
+ let tag = crate::tags::Tag::tag_from_element(element);
443
452
  let flags: u8 = self.0.borrow().flags[tag.index];
444
453
 
445
454
  (flags & Self::SELMA_SANITIZER_ALLOW) == 0
446
455
  }
447
456
 
448
457
  pub fn try_remove_element(&self, element: &mut Element) -> bool {
449
- let tag = Tag::tag_from_element(element);
458
+ let tag = crate::tags::Tag::tag_from_element(element);
450
459
  let flags: u8 = self.0.borrow().flags[tag.index];
451
460
 
452
461
  let should_remove = !element.removed() && self.allow_element(element);
453
462
 
454
463
  if should_remove {
455
- if Tag::has_text_content(tag) {
464
+ if crate::tags::Tag::has_text_content(tag) {
456
465
  Self::remove_element(
457
466
  element,
458
467
  tag.self_closing,
@@ -465,7 +474,7 @@ impl SelmaSanitizer {
465
474
  Self::check_if_end_tag_needs_removal(element);
466
475
  } else {
467
476
  // anything in <iframe> must be removed, if it's kept
468
- if Tag::is_iframe(tag) {
477
+ if crate::tags::Tag::is_iframe(tag) {
469
478
  if self.0.borrow().flags[tag.index] != 0 {
470
479
  element.set_inner_content(" ", ContentType::Text);
471
480
  } else {
@@ -497,14 +506,14 @@ impl SelmaSanitizer {
497
506
  }
498
507
 
499
508
  pub fn force_remove_element(&self, element: &mut Element) {
500
- let tag = Tag::tag_from_element(element);
509
+ let tag = crate::tags::Tag::tag_from_element(element);
501
510
  let self_closing = tag.self_closing;
502
511
  Self::remove_element(element, self_closing, Self::SELMA_SANITIZER_REMOVE_CONTENTS);
503
512
  Self::check_if_end_tag_needs_removal(element);
504
513
  }
505
514
 
506
515
  fn check_if_end_tag_needs_removal(element: &mut Element) {
507
- if element.removed() && !Tag::tag_from_element(element).self_closing {
516
+ if element.removed() && !crate::tags::Tag::tag_from_element(element).self_closing {
508
517
  element
509
518
  .on_end_tag(move |end| {
510
519
  Self::remove_end_tag(end);
@@ -533,7 +542,7 @@ impl SelmaSanitizer {
533
542
  }
534
543
  }
535
544
 
536
- pub fn init(m_selma: RModule) -> Result<(), Error> {
545
+ pub fn init(m_selma: RModule) -> Result<(), magnus::Error> {
537
546
  let c_sanitizer = m_selma.define_class("Sanitizer", Default::default())?;
538
547
 
539
548
  c_sanitizer.define_singleton_method("new", function!(SelmaSanitizer::new, -1))?;
@@ -192,14 +192,17 @@ impl Tag {
192
192
  /// Is this tag something which needs to be removed?
193
193
  pub fn is_tag_escapeworthy(tag: Tag) -> bool {
194
194
  tag.index == HTMLTag::TITLE as usize
195
- || tag.index == HTMLTag::TEXTAREA as usize
196
- || tag.index == HTMLTag::STYLE as usize
197
- || tag.index == HTMLTag::XMP as usize
198
195
  || tag.index == HTMLTag::IFRAME as usize
196
+ || tag.index == HTMLTag::MATH as usize
199
197
  || tag.index == HTMLTag::NOEMBED as usize
200
198
  || tag.index == HTMLTag::NOFRAMES as usize
201
- || tag.index == HTMLTag::SCRIPT as usize
199
+ || tag.index == HTMLTag::NOSCRIPT as usize
202
200
  || tag.index == HTMLTag::PLAINTEXT as usize
201
+ || tag.index == HTMLTag::SCRIPT as usize
202
+ || tag.index == HTMLTag::STYLE as usize
203
+ || tag.index == HTMLTag::SVG as usize
204
+ || tag.index == HTMLTag::TEXTAREA as usize
205
+ || tag.index == HTMLTag::XMP as usize
203
206
  }
204
207
 
205
208
  pub const ESCAPEWORTHY_TAGS_CSS: &str =
@@ -3,6 +3,10 @@
3
3
  module Selma
4
4
  class Sanitizer
5
5
  module Config
6
+ # although there are many more protocol types, eg., ftp, xmpp, etc.,
7
+ # these are the only ones that are allowed by default
8
+ VALID_PROTOCOLS = ["http", "https", "mailto", :relative]
9
+
6
10
  DEFAULT = freeze_config(
7
11
  # Whether or not to allow HTML comments. Allowing comments is strongly
8
12
  # discouraged, since IE allows script execution within conditional
data/lib/selma/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Selma
4
- VERSION = "0.0.3"
4
+ VERSION = "0.0.4"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: selma
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Garen J. Torikian
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-12-24 00:00:00.000000000 Z
11
+ date: 2022-12-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -82,6 +82,7 @@ files:
82
82
  - ext/selma/src/html.rs
83
83
  - ext/selma/src/html/element.rs
84
84
  - ext/selma/src/html/end_tag.rs
85
+ - ext/selma/src/html/text_chunk.rs
85
86
  - ext/selma/src/lib.rs
86
87
  - ext/selma/src/native_ref_wrap.rs
87
88
  - ext/selma/src/rewriter.rs