selma 0.0.3-arm64-darwin → 0.0.4-arm64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f9939e73d5832b8d53b08f867bf218ab761211a6390f2da824889a7ec9d95516
4
- data.tar.gz: e605aa93a58224fef7dc76c6fe587903a57cc03339f68da3cfc7f801bfa0228c
3
+ metadata.gz: f899592f41387386f5d3710bed4ce837d90ef562f8c4d1be388ffc7d12d611a6
4
+ data.tar.gz: e7b29fc1c6ffa7a8543a041c64b12511e1423e6ed966e4da34429a43923b76a2
5
5
  SHA512:
6
- metadata.gz: 71c3adb9c05a949eaf4079c72db7e5d1a2c8c1a3bbc20190dbc0dc62fc7d1105452c222ae0b4594694008efaa06bdb2ed57ae54927ed93d2b3716b2f742983f2
7
- data.tar.gz: 431c58a0824a3b711724e95809323009061ca750bf9fa9a70999f31e454b194d537edfefdd287aa887416e3c624a1e9e354b28251c6caa0fe2b144e9bb75687d
6
+ metadata.gz: 256f9015d9ccdca3fb3e788c3cbc6d6e7bb174de08b9cfc009b3ee3ec49f33999834d8d83467ffff24e8a6a5a0efa55d70309f6505403cccd7a58cd7591198d4
7
+ data.tar.gz: 64ba49bb6934213db43cc34de2886df48209a3097c9cdc540c0f5cc8a79592befe49e7065fe606c6a163193e7829daf2fbf99cb0181a9cefb721d9bdbbbd65a1
data/README.md CHANGED
@@ -56,6 +56,10 @@ allow_comments: false,
56
56
  # "<!DOCTYPE html>" when sanitizing a document.
57
57
  allow_doctype: false,
58
58
 
59
+ # HTML elements to allow. By default, no elements are allowed (which means
60
+ # that all HTML will be stripped).
61
+ elements: ["a", "b", "img", ],
62
+
59
63
  # HTML attributes to allow in specific elements. The key is the name of the element,
60
64
  # and the value is an array of allowed attributes. By default, no attributes
61
65
  # are allowed.
@@ -64,14 +68,10 @@ attributes: {
64
68
  "img" => ["src"],
65
69
  },
66
70
 
67
- # HTML elements to allow. By default, no elements are allowed (which means
68
- # that all HTML will be stripped).
69
- elements: ["a", "b", "img", ],
70
-
71
71
  # URL handling protocols to allow in specific attributes. By default, no
72
72
  # protocols are allowed. Use :relative in place of a protocol if you want
73
73
  # to allow relative URLs sans protocol.
74
- protocols: {
74
+ protocols: {
75
75
  "a" => { "href" => ["http", "https", "mailto", :relative] },
76
76
  "img" => { "href" => ["http", "https"] },
77
77
  },
@@ -91,7 +91,7 @@ The real power in Selma comes in its use of handlers. A handler is simply an obj
91
91
 
92
92
  - `selector`, a method which MUST return an instance of `Selma::Selector` which defines the CSS classes to match
93
93
  - `handle_element`, a method that's called on each matched element
94
- - `handle_text`, a method that's called on each matched text node; this MUST return a string
94
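+ - `handle_text_chunk`, a method that's called on each matched text node; it receives a `Selma::HTML::TextChunk`, and the text is changed by calling `replace` on that chunk (the method's return value is not used)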
95
95
 
96
96
  Here's an example which rewrites the `href` attribute on `a` and the `src` attribute on `img` to be `https` rather than `http`.
97
97
 
@@ -118,7 +118,7 @@ rewriter = Selma::Rewriter.new(handlers: [MatchAttribute.new])
118
118
  The `Selma::Selector` object has three possible kwargs:
119
119
 
120
120
  - `match_element`: any element which matches this CSS rule will be passed on to `handle_element`
121
- - `match_text_within`: any element which matches this CSS rule will be passed on to `handle_text`
121
+ - `match_text_within`: any element which matches this CSS rule will be passed on to `handle_text_chunk`
122
122
  - `ignore_text_within`: this is an array of element names whose text contents will be ignored
123
123
 
124
124
  You've seen an example of `match_element`; here's one for `match_text_within` which changes strings in various elements which are _not_ `pre` or `code`:
@@ -132,7 +132,7 @@ class MatchText
132
132
  SELECTOR
133
133
  end
134
134
 
135
- def handle_text(text)
135
+ def handle_text_chunk(text)
136
136
  string.sub(/@.+/, "<a href=\"www.yetto.app/#{Regexp.last_match}\">")
137
137
  end
138
138
  end
@@ -150,8 +150,9 @@ The `element` argument in `handle_element` has the following methods:
150
150
  - `remove_attribute`: remove an attribute
151
151
  - `attributes`: list all the attributes
152
152
  - `ancestors`: list all the ancestors
153
- - `append(content, content_type)`: appends `content` to the element's inner content, i.e. inserts content right before the element's end tag. `content_type` is either `:text` or `:html` and determines how the content will be applied.
154
- - `wrap(start_text, end_text, content_type)`: adds `start_text` before an element and `end_text` after an element. `content_type` is either `:text` or `:html` and determines how the content will be applied.
153
+ - `append(content, as: content_type)`: appends `content` to the element's inner content, i.e. inserts content right before the element's end tag. `content_type` is either `:text` or `:html` and determines how the content will be applied.
154
+ - `before(content, as: content_type)`: Inserts `content` before the element. `content_type` is either `:text` or `:html` and determines how the content will be applied.
155
+ - `after(content, as: content_type)`: Inserts `content` after the element. `content_type` is either `:text` or `:html` and determines how the content will be applied.
155
156
  - `set_inner_content(content, as: content_type)`: replaces inner content of the element with `content`. `content_type` is either `:text` or `:html` and determines how the content will be applied.
156
157
 
157
158
  ## Benchmarks
data/ext/selma/Cargo.toml CHANGED
@@ -5,7 +5,7 @@ edition = "2021"
5
5
 
6
6
  [dependencies]
7
7
  enum-iterator = "1.2"
8
- escapist = "0.0.1"
8
+ escapist = "0.0.2"
9
9
  magnus = { git = "https://github.com/matsadler/magnus", rev = "23160f7229ac74c42da1b5096a65ccbc40962697" }
10
10
  lol_html = "0.3"
11
11
 
@@ -1,8 +1,6 @@
1
- use std::borrow::Cow;
2
-
3
1
  use crate::native_ref_wrap::NativeRefWrap;
4
- use lol_html::html_content::{ContentType, Element};
5
- use magnus::{exception, method, Error, Module, RArray, RClass, RHash, RString, Symbol};
2
+ use lol_html::html_content::Element;
3
+ use magnus::{exception, method, Error, Module, RArray, RClass, RHash, RString, Value};
6
4
 
7
5
  struct HTMLElement {
8
6
  element: NativeRefWrap<Element<'static, 'static>>,
@@ -106,73 +104,67 @@ impl SelmaHTMLElement {
106
104
  Ok(array)
107
105
  }
108
106
 
109
- fn append(&self, text_to_append: String, content_type: Symbol) -> Result<(), Error> {
107
+ fn before(&self, args: &[Value]) -> Result<(), Error> {
110
108
  let mut binding = self.0.borrow_mut();
111
109
  let element = binding.element.get_mut().unwrap();
112
110
 
113
- let text_str = text_to_append.as_str();
114
-
115
- let content_type = Self::find_content_type(content_type);
111
+ let (text_str, content_type) = match crate::scan_text_args(args) {
112
+ Ok((text_str, content_type)) => (text_str, content_type),
113
+ Err(err) => return Err(err),
114
+ };
116
115
 
117
- element.append(text_str, content_type);
116
+ element.before(&text_str, content_type);
118
117
 
119
118
  Ok(())
120
119
  }
121
120
 
122
- fn wrap(
123
- &self,
124
- start_text: String,
125
- end_text: String,
126
- content_type: Symbol,
127
- ) -> Result<(), Error> {
121
+ fn after(&self, args: &[Value]) -> Result<(), Error> {
128
122
  let mut binding = self.0.borrow_mut();
129
123
  let element = binding.element.get_mut().unwrap();
130
124
 
131
- let before_content_type = Self::find_content_type(content_type);
132
- let after_content_type = Self::find_content_type(content_type);
133
- element.before(&start_text, before_content_type);
134
- element.after(&end_text, after_content_type);
125
+ let (text_str, content_type) = match crate::scan_text_args(args) {
126
+ Ok((text_str, content_type)) => (text_str, content_type),
127
+ Err(err) => return Err(err),
128
+ };
129
+
130
+ element.after(&text_str, content_type);
135
131
 
136
132
  Ok(())
137
133
  }
138
134
 
139
- fn set_inner_content(&self, text_to_set: String, content_type: Symbol) -> Result<(), Error> {
135
+ fn append(&self, args: &[Value]) -> Result<(), Error> {
140
136
  let mut binding = self.0.borrow_mut();
141
137
  let element = binding.element.get_mut().unwrap();
142
138
 
143
- let text_str = text_to_set.as_str();
144
-
145
- let content_type = Self::find_content_type(content_type);
139
+ let (text_str, content_type) = match crate::scan_text_args(args) {
140
+ Ok((text_str, content_type)) => (text_str, content_type),
141
+ Err(err) => return Err(err),
142
+ };
146
143
 
147
- element.set_inner_content(text_str, content_type);
144
+ element.append(&text_str, content_type);
148
145
 
149
146
  Ok(())
150
147
  }
151
148
 
152
- fn find_content_type(content_type: Symbol) -> ContentType {
153
- match content_type.name() {
154
- Ok(name) => match name {
155
- Cow::Borrowed("as_text") => ContentType::Text,
156
- Cow::Borrowed("as_html") => ContentType::Html,
157
- _ => Err(Error::new(
158
- exception::runtime_error(),
159
- format!("unknown symbol `{name:?}`"),
160
- ))
161
- .unwrap(),
162
- },
163
- Err(err) => Err(Error::new(
164
- exception::runtime_error(),
165
- format!("Could not unwrap symbol: {err:?}"),
166
- ))
167
- .unwrap(),
168
- }
149
+ fn set_inner_content(&self, args: &[Value]) -> Result<(), Error> {
150
+ let mut binding = self.0.borrow_mut();
151
+ let element = binding.element.get_mut().unwrap();
152
+
153
+ let (inner_content, content_type) = match crate::scan_text_args(args) {
154
+ Ok((inner_content, content_type)) => (inner_content, content_type),
155
+ Err(err) => return Err(err),
156
+ };
157
+
158
+ element.set_inner_content(&inner_content, content_type);
159
+
160
+ Ok(())
169
161
  }
170
162
  }
171
163
 
172
164
  pub fn init(c_html: RClass) -> Result<(), Error> {
173
165
  let c_element = c_html
174
166
  .define_class("Element", Default::default())
175
- .expect("cannot find class Selma::Element");
167
+ .expect("cannot find class Selma::HTML::Element");
176
168
 
177
169
  c_element.define_method("tag_name", method!(SelmaHTMLElement::tag_name, 0))?;
178
170
  c_element.define_method("[]", method!(SelmaHTMLElement::get_attribute, 1))?;
@@ -184,11 +176,12 @@ pub fn init(c_html: RClass) -> Result<(), Error> {
184
176
  c_element.define_method("attributes", method!(SelmaHTMLElement::get_attributes, 0))?;
185
177
  c_element.define_method("ancestors", method!(SelmaHTMLElement::get_ancestors, 0))?;
186
178
 
187
- c_element.define_method("append", method!(SelmaHTMLElement::append, 2))?;
188
- c_element.define_method("wrap", method!(SelmaHTMLElement::wrap, 3))?;
179
+ c_element.define_method("before", method!(SelmaHTMLElement::before, -1))?;
180
+ c_element.define_method("after", method!(SelmaHTMLElement::after, -1))?;
181
+ c_element.define_method("append", method!(SelmaHTMLElement::append, -1))?;
189
182
  c_element.define_method(
190
183
  "set_inner_content",
191
- method!(SelmaHTMLElement::set_inner_content, 2),
184
+ method!(SelmaHTMLElement::set_inner_content, -1),
192
185
  )?;
193
186
 
194
187
  Ok(())
@@ -6,7 +6,7 @@ struct HTMLEndTag {
6
6
  end_tag: NativeRefWrap<EndTag<'static>>,
7
7
  }
8
8
 
9
- #[magnus::wrap(class = "Selma::HTML::Element")]
9
+ #[magnus::wrap(class = "Selma::HTML::EndTag")]
10
10
  pub struct SelmaHTMLEndTag(std::cell::RefCell<HTMLEndTag>);
11
11
 
12
12
  /// SAFETY: This is safe because we only access this data when the GVL is held.
@@ -27,7 +27,7 @@ impl SelmaHTMLEndTag {
27
27
  pub fn init(c_html: RClass) -> Result<(), Error> {
28
28
  let c_end_tag = c_html
29
29
  .define_class("EndTag", Default::default())
30
- .expect("cannot find class Selma::EndTag");
30
+ .expect("cannot find class Selma::HTML::EndTag");
31
31
 
32
32
  c_end_tag.define_method("tag_name", method!(SelmaHTMLEndTag::tag_name, 0))?;
33
33
 
@@ -0,0 +1,83 @@
1
+ use crate::native_ref_wrap::NativeRefWrap;
2
+ use lol_html::html_content::{TextChunk, TextType};
3
+ use magnus::{exception, method, Error, Module, RClass, Symbol, Value};
4
+
5
+ struct HTMLTextChunk {
6
+ text_chunk: NativeRefWrap<TextChunk<'static>>,
7
+ }
8
+
9
+ #[magnus::wrap(class = "Selma::HTML::TextChunk")]
10
+ pub struct SelmaHTMLTextChunk(std::cell::RefCell<HTMLTextChunk>);
11
+
12
+ /// SAFETY: This is safe because we only access this data when the GVL is held.
13
+ unsafe impl Send for SelmaHTMLTextChunk {}
14
+
15
+ impl SelmaHTMLTextChunk {
16
+ pub fn new(text_chunk: &mut TextChunk) -> Self {
17
+ let (ref_wrap, _anchor) = NativeRefWrap::wrap_mut(text_chunk);
18
+
19
+ Self(std::cell::RefCell::new(HTMLTextChunk {
20
+ text_chunk: ref_wrap,
21
+ }))
22
+ }
23
+
24
+ fn to_s(&self) -> Result<String, Error> {
25
+ let binding = self.0.borrow();
26
+
27
+ if let Ok(tc) = binding.text_chunk.get() {
28
+ Ok(tc.as_str().to_string())
29
+ } else {
30
+ Err(Error::new(
31
+ exception::runtime_error(),
32
+ "`to_s` is not available",
33
+ ))
34
+ }
35
+ }
36
+
37
+ fn text_type(&self) -> Result<Symbol, Error> {
38
+ let binding = self.0.borrow();
39
+
40
+ if let Ok(tc) = binding.text_chunk.get() {
41
+ match tc.text_type() {
42
+ TextType::Data => Ok(Symbol::from("data")),
43
+ TextType::PlainText => Ok(Symbol::from("plain_text")),
44
+ TextType::RawText => Ok(Symbol::from("raw_text")),
45
+ TextType::ScriptData => Ok(Symbol::from("script")),
46
+ TextType::RCData => Ok(Symbol::from("rc_data")),
47
+ TextType::CDataSection => Ok(Symbol::from("cdata_section")),
48
+ }
49
+ } else {
50
+ Err(Error::new(
51
+ exception::runtime_error(),
52
+ "`text_type` is not available",
53
+ ))
54
+ }
55
+ }
56
+
57
+ fn replace(&self, args: &[Value]) -> Result<(), Error> {
58
+ let mut binding = self.0.borrow_mut();
59
+ let text_chunk = binding.text_chunk.get_mut().unwrap();
60
+
61
+ let (text_str, content_type) = match crate::scan_text_args(args) {
62
+ Ok((text_str, content_type)) => (text_str, content_type),
63
+ Err(err) => return Err(err),
64
+ };
65
+
66
+ text_chunk.replace(&text_str, content_type);
67
+
68
+ Ok(())
69
+ }
70
+ }
71
+
72
+ pub fn init(c_html: RClass) -> Result<(), Error> {
73
+ let c_text_chunk = c_html
74
+ .define_class("TextChunk", Default::default())
75
+ .expect("cannot find class Selma::HTML::TextChunk");
76
+
77
+ c_text_chunk.define_method("to_s", method!(SelmaHTMLTextChunk::to_s, 0))?;
78
+ c_text_chunk.define_method("content", method!(SelmaHTMLTextChunk::to_s, 0))?;
79
+ c_text_chunk.define_method("text_type", method!(SelmaHTMLTextChunk::text_type, 0))?;
80
+ c_text_chunk.define_method("replace", method!(SelmaHTMLTextChunk::replace, -1))?;
81
+
82
+ Ok(())
83
+ }
@@ -9,9 +9,11 @@ pub fn init(m_selma: RModule) -> Result<(), Error> {
9
9
 
10
10
  element::init(c_html).expect("cannot define Selma::HTML::Element class");
11
11
  end_tag::init(c_html).expect("cannot define Selma::HTML::EndTag class");
12
+ text_chunk::init(c_html).expect("cannot define Selma::HTML::TextChunk class");
12
13
 
13
14
  Ok(())
14
15
  }
15
16
 
16
17
  pub mod element;
17
18
  pub mod end_tag;
19
+ pub mod text_chunk;
data/ext/selma/src/lib.rs CHANGED
@@ -1,6 +1,7 @@
1
1
  extern crate core;
2
2
 
3
- use magnus::{define_module, Error};
3
+ use lol_html::html_content::ContentType;
4
+ use magnus::{define_module, exception, scan_args, Error, Symbol, Value};
4
5
 
5
6
  pub mod html;
6
7
  pub mod native_ref_wrap;
@@ -10,6 +11,32 @@ pub mod selector;
10
11
  pub mod tags;
11
12
  pub mod wrapped_struct;
12
13
 
14
+ #[allow(clippy::let_unit_value)]
15
+ fn scan_text_args(args: &[Value]) -> Result<(String, ContentType), magnus::Error> {
16
+ let args = scan_args::scan_args(args)?;
17
+ let (text,): (String,) = args.required;
18
+ let _: () = args.optional;
19
+ let _: () = args.splat;
20
+ let _: () = args.trailing;
21
+ let _: () = args.block;
22
+
23
+ let kwargs = scan_args::get_kwargs::<_, (Symbol,), (), ()>(args.keywords, &["as"], &[])?;
24
+ let as_sym = kwargs.required.0;
25
+ let as_sym_str = as_sym.name().unwrap();
26
+ let content_type = if as_sym_str == "text" {
27
+ ContentType::Text
28
+ } else if as_sym_str == "html" {
29
+ ContentType::Html
30
+ } else {
31
+ return Err(Error::new(
32
+ exception::runtime_error(),
33
+ format!("unknown symbol `{as_sym_str:?}`"),
34
+ ));
35
+ };
36
+
37
+ Ok((text, content_type))
38
+ }
39
+
13
40
  #[magnus::init]
14
41
  fn init() -> Result<(), Error> {
15
42
  let m_selma = define_module("Selma").expect("cannot define ::Selma module");
@@ -1,6 +1,6 @@
1
1
  use lol_html::{
2
2
  doc_comments, doctype, element,
3
- html_content::{ContentType, Element, EndTag, TextChunk},
3
+ html_content::{Element, EndTag, TextChunk},
4
4
  text, DocumentContentHandlers, ElementContentHandlers, HtmlRewriter, Selector, Settings,
5
5
  };
6
6
  use magnus::{exception, function, method, scan_args, Module, Object, RArray, RModule, Value};
@@ -8,7 +8,7 @@ use magnus::{exception, function, method, scan_args, Module, Object, RArray, RMo
8
8
  use std::{borrow::Cow, cell::RefCell, primitive::str, rc::Rc};
9
9
 
10
10
  use crate::{
11
- html::{element::SelmaHTMLElement, end_tag::SelmaHTMLEndTag},
11
+ html::{element::SelmaHTMLElement, end_tag::SelmaHTMLEndTag, text_chunk::SelmaHTMLTextChunk},
12
12
  sanitizer::SelmaSanitizer,
13
13
  selector::SelmaSelector,
14
14
  tags::Tag,
@@ -43,7 +43,7 @@ unsafe impl Send for SelmaRewriter {}
43
43
  impl SelmaRewriter {
44
44
  const SELMA_ON_END_TAG: &str = "on_end_tag";
45
45
  const SELMA_HANDLE_ELEMENT: &str = "handle_element";
46
- const SELMA_HANDLE_TEXT: &str = "handle_text";
46
+ const SELMA_HANDLE_TEXT_CHUNK: &str = "handle_text_chunk";
47
47
 
48
48
  /// @yard
49
49
  /// @def new(sanitizer: Selma::Sanitizer.new(Selma::Sanitizer::Config::DEFAULT), handlers: [])
@@ -145,7 +145,7 @@ impl SelmaRewriter {
145
145
  let _: () = args.trailing;
146
146
  let _: () = args.block;
147
147
 
148
- let kw = scan_args::get_kwargs::<
148
+ let kwargs = scan_args::get_kwargs::<
149
149
  _,
150
150
  (),
151
151
  (
@@ -154,7 +154,7 @@ impl SelmaRewriter {
154
154
  ),
155
155
  (),
156
156
  >(args.keywords, &[], &["sanitizer", "handlers"])?;
157
- let (rb_sanitizer, rb_handlers) = kw.optional;
157
+ let (rb_sanitizer, rb_handlers) = kwargs.optional;
158
158
 
159
159
  Ok((rb_sanitizer, rb_handlers))
160
160
  }
@@ -162,26 +162,22 @@ impl SelmaRewriter {
162
162
  /// Perform HTML rewrite sequence.
163
163
  fn rewrite(&self, html: String) -> Result<String, magnus::Error> {
164
164
  let sanitized_html = match &self.0.borrow().sanitizer {
165
- None => html,
165
+ None => Ok(html),
166
166
  Some(sanitizer) => {
167
- // due to malicious html crafting
168
- // (e.g. <<foo>script>...</script>, or <div <!-- comment -->> as in tests),
169
- // we need to run sanitization several times to truly remove unwanted tags,
170
- // because lol-html happily accepts this garbage (by design?)
171
- let sanitized_html = Self::perform_sanitization(sanitizer, &html).unwrap();
167
+ let sanitized_html = match Self::perform_sanitization(sanitizer, &html) {
168
+ Ok(sanitized_html) => sanitized_html,
169
+ Err(err) => return Err(err),
170
+ };
172
171
 
173
- String::from_utf8(sanitized_html).unwrap()
172
+ String::from_utf8(sanitized_html)
174
173
  }
175
174
  };
176
175
  let binding = self.0.borrow_mut();
177
176
  let handlers = &binding.handlers;
178
177
 
179
- match Self::perform_handler_rewrite(self, handlers, sanitized_html) {
178
+ match Self::perform_handler_rewrite(self, handlers, sanitized_html.unwrap()) {
180
179
  Ok(rewritten_html) => Ok(String::from_utf8(rewritten_html).unwrap()),
181
- Err(err) => Err(magnus::Error::new(
182
- exception::runtime_error(),
183
- format!("{err:?}"),
184
- )),
180
+ Err(err) => Err(err),
185
181
  }
186
182
  }
187
183
 
@@ -212,9 +208,10 @@ impl SelmaRewriter {
212
208
  if el.removed() {
213
209
  return Ok(());
214
210
  }
215
- sanitizer.sanitize_attributes(el);
216
-
217
- Ok(())
211
+ match sanitizer.sanitize_attributes(el) {
212
+ Ok(_) => Ok(()),
213
+ Err(err) => Err(err.to_string().into()),
214
+ }
218
215
  })],
219
216
  // TODO: allow for MemorySettings to be defined
220
217
  ..Settings::default()
@@ -341,7 +338,7 @@ impl SelmaRewriter {
341
338
  let mut stack = closure_element_stack.as_ref().borrow_mut();
342
339
  stack.pop();
343
340
  Ok(())
344
- });
341
+ })?;
345
342
  Ok(())
346
343
  }));
347
344
  });
@@ -375,13 +372,14 @@ impl SelmaRewriter {
375
372
  ) -> Result<(), magnus::Error> {
376
373
  // if `on_end_tag` function is defined, call it
377
374
  if rb_handler.respond_to(Self::SELMA_ON_END_TAG, true).unwrap() {
375
+ // TODO: error here is an "EndTagError"
378
376
  element.on_end_tag(move |end_tag| {
379
377
  let rb_end_tag = SelmaHTMLEndTag::new(end_tag);
380
378
 
381
- rb_handler
382
- .funcall::<_, _, Value>(Self::SELMA_ON_END_TAG, (rb_end_tag,))
383
- .unwrap();
384
- Ok(())
379
+ match rb_handler.funcall::<_, _, Value>(Self::SELMA_ON_END_TAG, (rb_end_tag,)) {
380
+ Ok(_) => Ok(()),
381
+ Err(err) => Err(err.to_string().into()),
382
+ }
385
383
  });
386
384
  }
387
385
 
@@ -390,40 +388,30 @@ impl SelmaRewriter {
390
388
  rb_handler.funcall::<_, _, Value>(Self::SELMA_HANDLE_ELEMENT, (rb_element,));
391
389
  match rb_result {
392
390
  Ok(_) => Ok(()),
393
- Err(err) => Err(magnus::Error::new(
394
- exception::runtime_error(),
395
- format!("{err:?}"),
396
- )),
391
+ Err(err) => Err(err),
397
392
  }
398
393
  }
399
394
 
400
- fn process_text_handlers(rb_handler: Value, text: &mut TextChunk) -> Result<(), magnus::Error> {
401
- // prevents missing `handle_text` function
402
- let content = text.as_str();
395
+ fn process_text_handlers(
396
+ rb_handler: Value,
397
+ text_chunk: &mut TextChunk,
398
+ ) -> Result<(), magnus::Error> {
399
+ // prevents missing `handle_text_chunk` function
400
+ let content = text_chunk.as_str();
403
401
 
404
402
  // seems that sometimes lol-html returns blank text / EOLs?
405
403
  if content.is_empty() {
406
404
  return Ok(());
407
405
  }
408
406
 
409
- let rb_result = rb_handler.funcall::<_, _, String>(Self::SELMA_HANDLE_TEXT, (content,));
410
-
411
- if rb_result.is_err() {
412
- return Err(magnus::Error::new(
413
- exception::type_error(),
414
- format!(
415
- "Expected #{:?} to return a string: {:?}",
416
- Self::SELMA_HANDLE_TEXT,
417
- rb_result.err().unwrap()
418
- ),
419
- ));
407
+ let rb_text_chunk = SelmaHTMLTextChunk::new(text_chunk);
408
+ match rb_handler.funcall::<_, _, Value>(Self::SELMA_HANDLE_TEXT_CHUNK, (rb_text_chunk,)) {
409
+ Ok(_) => Ok(()),
410
+ Err(err) => Err(magnus::Error::new(
411
+ exception::runtime_error(),
412
+ format!("{err:?}"),
413
+ )),
420
414
  }
421
-
422
- let new_content = rb_result.unwrap();
423
- // TODO: can this be an option?
424
- text.replace(&new_content, ContentType::Html);
425
-
426
- Ok(())
427
415
  }
428
416
  }
429
417
 
@@ -1,12 +1,10 @@
1
1
  use std::{borrow::BorrowMut, cell::RefMut, collections::HashMap};
2
2
 
3
- use lol_html::html_content::{Comment, ContentType, Doctype, Element, EndTag};
4
- use magnus::{
5
- class, exception, function, method, scan_args, Error, Module, Object, RArray, RHash, RModule,
6
- Value,
3
+ use lol_html::{
4
+ errors::AttributeNameError,
5
+ html_content::{Comment, ContentType, Doctype, Element, EndTag},
7
6
  };
8
-
9
- use crate::tags::Tag;
7
+ use magnus::{class, function, method, scan_args, Module, Object, RArray, RHash, RModule, Value};
10
8
 
11
9
  #[derive(Clone, Debug)]
12
10
  struct ElementSanitizer {
@@ -18,7 +16,7 @@ struct ElementSanitizer {
18
16
 
19
17
  #[derive(Clone, Debug)]
20
18
  pub struct Sanitizer {
21
- flags: [u8; Tag::TAG_COUNT],
19
+ flags: [u8; crate::tags::Tag::TAG_COUNT],
22
20
  allowed_attrs: Vec<String>,
23
21
  allowed_classes: Vec<String>,
24
22
  element_sanitizers: HashMap<String, ElementSanitizer>,
@@ -39,7 +37,7 @@ impl SelmaSanitizer {
39
37
  const SELMA_SANITIZER_REMOVE_CONTENTS: u8 = (1 << 2);
40
38
  const SELMA_SANITIZER_WRAP_WHITESPACE: u8 = (1 << 3);
41
39
 
42
- pub fn new(arguments: &[Value]) -> Result<Self, Error> {
40
+ pub fn new(arguments: &[Value]) -> Result<Self, magnus::Error> {
43
41
  let args = scan_args::scan_args::<(), (Option<RHash>,), (), (), (), ()>(arguments)?;
44
42
  let (opt_config,): (Option<RHash>,) = args.optional;
45
43
 
@@ -50,7 +48,7 @@ impl SelmaSanitizer {
50
48
  };
51
49
 
52
50
  let mut element_sanitizers = HashMap::new();
53
- Tag::html_tags().iter().for_each(|html_tag| {
51
+ crate::tags::Tag::html_tags().iter().for_each(|html_tag| {
54
52
  let es = ElementSanitizer {
55
53
  allowed_attrs: vec![],
56
54
  allowed_classes: vec![],
@@ -58,11 +56,14 @@ impl SelmaSanitizer {
58
56
 
59
57
  protocol_sanitizers: HashMap::new(),
60
58
  };
61
- element_sanitizers.insert(Tag::element_name_from_enum(html_tag).to_string(), es);
59
+ element_sanitizers.insert(
60
+ crate::tags::Tag::element_name_from_enum(html_tag).to_string(),
61
+ es,
62
+ );
62
63
  });
63
64
 
64
65
  Ok(Self(std::cell::RefCell::new(Sanitizer {
65
- flags: [0; Tag::TAG_COUNT],
66
+ flags: [0; crate::tags::Tag::TAG_COUNT],
66
67
  allowed_attrs: vec![],
67
68
  allowed_classes: vec![],
68
69
  element_sanitizers,
@@ -74,7 +75,7 @@ impl SelmaSanitizer {
74
75
  })))
75
76
  }
76
77
 
77
- fn get_config(&self) -> Result<RHash, Error> {
78
+ fn get_config(&self) -> Result<RHash, magnus::Error> {
78
79
  let binding = self.0.borrow();
79
80
 
80
81
  Ok(binding.config)
@@ -82,7 +83,7 @@ impl SelmaSanitizer {
82
83
 
83
84
  /// Toggle a sanitizer option on or off.
84
85
  fn set_flag(&self, tag_name: String, flag: u8, set: bool) {
85
- let tag = Tag::tag_from_tag_name(tag_name.as_str());
86
+ let tag = crate::tags::Tag::tag_from_tag_name(tag_name.as_str());
86
87
  if set {
87
88
  self.0.borrow_mut().flags[tag.index] |= flag;
88
89
  } else {
@@ -93,13 +94,19 @@ impl SelmaSanitizer {
93
94
  /// Toggles all sanitization options on or off.
94
95
  fn set_all_flags(&self, flag: u8, set: bool) {
95
96
  if set {
96
- Tag::html_tags().iter().enumerate().for_each(|(iter, _)| {
97
- self.0.borrow_mut().flags[iter] |= flag;
98
- });
97
+ crate::tags::Tag::html_tags()
98
+ .iter()
99
+ .enumerate()
100
+ .for_each(|(iter, _)| {
101
+ self.0.borrow_mut().flags[iter] |= flag;
102
+ });
99
103
  } else {
100
- Tag::html_tags().iter().enumerate().for_each(|(iter, _)| {
101
- self.0.borrow_mut().flags[iter] &= flag;
102
- });
104
+ crate::tags::Tag::html_tags()
105
+ .iter()
106
+ .enumerate()
107
+ .for_each(|(iter, _)| {
108
+ self.0.borrow_mut().flags[iter] &= flag;
109
+ });
103
110
  }
104
111
  }
105
112
 
@@ -111,8 +118,8 @@ impl SelmaSanitizer {
111
118
 
112
119
  pub fn escape_tagfilter(&self, e: &mut Element) -> bool {
113
120
  if self.0.borrow().escape_tagfilter {
114
- let tag = Tag::tag_from_element(e);
115
- if Tag::is_tag_escapeworthy(tag) {
121
+ let tag = crate::tags::Tag::tag_from_element(e);
122
+ if crate::tags::Tag::is_tag_escapeworthy(tag) {
116
123
  e.remove();
117
124
  return true;
118
125
  }
@@ -229,9 +236,9 @@ impl SelmaSanitizer {
229
236
  }
230
237
  }
231
238
 
232
- pub fn sanitize_attributes(&self, element: &mut Element) -> Result<(), magnus::Error> {
239
+ pub fn sanitize_attributes(&self, element: &mut Element) -> Result<(), AttributeNameError> {
233
240
  let binding = self.0.borrow_mut();
234
- let tag = Tag::tag_from_element(element);
241
+ let tag = crate::tags::Tag::tag_from_element(element);
235
242
  let element_sanitizer = Self::get_element_sanitizer(&binding, &element.tag_name());
236
243
 
237
244
  // FIXME: This is a hack to get around the fact that we can't borrow
@@ -255,26 +262,30 @@ impl SelmaSanitizer {
255
262
  let x = escapist::unescape_html(trimmed.as_bytes());
256
263
  let unescaped_attr_val = String::from_utf8_lossy(&x).to_string();
257
264
 
258
- if !Self::should_keep_attribute(
265
+ let should_keep_attrubute = match Self::should_keep_attribute(
259
266
  &binding,
260
267
  element,
261
268
  element_sanitizer,
262
269
  attr_name,
263
270
  &unescaped_attr_val,
264
271
  ) {
272
+ Ok(should_keep) => should_keep,
273
+ Err(e) => {
274
+ return Err(e);
275
+ }
276
+ };
277
+
278
+ if !should_keep_attrubute {
265
279
  element.remove_attribute(attr_name);
266
280
  } else {
267
281
  // Prevent the use of `<meta>` elements that set a charset other than UTF-8,
268
282
  // since output is always UTF-8.
269
- if Tag::is_meta(tag) {
283
+ if crate::tags::Tag::is_meta(tag) {
270
284
  if attr_name == "charset" && unescaped_attr_val != "utf-8" {
271
285
  match element.set_attribute(attr_name, "utf-8") {
272
286
  Ok(_) => {}
273
- Err(_) => {
274
- return Err(magnus::Error::new(
275
- exception::runtime_error(),
276
- format!("Unable to change {attr_name:?}"),
277
- ));
287
+ Err(err) => {
288
+ return Err(err);
278
289
  }
279
290
  }
280
291
  }
@@ -282,13 +293,17 @@ impl SelmaSanitizer {
282
293
  let mut buf = String::new();
283
294
  // ...then, escape any special characters, for security
284
295
  if attr_name == "href" {
285
- // FIXME: gross--------------vvvv
286
- escapist::escape_href(&mut buf, unescaped_attr_val.to_string().as_str());
296
+ escapist::escape_href(&mut buf, unescaped_attr_val.as_str());
287
297
  } else {
288
- escapist::escape_html(&mut buf, unescaped_attr_val.to_string().as_str());
298
+ escapist::escape_html(&mut buf, unescaped_attr_val.as_str());
289
299
  };
290
300
 
291
- element.set_attribute(attr_name, &buf);
301
+ match element.set_attribute(attr_name, &buf) {
302
+ Ok(_) => {}
303
+ Err(err) => {
304
+ return Err(err);
305
+ }
306
+ }
292
307
  }
293
308
  }
294
309
  }
@@ -313,7 +328,7 @@ impl SelmaSanitizer {
313
328
  element_sanitizer: &ElementSanitizer,
314
329
  attr_name: &String,
315
330
  attr_val: &String,
316
- ) -> bool {
331
+ ) -> Result<bool, AttributeNameError> {
317
332
  let mut allowed: bool = false;
318
333
  let element_allowed_attrs = element_sanitizer.allowed_attrs.contains(attr_name);
319
334
  let sanitizer_allowed_attrs = binding.allowed_attrs.contains(attr_name);
@@ -327,7 +342,7 @@ impl SelmaSanitizer {
327
342
  }
328
343
 
329
344
  if !allowed {
330
- return false;
345
+ return Ok(false);
331
346
  }
332
347
 
333
348
  let protocol_sanitizer_values = element_sanitizer.protocol_sanitizers.get(attr_name);
@@ -335,32 +350,29 @@ impl SelmaSanitizer {
335
350
  None => {
336
351
  // has a protocol, but no sanitization list
337
352
  if !attr_val.is_empty() && Self::has_protocol(attr_val) {
338
- return false;
353
+ return Ok(false);
339
354
  }
340
355
  }
341
356
  Some(protocol_sanitizer_values) => {
342
357
  if !attr_val.is_empty()
343
358
  && !Self::has_allowed_protocol(protocol_sanitizer_values, attr_val)
344
359
  {
345
- return false;
360
+ return Ok(false);
346
361
  }
347
362
  }
348
363
  }
349
364
 
350
- if attr_name == "class"
351
- && !Self::sanitize_class_attribute(
365
+ if attr_name == "class" {
366
+ return Self::sanitize_class_attribute(
352
367
  binding,
353
368
  element,
354
369
  element_sanitizer,
355
370
  attr_name,
356
371
  attr_val,
357
- )
358
- .unwrap()
359
- {
360
- return false;
372
+ );
361
373
  }
362
374
 
363
- true
375
+ Ok(true)
364
376
  }
365
377
 
366
378
  fn has_protocol(attr_val: &str) -> bool {
@@ -403,7 +415,7 @@ impl SelmaSanitizer {
403
415
  element_sanitizer: &ElementSanitizer,
404
416
  attr_name: &str,
405
417
  attr_val: &str,
406
- ) -> Result<bool, Error> {
418
+ ) -> Result<bool, lol_html::errors::AttributeNameError> {
407
419
  let allowed_global = &binding.allowed_classes;
408
420
 
409
421
  let mut valid_classes: Vec<String> = vec![];
@@ -431,28 +443,25 @@ impl SelmaSanitizer {
431
443
 
432
444
  match element.set_attribute(attr_name, valid_classes.join(" ").as_str()) {
433
445
  Ok(_) => Ok(true),
434
- Err(err) => Err(Error::new(
435
- exception::runtime_error(),
436
- format!("AttributeNameError: {err:?}"),
437
- )),
446
+ Err(err) => Err(err),
438
447
  }
439
448
  }
440
449
 
441
450
  pub fn allow_element(&self, element: &mut Element) -> bool {
442
- let tag = Tag::tag_from_element(element);
451
+ let tag = crate::tags::Tag::tag_from_element(element);
443
452
  let flags: u8 = self.0.borrow().flags[tag.index];
444
453
 
445
454
  (flags & Self::SELMA_SANITIZER_ALLOW) == 0
446
455
  }
447
456
 
448
457
  pub fn try_remove_element(&self, element: &mut Element) -> bool {
449
- let tag = Tag::tag_from_element(element);
458
+ let tag = crate::tags::Tag::tag_from_element(element);
450
459
  let flags: u8 = self.0.borrow().flags[tag.index];
451
460
 
452
461
  let should_remove = !element.removed() && self.allow_element(element);
453
462
 
454
463
  if should_remove {
455
- if Tag::has_text_content(tag) {
464
+ if crate::tags::Tag::has_text_content(tag) {
456
465
  Self::remove_element(
457
466
  element,
458
467
  tag.self_closing,
@@ -465,7 +474,7 @@ impl SelmaSanitizer {
465
474
  Self::check_if_end_tag_needs_removal(element);
466
475
  } else {
467
476
  // anything in <iframe> must be removed, if it's kept
468
- if Tag::is_iframe(tag) {
477
+ if crate::tags::Tag::is_iframe(tag) {
469
478
  if self.0.borrow().flags[tag.index] != 0 {
470
479
  element.set_inner_content(" ", ContentType::Text);
471
480
  } else {
@@ -497,14 +506,14 @@ impl SelmaSanitizer {
497
506
  }
498
507
 
499
508
  pub fn force_remove_element(&self, element: &mut Element) {
500
- let tag = Tag::tag_from_element(element);
509
+ let tag = crate::tags::Tag::tag_from_element(element);
501
510
  let self_closing = tag.self_closing;
502
511
  Self::remove_element(element, self_closing, Self::SELMA_SANITIZER_REMOVE_CONTENTS);
503
512
  Self::check_if_end_tag_needs_removal(element);
504
513
  }
505
514
 
506
515
  fn check_if_end_tag_needs_removal(element: &mut Element) {
507
- if element.removed() && !Tag::tag_from_element(element).self_closing {
516
+ if element.removed() && !crate::tags::Tag::tag_from_element(element).self_closing {
508
517
  element
509
518
  .on_end_tag(move |end| {
510
519
  Self::remove_end_tag(end);
@@ -533,7 +542,7 @@ impl SelmaSanitizer {
533
542
  }
534
543
  }
535
544
 
536
- pub fn init(m_selma: RModule) -> Result<(), Error> {
545
+ pub fn init(m_selma: RModule) -> Result<(), magnus::Error> {
537
546
  let c_sanitizer = m_selma.define_class("Sanitizer", Default::default())?;
538
547
 
539
548
  c_sanitizer.define_singleton_method("new", function!(SelmaSanitizer::new, -1))?;
@@ -192,14 +192,17 @@ impl Tag {
192
192
  /// Is this tag something which needs to be removed?
193
193
  pub fn is_tag_escapeworthy(tag: Tag) -> bool {
194
194
  tag.index == HTMLTag::TITLE as usize
195
- || tag.index == HTMLTag::TEXTAREA as usize
196
- || tag.index == HTMLTag::STYLE as usize
197
- || tag.index == HTMLTag::XMP as usize
198
195
  || tag.index == HTMLTag::IFRAME as usize
196
+ || tag.index == HTMLTag::MATH as usize
199
197
  || tag.index == HTMLTag::NOEMBED as usize
200
198
  || tag.index == HTMLTag::NOFRAMES as usize
201
- || tag.index == HTMLTag::SCRIPT as usize
199
+ || tag.index == HTMLTag::NOSCRIPT as usize
202
200
  || tag.index == HTMLTag::PLAINTEXT as usize
201
+ || tag.index == HTMLTag::SCRIPT as usize
202
+ || tag.index == HTMLTag::STYLE as usize
203
+ || tag.index == HTMLTag::SVG as usize
204
+ || tag.index == HTMLTag::TEXTAREA as usize
205
+ || tag.index == HTMLTag::XMP as usize
203
206
  }
204
207
 
205
208
  pub const ESCAPEWORTHY_TAGS_CSS: &str =
Binary file
@@ -3,6 +3,10 @@
3
3
  module Selma
4
4
  class Sanitizer
5
5
  module Config
6
+ # although there are many more protocol types, eg., ftp, xmpp, etc.,
7
+ # these are the only ones that are allowed by default
8
+ VALID_PROTOCOLS = ["http", "https", "mailto", :relative]
9
+
6
10
  DEFAULT = freeze_config(
7
11
  # Whether or not to allow HTML comments. Allowing comments is strongly
8
12
  # discouraged, since IE allows script execution within conditional
data/lib/selma/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Selma
4
- VERSION = "0.0.3"
4
+ VERSION = "0.0.4"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: selma
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: arm64-darwin
6
6
  authors:
7
7
  - Garen J. Torikian
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-12-24 00:00:00.000000000 Z
11
+ date: 2022-12-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -81,6 +81,7 @@ files:
81
81
  - ext/selma/src/html.rs
82
82
  - ext/selma/src/html/element.rs
83
83
  - ext/selma/src/html/end_tag.rs
84
+ - ext/selma/src/html/text_chunk.rs
84
85
  - ext/selma/src/lib.rs
85
86
  - ext/selma/src/native_ref_wrap.rs
86
87
  - ext/selma/src/rewriter.rs