selma 0.0.3-x86_64-darwin → 0.0.5-x86_64-darwin
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +11 -10
- data/ext/selma/Cargo.toml +1 -1
- data/ext/selma/src/html/element.rs +103 -44
- data/ext/selma/src/html/end_tag.rs +2 -2
- data/ext/selma/src/html/text_chunk.rs +113 -0
- data/ext/selma/src/html.rs +2 -0
- data/ext/selma/src/lib.rs +28 -1
- data/ext/selma/src/rewriter.rs +37 -49
- data/ext/selma/src/sanitizer.rs +102 -83
- data/ext/selma/src/tags.rs +7 -4
- data/lib/selma/3.1/selma.bundle +0 -0
- data/lib/selma/sanitizer/config/default.rb +4 -0
- data/lib/selma/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2505abb2e18c7f001866cfa03b52a27854b264b173f763e068f62d0e359bdd0f
|
4
|
+
data.tar.gz: 53a68d13f36649a832ae93bf77d9ababa3b44e46a9cd264cb065d2e60d1a12c6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dc42caa9767caa33f8754d38017f3997f5e820c2788dd8b2192d429a86ad0b41da067266fa064497d61583f8d7cc96b41f9da5869f8eaf102407a226680c025d
|
7
|
+
data.tar.gz: 6656140ed6d6c9d10f88727f247909e3db6fd1b2772c3eae471a2614ef210d7b2d09e4da46b8dc7db6adbd50b7e73a1040e2cebe618bf406e549aa804748b08b
|
data/README.md
CHANGED
@@ -56,6 +56,10 @@ allow_comments: false,
|
|
56
56
|
# "<!DOCTYPE html>" when sanitizing a document.
|
57
57
|
allow_doctype: false,
|
58
58
|
|
59
|
+
# HTML elements to allow. By default, no elements are allowed (which means
|
60
|
+
# that all HTML will be stripped).
|
61
|
+
elements: ["a", "b", "img", ],
|
62
|
+
|
59
63
|
# HTML attributes to allow in specific elements. The key is the name of the element,
|
60
64
|
# and the value is an array of allowed attributes. By default, no attributes
|
61
65
|
# are allowed.
|
@@ -64,14 +68,10 @@ attributes: {
|
|
64
68
|
"img" => ["src"],
|
65
69
|
},
|
66
70
|
|
67
|
-
# HTML elements to allow. By default, no elements are allowed (which means
|
68
|
-
# that all HTML will be stripped).
|
69
|
-
elements: ["a", "b", "img", ],
|
70
|
-
|
71
71
|
# URL handling protocols to allow in specific attributes. By default, no
|
72
72
|
# protocols are allowed. Use :relative in place of a protocol if you want
|
73
73
|
# to allow relative URLs sans protocol.
|
74
|
-
|
74
|
+
protocols: {
|
75
75
|
"a" => { "href" => ["http", "https", "mailto", :relative] },
|
76
76
|
"img" => { "href" => ["http", "https"] },
|
77
77
|
},
|
@@ -91,7 +91,7 @@ The real power in Selma comes in its use of handlers. A handler is simply an obj
|
|
91
91
|
|
92
92
|
- `selector`, a method which MUST return an instance of `Selma::Selector` which defines the CSS classes to match
|
93
93
|
- `handle_element`, a method that's called on each matched element
|
94
|
-
- `
|
94
|
+
- `handle_text_chunk`, a method that's called on each matched text node; this MUST return a string
|
95
95
|
|
96
96
|
Here's an example which rewrites the `href` attribute on `a` and the `src` attribute on `img` to be `https` rather than `http`.
|
97
97
|
|
@@ -118,7 +118,7 @@ rewriter = Selma::Rewriter.new(handlers: [MatchAttribute.new])
|
|
118
118
|
The `Selma::Selector` object has three possible kwargs:
|
119
119
|
|
120
120
|
- `match_element`: any element which matches this CSS rule will be passed on to `handle_element`
|
121
|
-
- `match_text_within`: any element which matches this CSS rule will be passed on to `
|
121
|
+
- `match_text_within`: any element which matches this CSS rule will be passed on to `handle_text_chunk`
|
122
122
|
- `ignore_text_within`: this is an array of element names whose text contents will be ignored
|
123
123
|
|
124
124
|
You've seen an example of `match_element`; here's one for `match_text_within` which changes strings in various elements which are _not_ `pre` or `code`:
|
@@ -132,7 +132,7 @@ class MatchText
|
|
132
132
|
SELECTOR
|
133
133
|
end
|
134
134
|
|
135
|
-
def
|
135
|
+
def handle_text_chunk(text)
|
136
136
|
string.sub(/@.+/, "<a href=\"www.yetto.app/#{Regexp.last_match}\">")
|
137
137
|
end
|
138
138
|
end
|
@@ -150,8 +150,9 @@ The `element` argument in `handle_element` has the following methods:
|
|
150
150
|
- `remove_attribute`: remove an attribute
|
151
151
|
- `attributes`: list all the attributes
|
152
152
|
- `ancestors`: list all the ancestors
|
153
|
-
- `append(content, content_type)`: appends `content` to the element's inner content, i.e. inserts content right before the element's end tag. `content_type` is either `:text` or `:html` and determines how the content will be applied.
|
154
|
-
- `
|
153
|
+
- `append(content, as: content_type)`: appends `content` to the element's inner content, i.e. inserts content right before the element's end tag. `content_type` is either `:text` or `:html` and determines how the content will be applied.
|
154
|
+
- `before(content, as: content_type)`: Inserts `content` before the element. `content_type` is either `:text` or `:html` and determines how the content will be applied.
|
155
|
+
- `after(content, as: content_type)`: Inserts `content` after the element. `content_type` is either `:text` or `:html` and determines how the content will be applied.
|
155
156
|
- `set_inner_content(content, as: content_type)`: replaces the inner content of the element with `content`. `content_type` is either `:text` or `:html` and determines how the content will be applied.
|
156
157
|
|
157
158
|
## Benchmarks
|
data/ext/selma/Cargo.toml
CHANGED
data/ext/selma/src/html/element.rs
CHANGED
@@ -1,8 +1,6 @@
|
|
1
|
-
use std::borrow::Cow;
|
2
|
-
|
3
1
|
use crate::native_ref_wrap::NativeRefWrap;
|
4
|
-
use lol_html::html_content::
|
5
|
-
use magnus::{exception, method, Error, Module, RArray, RClass, RHash, RString,
|
2
|
+
use lol_html::html_content::Element;
|
3
|
+
use magnus::{exception, method, Error, Module, RArray, RClass, RHash, RString, Value};
|
6
4
|
|
7
5
|
struct HTMLElement {
|
8
6
|
element: NativeRefWrap<Element<'static, 'static>>,
|
@@ -38,6 +36,48 @@ impl SelmaHTMLElement {
|
|
38
36
|
}
|
39
37
|
}
|
40
38
|
|
39
|
+
fn set_tag_name(&self, name: String) -> Result<(), Error> {
|
40
|
+
let mut binding = self.0.borrow_mut();
|
41
|
+
|
42
|
+
if let Ok(element) = binding.element.get_mut() {
|
43
|
+
match element.set_tag_name(&name) {
|
44
|
+
Ok(_) => Ok(()),
|
45
|
+
Err(err) => Err(Error::new(exception::runtime_error(), format!("{err:?}"))),
|
46
|
+
}
|
47
|
+
} else {
|
48
|
+
Err(Error::new(
|
49
|
+
exception::runtime_error(),
|
50
|
+
"`set_tag_name` is not available",
|
51
|
+
))
|
52
|
+
}
|
53
|
+
}
|
54
|
+
|
55
|
+
fn is_self_closing(&self) -> Result<bool, Error> {
|
56
|
+
let binding = self.0.borrow();
|
57
|
+
|
58
|
+
if let Ok(e) = binding.element.get() {
|
59
|
+
Ok(e.is_self_closing())
|
60
|
+
} else {
|
61
|
+
Err(Error::new(
|
62
|
+
exception::runtime_error(),
|
63
|
+
"`is_self_closing` is not available",
|
64
|
+
))
|
65
|
+
}
|
66
|
+
}
|
67
|
+
|
68
|
+
fn has_attribute(&self, attr: String) -> Result<bool, Error> {
|
69
|
+
let binding = self.0.borrow();
|
70
|
+
|
71
|
+
if let Ok(e) = binding.element.get() {
|
72
|
+
Ok(e.has_attribute(&attr))
|
73
|
+
} else {
|
74
|
+
Err(Error::new(
|
75
|
+
exception::runtime_error(),
|
76
|
+
"`has_attribute` is not available",
|
77
|
+
))
|
78
|
+
}
|
79
|
+
}
|
80
|
+
|
41
81
|
fn get_attribute(&self, attr: String) -> Option<String> {
|
42
82
|
let binding = self.0.borrow();
|
43
83
|
let element = binding.element.get();
|
@@ -106,89 +146,108 @@ impl SelmaHTMLElement {
|
|
106
146
|
Ok(array)
|
107
147
|
}
|
108
148
|
|
109
|
-
fn
|
149
|
+
fn before(&self, args: &[Value]) -> Result<(), Error> {
|
110
150
|
let mut binding = self.0.borrow_mut();
|
111
151
|
let element = binding.element.get_mut().unwrap();
|
112
152
|
|
113
|
-
let text_str =
|
153
|
+
let (text_str, content_type) = match crate::scan_text_args(args) {
|
154
|
+
Ok((text_str, content_type)) => (text_str, content_type),
|
155
|
+
Err(err) => return Err(err),
|
156
|
+
};
|
114
157
|
|
115
|
-
|
116
|
-
|
117
|
-
element.append(text_str, content_type);
|
158
|
+
element.before(&text_str, content_type);
|
118
159
|
|
119
160
|
Ok(())
|
120
161
|
}
|
121
162
|
|
122
|
-
fn
|
123
|
-
&self,
|
124
|
-
start_text: String,
|
125
|
-
end_text: String,
|
126
|
-
content_type: Symbol,
|
127
|
-
) -> Result<(), Error> {
|
163
|
+
fn after(&self, args: &[Value]) -> Result<(), Error> {
|
128
164
|
let mut binding = self.0.borrow_mut();
|
129
165
|
let element = binding.element.get_mut().unwrap();
|
130
166
|
|
131
|
-
let
|
132
|
-
|
133
|
-
|
134
|
-
|
167
|
+
let (text_str, content_type) = match crate::scan_text_args(args) {
|
168
|
+
Ok((text_str, content_type)) => (text_str, content_type),
|
169
|
+
Err(err) => return Err(err),
|
170
|
+
};
|
171
|
+
|
172
|
+
element.after(&text_str, content_type);
|
135
173
|
|
136
174
|
Ok(())
|
137
175
|
}
|
138
176
|
|
139
|
-
fn
|
177
|
+
fn prepend(&self, args: &[Value]) -> Result<(), Error> {
|
140
178
|
let mut binding = self.0.borrow_mut();
|
141
179
|
let element = binding.element.get_mut().unwrap();
|
142
180
|
|
143
|
-
let text_str =
|
181
|
+
let (text_str, content_type) = match crate::scan_text_args(args) {
|
182
|
+
Ok((text_str, content_type)) => (text_str, content_type),
|
183
|
+
Err(err) => return Err(err),
|
184
|
+
};
|
185
|
+
|
186
|
+
element.prepend(&text_str, content_type);
|
144
187
|
|
145
|
-
|
188
|
+
Ok(())
|
189
|
+
}
|
146
190
|
|
147
|
-
|
191
|
+
fn append(&self, args: &[Value]) -> Result<(), Error> {
|
192
|
+
let mut binding = self.0.borrow_mut();
|
193
|
+
let element = binding.element.get_mut().unwrap();
|
194
|
+
|
195
|
+
let (text_str, content_type) = match crate::scan_text_args(args) {
|
196
|
+
Ok((text_str, content_type)) => (text_str, content_type),
|
197
|
+
Err(err) => return Err(err),
|
198
|
+
};
|
199
|
+
|
200
|
+
element.append(&text_str, content_type);
|
148
201
|
|
149
202
|
Ok(())
|
150
203
|
}
|
151
204
|
|
152
|
-
fn
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
exception::runtime_error(),
|
165
|
-
format!("Could not unwrap symbol: {err:?}"),
|
166
|
-
))
|
167
|
-
.unwrap(),
|
168
|
-
}
|
205
|
+
fn set_inner_content(&self, args: &[Value]) -> Result<(), Error> {
|
206
|
+
let mut binding = self.0.borrow_mut();
|
207
|
+
let element = binding.element.get_mut().unwrap();
|
208
|
+
|
209
|
+
let (inner_content, content_type) = match crate::scan_text_args(args) {
|
210
|
+
Ok((inner_content, content_type)) => (inner_content, content_type),
|
211
|
+
Err(err) => return Err(err),
|
212
|
+
};
|
213
|
+
|
214
|
+
element.set_inner_content(&inner_content, content_type);
|
215
|
+
|
216
|
+
Ok(())
|
169
217
|
}
|
170
218
|
}
|
171
219
|
|
172
220
|
pub fn init(c_html: RClass) -> Result<(), Error> {
|
173
221
|
let c_element = c_html
|
174
222
|
.define_class("Element", Default::default())
|
175
|
-
.expect("cannot find class Selma::Element");
|
223
|
+
.expect("cannot find class Selma::HTML::Element");
|
176
224
|
|
177
225
|
c_element.define_method("tag_name", method!(SelmaHTMLElement::tag_name, 0))?;
|
226
|
+
c_element.define_method("tag_name=", method!(SelmaHTMLElement::set_tag_name, 1))?;
|
227
|
+
c_element.define_method(
|
228
|
+
"self_closing?",
|
229
|
+
method!(SelmaHTMLElement::is_self_closing, 0),
|
230
|
+
)?;
|
178
231
|
c_element.define_method("[]", method!(SelmaHTMLElement::get_attribute, 1))?;
|
179
232
|
c_element.define_method("[]=", method!(SelmaHTMLElement::set_attribute, 2))?;
|
180
233
|
c_element.define_method(
|
181
234
|
"remove_attribute",
|
182
235
|
method!(SelmaHTMLElement::remove_attribute, 1),
|
183
236
|
)?;
|
237
|
+
c_element.define_method(
|
238
|
+
"has_attribute?",
|
239
|
+
method!(SelmaHTMLElement::has_attribute, 1),
|
240
|
+
)?;
|
184
241
|
c_element.define_method("attributes", method!(SelmaHTMLElement::get_attributes, 0))?;
|
185
242
|
c_element.define_method("ancestors", method!(SelmaHTMLElement::get_ancestors, 0))?;
|
186
243
|
|
187
|
-
c_element.define_method("
|
188
|
-
c_element.define_method("
|
244
|
+
c_element.define_method("before", method!(SelmaHTMLElement::before, -1))?;
|
245
|
+
c_element.define_method("after", method!(SelmaHTMLElement::after, -1))?;
|
246
|
+
c_element.define_method("prepend", method!(SelmaHTMLElement::prepend, -1))?;
|
247
|
+
c_element.define_method("append", method!(SelmaHTMLElement::append, -1))?;
|
189
248
|
c_element.define_method(
|
190
249
|
"set_inner_content",
|
191
|
-
method!(SelmaHTMLElement::set_inner_content,
|
250
|
+
method!(SelmaHTMLElement::set_inner_content, -1),
|
192
251
|
)?;
|
193
252
|
|
194
253
|
Ok(())
|
data/ext/selma/src/html/end_tag.rs
CHANGED
@@ -6,7 +6,7 @@ struct HTMLEndTag {
|
|
6
6
|
end_tag: NativeRefWrap<EndTag<'static>>,
|
7
7
|
}
|
8
8
|
|
9
|
-
#[magnus::wrap(class = "Selma::HTML::
|
9
|
+
#[magnus::wrap(class = "Selma::HTML::EndTag")]
|
10
10
|
pub struct SelmaHTMLEndTag(std::cell::RefCell<HTMLEndTag>);
|
11
11
|
|
12
12
|
/// SAFETY: This is safe because we only access this data when the GVL is held.
|
@@ -27,7 +27,7 @@ impl SelmaHTMLEndTag {
|
|
27
27
|
pub fn init(c_html: RClass) -> Result<(), Error> {
|
28
28
|
let c_end_tag = c_html
|
29
29
|
.define_class("EndTag", Default::default())
|
30
|
-
.expect("cannot find class Selma::EndTag");
|
30
|
+
.expect("cannot find class Selma::HTML::EndTag");
|
31
31
|
|
32
32
|
c_end_tag.define_method("tag_name", method!(SelmaHTMLEndTag::tag_name, 0))?;
|
33
33
|
|
data/ext/selma/src/html/text_chunk.rs
ADDED
@@ -0,0 +1,113 @@
|
|
1
|
+
use crate::native_ref_wrap::NativeRefWrap;
|
2
|
+
use lol_html::html_content::{TextChunk, TextType};
|
3
|
+
use magnus::{exception, method, Error, Module, RClass, Symbol, Value};
|
4
|
+
|
5
|
+
struct HTMLTextChunk {
|
6
|
+
text_chunk: NativeRefWrap<TextChunk<'static>>,
|
7
|
+
}
|
8
|
+
|
9
|
+
#[magnus::wrap(class = "Selma::HTML::TextChunk")]
|
10
|
+
pub struct SelmaHTMLTextChunk(std::cell::RefCell<HTMLTextChunk>);
|
11
|
+
|
12
|
+
/// SAFETY: This is safe because we only access this data when the GVL is held.
|
13
|
+
unsafe impl Send for SelmaHTMLTextChunk {}
|
14
|
+
|
15
|
+
impl SelmaHTMLTextChunk {
|
16
|
+
pub fn new(text_chunk: &mut TextChunk) -> Self {
|
17
|
+
let (ref_wrap, _anchor) = NativeRefWrap::wrap_mut(text_chunk);
|
18
|
+
|
19
|
+
Self(std::cell::RefCell::new(HTMLTextChunk {
|
20
|
+
text_chunk: ref_wrap,
|
21
|
+
}))
|
22
|
+
}
|
23
|
+
|
24
|
+
fn to_s(&self) -> Result<String, Error> {
|
25
|
+
let binding = self.0.borrow();
|
26
|
+
|
27
|
+
if let Ok(tc) = binding.text_chunk.get() {
|
28
|
+
Ok(tc.as_str().to_string())
|
29
|
+
} else {
|
30
|
+
Err(Error::new(
|
31
|
+
exception::runtime_error(),
|
32
|
+
"`to_s` is not available",
|
33
|
+
))
|
34
|
+
}
|
35
|
+
}
|
36
|
+
|
37
|
+
fn text_type(&self) -> Result<Symbol, Error> {
|
38
|
+
let binding = self.0.borrow();
|
39
|
+
|
40
|
+
if let Ok(tc) = binding.text_chunk.get() {
|
41
|
+
match tc.text_type() {
|
42
|
+
TextType::Data => Ok(Symbol::from("data")),
|
43
|
+
TextType::PlainText => Ok(Symbol::from("plain_text")),
|
44
|
+
TextType::RawText => Ok(Symbol::from("raw_text")),
|
45
|
+
TextType::ScriptData => Ok(Symbol::from("script")),
|
46
|
+
TextType::RCData => Ok(Symbol::from("rc_data")),
|
47
|
+
TextType::CDataSection => Ok(Symbol::from("cdata_section")),
|
48
|
+
}
|
49
|
+
} else {
|
50
|
+
Err(Error::new(
|
51
|
+
exception::runtime_error(),
|
52
|
+
"`text_type` is not available",
|
53
|
+
))
|
54
|
+
}
|
55
|
+
}
|
56
|
+
|
57
|
+
fn before(&self, args: &[Value]) -> Result<(), Error> {
|
58
|
+
let mut binding = self.0.borrow_mut();
|
59
|
+
let text_chunk = binding.text_chunk.get_mut().unwrap();
|
60
|
+
|
61
|
+
let (text_str, content_type) = match crate::scan_text_args(args) {
|
62
|
+
Ok((text_str, content_type)) => (text_str, content_type),
|
63
|
+
Err(err) => return Err(err),
|
64
|
+
};
|
65
|
+
|
66
|
+
text_chunk.before(&text_str, content_type);
|
67
|
+
|
68
|
+
Ok(())
|
69
|
+
}
|
70
|
+
|
71
|
+
fn after(&self, args: &[Value]) -> Result<(), Error> {
|
72
|
+
let mut binding = self.0.borrow_mut();
|
73
|
+
let text_chunk = binding.text_chunk.get_mut().unwrap();
|
74
|
+
|
75
|
+
let (text_str, content_type) = match crate::scan_text_args(args) {
|
76
|
+
Ok((text_str, content_type)) => (text_str, content_type),
|
77
|
+
Err(err) => return Err(err),
|
78
|
+
};
|
79
|
+
|
80
|
+
text_chunk.after(&text_str, content_type);
|
81
|
+
|
82
|
+
Ok(())
|
83
|
+
}
|
84
|
+
|
85
|
+
fn replace(&self, args: &[Value]) -> Result<(), Error> {
|
86
|
+
let mut binding = self.0.borrow_mut();
|
87
|
+
let text_chunk = binding.text_chunk.get_mut().unwrap();
|
88
|
+
|
89
|
+
let (text_str, content_type) = match crate::scan_text_args(args) {
|
90
|
+
Ok((text_str, content_type)) => (text_str, content_type),
|
91
|
+
Err(err) => return Err(err),
|
92
|
+
};
|
93
|
+
|
94
|
+
text_chunk.replace(&text_str, content_type);
|
95
|
+
|
96
|
+
Ok(())
|
97
|
+
}
|
98
|
+
}
|
99
|
+
|
100
|
+
pub fn init(c_html: RClass) -> Result<(), Error> {
|
101
|
+
let c_text_chunk = c_html
|
102
|
+
.define_class("TextChunk", Default::default())
|
103
|
+
.expect("cannot find class Selma::HTML::TextChunk");
|
104
|
+
|
105
|
+
c_text_chunk.define_method("to_s", method!(SelmaHTMLTextChunk::to_s, 0))?;
|
106
|
+
c_text_chunk.define_method("content", method!(SelmaHTMLTextChunk::to_s, 0))?;
|
107
|
+
c_text_chunk.define_method("text_type", method!(SelmaHTMLTextChunk::text_type, 0))?;
|
108
|
+
c_text_chunk.define_method("before", method!(SelmaHTMLTextChunk::before, -1))?;
|
109
|
+
c_text_chunk.define_method("after", method!(SelmaHTMLTextChunk::after, -1))?;
|
110
|
+
c_text_chunk.define_method("replace", method!(SelmaHTMLTextChunk::replace, -1))?;
|
111
|
+
|
112
|
+
Ok(())
|
113
|
+
}
|
data/ext/selma/src/html.rs
CHANGED
@@ -9,9 +9,11 @@ pub fn init(m_selma: RModule) -> Result<(), Error> {
|
|
9
9
|
|
10
10
|
element::init(c_html).expect("cannot define Selma::HTML::Element class");
|
11
11
|
end_tag::init(c_html).expect("cannot define Selma::HTML::EndTag class");
|
12
|
+
text_chunk::init(c_html).expect("cannot define Selma::HTML::TextChunk class");
|
12
13
|
|
13
14
|
Ok(())
|
14
15
|
}
|
15
16
|
|
16
17
|
pub mod element;
|
17
18
|
pub mod end_tag;
|
19
|
+
pub mod text_chunk;
|
data/ext/selma/src/lib.rs
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
extern crate core;
|
2
2
|
|
3
|
-
use
|
3
|
+
use lol_html::html_content::ContentType;
|
4
|
+
use magnus::{define_module, exception, scan_args, Error, Symbol, Value};
|
4
5
|
|
5
6
|
pub mod html;
|
6
7
|
pub mod native_ref_wrap;
|
@@ -10,6 +11,32 @@ pub mod selector;
|
|
10
11
|
pub mod tags;
|
11
12
|
pub mod wrapped_struct;
|
12
13
|
|
14
|
+
#[allow(clippy::let_unit_value)]
|
15
|
+
fn scan_text_args(args: &[Value]) -> Result<(String, ContentType), magnus::Error> {
|
16
|
+
let args = scan_args::scan_args(args)?;
|
17
|
+
let (text,): (String,) = args.required;
|
18
|
+
let _: () = args.optional;
|
19
|
+
let _: () = args.splat;
|
20
|
+
let _: () = args.trailing;
|
21
|
+
let _: () = args.block;
|
22
|
+
|
23
|
+
let kwargs = scan_args::get_kwargs::<_, (Symbol,), (), ()>(args.keywords, &["as"], &[])?;
|
24
|
+
let as_sym = kwargs.required.0;
|
25
|
+
let as_sym_str = as_sym.name().unwrap();
|
26
|
+
let content_type = if as_sym_str == "text" {
|
27
|
+
ContentType::Text
|
28
|
+
} else if as_sym_str == "html" {
|
29
|
+
ContentType::Html
|
30
|
+
} else {
|
31
|
+
return Err(Error::new(
|
32
|
+
exception::runtime_error(),
|
33
|
+
format!("unknown symbol `{as_sym_str:?}`"),
|
34
|
+
));
|
35
|
+
};
|
36
|
+
|
37
|
+
Ok((text, content_type))
|
38
|
+
}
|
39
|
+
|
13
40
|
#[magnus::init]
|
14
41
|
fn init() -> Result<(), Error> {
|
15
42
|
let m_selma = define_module("Selma").expect("cannot define ::Selma module");
|
data/ext/selma/src/rewriter.rs
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
use lol_html::{
|
2
2
|
doc_comments, doctype, element,
|
3
|
-
html_content::{
|
3
|
+
html_content::{Element, EndTag, TextChunk},
|
4
4
|
text, DocumentContentHandlers, ElementContentHandlers, HtmlRewriter, Selector, Settings,
|
5
5
|
};
|
6
6
|
use magnus::{exception, function, method, scan_args, Module, Object, RArray, RModule, Value};
|
@@ -8,7 +8,7 @@ use magnus::{exception, function, method, scan_args, Module, Object, RArray, RMo
|
|
8
8
|
use std::{borrow::Cow, cell::RefCell, primitive::str, rc::Rc};
|
9
9
|
|
10
10
|
use crate::{
|
11
|
-
html::{element::SelmaHTMLElement, end_tag::SelmaHTMLEndTag},
|
11
|
+
html::{element::SelmaHTMLElement, end_tag::SelmaHTMLEndTag, text_chunk::SelmaHTMLTextChunk},
|
12
12
|
sanitizer::SelmaSanitizer,
|
13
13
|
selector::SelmaSelector,
|
14
14
|
tags::Tag,
|
@@ -43,7 +43,7 @@ unsafe impl Send for SelmaRewriter {}
|
|
43
43
|
impl SelmaRewriter {
|
44
44
|
const SELMA_ON_END_TAG: &str = "on_end_tag";
|
45
45
|
const SELMA_HANDLE_ELEMENT: &str = "handle_element";
|
46
|
-
const
|
46
|
+
const SELMA_HANDLE_TEXT_CHUNK: &str = "handle_text_chunk";
|
47
47
|
|
48
48
|
/// @yard
|
49
49
|
/// @def new(sanitizer: Selma::Sanitizer.new(Selma::Sanitizer::Config::DEFAULT), handlers: [])
|
@@ -145,7 +145,7 @@ impl SelmaRewriter {
|
|
145
145
|
let _: () = args.trailing;
|
146
146
|
let _: () = args.block;
|
147
147
|
|
148
|
-
let
|
148
|
+
let kwargs = scan_args::get_kwargs::<
|
149
149
|
_,
|
150
150
|
(),
|
151
151
|
(
|
@@ -154,7 +154,7 @@ impl SelmaRewriter {
|
|
154
154
|
),
|
155
155
|
(),
|
156
156
|
>(args.keywords, &[], &["sanitizer", "handlers"])?;
|
157
|
-
let (rb_sanitizer, rb_handlers) =
|
157
|
+
let (rb_sanitizer, rb_handlers) = kwargs.optional;
|
158
158
|
|
159
159
|
Ok((rb_sanitizer, rb_handlers))
|
160
160
|
}
|
@@ -162,26 +162,22 @@ impl SelmaRewriter {
|
|
162
162
|
/// Perform HTML rewrite sequence.
|
163
163
|
fn rewrite(&self, html: String) -> Result<String, magnus::Error> {
|
164
164
|
let sanitized_html = match &self.0.borrow().sanitizer {
|
165
|
-
None => html,
|
165
|
+
None => Ok(html),
|
166
166
|
Some(sanitizer) => {
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
let sanitized_html = Self::perform_sanitization(sanitizer, &html).unwrap();
|
167
|
+
let sanitized_html = match Self::perform_sanitization(sanitizer, &html) {
|
168
|
+
Ok(sanitized_html) => sanitized_html,
|
169
|
+
Err(err) => return Err(err),
|
170
|
+
};
|
172
171
|
|
173
|
-
String::from_utf8(sanitized_html)
|
172
|
+
String::from_utf8(sanitized_html)
|
174
173
|
}
|
175
174
|
};
|
176
175
|
let binding = self.0.borrow_mut();
|
177
176
|
let handlers = &binding.handlers;
|
178
177
|
|
179
|
-
match Self::perform_handler_rewrite(self, handlers, sanitized_html) {
|
178
|
+
match Self::perform_handler_rewrite(self, handlers, sanitized_html.unwrap()) {
|
180
179
|
Ok(rewritten_html) => Ok(String::from_utf8(rewritten_html).unwrap()),
|
181
|
-
Err(err) => Err(
|
182
|
-
exception::runtime_error(),
|
183
|
-
format!("{err:?}"),
|
184
|
-
)),
|
180
|
+
Err(err) => Err(err),
|
185
181
|
}
|
186
182
|
}
|
187
183
|
|
@@ -212,9 +208,10 @@ impl SelmaRewriter {
|
|
212
208
|
if el.removed() {
|
213
209
|
return Ok(());
|
214
210
|
}
|
215
|
-
sanitizer.sanitize_attributes(el)
|
216
|
-
|
217
|
-
|
211
|
+
match sanitizer.sanitize_attributes(el) {
|
212
|
+
Ok(_) => Ok(()),
|
213
|
+
Err(err) => Err(err.to_string().into()),
|
214
|
+
}
|
218
215
|
})],
|
219
216
|
// TODO: allow for MemorySettings to be defined
|
220
217
|
..Settings::default()
|
@@ -341,7 +338,7 @@ impl SelmaRewriter {
|
|
341
338
|
let mut stack = closure_element_stack.as_ref().borrow_mut();
|
342
339
|
stack.pop();
|
343
340
|
Ok(())
|
344
|
-
})
|
341
|
+
})?;
|
345
342
|
Ok(())
|
346
343
|
}));
|
347
344
|
});
|
@@ -375,13 +372,14 @@ impl SelmaRewriter {
|
|
375
372
|
) -> Result<(), magnus::Error> {
|
376
373
|
// if `on_end_tag` function is defined, call it
|
377
374
|
if rb_handler.respond_to(Self::SELMA_ON_END_TAG, true).unwrap() {
|
375
|
+
// TODO: error here is an "EndTagError"
|
378
376
|
element.on_end_tag(move |end_tag| {
|
379
377
|
let rb_end_tag = SelmaHTMLEndTag::new(end_tag);
|
380
378
|
|
381
|
-
rb_handler
|
382
|
-
|
383
|
-
.
|
384
|
-
|
379
|
+
match rb_handler.funcall::<_, _, Value>(Self::SELMA_ON_END_TAG, (rb_end_tag,)) {
|
380
|
+
Ok(_) => Ok(()),
|
381
|
+
Err(err) => Err(err.to_string().into()),
|
382
|
+
}
|
385
383
|
});
|
386
384
|
}
|
387
385
|
|
@@ -390,40 +388,30 @@ impl SelmaRewriter {
|
|
390
388
|
rb_handler.funcall::<_, _, Value>(Self::SELMA_HANDLE_ELEMENT, (rb_element,));
|
391
389
|
match rb_result {
|
392
390
|
Ok(_) => Ok(()),
|
393
|
-
Err(err) => Err(
|
394
|
-
exception::runtime_error(),
|
395
|
-
format!("{err:?}"),
|
396
|
-
)),
|
391
|
+
Err(err) => Err(err),
|
397
392
|
}
|
398
393
|
}
|
399
394
|
|
400
|
-
fn process_text_handlers(
|
401
|
-
|
402
|
-
|
395
|
+
fn process_text_handlers(
|
396
|
+
rb_handler: Value,
|
397
|
+
text_chunk: &mut TextChunk,
|
398
|
+
) -> Result<(), magnus::Error> {
|
399
|
+
// prevents missing `handle_text_chunk` function
|
400
|
+
let content = text_chunk.as_str();
|
403
401
|
|
404
402
|
// seems that sometimes lol-html returns blank text / EOLs?
|
405
403
|
if content.is_empty() {
|
406
404
|
return Ok(());
|
407
405
|
}
|
408
406
|
|
409
|
-
let
|
410
|
-
|
411
|
-
|
412
|
-
|
413
|
-
exception::
|
414
|
-
format!(
|
415
|
-
|
416
|
-
Self::SELMA_HANDLE_TEXT,
|
417
|
-
rb_result.err().unwrap()
|
418
|
-
),
|
419
|
-
));
|
407
|
+
let rb_text_chunk = SelmaHTMLTextChunk::new(text_chunk);
|
408
|
+
match rb_handler.funcall::<_, _, Value>(Self::SELMA_HANDLE_TEXT_CHUNK, (rb_text_chunk,)) {
|
409
|
+
Ok(_) => Ok(()),
|
410
|
+
Err(err) => Err(magnus::Error::new(
|
411
|
+
exception::runtime_error(),
|
412
|
+
format!("{err:?}"),
|
413
|
+
)),
|
420
414
|
}
|
421
|
-
|
422
|
-
let new_content = rb_result.unwrap();
|
423
|
-
// TODO: can this be an option?
|
424
|
-
text.replace(&new_content, ContentType::Html);
|
425
|
-
|
426
|
-
Ok(())
|
427
415
|
}
|
428
416
|
}
|
429
417
|
|
data/ext/selma/src/sanitizer.rs
CHANGED
@@ -1,12 +1,10 @@
|
|
1
|
-
use std::{borrow::BorrowMut,
|
1
|
+
use std::{borrow::BorrowMut, collections::HashMap};
|
2
2
|
|
3
|
-
use lol_html::
|
4
|
-
|
5
|
-
|
6
|
-
Value,
|
3
|
+
use lol_html::{
|
4
|
+
errors::AttributeNameError,
|
5
|
+
html_content::{Comment, ContentType, Doctype, Element, EndTag},
|
7
6
|
};
|
8
|
-
|
9
|
-
use crate::tags::Tag;
|
7
|
+
use magnus::{class, function, method, scan_args, Module, Object, RArray, RHash, RModule, Value};
|
10
8
|
|
11
9
|
#[derive(Clone, Debug)]
|
12
10
|
struct ElementSanitizer {
|
@@ -16,9 +14,21 @@ struct ElementSanitizer {
|
|
16
14
|
protocol_sanitizers: HashMap<String, Vec<String>>,
|
17
15
|
}
|
18
16
|
|
17
|
+
impl Default for ElementSanitizer {
|
18
|
+
fn default() -> Self {
|
19
|
+
ElementSanitizer {
|
20
|
+
allowed_attrs: vec![],
|
21
|
+
allowed_classes: vec![],
|
22
|
+
required_attrs: vec![],
|
23
|
+
|
24
|
+
protocol_sanitizers: HashMap::new(),
|
25
|
+
}
|
26
|
+
}
|
27
|
+
}
|
28
|
+
|
19
29
|
#[derive(Clone, Debug)]
|
20
30
|
pub struct Sanitizer {
|
21
|
-
flags: [u8; Tag::TAG_COUNT],
|
31
|
+
flags: [u8; crate::tags::Tag::TAG_COUNT],
|
22
32
|
allowed_attrs: Vec<String>,
|
23
33
|
allowed_classes: Vec<String>,
|
24
34
|
element_sanitizers: HashMap<String, ElementSanitizer>,
|
@@ -39,7 +49,7 @@ impl SelmaSanitizer {
|
|
39
49
|
const SELMA_SANITIZER_REMOVE_CONTENTS: u8 = (1 << 2);
|
40
50
|
const SELMA_SANITIZER_WRAP_WHITESPACE: u8 = (1 << 3);
|
41
51
|
|
42
|
-
pub fn new(arguments: &[Value]) -> Result<Self, Error> {
|
52
|
+
pub fn new(arguments: &[Value]) -> Result<Self, magnus::Error> {
|
43
53
|
let args = scan_args::scan_args::<(), (Option<RHash>,), (), (), (), ()>(arguments)?;
|
44
54
|
let (opt_config,): (Option<RHash>,) = args.optional;
|
45
55
|
|
@@ -50,19 +60,16 @@ impl SelmaSanitizer {
|
|
50
60
|
};
|
51
61
|
|
52
62
|
let mut element_sanitizers = HashMap::new();
|
53
|
-
Tag::html_tags().iter().for_each(|html_tag| {
|
54
|
-
let es = ElementSanitizer
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
protocol_sanitizers: HashMap::new(),
|
60
|
-
};
|
61
|
-
element_sanitizers.insert(Tag::element_name_from_enum(html_tag).to_string(), es);
|
63
|
+
crate::tags::Tag::html_tags().iter().for_each(|html_tag| {
|
64
|
+
let es = ElementSanitizer::default();
|
65
|
+
element_sanitizers.insert(
|
66
|
+
crate::tags::Tag::element_name_from_enum(html_tag).to_string(),
|
67
|
+
es,
|
68
|
+
);
|
62
69
|
});
|
63
70
|
|
64
71
|
Ok(Self(std::cell::RefCell::new(Sanitizer {
|
65
|
-
flags: [0; Tag::TAG_COUNT],
|
72
|
+
flags: [0; crate::tags::Tag::TAG_COUNT],
|
66
73
|
allowed_attrs: vec![],
|
67
74
|
allowed_classes: vec![],
|
68
75
|
element_sanitizers,
|
@@ -74,7 +81,7 @@ impl SelmaSanitizer {
|
|
74
81
|
})))
|
75
82
|
}
|
76
83
|
|
77
|
-
fn get_config(&self) -> Result<RHash, Error> {
|
84
|
+
fn get_config(&self) -> Result<RHash, magnus::Error> {
|
78
85
|
let binding = self.0.borrow();
|
79
86
|
|
80
87
|
Ok(binding.config)
|
@@ -82,7 +89,7 @@ impl SelmaSanitizer {
|
|
82
89
|
|
83
90
|
/// Toggle a sanitizer option on or off.
|
84
91
|
fn set_flag(&self, tag_name: String, flag: u8, set: bool) {
|
85
|
-
let tag = Tag::tag_from_tag_name(tag_name.as_str());
|
92
|
+
let tag = crate::tags::Tag::tag_from_tag_name(tag_name.as_str());
|
86
93
|
if set {
|
87
94
|
self.0.borrow_mut().flags[tag.index] |= flag;
|
88
95
|
} else {
|
@@ -93,13 +100,19 @@ impl SelmaSanitizer {
|
|
93
100
|
/// Toggles all sanitization options on or off.
|
94
101
|
fn set_all_flags(&self, flag: u8, set: bool) {
|
95
102
|
if set {
|
96
|
-
Tag::html_tags()
|
97
|
-
|
98
|
-
|
103
|
+
crate::tags::Tag::html_tags()
|
104
|
+
.iter()
|
105
|
+
.enumerate()
|
106
|
+
.for_each(|(iter, _)| {
|
107
|
+
self.0.borrow_mut().flags[iter] |= flag;
|
108
|
+
});
|
99
109
|
} else {
|
100
|
-
Tag::html_tags()
|
101
|
-
|
102
|
-
|
110
|
+
crate::tags::Tag::html_tags()
|
111
|
+
.iter()
|
112
|
+
.enumerate()
|
113
|
+
.for_each(|(iter, _)| {
|
114
|
+
self.0.borrow_mut().flags[iter] &= flag;
|
115
|
+
});
|
103
116
|
}
|
104
117
|
}
|
105
118
|
|
@@ -111,8 +124,8 @@ impl SelmaSanitizer {
|
|
111
124
|
|
112
125
|
pub fn escape_tagfilter(&self, e: &mut Element) -> bool {
|
113
126
|
if self.0.borrow().escape_tagfilter {
|
114
|
-
let tag = Tag::tag_from_element(e);
|
115
|
-
if Tag::is_tag_escapeworthy(tag) {
|
127
|
+
let tag = crate::tags::Tag::tag_from_element(e);
|
128
|
+
if crate::tags::Tag::is_tag_escapeworthy(tag) {
|
116
129
|
e.remove();
|
117
130
|
return true;
|
118
131
|
}
|
@@ -162,7 +175,8 @@ impl SelmaSanitizer {
|
|
162
175
|
let allowed_attrs = &mut binding.allowed_attrs;
|
163
176
|
Self::set_allowed(allowed_attrs, &attr_name, allow);
|
164
177
|
} else {
|
165
|
-
let
|
178
|
+
let element_sanitizers = &mut binding.element_sanitizers;
|
179
|
+
let element_sanitizer = Self::get_element_sanitizer(element_sanitizers, &element_name);
|
166
180
|
|
167
181
|
element_sanitizer.allowed_attrs.push(attr_name);
|
168
182
|
}
|
@@ -176,7 +190,8 @@ impl SelmaSanitizer {
|
|
176
190
|
let allowed_classes = &mut binding.allowed_classes;
|
177
191
|
Self::set_allowed(allowed_classes, &class_name, allow);
|
178
192
|
} else {
|
179
|
-
let
|
193
|
+
let element_sanitizers = &mut binding.element_sanitizers;
|
194
|
+
let element_sanitizer = Self::get_element_sanitizer(element_sanitizers, &element_name);
|
180
195
|
|
181
196
|
let allowed_classes = element_sanitizer.allowed_classes.borrow_mut();
|
182
197
|
Self::set_allowed(allowed_classes, &class_name, allow)
|
@@ -187,9 +202,10 @@ impl SelmaSanitizer {
|
|
187
202
|
fn set_allowed_protocols(&self, element_name: String, attr_name: String, allow_list: RArray) {
|
188
203
|
let mut binding = self.0.borrow_mut();
|
189
204
|
|
190
|
-
let
|
205
|
+
let element_sanitizers = &mut binding.element_sanitizers;
|
206
|
+
let element_sanitizer = Self::get_element_sanitizer(element_sanitizers, &element_name);
|
191
207
|
|
192
|
-
let protocol_sanitizers = element_sanitizer.protocol_sanitizers.borrow_mut();
|
208
|
+
let protocol_sanitizers = &mut element_sanitizer.protocol_sanitizers.borrow_mut();
|
193
209
|
|
194
210
|
for opt_allowed_protocol in allow_list.each() {
|
195
211
|
let allowed_protocol = opt_allowed_protocol.unwrap();
|
@@ -229,10 +245,16 @@ impl SelmaSanitizer {
|
|
229
245
|
}
|
230
246
|
}
|
231
247
|
|
232
|
-
pub fn sanitize_attributes(&self, element: &mut Element) -> Result<(),
|
233
|
-
let
|
234
|
-
let
|
235
|
-
let element_sanitizer =
|
248
|
+
pub fn sanitize_attributes(&self, element: &mut Element) -> Result<(), AttributeNameError> {
|
249
|
+
let tag = crate::tags::Tag::tag_from_element(element);
|
250
|
+
let tag_name = &element.tag_name();
|
251
|
+
let element_sanitizer = {
|
252
|
+
let mut binding = self.0.borrow_mut();
|
253
|
+
let element_sanitizers = &mut binding.element_sanitizers;
|
254
|
+
Self::get_element_sanitizer(element_sanitizers, tag_name).clone()
|
255
|
+
};
|
256
|
+
|
257
|
+
let binding = self.0.borrow();
|
236
258
|
|
237
259
|
// FIXME: This is a hack to get around the fact that we can't borrow
|
238
260
|
let attribute_map: HashMap<String, String> = element
|
@@ -255,26 +277,30 @@ impl SelmaSanitizer {
|
|
255
277
|
let x = escapist::unescape_html(trimmed.as_bytes());
|
256
278
|
let unescaped_attr_val = String::from_utf8_lossy(&x).to_string();
|
257
279
|
|
258
|
-
|
280
|
+
let should_keep_attrubute = match Self::should_keep_attribute(
|
259
281
|
&binding,
|
260
282
|
element,
|
261
|
-
element_sanitizer,
|
283
|
+
&element_sanitizer,
|
262
284
|
attr_name,
|
263
285
|
&unescaped_attr_val,
|
264
286
|
) {
|
287
|
+
Ok(should_keep) => should_keep,
|
288
|
+
Err(e) => {
|
289
|
+
return Err(e);
|
290
|
+
}
|
291
|
+
};
|
292
|
+
|
293
|
+
if !should_keep_attrubute {
|
265
294
|
element.remove_attribute(attr_name);
|
266
295
|
} else {
|
267
296
|
// Prevent the use of `<meta>` elements that set a charset other than UTF-8,
|
268
297
|
// since output is always UTF-8.
|
269
|
-
if Tag::is_meta(tag) {
|
298
|
+
if crate::tags::Tag::is_meta(tag) {
|
270
299
|
if attr_name == "charset" && unescaped_attr_val != "utf-8" {
|
271
300
|
match element.set_attribute(attr_name, "utf-8") {
|
272
301
|
Ok(_) => {}
|
273
|
-
Err(
|
274
|
-
return Err(
|
275
|
-
exception::runtime_error(),
|
276
|
-
format!("Unable to change {attr_name:?}"),
|
277
|
-
));
|
302
|
+
Err(err) => {
|
303
|
+
return Err(err);
|
278
304
|
}
|
279
305
|
}
|
280
306
|
}
|
@@ -282,13 +308,17 @@ impl SelmaSanitizer {
|
|
282
308
|
let mut buf = String::new();
|
283
309
|
// ...then, escape any special characters, for security
|
284
310
|
if attr_name == "href" {
|
285
|
-
|
286
|
-
escapist::escape_href(&mut buf, unescaped_attr_val.to_string().as_str());
|
311
|
+
escapist::escape_href(&mut buf, unescaped_attr_val.as_str());
|
287
312
|
} else {
|
288
|
-
escapist::escape_html(&mut buf, unescaped_attr_val.
|
313
|
+
escapist::escape_html(&mut buf, unescaped_attr_val.as_str());
|
289
314
|
};
|
290
315
|
|
291
|
-
element.set_attribute(attr_name, &buf)
|
316
|
+
match element.set_attribute(attr_name, &buf) {
|
317
|
+
Ok(_) => {}
|
318
|
+
Err(err) => {
|
319
|
+
return Err(err);
|
320
|
+
}
|
321
|
+
}
|
292
322
|
}
|
293
323
|
}
|
294
324
|
}
|
@@ -308,12 +338,12 @@ impl SelmaSanitizer {
|
|
308
338
|
}
|
309
339
|
|
310
340
|
fn should_keep_attribute(
|
311
|
-
binding: &
|
341
|
+
binding: &Sanitizer,
|
312
342
|
element: &mut Element,
|
313
343
|
element_sanitizer: &ElementSanitizer,
|
314
344
|
attr_name: &String,
|
315
345
|
attr_val: &String,
|
316
|
-
) -> bool {
|
346
|
+
) -> Result<bool, AttributeNameError> {
|
317
347
|
let mut allowed: bool = false;
|
318
348
|
let element_allowed_attrs = element_sanitizer.allowed_attrs.contains(attr_name);
|
319
349
|
let sanitizer_allowed_attrs = binding.allowed_attrs.contains(attr_name);
|
@@ -327,7 +357,7 @@ impl SelmaSanitizer {
|
|
327
357
|
}
|
328
358
|
|
329
359
|
if !allowed {
|
330
|
-
return false;
|
360
|
+
return Ok(false);
|
331
361
|
}
|
332
362
|
|
333
363
|
let protocol_sanitizer_values = element_sanitizer.protocol_sanitizers.get(attr_name);
|
@@ -335,32 +365,29 @@ impl SelmaSanitizer {
|
|
335
365
|
None => {
|
336
366
|
// has a protocol, but no sanitization list
|
337
367
|
if !attr_val.is_empty() && Self::has_protocol(attr_val) {
|
338
|
-
return false;
|
368
|
+
return Ok(false);
|
339
369
|
}
|
340
370
|
}
|
341
371
|
Some(protocol_sanitizer_values) => {
|
342
372
|
if !attr_val.is_empty()
|
343
373
|
&& !Self::has_allowed_protocol(protocol_sanitizer_values, attr_val)
|
344
374
|
{
|
345
|
-
return false;
|
375
|
+
return Ok(false);
|
346
376
|
}
|
347
377
|
}
|
348
378
|
}
|
349
379
|
|
350
|
-
if attr_name == "class"
|
351
|
-
|
380
|
+
if attr_name == "class" {
|
381
|
+
return Self::sanitize_class_attribute(
|
352
382
|
binding,
|
353
383
|
element,
|
354
384
|
element_sanitizer,
|
355
385
|
attr_name,
|
356
386
|
attr_val,
|
357
|
-
)
|
358
|
-
.unwrap()
|
359
|
-
{
|
360
|
-
return false;
|
387
|
+
);
|
361
388
|
}
|
362
389
|
|
363
|
-
true
|
390
|
+
Ok(true)
|
364
391
|
}
|
365
392
|
|
366
393
|
fn has_protocol(attr_val: &str) -> bool {
|
@@ -398,12 +425,12 @@ impl SelmaSanitizer {
|
|
398
425
|
}
|
399
426
|
|
400
427
|
fn sanitize_class_attribute(
|
401
|
-
binding: &
|
428
|
+
binding: &Sanitizer,
|
402
429
|
element: &mut Element,
|
403
430
|
element_sanitizer: &ElementSanitizer,
|
404
431
|
attr_name: &str,
|
405
432
|
attr_val: &str,
|
406
|
-
) -> Result<bool,
|
433
|
+
) -> Result<bool, lol_html::errors::AttributeNameError> {
|
407
434
|
let allowed_global = &binding.allowed_classes;
|
408
435
|
|
409
436
|
let mut valid_classes: Vec<String> = vec![];
|
@@ -431,28 +458,25 @@ impl SelmaSanitizer {
|
|
431
458
|
|
432
459
|
match element.set_attribute(attr_name, valid_classes.join(" ").as_str()) {
|
433
460
|
Ok(_) => Ok(true),
|
434
|
-
Err(err) => Err(
|
435
|
-
exception::runtime_error(),
|
436
|
-
format!("AttributeNameError: {err:?}"),
|
437
|
-
)),
|
461
|
+
Err(err) => Err(err),
|
438
462
|
}
|
439
463
|
}
|
440
464
|
|
441
465
|
pub fn allow_element(&self, element: &mut Element) -> bool {
|
442
|
-
let tag = Tag::tag_from_element(element);
|
466
|
+
let tag = crate::tags::Tag::tag_from_element(element);
|
443
467
|
let flags: u8 = self.0.borrow().flags[tag.index];
|
444
468
|
|
445
469
|
(flags & Self::SELMA_SANITIZER_ALLOW) == 0
|
446
470
|
}
|
447
471
|
|
448
472
|
pub fn try_remove_element(&self, element: &mut Element) -> bool {
|
449
|
-
let tag = Tag::tag_from_element(element);
|
473
|
+
let tag = crate::tags::Tag::tag_from_element(element);
|
450
474
|
let flags: u8 = self.0.borrow().flags[tag.index];
|
451
475
|
|
452
476
|
let should_remove = !element.removed() && self.allow_element(element);
|
453
477
|
|
454
478
|
if should_remove {
|
455
|
-
if Tag::has_text_content(tag) {
|
479
|
+
if crate::tags::Tag::has_text_content(tag) {
|
456
480
|
Self::remove_element(
|
457
481
|
element,
|
458
482
|
tag.self_closing,
|
@@ -465,7 +489,7 @@ impl SelmaSanitizer {
|
|
465
489
|
Self::check_if_end_tag_needs_removal(element);
|
466
490
|
} else {
|
467
491
|
// anything in <iframe> must be removed, if it's kept
|
468
|
-
if Tag::is_iframe(tag) {
|
492
|
+
if crate::tags::Tag::is_iframe(tag) {
|
469
493
|
if self.0.borrow().flags[tag.index] != 0 {
|
470
494
|
element.set_inner_content(" ", ContentType::Text);
|
471
495
|
} else {
|
@@ -497,14 +521,14 @@ impl SelmaSanitizer {
|
|
497
521
|
}
|
498
522
|
|
499
523
|
pub fn force_remove_element(&self, element: &mut Element) {
|
500
|
-
let tag = Tag::tag_from_element(element);
|
524
|
+
let tag = crate::tags::Tag::tag_from_element(element);
|
501
525
|
let self_closing = tag.self_closing;
|
502
526
|
Self::remove_element(element, self_closing, Self::SELMA_SANITIZER_REMOVE_CONTENTS);
|
503
527
|
Self::check_if_end_tag_needs_removal(element);
|
504
528
|
}
|
505
529
|
|
506
530
|
fn check_if_end_tag_needs_removal(element: &mut Element) {
|
507
|
-
if element.removed() && !Tag::tag_from_element(element).self_closing {
|
531
|
+
if element.removed() && !crate::tags::Tag::tag_from_element(element).self_closing {
|
508
532
|
element
|
509
533
|
.on_end_tag(move |end| {
|
510
534
|
Self::remove_end_tag(end);
|
@@ -519,21 +543,16 @@ impl SelmaSanitizer {
|
|
519
543
|
}
|
520
544
|
|
521
545
|
fn get_element_sanitizer<'a>(
|
522
|
-
|
523
|
-
element_name: &str,
|
524
|
-
) -> &'a ElementSanitizer {
|
525
|
-
binding.element_sanitizers.get(element_name).unwrap()
|
526
|
-
}
|
527
|
-
|
528
|
-
fn get_mut_element_sanitizer<'a>(
|
529
|
-
binding: &'a mut Sanitizer,
|
546
|
+
element_sanitizers: &'a mut HashMap<String, ElementSanitizer>,
|
530
547
|
element_name: &str,
|
531
548
|
) -> &'a mut ElementSanitizer {
|
532
|
-
|
549
|
+
element_sanitizers
|
550
|
+
.entry(element_name.to_string())
|
551
|
+
.or_insert_with(ElementSanitizer::default)
|
533
552
|
}
|
534
553
|
}
|
535
554
|
|
536
|
-
pub fn init(m_selma: RModule) -> Result<(), Error> {
|
555
|
+
pub fn init(m_selma: RModule) -> Result<(), magnus::Error> {
|
537
556
|
let c_sanitizer = m_selma.define_class("Sanitizer", Default::default())?;
|
538
557
|
|
539
558
|
c_sanitizer.define_singleton_method("new", function!(SelmaSanitizer::new, -1))?;
|
data/ext/selma/src/tags.rs
CHANGED
@@ -192,14 +192,17 @@ impl Tag {
|
|
192
192
|
/// Is this tag something which needs to be removed?
|
193
193
|
pub fn is_tag_escapeworthy(tag: Tag) -> bool {
|
194
194
|
tag.index == HTMLTag::TITLE as usize
|
195
|
-
|| tag.index == HTMLTag::TEXTAREA as usize
|
196
|
-
|| tag.index == HTMLTag::STYLE as usize
|
197
|
-
|| tag.index == HTMLTag::XMP as usize
|
198
195
|
|| tag.index == HTMLTag::IFRAME as usize
|
196
|
+
|| tag.index == HTMLTag::MATH as usize
|
199
197
|
|| tag.index == HTMLTag::NOEMBED as usize
|
200
198
|
|| tag.index == HTMLTag::NOFRAMES as usize
|
201
|
-
|| tag.index == HTMLTag::
|
199
|
+
|| tag.index == HTMLTag::NOSCRIPT as usize
|
202
200
|
|| tag.index == HTMLTag::PLAINTEXT as usize
|
201
|
+
|| tag.index == HTMLTag::SCRIPT as usize
|
202
|
+
|| tag.index == HTMLTag::STYLE as usize
|
203
|
+
|| tag.index == HTMLTag::SVG as usize
|
204
|
+
|| tag.index == HTMLTag::TEXTAREA as usize
|
205
|
+
|| tag.index == HTMLTag::XMP as usize
|
203
206
|
}
|
204
207
|
|
205
208
|
pub const ESCAPEWORTHY_TAGS_CSS: &str =
|
data/lib/selma/3.1/selma.bundle
CHANGED
Binary file
|
@@ -3,6 +3,10 @@
|
|
3
3
|
module Selma
|
4
4
|
class Sanitizer
|
5
5
|
module Config
|
6
|
+
# although there are many more protocol types, eg., ftp, xmpp, etc.,
|
7
|
+
# these are the only ones that are allowed by default
|
8
|
+
VALID_PROTOCOLS = ["http", "https", "mailto", :relative]
|
9
|
+
|
6
10
|
DEFAULT = freeze_config(
|
7
11
|
# Whether or not to allow HTML comments. Allowing comments is strongly
|
8
12
|
# discouraged, since IE allows script execution within conditional
|
data/lib/selma/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: selma
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.5
|
5
5
|
platform: x86_64-darwin
|
6
6
|
authors:
|
7
7
|
- Garen J. Torikian
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-12-
|
11
|
+
date: 2022-12-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|
@@ -81,6 +81,7 @@ files:
|
|
81
81
|
- ext/selma/src/html.rs
|
82
82
|
- ext/selma/src/html/element.rs
|
83
83
|
- ext/selma/src/html/end_tag.rs
|
84
|
+
- ext/selma/src/html/text_chunk.rs
|
84
85
|
- ext/selma/src/lib.rs
|
85
86
|
- ext/selma/src/native_ref_wrap.rs
|
86
87
|
- ext/selma/src/rewriter.rs
|