selma 0.0.3-x64-mingw-ucrt → 0.0.5-x64-mingw-ucrt
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +11 -10
- data/ext/selma/Cargo.toml +1 -1
- data/ext/selma/src/html/element.rs +103 -44
- data/ext/selma/src/html/end_tag.rs +2 -2
- data/ext/selma/src/html/text_chunk.rs +113 -0
- data/ext/selma/src/html.rs +2 -0
- data/ext/selma/src/lib.rs +28 -1
- data/ext/selma/src/rewriter.rs +37 -49
- data/ext/selma/src/sanitizer.rs +102 -83
- data/ext/selma/src/tags.rs +7 -4
- data/lib/selma/3.1/selma.so +0 -0
- data/lib/selma/sanitizer/config/default.rb +4 -0
- data/lib/selma/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 71d2ed6bdc4d48a698b04caaa61d15bbb167d31e9fda1c0d48970cf2dbbd188d
|
4
|
+
data.tar.gz: 0c8a572b488c418aee28fe204053e8f969a9b43b20eb59291b219e14d5a29ad4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 45d724459cbe8b4491a8cf01185edb0443a7890344a95aced77d058d5631c082c26e6ea3f0b4bcdc6e27f5466e6f66f247d3ef3d9b02fa5a76d36fa723d8f3de
|
7
|
+
data.tar.gz: 12e315d7d96c534b7cd318c3e67fcb566ee1c721f0a58836079366d540f6dfd16aa80b20abd0b9f0c03047c04811881dd46a4f33fc48407ab3b687e6b44ebdc0
|
data/README.md
CHANGED
@@ -56,6 +56,10 @@ allow_comments: false,
|
|
56
56
|
# "<!DOCTYPE html>" when sanitizing a document.
|
57
57
|
allow_doctype: false,
|
58
58
|
|
59
|
+
# HTML elements to allow. By default, no elements are allowed (which means
|
60
|
+
# that all HTML will be stripped).
|
61
|
+
elements: ["a", "b", "img", ],
|
62
|
+
|
59
63
|
# HTML attributes to allow in specific elements. The key is the name of the element,
|
60
64
|
# and the value is an array of allowed attributes. By default, no attributes
|
61
65
|
# are allowed.
|
@@ -64,14 +68,10 @@ attributes: {
|
|
64
68
|
"img" => ["src"],
|
65
69
|
},
|
66
70
|
|
67
|
-
# HTML elements to allow. By default, no elements are allowed (which means
|
68
|
-
# that all HTML will be stripped).
|
69
|
-
elements: ["a", "b", "img", ],
|
70
|
-
|
71
71
|
# URL handling protocols to allow in specific attributes. By default, no
|
72
72
|
# protocols are allowed. Use :relative in place of a protocol if you want
|
73
73
|
# to allow relative URLs sans protocol.
|
74
|
-
|
74
|
+
protocols: {
|
75
75
|
"a" => { "href" => ["http", "https", "mailto", :relative] },
|
76
76
|
"img" => { "href" => ["http", "https"] },
|
77
77
|
},
|
@@ -91,7 +91,7 @@ The real power in Selma comes in its use of handlers. A handler is simply an obj
|
|
91
91
|
|
92
92
|
- `selector`, a method which MUST return an instance of `Selma::Selector`, which defines the CSS rules to match
|
93
93
|
- `handle_element`, a method that's called on each matched element
|
94
|
-
- `
|
94
|
+
- `handle_text_chunk`, a method that's called on each matched text node; this MUST return a string
|
95
95
|
|
96
96
|
Here's an example which rewrites the `href` attribute on `a` and the `src` attribute on `img` to be `https` rather than `http`.
|
97
97
|
|
@@ -118,7 +118,7 @@ rewriter = Selma::Rewriter.new(handlers: [MatchAttribute.new])
|
|
118
118
|
The `Selma::Selector` object has three possible kwargs:
|
119
119
|
|
120
120
|
- `match_element`: any element which matches this CSS rule will be passed on to `handle_element`
|
121
|
-
- `match_text_within`: any element which matches this CSS rule will be passed on to `
|
121
|
+
- `match_text_within`: any element which matches this CSS rule will be passed on to `handle_text_chunk`
|
122
122
|
- `ignore_text_within`: this is an array of element names whose text contents will be ignored
|
123
123
|
|
124
124
|
You've seen an example of `match_element`; here's one for `match_text_within` which changes strings in various elements which are _not_ `pre` or `code`:
|
@@ -132,7 +132,7 @@ class MatchText
|
|
132
132
|
SELECTOR
|
133
133
|
end
|
134
134
|
|
135
|
-
def
|
135
|
+
def handle_text_chunk(text)
|
136
136
|
string.sub(/@.+/, "<a href=\"www.yetto.app/#{Regexp.last_match}\">")
|
137
137
|
end
|
138
138
|
end
|
@@ -150,8 +150,9 @@ The `element` argument in `handle_element` has the following methods:
|
|
150
150
|
- `remove_attribute`: remove an attribute
|
151
151
|
- `attributes`: list all the attributes
|
152
152
|
- `ancestors`: list all the ancestors
|
153
|
-
- `append(content, content_type)`: appends `content` to the element's inner content, i.e. inserts content right before the element's end tag. `content_type` is either `:text` or `:html` and determines how the content will be applied.
|
154
|
-
- `
|
153
|
+
- `append(content, as: content_type)`: appends `content` to the element's inner content, i.e. inserts content right before the element's end tag. `content_type` is either `:text` or `:html` and determines how the content will be applied.
|
154
|
+
- `before(content, as: content_type)`: Inserts `content` before the element. `content_type` is either `:text` or `:html` and determines how the content will be applied.
|
155
|
+
- `after(content, as: content_type)`: Inserts `content` after the element. `content_type` is either `:text` or `:html` and determines how the content will be applied.
|
155
156
|
- `set_inner_content(content, as: content_type)`: replaces the inner content of the element with `content`. `content_type` is either `:text` or `:html` and determines how the content will be applied.
|
156
157
|
|
157
158
|
## Benchmarks
|
data/ext/selma/Cargo.toml
CHANGED
@@ -1,8 +1,6 @@
|
|
1
|
-
use std::borrow::Cow;
|
2
|
-
|
3
1
|
use crate::native_ref_wrap::NativeRefWrap;
|
4
|
-
use lol_html::html_content::
|
5
|
-
use magnus::{exception, method, Error, Module, RArray, RClass, RHash, RString,
|
2
|
+
use lol_html::html_content::Element;
|
3
|
+
use magnus::{exception, method, Error, Module, RArray, RClass, RHash, RString, Value};
|
6
4
|
|
7
5
|
struct HTMLElement {
|
8
6
|
element: NativeRefWrap<Element<'static, 'static>>,
|
@@ -38,6 +36,48 @@ impl SelmaHTMLElement {
|
|
38
36
|
}
|
39
37
|
}
|
40
38
|
|
39
|
+
fn set_tag_name(&self, name: String) -> Result<(), Error> {
|
40
|
+
let mut binding = self.0.borrow_mut();
|
41
|
+
|
42
|
+
if let Ok(element) = binding.element.get_mut() {
|
43
|
+
match element.set_tag_name(&name) {
|
44
|
+
Ok(_) => Ok(()),
|
45
|
+
Err(err) => Err(Error::new(exception::runtime_error(), format!("{err:?}"))),
|
46
|
+
}
|
47
|
+
} else {
|
48
|
+
Err(Error::new(
|
49
|
+
exception::runtime_error(),
|
50
|
+
"`set_tag_name` is not available",
|
51
|
+
))
|
52
|
+
}
|
53
|
+
}
|
54
|
+
|
55
|
+
fn is_self_closing(&self) -> Result<bool, Error> {
|
56
|
+
let binding = self.0.borrow();
|
57
|
+
|
58
|
+
if let Ok(e) = binding.element.get() {
|
59
|
+
Ok(e.is_self_closing())
|
60
|
+
} else {
|
61
|
+
Err(Error::new(
|
62
|
+
exception::runtime_error(),
|
63
|
+
"`is_self_closing` is not available",
|
64
|
+
))
|
65
|
+
}
|
66
|
+
}
|
67
|
+
|
68
|
+
fn has_attribute(&self, attr: String) -> Result<bool, Error> {
|
69
|
+
let binding = self.0.borrow();
|
70
|
+
|
71
|
+
if let Ok(e) = binding.element.get() {
|
72
|
+
Ok(e.has_attribute(&attr))
|
73
|
+
} else {
|
74
|
+
Err(Error::new(
|
75
|
+
exception::runtime_error(),
|
76
|
+
"`is_self_closing` is not available",
|
77
|
+
))
|
78
|
+
}
|
79
|
+
}
|
80
|
+
|
41
81
|
fn get_attribute(&self, attr: String) -> Option<String> {
|
42
82
|
let binding = self.0.borrow();
|
43
83
|
let element = binding.element.get();
|
@@ -106,89 +146,108 @@ impl SelmaHTMLElement {
|
|
106
146
|
Ok(array)
|
107
147
|
}
|
108
148
|
|
109
|
-
fn
|
149
|
+
fn before(&self, args: &[Value]) -> Result<(), Error> {
|
110
150
|
let mut binding = self.0.borrow_mut();
|
111
151
|
let element = binding.element.get_mut().unwrap();
|
112
152
|
|
113
|
-
let text_str =
|
153
|
+
let (text_str, content_type) = match crate::scan_text_args(args) {
|
154
|
+
Ok((text_str, content_type)) => (text_str, content_type),
|
155
|
+
Err(err) => return Err(err),
|
156
|
+
};
|
114
157
|
|
115
|
-
|
116
|
-
|
117
|
-
element.append(text_str, content_type);
|
158
|
+
element.before(&text_str, content_type);
|
118
159
|
|
119
160
|
Ok(())
|
120
161
|
}
|
121
162
|
|
122
|
-
fn
|
123
|
-
&self,
|
124
|
-
start_text: String,
|
125
|
-
end_text: String,
|
126
|
-
content_type: Symbol,
|
127
|
-
) -> Result<(), Error> {
|
163
|
+
fn after(&self, args: &[Value]) -> Result<(), Error> {
|
128
164
|
let mut binding = self.0.borrow_mut();
|
129
165
|
let element = binding.element.get_mut().unwrap();
|
130
166
|
|
131
|
-
let
|
132
|
-
|
133
|
-
|
134
|
-
|
167
|
+
let (text_str, content_type) = match crate::scan_text_args(args) {
|
168
|
+
Ok((text_str, content_type)) => (text_str, content_type),
|
169
|
+
Err(err) => return Err(err),
|
170
|
+
};
|
171
|
+
|
172
|
+
element.after(&text_str, content_type);
|
135
173
|
|
136
174
|
Ok(())
|
137
175
|
}
|
138
176
|
|
139
|
-
fn
|
177
|
+
fn prepend(&self, args: &[Value]) -> Result<(), Error> {
|
140
178
|
let mut binding = self.0.borrow_mut();
|
141
179
|
let element = binding.element.get_mut().unwrap();
|
142
180
|
|
143
|
-
let text_str =
|
181
|
+
let (text_str, content_type) = match crate::scan_text_args(args) {
|
182
|
+
Ok((text_str, content_type)) => (text_str, content_type),
|
183
|
+
Err(err) => return Err(err),
|
184
|
+
};
|
185
|
+
|
186
|
+
element.prepend(&text_str, content_type);
|
144
187
|
|
145
|
-
|
188
|
+
Ok(())
|
189
|
+
}
|
146
190
|
|
147
|
-
|
191
|
+
fn append(&self, args: &[Value]) -> Result<(), Error> {
|
192
|
+
let mut binding = self.0.borrow_mut();
|
193
|
+
let element = binding.element.get_mut().unwrap();
|
194
|
+
|
195
|
+
let (text_str, content_type) = match crate::scan_text_args(args) {
|
196
|
+
Ok((text_str, content_type)) => (text_str, content_type),
|
197
|
+
Err(err) => return Err(err),
|
198
|
+
};
|
199
|
+
|
200
|
+
element.append(&text_str, content_type);
|
148
201
|
|
149
202
|
Ok(())
|
150
203
|
}
|
151
204
|
|
152
|
-
fn
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
exception::runtime_error(),
|
165
|
-
format!("Could not unwrap symbol: {err:?}"),
|
166
|
-
))
|
167
|
-
.unwrap(),
|
168
|
-
}
|
205
|
+
fn set_inner_content(&self, args: &[Value]) -> Result<(), Error> {
|
206
|
+
let mut binding = self.0.borrow_mut();
|
207
|
+
let element = binding.element.get_mut().unwrap();
|
208
|
+
|
209
|
+
let (inner_content, content_type) = match crate::scan_text_args(args) {
|
210
|
+
Ok((inner_content, content_type)) => (inner_content, content_type),
|
211
|
+
Err(err) => return Err(err),
|
212
|
+
};
|
213
|
+
|
214
|
+
element.set_inner_content(&inner_content, content_type);
|
215
|
+
|
216
|
+
Ok(())
|
169
217
|
}
|
170
218
|
}
|
171
219
|
|
172
220
|
pub fn init(c_html: RClass) -> Result<(), Error> {
|
173
221
|
let c_element = c_html
|
174
222
|
.define_class("Element", Default::default())
|
175
|
-
.expect("cannot find class Selma::Element");
|
223
|
+
.expect("cannot find class Selma::HTML::Element");
|
176
224
|
|
177
225
|
c_element.define_method("tag_name", method!(SelmaHTMLElement::tag_name, 0))?;
|
226
|
+
c_element.define_method("tag_name=", method!(SelmaHTMLElement::set_tag_name, 1))?;
|
227
|
+
c_element.define_method(
|
228
|
+
"self_closing?",
|
229
|
+
method!(SelmaHTMLElement::is_self_closing, 0),
|
230
|
+
)?;
|
178
231
|
c_element.define_method("[]", method!(SelmaHTMLElement::get_attribute, 1))?;
|
179
232
|
c_element.define_method("[]=", method!(SelmaHTMLElement::set_attribute, 2))?;
|
180
233
|
c_element.define_method(
|
181
234
|
"remove_attribute",
|
182
235
|
method!(SelmaHTMLElement::remove_attribute, 1),
|
183
236
|
)?;
|
237
|
+
c_element.define_method(
|
238
|
+
"has_attribute?",
|
239
|
+
method!(SelmaHTMLElement::has_attribute, 1),
|
240
|
+
)?;
|
184
241
|
c_element.define_method("attributes", method!(SelmaHTMLElement::get_attributes, 0))?;
|
185
242
|
c_element.define_method("ancestors", method!(SelmaHTMLElement::get_ancestors, 0))?;
|
186
243
|
|
187
|
-
c_element.define_method("
|
188
|
-
c_element.define_method("
|
244
|
+
c_element.define_method("before", method!(SelmaHTMLElement::before, -1))?;
|
245
|
+
c_element.define_method("after", method!(SelmaHTMLElement::after, -1))?;
|
246
|
+
c_element.define_method("prepend", method!(SelmaHTMLElement::prepend, -1))?;
|
247
|
+
c_element.define_method("append", method!(SelmaHTMLElement::append, -1))?;
|
189
248
|
c_element.define_method(
|
190
249
|
"set_inner_content",
|
191
|
-
method!(SelmaHTMLElement::set_inner_content,
|
250
|
+
method!(SelmaHTMLElement::set_inner_content, -1),
|
192
251
|
)?;
|
193
252
|
|
194
253
|
Ok(())
|
@@ -6,7 +6,7 @@ struct HTMLEndTag {
|
|
6
6
|
end_tag: NativeRefWrap<EndTag<'static>>,
|
7
7
|
}
|
8
8
|
|
9
|
-
#[magnus::wrap(class = "Selma::HTML::
|
9
|
+
#[magnus::wrap(class = "Selma::HTML::EndTag")]
|
10
10
|
pub struct SelmaHTMLEndTag(std::cell::RefCell<HTMLEndTag>);
|
11
11
|
|
12
12
|
/// SAFETY: This is safe because we only access this data when the GVL is held.
|
@@ -27,7 +27,7 @@ impl SelmaHTMLEndTag {
|
|
27
27
|
pub fn init(c_html: RClass) -> Result<(), Error> {
|
28
28
|
let c_end_tag = c_html
|
29
29
|
.define_class("EndTag", Default::default())
|
30
|
-
.expect("cannot find class Selma::EndTag");
|
30
|
+
.expect("cannot find class Selma::HTML::EndTag");
|
31
31
|
|
32
32
|
c_end_tag.define_method("tag_name", method!(SelmaHTMLEndTag::tag_name, 0))?;
|
33
33
|
|
@@ -0,0 +1,113 @@
|
|
1
|
+
use crate::native_ref_wrap::NativeRefWrap;
|
2
|
+
use lol_html::html_content::{TextChunk, TextType};
|
3
|
+
use magnus::{exception, method, Error, Module, RClass, Symbol, Value};
|
4
|
+
|
5
|
+
struct HTMLTextChunk {
|
6
|
+
text_chunk: NativeRefWrap<TextChunk<'static>>,
|
7
|
+
}
|
8
|
+
|
9
|
+
#[magnus::wrap(class = "Selma::HTML::TextChunk")]
|
10
|
+
pub struct SelmaHTMLTextChunk(std::cell::RefCell<HTMLTextChunk>);
|
11
|
+
|
12
|
+
/// SAFETY: This is safe because we only access this data when the GVL is held.
|
13
|
+
unsafe impl Send for SelmaHTMLTextChunk {}
|
14
|
+
|
15
|
+
impl SelmaHTMLTextChunk {
|
16
|
+
pub fn new(text_chunk: &mut TextChunk) -> Self {
|
17
|
+
let (ref_wrap, _anchor) = NativeRefWrap::wrap_mut(text_chunk);
|
18
|
+
|
19
|
+
Self(std::cell::RefCell::new(HTMLTextChunk {
|
20
|
+
text_chunk: ref_wrap,
|
21
|
+
}))
|
22
|
+
}
|
23
|
+
|
24
|
+
fn to_s(&self) -> Result<String, Error> {
|
25
|
+
let binding = self.0.borrow();
|
26
|
+
|
27
|
+
if let Ok(tc) = binding.text_chunk.get() {
|
28
|
+
Ok(tc.as_str().to_string())
|
29
|
+
} else {
|
30
|
+
Err(Error::new(
|
31
|
+
exception::runtime_error(),
|
32
|
+
"`to_s` is not available",
|
33
|
+
))
|
34
|
+
}
|
35
|
+
}
|
36
|
+
|
37
|
+
fn text_type(&self) -> Result<Symbol, Error> {
|
38
|
+
let binding = self.0.borrow();
|
39
|
+
|
40
|
+
if let Ok(tc) = binding.text_chunk.get() {
|
41
|
+
match tc.text_type() {
|
42
|
+
TextType::Data => Ok(Symbol::from("data")),
|
43
|
+
TextType::PlainText => Ok(Symbol::from("plain_text")),
|
44
|
+
TextType::RawText => Ok(Symbol::from("raw_text")),
|
45
|
+
TextType::ScriptData => Ok(Symbol::from("script")),
|
46
|
+
TextType::RCData => Ok(Symbol::from("rc_data")),
|
47
|
+
TextType::CDataSection => Ok(Symbol::from("cdata_section")),
|
48
|
+
}
|
49
|
+
} else {
|
50
|
+
Err(Error::new(
|
51
|
+
exception::runtime_error(),
|
52
|
+
"`text_type` is not available",
|
53
|
+
))
|
54
|
+
}
|
55
|
+
}
|
56
|
+
|
57
|
+
fn before(&self, args: &[Value]) -> Result<(), Error> {
|
58
|
+
let mut binding = self.0.borrow_mut();
|
59
|
+
let text_chunk = binding.text_chunk.get_mut().unwrap();
|
60
|
+
|
61
|
+
let (text_str, content_type) = match crate::scan_text_args(args) {
|
62
|
+
Ok((text_str, content_type)) => (text_str, content_type),
|
63
|
+
Err(err) => return Err(err),
|
64
|
+
};
|
65
|
+
|
66
|
+
text_chunk.before(&text_str, content_type);
|
67
|
+
|
68
|
+
Ok(())
|
69
|
+
}
|
70
|
+
|
71
|
+
fn after(&self, args: &[Value]) -> Result<(), Error> {
|
72
|
+
let mut binding = self.0.borrow_mut();
|
73
|
+
let text_chunk = binding.text_chunk.get_mut().unwrap();
|
74
|
+
|
75
|
+
let (text_str, content_type) = match crate::scan_text_args(args) {
|
76
|
+
Ok((text_str, content_type)) => (text_str, content_type),
|
77
|
+
Err(err) => return Err(err),
|
78
|
+
};
|
79
|
+
|
80
|
+
text_chunk.after(&text_str, content_type);
|
81
|
+
|
82
|
+
Ok(())
|
83
|
+
}
|
84
|
+
|
85
|
+
fn replace(&self, args: &[Value]) -> Result<(), Error> {
|
86
|
+
let mut binding = self.0.borrow_mut();
|
87
|
+
let text_chunk = binding.text_chunk.get_mut().unwrap();
|
88
|
+
|
89
|
+
let (text_str, content_type) = match crate::scan_text_args(args) {
|
90
|
+
Ok((text_str, content_type)) => (text_str, content_type),
|
91
|
+
Err(err) => return Err(err),
|
92
|
+
};
|
93
|
+
|
94
|
+
text_chunk.replace(&text_str, content_type);
|
95
|
+
|
96
|
+
Ok(())
|
97
|
+
}
|
98
|
+
}
|
99
|
+
|
100
|
+
pub fn init(c_html: RClass) -> Result<(), Error> {
|
101
|
+
let c_text_chunk = c_html
|
102
|
+
.define_class("TextChunk", Default::default())
|
103
|
+
.expect("cannot find class Selma::HTML::TextChunk");
|
104
|
+
|
105
|
+
c_text_chunk.define_method("to_s", method!(SelmaHTMLTextChunk::to_s, 0))?;
|
106
|
+
c_text_chunk.define_method("content", method!(SelmaHTMLTextChunk::to_s, 0))?;
|
107
|
+
c_text_chunk.define_method("text_type", method!(SelmaHTMLTextChunk::text_type, 0))?;
|
108
|
+
c_text_chunk.define_method("before", method!(SelmaHTMLTextChunk::before, -1))?;
|
109
|
+
c_text_chunk.define_method("after", method!(SelmaHTMLTextChunk::after, -1))?;
|
110
|
+
c_text_chunk.define_method("replace", method!(SelmaHTMLTextChunk::replace, -1))?;
|
111
|
+
|
112
|
+
Ok(())
|
113
|
+
}
|
data/ext/selma/src/html.rs
CHANGED
@@ -9,9 +9,11 @@ pub fn init(m_selma: RModule) -> Result<(), Error> {
|
|
9
9
|
|
10
10
|
element::init(c_html).expect("cannot define Selma::HTML::Element class");
|
11
11
|
end_tag::init(c_html).expect("cannot define Selma::HTML::EndTag class");
|
12
|
+
text_chunk::init(c_html).expect("cannot define Selma::HTML::TextChunk class");
|
12
13
|
|
13
14
|
Ok(())
|
14
15
|
}
|
15
16
|
|
16
17
|
pub mod element;
|
17
18
|
pub mod end_tag;
|
19
|
+
pub mod text_chunk;
|
data/ext/selma/src/lib.rs
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
extern crate core;
|
2
2
|
|
3
|
-
use
|
3
|
+
use lol_html::html_content::ContentType;
|
4
|
+
use magnus::{define_module, exception, scan_args, Error, Symbol, Value};
|
4
5
|
|
5
6
|
pub mod html;
|
6
7
|
pub mod native_ref_wrap;
|
@@ -10,6 +11,32 @@ pub mod selector;
|
|
10
11
|
pub mod tags;
|
11
12
|
pub mod wrapped_struct;
|
12
13
|
|
14
|
+
#[allow(clippy::let_unit_value)]
|
15
|
+
fn scan_text_args(args: &[Value]) -> Result<(String, ContentType), magnus::Error> {
|
16
|
+
let args = scan_args::scan_args(args)?;
|
17
|
+
let (text,): (String,) = args.required;
|
18
|
+
let _: () = args.optional;
|
19
|
+
let _: () = args.splat;
|
20
|
+
let _: () = args.trailing;
|
21
|
+
let _: () = args.block;
|
22
|
+
|
23
|
+
let kwargs = scan_args::get_kwargs::<_, (Symbol,), (), ()>(args.keywords, &["as"], &[])?;
|
24
|
+
let as_sym = kwargs.required.0;
|
25
|
+
let as_sym_str = as_sym.name().unwrap();
|
26
|
+
let content_type = if as_sym_str == "text" {
|
27
|
+
ContentType::Text
|
28
|
+
} else if as_sym_str == "html" {
|
29
|
+
ContentType::Html
|
30
|
+
} else {
|
31
|
+
return Err(Error::new(
|
32
|
+
exception::runtime_error(),
|
33
|
+
format!("unknown symbol `{as_sym_str:?}`"),
|
34
|
+
));
|
35
|
+
};
|
36
|
+
|
37
|
+
Ok((text, content_type))
|
38
|
+
}
|
39
|
+
|
13
40
|
#[magnus::init]
|
14
41
|
fn init() -> Result<(), Error> {
|
15
42
|
let m_selma = define_module("Selma").expect("cannot define ::Selma module");
|
data/ext/selma/src/rewriter.rs
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
use lol_html::{
|
2
2
|
doc_comments, doctype, element,
|
3
|
-
html_content::{
|
3
|
+
html_content::{Element, EndTag, TextChunk},
|
4
4
|
text, DocumentContentHandlers, ElementContentHandlers, HtmlRewriter, Selector, Settings,
|
5
5
|
};
|
6
6
|
use magnus::{exception, function, method, scan_args, Module, Object, RArray, RModule, Value};
|
@@ -8,7 +8,7 @@ use magnus::{exception, function, method, scan_args, Module, Object, RArray, RMo
|
|
8
8
|
use std::{borrow::Cow, cell::RefCell, primitive::str, rc::Rc};
|
9
9
|
|
10
10
|
use crate::{
|
11
|
-
html::{element::SelmaHTMLElement, end_tag::SelmaHTMLEndTag},
|
11
|
+
html::{element::SelmaHTMLElement, end_tag::SelmaHTMLEndTag, text_chunk::SelmaHTMLTextChunk},
|
12
12
|
sanitizer::SelmaSanitizer,
|
13
13
|
selector::SelmaSelector,
|
14
14
|
tags::Tag,
|
@@ -43,7 +43,7 @@ unsafe impl Send for SelmaRewriter {}
|
|
43
43
|
impl SelmaRewriter {
|
44
44
|
const SELMA_ON_END_TAG: &str = "on_end_tag";
|
45
45
|
const SELMA_HANDLE_ELEMENT: &str = "handle_element";
|
46
|
-
const
|
46
|
+
const SELMA_HANDLE_TEXT_CHUNK: &str = "handle_text_chunk";
|
47
47
|
|
48
48
|
/// @yard
|
49
49
|
/// @def new(sanitizer: Selma::Sanitizer.new(Selma::Sanitizer::Config::DEFAULT), handlers: [])
|
@@ -145,7 +145,7 @@ impl SelmaRewriter {
|
|
145
145
|
let _: () = args.trailing;
|
146
146
|
let _: () = args.block;
|
147
147
|
|
148
|
-
let
|
148
|
+
let kwargs = scan_args::get_kwargs::<
|
149
149
|
_,
|
150
150
|
(),
|
151
151
|
(
|
@@ -154,7 +154,7 @@ impl SelmaRewriter {
|
|
154
154
|
),
|
155
155
|
(),
|
156
156
|
>(args.keywords, &[], &["sanitizer", "handlers"])?;
|
157
|
-
let (rb_sanitizer, rb_handlers) =
|
157
|
+
let (rb_sanitizer, rb_handlers) = kwargs.optional;
|
158
158
|
|
159
159
|
Ok((rb_sanitizer, rb_handlers))
|
160
160
|
}
|
@@ -162,26 +162,22 @@ impl SelmaRewriter {
|
|
162
162
|
/// Perform HTML rewrite sequence.
|
163
163
|
fn rewrite(&self, html: String) -> Result<String, magnus::Error> {
|
164
164
|
let sanitized_html = match &self.0.borrow().sanitizer {
|
165
|
-
None => html,
|
165
|
+
None => Ok(html),
|
166
166
|
Some(sanitizer) => {
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
let sanitized_html = Self::perform_sanitization(sanitizer, &html).unwrap();
|
167
|
+
let sanitized_html = match Self::perform_sanitization(sanitizer, &html) {
|
168
|
+
Ok(sanitized_html) => sanitized_html,
|
169
|
+
Err(err) => return Err(err),
|
170
|
+
};
|
172
171
|
|
173
|
-
String::from_utf8(sanitized_html)
|
172
|
+
String::from_utf8(sanitized_html)
|
174
173
|
}
|
175
174
|
};
|
176
175
|
let binding = self.0.borrow_mut();
|
177
176
|
let handlers = &binding.handlers;
|
178
177
|
|
179
|
-
match Self::perform_handler_rewrite(self, handlers, sanitized_html) {
|
178
|
+
match Self::perform_handler_rewrite(self, handlers, sanitized_html.unwrap()) {
|
180
179
|
Ok(rewritten_html) => Ok(String::from_utf8(rewritten_html).unwrap()),
|
181
|
-
Err(err) => Err(
|
182
|
-
exception::runtime_error(),
|
183
|
-
format!("{err:?}"),
|
184
|
-
)),
|
180
|
+
Err(err) => Err(err),
|
185
181
|
}
|
186
182
|
}
|
187
183
|
|
@@ -212,9 +208,10 @@ impl SelmaRewriter {
|
|
212
208
|
if el.removed() {
|
213
209
|
return Ok(());
|
214
210
|
}
|
215
|
-
sanitizer.sanitize_attributes(el)
|
216
|
-
|
217
|
-
|
211
|
+
match sanitizer.sanitize_attributes(el) {
|
212
|
+
Ok(_) => Ok(()),
|
213
|
+
Err(err) => Err(err.to_string().into()),
|
214
|
+
}
|
218
215
|
})],
|
219
216
|
// TODO: allow for MemorySettings to be defined
|
220
217
|
..Settings::default()
|
@@ -341,7 +338,7 @@ impl SelmaRewriter {
|
|
341
338
|
let mut stack = closure_element_stack.as_ref().borrow_mut();
|
342
339
|
stack.pop();
|
343
340
|
Ok(())
|
344
|
-
})
|
341
|
+
})?;
|
345
342
|
Ok(())
|
346
343
|
}));
|
347
344
|
});
|
@@ -375,13 +372,14 @@ impl SelmaRewriter {
|
|
375
372
|
) -> Result<(), magnus::Error> {
|
376
373
|
// if `on_end_tag` function is defined, call it
|
377
374
|
if rb_handler.respond_to(Self::SELMA_ON_END_TAG, true).unwrap() {
|
375
|
+
// TODO: error here is an "EndTagError"
|
378
376
|
element.on_end_tag(move |end_tag| {
|
379
377
|
let rb_end_tag = SelmaHTMLEndTag::new(end_tag);
|
380
378
|
|
381
|
-
rb_handler
|
382
|
-
|
383
|
-
.
|
384
|
-
|
379
|
+
match rb_handler.funcall::<_, _, Value>(Self::SELMA_ON_END_TAG, (rb_end_tag,)) {
|
380
|
+
Ok(_) => Ok(()),
|
381
|
+
Err(err) => Err(err.to_string().into()),
|
382
|
+
}
|
385
383
|
});
|
386
384
|
}
|
387
385
|
|
@@ -390,40 +388,30 @@ impl SelmaRewriter {
|
|
390
388
|
rb_handler.funcall::<_, _, Value>(Self::SELMA_HANDLE_ELEMENT, (rb_element,));
|
391
389
|
match rb_result {
|
392
390
|
Ok(_) => Ok(()),
|
393
|
-
Err(err) => Err(
|
394
|
-
exception::runtime_error(),
|
395
|
-
format!("{err:?}"),
|
396
|
-
)),
|
391
|
+
Err(err) => Err(err),
|
397
392
|
}
|
398
393
|
}
|
399
394
|
|
400
|
-
fn process_text_handlers(
|
401
|
-
|
402
|
-
|
395
|
+
fn process_text_handlers(
|
396
|
+
rb_handler: Value,
|
397
|
+
text_chunk: &mut TextChunk,
|
398
|
+
) -> Result<(), magnus::Error> {
|
399
|
+
// prevents missing `handle_text_chunk` function
|
400
|
+
let content = text_chunk.as_str();
|
403
401
|
|
404
402
|
// seems that sometimes lol-html returns blank text / EOLs?
|
405
403
|
if content.is_empty() {
|
406
404
|
return Ok(());
|
407
405
|
}
|
408
406
|
|
409
|
-
let
|
410
|
-
|
411
|
-
|
412
|
-
|
413
|
-
exception::
|
414
|
-
format!(
|
415
|
-
|
416
|
-
Self::SELMA_HANDLE_TEXT,
|
417
|
-
rb_result.err().unwrap()
|
418
|
-
),
|
419
|
-
));
|
407
|
+
let rb_text_chunk = SelmaHTMLTextChunk::new(text_chunk);
|
408
|
+
match rb_handler.funcall::<_, _, Value>(Self::SELMA_HANDLE_TEXT_CHUNK, (rb_text_chunk,)) {
|
409
|
+
Ok(_) => Ok(()),
|
410
|
+
Err(err) => Err(magnus::Error::new(
|
411
|
+
exception::runtime_error(),
|
412
|
+
format!("{err:?}"),
|
413
|
+
)),
|
420
414
|
}
|
421
|
-
|
422
|
-
let new_content = rb_result.unwrap();
|
423
|
-
// TODO: can this be an option?
|
424
|
-
text.replace(&new_content, ContentType::Html);
|
425
|
-
|
426
|
-
Ok(())
|
427
415
|
}
|
428
416
|
}
|
429
417
|
|
data/ext/selma/src/sanitizer.rs
CHANGED
@@ -1,12 +1,10 @@
|
|
1
|
-
use std::{borrow::BorrowMut,
|
1
|
+
use std::{borrow::BorrowMut, collections::HashMap};
|
2
2
|
|
3
|
-
use lol_html::
|
4
|
-
|
5
|
-
|
6
|
-
Value,
|
3
|
+
use lol_html::{
|
4
|
+
errors::AttributeNameError,
|
5
|
+
html_content::{Comment, ContentType, Doctype, Element, EndTag},
|
7
6
|
};
|
8
|
-
|
9
|
-
use crate::tags::Tag;
|
7
|
+
use magnus::{class, function, method, scan_args, Module, Object, RArray, RHash, RModule, Value};
|
10
8
|
|
11
9
|
#[derive(Clone, Debug)]
|
12
10
|
struct ElementSanitizer {
|
@@ -16,9 +14,21 @@ struct ElementSanitizer {
|
|
16
14
|
protocol_sanitizers: HashMap<String, Vec<String>>,
|
17
15
|
}
|
18
16
|
|
17
|
+
impl Default for ElementSanitizer {
|
18
|
+
fn default() -> Self {
|
19
|
+
ElementSanitizer {
|
20
|
+
allowed_attrs: vec![],
|
21
|
+
allowed_classes: vec![],
|
22
|
+
required_attrs: vec![],
|
23
|
+
|
24
|
+
protocol_sanitizers: HashMap::new(),
|
25
|
+
}
|
26
|
+
}
|
27
|
+
}
|
28
|
+
|
19
29
|
#[derive(Clone, Debug)]
|
20
30
|
pub struct Sanitizer {
|
21
|
-
flags: [u8; Tag::TAG_COUNT],
|
31
|
+
flags: [u8; crate::tags::Tag::TAG_COUNT],
|
22
32
|
allowed_attrs: Vec<String>,
|
23
33
|
allowed_classes: Vec<String>,
|
24
34
|
element_sanitizers: HashMap<String, ElementSanitizer>,
|
@@ -39,7 +49,7 @@ impl SelmaSanitizer {
|
|
39
49
|
const SELMA_SANITIZER_REMOVE_CONTENTS: u8 = (1 << 2);
|
40
50
|
const SELMA_SANITIZER_WRAP_WHITESPACE: u8 = (1 << 3);
|
41
51
|
|
42
|
-
pub fn new(arguments: &[Value]) -> Result<Self, Error> {
|
52
|
+
pub fn new(arguments: &[Value]) -> Result<Self, magnus::Error> {
|
43
53
|
let args = scan_args::scan_args::<(), (Option<RHash>,), (), (), (), ()>(arguments)?;
|
44
54
|
let (opt_config,): (Option<RHash>,) = args.optional;
|
45
55
|
|
@@ -50,19 +60,16 @@ impl SelmaSanitizer {
|
|
50
60
|
};
|
51
61
|
|
52
62
|
let mut element_sanitizers = HashMap::new();
|
53
|
-
Tag::html_tags().iter().for_each(|html_tag| {
|
54
|
-
let es = ElementSanitizer
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
protocol_sanitizers: HashMap::new(),
|
60
|
-
};
|
61
|
-
element_sanitizers.insert(Tag::element_name_from_enum(html_tag).to_string(), es);
|
63
|
+
crate::tags::Tag::html_tags().iter().for_each(|html_tag| {
|
64
|
+
let es = ElementSanitizer::default();
|
65
|
+
element_sanitizers.insert(
|
66
|
+
crate::tags::Tag::element_name_from_enum(html_tag).to_string(),
|
67
|
+
es,
|
68
|
+
);
|
62
69
|
});
|
63
70
|
|
64
71
|
Ok(Self(std::cell::RefCell::new(Sanitizer {
|
65
|
-
flags: [0; Tag::TAG_COUNT],
|
72
|
+
flags: [0; crate::tags::Tag::TAG_COUNT],
|
66
73
|
allowed_attrs: vec![],
|
67
74
|
allowed_classes: vec![],
|
68
75
|
element_sanitizers,
|
@@ -74,7 +81,7 @@ impl SelmaSanitizer {
|
|
74
81
|
})))
|
75
82
|
}
|
76
83
|
|
77
|
-
fn get_config(&self) -> Result<RHash, Error> {
|
84
|
+
fn get_config(&self) -> Result<RHash, magnus::Error> {
|
78
85
|
let binding = self.0.borrow();
|
79
86
|
|
80
87
|
Ok(binding.config)
|
@@ -82,7 +89,7 @@ impl SelmaSanitizer {
|
|
82
89
|
|
83
90
|
/// Toggle a sanitizer option on or off.
|
84
91
|
fn set_flag(&self, tag_name: String, flag: u8, set: bool) {
|
85
|
-
let tag = Tag::tag_from_tag_name(tag_name.as_str());
|
92
|
+
let tag = crate::tags::Tag::tag_from_tag_name(tag_name.as_str());
|
86
93
|
if set {
|
87
94
|
self.0.borrow_mut().flags[tag.index] |= flag;
|
88
95
|
} else {
|
@@ -93,13 +100,19 @@ impl SelmaSanitizer {
|
|
93
100
|
/// Toggles all sanitization options on or off.
|
94
101
|
fn set_all_flags(&self, flag: u8, set: bool) {
|
95
102
|
if set {
|
96
|
-
Tag::html_tags()
|
97
|
-
|
98
|
-
|
103
|
+
crate::tags::Tag::html_tags()
|
104
|
+
.iter()
|
105
|
+
.enumerate()
|
106
|
+
.for_each(|(iter, _)| {
|
107
|
+
self.0.borrow_mut().flags[iter] |= flag;
|
108
|
+
});
|
99
109
|
} else {
|
100
|
-
Tag::html_tags()
|
101
|
-
|
102
|
-
|
110
|
+
crate::tags::Tag::html_tags()
|
111
|
+
.iter()
|
112
|
+
.enumerate()
|
113
|
+
.for_each(|(iter, _)| {
|
114
|
+
self.0.borrow_mut().flags[iter] &= flag;
|
115
|
+
});
|
103
116
|
}
|
104
117
|
}
|
105
118
|
|
@@ -111,8 +124,8 @@ impl SelmaSanitizer {
|
|
111
124
|
|
112
125
|
pub fn escape_tagfilter(&self, e: &mut Element) -> bool {
|
113
126
|
if self.0.borrow().escape_tagfilter {
|
114
|
-
let tag = Tag::tag_from_element(e);
|
115
|
-
if Tag::is_tag_escapeworthy(tag) {
|
127
|
+
let tag = crate::tags::Tag::tag_from_element(e);
|
128
|
+
if crate::tags::Tag::is_tag_escapeworthy(tag) {
|
116
129
|
e.remove();
|
117
130
|
return true;
|
118
131
|
}
|
@@ -162,7 +175,8 @@ impl SelmaSanitizer {
|
|
162
175
|
let allowed_attrs = &mut binding.allowed_attrs;
|
163
176
|
Self::set_allowed(allowed_attrs, &attr_name, allow);
|
164
177
|
} else {
|
165
|
-
let
|
178
|
+
let element_sanitizers = &mut binding.element_sanitizers;
|
179
|
+
let element_sanitizer = Self::get_element_sanitizer(element_sanitizers, &element_name);
|
166
180
|
|
167
181
|
element_sanitizer.allowed_attrs.push(attr_name);
|
168
182
|
}
|
@@ -176,7 +190,8 @@ impl SelmaSanitizer {
|
|
176
190
|
let allowed_classes = &mut binding.allowed_classes;
|
177
191
|
Self::set_allowed(allowed_classes, &class_name, allow);
|
178
192
|
} else {
|
179
|
-
let
|
193
|
+
let element_sanitizers = &mut binding.element_sanitizers;
|
194
|
+
let element_sanitizer = Self::get_element_sanitizer(element_sanitizers, &element_name);
|
180
195
|
|
181
196
|
let allowed_classes = element_sanitizer.allowed_classes.borrow_mut();
|
182
197
|
Self::set_allowed(allowed_classes, &class_name, allow)
|
@@ -187,9 +202,10 @@ impl SelmaSanitizer {
|
|
187
202
|
fn set_allowed_protocols(&self, element_name: String, attr_name: String, allow_list: RArray) {
|
188
203
|
let mut binding = self.0.borrow_mut();
|
189
204
|
|
190
|
-
let
|
205
|
+
let element_sanitizers = &mut binding.element_sanitizers;
|
206
|
+
let element_sanitizer = Self::get_element_sanitizer(element_sanitizers, &element_name);
|
191
207
|
|
192
|
-
let protocol_sanitizers = element_sanitizer.protocol_sanitizers.borrow_mut();
|
208
|
+
let protocol_sanitizers = &mut element_sanitizer.protocol_sanitizers.borrow_mut();
|
193
209
|
|
194
210
|
for opt_allowed_protocol in allow_list.each() {
|
195
211
|
let allowed_protocol = opt_allowed_protocol.unwrap();
|
@@ -229,10 +245,16 @@ impl SelmaSanitizer {
|
|
229
245
|
}
|
230
246
|
}
|
231
247
|
|
232
|
-
pub fn sanitize_attributes(&self, element: &mut Element) -> Result<(),
|
233
|
-
let
|
234
|
-
let
|
235
|
-
let element_sanitizer =
|
248
|
+
pub fn sanitize_attributes(&self, element: &mut Element) -> Result<(), AttributeNameError> {
|
249
|
+
let tag = crate::tags::Tag::tag_from_element(element);
|
250
|
+
let tag_name = &element.tag_name();
|
251
|
+
let element_sanitizer = {
|
252
|
+
let mut binding = self.0.borrow_mut();
|
253
|
+
let element_sanitizers = &mut binding.element_sanitizers;
|
254
|
+
Self::get_element_sanitizer(element_sanitizers, tag_name).clone()
|
255
|
+
};
|
256
|
+
|
257
|
+
let binding = self.0.borrow();
|
236
258
|
|
237
259
|
// FIXME: This is a hack to get around the fact that we can't borrow
|
238
260
|
let attribute_map: HashMap<String, String> = element
|
@@ -255,26 +277,30 @@ impl SelmaSanitizer {
|
|
255
277
|
let x = escapist::unescape_html(trimmed.as_bytes());
|
256
278
|
let unescaped_attr_val = String::from_utf8_lossy(&x).to_string();
|
257
279
|
|
258
|
-
|
280
|
+
let should_keep_attrubute = match Self::should_keep_attribute(
|
259
281
|
&binding,
|
260
282
|
element,
|
261
|
-
element_sanitizer,
|
283
|
+
&element_sanitizer,
|
262
284
|
attr_name,
|
263
285
|
&unescaped_attr_val,
|
264
286
|
) {
|
287
|
+
Ok(should_keep) => should_keep,
|
288
|
+
Err(e) => {
|
289
|
+
return Err(e);
|
290
|
+
}
|
291
|
+
};
|
292
|
+
|
293
|
+
if !should_keep_attrubute {
|
265
294
|
element.remove_attribute(attr_name);
|
266
295
|
} else {
|
267
296
|
// Prevent the use of `<meta>` elements that set a charset other than UTF-8,
|
268
297
|
// since output is always UTF-8.
|
269
|
-
if Tag::is_meta(tag) {
|
298
|
+
if crate::tags::Tag::is_meta(tag) {
|
270
299
|
if attr_name == "charset" && unescaped_attr_val != "utf-8" {
|
271
300
|
match element.set_attribute(attr_name, "utf-8") {
|
272
301
|
Ok(_) => {}
|
273
|
-
Err(
|
274
|
-
return Err(
|
275
|
-
exception::runtime_error(),
|
276
|
-
format!("Unable to change {attr_name:?}"),
|
277
|
-
));
|
302
|
+
Err(err) => {
|
303
|
+
return Err(err);
|
278
304
|
}
|
279
305
|
}
|
280
306
|
}
|
@@ -282,13 +308,17 @@ impl SelmaSanitizer {
|
|
282
308
|
let mut buf = String::new();
|
283
309
|
// ...then, escape any special characters, for security
|
284
310
|
if attr_name == "href" {
|
285
|
-
|
286
|
-
escapist::escape_href(&mut buf, unescaped_attr_val.to_string().as_str());
|
311
|
+
escapist::escape_href(&mut buf, unescaped_attr_val.as_str());
|
287
312
|
} else {
|
288
|
-
escapist::escape_html(&mut buf, unescaped_attr_val.
|
313
|
+
escapist::escape_html(&mut buf, unescaped_attr_val.as_str());
|
289
314
|
};
|
290
315
|
|
291
|
-
element.set_attribute(attr_name, &buf)
|
316
|
+
match element.set_attribute(attr_name, &buf) {
|
317
|
+
Ok(_) => {}
|
318
|
+
Err(err) => {
|
319
|
+
return Err(err);
|
320
|
+
}
|
321
|
+
}
|
292
322
|
}
|
293
323
|
}
|
294
324
|
}
|
@@ -308,12 +338,12 @@ impl SelmaSanitizer {
|
|
308
338
|
}
|
309
339
|
|
310
340
|
fn should_keep_attribute(
|
311
|
-
binding: &
|
341
|
+
binding: &Sanitizer,
|
312
342
|
element: &mut Element,
|
313
343
|
element_sanitizer: &ElementSanitizer,
|
314
344
|
attr_name: &String,
|
315
345
|
attr_val: &String,
|
316
|
-
) -> bool {
|
346
|
+
) -> Result<bool, AttributeNameError> {
|
317
347
|
let mut allowed: bool = false;
|
318
348
|
let element_allowed_attrs = element_sanitizer.allowed_attrs.contains(attr_name);
|
319
349
|
let sanitizer_allowed_attrs = binding.allowed_attrs.contains(attr_name);
|
@@ -327,7 +357,7 @@ impl SelmaSanitizer {
|
|
327
357
|
}
|
328
358
|
|
329
359
|
if !allowed {
|
330
|
-
return false;
|
360
|
+
return Ok(false);
|
331
361
|
}
|
332
362
|
|
333
363
|
let protocol_sanitizer_values = element_sanitizer.protocol_sanitizers.get(attr_name);
|
@@ -335,32 +365,29 @@ impl SelmaSanitizer {
|
|
335
365
|
None => {
|
336
366
|
// has a protocol, but no sanitization list
|
337
367
|
if !attr_val.is_empty() && Self::has_protocol(attr_val) {
|
338
|
-
return false;
|
368
|
+
return Ok(false);
|
339
369
|
}
|
340
370
|
}
|
341
371
|
Some(protocol_sanitizer_values) => {
|
342
372
|
if !attr_val.is_empty()
|
343
373
|
&& !Self::has_allowed_protocol(protocol_sanitizer_values, attr_val)
|
344
374
|
{
|
345
|
-
return false;
|
375
|
+
return Ok(false);
|
346
376
|
}
|
347
377
|
}
|
348
378
|
}
|
349
379
|
|
350
|
-
if attr_name == "class"
|
351
|
-
|
380
|
+
if attr_name == "class" {
|
381
|
+
return Self::sanitize_class_attribute(
|
352
382
|
binding,
|
353
383
|
element,
|
354
384
|
element_sanitizer,
|
355
385
|
attr_name,
|
356
386
|
attr_val,
|
357
|
-
)
|
358
|
-
.unwrap()
|
359
|
-
{
|
360
|
-
return false;
|
387
|
+
);
|
361
388
|
}
|
362
389
|
|
363
|
-
true
|
390
|
+
Ok(true)
|
364
391
|
}
|
365
392
|
|
366
393
|
fn has_protocol(attr_val: &str) -> bool {
|
@@ -398,12 +425,12 @@ impl SelmaSanitizer {
|
|
398
425
|
}
|
399
426
|
|
400
427
|
fn sanitize_class_attribute(
|
401
|
-
binding: &
|
428
|
+
binding: &Sanitizer,
|
402
429
|
element: &mut Element,
|
403
430
|
element_sanitizer: &ElementSanitizer,
|
404
431
|
attr_name: &str,
|
405
432
|
attr_val: &str,
|
406
|
-
) -> Result<bool,
|
433
|
+
) -> Result<bool, lol_html::errors::AttributeNameError> {
|
407
434
|
let allowed_global = &binding.allowed_classes;
|
408
435
|
|
409
436
|
let mut valid_classes: Vec<String> = vec![];
|
@@ -431,28 +458,25 @@ impl SelmaSanitizer {
|
|
431
458
|
|
432
459
|
match element.set_attribute(attr_name, valid_classes.join(" ").as_str()) {
|
433
460
|
Ok(_) => Ok(true),
|
434
|
-
Err(err) => Err(
|
435
|
-
exception::runtime_error(),
|
436
|
-
format!("AttributeNameError: {err:?}"),
|
437
|
-
)),
|
461
|
+
Err(err) => Err(err),
|
438
462
|
}
|
439
463
|
}
|
440
464
|
|
441
465
|
pub fn allow_element(&self, element: &mut Element) -> bool {
|
442
|
-
let tag = Tag::tag_from_element(element);
|
466
|
+
let tag = crate::tags::Tag::tag_from_element(element);
|
443
467
|
let flags: u8 = self.0.borrow().flags[tag.index];
|
444
468
|
|
445
469
|
(flags & Self::SELMA_SANITIZER_ALLOW) == 0
|
446
470
|
}
|
447
471
|
|
448
472
|
pub fn try_remove_element(&self, element: &mut Element) -> bool {
|
449
|
-
let tag = Tag::tag_from_element(element);
|
473
|
+
let tag = crate::tags::Tag::tag_from_element(element);
|
450
474
|
let flags: u8 = self.0.borrow().flags[tag.index];
|
451
475
|
|
452
476
|
let should_remove = !element.removed() && self.allow_element(element);
|
453
477
|
|
454
478
|
if should_remove {
|
455
|
-
if Tag::has_text_content(tag) {
|
479
|
+
if crate::tags::Tag::has_text_content(tag) {
|
456
480
|
Self::remove_element(
|
457
481
|
element,
|
458
482
|
tag.self_closing,
|
@@ -465,7 +489,7 @@ impl SelmaSanitizer {
|
|
465
489
|
Self::check_if_end_tag_needs_removal(element);
|
466
490
|
} else {
|
467
491
|
// anything in <iframe> must be removed, if it's kept
|
468
|
-
if Tag::is_iframe(tag) {
|
492
|
+
if crate::tags::Tag::is_iframe(tag) {
|
469
493
|
if self.0.borrow().flags[tag.index] != 0 {
|
470
494
|
element.set_inner_content(" ", ContentType::Text);
|
471
495
|
} else {
|
@@ -497,14 +521,14 @@ impl SelmaSanitizer {
|
|
497
521
|
}
|
498
522
|
|
499
523
|
pub fn force_remove_element(&self, element: &mut Element) {
|
500
|
-
let tag = Tag::tag_from_element(element);
|
524
|
+
let tag = crate::tags::Tag::tag_from_element(element);
|
501
525
|
let self_closing = tag.self_closing;
|
502
526
|
Self::remove_element(element, self_closing, Self::SELMA_SANITIZER_REMOVE_CONTENTS);
|
503
527
|
Self::check_if_end_tag_needs_removal(element);
|
504
528
|
}
|
505
529
|
|
506
530
|
fn check_if_end_tag_needs_removal(element: &mut Element) {
|
507
|
-
if element.removed() && !Tag::tag_from_element(element).self_closing {
|
531
|
+
if element.removed() && !crate::tags::Tag::tag_from_element(element).self_closing {
|
508
532
|
element
|
509
533
|
.on_end_tag(move |end| {
|
510
534
|
Self::remove_end_tag(end);
|
@@ -519,21 +543,16 @@ impl SelmaSanitizer {
|
|
519
543
|
}
|
520
544
|
|
521
545
|
fn get_element_sanitizer<'a>(
|
522
|
-
|
523
|
-
element_name: &str,
|
524
|
-
) -> &'a ElementSanitizer {
|
525
|
-
binding.element_sanitizers.get(element_name).unwrap()
|
526
|
-
}
|
527
|
-
|
528
|
-
fn get_mut_element_sanitizer<'a>(
|
529
|
-
binding: &'a mut Sanitizer,
|
546
|
+
element_sanitizers: &'a mut HashMap<String, ElementSanitizer>,
|
530
547
|
element_name: &str,
|
531
548
|
) -> &'a mut ElementSanitizer {
|
532
|
-
|
549
|
+
element_sanitizers
|
550
|
+
.entry(element_name.to_string())
|
551
|
+
.or_insert_with(ElementSanitizer::default)
|
533
552
|
}
|
534
553
|
}
|
535
554
|
|
536
|
-
pub fn init(m_selma: RModule) -> Result<(), Error> {
|
555
|
+
pub fn init(m_selma: RModule) -> Result<(), magnus::Error> {
|
537
556
|
let c_sanitizer = m_selma.define_class("Sanitizer", Default::default())?;
|
538
557
|
|
539
558
|
c_sanitizer.define_singleton_method("new", function!(SelmaSanitizer::new, -1))?;
|
data/ext/selma/src/tags.rs
CHANGED
@@ -192,14 +192,17 @@ impl Tag {
|
|
192
192
|
/// Is this tag something which needs to be removed?
|
193
193
|
pub fn is_tag_escapeworthy(tag: Tag) -> bool {
|
194
194
|
tag.index == HTMLTag::TITLE as usize
|
195
|
-
|| tag.index == HTMLTag::TEXTAREA as usize
|
196
|
-
|| tag.index == HTMLTag::STYLE as usize
|
197
|
-
|| tag.index == HTMLTag::XMP as usize
|
198
195
|
|| tag.index == HTMLTag::IFRAME as usize
|
196
|
+
|| tag.index == HTMLTag::MATH as usize
|
199
197
|
|| tag.index == HTMLTag::NOEMBED as usize
|
200
198
|
|| tag.index == HTMLTag::NOFRAMES as usize
|
201
|
-
|| tag.index == HTMLTag::
|
199
|
+
|| tag.index == HTMLTag::NOSCRIPT as usize
|
202
200
|
|| tag.index == HTMLTag::PLAINTEXT as usize
|
201
|
+
|| tag.index == HTMLTag::SCRIPT as usize
|
202
|
+
|| tag.index == HTMLTag::STYLE as usize
|
203
|
+
|| tag.index == HTMLTag::SVG as usize
|
204
|
+
|| tag.index == HTMLTag::TEXTAREA as usize
|
205
|
+
|| tag.index == HTMLTag::XMP as usize
|
203
206
|
}
|
204
207
|
|
205
208
|
pub const ESCAPEWORTHY_TAGS_CSS: &str =
|
data/lib/selma/3.1/selma.so
CHANGED
Binary file
|
@@ -3,6 +3,10 @@
|
|
3
3
|
module Selma
|
4
4
|
class Sanitizer
|
5
5
|
module Config
|
6
|
+
# although there are many more protocol types, eg., ftp, xmpp, etc.,
|
7
|
+
# these are the only ones that are allowed by default
|
8
|
+
VALID_PROTOCOLS = ["http", "https", "mailto", :relative]
|
9
|
+
|
6
10
|
DEFAULT = freeze_config(
|
7
11
|
# Whether or not to allow HTML comments. Allowing comments is strongly
|
8
12
|
# discouraged, since IE allows script execution within conditional
|
data/lib/selma/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: selma
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.5
|
5
5
|
platform: x64-mingw-ucrt
|
6
6
|
authors:
|
7
7
|
- Garen J. Torikian
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-12-
|
11
|
+
date: 2022-12-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|
@@ -81,6 +81,7 @@ files:
|
|
81
81
|
- ext/selma/src/html.rs
|
82
82
|
- ext/selma/src/html/element.rs
|
83
83
|
- ext/selma/src/html/end_tag.rs
|
84
|
+
- ext/selma/src/html/text_chunk.rs
|
84
85
|
- ext/selma/src/lib.rs
|
85
86
|
- ext/selma/src/native_ref_wrap.rs
|
86
87
|
- ext/selma/src/rewriter.rs
|