selma 0.0.2-x86_64-darwin → 0.0.4-x86_64-darwin
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +11 -10
- data/ext/selma/Cargo.toml +3 -3
- data/ext/selma/src/html/element.rs +41 -48
- data/ext/selma/src/html/end_tag.rs +2 -2
- data/ext/selma/src/html/text_chunk.rs +83 -0
- data/ext/selma/src/html.rs +2 -0
- data/ext/selma/src/lib.rs +28 -1
- data/ext/selma/src/native_ref_wrap.rs +3 -3
- data/ext/selma/src/rewriter.rs +47 -59
- data/ext/selma/src/sanitizer.rs +76 -57
- data/ext/selma/src/selector.rs +2 -5
- data/ext/selma/src/tags.rs +7 -4
- data/lib/selma/3.1/selma.bundle +0 -0
- data/lib/selma/sanitizer/config/default.rb +4 -0
- data/lib/selma/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5355712312797e3aabbf2c958807f12c397250836d7c4575848bf57852b9cecb
|
4
|
+
data.tar.gz: 5c8bf43375c4785fa152d6ea467efc3f7ef34f7fa0801f5473cc2b7f62296cf6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 29137ce7a58700f8b56414a11054ab295643d56c1fe079921fe69bf9f80c3879e22c3afd60e25961315e0a37b4dbbf4b1a278951404166ffb3571c55b0e75bef
|
7
|
+
data.tar.gz: 4dee67b422acddcd03f4168ae52e27ee9c403cab9233886d861a121ca26a0ee01968d0134b112409a7650141990aa5530053188880d644739b43181357ddfc15
|
data/README.md
CHANGED
@@ -56,6 +56,10 @@ allow_comments: false,
|
|
56
56
|
# "<!DOCTYPE html>" when sanitizing a document.
|
57
57
|
allow_doctype: false,
|
58
58
|
|
59
|
+
# HTML elements to allow. By default, no elements are allowed (which means
|
60
|
+
# that all HTML will be stripped).
|
61
|
+
elements: ["a", "b", "img", ],
|
62
|
+
|
59
63
|
# HTML attributes to allow in specific elements. The key is the name of the element,
|
60
64
|
# and the value is an array of allowed attributes. By default, no attributes
|
61
65
|
# are allowed.
|
@@ -64,14 +68,10 @@ attributes: {
|
|
64
68
|
"img" => ["src"],
|
65
69
|
},
|
66
70
|
|
67
|
-
# HTML elements to allow. By default, no elements are allowed (which means
|
68
|
-
# that all HTML will be stripped).
|
69
|
-
elements: ["a", "b", "img", ],
|
70
|
-
|
71
71
|
# URL handling protocols to allow in specific attributes. By default, no
|
72
72
|
# protocols are allowed. Use :relative in place of a protocol if you want
|
73
73
|
# to allow relative URLs sans protocol.
|
74
|
-
|
74
|
+
protocols: {
|
75
75
|
"a" => { "href" => ["http", "https", "mailto", :relative] },
|
76
76
|
"img" => { "href" => ["http", "https"] },
|
77
77
|
},
|
@@ -91,7 +91,7 @@ The real power in Selma comes in its use of handlers. A handler is simply an obj
|
|
91
91
|
|
92
92
|
- `selector`, a method which MUST return an instance of `Selma::Selector` which defines the CSS classes to match
|
93
93
|
- `handle_element`, a method that's called on each matched element
|
94
|
-
- `
|
94
|
+
- `handle_text_chunk`, a method that's called on each matched text node; this MUST return a string
|
95
95
|
|
96
96
|
Here's an example which rewrites the `href` attribute on `a` and the `src` attribute on `img` to be `https` rather than `http`.
|
97
97
|
|
@@ -118,7 +118,7 @@ rewriter = Selma::Rewriter.new(handlers: [MatchAttribute.new])
|
|
118
118
|
The `Selma::Selector` object has three possible kwargs:
|
119
119
|
|
120
120
|
- `match_element`: any element which matches this CSS rule will be passed on to `handle_element`
|
121
|
-
- `match_text_within`: any element which matches this CSS rule will be passed on to `
|
121
|
+
- `match_text_within`: any element which matches this CSS rule will be passed on to `handle_text_chunk`
|
122
122
|
- `ignore_text_within`: this is an array of element names whose text contents will be ignored
|
123
123
|
|
124
124
|
You've seen an example of `match_element`; here's one for `match_text_within` which changes strings in various elements which are _not_ `pre` or `code`:
|
@@ -132,7 +132,7 @@ class MatchText
|
|
132
132
|
SELECTOR
|
133
133
|
end
|
134
134
|
|
135
|
-
def
|
135
|
+
def handle_text_chunk(text)
|
136
136
|
string.sub(/@.+/, "<a href=\"www.yetto.app/#{Regexp.last_match}\">")
|
137
137
|
end
|
138
138
|
end
|
@@ -150,8 +150,9 @@ The `element` argument in `handle_element` has the following methods:
|
|
150
150
|
- `remove_attribute`: remove an attribute
|
151
151
|
- `attributes`: list all the attributes
|
152
152
|
- `ancestors`: list all the ancestors
|
153
|
-
- `append(content, content_type)`: appends `content` to the element's inner content, i.e. inserts content right before the element's end tag. `content_type` is either `:text` or `:html` and determines how the content will be applied.
|
154
|
-
- `
|
153
|
+
- `append(content, as: content_type)`: appends `content` to the element's inner content, i.e. inserts content right before the element's end tag. `content_type` is either `:text` or `:html` and determines how the content will be applied.
|
154
|
+
- `before(content, as: content_type)`: Inserts `content` before the element. `content_type` is either `:text` or `:html` and determines how the content will be applied.
|
155
|
+
- `after(content, as: content_type)`: Inserts `content` after the element. `content_type` is either `:text` or `:html` and determines how the content will be applied.
|
155
156
|
- `set_inner_content(content, as: content_type)`: replaces the inner content of the element with `content`. `content_type` is either `:text` or `:html` and determines how the content will be applied.
|
156
157
|
|
157
158
|
## Benchmarks
|
data/ext/selma/Cargo.toml
CHANGED
@@ -5,9 +5,9 @@ edition = "2021"
|
|
5
5
|
|
6
6
|
[dependencies]
|
7
7
|
enum-iterator = "1.2"
|
8
|
-
escapist = "0.0.
|
9
|
-
magnus = "
|
10
|
-
lol_html =
|
8
|
+
escapist = "0.0.2"
|
9
|
+
magnus = { git = "https://github.com/matsadler/magnus", rev = "23160f7229ac74c42da1b5096a65ccbc40962697" }
|
10
|
+
lol_html = "0.3"
|
11
11
|
|
12
12
|
[lib]
|
13
13
|
name = "selma"
|
@@ -1,8 +1,6 @@
|
|
1
|
-
use std::borrow::Cow;
|
2
|
-
|
3
1
|
use crate::native_ref_wrap::NativeRefWrap;
|
4
|
-
use lol_html::html_content::
|
5
|
-
use magnus::{exception, method, Error, Module, RArray, RClass, RHash, RString,
|
2
|
+
use lol_html::html_content::Element;
|
3
|
+
use magnus::{exception, method, Error, Module, RArray, RClass, RHash, RString, Value};
|
6
4
|
|
7
5
|
struct HTMLElement {
|
8
6
|
element: NativeRefWrap<Element<'static, 'static>>,
|
@@ -51,7 +49,7 @@ impl SelmaHTMLElement {
|
|
51
49
|
Ok(_) => Ok(value),
|
52
50
|
Err(err) => Err(Error::new(
|
53
51
|
exception::runtime_error(),
|
54
|
-
format!("AttributeNameError: {}"
|
52
|
+
format!("AttributeNameError: {err:?}"),
|
55
53
|
)),
|
56
54
|
}
|
57
55
|
} else {
|
@@ -81,7 +79,7 @@ impl SelmaHTMLElement {
|
|
81
79
|
Ok(_) => {}
|
82
80
|
Err(err) => Err(Error::new(
|
83
81
|
exception::runtime_error(),
|
84
|
-
format!("AttributeNameError: {}"
|
82
|
+
format!("AttributeNameError: {err:?}"),
|
85
83
|
))
|
86
84
|
.unwrap(),
|
87
85
|
});
|
@@ -99,80 +97,74 @@ impl SelmaHTMLElement {
|
|
99
97
|
.for_each(|ancestor| match array.push(RString::new(ancestor)) {
|
100
98
|
Ok(_) => {}
|
101
99
|
Err(err) => {
|
102
|
-
Err(Error::new(exception::runtime_error(), format!("{}"
|
100
|
+
Err(Error::new(exception::runtime_error(), format!("{err:?}"))).unwrap()
|
103
101
|
}
|
104
102
|
});
|
105
103
|
|
106
104
|
Ok(array)
|
107
105
|
}
|
108
106
|
|
109
|
-
fn
|
107
|
+
fn before(&self, args: &[Value]) -> Result<(), Error> {
|
110
108
|
let mut binding = self.0.borrow_mut();
|
111
109
|
let element = binding.element.get_mut().unwrap();
|
112
110
|
|
113
|
-
let text_str =
|
114
|
-
|
115
|
-
|
111
|
+
let (text_str, content_type) = match crate::scan_text_args(args) {
|
112
|
+
Ok((text_str, content_type)) => (text_str, content_type),
|
113
|
+
Err(err) => return Err(err),
|
114
|
+
};
|
116
115
|
|
117
|
-
element.
|
116
|
+
element.before(&text_str, content_type);
|
118
117
|
|
119
118
|
Ok(())
|
120
119
|
}
|
121
120
|
|
122
|
-
fn
|
123
|
-
&self,
|
124
|
-
start_text: String,
|
125
|
-
end_text: String,
|
126
|
-
content_type: Symbol,
|
127
|
-
) -> Result<(), Error> {
|
121
|
+
fn after(&self, args: &[Value]) -> Result<(), Error> {
|
128
122
|
let mut binding = self.0.borrow_mut();
|
129
123
|
let element = binding.element.get_mut().unwrap();
|
130
124
|
|
131
|
-
let
|
132
|
-
|
133
|
-
|
134
|
-
|
125
|
+
let (text_str, content_type) = match crate::scan_text_args(args) {
|
126
|
+
Ok((text_str, content_type)) => (text_str, content_type),
|
127
|
+
Err(err) => return Err(err),
|
128
|
+
};
|
129
|
+
|
130
|
+
element.after(&text_str, content_type);
|
135
131
|
|
136
132
|
Ok(())
|
137
133
|
}
|
138
134
|
|
139
|
-
fn
|
135
|
+
fn append(&self, args: &[Value]) -> Result<(), Error> {
|
140
136
|
let mut binding = self.0.borrow_mut();
|
141
137
|
let element = binding.element.get_mut().unwrap();
|
142
138
|
|
143
|
-
let text_str =
|
144
|
-
|
145
|
-
|
139
|
+
let (text_str, content_type) = match crate::scan_text_args(args) {
|
140
|
+
Ok((text_str, content_type)) => (text_str, content_type),
|
141
|
+
Err(err) => return Err(err),
|
142
|
+
};
|
146
143
|
|
147
|
-
element.
|
144
|
+
element.append(&text_str, content_type);
|
148
145
|
|
149
146
|
Ok(())
|
150
147
|
}
|
151
148
|
|
152
|
-
fn
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
exception::runtime_error(),
|
165
|
-
format!("Could not unwrap symbol"),
|
166
|
-
))
|
167
|
-
.unwrap(),
|
168
|
-
}
|
149
|
+
fn set_inner_content(&self, args: &[Value]) -> Result<(), Error> {
|
150
|
+
let mut binding = self.0.borrow_mut();
|
151
|
+
let element = binding.element.get_mut().unwrap();
|
152
|
+
|
153
|
+
let (inner_content, content_type) = match crate::scan_text_args(args) {
|
154
|
+
Ok((inner_content, content_type)) => (inner_content, content_type),
|
155
|
+
Err(err) => return Err(err),
|
156
|
+
};
|
157
|
+
|
158
|
+
element.set_inner_content(&inner_content, content_type);
|
159
|
+
|
160
|
+
Ok(())
|
169
161
|
}
|
170
162
|
}
|
171
163
|
|
172
164
|
pub fn init(c_html: RClass) -> Result<(), Error> {
|
173
165
|
let c_element = c_html
|
174
166
|
.define_class("Element", Default::default())
|
175
|
-
.expect("cannot find class Selma::Element");
|
167
|
+
.expect("cannot find class Selma::HTML::Element");
|
176
168
|
|
177
169
|
c_element.define_method("tag_name", method!(SelmaHTMLElement::tag_name, 0))?;
|
178
170
|
c_element.define_method("[]", method!(SelmaHTMLElement::get_attribute, 1))?;
|
@@ -184,11 +176,12 @@ pub fn init(c_html: RClass) -> Result<(), Error> {
|
|
184
176
|
c_element.define_method("attributes", method!(SelmaHTMLElement::get_attributes, 0))?;
|
185
177
|
c_element.define_method("ancestors", method!(SelmaHTMLElement::get_ancestors, 0))?;
|
186
178
|
|
187
|
-
c_element.define_method("
|
188
|
-
c_element.define_method("
|
179
|
+
c_element.define_method("before", method!(SelmaHTMLElement::before, -1))?;
|
180
|
+
c_element.define_method("after", method!(SelmaHTMLElement::after, -1))?;
|
181
|
+
c_element.define_method("append", method!(SelmaHTMLElement::append, -1))?;
|
189
182
|
c_element.define_method(
|
190
183
|
"set_inner_content",
|
191
|
-
method!(SelmaHTMLElement::set_inner_content,
|
184
|
+
method!(SelmaHTMLElement::set_inner_content, -1),
|
192
185
|
)?;
|
193
186
|
|
194
187
|
Ok(())
|
@@ -6,7 +6,7 @@ struct HTMLEndTag {
|
|
6
6
|
end_tag: NativeRefWrap<EndTag<'static>>,
|
7
7
|
}
|
8
8
|
|
9
|
-
#[magnus::wrap(class = "Selma::HTML::
|
9
|
+
#[magnus::wrap(class = "Selma::HTML::EndTag")]
|
10
10
|
pub struct SelmaHTMLEndTag(std::cell::RefCell<HTMLEndTag>);
|
11
11
|
|
12
12
|
/// SAFETY: This is safe because we only access this data when the GVL is held.
|
@@ -27,7 +27,7 @@ impl SelmaHTMLEndTag {
|
|
27
27
|
pub fn init(c_html: RClass) -> Result<(), Error> {
|
28
28
|
let c_end_tag = c_html
|
29
29
|
.define_class("EndTag", Default::default())
|
30
|
-
.expect("cannot find class Selma::EndTag");
|
30
|
+
.expect("cannot find class Selma::HTML::EndTag");
|
31
31
|
|
32
32
|
c_end_tag.define_method("tag_name", method!(SelmaHTMLEndTag::tag_name, 0))?;
|
33
33
|
|
@@ -0,0 +1,83 @@
|
|
1
|
+
use crate::native_ref_wrap::NativeRefWrap;
|
2
|
+
use lol_html::html_content::{TextChunk, TextType};
|
3
|
+
use magnus::{exception, method, Error, Module, RClass, Symbol, Value};
|
4
|
+
|
5
|
+
struct HTMLTextChunk {
|
6
|
+
text_chunk: NativeRefWrap<TextChunk<'static>>,
|
7
|
+
}
|
8
|
+
|
9
|
+
#[magnus::wrap(class = "Selma::HTML::TextChunk")]
|
10
|
+
pub struct SelmaHTMLTextChunk(std::cell::RefCell<HTMLTextChunk>);
|
11
|
+
|
12
|
+
/// SAFETY: This is safe because we only access this data when the GVL is held.
|
13
|
+
unsafe impl Send for SelmaHTMLTextChunk {}
|
14
|
+
|
15
|
+
impl SelmaHTMLTextChunk {
|
16
|
+
pub fn new(text_chunk: &mut TextChunk) -> Self {
|
17
|
+
let (ref_wrap, _anchor) = NativeRefWrap::wrap_mut(text_chunk);
|
18
|
+
|
19
|
+
Self(std::cell::RefCell::new(HTMLTextChunk {
|
20
|
+
text_chunk: ref_wrap,
|
21
|
+
}))
|
22
|
+
}
|
23
|
+
|
24
|
+
fn to_s(&self) -> Result<String, Error> {
|
25
|
+
let binding = self.0.borrow();
|
26
|
+
|
27
|
+
if let Ok(tc) = binding.text_chunk.get() {
|
28
|
+
Ok(tc.as_str().to_string())
|
29
|
+
} else {
|
30
|
+
Err(Error::new(
|
31
|
+
exception::runtime_error(),
|
32
|
+
"`to_s` is not available",
|
33
|
+
))
|
34
|
+
}
|
35
|
+
}
|
36
|
+
|
37
|
+
fn text_type(&self) -> Result<Symbol, Error> {
|
38
|
+
let binding = self.0.borrow();
|
39
|
+
|
40
|
+
if let Ok(tc) = binding.text_chunk.get() {
|
41
|
+
match tc.text_type() {
|
42
|
+
TextType::Data => Ok(Symbol::from("data")),
|
43
|
+
TextType::PlainText => Ok(Symbol::from("plain_text")),
|
44
|
+
TextType::RawText => Ok(Symbol::from("raw_text")),
|
45
|
+
TextType::ScriptData => Ok(Symbol::from("script")),
|
46
|
+
TextType::RCData => Ok(Symbol::from("rc_data")),
|
47
|
+
TextType::CDataSection => Ok(Symbol::from("cdata_section")),
|
48
|
+
}
|
49
|
+
} else {
|
50
|
+
Err(Error::new(
|
51
|
+
exception::runtime_error(),
|
52
|
+
"`text_type` is not available",
|
53
|
+
))
|
54
|
+
}
|
55
|
+
}
|
56
|
+
|
57
|
+
fn replace(&self, args: &[Value]) -> Result<(), Error> {
|
58
|
+
let mut binding = self.0.borrow_mut();
|
59
|
+
let text_chunk = binding.text_chunk.get_mut().unwrap();
|
60
|
+
|
61
|
+
let (text_str, content_type) = match crate::scan_text_args(args) {
|
62
|
+
Ok((text_str, content_type)) => (text_str, content_type),
|
63
|
+
Err(err) => return Err(err),
|
64
|
+
};
|
65
|
+
|
66
|
+
text_chunk.replace(&text_str, content_type);
|
67
|
+
|
68
|
+
Ok(())
|
69
|
+
}
|
70
|
+
}
|
71
|
+
|
72
|
+
pub fn init(c_html: RClass) -> Result<(), Error> {
|
73
|
+
let c_text_chunk = c_html
|
74
|
+
.define_class("TextChunk", Default::default())
|
75
|
+
.expect("cannot find class Selma::HTML::TextChunk");
|
76
|
+
|
77
|
+
c_text_chunk.define_method("to_s", method!(SelmaHTMLTextChunk::to_s, 0))?;
|
78
|
+
c_text_chunk.define_method("content", method!(SelmaHTMLTextChunk::to_s, 0))?;
|
79
|
+
c_text_chunk.define_method("text_type", method!(SelmaHTMLTextChunk::text_type, 0))?;
|
80
|
+
c_text_chunk.define_method("replace", method!(SelmaHTMLTextChunk::replace, -1))?;
|
81
|
+
|
82
|
+
Ok(())
|
83
|
+
}
|
data/ext/selma/src/html.rs
CHANGED
@@ -9,9 +9,11 @@ pub fn init(m_selma: RModule) -> Result<(), Error> {
|
|
9
9
|
|
10
10
|
element::init(c_html).expect("cannot define Selma::HTML::Element class");
|
11
11
|
end_tag::init(c_html).expect("cannot define Selma::HTML::EndTag class");
|
12
|
+
text_chunk::init(c_html).expect("cannot define Selma::HTML::TextChunk class");
|
12
13
|
|
13
14
|
Ok(())
|
14
15
|
}
|
15
16
|
|
16
17
|
pub mod element;
|
17
18
|
pub mod end_tag;
|
19
|
+
pub mod text_chunk;
|
data/ext/selma/src/lib.rs
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
extern crate core;
|
2
2
|
|
3
|
-
use
|
3
|
+
use lol_html::html_content::ContentType;
|
4
|
+
use magnus::{define_module, exception, scan_args, Error, Symbol, Value};
|
4
5
|
|
5
6
|
pub mod html;
|
6
7
|
pub mod native_ref_wrap;
|
@@ -10,6 +11,32 @@ pub mod selector;
|
|
10
11
|
pub mod tags;
|
11
12
|
pub mod wrapped_struct;
|
12
13
|
|
14
|
+
#[allow(clippy::let_unit_value)]
|
15
|
+
fn scan_text_args(args: &[Value]) -> Result<(String, ContentType), magnus::Error> {
|
16
|
+
let args = scan_args::scan_args(args)?;
|
17
|
+
let (text,): (String,) = args.required;
|
18
|
+
let _: () = args.optional;
|
19
|
+
let _: () = args.splat;
|
20
|
+
let _: () = args.trailing;
|
21
|
+
let _: () = args.block;
|
22
|
+
|
23
|
+
let kwargs = scan_args::get_kwargs::<_, (Symbol,), (), ()>(args.keywords, &["as"], &[])?;
|
24
|
+
let as_sym = kwargs.required.0;
|
25
|
+
let as_sym_str = as_sym.name().unwrap();
|
26
|
+
let content_type = if as_sym_str == "text" {
|
27
|
+
ContentType::Text
|
28
|
+
} else if as_sym_str == "html" {
|
29
|
+
ContentType::Html
|
30
|
+
} else {
|
31
|
+
return Err(Error::new(
|
32
|
+
exception::runtime_error(),
|
33
|
+
format!("unknown symbol `{as_sym_str:?}`"),
|
34
|
+
));
|
35
|
+
};
|
36
|
+
|
37
|
+
Ok((text, content_type))
|
38
|
+
}
|
39
|
+
|
13
40
|
#[magnus::init]
|
14
41
|
fn init() -> Result<(), Error> {
|
15
42
|
let m_selma = define_module("Selma").expect("cannot define ::Selma module");
|
@@ -1,4 +1,4 @@
|
|
1
|
-
use std::{cell::Cell, marker::PhantomData,
|
1
|
+
use std::{cell::Cell, marker::PhantomData, rc::Rc};
|
2
2
|
|
3
3
|
// NOTE: My Rust isn't good enough to know what any of this does,
|
4
4
|
// but it was taken from https://github.com/cloudflare/lol-html/blob/1a1ab2e2bf896f815fe8888ed78ccdf46d7c6b85/js-api/src/lib.rs#LL38
|
@@ -37,7 +37,7 @@ pub struct NativeRefWrap<R> {
|
|
37
37
|
impl<R> NativeRefWrap<R> {
|
38
38
|
pub fn wrap<I>(inner: &I) -> (Self, Anchor) {
|
39
39
|
let wrap = NativeRefWrap {
|
40
|
-
inner_ptr:
|
40
|
+
inner_ptr: inner as *const I as *mut R,
|
41
41
|
poisoned: Rc::new(Cell::new(false)),
|
42
42
|
};
|
43
43
|
|
@@ -48,7 +48,7 @@ impl<R> NativeRefWrap<R> {
|
|
48
48
|
|
49
49
|
pub fn wrap_mut<I>(inner: &mut I) -> (Self, Anchor) {
|
50
50
|
let wrap = NativeRefWrap {
|
51
|
-
inner_ptr:
|
51
|
+
inner_ptr: inner as *mut I as *mut R,
|
52
52
|
poisoned: Rc::new(Cell::new(false)),
|
53
53
|
};
|
54
54
|
|
data/ext/selma/src/rewriter.rs
CHANGED
@@ -1,14 +1,14 @@
|
|
1
|
-
use std::{borrow::Cow, cell::RefCell, rc::Rc};
|
2
|
-
|
3
1
|
use lol_html::{
|
4
2
|
doc_comments, doctype, element,
|
5
|
-
html_content::{
|
3
|
+
html_content::{Element, EndTag, TextChunk},
|
6
4
|
text, DocumentContentHandlers, ElementContentHandlers, HtmlRewriter, Selector, Settings,
|
7
5
|
};
|
8
6
|
use magnus::{exception, function, method, scan_args, Module, Object, RArray, RModule, Value};
|
9
7
|
|
8
|
+
use std::{borrow::Cow, cell::RefCell, primitive::str, rc::Rc};
|
9
|
+
|
10
10
|
use crate::{
|
11
|
-
html::{element::SelmaHTMLElement, end_tag::SelmaHTMLEndTag},
|
11
|
+
html::{element::SelmaHTMLElement, end_tag::SelmaHTMLEndTag, text_chunk::SelmaHTMLTextChunk},
|
12
12
|
sanitizer::SelmaSanitizer,
|
13
13
|
selector::SelmaSelector,
|
14
14
|
tags::Tag,
|
@@ -43,7 +43,7 @@ unsafe impl Send for SelmaRewriter {}
|
|
43
43
|
impl SelmaRewriter {
|
44
44
|
const SELMA_ON_END_TAG: &str = "on_end_tag";
|
45
45
|
const SELMA_HANDLE_ELEMENT: &str = "handle_element";
|
46
|
-
const
|
46
|
+
const SELMA_HANDLE_TEXT_CHUNK: &str = "handle_text_chunk";
|
47
47
|
|
48
48
|
/// @yard
|
49
49
|
/// @def new(sanitizer: Selma::Sanitizer.new(Selma::Sanitizer::Config::DEFAULT), handlers: [])
|
@@ -83,18 +83,18 @@ impl SelmaRewriter {
|
|
83
83
|
return Err(magnus::Error::new(
|
84
84
|
exception::no_method_error(),
|
85
85
|
format!(
|
86
|
-
"Could not call #selector on {:?}; is this an object that defines it?",
|
87
|
-
|
86
|
+
"Could not call #selector on {classname:?}; is this an object that defines it?",
|
87
|
+
|
88
88
|
),
|
89
89
|
));
|
90
90
|
}
|
91
91
|
|
92
92
|
let rb_selector: WrappedStruct<SelmaSelector> =
|
93
93
|
match rb_handler.funcall("selector", ()) {
|
94
|
-
Err(
|
94
|
+
Err(err) => {
|
95
95
|
return Err(magnus::Error::new(
|
96
96
|
exception::type_error(),
|
97
|
-
format!("Error instantiating selector: {}"
|
97
|
+
format!("Error instantiating selector: {err:?}"),
|
98
98
|
));
|
99
99
|
}
|
100
100
|
Ok(rb_selector) => rb_selector,
|
@@ -145,7 +145,7 @@ impl SelmaRewriter {
|
|
145
145
|
let _: () = args.trailing;
|
146
146
|
let _: () = args.block;
|
147
147
|
|
148
|
-
let
|
148
|
+
let kwargs = scan_args::get_kwargs::<
|
149
149
|
_,
|
150
150
|
(),
|
151
151
|
(
|
@@ -154,7 +154,7 @@ impl SelmaRewriter {
|
|
154
154
|
),
|
155
155
|
(),
|
156
156
|
>(args.keywords, &[], &["sanitizer", "handlers"])?;
|
157
|
-
let (rb_sanitizer, rb_handlers) =
|
157
|
+
let (rb_sanitizer, rb_handlers) = kwargs.optional;
|
158
158
|
|
159
159
|
Ok((rb_sanitizer, rb_handlers))
|
160
160
|
}
|
@@ -162,28 +162,22 @@ impl SelmaRewriter {
|
|
162
162
|
/// Perform HTML rewrite sequence.
|
163
163
|
fn rewrite(&self, html: String) -> Result<String, magnus::Error> {
|
164
164
|
let sanitized_html = match &self.0.borrow().sanitizer {
|
165
|
-
None => html,
|
165
|
+
None => Ok(html),
|
166
166
|
Some(sanitizer) => {
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
// we need to run sanitization several times to truly remove unwanted tags,
|
172
|
-
// because lol-html happily accepts this garbage (by design?)
|
173
|
-
let sanitized_html = Self::perform_sanitization(sanitizer, &html).unwrap();
|
167
|
+
let sanitized_html = match Self::perform_sanitization(sanitizer, &html) {
|
168
|
+
Ok(sanitized_html) => sanitized_html,
|
169
|
+
Err(err) => return Err(err),
|
170
|
+
};
|
174
171
|
|
175
|
-
String::from_utf8(sanitized_html)
|
172
|
+
String::from_utf8(sanitized_html)
|
176
173
|
}
|
177
174
|
};
|
178
175
|
let binding = self.0.borrow_mut();
|
179
176
|
let handlers = &binding.handlers;
|
180
177
|
|
181
|
-
match Self::perform_handler_rewrite(self, handlers, sanitized_html) {
|
178
|
+
match Self::perform_handler_rewrite(self, handlers, sanitized_html.unwrap()) {
|
182
179
|
Ok(rewritten_html) => Ok(String::from_utf8(rewritten_html).unwrap()),
|
183
|
-
Err(err) => Err(
|
184
|
-
exception::runtime_error(),
|
185
|
-
format!("{}", err),
|
186
|
-
)),
|
180
|
+
Err(err) => Err(err),
|
187
181
|
}
|
188
182
|
}
|
189
183
|
|
@@ -214,10 +208,12 @@ impl SelmaRewriter {
|
|
214
208
|
if el.removed() {
|
215
209
|
return Ok(());
|
216
210
|
}
|
217
|
-
sanitizer.sanitize_attributes(el)
|
218
|
-
|
219
|
-
|
211
|
+
match sanitizer.sanitize_attributes(el) {
|
212
|
+
Ok(_) => Ok(()),
|
213
|
+
Err(err) => Err(err.to_string().into()),
|
214
|
+
}
|
220
215
|
})],
|
216
|
+
// TODO: allow for MemorySettings to be defined
|
221
217
|
..Settings::default()
|
222
218
|
},
|
223
219
|
|c: &[u8]| first_pass_html.extend_from_slice(c),
|
@@ -342,7 +338,7 @@ impl SelmaRewriter {
|
|
342
338
|
let mut stack = closure_element_stack.as_ref().borrow_mut();
|
343
339
|
stack.pop();
|
344
340
|
Ok(())
|
345
|
-
})
|
341
|
+
})?;
|
346
342
|
Ok(())
|
347
343
|
}));
|
348
344
|
});
|
@@ -361,7 +357,7 @@ impl SelmaRewriter {
|
|
361
357
|
Err(err) => {
|
362
358
|
return Err(magnus::Error::new(
|
363
359
|
exception::runtime_error(),
|
364
|
-
format!("{}"
|
360
|
+
format!("{err:?}"),
|
365
361
|
));
|
366
362
|
}
|
367
363
|
}
|
@@ -372,17 +368,18 @@ impl SelmaRewriter {
|
|
372
368
|
fn process_element_handlers(
|
373
369
|
rb_handler: Value,
|
374
370
|
element: &mut Element,
|
375
|
-
ancestors: &
|
371
|
+
ancestors: &[String],
|
376
372
|
) -> Result<(), magnus::Error> {
|
377
373
|
// if `on_end_tag` function is defined, call it
|
378
374
|
if rb_handler.respond_to(Self::SELMA_ON_END_TAG, true).unwrap() {
|
375
|
+
// TODO: error here is an "EndTagError"
|
379
376
|
element.on_end_tag(move |end_tag| {
|
380
377
|
let rb_end_tag = SelmaHTMLEndTag::new(end_tag);
|
381
378
|
|
382
|
-
rb_handler
|
383
|
-
|
384
|
-
.
|
385
|
-
|
379
|
+
match rb_handler.funcall::<_, _, Value>(Self::SELMA_ON_END_TAG, (rb_end_tag,)) {
|
380
|
+
Ok(_) => Ok(()),
|
381
|
+
Err(err) => Err(err.to_string().into()),
|
382
|
+
}
|
386
383
|
});
|
387
384
|
}
|
388
385
|
|
@@ -391,39 +388,30 @@ impl SelmaRewriter {
|
|
391
388
|
rb_handler.funcall::<_, _, Value>(Self::SELMA_HANDLE_ELEMENT, (rb_element,));
|
392
389
|
match rb_result {
|
393
390
|
Ok(_) => Ok(()),
|
394
|
-
Err(err) => Err(
|
395
|
-
exception::runtime_error(),
|
396
|
-
format!("{}", err),
|
397
|
-
)),
|
391
|
+
Err(err) => Err(err),
|
398
392
|
}
|
399
393
|
}
|
400
394
|
|
401
|
-
fn process_text_handlers(
|
402
|
-
|
403
|
-
|
395
|
+
fn process_text_handlers(
|
396
|
+
rb_handler: Value,
|
397
|
+
text_chunk: &mut TextChunk,
|
398
|
+
) -> Result<(), magnus::Error> {
|
399
|
+
// prevents missing `handle_text_chunk` function
|
400
|
+
let content = text_chunk.as_str();
|
404
401
|
|
405
|
-
//
|
402
|
+
// seems that sometimes lol-html returns blank text / EOLs?
|
406
403
|
if content.is_empty() {
|
407
404
|
return Ok(());
|
408
405
|
}
|
409
|
-
let rb_result = rb_handler.funcall(Self::SELMA_HANDLE_TEXT, (content,));
|
410
406
|
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
),
|
419
|
-
));
|
407
|
+
let rb_text_chunk = SelmaHTMLTextChunk::new(text_chunk);
|
408
|
+
match rb_handler.funcall::<_, _, Value>(Self::SELMA_HANDLE_TEXT_CHUNK, (rb_text_chunk,)) {
|
409
|
+
Ok(_) => Ok(()),
|
410
|
+
Err(err) => Err(magnus::Error::new(
|
411
|
+
exception::runtime_error(),
|
412
|
+
format!("{err:?}"),
|
413
|
+
)),
|
420
414
|
}
|
421
|
-
|
422
|
-
let new_content: String = rb_result.unwrap();
|
423
|
-
// TODO: can this be an option?
|
424
|
-
text.replace(&new_content, ContentType::Html);
|
425
|
-
|
426
|
-
Ok(())
|
427
415
|
}
|
428
416
|
}
|
429
417
|
|
data/ext/selma/src/sanitizer.rs
CHANGED
@@ -1,12 +1,10 @@
|
|
1
1
|
use std::{borrow::BorrowMut, cell::RefMut, collections::HashMap};
|
2
2
|
|
3
|
-
use lol_html::
|
4
|
-
|
5
|
-
|
6
|
-
Value,
|
3
|
+
use lol_html::{
|
4
|
+
errors::AttributeNameError,
|
5
|
+
html_content::{Comment, ContentType, Doctype, Element, EndTag},
|
7
6
|
};
|
8
|
-
|
9
|
-
use crate::tags::Tag;
|
7
|
+
use magnus::{class, function, method, scan_args, Module, Object, RArray, RHash, RModule, Value};
|
10
8
|
|
11
9
|
#[derive(Clone, Debug)]
|
12
10
|
struct ElementSanitizer {
|
@@ -18,7 +16,7 @@ struct ElementSanitizer {
|
|
18
16
|
|
19
17
|
#[derive(Clone, Debug)]
|
20
18
|
pub struct Sanitizer {
|
21
|
-
flags: [u8; Tag::TAG_COUNT],
|
19
|
+
flags: [u8; crate::tags::Tag::TAG_COUNT],
|
22
20
|
allowed_attrs: Vec<String>,
|
23
21
|
allowed_classes: Vec<String>,
|
24
22
|
element_sanitizers: HashMap<String, ElementSanitizer>,
|
@@ -35,11 +33,11 @@ pub struct SelmaSanitizer(std::cell::RefCell<Sanitizer>);
|
|
35
33
|
|
36
34
|
impl SelmaSanitizer {
|
37
35
|
const SELMA_SANITIZER_ALLOW: u8 = (1 << 0);
|
38
|
-
const SELMA_SANITIZER_ESCAPE_TAGFILTER: u8 = (1 << 1);
|
36
|
+
// const SELMA_SANITIZER_ESCAPE_TAGFILTER: u8 = (1 << 1);
|
39
37
|
const SELMA_SANITIZER_REMOVE_CONTENTS: u8 = (1 << 2);
|
40
38
|
const SELMA_SANITIZER_WRAP_WHITESPACE: u8 = (1 << 3);
|
41
39
|
|
42
|
-
pub fn new(arguments: &[Value]) -> Result<Self, Error> {
|
40
|
+
pub fn new(arguments: &[Value]) -> Result<Self, magnus::Error> {
|
43
41
|
let args = scan_args::scan_args::<(), (Option<RHash>,), (), (), (), ()>(arguments)?;
|
44
42
|
let (opt_config,): (Option<RHash>,) = args.optional;
|
45
43
|
|
@@ -50,7 +48,7 @@ impl SelmaSanitizer {
|
|
50
48
|
};
|
51
49
|
|
52
50
|
let mut element_sanitizers = HashMap::new();
|
53
|
-
Tag::html_tags().iter().for_each(|html_tag| {
|
51
|
+
crate::tags::Tag::html_tags().iter().for_each(|html_tag| {
|
54
52
|
let es = ElementSanitizer {
|
55
53
|
allowed_attrs: vec![],
|
56
54
|
allowed_classes: vec![],
|
@@ -58,11 +56,14 @@ impl SelmaSanitizer {
|
|
58
56
|
|
59
57
|
protocol_sanitizers: HashMap::new(),
|
60
58
|
};
|
61
|
-
element_sanitizers.insert(
|
59
|
+
element_sanitizers.insert(
|
60
|
+
crate::tags::Tag::element_name_from_enum(html_tag).to_string(),
|
61
|
+
es,
|
62
|
+
);
|
62
63
|
});
|
63
64
|
|
64
65
|
Ok(Self(std::cell::RefCell::new(Sanitizer {
|
65
|
-
flags: [0; Tag::TAG_COUNT],
|
66
|
+
flags: [0; crate::tags::Tag::TAG_COUNT],
|
66
67
|
allowed_attrs: vec![],
|
67
68
|
allowed_classes: vec![],
|
68
69
|
element_sanitizers,
|
@@ -74,7 +75,7 @@ impl SelmaSanitizer {
|
|
74
75
|
})))
|
75
76
|
}
|
76
77
|
|
77
|
-
fn get_config(&self) -> Result<RHash, Error> {
|
78
|
+
fn get_config(&self) -> Result<RHash, magnus::Error> {
|
78
79
|
let binding = self.0.borrow();
|
79
80
|
|
80
81
|
Ok(binding.config)
|
@@ -82,7 +83,7 @@ impl SelmaSanitizer {
|
|
82
83
|
|
83
84
|
/// Toggle a sanitizer option on or off.
|
84
85
|
fn set_flag(&self, tag_name: String, flag: u8, set: bool) {
|
85
|
-
let tag = Tag::tag_from_tag_name(tag_name.as_str());
|
86
|
+
let tag = crate::tags::Tag::tag_from_tag_name(tag_name.as_str());
|
86
87
|
if set {
|
87
88
|
self.0.borrow_mut().flags[tag.index] |= flag;
|
88
89
|
} else {
|
@@ -93,13 +94,19 @@ impl SelmaSanitizer {
|
|
93
94
|
/// Toggles all sanitization options on or off.
|
94
95
|
fn set_all_flags(&self, flag: u8, set: bool) {
|
95
96
|
if set {
|
96
|
-
Tag::html_tags()
|
97
|
-
|
98
|
-
|
97
|
+
crate::tags::Tag::html_tags()
|
98
|
+
.iter()
|
99
|
+
.enumerate()
|
100
|
+
.for_each(|(iter, _)| {
|
101
|
+
self.0.borrow_mut().flags[iter] |= flag;
|
102
|
+
});
|
99
103
|
} else {
|
100
|
-
Tag::html_tags()
|
101
|
-
|
102
|
-
|
104
|
+
crate::tags::Tag::html_tags()
|
105
|
+
.iter()
|
106
|
+
.enumerate()
|
107
|
+
.for_each(|(iter, _)| {
|
108
|
+
self.0.borrow_mut().flags[iter] &= flag;
|
109
|
+
});
|
103
110
|
}
|
104
111
|
}
|
105
112
|
|
@@ -111,8 +118,8 @@ impl SelmaSanitizer {
|
|
111
118
|
|
112
119
|
pub fn escape_tagfilter(&self, e: &mut Element) -> bool {
|
113
120
|
if self.0.borrow().escape_tagfilter {
|
114
|
-
let tag = Tag::tag_from_element(e);
|
115
|
-
if Tag::is_tag_escapeworthy(tag) {
|
121
|
+
let tag = crate::tags::Tag::tag_from_element(e);
|
122
|
+
if crate::tags::Tag::is_tag_escapeworthy(tag) {
|
116
123
|
e.remove();
|
117
124
|
return true;
|
118
125
|
}
|
@@ -229,9 +236,9 @@ impl SelmaSanitizer {
|
|
229
236
|
}
|
230
237
|
}
|
231
238
|
|
232
|
-
pub fn sanitize_attributes(&self, element: &mut Element) {
|
239
|
+
pub fn sanitize_attributes(&self, element: &mut Element) -> Result<(), AttributeNameError> {
|
233
240
|
let binding = self.0.borrow_mut();
|
234
|
-
let tag = Tag::tag_from_element(element);
|
241
|
+
let tag = crate::tags::Tag::tag_from_element(element);
|
235
242
|
let element_sanitizer = Self::get_element_sanitizer(&binding, &element.tag_name());
|
236
243
|
|
237
244
|
// FIXME: This is a hack to get around the fact that we can't borrow
|
@@ -247,7 +254,7 @@ impl SelmaSanitizer {
|
|
247
254
|
// encountered, remove the entire element to be safe.
|
248
255
|
if attr_name.starts_with("<!--") {
|
249
256
|
Self::force_remove_element(self, element);
|
250
|
-
return;
|
257
|
+
return Ok(());
|
251
258
|
}
|
252
259
|
|
253
260
|
// first, trim leading spaces and unescape any encodings
|
@@ -255,46 +262,64 @@ impl SelmaSanitizer {
|
|
255
262
|
let x = escapist::unescape_html(trimmed.as_bytes());
|
256
263
|
let unescaped_attr_val = String::from_utf8_lossy(&x).to_string();
|
257
264
|
|
258
|
-
|
265
|
+
let should_keep_attrubute = match Self::should_keep_attribute(
|
259
266
|
&binding,
|
260
267
|
element,
|
261
268
|
element_sanitizer,
|
262
269
|
attr_name,
|
263
270
|
&unescaped_attr_val,
|
264
271
|
) {
|
272
|
+
Ok(should_keep) => should_keep,
|
273
|
+
Err(e) => {
|
274
|
+
return Err(e);
|
275
|
+
}
|
276
|
+
};
|
277
|
+
|
278
|
+
if !should_keep_attrubute {
|
265
279
|
element.remove_attribute(attr_name);
|
266
280
|
} else {
|
267
281
|
// Prevent the use of `<meta>` elements that set a charset other than UTF-8,
|
268
282
|
// since output is always UTF-8.
|
269
|
-
if Tag::is_meta(tag) {
|
283
|
+
if crate::tags::Tag::is_meta(tag) {
|
270
284
|
if attr_name == "charset" && unescaped_attr_val != "utf-8" {
|
271
|
-
element.set_attribute(attr_name, "utf-8")
|
285
|
+
match element.set_attribute(attr_name, "utf-8") {
|
286
|
+
Ok(_) => {}
|
287
|
+
Err(err) => {
|
288
|
+
return Err(err);
|
289
|
+
}
|
290
|
+
}
|
272
291
|
}
|
273
292
|
} else if !unescaped_attr_val.is_empty() {
|
274
293
|
let mut buf = String::new();
|
275
294
|
// ...then, escape any special characters, for security
|
276
295
|
if attr_name == "href" {
|
277
|
-
|
278
|
-
escapist::escape_href(&mut buf, unescaped_attr_val.to_string().as_str());
|
296
|
+
escapist::escape_href(&mut buf, unescaped_attr_val.as_str());
|
279
297
|
} else {
|
280
|
-
escapist::escape_html(&mut buf, unescaped_attr_val.
|
298
|
+
escapist::escape_html(&mut buf, unescaped_attr_val.as_str());
|
281
299
|
};
|
282
300
|
|
283
|
-
element.set_attribute(attr_name, &buf)
|
301
|
+
match element.set_attribute(attr_name, &buf) {
|
302
|
+
Ok(_) => {}
|
303
|
+
Err(err) => {
|
304
|
+
return Err(err);
|
305
|
+
}
|
306
|
+
}
|
284
307
|
}
|
285
308
|
}
|
286
309
|
}
|
287
310
|
|
288
311
|
let required = &element_sanitizer.required_attrs;
|
289
312
|
if required.contains(&"*".to_string()) {
|
290
|
-
return;
|
313
|
+
return Ok(());
|
291
314
|
}
|
292
315
|
for attr in element.attributes().iter() {
|
293
316
|
let attr_name = &attr.name();
|
294
317
|
if required.contains(attr_name) {
|
295
|
-
return;
|
318
|
+
return Ok(());
|
296
319
|
}
|
297
320
|
}
|
321
|
+
|
322
|
+
Ok(())
|
298
323
|
}
|
299
324
|
|
300
325
|
fn should_keep_attribute(
|
@@ -303,7 +328,7 @@ impl SelmaSanitizer {
|
|
303
328
|
element_sanitizer: &ElementSanitizer,
|
304
329
|
attr_name: &String,
|
305
330
|
attr_val: &String,
|
306
|
-
) -> bool {
|
331
|
+
) -> Result<bool, AttributeNameError> {
|
307
332
|
let mut allowed: bool = false;
|
308
333
|
let element_allowed_attrs = element_sanitizer.allowed_attrs.contains(attr_name);
|
309
334
|
let sanitizer_allowed_attrs = binding.allowed_attrs.contains(attr_name);
|
@@ -317,7 +342,7 @@ impl SelmaSanitizer {
|
|
317
342
|
}
|
318
343
|
|
319
344
|
if !allowed {
|
320
|
-
return false;
|
345
|
+
return Ok(false);
|
321
346
|
}
|
322
347
|
|
323
348
|
let protocol_sanitizer_values = element_sanitizer.protocol_sanitizers.get(attr_name);
|
@@ -325,32 +350,29 @@ impl SelmaSanitizer {
|
|
325
350
|
None => {
|
326
351
|
// has a protocol, but no sanitization list
|
327
352
|
if !attr_val.is_empty() && Self::has_protocol(attr_val) {
|
328
|
-
return false;
|
353
|
+
return Ok(false);
|
329
354
|
}
|
330
355
|
}
|
331
356
|
Some(protocol_sanitizer_values) => {
|
332
357
|
if !attr_val.is_empty()
|
333
358
|
&& !Self::has_allowed_protocol(protocol_sanitizer_values, attr_val)
|
334
359
|
{
|
335
|
-
return false;
|
360
|
+
return Ok(false);
|
336
361
|
}
|
337
362
|
}
|
338
363
|
}
|
339
364
|
|
340
|
-
if attr_name == "class"
|
341
|
-
|
365
|
+
if attr_name == "class" {
|
366
|
+
return Self::sanitize_class_attribute(
|
342
367
|
binding,
|
343
368
|
element,
|
344
369
|
element_sanitizer,
|
345
370
|
attr_name,
|
346
371
|
attr_val,
|
347
|
-
)
|
348
|
-
.unwrap()
|
349
|
-
{
|
350
|
-
return false;
|
372
|
+
);
|
351
373
|
}
|
352
374
|
|
353
|
-
true
|
375
|
+
Ok(true)
|
354
376
|
}
|
355
377
|
|
356
378
|
fn has_protocol(attr_val: &str) -> bool {
|
@@ -393,7 +415,7 @@ impl SelmaSanitizer {
|
|
393
415
|
element_sanitizer: &ElementSanitizer,
|
394
416
|
attr_name: &str,
|
395
417
|
attr_val: &str,
|
396
|
-
) -> Result<bool, Error> {
|
418
|
+
) -> Result<bool, lol_html::errors::AttributeNameError> {
|
397
419
|
let allowed_global = &binding.allowed_classes;
|
398
420
|
|
399
421
|
let mut valid_classes: Vec<String> = vec![];
|
@@ -421,28 +443,25 @@ impl SelmaSanitizer {
|
|
421
443
|
|
422
444
|
match element.set_attribute(attr_name, valid_classes.join(" ").as_str()) {
|
423
445
|
Ok(_) => Ok(true),
|
424
|
-
Err(err) => Err(Error::new(
|
425
|
-
exception::runtime_error(),
|
426
|
-
format!("AttributeNameError: {}", err),
|
427
|
-
)),
|
446
|
+
Err(err) => Err(err),
|
428
447
|
}
|
429
448
|
}
|
430
449
|
|
431
450
|
pub fn allow_element(&self, element: &mut Element) -> bool {
|
432
|
-
let tag = Tag::tag_from_element(element);
|
451
|
+
let tag = crate::tags::Tag::tag_from_element(element);
|
433
452
|
let flags: u8 = self.0.borrow().flags[tag.index];
|
434
453
|
|
435
454
|
(flags & Self::SELMA_SANITIZER_ALLOW) == 0
|
436
455
|
}
|
437
456
|
|
438
457
|
pub fn try_remove_element(&self, element: &mut Element) -> bool {
|
439
|
-
let tag = Tag::tag_from_element(element);
|
458
|
+
let tag = crate::tags::Tag::tag_from_element(element);
|
440
459
|
let flags: u8 = self.0.borrow().flags[tag.index];
|
441
460
|
|
442
461
|
let should_remove = !element.removed() && self.allow_element(element);
|
443
462
|
|
444
463
|
if should_remove {
|
445
|
-
if Tag::has_text_content(tag) {
|
464
|
+
if crate::tags::Tag::has_text_content(tag) {
|
446
465
|
Self::remove_element(
|
447
466
|
element,
|
448
467
|
tag.self_closing,
|
@@ -455,7 +474,7 @@ impl SelmaSanitizer {
|
|
455
474
|
Self::check_if_end_tag_needs_removal(element);
|
456
475
|
} else {
|
457
476
|
// anything in <iframe> must be removed, if it's kept
|
458
|
-
if Tag::is_iframe(tag) {
|
477
|
+
if crate::tags::Tag::is_iframe(tag) {
|
459
478
|
if self.0.borrow().flags[tag.index] != 0 {
|
460
479
|
element.set_inner_content(" ", ContentType::Text);
|
461
480
|
} else {
|
@@ -487,14 +506,14 @@ impl SelmaSanitizer {
|
|
487
506
|
}
|
488
507
|
|
489
508
|
pub fn force_remove_element(&self, element: &mut Element) {
|
490
|
-
let tag = Tag::tag_from_element(element);
|
509
|
+
let tag = crate::tags::Tag::tag_from_element(element);
|
491
510
|
let self_closing = tag.self_closing;
|
492
511
|
Self::remove_element(element, self_closing, Self::SELMA_SANITIZER_REMOVE_CONTENTS);
|
493
512
|
Self::check_if_end_tag_needs_removal(element);
|
494
513
|
}
|
495
514
|
|
496
515
|
fn check_if_end_tag_needs_removal(element: &mut Element) {
|
497
|
-
if element.removed() && !Tag::tag_from_element(element).self_closing {
|
516
|
+
if element.removed() && !crate::tags::Tag::tag_from_element(element).self_closing {
|
498
517
|
element
|
499
518
|
.on_end_tag(move |end| {
|
500
519
|
Self::remove_end_tag(end);
|
@@ -523,7 +542,7 @@ impl SelmaSanitizer {
|
|
523
542
|
}
|
524
543
|
}
|
525
544
|
|
526
|
-
pub fn init(m_selma: RModule) -> Result<(), Error> {
|
545
|
+
pub fn init(m_selma: RModule) -> Result<(), magnus::Error> {
|
527
546
|
let c_sanitizer = m_selma.define_class("Sanitizer", Default::default())?;
|
528
547
|
|
529
548
|
c_sanitizer.define_singleton_method("new", function!(SelmaSanitizer::new, -1))?;
|
data/ext/selma/src/selector.rs
CHANGED
@@ -27,7 +27,7 @@ impl SelmaSelector {
|
|
27
27
|
if css.parse::<lol_html::Selector>().is_err() {
|
28
28
|
return Err(Error::new(
|
29
29
|
exception::arg_error(),
|
30
|
-
format!("Could not parse `match_element` (`{}`) as valid CSS", css),
|
30
|
+
format!("Could not parse `match_element` (`{css:?}`) as valid CSS"),
|
31
31
|
));
|
32
32
|
}
|
33
33
|
}
|
@@ -37,10 +37,7 @@ impl SelmaSelector {
|
|
37
37
|
if css.parse::<lol_html::Selector>().is_err() {
|
38
38
|
return Err(Error::new(
|
39
39
|
exception::arg_error(),
|
40
|
-
format!(
|
41
|
-
"Could not parse `match_text_within` (`{}`) as valid CSS",
|
42
|
-
css
|
43
|
-
),
|
40
|
+
format!("Could not parse `match_text_within` (`{css:?}`) as valid CSS",),
|
44
41
|
));
|
45
42
|
}
|
46
43
|
}
|
data/ext/selma/src/tags.rs
CHANGED
@@ -192,14 +192,17 @@ impl Tag {
|
|
192
192
|
/// Is this tag something which needs to be removed?
|
193
193
|
pub fn is_tag_escapeworthy(tag: Tag) -> bool {
|
194
194
|
tag.index == HTMLTag::TITLE as usize
|
195
|
-
|| tag.index == HTMLTag::TEXTAREA as usize
|
196
|
-
|| tag.index == HTMLTag::STYLE as usize
|
197
|
-
|| tag.index == HTMLTag::XMP as usize
|
198
195
|
|| tag.index == HTMLTag::IFRAME as usize
|
196
|
+
|| tag.index == HTMLTag::MATH as usize
|
199
197
|
|| tag.index == HTMLTag::NOEMBED as usize
|
200
198
|
|| tag.index == HTMLTag::NOFRAMES as usize
|
201
|
-
|| tag.index == HTMLTag::
|
199
|
+
|| tag.index == HTMLTag::NOSCRIPT as usize
|
202
200
|
|| tag.index == HTMLTag::PLAINTEXT as usize
|
201
|
+
|| tag.index == HTMLTag::SCRIPT as usize
|
202
|
+
|| tag.index == HTMLTag::STYLE as usize
|
203
|
+
|| tag.index == HTMLTag::SVG as usize
|
204
|
+
|| tag.index == HTMLTag::TEXTAREA as usize
|
205
|
+
|| tag.index == HTMLTag::XMP as usize
|
203
206
|
}
|
204
207
|
|
205
208
|
pub const ESCAPEWORTHY_TAGS_CSS: &str =
|
data/lib/selma/3.1/selma.bundle
CHANGED
Binary file
|
@@ -3,6 +3,10 @@
|
|
3
3
|
module Selma
|
4
4
|
class Sanitizer
|
5
5
|
module Config
|
6
|
+
# although there are many more protocol types, eg., ftp, xmpp, etc.,
|
7
|
+
# these are the only ones that are allowed by default
|
8
|
+
VALID_PROTOCOLS = ["http", "https", "mailto", :relative]
|
9
|
+
|
6
10
|
DEFAULT = freeze_config(
|
7
11
|
# Whether or not to allow HTML comments. Allowing comments is strongly
|
8
12
|
# discouraged, since IE allows script execution within conditional
|
data/lib/selma/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: selma
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.4
|
5
5
|
platform: x86_64-darwin
|
6
6
|
authors:
|
7
7
|
- Garen J. Torikian
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-12-
|
11
|
+
date: 2022-12-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|
@@ -81,6 +81,7 @@ files:
|
|
81
81
|
- ext/selma/src/html.rs
|
82
82
|
- ext/selma/src/html/element.rs
|
83
83
|
- ext/selma/src/html/end_tag.rs
|
84
|
+
- ext/selma/src/html/text_chunk.rs
|
84
85
|
- ext/selma/src/lib.rs
|
85
86
|
- ext/selma/src/native_ref_wrap.rs
|
86
87
|
- ext/selma/src/rewriter.rs
|