selma 0.0.2-x86_64-linux → 0.0.4-x86_64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +11 -10
- data/ext/selma/Cargo.toml +3 -3
- data/ext/selma/src/html/element.rs +41 -48
- data/ext/selma/src/html/end_tag.rs +2 -2
- data/ext/selma/src/html/text_chunk.rs +83 -0
- data/ext/selma/src/html.rs +2 -0
- data/ext/selma/src/lib.rs +28 -1
- data/ext/selma/src/native_ref_wrap.rs +3 -3
- data/ext/selma/src/rewriter.rs +47 -59
- data/ext/selma/src/sanitizer.rs +76 -57
- data/ext/selma/src/selector.rs +2 -5
- data/ext/selma/src/tags.rs +7 -4
- data/lib/selma/3.1/selma.so +0 -0
- data/lib/selma/sanitizer/config/default.rb +4 -0
- data/lib/selma/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e3b9b090cd0f742bd24d8bd1d065b45d0b142b690141cce378258c8bd231501d
|
4
|
+
data.tar.gz: '08eaf21b2ab2161bb5daadf451175cb3e37a4c4ab9212c294450a73291f06460'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8be8197fb86053ae0aa6e957a2694812be266a5d3903d15c59ede3768811b11639002d92f21217428fb89e023022544cfcd930191dec20b3d7d12934d6337a0f
|
7
|
+
data.tar.gz: 9e059e1e25bc52ad38ea64dd0d8b9654caaf4ce67aaa33bd6e1d1187ac534d92dc4c9631bff7457d76f7d685100d93b69e18e335dc4f6b69d8dcc053c36e0179
|
data/README.md
CHANGED
@@ -56,6 +56,10 @@ allow_comments: false,
|
|
56
56
|
# "<!DOCTYPE html>" when sanitizing a document.
|
57
57
|
allow_doctype: false,
|
58
58
|
|
59
|
+
# HTML elements to allow. By default, no elements are allowed (which means
|
60
|
+
# that all HTML will be stripped).
|
61
|
+
elements: ["a", "b", "img", ],
|
62
|
+
|
59
63
|
# HTML attributes to allow in specific elements. The key is the name of the element,
|
60
64
|
# and the value is an array of allowed attributes. By default, no attributes
|
61
65
|
# are allowed.
|
@@ -64,14 +68,10 @@ attributes: {
|
|
64
68
|
"img" => ["src"],
|
65
69
|
},
|
66
70
|
|
67
|
-
# HTML elements to allow. By default, no elements are allowed (which means
|
68
|
-
# that all HTML will be stripped).
|
69
|
-
elements: ["a", "b", "img", ],
|
70
|
-
|
71
71
|
# URL handling protocols to allow in specific attributes. By default, no
|
72
72
|
# protocols are allowed. Use :relative in place of a protocol if you want
|
73
73
|
# to allow relative URLs sans protocol.
|
74
|
-
|
74
|
+
protocols: {
|
75
75
|
"a" => { "href" => ["http", "https", "mailto", :relative] },
|
76
76
|
"img" => { "href" => ["http", "https"] },
|
77
77
|
},
|
@@ -91,7 +91,7 @@ The real power in Selma comes in its use of handlers. A handler is simply an obj
|
|
91
91
|
|
92
92
|
- `selector`, a method which MUST return an instance of `Selma::Selector` which defines the CSS classes to match
|
93
93
|
- `handle_element`, a method that's called on each matched element
|
94
|
-
- `
|
94
|
+
- `handle_text_chunk`, a method that's called on each matched text node; this MUST return a string
|
95
95
|
|
96
96
|
Here's an example which rewrites the `href` attribute on `a` and the `src` attribute on `img` to be `https` rather than `http`.
|
97
97
|
|
@@ -118,7 +118,7 @@ rewriter = Selma::Rewriter.new(handlers: [MatchAttribute.new])
|
|
118
118
|
The `Selma::Selector` object has three possible kwargs:
|
119
119
|
|
120
120
|
- `match_element`: any element which matches this CSS rule will be passed on to `handle_element`
|
121
|
-
- `match_text_within`: any element which matches this CSS rule will be passed on to `
|
121
|
+
- `match_text_within`: any element which matches this CSS rule will be passed on to `handle_text_chunk`
|
122
122
|
- `ignore_text_within`: this is an array of element names whose text contents will be ignored
|
123
123
|
|
124
124
|
You've seen an example of `match_element`; here's one for `match_text_within` which changes strings in various elements which are _not_ `pre` or `code`:
|
@@ -132,7 +132,7 @@ class MatchText
|
|
132
132
|
SELECTOR
|
133
133
|
end
|
134
134
|
|
135
|
-
def
|
135
|
+
def handle_text_chunk(text)
|
136
136
|
string.sub(/@.+/, "<a href=\"www.yetto.app/#{Regexp.last_match}\">")
|
137
137
|
end
|
138
138
|
end
|
@@ -150,8 +150,9 @@ The `element` argument in `handle_element` has the following methods:
|
|
150
150
|
- `remove_attribute`: remove an attribute
|
151
151
|
- `attributes`: list all the attributes
|
152
152
|
- `ancestors`: list all the ancestors
|
153
|
-
- `append(content, content_type)`: appends `content` to the element's inner content, i.e. inserts content right before the element's end tag. `content_type` is either `:text` or `:html` and determines how the content will be applied.
|
154
|
-
- `
|
153
|
+
- `append(content, as: content_type)`: appends `content` to the element's inner content, i.e. inserts content right before the element's end tag. `content_type` is either `:text` or `:html` and determines how the content will be applied.
|
154
|
+
- `before(content, as: content_type)`: Inserts `content` before the element. `content_type` is either `:text` or `:html` and determines how the content will be applied.
|
155
|
+
- `after(content, as: content_type)`: Inserts `content` after the element. `content_type` is either `:text` or `:html` and determines how the content will be applied.
|
155
156
|
- `set_inner_content(content, as: content_type)`: replaces inner content of the element with `content`. `content_type` is either `:text` or `:html` and determines how the content will be applied.
|
156
157
|
|
157
158
|
## Benchmarks
|
data/ext/selma/Cargo.toml
CHANGED
@@ -5,9 +5,9 @@ edition = "2021"
|
|
5
5
|
|
6
6
|
[dependencies]
|
7
7
|
enum-iterator = "1.2"
|
8
|
-
escapist = "0.0.
|
9
|
-
magnus = "
|
10
|
-
lol_html =
|
8
|
+
escapist = "0.0.2"
|
9
|
+
magnus = { git = "https://github.com/matsadler/magnus", rev = "23160f7229ac74c42da1b5096a65ccbc40962697" }
|
10
|
+
lol_html = "0.3"
|
11
11
|
|
12
12
|
[lib]
|
13
13
|
name = "selma"
|
@@ -1,8 +1,6 @@
|
|
1
|
-
use std::borrow::Cow;
|
2
|
-
|
3
1
|
use crate::native_ref_wrap::NativeRefWrap;
|
4
|
-
use lol_html::html_content::
|
5
|
-
use magnus::{exception, method, Error, Module, RArray, RClass, RHash, RString,
|
2
|
+
use lol_html::html_content::Element;
|
3
|
+
use magnus::{exception, method, Error, Module, RArray, RClass, RHash, RString, Value};
|
6
4
|
|
7
5
|
struct HTMLElement {
|
8
6
|
element: NativeRefWrap<Element<'static, 'static>>,
|
@@ -51,7 +49,7 @@ impl SelmaHTMLElement {
|
|
51
49
|
Ok(_) => Ok(value),
|
52
50
|
Err(err) => Err(Error::new(
|
53
51
|
exception::runtime_error(),
|
54
|
-
format!("AttributeNameError: {}"
|
52
|
+
format!("AttributeNameError: {err:?}"),
|
55
53
|
)),
|
56
54
|
}
|
57
55
|
} else {
|
@@ -81,7 +79,7 @@ impl SelmaHTMLElement {
|
|
81
79
|
Ok(_) => {}
|
82
80
|
Err(err) => Err(Error::new(
|
83
81
|
exception::runtime_error(),
|
84
|
-
format!("AttributeNameError: {}"
|
82
|
+
format!("AttributeNameError: {err:?}"),
|
85
83
|
))
|
86
84
|
.unwrap(),
|
87
85
|
});
|
@@ -99,80 +97,74 @@ impl SelmaHTMLElement {
|
|
99
97
|
.for_each(|ancestor| match array.push(RString::new(ancestor)) {
|
100
98
|
Ok(_) => {}
|
101
99
|
Err(err) => {
|
102
|
-
Err(Error::new(exception::runtime_error(), format!("{}"
|
100
|
+
Err(Error::new(exception::runtime_error(), format!("{err:?}"))).unwrap()
|
103
101
|
}
|
104
102
|
});
|
105
103
|
|
106
104
|
Ok(array)
|
107
105
|
}
|
108
106
|
|
109
|
-
fn
|
107
|
+
fn before(&self, args: &[Value]) -> Result<(), Error> {
|
110
108
|
let mut binding = self.0.borrow_mut();
|
111
109
|
let element = binding.element.get_mut().unwrap();
|
112
110
|
|
113
|
-
let text_str =
|
114
|
-
|
115
|
-
|
111
|
+
let (text_str, content_type) = match crate::scan_text_args(args) {
|
112
|
+
Ok((text_str, content_type)) => (text_str, content_type),
|
113
|
+
Err(err) => return Err(err),
|
114
|
+
};
|
116
115
|
|
117
|
-
element.
|
116
|
+
element.before(&text_str, content_type);
|
118
117
|
|
119
118
|
Ok(())
|
120
119
|
}
|
121
120
|
|
122
|
-
fn
|
123
|
-
&self,
|
124
|
-
start_text: String,
|
125
|
-
end_text: String,
|
126
|
-
content_type: Symbol,
|
127
|
-
) -> Result<(), Error> {
|
121
|
+
fn after(&self, args: &[Value]) -> Result<(), Error> {
|
128
122
|
let mut binding = self.0.borrow_mut();
|
129
123
|
let element = binding.element.get_mut().unwrap();
|
130
124
|
|
131
|
-
let
|
132
|
-
|
133
|
-
|
134
|
-
|
125
|
+
let (text_str, content_type) = match crate::scan_text_args(args) {
|
126
|
+
Ok((text_str, content_type)) => (text_str, content_type),
|
127
|
+
Err(err) => return Err(err),
|
128
|
+
};
|
129
|
+
|
130
|
+
element.after(&text_str, content_type);
|
135
131
|
|
136
132
|
Ok(())
|
137
133
|
}
|
138
134
|
|
139
|
-
fn
|
135
|
+
fn append(&self, args: &[Value]) -> Result<(), Error> {
|
140
136
|
let mut binding = self.0.borrow_mut();
|
141
137
|
let element = binding.element.get_mut().unwrap();
|
142
138
|
|
143
|
-
let text_str =
|
144
|
-
|
145
|
-
|
139
|
+
let (text_str, content_type) = match crate::scan_text_args(args) {
|
140
|
+
Ok((text_str, content_type)) => (text_str, content_type),
|
141
|
+
Err(err) => return Err(err),
|
142
|
+
};
|
146
143
|
|
147
|
-
element.
|
144
|
+
element.append(&text_str, content_type);
|
148
145
|
|
149
146
|
Ok(())
|
150
147
|
}
|
151
148
|
|
152
|
-
fn
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
exception::runtime_error(),
|
165
|
-
format!("Could not unwrap symbol"),
|
166
|
-
))
|
167
|
-
.unwrap(),
|
168
|
-
}
|
149
|
+
fn set_inner_content(&self, args: &[Value]) -> Result<(), Error> {
|
150
|
+
let mut binding = self.0.borrow_mut();
|
151
|
+
let element = binding.element.get_mut().unwrap();
|
152
|
+
|
153
|
+
let (inner_content, content_type) = match crate::scan_text_args(args) {
|
154
|
+
Ok((inner_content, content_type)) => (inner_content, content_type),
|
155
|
+
Err(err) => return Err(err),
|
156
|
+
};
|
157
|
+
|
158
|
+
element.set_inner_content(&inner_content, content_type);
|
159
|
+
|
160
|
+
Ok(())
|
169
161
|
}
|
170
162
|
}
|
171
163
|
|
172
164
|
pub fn init(c_html: RClass) -> Result<(), Error> {
|
173
165
|
let c_element = c_html
|
174
166
|
.define_class("Element", Default::default())
|
175
|
-
.expect("cannot find class Selma::Element");
|
167
|
+
.expect("cannot find class Selma::HTML::Element");
|
176
168
|
|
177
169
|
c_element.define_method("tag_name", method!(SelmaHTMLElement::tag_name, 0))?;
|
178
170
|
c_element.define_method("[]", method!(SelmaHTMLElement::get_attribute, 1))?;
|
@@ -184,11 +176,12 @@ pub fn init(c_html: RClass) -> Result<(), Error> {
|
|
184
176
|
c_element.define_method("attributes", method!(SelmaHTMLElement::get_attributes, 0))?;
|
185
177
|
c_element.define_method("ancestors", method!(SelmaHTMLElement::get_ancestors, 0))?;
|
186
178
|
|
187
|
-
c_element.define_method("
|
188
|
-
c_element.define_method("
|
179
|
+
c_element.define_method("before", method!(SelmaHTMLElement::before, -1))?;
|
180
|
+
c_element.define_method("after", method!(SelmaHTMLElement::after, -1))?;
|
181
|
+
c_element.define_method("append", method!(SelmaHTMLElement::append, -1))?;
|
189
182
|
c_element.define_method(
|
190
183
|
"set_inner_content",
|
191
|
-
method!(SelmaHTMLElement::set_inner_content,
|
184
|
+
method!(SelmaHTMLElement::set_inner_content, -1),
|
192
185
|
)?;
|
193
186
|
|
194
187
|
Ok(())
|
@@ -6,7 +6,7 @@ struct HTMLEndTag {
|
|
6
6
|
end_tag: NativeRefWrap<EndTag<'static>>,
|
7
7
|
}
|
8
8
|
|
9
|
-
#[magnus::wrap(class = "Selma::HTML::
|
9
|
+
#[magnus::wrap(class = "Selma::HTML::EndTag")]
|
10
10
|
pub struct SelmaHTMLEndTag(std::cell::RefCell<HTMLEndTag>);
|
11
11
|
|
12
12
|
/// SAFETY: This is safe because we only access this data when the GVL is held.
|
@@ -27,7 +27,7 @@ impl SelmaHTMLEndTag {
|
|
27
27
|
pub fn init(c_html: RClass) -> Result<(), Error> {
|
28
28
|
let c_end_tag = c_html
|
29
29
|
.define_class("EndTag", Default::default())
|
30
|
-
.expect("cannot find class Selma::EndTag");
|
30
|
+
.expect("cannot find class Selma::HTML::EndTag");
|
31
31
|
|
32
32
|
c_end_tag.define_method("tag_name", method!(SelmaHTMLEndTag::tag_name, 0))?;
|
33
33
|
|
@@ -0,0 +1,83 @@
|
|
1
|
+
use crate::native_ref_wrap::NativeRefWrap;
|
2
|
+
use lol_html::html_content::{TextChunk, TextType};
|
3
|
+
use magnus::{exception, method, Error, Module, RClass, Symbol, Value};
|
4
|
+
|
5
|
+
struct HTMLTextChunk {
|
6
|
+
text_chunk: NativeRefWrap<TextChunk<'static>>,
|
7
|
+
}
|
8
|
+
|
9
|
+
#[magnus::wrap(class = "Selma::HTML::TextChunk")]
|
10
|
+
pub struct SelmaHTMLTextChunk(std::cell::RefCell<HTMLTextChunk>);
|
11
|
+
|
12
|
+
/// SAFETY: This is safe because we only access this data when the GVL is held.
|
13
|
+
unsafe impl Send for SelmaHTMLTextChunk {}
|
14
|
+
|
15
|
+
impl SelmaHTMLTextChunk {
|
16
|
+
pub fn new(text_chunk: &mut TextChunk) -> Self {
|
17
|
+
let (ref_wrap, _anchor) = NativeRefWrap::wrap_mut(text_chunk);
|
18
|
+
|
19
|
+
Self(std::cell::RefCell::new(HTMLTextChunk {
|
20
|
+
text_chunk: ref_wrap,
|
21
|
+
}))
|
22
|
+
}
|
23
|
+
|
24
|
+
fn to_s(&self) -> Result<String, Error> {
|
25
|
+
let binding = self.0.borrow();
|
26
|
+
|
27
|
+
if let Ok(tc) = binding.text_chunk.get() {
|
28
|
+
Ok(tc.as_str().to_string())
|
29
|
+
} else {
|
30
|
+
Err(Error::new(
|
31
|
+
exception::runtime_error(),
|
32
|
+
"`to_s` is not available",
|
33
|
+
))
|
34
|
+
}
|
35
|
+
}
|
36
|
+
|
37
|
+
fn text_type(&self) -> Result<Symbol, Error> {
|
38
|
+
let binding = self.0.borrow();
|
39
|
+
|
40
|
+
if let Ok(tc) = binding.text_chunk.get() {
|
41
|
+
match tc.text_type() {
|
42
|
+
TextType::Data => Ok(Symbol::from("data")),
|
43
|
+
TextType::PlainText => Ok(Symbol::from("plain_text")),
|
44
|
+
TextType::RawText => Ok(Symbol::from("raw_text")),
|
45
|
+
TextType::ScriptData => Ok(Symbol::from("script")),
|
46
|
+
TextType::RCData => Ok(Symbol::from("rc_data")),
|
47
|
+
TextType::CDataSection => Ok(Symbol::from("cdata_section")),
|
48
|
+
}
|
49
|
+
} else {
|
50
|
+
Err(Error::new(
|
51
|
+
exception::runtime_error(),
|
52
|
+
"`text_type` is not available",
|
53
|
+
))
|
54
|
+
}
|
55
|
+
}
|
56
|
+
|
57
|
+
fn replace(&self, args: &[Value]) -> Result<(), Error> {
|
58
|
+
let mut binding = self.0.borrow_mut();
|
59
|
+
let text_chunk = binding.text_chunk.get_mut().unwrap();
|
60
|
+
|
61
|
+
let (text_str, content_type) = match crate::scan_text_args(args) {
|
62
|
+
Ok((text_str, content_type)) => (text_str, content_type),
|
63
|
+
Err(err) => return Err(err),
|
64
|
+
};
|
65
|
+
|
66
|
+
text_chunk.replace(&text_str, content_type);
|
67
|
+
|
68
|
+
Ok(())
|
69
|
+
}
|
70
|
+
}
|
71
|
+
|
72
|
+
pub fn init(c_html: RClass) -> Result<(), Error> {
|
73
|
+
let c_text_chunk = c_html
|
74
|
+
.define_class("TextChunk", Default::default())
|
75
|
+
.expect("cannot find class Selma::HTML::TextChunk");
|
76
|
+
|
77
|
+
c_text_chunk.define_method("to_s", method!(SelmaHTMLTextChunk::to_s, 0))?;
|
78
|
+
c_text_chunk.define_method("content", method!(SelmaHTMLTextChunk::to_s, 0))?;
|
79
|
+
c_text_chunk.define_method("text_type", method!(SelmaHTMLTextChunk::text_type, 0))?;
|
80
|
+
c_text_chunk.define_method("replace", method!(SelmaHTMLTextChunk::replace, -1))?;
|
81
|
+
|
82
|
+
Ok(())
|
83
|
+
}
|
data/ext/selma/src/html.rs
CHANGED
@@ -9,9 +9,11 @@ pub fn init(m_selma: RModule) -> Result<(), Error> {
|
|
9
9
|
|
10
10
|
element::init(c_html).expect("cannot define Selma::HTML::Element class");
|
11
11
|
end_tag::init(c_html).expect("cannot define Selma::HTML::EndTag class");
|
12
|
+
text_chunk::init(c_html).expect("cannot define Selma::HTML::TextChunk class");
|
12
13
|
|
13
14
|
Ok(())
|
14
15
|
}
|
15
16
|
|
16
17
|
pub mod element;
|
17
18
|
pub mod end_tag;
|
19
|
+
pub mod text_chunk;
|
data/ext/selma/src/lib.rs
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
extern crate core;
|
2
2
|
|
3
|
-
use
|
3
|
+
use lol_html::html_content::ContentType;
|
4
|
+
use magnus::{define_module, exception, scan_args, Error, Symbol, Value};
|
4
5
|
|
5
6
|
pub mod html;
|
6
7
|
pub mod native_ref_wrap;
|
@@ -10,6 +11,32 @@ pub mod selector;
|
|
10
11
|
pub mod tags;
|
11
12
|
pub mod wrapped_struct;
|
12
13
|
|
14
|
+
#[allow(clippy::let_unit_value)]
|
15
|
+
fn scan_text_args(args: &[Value]) -> Result<(String, ContentType), magnus::Error> {
|
16
|
+
let args = scan_args::scan_args(args)?;
|
17
|
+
let (text,): (String,) = args.required;
|
18
|
+
let _: () = args.optional;
|
19
|
+
let _: () = args.splat;
|
20
|
+
let _: () = args.trailing;
|
21
|
+
let _: () = args.block;
|
22
|
+
|
23
|
+
let kwargs = scan_args::get_kwargs::<_, (Symbol,), (), ()>(args.keywords, &["as"], &[])?;
|
24
|
+
let as_sym = kwargs.required.0;
|
25
|
+
let as_sym_str = as_sym.name().unwrap();
|
26
|
+
let content_type = if as_sym_str == "text" {
|
27
|
+
ContentType::Text
|
28
|
+
} else if as_sym_str == "html" {
|
29
|
+
ContentType::Html
|
30
|
+
} else {
|
31
|
+
return Err(Error::new(
|
32
|
+
exception::runtime_error(),
|
33
|
+
format!("unknown symbol `{as_sym_str:?}`"),
|
34
|
+
));
|
35
|
+
};
|
36
|
+
|
37
|
+
Ok((text, content_type))
|
38
|
+
}
|
39
|
+
|
13
40
|
#[magnus::init]
|
14
41
|
fn init() -> Result<(), Error> {
|
15
42
|
let m_selma = define_module("Selma").expect("cannot define ::Selma module");
|
@@ -1,4 +1,4 @@
|
|
1
|
-
use std::{cell::Cell, marker::PhantomData,
|
1
|
+
use std::{cell::Cell, marker::PhantomData, rc::Rc};
|
2
2
|
|
3
3
|
// NOTE: My Rust isn't good enough to know what any of this does,
|
4
4
|
// but it was taken from https://github.com/cloudflare/lol-html/blob/1a1ab2e2bf896f815fe8888ed78ccdf46d7c6b85/js-api/src/lib.rs#LL38
|
@@ -37,7 +37,7 @@ pub struct NativeRefWrap<R> {
|
|
37
37
|
impl<R> NativeRefWrap<R> {
|
38
38
|
pub fn wrap<I>(inner: &I) -> (Self, Anchor) {
|
39
39
|
let wrap = NativeRefWrap {
|
40
|
-
inner_ptr:
|
40
|
+
inner_ptr: inner as *const I as *mut R,
|
41
41
|
poisoned: Rc::new(Cell::new(false)),
|
42
42
|
};
|
43
43
|
|
@@ -48,7 +48,7 @@ impl<R> NativeRefWrap<R> {
|
|
48
48
|
|
49
49
|
pub fn wrap_mut<I>(inner: &mut I) -> (Self, Anchor) {
|
50
50
|
let wrap = NativeRefWrap {
|
51
|
-
inner_ptr:
|
51
|
+
inner_ptr: inner as *mut I as *mut R,
|
52
52
|
poisoned: Rc::new(Cell::new(false)),
|
53
53
|
};
|
54
54
|
|
data/ext/selma/src/rewriter.rs
CHANGED
@@ -1,14 +1,14 @@
|
|
1
|
-
use std::{borrow::Cow, cell::RefCell, rc::Rc};
|
2
|
-
|
3
1
|
use lol_html::{
|
4
2
|
doc_comments, doctype, element,
|
5
|
-
html_content::{
|
3
|
+
html_content::{Element, EndTag, TextChunk},
|
6
4
|
text, DocumentContentHandlers, ElementContentHandlers, HtmlRewriter, Selector, Settings,
|
7
5
|
};
|
8
6
|
use magnus::{exception, function, method, scan_args, Module, Object, RArray, RModule, Value};
|
9
7
|
|
8
|
+
use std::{borrow::Cow, cell::RefCell, primitive::str, rc::Rc};
|
9
|
+
|
10
10
|
use crate::{
|
11
|
-
html::{element::SelmaHTMLElement, end_tag::SelmaHTMLEndTag},
|
11
|
+
html::{element::SelmaHTMLElement, end_tag::SelmaHTMLEndTag, text_chunk::SelmaHTMLTextChunk},
|
12
12
|
sanitizer::SelmaSanitizer,
|
13
13
|
selector::SelmaSelector,
|
14
14
|
tags::Tag,
|
@@ -43,7 +43,7 @@ unsafe impl Send for SelmaRewriter {}
|
|
43
43
|
impl SelmaRewriter {
|
44
44
|
const SELMA_ON_END_TAG: &str = "on_end_tag";
|
45
45
|
const SELMA_HANDLE_ELEMENT: &str = "handle_element";
|
46
|
-
const
|
46
|
+
const SELMA_HANDLE_TEXT_CHUNK: &str = "handle_text_chunk";
|
47
47
|
|
48
48
|
/// @yard
|
49
49
|
/// @def new(sanitizer: Selma::Sanitizer.new(Selma::Sanitizer::Config::DEFAULT), handlers: [])
|
@@ -83,18 +83,18 @@ impl SelmaRewriter {
|
|
83
83
|
return Err(magnus::Error::new(
|
84
84
|
exception::no_method_error(),
|
85
85
|
format!(
|
86
|
-
"Could not call #selector on {:?}; is this an object that defines it?",
|
87
|
-
|
86
|
+
"Could not call #selector on {classname:?}; is this an object that defines it?",
|
87
|
+
|
88
88
|
),
|
89
89
|
));
|
90
90
|
}
|
91
91
|
|
92
92
|
let rb_selector: WrappedStruct<SelmaSelector> =
|
93
93
|
match rb_handler.funcall("selector", ()) {
|
94
|
-
Err(
|
94
|
+
Err(err) => {
|
95
95
|
return Err(magnus::Error::new(
|
96
96
|
exception::type_error(),
|
97
|
-
format!("Error instantiating selector: {}"
|
97
|
+
format!("Error instantiating selector: {err:?}"),
|
98
98
|
));
|
99
99
|
}
|
100
100
|
Ok(rb_selector) => rb_selector,
|
@@ -145,7 +145,7 @@ impl SelmaRewriter {
|
|
145
145
|
let _: () = args.trailing;
|
146
146
|
let _: () = args.block;
|
147
147
|
|
148
|
-
let
|
148
|
+
let kwargs = scan_args::get_kwargs::<
|
149
149
|
_,
|
150
150
|
(),
|
151
151
|
(
|
@@ -154,7 +154,7 @@ impl SelmaRewriter {
|
|
154
154
|
),
|
155
155
|
(),
|
156
156
|
>(args.keywords, &[], &["sanitizer", "handlers"])?;
|
157
|
-
let (rb_sanitizer, rb_handlers) =
|
157
|
+
let (rb_sanitizer, rb_handlers) = kwargs.optional;
|
158
158
|
|
159
159
|
Ok((rb_sanitizer, rb_handlers))
|
160
160
|
}
|
@@ -162,28 +162,22 @@ impl SelmaRewriter {
|
|
162
162
|
/// Perform HTML rewrite sequence.
|
163
163
|
fn rewrite(&self, html: String) -> Result<String, magnus::Error> {
|
164
164
|
let sanitized_html = match &self.0.borrow().sanitizer {
|
165
|
-
None => html,
|
165
|
+
None => Ok(html),
|
166
166
|
Some(sanitizer) => {
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
// we need to run sanitization several times to truly remove unwanted tags,
|
172
|
-
// because lol-html happily accepts this garbage (by design?)
|
173
|
-
let sanitized_html = Self::perform_sanitization(sanitizer, &html).unwrap();
|
167
|
+
let sanitized_html = match Self::perform_sanitization(sanitizer, &html) {
|
168
|
+
Ok(sanitized_html) => sanitized_html,
|
169
|
+
Err(err) => return Err(err),
|
170
|
+
};
|
174
171
|
|
175
|
-
String::from_utf8(sanitized_html)
|
172
|
+
String::from_utf8(sanitized_html)
|
176
173
|
}
|
177
174
|
};
|
178
175
|
let binding = self.0.borrow_mut();
|
179
176
|
let handlers = &binding.handlers;
|
180
177
|
|
181
|
-
match Self::perform_handler_rewrite(self, handlers, sanitized_html) {
|
178
|
+
match Self::perform_handler_rewrite(self, handlers, sanitized_html.unwrap()) {
|
182
179
|
Ok(rewritten_html) => Ok(String::from_utf8(rewritten_html).unwrap()),
|
183
|
-
Err(err) => Err(
|
184
|
-
exception::runtime_error(),
|
185
|
-
format!("{}", err),
|
186
|
-
)),
|
180
|
+
Err(err) => Err(err),
|
187
181
|
}
|
188
182
|
}
|
189
183
|
|
@@ -214,10 +208,12 @@ impl SelmaRewriter {
|
|
214
208
|
if el.removed() {
|
215
209
|
return Ok(());
|
216
210
|
}
|
217
|
-
sanitizer.sanitize_attributes(el)
|
218
|
-
|
219
|
-
|
211
|
+
match sanitizer.sanitize_attributes(el) {
|
212
|
+
Ok(_) => Ok(()),
|
213
|
+
Err(err) => Err(err.to_string().into()),
|
214
|
+
}
|
220
215
|
})],
|
216
|
+
// TODO: allow for MemorySettings to be defined
|
221
217
|
..Settings::default()
|
222
218
|
},
|
223
219
|
|c: &[u8]| first_pass_html.extend_from_slice(c),
|
@@ -342,7 +338,7 @@ impl SelmaRewriter {
|
|
342
338
|
let mut stack = closure_element_stack.as_ref().borrow_mut();
|
343
339
|
stack.pop();
|
344
340
|
Ok(())
|
345
|
-
})
|
341
|
+
})?;
|
346
342
|
Ok(())
|
347
343
|
}));
|
348
344
|
});
|
@@ -361,7 +357,7 @@ impl SelmaRewriter {
|
|
361
357
|
Err(err) => {
|
362
358
|
return Err(magnus::Error::new(
|
363
359
|
exception::runtime_error(),
|
364
|
-
format!("{}"
|
360
|
+
format!("{err:?}"),
|
365
361
|
));
|
366
362
|
}
|
367
363
|
}
|
@@ -372,17 +368,18 @@ impl SelmaRewriter {
|
|
372
368
|
fn process_element_handlers(
|
373
369
|
rb_handler: Value,
|
374
370
|
element: &mut Element,
|
375
|
-
ancestors: &
|
371
|
+
ancestors: &[String],
|
376
372
|
) -> Result<(), magnus::Error> {
|
377
373
|
// if `on_end_tag` function is defined, call it
|
378
374
|
if rb_handler.respond_to(Self::SELMA_ON_END_TAG, true).unwrap() {
|
375
|
+
// TODO: error here is an "EndTagError"
|
379
376
|
element.on_end_tag(move |end_tag| {
|
380
377
|
let rb_end_tag = SelmaHTMLEndTag::new(end_tag);
|
381
378
|
|
382
|
-
rb_handler
|
383
|
-
|
384
|
-
.
|
385
|
-
|
379
|
+
match rb_handler.funcall::<_, _, Value>(Self::SELMA_ON_END_TAG, (rb_end_tag,)) {
|
380
|
+
Ok(_) => Ok(()),
|
381
|
+
Err(err) => Err(err.to_string().into()),
|
382
|
+
}
|
386
383
|
});
|
387
384
|
}
|
388
385
|
|
@@ -391,39 +388,30 @@ impl SelmaRewriter {
|
|
391
388
|
rb_handler.funcall::<_, _, Value>(Self::SELMA_HANDLE_ELEMENT, (rb_element,));
|
392
389
|
match rb_result {
|
393
390
|
Ok(_) => Ok(()),
|
394
|
-
Err(err) => Err(
|
395
|
-
exception::runtime_error(),
|
396
|
-
format!("{}", err),
|
397
|
-
)),
|
391
|
+
Err(err) => Err(err),
|
398
392
|
}
|
399
393
|
}
|
400
394
|
|
401
|
-
fn process_text_handlers(
|
402
|
-
|
403
|
-
|
395
|
+
fn process_text_handlers(
|
396
|
+
rb_handler: Value,
|
397
|
+
text_chunk: &mut TextChunk,
|
398
|
+
) -> Result<(), magnus::Error> {
|
399
|
+
// prevents missing `handle_text_chunk` function
|
400
|
+
let content = text_chunk.as_str();
|
404
401
|
|
405
|
-
//
|
402
|
+
// seems that sometimes lol-html returns blank text / EOLs?
|
406
403
|
if content.is_empty() {
|
407
404
|
return Ok(());
|
408
405
|
}
|
409
|
-
let rb_result = rb_handler.funcall(Self::SELMA_HANDLE_TEXT, (content,));
|
410
406
|
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
),
|
419
|
-
));
|
407
|
+
let rb_text_chunk = SelmaHTMLTextChunk::new(text_chunk);
|
408
|
+
match rb_handler.funcall::<_, _, Value>(Self::SELMA_HANDLE_TEXT_CHUNK, (rb_text_chunk,)) {
|
409
|
+
Ok(_) => Ok(()),
|
410
|
+
Err(err) => Err(magnus::Error::new(
|
411
|
+
exception::runtime_error(),
|
412
|
+
format!("{err:?}"),
|
413
|
+
)),
|
420
414
|
}
|
421
|
-
|
422
|
-
let new_content: String = rb_result.unwrap();
|
423
|
-
// TODO: can this be an option?
|
424
|
-
text.replace(&new_content, ContentType::Html);
|
425
|
-
|
426
|
-
Ok(())
|
427
415
|
}
|
428
416
|
}
|
429
417
|
|
data/ext/selma/src/sanitizer.rs
CHANGED
@@ -1,12 +1,10 @@
|
|
1
1
|
use std::{borrow::BorrowMut, cell::RefMut, collections::HashMap};
|
2
2
|
|
3
|
-
use lol_html::
|
4
|
-
|
5
|
-
|
6
|
-
Value,
|
3
|
+
use lol_html::{
|
4
|
+
errors::AttributeNameError,
|
5
|
+
html_content::{Comment, ContentType, Doctype, Element, EndTag},
|
7
6
|
};
|
8
|
-
|
9
|
-
use crate::tags::Tag;
|
7
|
+
use magnus::{class, function, method, scan_args, Module, Object, RArray, RHash, RModule, Value};
|
10
8
|
|
11
9
|
#[derive(Clone, Debug)]
|
12
10
|
struct ElementSanitizer {
|
@@ -18,7 +16,7 @@ struct ElementSanitizer {
|
|
18
16
|
|
19
17
|
#[derive(Clone, Debug)]
|
20
18
|
pub struct Sanitizer {
|
21
|
-
flags: [u8; Tag::TAG_COUNT],
|
19
|
+
flags: [u8; crate::tags::Tag::TAG_COUNT],
|
22
20
|
allowed_attrs: Vec<String>,
|
23
21
|
allowed_classes: Vec<String>,
|
24
22
|
element_sanitizers: HashMap<String, ElementSanitizer>,
|
@@ -35,11 +33,11 @@ pub struct SelmaSanitizer(std::cell::RefCell<Sanitizer>);
|
|
35
33
|
|
36
34
|
impl SelmaSanitizer {
|
37
35
|
const SELMA_SANITIZER_ALLOW: u8 = (1 << 0);
|
38
|
-
const SELMA_SANITIZER_ESCAPE_TAGFILTER: u8 = (1 << 1);
|
36
|
+
// const SELMA_SANITIZER_ESCAPE_TAGFILTER: u8 = (1 << 1);
|
39
37
|
const SELMA_SANITIZER_REMOVE_CONTENTS: u8 = (1 << 2);
|
40
38
|
const SELMA_SANITIZER_WRAP_WHITESPACE: u8 = (1 << 3);
|
41
39
|
|
42
|
-
pub fn new(arguments: &[Value]) -> Result<Self, Error> {
|
40
|
+
pub fn new(arguments: &[Value]) -> Result<Self, magnus::Error> {
|
43
41
|
let args = scan_args::scan_args::<(), (Option<RHash>,), (), (), (), ()>(arguments)?;
|
44
42
|
let (opt_config,): (Option<RHash>,) = args.optional;
|
45
43
|
|
@@ -50,7 +48,7 @@ impl SelmaSanitizer {
|
|
50
48
|
};
|
51
49
|
|
52
50
|
let mut element_sanitizers = HashMap::new();
|
53
|
-
Tag::html_tags().iter().for_each(|html_tag| {
|
51
|
+
crate::tags::Tag::html_tags().iter().for_each(|html_tag| {
|
54
52
|
let es = ElementSanitizer {
|
55
53
|
allowed_attrs: vec![],
|
56
54
|
allowed_classes: vec![],
|
@@ -58,11 +56,14 @@ impl SelmaSanitizer {
|
|
58
56
|
|
59
57
|
protocol_sanitizers: HashMap::new(),
|
60
58
|
};
|
61
|
-
element_sanitizers.insert(
|
59
|
+
element_sanitizers.insert(
|
60
|
+
crate::tags::Tag::element_name_from_enum(html_tag).to_string(),
|
61
|
+
es,
|
62
|
+
);
|
62
63
|
});
|
63
64
|
|
64
65
|
Ok(Self(std::cell::RefCell::new(Sanitizer {
|
65
|
-
flags: [0; Tag::TAG_COUNT],
|
66
|
+
flags: [0; crate::tags::Tag::TAG_COUNT],
|
66
67
|
allowed_attrs: vec![],
|
67
68
|
allowed_classes: vec![],
|
68
69
|
element_sanitizers,
|
@@ -74,7 +75,7 @@ impl SelmaSanitizer {
|
|
74
75
|
})))
|
75
76
|
}
|
76
77
|
|
77
|
-
fn get_config(&self) -> Result<RHash, Error> {
|
78
|
+
fn get_config(&self) -> Result<RHash, magnus::Error> {
|
78
79
|
let binding = self.0.borrow();
|
79
80
|
|
80
81
|
Ok(binding.config)
|
@@ -82,7 +83,7 @@ impl SelmaSanitizer {
|
|
82
83
|
|
83
84
|
/// Toggle a sanitizer option on or off.
|
84
85
|
fn set_flag(&self, tag_name: String, flag: u8, set: bool) {
|
85
|
-
let tag = Tag::tag_from_tag_name(tag_name.as_str());
|
86
|
+
let tag = crate::tags::Tag::tag_from_tag_name(tag_name.as_str());
|
86
87
|
if set {
|
87
88
|
self.0.borrow_mut().flags[tag.index] |= flag;
|
88
89
|
} else {
|
@@ -93,13 +94,19 @@ impl SelmaSanitizer {
|
|
93
94
|
/// Toggles all sanitization options on or off.
|
94
95
|
fn set_all_flags(&self, flag: u8, set: bool) {
|
95
96
|
if set {
|
96
|
-
Tag::html_tags()
|
97
|
-
|
98
|
-
|
97
|
+
crate::tags::Tag::html_tags()
|
98
|
+
.iter()
|
99
|
+
.enumerate()
|
100
|
+
.for_each(|(iter, _)| {
|
101
|
+
self.0.borrow_mut().flags[iter] |= flag;
|
102
|
+
});
|
99
103
|
} else {
|
100
|
-
Tag::html_tags()
|
101
|
-
|
102
|
-
|
104
|
+
crate::tags::Tag::html_tags()
|
105
|
+
.iter()
|
106
|
+
.enumerate()
|
107
|
+
.for_each(|(iter, _)| {
|
108
|
+
self.0.borrow_mut().flags[iter] &= flag;
|
109
|
+
});
|
103
110
|
}
|
104
111
|
}
|
105
112
|
|
@@ -111,8 +118,8 @@ impl SelmaSanitizer {
|
|
111
118
|
|
112
119
|
pub fn escape_tagfilter(&self, e: &mut Element) -> bool {
|
113
120
|
if self.0.borrow().escape_tagfilter {
|
114
|
-
let tag = Tag::tag_from_element(e);
|
115
|
-
if Tag::is_tag_escapeworthy(tag) {
|
121
|
+
let tag = crate::tags::Tag::tag_from_element(e);
|
122
|
+
if crate::tags::Tag::is_tag_escapeworthy(tag) {
|
116
123
|
e.remove();
|
117
124
|
return true;
|
118
125
|
}
|
@@ -229,9 +236,9 @@ impl SelmaSanitizer {
|
|
229
236
|
}
|
230
237
|
}
|
231
238
|
|
232
|
-
pub fn sanitize_attributes(&self, element: &mut Element) {
|
239
|
+
pub fn sanitize_attributes(&self, element: &mut Element) -> Result<(), AttributeNameError> {
|
233
240
|
let binding = self.0.borrow_mut();
|
234
|
-
let tag = Tag::tag_from_element(element);
|
241
|
+
let tag = crate::tags::Tag::tag_from_element(element);
|
235
242
|
let element_sanitizer = Self::get_element_sanitizer(&binding, &element.tag_name());
|
236
243
|
|
237
244
|
// FIXME: This is a hack to get around the fact that we can't borrow
|
@@ -247,7 +254,7 @@ impl SelmaSanitizer {
|
|
247
254
|
// encountered, remove the entire element to be safe.
|
248
255
|
if attr_name.starts_with("<!--") {
|
249
256
|
Self::force_remove_element(self, element);
|
250
|
-
return;
|
257
|
+
return Ok(());
|
251
258
|
}
|
252
259
|
|
253
260
|
// first, trim leading spaces and unescape any encodings
|
@@ -255,46 +262,64 @@ impl SelmaSanitizer {
|
|
255
262
|
let x = escapist::unescape_html(trimmed.as_bytes());
|
256
263
|
let unescaped_attr_val = String::from_utf8_lossy(&x).to_string();
|
257
264
|
|
258
|
-
|
265
|
+
let should_keep_attrubute = match Self::should_keep_attribute(
|
259
266
|
&binding,
|
260
267
|
element,
|
261
268
|
element_sanitizer,
|
262
269
|
attr_name,
|
263
270
|
&unescaped_attr_val,
|
264
271
|
) {
|
272
|
+
Ok(should_keep) => should_keep,
|
273
|
+
Err(e) => {
|
274
|
+
return Err(e);
|
275
|
+
}
|
276
|
+
};
|
277
|
+
|
278
|
+
if !should_keep_attrubute {
|
265
279
|
element.remove_attribute(attr_name);
|
266
280
|
} else {
|
267
281
|
// Prevent the use of `<meta>` elements that set a charset other than UTF-8,
|
268
282
|
// since output is always UTF-8.
|
269
|
-
if Tag::is_meta(tag) {
|
283
|
+
if crate::tags::Tag::is_meta(tag) {
|
270
284
|
if attr_name == "charset" && unescaped_attr_val != "utf-8" {
|
271
|
-
element.set_attribute(attr_name, "utf-8")
|
285
|
+
match element.set_attribute(attr_name, "utf-8") {
|
286
|
+
Ok(_) => {}
|
287
|
+
Err(err) => {
|
288
|
+
return Err(err);
|
289
|
+
}
|
290
|
+
}
|
272
291
|
}
|
273
292
|
} else if !unescaped_attr_val.is_empty() {
|
274
293
|
let mut buf = String::new();
|
275
294
|
// ...then, escape any special characters, for security
|
276
295
|
if attr_name == "href" {
|
277
|
-
|
278
|
-
escapist::escape_href(&mut buf, unescaped_attr_val.to_string().as_str());
|
296
|
+
escapist::escape_href(&mut buf, unescaped_attr_val.as_str());
|
279
297
|
} else {
|
280
|
-
escapist::escape_html(&mut buf, unescaped_attr_val.to_string().as_str());
|
298
|
+
escapist::escape_html(&mut buf, unescaped_attr_val.as_str());
|
281
299
|
};
|
282
300
|
|
283
|
-
element.set_attribute(attr_name, &buf)
|
301
|
+
match element.set_attribute(attr_name, &buf) {
|
302
|
+
Ok(_) => {}
|
303
|
+
Err(err) => {
|
304
|
+
return Err(err);
|
305
|
+
}
|
306
|
+
}
|
284
307
|
}
|
285
308
|
}
|
286
309
|
}
|
287
310
|
|
288
311
|
let required = &element_sanitizer.required_attrs;
|
289
312
|
if required.contains(&"*".to_string()) {
|
290
|
-
return;
|
313
|
+
return Ok(());
|
291
314
|
}
|
292
315
|
for attr in element.attributes().iter() {
|
293
316
|
let attr_name = &attr.name();
|
294
317
|
if required.contains(attr_name) {
|
295
|
-
return;
|
318
|
+
return Ok(());
|
296
319
|
}
|
297
320
|
}
|
321
|
+
|
322
|
+
Ok(())
|
298
323
|
}
|
299
324
|
|
300
325
|
fn should_keep_attribute(
|
@@ -303,7 +328,7 @@ impl SelmaSanitizer {
|
|
303
328
|
element_sanitizer: &ElementSanitizer,
|
304
329
|
attr_name: &String,
|
305
330
|
attr_val: &String,
|
306
|
-
) -> bool {
|
331
|
+
) -> Result<bool, AttributeNameError> {
|
307
332
|
let mut allowed: bool = false;
|
308
333
|
let element_allowed_attrs = element_sanitizer.allowed_attrs.contains(attr_name);
|
309
334
|
let sanitizer_allowed_attrs = binding.allowed_attrs.contains(attr_name);
|
@@ -317,7 +342,7 @@ impl SelmaSanitizer {
|
|
317
342
|
}
|
318
343
|
|
319
344
|
if !allowed {
|
320
|
-
return false;
|
345
|
+
return Ok(false);
|
321
346
|
}
|
322
347
|
|
323
348
|
let protocol_sanitizer_values = element_sanitizer.protocol_sanitizers.get(attr_name);
|
@@ -325,32 +350,29 @@ impl SelmaSanitizer {
|
|
325
350
|
None => {
|
326
351
|
// has a protocol, but no sanitization list
|
327
352
|
if !attr_val.is_empty() && Self::has_protocol(attr_val) {
|
328
|
-
return false;
|
353
|
+
return Ok(false);
|
329
354
|
}
|
330
355
|
}
|
331
356
|
Some(protocol_sanitizer_values) => {
|
332
357
|
if !attr_val.is_empty()
|
333
358
|
&& !Self::has_allowed_protocol(protocol_sanitizer_values, attr_val)
|
334
359
|
{
|
335
|
-
return false;
|
360
|
+
return Ok(false);
|
336
361
|
}
|
337
362
|
}
|
338
363
|
}
|
339
364
|
|
340
|
-
if attr_name == "class"
|
341
|
-
|
365
|
+
if attr_name == "class" {
|
366
|
+
return Self::sanitize_class_attribute(
|
342
367
|
binding,
|
343
368
|
element,
|
344
369
|
element_sanitizer,
|
345
370
|
attr_name,
|
346
371
|
attr_val,
|
347
|
-
)
|
348
|
-
.unwrap()
|
349
|
-
{
|
350
|
-
return false;
|
372
|
+
);
|
351
373
|
}
|
352
374
|
|
353
|
-
true
|
375
|
+
Ok(true)
|
354
376
|
}
|
355
377
|
|
356
378
|
fn has_protocol(attr_val: &str) -> bool {
|
@@ -393,7 +415,7 @@ impl SelmaSanitizer {
|
|
393
415
|
element_sanitizer: &ElementSanitizer,
|
394
416
|
attr_name: &str,
|
395
417
|
attr_val: &str,
|
396
|
-
) -> Result<bool, Error> {
|
418
|
+
) -> Result<bool, lol_html::errors::AttributeNameError> {
|
397
419
|
let allowed_global = &binding.allowed_classes;
|
398
420
|
|
399
421
|
let mut valid_classes: Vec<String> = vec![];
|
@@ -421,28 +443,25 @@ impl SelmaSanitizer {
|
|
421
443
|
|
422
444
|
match element.set_attribute(attr_name, valid_classes.join(" ").as_str()) {
|
423
445
|
Ok(_) => Ok(true),
|
424
|
-
Err(err) => Err(Error::new(
|
425
|
-
exception::runtime_error(),
|
426
|
-
format!("AttributeNameError: {}", err),
|
427
|
-
)),
|
446
|
+
Err(err) => Err(err),
|
428
447
|
}
|
429
448
|
}
|
430
449
|
|
431
450
|
pub fn allow_element(&self, element: &mut Element) -> bool {
|
432
|
-
let tag = Tag::tag_from_element(element);
|
451
|
+
let tag = crate::tags::Tag::tag_from_element(element);
|
433
452
|
let flags: u8 = self.0.borrow().flags[tag.index];
|
434
453
|
|
435
454
|
(flags & Self::SELMA_SANITIZER_ALLOW) == 0
|
436
455
|
}
|
437
456
|
|
438
457
|
pub fn try_remove_element(&self, element: &mut Element) -> bool {
|
439
|
-
let tag = Tag::tag_from_element(element);
|
458
|
+
let tag = crate::tags::Tag::tag_from_element(element);
|
440
459
|
let flags: u8 = self.0.borrow().flags[tag.index];
|
441
460
|
|
442
461
|
let should_remove = !element.removed() && self.allow_element(element);
|
443
462
|
|
444
463
|
if should_remove {
|
445
|
-
if Tag::has_text_content(tag) {
|
464
|
+
if crate::tags::Tag::has_text_content(tag) {
|
446
465
|
Self::remove_element(
|
447
466
|
element,
|
448
467
|
tag.self_closing,
|
@@ -455,7 +474,7 @@ impl SelmaSanitizer {
|
|
455
474
|
Self::check_if_end_tag_needs_removal(element);
|
456
475
|
} else {
|
457
476
|
// anything in <iframe> must be removed, if it's kept
|
458
|
-
if Tag::is_iframe(tag) {
|
477
|
+
if crate::tags::Tag::is_iframe(tag) {
|
459
478
|
if self.0.borrow().flags[tag.index] != 0 {
|
460
479
|
element.set_inner_content(" ", ContentType::Text);
|
461
480
|
} else {
|
@@ -487,14 +506,14 @@ impl SelmaSanitizer {
|
|
487
506
|
}
|
488
507
|
|
489
508
|
pub fn force_remove_element(&self, element: &mut Element) {
|
490
|
-
let tag = Tag::tag_from_element(element);
|
509
|
+
let tag = crate::tags::Tag::tag_from_element(element);
|
491
510
|
let self_closing = tag.self_closing;
|
492
511
|
Self::remove_element(element, self_closing, Self::SELMA_SANITIZER_REMOVE_CONTENTS);
|
493
512
|
Self::check_if_end_tag_needs_removal(element);
|
494
513
|
}
|
495
514
|
|
496
515
|
fn check_if_end_tag_needs_removal(element: &mut Element) {
|
497
|
-
if element.removed() && !Tag::tag_from_element(element).self_closing {
|
516
|
+
if element.removed() && !crate::tags::Tag::tag_from_element(element).self_closing {
|
498
517
|
element
|
499
518
|
.on_end_tag(move |end| {
|
500
519
|
Self::remove_end_tag(end);
|
@@ -523,7 +542,7 @@ impl SelmaSanitizer {
|
|
523
542
|
}
|
524
543
|
}
|
525
544
|
|
526
|
-
pub fn init(m_selma: RModule) -> Result<(), Error> {
|
545
|
+
pub fn init(m_selma: RModule) -> Result<(), magnus::Error> {
|
527
546
|
let c_sanitizer = m_selma.define_class("Sanitizer", Default::default())?;
|
528
547
|
|
529
548
|
c_sanitizer.define_singleton_method("new", function!(SelmaSanitizer::new, -1))?;
|
data/ext/selma/src/selector.rs
CHANGED
@@ -27,7 +27,7 @@ impl SelmaSelector {
|
|
27
27
|
if css.parse::<lol_html::Selector>().is_err() {
|
28
28
|
return Err(Error::new(
|
29
29
|
exception::arg_error(),
|
30
|
-
format!("Could not parse `match_element` (`{}`) as valid CSS", css),
|
30
|
+
format!("Could not parse `match_element` (`{css:?}`) as valid CSS"),
|
31
31
|
));
|
32
32
|
}
|
33
33
|
}
|
@@ -37,10 +37,7 @@ impl SelmaSelector {
|
|
37
37
|
if css.parse::<lol_html::Selector>().is_err() {
|
38
38
|
return Err(Error::new(
|
39
39
|
exception::arg_error(),
|
40
|
-
format!(
|
41
|
-
"Could not parse `match_text_within` (`{}`) as valid CSS",
|
42
|
-
css
|
43
|
-
),
|
40
|
+
format!("Could not parse `match_text_within` (`{css:?}`) as valid CSS",),
|
44
41
|
));
|
45
42
|
}
|
46
43
|
}
|
data/ext/selma/src/tags.rs
CHANGED
@@ -192,14 +192,17 @@ impl Tag {
|
|
192
192
|
/// Is this tag something which needs to be removed?
|
193
193
|
pub fn is_tag_escapeworthy(tag: Tag) -> bool {
|
194
194
|
tag.index == HTMLTag::TITLE as usize
|
195
|
-
|| tag.index == HTMLTag::TEXTAREA as usize
|
196
|
-
|| tag.index == HTMLTag::STYLE as usize
|
197
|
-
|| tag.index == HTMLTag::XMP as usize
|
198
195
|
|| tag.index == HTMLTag::IFRAME as usize
|
196
|
+
|| tag.index == HTMLTag::MATH as usize
|
199
197
|
|| tag.index == HTMLTag::NOEMBED as usize
|
200
198
|
|| tag.index == HTMLTag::NOFRAMES as usize
|
201
|
-
|| tag.index == HTMLTag::
|
199
|
+
|| tag.index == HTMLTag::NOSCRIPT as usize
|
202
200
|
|| tag.index == HTMLTag::PLAINTEXT as usize
|
201
|
+
|| tag.index == HTMLTag::SCRIPT as usize
|
202
|
+
|| tag.index == HTMLTag::STYLE as usize
|
203
|
+
|| tag.index == HTMLTag::SVG as usize
|
204
|
+
|| tag.index == HTMLTag::TEXTAREA as usize
|
205
|
+
|| tag.index == HTMLTag::XMP as usize
|
203
206
|
}
|
204
207
|
|
205
208
|
pub const ESCAPEWORTHY_TAGS_CSS: &str =
|
data/lib/selma/3.1/selma.so
CHANGED
Binary file
|
@@ -3,6 +3,10 @@
|
|
3
3
|
module Selma
|
4
4
|
class Sanitizer
|
5
5
|
module Config
|
6
|
+
# although there are many more protocol types, eg., ftp, xmpp, etc.,
|
7
|
+
# these are the only ones that are allowed by default
|
8
|
+
VALID_PROTOCOLS = ["http", "https", "mailto", :relative]
|
9
|
+
|
6
10
|
DEFAULT = freeze_config(
|
7
11
|
# Whether or not to allow HTML comments. Allowing comments is strongly
|
8
12
|
# discouraged, since IE allows script execution within conditional
|
data/lib/selma/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: selma
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.4
|
5
5
|
platform: x86_64-linux
|
6
6
|
authors:
|
7
7
|
- Garen J. Torikian
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-12-
|
11
|
+
date: 2022-12-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|
@@ -81,6 +81,7 @@ files:
|
|
81
81
|
- ext/selma/src/html.rs
|
82
82
|
- ext/selma/src/html/element.rs
|
83
83
|
- ext/selma/src/html/end_tag.rs
|
84
|
+
- ext/selma/src/html/text_chunk.rs
|
84
85
|
- ext/selma/src/lib.rs
|
85
86
|
- ext/selma/src/native_ref_wrap.rs
|
86
87
|
- ext/selma/src/rewriter.rs
|