selma 0.0.2-arm64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +173 -0
- data/ext/selma/Cargo.toml +14 -0
- data/ext/selma/_util.rb +102 -0
- data/ext/selma/extconf.rb +6 -0
- data/ext/selma/src/html/element.rs +195 -0
- data/ext/selma/src/html/end_tag.rs +35 -0
- data/ext/selma/src/html.rs +17 -0
- data/ext/selma/src/lib.rs +23 -0
- data/ext/selma/src/native_ref_wrap.rs +79 -0
- data/ext/selma/src/rewriter.rs +441 -0
- data/ext/selma/src/sanitizer.rs +578 -0
- data/ext/selma/src/selector.rs +115 -0
- data/ext/selma/src/tags.rs +1133 -0
- data/ext/selma/src/wrapped_struct.rs +92 -0
- data/lib/selma/3.1/selma.bundle +0 -0
- data/lib/selma/extension.rb +14 -0
- data/lib/selma/html.rb +6 -0
- data/lib/selma/rewriter.rb +6 -0
- data/lib/selma/sanitizer/config/basic.rb +27 -0
- data/lib/selma/sanitizer/config/default.rb +42 -0
- data/lib/selma/sanitizer/config/relaxed.rb +37 -0
- data/lib/selma/sanitizer/config/restricted.rb +13 -0
- data/lib/selma/sanitizer/config.rb +67 -0
- data/lib/selma/sanitizer.rb +85 -0
- data/lib/selma/selector.rb +6 -0
- data/lib/selma/version.rb +5 -0
- data/lib/selma.rb +13 -0
- data/selma.gemspec +41 -0
- metadata +136 -0
@@ -0,0 +1,441 @@
|
|
1
|
+
use std::{borrow::Cow, cell::RefCell, rc::Rc};
|
2
|
+
|
3
|
+
use lol_html::{
|
4
|
+
doc_comments, doctype, element,
|
5
|
+
html_content::{ContentType, Element, EndTag, TextChunk},
|
6
|
+
text, DocumentContentHandlers, ElementContentHandlers, HtmlRewriter, Selector, Settings,
|
7
|
+
};
|
8
|
+
use magnus::{exception, function, method, scan_args, Module, Object, RArray, RModule, Value};
|
9
|
+
|
10
|
+
use crate::{
|
11
|
+
html::{element::SelmaHTMLElement, end_tag::SelmaHTMLEndTag},
|
12
|
+
sanitizer::SelmaSanitizer,
|
13
|
+
selector::SelmaSelector,
|
14
|
+
tags::Tag,
|
15
|
+
wrapped_struct::WrappedStruct,
|
16
|
+
};
|
17
|
+
|
18
|
+
#[derive(Clone, Debug)]
|
19
|
+
pub struct Handler {
|
20
|
+
rb_handler: Value,
|
21
|
+
rb_selector: WrappedStruct<SelmaSelector>,
|
22
|
+
|
23
|
+
total_element_handler_calls: usize,
|
24
|
+
total_elapsed_element_handlers: f64,
|
25
|
+
|
26
|
+
total_text_handler_calls: usize,
|
27
|
+
total_elapsed_text_handlers: f64,
|
28
|
+
}
|
29
|
+
|
30
|
+
pub struct Rewriter {
|
31
|
+
sanitizer: Option<SelmaSanitizer>,
|
32
|
+
handlers: Vec<Handler>,
|
33
|
+
|
34
|
+
total_elapsed: f64,
|
35
|
+
}
|
36
|
+
|
37
|
+
#[magnus::wrap(class = "Selma::Rewriter")]
|
38
|
+
pub struct SelmaRewriter(std::cell::RefCell<Rewriter>);
|
39
|
+
|
40
|
+
/// SAFETY: This is safe because we only access this data when the GVL is held.
|
41
|
+
unsafe impl Send for SelmaRewriter {}
|
42
|
+
|
43
|
+
impl SelmaRewriter {
|
44
|
+
/// Name of the optional Ruby handler method called with an element's end tag.
const SELMA_ON_END_TAG: &'static str = "on_end_tag";
/// Name of the Ruby handler method called with each matched element.
const SELMA_HANDLE_ELEMENT: &'static str = "handle_element";
/// Name of the Ruby handler method called with each matched text chunk.
const SELMA_HANDLE_TEXT: &'static str = "handle_text";
|
47
|
+
|
48
|
+
/// @yard
|
49
|
+
/// @def new(sanitizer: Selma::Sanitizer.new(Selma::Sanitizer::Config::DEFAULT), handlers: [])
|
50
|
+
/// @param sanitizer [Selma::Sanitizer] The sanitizer which performs the initial cleanup
|
51
|
+
/// @param handlers [Array<Selma::Selector>] The handlers to use to perform HTML rewriting
|
52
|
+
/// @return [Selma::Rewriter]
|
53
|
+
fn new(args: &[Value]) -> Result<Self, magnus::Error> {
|
54
|
+
let (rb_sanitizer, rb_handlers) = Self::scan_parse_args(args)?;
|
55
|
+
|
56
|
+
let sanitizer = match rb_sanitizer {
|
57
|
+
None => {
|
58
|
+
let default_sanitizer = SelmaSanitizer::new(&[])?;
|
59
|
+
let wrapped_sanitizer = WrappedStruct::from(default_sanitizer);
|
60
|
+
wrapped_sanitizer.funcall::<&str, (), Value>("setup", ())?;
|
61
|
+
Some(wrapped_sanitizer.get().unwrap().to_owned())
|
62
|
+
}
|
63
|
+
Some(sanitizer_value) => match sanitizer_value {
|
64
|
+
None => None,
|
65
|
+
Some(sanitizer) => {
|
66
|
+
sanitizer.funcall::<&str, (), Value>("setup", ())?;
|
67
|
+
Some(sanitizer.get().unwrap().to_owned())
|
68
|
+
}
|
69
|
+
},
|
70
|
+
};
|
71
|
+
|
72
|
+
let handlers = match rb_handlers {
|
73
|
+
None => vec![],
|
74
|
+
Some(rb_handlers) => {
|
75
|
+
let mut handlers: Vec<Handler> = vec![];
|
76
|
+
|
77
|
+
for h in rb_handlers.each() {
|
78
|
+
let rb_handler = h.unwrap();
|
79
|
+
|
80
|
+
// prevents missing #selector from ruining things
|
81
|
+
if !rb_handler.respond_to("selector", true).unwrap() {
|
82
|
+
let classname = unsafe { rb_handler.classname() };
|
83
|
+
return Err(magnus::Error::new(
|
84
|
+
exception::no_method_error(),
|
85
|
+
format!(
|
86
|
+
"Could not call #selector on {:?}; is this an object that defines it?",
|
87
|
+
classname
|
88
|
+
),
|
89
|
+
));
|
90
|
+
}
|
91
|
+
|
92
|
+
let rb_selector: WrappedStruct<SelmaSelector> =
|
93
|
+
match rb_handler.funcall("selector", ()) {
|
94
|
+
Err(e) => {
|
95
|
+
return Err(magnus::Error::new(
|
96
|
+
exception::type_error(),
|
97
|
+
format!("Error instantiating selector: {}", e),
|
98
|
+
));
|
99
|
+
}
|
100
|
+
Ok(rb_selector) => rb_selector,
|
101
|
+
};
|
102
|
+
let handler = Handler {
|
103
|
+
rb_handler,
|
104
|
+
rb_selector,
|
105
|
+
total_element_handler_calls: 0,
|
106
|
+
total_elapsed_element_handlers: 0.0,
|
107
|
+
|
108
|
+
total_text_handler_calls: 0,
|
109
|
+
total_elapsed_text_handlers: 0.0,
|
110
|
+
};
|
111
|
+
handlers.push(handler);
|
112
|
+
}
|
113
|
+
handlers
|
114
|
+
}
|
115
|
+
};
|
116
|
+
|
117
|
+
if sanitizer.is_none() && handlers.is_empty() {
|
118
|
+
return Err(magnus::Error::new(
|
119
|
+
exception::arg_error(),
|
120
|
+
"Must provide a sanitizer or a handler",
|
121
|
+
));
|
122
|
+
}
|
123
|
+
|
124
|
+
Ok(Self(std::cell::RefCell::new(Rewriter {
|
125
|
+
sanitizer,
|
126
|
+
handlers,
|
127
|
+
total_elapsed: 0.0,
|
128
|
+
})))
|
129
|
+
}
|
130
|
+
|
131
|
+
#[allow(clippy::let_unit_value)]
|
132
|
+
fn scan_parse_args(
|
133
|
+
args: &[Value],
|
134
|
+
) -> Result<
|
135
|
+
(
|
136
|
+
Option<Option<WrappedStruct<SelmaSanitizer>>>,
|
137
|
+
Option<RArray>,
|
138
|
+
),
|
139
|
+
magnus::Error,
|
140
|
+
> {
|
141
|
+
let args = scan_args::scan_args(args)?;
|
142
|
+
let _: () = args.required;
|
143
|
+
let _: () = args.optional;
|
144
|
+
let _: () = args.splat;
|
145
|
+
let _: () = args.trailing;
|
146
|
+
let _: () = args.block;
|
147
|
+
|
148
|
+
let kw = scan_args::get_kwargs::<
|
149
|
+
_,
|
150
|
+
(),
|
151
|
+
(
|
152
|
+
Option<Option<WrappedStruct<SelmaSanitizer>>>,
|
153
|
+
Option<RArray>,
|
154
|
+
),
|
155
|
+
(),
|
156
|
+
>(args.keywords, &[], &["sanitizer", "handlers"])?;
|
157
|
+
let (rb_sanitizer, rb_handlers) = kw.optional;
|
158
|
+
|
159
|
+
Ok((rb_sanitizer, rb_handlers))
|
160
|
+
}
|
161
|
+
|
162
|
+
/// Perform HTML rewrite sequence.
|
163
|
+
fn rewrite(&self, html: String) -> Result<String, magnus::Error> {
|
164
|
+
let sanitized_html = match &self.0.borrow().sanitizer {
|
165
|
+
None => html,
|
166
|
+
Some(sanitizer) => {
|
167
|
+
// let first_pass_html = Self::perform_initial_sanitization(sanitizer, &html).unwrap();
|
168
|
+
|
169
|
+
// due to malicious html crafting
|
170
|
+
// (e.g. <<foo>script>...</script>, or <div <!-- comment -->> as in tests),
|
171
|
+
// we need to run sanitization several times to truly remove unwanted tags,
|
172
|
+
// because lol-html happily accepts this garbage (by design?)
|
173
|
+
let sanitized_html = Self::perform_sanitization(sanitizer, &html).unwrap();
|
174
|
+
|
175
|
+
String::from_utf8(sanitized_html).unwrap()
|
176
|
+
}
|
177
|
+
};
|
178
|
+
let binding = self.0.borrow_mut();
|
179
|
+
let handlers = &binding.handlers;
|
180
|
+
|
181
|
+
match Self::perform_handler_rewrite(self, handlers, sanitized_html) {
|
182
|
+
Ok(rewritten_html) => Ok(String::from_utf8(rewritten_html).unwrap()),
|
183
|
+
Err(err) => Err(magnus::Error::new(
|
184
|
+
exception::runtime_error(),
|
185
|
+
format!("{}", err),
|
186
|
+
)),
|
187
|
+
}
|
188
|
+
}
|
189
|
+
|
190
|
+
fn perform_sanitization(
|
191
|
+
sanitizer: &SelmaSanitizer,
|
192
|
+
html: &String,
|
193
|
+
) -> Result<Vec<u8>, magnus::Error> {
|
194
|
+
let mut first_pass_html = vec![];
|
195
|
+
{
|
196
|
+
let mut document_content_handlers: Vec<DocumentContentHandlers> = vec![];
|
197
|
+
if !sanitizer.get_allow_doctype() {
|
198
|
+
document_content_handlers.push(doctype!(|d| {
|
199
|
+
sanitizer.remove_doctype(d);
|
200
|
+
Ok(())
|
201
|
+
}));
|
202
|
+
}
|
203
|
+
if !sanitizer.get_allow_comments() {
|
204
|
+
document_content_handlers.push(doc_comments!(|c| {
|
205
|
+
sanitizer.remove_comment(c);
|
206
|
+
Ok(())
|
207
|
+
}));
|
208
|
+
}
|
209
|
+
let mut rewriter = HtmlRewriter::new(
|
210
|
+
Settings {
|
211
|
+
document_content_handlers,
|
212
|
+
element_content_handlers: vec![element!("*", |el| {
|
213
|
+
sanitizer.try_remove_element(el);
|
214
|
+
if el.removed() {
|
215
|
+
return Ok(());
|
216
|
+
}
|
217
|
+
sanitizer.sanitize_attributes(el);
|
218
|
+
|
219
|
+
Ok(())
|
220
|
+
})],
|
221
|
+
..Settings::default()
|
222
|
+
},
|
223
|
+
|c: &[u8]| first_pass_html.extend_from_slice(c),
|
224
|
+
);
|
225
|
+
|
226
|
+
let result = rewriter.write(html.as_bytes());
|
227
|
+
if result.is_err() {
|
228
|
+
return Err(magnus::Error::new(
|
229
|
+
exception::runtime_error(),
|
230
|
+
format!("Failed to sanitize HTML: {}", result.unwrap_err()),
|
231
|
+
));
|
232
|
+
}
|
233
|
+
}
|
234
|
+
|
235
|
+
let mut output = vec![];
|
236
|
+
{
|
237
|
+
let mut element_content_handlers: Vec<(Cow<Selector>, ElementContentHandlers)> = vec![];
|
238
|
+
if sanitizer.get_escape_tagfilter() {
|
239
|
+
element_content_handlers.push(element!(Tag::ESCAPEWORTHY_TAGS_CSS, |el| {
|
240
|
+
let should_remove = sanitizer.allow_element(el);
|
241
|
+
if should_remove {
|
242
|
+
sanitizer.force_remove_element(el);
|
243
|
+
}
|
244
|
+
|
245
|
+
Ok(())
|
246
|
+
}));
|
247
|
+
}
|
248
|
+
|
249
|
+
let mut rewriter = HtmlRewriter::new(
|
250
|
+
Settings {
|
251
|
+
element_content_handlers,
|
252
|
+
..Settings::default()
|
253
|
+
},
|
254
|
+
|c: &[u8]| output.extend_from_slice(c),
|
255
|
+
);
|
256
|
+
|
257
|
+
let result = rewriter.write(first_pass_html.as_slice());
|
258
|
+
if result.is_err() {
|
259
|
+
return Err(magnus::Error::new(
|
260
|
+
exception::runtime_error(),
|
261
|
+
format!("Failed to sanitize HTML: {}", result.unwrap_err()),
|
262
|
+
));
|
263
|
+
}
|
264
|
+
}
|
265
|
+
|
266
|
+
Ok(output)
|
267
|
+
}
|
268
|
+
|
269
|
+
pub fn perform_handler_rewrite(
|
270
|
+
&self,
|
271
|
+
handlers: &[Handler],
|
272
|
+
html: String,
|
273
|
+
) -> Result<Vec<u8>, magnus::Error> {
|
274
|
+
// TODO: this should ideally be done ahead of time, not on every `#rewrite` call
|
275
|
+
let mut element_content_handlers: Vec<(Cow<Selector>, ElementContentHandlers)> = vec![];
|
276
|
+
|
277
|
+
handlers.iter().for_each(|handler| {
|
278
|
+
let element_stack: Rc<RefCell<Vec<String>>> = Rc::new(RefCell::new(vec![]));
|
279
|
+
|
280
|
+
let selector = handler.rb_selector.get_static().unwrap();
|
281
|
+
|
282
|
+
// TODO: test final raise by simulating errors
|
283
|
+
if selector.match_element().is_some() {
|
284
|
+
let closure_element_stack = element_stack.clone();
|
285
|
+
|
286
|
+
element_content_handlers.push(element!(
|
287
|
+
selector.match_element().unwrap(),
|
288
|
+
move |el| {
|
289
|
+
match Self::process_element_handlers(
|
290
|
+
handler.rb_handler,
|
291
|
+
el,
|
292
|
+
&closure_element_stack.borrow(),
|
293
|
+
) {
|
294
|
+
Ok(_) => Ok(()),
|
295
|
+
Err(err) => Err(err.to_string().into()),
|
296
|
+
}
|
297
|
+
}
|
298
|
+
));
|
299
|
+
}
|
300
|
+
|
301
|
+
if selector.match_text_within().is_some() {
|
302
|
+
let closure_element_stack = element_stack.clone();
|
303
|
+
|
304
|
+
element_content_handlers.push(text!(
|
305
|
+
selector.match_text_within().unwrap(),
|
306
|
+
move |text| {
|
307
|
+
let element_stack = closure_element_stack.as_ref().borrow();
|
308
|
+
if selector.ignore_text_within().is_some() {
|
309
|
+
// check if current tag is a tag we should be ignoring text within
|
310
|
+
let head_tag_name = element_stack.last().unwrap().to_string();
|
311
|
+
if selector
|
312
|
+
.ignore_text_within()
|
313
|
+
.unwrap()
|
314
|
+
.iter()
|
315
|
+
.any(|f| f == &head_tag_name)
|
316
|
+
{
|
317
|
+
return Ok(());
|
318
|
+
}
|
319
|
+
}
|
320
|
+
|
321
|
+
match Self::process_text_handlers(handler.rb_handler, text) {
|
322
|
+
Ok(_) => Ok(()),
|
323
|
+
Err(err) => Err(err.to_string().into()),
|
324
|
+
}
|
325
|
+
}
|
326
|
+
));
|
327
|
+
}
|
328
|
+
|
329
|
+
// we need to check *every* element we iterate over, to create a stack of elements
|
330
|
+
element_content_handlers.push(element!("*", move |el| {
|
331
|
+
let tag_name = el.tag_name().to_lowercase();
|
332
|
+
|
333
|
+
// no need to track self-closing tags
|
334
|
+
if Tag::tag_from_tag_name(&tag_name).self_closing {
|
335
|
+
return Ok(());
|
336
|
+
};
|
337
|
+
|
338
|
+
element_stack.as_ref().borrow_mut().push(tag_name);
|
339
|
+
|
340
|
+
let closure_element_stack = element_stack.clone();
|
341
|
+
el.on_end_tag(move |_end_tag: &mut EndTag| {
|
342
|
+
let mut stack = closure_element_stack.as_ref().borrow_mut();
|
343
|
+
stack.pop();
|
344
|
+
Ok(())
|
345
|
+
});
|
346
|
+
Ok(())
|
347
|
+
}));
|
348
|
+
});
|
349
|
+
|
350
|
+
let mut output = vec![];
|
351
|
+
{
|
352
|
+
let mut rewriter = HtmlRewriter::new(
|
353
|
+
Settings {
|
354
|
+
element_content_handlers,
|
355
|
+
..Settings::default()
|
356
|
+
},
|
357
|
+
|c: &[u8]| output.extend_from_slice(c),
|
358
|
+
);
|
359
|
+
match rewriter.write(html.as_bytes()) {
|
360
|
+
Ok(_) => {}
|
361
|
+
Err(err) => {
|
362
|
+
return Err(magnus::Error::new(
|
363
|
+
exception::runtime_error(),
|
364
|
+
format!("{}", err),
|
365
|
+
));
|
366
|
+
}
|
367
|
+
}
|
368
|
+
}
|
369
|
+
Ok(output)
|
370
|
+
}
|
371
|
+
|
372
|
+
fn process_element_handlers(
|
373
|
+
rb_handler: Value,
|
374
|
+
element: &mut Element,
|
375
|
+
ancestors: &Vec<String>,
|
376
|
+
) -> Result<(), magnus::Error> {
|
377
|
+
// if `on_end_tag` function is defined, call it
|
378
|
+
if rb_handler.respond_to(Self::SELMA_ON_END_TAG, true).unwrap() {
|
379
|
+
element.on_end_tag(move |end_tag| {
|
380
|
+
let rb_end_tag = SelmaHTMLEndTag::new(end_tag);
|
381
|
+
|
382
|
+
rb_handler
|
383
|
+
.funcall::<_, _, Value>(Self::SELMA_ON_END_TAG, (rb_end_tag,))
|
384
|
+
.unwrap();
|
385
|
+
Ok(())
|
386
|
+
});
|
387
|
+
}
|
388
|
+
|
389
|
+
let rb_element = SelmaHTMLElement::new(element, ancestors);
|
390
|
+
let rb_result =
|
391
|
+
rb_handler.funcall::<_, _, Value>(Self::SELMA_HANDLE_ELEMENT, (rb_element,));
|
392
|
+
match rb_result {
|
393
|
+
Ok(_) => Ok(()),
|
394
|
+
Err(err) => Err(magnus::Error::new(
|
395
|
+
exception::runtime_error(),
|
396
|
+
format!("{}", err),
|
397
|
+
)),
|
398
|
+
}
|
399
|
+
}
|
400
|
+
|
401
|
+
fn process_text_handlers(rb_handler: Value, text: &mut TextChunk) -> Result<(), magnus::Error> {
|
402
|
+
// prevents missing `handle_text` function
|
403
|
+
let content = text.as_str();
|
404
|
+
|
405
|
+
// FIXME: why does this happen?
|
406
|
+
if content.is_empty() {
|
407
|
+
return Ok(());
|
408
|
+
}
|
409
|
+
let rb_result = rb_handler.funcall(Self::SELMA_HANDLE_TEXT, (content,));
|
410
|
+
|
411
|
+
if rb_result.is_err() {
|
412
|
+
return Err(magnus::Error::new(
|
413
|
+
exception::type_error(),
|
414
|
+
format!(
|
415
|
+
"Expected #{:?} to return a string: {:?}",
|
416
|
+
Self::SELMA_HANDLE_TEXT,
|
417
|
+
rb_result.err().unwrap()
|
418
|
+
),
|
419
|
+
));
|
420
|
+
}
|
421
|
+
|
422
|
+
let new_content: String = rb_result.unwrap();
|
423
|
+
// TODO: can this be an option?
|
424
|
+
text.replace(&new_content, ContentType::Html);
|
425
|
+
|
426
|
+
Ok(())
|
427
|
+
}
|
428
|
+
}
|
429
|
+
|
430
|
+
pub fn init(m_selma: RModule) -> Result<(), magnus::Error> {
|
431
|
+
let c_rewriter = m_selma
|
432
|
+
.define_class("Rewriter", Default::default())
|
433
|
+
.expect("cannot find class Selma::Rewriter");
|
434
|
+
|
435
|
+
c_rewriter.define_singleton_method("new", function!(SelmaRewriter::new, -1))?;
|
436
|
+
c_rewriter
|
437
|
+
.define_method("rewrite", method!(SelmaRewriter::rewrite, 1))
|
438
|
+
.expect("cannot define method `rewrite`");
|
439
|
+
|
440
|
+
Ok(())
|
441
|
+
}
|