selma 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,441 @@
1
+ use std::{borrow::Cow, cell::RefCell, rc::Rc};
2
+
3
+ use lol_html::{
4
+ doc_comments, doctype, element,
5
+ html_content::{ContentType, Element, EndTag, TextChunk},
6
+ text, DocumentContentHandlers, ElementContentHandlers, HtmlRewriter, Selector, Settings,
7
+ };
8
+ use magnus::{exception, function, method, scan_args, Module, Object, RArray, RModule, Value};
9
+
10
+ use crate::{
11
+ html::{element::SelmaHTMLElement, end_tag::SelmaHTMLEndTag},
12
+ sanitizer::SelmaSanitizer,
13
+ selector::SelmaSelector,
14
+ tags::Tag,
15
+ wrapped_struct::WrappedStruct,
16
+ };
17
+
18
+ #[derive(Clone, Debug)]
19
+ pub struct Handler {
20
+ rb_handler: Value,
21
+ rb_selector: WrappedStruct<SelmaSelector>,
22
+
23
+ total_element_handler_calls: usize,
24
+ total_elapsed_element_handlers: f64,
25
+
26
+ total_text_handler_calls: usize,
27
+ total_elapsed_text_handlers: f64,
28
+ }
29
+
30
+ pub struct Rewriter {
31
+ sanitizer: Option<SelmaSanitizer>,
32
+ handlers: Vec<Handler>,
33
+
34
+ total_elapsed: f64,
35
+ }
36
+
37
+ #[magnus::wrap(class = "Selma::Rewriter")]
38
+ pub struct SelmaRewriter(std::cell::RefCell<Rewriter>);
39
+
40
+ /// SAFETY: This is safe because we only access this data when the GVL is held.
41
+ unsafe impl Send for SelmaRewriter {}
42
+
43
+ impl SelmaRewriter {
44
/// Name of the optional Ruby handler method called with an element's end tag.
const SELMA_ON_END_TAG: &'static str = "on_end_tag";
/// Name of the Ruby handler method called with each matched element.
const SELMA_HANDLE_ELEMENT: &'static str = "handle_element";
/// Name of the Ruby handler method called with each matched text chunk.
const SELMA_HANDLE_TEXT: &'static str = "handle_text";
47
+
48
+ /// @yard
49
+ /// @def new(sanitizer: Selma::Sanitizer.new(Selma::Sanitizer::Config::DEFAULT), handlers: [])
50
+ /// @param sanitizer [Selma::Sanitizer] The sanitizer which performs the initial cleanup
51
+ /// @param handlers [Array<Selma::Selector>] The handlers to use to perform HTML rewriting
52
+ /// @return [Selma::Rewriter]
53
+ fn new(args: &[Value]) -> Result<Self, magnus::Error> {
54
+ let (rb_sanitizer, rb_handlers) = Self::scan_parse_args(args)?;
55
+
56
+ let sanitizer = match rb_sanitizer {
57
+ None => {
58
+ let default_sanitizer = SelmaSanitizer::new(&[])?;
59
+ let wrapped_sanitizer = WrappedStruct::from(default_sanitizer);
60
+ wrapped_sanitizer.funcall::<&str, (), Value>("setup", ())?;
61
+ Some(wrapped_sanitizer.get().unwrap().to_owned())
62
+ }
63
+ Some(sanitizer_value) => match sanitizer_value {
64
+ None => None,
65
+ Some(sanitizer) => {
66
+ sanitizer.funcall::<&str, (), Value>("setup", ())?;
67
+ Some(sanitizer.get().unwrap().to_owned())
68
+ }
69
+ },
70
+ };
71
+
72
+ let handlers = match rb_handlers {
73
+ None => vec![],
74
+ Some(rb_handlers) => {
75
+ let mut handlers: Vec<Handler> = vec![];
76
+
77
+ for h in rb_handlers.each() {
78
+ let rb_handler = h.unwrap();
79
+
80
+ // prevents missing #selector from ruining things
81
+ if !rb_handler.respond_to("selector", true).unwrap() {
82
+ let classname = unsafe { rb_handler.classname() };
83
+ return Err(magnus::Error::new(
84
+ exception::no_method_error(),
85
+ format!(
86
+ "Could not call #selector on {:?}; is this an object that defines it?",
87
+ classname
88
+ ),
89
+ ));
90
+ }
91
+
92
+ let rb_selector: WrappedStruct<SelmaSelector> =
93
+ match rb_handler.funcall("selector", ()) {
94
+ Err(e) => {
95
+ return Err(magnus::Error::new(
96
+ exception::type_error(),
97
+ format!("Error instantiating selector: {}", e),
98
+ ));
99
+ }
100
+ Ok(rb_selector) => rb_selector,
101
+ };
102
+ let handler = Handler {
103
+ rb_handler,
104
+ rb_selector,
105
+ total_element_handler_calls: 0,
106
+ total_elapsed_element_handlers: 0.0,
107
+
108
+ total_text_handler_calls: 0,
109
+ total_elapsed_text_handlers: 0.0,
110
+ };
111
+ handlers.push(handler);
112
+ }
113
+ handlers
114
+ }
115
+ };
116
+
117
+ if sanitizer.is_none() && handlers.is_empty() {
118
+ return Err(magnus::Error::new(
119
+ exception::arg_error(),
120
+ "Must provide a sanitizer or a handler",
121
+ ));
122
+ }
123
+
124
+ Ok(Self(std::cell::RefCell::new(Rewriter {
125
+ sanitizer,
126
+ handlers,
127
+ total_elapsed: 0.0,
128
+ })))
129
+ }
130
+
131
+ #[allow(clippy::let_unit_value)]
132
+ fn scan_parse_args(
133
+ args: &[Value],
134
+ ) -> Result<
135
+ (
136
+ Option<Option<WrappedStruct<SelmaSanitizer>>>,
137
+ Option<RArray>,
138
+ ),
139
+ magnus::Error,
140
+ > {
141
+ let args = scan_args::scan_args(args)?;
142
+ let _: () = args.required;
143
+ let _: () = args.optional;
144
+ let _: () = args.splat;
145
+ let _: () = args.trailing;
146
+ let _: () = args.block;
147
+
148
+ let kw = scan_args::get_kwargs::<
149
+ _,
150
+ (),
151
+ (
152
+ Option<Option<WrappedStruct<SelmaSanitizer>>>,
153
+ Option<RArray>,
154
+ ),
155
+ (),
156
+ >(args.keywords, &[], &["sanitizer", "handlers"])?;
157
+ let (rb_sanitizer, rb_handlers) = kw.optional;
158
+
159
+ Ok((rb_sanitizer, rb_handlers))
160
+ }
161
+
162
+ /// Perform HTML rewrite sequence.
163
+ fn rewrite(&self, html: String) -> Result<String, magnus::Error> {
164
+ let sanitized_html = match &self.0.borrow().sanitizer {
165
+ None => html,
166
+ Some(sanitizer) => {
167
+ // let first_pass_html = Self::perform_initial_sanitization(sanitizer, &html).unwrap();
168
+
169
+ // due to malicious html crafting
170
+ // (e.g. <<foo>script>...</script>, or <div <!-- comment -->> as in tests),
171
+ // we need to run sanitization several times to truly remove unwanted tags,
172
+ // because lol-html happily accepts this garbage (by design?)
173
+ let sanitized_html = Self::perform_sanitization(sanitizer, &html).unwrap();
174
+
175
+ String::from_utf8(sanitized_html).unwrap()
176
+ }
177
+ };
178
+ let binding = self.0.borrow_mut();
179
+ let handlers = &binding.handlers;
180
+
181
+ match Self::perform_handler_rewrite(self, handlers, sanitized_html) {
182
+ Ok(rewritten_html) => Ok(String::from_utf8(rewritten_html).unwrap()),
183
+ Err(err) => Err(magnus::Error::new(
184
+ exception::runtime_error(),
185
+ format!("{}", err),
186
+ )),
187
+ }
188
+ }
189
+
190
+ fn perform_sanitization(
191
+ sanitizer: &SelmaSanitizer,
192
+ html: &String,
193
+ ) -> Result<Vec<u8>, magnus::Error> {
194
+ let mut first_pass_html = vec![];
195
+ {
196
+ let mut document_content_handlers: Vec<DocumentContentHandlers> = vec![];
197
+ if !sanitizer.get_allow_doctype() {
198
+ document_content_handlers.push(doctype!(|d| {
199
+ sanitizer.remove_doctype(d);
200
+ Ok(())
201
+ }));
202
+ }
203
+ if !sanitizer.get_allow_comments() {
204
+ document_content_handlers.push(doc_comments!(|c| {
205
+ sanitizer.remove_comment(c);
206
+ Ok(())
207
+ }));
208
+ }
209
+ let mut rewriter = HtmlRewriter::new(
210
+ Settings {
211
+ document_content_handlers,
212
+ element_content_handlers: vec![element!("*", |el| {
213
+ sanitizer.try_remove_element(el);
214
+ if el.removed() {
215
+ return Ok(());
216
+ }
217
+ sanitizer.sanitize_attributes(el);
218
+
219
+ Ok(())
220
+ })],
221
+ ..Settings::default()
222
+ },
223
+ |c: &[u8]| first_pass_html.extend_from_slice(c),
224
+ );
225
+
226
+ let result = rewriter.write(html.as_bytes());
227
+ if result.is_err() {
228
+ return Err(magnus::Error::new(
229
+ exception::runtime_error(),
230
+ format!("Failed to sanitize HTML: {}", result.unwrap_err()),
231
+ ));
232
+ }
233
+ }
234
+
235
+ let mut output = vec![];
236
+ {
237
+ let mut element_content_handlers: Vec<(Cow<Selector>, ElementContentHandlers)> = vec![];
238
+ if sanitizer.get_escape_tagfilter() {
239
+ element_content_handlers.push(element!(Tag::ESCAPEWORTHY_TAGS_CSS, |el| {
240
+ let should_remove = sanitizer.allow_element(el);
241
+ if should_remove {
242
+ sanitizer.force_remove_element(el);
243
+ }
244
+
245
+ Ok(())
246
+ }));
247
+ }
248
+
249
+ let mut rewriter = HtmlRewriter::new(
250
+ Settings {
251
+ element_content_handlers,
252
+ ..Settings::default()
253
+ },
254
+ |c: &[u8]| output.extend_from_slice(c),
255
+ );
256
+
257
+ let result = rewriter.write(first_pass_html.as_slice());
258
+ if result.is_err() {
259
+ return Err(magnus::Error::new(
260
+ exception::runtime_error(),
261
+ format!("Failed to sanitize HTML: {}", result.unwrap_err()),
262
+ ));
263
+ }
264
+ }
265
+
266
+ Ok(output)
267
+ }
268
+
269
+ pub fn perform_handler_rewrite(
270
+ &self,
271
+ handlers: &[Handler],
272
+ html: String,
273
+ ) -> Result<Vec<u8>, magnus::Error> {
274
+ // TODO: this should ideally be done ahead of time, not on every `#rewrite` call
275
+ let mut element_content_handlers: Vec<(Cow<Selector>, ElementContentHandlers)> = vec![];
276
+
277
+ handlers.iter().for_each(|handler| {
278
+ let element_stack: Rc<RefCell<Vec<String>>> = Rc::new(RefCell::new(vec![]));
279
+
280
+ let selector = handler.rb_selector.get_static().unwrap();
281
+
282
+ // TODO: test final raise by simulating errors
283
+ if selector.match_element().is_some() {
284
+ let closure_element_stack = element_stack.clone();
285
+
286
+ element_content_handlers.push(element!(
287
+ selector.match_element().unwrap(),
288
+ move |el| {
289
+ match Self::process_element_handlers(
290
+ handler.rb_handler,
291
+ el,
292
+ &closure_element_stack.borrow(),
293
+ ) {
294
+ Ok(_) => Ok(()),
295
+ Err(err) => Err(err.to_string().into()),
296
+ }
297
+ }
298
+ ));
299
+ }
300
+
301
+ if selector.match_text_within().is_some() {
302
+ let closure_element_stack = element_stack.clone();
303
+
304
+ element_content_handlers.push(text!(
305
+ selector.match_text_within().unwrap(),
306
+ move |text| {
307
+ let element_stack = closure_element_stack.as_ref().borrow();
308
+ if selector.ignore_text_within().is_some() {
309
+ // check if current tag is a tag we should be ignoring text within
310
+ let head_tag_name = element_stack.last().unwrap().to_string();
311
+ if selector
312
+ .ignore_text_within()
313
+ .unwrap()
314
+ .iter()
315
+ .any(|f| f == &head_tag_name)
316
+ {
317
+ return Ok(());
318
+ }
319
+ }
320
+
321
+ match Self::process_text_handlers(handler.rb_handler, text) {
322
+ Ok(_) => Ok(()),
323
+ Err(err) => Err(err.to_string().into()),
324
+ }
325
+ }
326
+ ));
327
+ }
328
+
329
+ // we need to check *every* element we iterate over, to create a stack of elements
330
+ element_content_handlers.push(element!("*", move |el| {
331
+ let tag_name = el.tag_name().to_lowercase();
332
+
333
+ // no need to track self-closing tags
334
+ if Tag::tag_from_tag_name(&tag_name).self_closing {
335
+ return Ok(());
336
+ };
337
+
338
+ element_stack.as_ref().borrow_mut().push(tag_name);
339
+
340
+ let closure_element_stack = element_stack.clone();
341
+ el.on_end_tag(move |_end_tag: &mut EndTag| {
342
+ let mut stack = closure_element_stack.as_ref().borrow_mut();
343
+ stack.pop();
344
+ Ok(())
345
+ });
346
+ Ok(())
347
+ }));
348
+ });
349
+
350
+ let mut output = vec![];
351
+ {
352
+ let mut rewriter = HtmlRewriter::new(
353
+ Settings {
354
+ element_content_handlers,
355
+ ..Settings::default()
356
+ },
357
+ |c: &[u8]| output.extend_from_slice(c),
358
+ );
359
+ match rewriter.write(html.as_bytes()) {
360
+ Ok(_) => {}
361
+ Err(err) => {
362
+ return Err(magnus::Error::new(
363
+ exception::runtime_error(),
364
+ format!("{}", err),
365
+ ));
366
+ }
367
+ }
368
+ }
369
+ Ok(output)
370
+ }
371
+
372
+ fn process_element_handlers(
373
+ rb_handler: Value,
374
+ element: &mut Element,
375
+ ancestors: &Vec<String>,
376
+ ) -> Result<(), magnus::Error> {
377
+ // if `on_end_tag` function is defined, call it
378
+ if rb_handler.respond_to(Self::SELMA_ON_END_TAG, true).unwrap() {
379
+ element.on_end_tag(move |end_tag| {
380
+ let rb_end_tag = SelmaHTMLEndTag::new(end_tag);
381
+
382
+ rb_handler
383
+ .funcall::<_, _, Value>(Self::SELMA_ON_END_TAG, (rb_end_tag,))
384
+ .unwrap();
385
+ Ok(())
386
+ });
387
+ }
388
+
389
+ let rb_element = SelmaHTMLElement::new(element, ancestors);
390
+ let rb_result =
391
+ rb_handler.funcall::<_, _, Value>(Self::SELMA_HANDLE_ELEMENT, (rb_element,));
392
+ match rb_result {
393
+ Ok(_) => Ok(()),
394
+ Err(err) => Err(magnus::Error::new(
395
+ exception::runtime_error(),
396
+ format!("{}", err),
397
+ )),
398
+ }
399
+ }
400
+
401
+ fn process_text_handlers(rb_handler: Value, text: &mut TextChunk) -> Result<(), magnus::Error> {
402
+ // prevents missing `handle_text` function
403
+ let content = text.as_str();
404
+
405
+ // FIXME: why does this happen?
406
+ if content.is_empty() {
407
+ return Ok(());
408
+ }
409
+ let rb_result = rb_handler.funcall(Self::SELMA_HANDLE_TEXT, (content,));
410
+
411
+ if rb_result.is_err() {
412
+ return Err(magnus::Error::new(
413
+ exception::type_error(),
414
+ format!(
415
+ "Expected #{:?} to return a string: {:?}",
416
+ Self::SELMA_HANDLE_TEXT,
417
+ rb_result.err().unwrap()
418
+ ),
419
+ ));
420
+ }
421
+
422
+ let new_content: String = rb_result.unwrap();
423
+ // TODO: can this be an option?
424
+ text.replace(&new_content, ContentType::Html);
425
+
426
+ Ok(())
427
+ }
428
+ }
429
+
430
+ pub fn init(m_selma: RModule) -> Result<(), magnus::Error> {
431
+ let c_rewriter = m_selma
432
+ .define_class("Rewriter", Default::default())
433
+ .expect("cannot find class Selma::Rewriter");
434
+
435
+ c_rewriter.define_singleton_method("new", function!(SelmaRewriter::new, -1))?;
436
+ c_rewriter
437
+ .define_method("rewrite", method!(SelmaRewriter::rewrite, 1))
438
+ .expect("cannot define method `rewrite`");
439
+
440
+ Ok(())
441
+ }