selma 0.0.2-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,441 @@
1
+ use std::{borrow::Cow, cell::RefCell, rc::Rc};
2
+
3
+ use lol_html::{
4
+ doc_comments, doctype, element,
5
+ html_content::{ContentType, Element, EndTag, TextChunk},
6
+ text, DocumentContentHandlers, ElementContentHandlers, HtmlRewriter, Selector, Settings,
7
+ };
8
+ use magnus::{exception, function, method, scan_args, Module, Object, RArray, RModule, Value};
9
+
10
+ use crate::{
11
+ html::{element::SelmaHTMLElement, end_tag::SelmaHTMLEndTag},
12
+ sanitizer::SelmaSanitizer,
13
+ selector::SelmaSelector,
14
+ tags::Tag,
15
+ wrapped_struct::WrappedStruct,
16
+ };
17
+
18
+ #[derive(Clone, Debug)]
19
+ pub struct Handler {
20
+ rb_handler: Value,
21
+ rb_selector: WrappedStruct<SelmaSelector>,
22
+
23
+ total_element_handler_calls: usize,
24
+ total_elapsed_element_handlers: f64,
25
+
26
+ total_text_handler_calls: usize,
27
+ total_elapsed_text_handlers: f64,
28
+ }
29
+
30
+ pub struct Rewriter {
31
+ sanitizer: Option<SelmaSanitizer>,
32
+ handlers: Vec<Handler>,
33
+
34
+ total_elapsed: f64,
35
+ }
36
+
37
+ #[magnus::wrap(class = "Selma::Rewriter")]
38
+ pub struct SelmaRewriter(std::cell::RefCell<Rewriter>);
39
+
40
+ /// SAFETY: This is safe because we only access this data when the GVL is held.
41
+ unsafe impl Send for SelmaRewriter {}
42
+
43
+ impl SelmaRewriter {
44
/// Ruby method called with the end tag of a matched element, if the handler
/// responds to it.
const SELMA_ON_END_TAG: &str = "on_end_tag";
/// Ruby method called with every element matched by the handler's selector.
const SELMA_HANDLE_ELEMENT: &str = "handle_element";
/// Ruby method called with the text of every matched text chunk.
const SELMA_HANDLE_TEXT: &str = "handle_text";
47
+
48
+ /// @yard
49
+ /// @def new(sanitizer: Selma::Sanitizer.new(Selma::Sanitizer::Config::DEFAULT), handlers: [])
50
+ /// @param sanitizer [Selma::Sanitizer] The sanitizer which performs the initial cleanup
51
+ /// @param handlers [Array<Selma::Selector>] The handlers to use to perform HTML rewriting
52
+ /// @return [Selma::Rewriter]
53
+ fn new(args: &[Value]) -> Result<Self, magnus::Error> {
54
+ let (rb_sanitizer, rb_handlers) = Self::scan_parse_args(args)?;
55
+
56
+ let sanitizer = match rb_sanitizer {
57
+ None => {
58
+ let default_sanitizer = SelmaSanitizer::new(&[])?;
59
+ let wrapped_sanitizer = WrappedStruct::from(default_sanitizer);
60
+ wrapped_sanitizer.funcall::<&str, (), Value>("setup", ())?;
61
+ Some(wrapped_sanitizer.get().unwrap().to_owned())
62
+ }
63
+ Some(sanitizer_value) => match sanitizer_value {
64
+ None => None,
65
+ Some(sanitizer) => {
66
+ sanitizer.funcall::<&str, (), Value>("setup", ())?;
67
+ Some(sanitizer.get().unwrap().to_owned())
68
+ }
69
+ },
70
+ };
71
+
72
+ let handlers = match rb_handlers {
73
+ None => vec![],
74
+ Some(rb_handlers) => {
75
+ let mut handlers: Vec<Handler> = vec![];
76
+
77
+ for h in rb_handlers.each() {
78
+ let rb_handler = h.unwrap();
79
+
80
+ // prevents missing #selector from ruining things
81
+ if !rb_handler.respond_to("selector", true).unwrap() {
82
+ let classname = unsafe { rb_handler.classname() };
83
+ return Err(magnus::Error::new(
84
+ exception::no_method_error(),
85
+ format!(
86
+ "Could not call #selector on {:?}; is this an object that defines it?",
87
+ classname
88
+ ),
89
+ ));
90
+ }
91
+
92
+ let rb_selector: WrappedStruct<SelmaSelector> =
93
+ match rb_handler.funcall("selector", ()) {
94
+ Err(e) => {
95
+ return Err(magnus::Error::new(
96
+ exception::type_error(),
97
+ format!("Error instantiating selector: {}", e),
98
+ ));
99
+ }
100
+ Ok(rb_selector) => rb_selector,
101
+ };
102
+ let handler = Handler {
103
+ rb_handler,
104
+ rb_selector,
105
+ total_element_handler_calls: 0,
106
+ total_elapsed_element_handlers: 0.0,
107
+
108
+ total_text_handler_calls: 0,
109
+ total_elapsed_text_handlers: 0.0,
110
+ };
111
+ handlers.push(handler);
112
+ }
113
+ handlers
114
+ }
115
+ };
116
+
117
+ if sanitizer.is_none() && handlers.is_empty() {
118
+ return Err(magnus::Error::new(
119
+ exception::arg_error(),
120
+ "Must provide a sanitizer or a handler",
121
+ ));
122
+ }
123
+
124
+ Ok(Self(std::cell::RefCell::new(Rewriter {
125
+ sanitizer,
126
+ handlers,
127
+ total_elapsed: 0.0,
128
+ })))
129
+ }
130
+
131
+ #[allow(clippy::let_unit_value)]
132
+ fn scan_parse_args(
133
+ args: &[Value],
134
+ ) -> Result<
135
+ (
136
+ Option<Option<WrappedStruct<SelmaSanitizer>>>,
137
+ Option<RArray>,
138
+ ),
139
+ magnus::Error,
140
+ > {
141
+ let args = scan_args::scan_args(args)?;
142
+ let _: () = args.required;
143
+ let _: () = args.optional;
144
+ let _: () = args.splat;
145
+ let _: () = args.trailing;
146
+ let _: () = args.block;
147
+
148
+ let kw = scan_args::get_kwargs::<
149
+ _,
150
+ (),
151
+ (
152
+ Option<Option<WrappedStruct<SelmaSanitizer>>>,
153
+ Option<RArray>,
154
+ ),
155
+ (),
156
+ >(args.keywords, &[], &["sanitizer", "handlers"])?;
157
+ let (rb_sanitizer, rb_handlers) = kw.optional;
158
+
159
+ Ok((rb_sanitizer, rb_handlers))
160
+ }
161
+
162
+ /// Perform HTML rewrite sequence.
163
+ fn rewrite(&self, html: String) -> Result<String, magnus::Error> {
164
+ let sanitized_html = match &self.0.borrow().sanitizer {
165
+ None => html,
166
+ Some(sanitizer) => {
167
+ // let first_pass_html = Self::perform_initial_sanitization(sanitizer, &html).unwrap();
168
+
169
+ // due to malicious html crafting
170
+ // (e.g. <<foo>script>...</script>, or <div <!-- comment -->> as in tests),
171
+ // we need to run sanitization several times to truly remove unwanted tags,
172
+ // because lol-html happily accepts this garbage (by design?)
173
+ let sanitized_html = Self::perform_sanitization(sanitizer, &html).unwrap();
174
+
175
+ String::from_utf8(sanitized_html).unwrap()
176
+ }
177
+ };
178
+ let binding = self.0.borrow_mut();
179
+ let handlers = &binding.handlers;
180
+
181
+ match Self::perform_handler_rewrite(self, handlers, sanitized_html) {
182
+ Ok(rewritten_html) => Ok(String::from_utf8(rewritten_html).unwrap()),
183
+ Err(err) => Err(magnus::Error::new(
184
+ exception::runtime_error(),
185
+ format!("{}", err),
186
+ )),
187
+ }
188
+ }
189
+
190
+ fn perform_sanitization(
191
+ sanitizer: &SelmaSanitizer,
192
+ html: &String,
193
+ ) -> Result<Vec<u8>, magnus::Error> {
194
+ let mut first_pass_html = vec![];
195
+ {
196
+ let mut document_content_handlers: Vec<DocumentContentHandlers> = vec![];
197
+ if !sanitizer.get_allow_doctype() {
198
+ document_content_handlers.push(doctype!(|d| {
199
+ sanitizer.remove_doctype(d);
200
+ Ok(())
201
+ }));
202
+ }
203
+ if !sanitizer.get_allow_comments() {
204
+ document_content_handlers.push(doc_comments!(|c| {
205
+ sanitizer.remove_comment(c);
206
+ Ok(())
207
+ }));
208
+ }
209
+ let mut rewriter = HtmlRewriter::new(
210
+ Settings {
211
+ document_content_handlers,
212
+ element_content_handlers: vec![element!("*", |el| {
213
+ sanitizer.try_remove_element(el);
214
+ if el.removed() {
215
+ return Ok(());
216
+ }
217
+ sanitizer.sanitize_attributes(el);
218
+
219
+ Ok(())
220
+ })],
221
+ ..Settings::default()
222
+ },
223
+ |c: &[u8]| first_pass_html.extend_from_slice(c),
224
+ );
225
+
226
+ let result = rewriter.write(html.as_bytes());
227
+ if result.is_err() {
228
+ return Err(magnus::Error::new(
229
+ exception::runtime_error(),
230
+ format!("Failed to sanitize HTML: {}", result.unwrap_err()),
231
+ ));
232
+ }
233
+ }
234
+
235
+ let mut output = vec![];
236
+ {
237
+ let mut element_content_handlers: Vec<(Cow<Selector>, ElementContentHandlers)> = vec![];
238
+ if sanitizer.get_escape_tagfilter() {
239
+ element_content_handlers.push(element!(Tag::ESCAPEWORTHY_TAGS_CSS, |el| {
240
+ let should_remove = sanitizer.allow_element(el);
241
+ if should_remove {
242
+ sanitizer.force_remove_element(el);
243
+ }
244
+
245
+ Ok(())
246
+ }));
247
+ }
248
+
249
+ let mut rewriter = HtmlRewriter::new(
250
+ Settings {
251
+ element_content_handlers,
252
+ ..Settings::default()
253
+ },
254
+ |c: &[u8]| output.extend_from_slice(c),
255
+ );
256
+
257
+ let result = rewriter.write(first_pass_html.as_slice());
258
+ if result.is_err() {
259
+ return Err(magnus::Error::new(
260
+ exception::runtime_error(),
261
+ format!("Failed to sanitize HTML: {}", result.unwrap_err()),
262
+ ));
263
+ }
264
+ }
265
+
266
+ Ok(output)
267
+ }
268
+
269
+ pub fn perform_handler_rewrite(
270
+ &self,
271
+ handlers: &[Handler],
272
+ html: String,
273
+ ) -> Result<Vec<u8>, magnus::Error> {
274
+ // TODO: this should ideally be done ahead of time, not on every `#rewrite` call
275
+ let mut element_content_handlers: Vec<(Cow<Selector>, ElementContentHandlers)> = vec![];
276
+
277
+ handlers.iter().for_each(|handler| {
278
+ let element_stack: Rc<RefCell<Vec<String>>> = Rc::new(RefCell::new(vec![]));
279
+
280
+ let selector = handler.rb_selector.get_static().unwrap();
281
+
282
+ // TODO: test final raise by simulating errors
283
+ if selector.match_element().is_some() {
284
+ let closure_element_stack = element_stack.clone();
285
+
286
+ element_content_handlers.push(element!(
287
+ selector.match_element().unwrap(),
288
+ move |el| {
289
+ match Self::process_element_handlers(
290
+ handler.rb_handler,
291
+ el,
292
+ &closure_element_stack.borrow(),
293
+ ) {
294
+ Ok(_) => Ok(()),
295
+ Err(err) => Err(err.to_string().into()),
296
+ }
297
+ }
298
+ ));
299
+ }
300
+
301
+ if selector.match_text_within().is_some() {
302
+ let closure_element_stack = element_stack.clone();
303
+
304
+ element_content_handlers.push(text!(
305
+ selector.match_text_within().unwrap(),
306
+ move |text| {
307
+ let element_stack = closure_element_stack.as_ref().borrow();
308
+ if selector.ignore_text_within().is_some() {
309
+ // check if current tag is a tag we should be ignoring text within
310
+ let head_tag_name = element_stack.last().unwrap().to_string();
311
+ if selector
312
+ .ignore_text_within()
313
+ .unwrap()
314
+ .iter()
315
+ .any(|f| f == &head_tag_name)
316
+ {
317
+ return Ok(());
318
+ }
319
+ }
320
+
321
+ match Self::process_text_handlers(handler.rb_handler, text) {
322
+ Ok(_) => Ok(()),
323
+ Err(err) => Err(err.to_string().into()),
324
+ }
325
+ }
326
+ ));
327
+ }
328
+
329
+ // we need to check *every* element we iterate over, to create a stack of elements
330
+ element_content_handlers.push(element!("*", move |el| {
331
+ let tag_name = el.tag_name().to_lowercase();
332
+
333
+ // no need to track self-closing tags
334
+ if Tag::tag_from_tag_name(&tag_name).self_closing {
335
+ return Ok(());
336
+ };
337
+
338
+ element_stack.as_ref().borrow_mut().push(tag_name);
339
+
340
+ let closure_element_stack = element_stack.clone();
341
+ el.on_end_tag(move |_end_tag: &mut EndTag| {
342
+ let mut stack = closure_element_stack.as_ref().borrow_mut();
343
+ stack.pop();
344
+ Ok(())
345
+ });
346
+ Ok(())
347
+ }));
348
+ });
349
+
350
+ let mut output = vec![];
351
+ {
352
+ let mut rewriter = HtmlRewriter::new(
353
+ Settings {
354
+ element_content_handlers,
355
+ ..Settings::default()
356
+ },
357
+ |c: &[u8]| output.extend_from_slice(c),
358
+ );
359
+ match rewriter.write(html.as_bytes()) {
360
+ Ok(_) => {}
361
+ Err(err) => {
362
+ return Err(magnus::Error::new(
363
+ exception::runtime_error(),
364
+ format!("{}", err),
365
+ ));
366
+ }
367
+ }
368
+ }
369
+ Ok(output)
370
+ }
371
+
372
+ fn process_element_handlers(
373
+ rb_handler: Value,
374
+ element: &mut Element,
375
+ ancestors: &Vec<String>,
376
+ ) -> Result<(), magnus::Error> {
377
+ // if `on_end_tag` function is defined, call it
378
+ if rb_handler.respond_to(Self::SELMA_ON_END_TAG, true).unwrap() {
379
+ element.on_end_tag(move |end_tag| {
380
+ let rb_end_tag = SelmaHTMLEndTag::new(end_tag);
381
+
382
+ rb_handler
383
+ .funcall::<_, _, Value>(Self::SELMA_ON_END_TAG, (rb_end_tag,))
384
+ .unwrap();
385
+ Ok(())
386
+ });
387
+ }
388
+
389
+ let rb_element = SelmaHTMLElement::new(element, ancestors);
390
+ let rb_result =
391
+ rb_handler.funcall::<_, _, Value>(Self::SELMA_HANDLE_ELEMENT, (rb_element,));
392
+ match rb_result {
393
+ Ok(_) => Ok(()),
394
+ Err(err) => Err(magnus::Error::new(
395
+ exception::runtime_error(),
396
+ format!("{}", err),
397
+ )),
398
+ }
399
+ }
400
+
401
+ fn process_text_handlers(rb_handler: Value, text: &mut TextChunk) -> Result<(), magnus::Error> {
402
+ // prevents missing `handle_text` function
403
+ let content = text.as_str();
404
+
405
+ // FIXME: why does this happen?
406
+ if content.is_empty() {
407
+ return Ok(());
408
+ }
409
+ let rb_result = rb_handler.funcall(Self::SELMA_HANDLE_TEXT, (content,));
410
+
411
+ if rb_result.is_err() {
412
+ return Err(magnus::Error::new(
413
+ exception::type_error(),
414
+ format!(
415
+ "Expected #{:?} to return a string: {:?}",
416
+ Self::SELMA_HANDLE_TEXT,
417
+ rb_result.err().unwrap()
418
+ ),
419
+ ));
420
+ }
421
+
422
+ let new_content: String = rb_result.unwrap();
423
+ // TODO: can this be an option?
424
+ text.replace(&new_content, ContentType::Html);
425
+
426
+ Ok(())
427
+ }
428
+ }
429
+
430
+ pub fn init(m_selma: RModule) -> Result<(), magnus::Error> {
431
+ let c_rewriter = m_selma
432
+ .define_class("Rewriter", Default::default())
433
+ .expect("cannot find class Selma::Rewriter");
434
+
435
+ c_rewriter.define_singleton_method("new", function!(SelmaRewriter::new, -1))?;
436
+ c_rewriter
437
+ .define_method("rewrite", method!(SelmaRewriter::rewrite, 1))
438
+ .expect("cannot define method `rewrite`");
439
+
440
+ Ok(())
441
+ }