selma 0.0.7-x86_64-linux → 0.1.0-x86_64-linux

Sign up to get free protection for your applications and to get access to all the features.
data/ext/selma/src/lib.rs DELETED
@@ -1,50 +0,0 @@
1
- extern crate core;
2
-
3
- use lol_html::html_content::ContentType;
4
- use magnus::{define_module, exception, scan_args, Error, Symbol, Value};
5
-
6
- pub mod html;
7
- pub mod native_ref_wrap;
8
- pub mod rewriter;
9
- pub mod sanitizer;
10
- pub mod selector;
11
- pub mod tags;
12
- pub mod wrapped_struct;
13
-
14
- #[allow(clippy::let_unit_value)]
15
- fn scan_text_args(args: &[Value]) -> Result<(String, ContentType), magnus::Error> {
16
- let args = scan_args::scan_args(args)?;
17
- let (text,): (String,) = args.required;
18
- let _: () = args.optional;
19
- let _: () = args.splat;
20
- let _: () = args.trailing;
21
- let _: () = args.block;
22
-
23
- let kwargs = scan_args::get_kwargs::<_, (Symbol,), (), ()>(args.keywords, &["as"], &[])?;
24
- let as_sym = kwargs.required.0;
25
- let as_sym_str = as_sym.name().unwrap();
26
- let content_type = if as_sym_str == "text" {
27
- ContentType::Text
28
- } else if as_sym_str == "html" {
29
- ContentType::Html
30
- } else {
31
- return Err(Error::new(
32
- exception::runtime_error(),
33
- format!("unknown symbol `{as_sym_str:?}`"),
34
- ));
35
- };
36
-
37
- Ok((text, content_type))
38
- }
39
-
40
- #[magnus::init]
41
- fn init() -> Result<(), Error> {
42
- let m_selma = define_module("Selma").expect("cannot define ::Selma module");
43
-
44
- sanitizer::init(m_selma).expect("cannot define Selma::Sanitizer class");
45
- rewriter::init(m_selma).expect("cannot define Selma::Rewriter class");
46
- html::init(m_selma).expect("cannot define Selma::HTML class");
47
- selector::init(m_selma).expect("cannot define Selma::Selector class");
48
-
49
- Ok(())
50
- }
@@ -1,79 +0,0 @@
1
- use std::{cell::Cell, marker::PhantomData, rc::Rc};
2
-
3
- // NOTE: My Rust isn't good enough to know what any of this does,
4
- // but it was taken from https://github.com/cloudflare/lol-html/blob/1a1ab2e2bf896f815fe8888ed78ccdf46d7c6b85/js-api/src/lib.rs#LL38
5
-
6
/// Companion token returned together with a [`NativeRefWrap`].
///
/// It shares the wrapper's `poisoned` flag and carries the lifetime `'r` of
/// the reference that was wrapped.
pub struct Anchor<'r> {
    poisoned: Rc<Cell<bool>>,
    lifetime: PhantomData<&'r mut ()>,
}

impl<'r> Anchor<'r> {
    /// Creates an anchor around an already shared `poisoned` flag.
    pub fn new(poisoned: Rc<Cell<bool>>) -> Self {
        Self {
            poisoned,
            lifetime: PhantomData,
        }
    }
}

// NOTE: the poisoning `Drop` impl below is disabled, so nothing in this module
// currently sets the flag to `true`.
// impl Drop for Anchor<'_> {
//     fn drop(&mut self) {
//         self.poisoned.replace(true);
//     }
// }

// NOTE: wasm_bindgen doesn't allow structures with lifetimes. To workaround that
// we create a wrapper that erases all the lifetime information from the inner reference
// and provides an anchor object that keeps track of the lifetime in the runtime.
//
// The design intent is that once the anchor goes out of scope the wrapper is
// poisoned and any attempt to get the inner object fails; that only happens
// while the `Drop` impl for `Anchor` is enabled.
pub struct NativeRefWrap<R> {
    inner_ptr: *mut R,
    poisoned: Rc<Cell<bool>>,
}

impl<R> NativeRefWrap<R> {
    /// Wraps a shared reference, reinterpreting the pointee as `R`.
    ///
    /// Returns the wrapper and the [`Anchor`] sharing its `poisoned` flag.
    pub fn wrap<I>(inner: &I) -> (Self, Anchor) {
        let poisoned = Rc::new(Cell::new(false));
        let anchor = Anchor::new(Rc::clone(&poisoned));

        let raw: *const I = inner;
        let wrap = NativeRefWrap {
            inner_ptr: raw as *mut R,
            poisoned,
        };

        (wrap, anchor)
    }

    /// Wraps a mutable reference, reinterpreting the pointee as `R`.
    ///
    /// Returns the wrapper and the [`Anchor`] sharing its `poisoned` flag.
    pub fn wrap_mut<I>(inner: &mut I) -> (Self, Anchor) {
        let poisoned = Rc::new(Cell::new(false));
        let anchor = Anchor::new(Rc::clone(&poisoned));

        let raw: *mut I = inner;
        let wrap = NativeRefWrap {
            inner_ptr: raw.cast::<R>(),
            poisoned,
        };

        (wrap, anchor)
    }

    /// Borrows the wrapped value, or returns an error message if the wrapper
    /// has been poisoned.
    pub fn get(&self) -> Result<&R, &'static str> {
        self.assert_not_poisoned().map(|()| {
            // SAFETY (not checked here): relies on the wrapped value still being
            // alive and usable as an `R`; only the `poisoned` flag is verified.
            let target = unsafe { self.inner_ptr.as_ref() };
            target.unwrap()
        })
    }

    /// Mutably borrows the wrapped value, or returns an error message if the
    /// wrapper has been poisoned.
    pub fn get_mut(&mut self) -> Result<&mut R, &'static str> {
        self.assert_not_poisoned().map(|()| {
            // SAFETY (not checked here): same requirements as `get`, plus the
            // caller must hold the only live access to the wrapped value.
            let target = unsafe { self.inner_ptr.as_mut() };
            target.unwrap()
        })
    }

    /// Succeeds while the shared flag is unset; otherwise yields the error text.
    fn assert_not_poisoned(&self) -> Result<(), &'static str> {
        if !self.poisoned.get() {
            return Ok(());
        }

        Err("The object has been freed and can't be used anymore.")
    }
}
@@ -1,429 +0,0 @@
1
- use lol_html::{
2
- doc_comments, doctype, element,
3
- html_content::{Element, EndTag, TextChunk},
4
- text, DocumentContentHandlers, ElementContentHandlers, HtmlRewriter, Selector, Settings,
5
- };
6
- use magnus::{exception, function, method, scan_args, Module, Object, RArray, RModule, Value};
7
-
8
- use std::{borrow::Cow, cell::RefCell, primitive::str, rc::Rc};
9
-
10
- use crate::{
11
- html::{element::SelmaHTMLElement, end_tag::SelmaHTMLEndTag, text_chunk::SelmaHTMLTextChunk},
12
- sanitizer::SelmaSanitizer,
13
- selector::SelmaSelector,
14
- tags::Tag,
15
- wrapped_struct::WrappedStruct,
16
- };
17
-
18
- #[derive(Clone, Debug)]
19
- pub struct Handler {
20
- rb_handler: Value,
21
- rb_selector: WrappedStruct<SelmaSelector>,
22
-
23
- total_element_handler_calls: usize,
24
- total_elapsed_element_handlers: f64,
25
-
26
- total_text_handler_calls: usize,
27
- total_elapsed_text_handlers: f64,
28
- }
29
-
30
- pub struct Rewriter {
31
- sanitizer: Option<SelmaSanitizer>,
32
- handlers: Vec<Handler>,
33
-
34
- total_elapsed: f64,
35
- }
36
-
37
- #[magnus::wrap(class = "Selma::Rewriter")]
38
- pub struct SelmaRewriter(std::cell::RefCell<Rewriter>);
39
-
40
- /// SAFETY: This is safe because we only access this data when the GVL is held.
41
- unsafe impl Send for SelmaRewriter {}
42
-
43
- impl SelmaRewriter {
44
/// Ruby method called, when the handler responds to it, for an element's end tag.
const SELMA_ON_END_TAG: &str = "on_end_tag";
/// Ruby method called for every element matched by a handler's selector.
const SELMA_HANDLE_ELEMENT: &str = "handle_element";
/// Ruby method called for every non-empty text chunk matched by a handler.
const SELMA_HANDLE_TEXT_CHUNK: &str = "handle_text_chunk";
47
-
48
- /// @yard
49
- /// @def new(sanitizer: Selma::Sanitizer.new(Selma::Sanitizer::Config::DEFAULT), handlers: [])
50
- /// @param sanitizer [Selma::Sanitizer] The sanitizer which performs the initial cleanup
51
- /// @param handlers [Array<Selma::Selector>] The handlers to use to perform HTML rewriting
52
- /// @return [Selma::Rewriter]
53
- fn new(args: &[Value]) -> Result<Self, magnus::Error> {
54
- let (rb_sanitizer, rb_handlers) = Self::scan_parse_args(args)?;
55
-
56
- let sanitizer = match rb_sanitizer {
57
- None => {
58
- let default_sanitizer = SelmaSanitizer::new(&[])?;
59
- let wrapped_sanitizer = WrappedStruct::from(default_sanitizer);
60
- wrapped_sanitizer.funcall::<&str, (), Value>("setup", ())?;
61
- Some(wrapped_sanitizer.get().unwrap().to_owned())
62
- }
63
- Some(sanitizer_value) => match sanitizer_value {
64
- None => None,
65
- Some(sanitizer) => {
66
- sanitizer.funcall::<&str, (), Value>("setup", ())?;
67
- Some(sanitizer.get().unwrap().to_owned())
68
- }
69
- },
70
- };
71
-
72
- let handlers = match rb_handlers {
73
- None => vec![],
74
- Some(rb_handlers) => {
75
- let mut handlers: Vec<Handler> = vec![];
76
-
77
- for h in rb_handlers.each() {
78
- let rb_handler = h.unwrap();
79
-
80
- // prevents missing #selector from ruining things
81
- if !rb_handler.respond_to("selector", true).unwrap() {
82
- let classname = unsafe { rb_handler.classname() };
83
- return Err(magnus::Error::new(
84
- exception::no_method_error(),
85
- format!(
86
- "Could not call #selector on {classname:?}; is this an object that defines it?",
87
-
88
- ),
89
- ));
90
- }
91
-
92
- let rb_selector: WrappedStruct<SelmaSelector> =
93
- match rb_handler.funcall("selector", ()) {
94
- Err(err) => {
95
- return Err(magnus::Error::new(
96
- exception::type_error(),
97
- format!("Error instantiating selector: {err:?}"),
98
- ));
99
- }
100
- Ok(rb_selector) => rb_selector,
101
- };
102
- let handler = Handler {
103
- rb_handler,
104
- rb_selector,
105
- total_element_handler_calls: 0,
106
- total_elapsed_element_handlers: 0.0,
107
-
108
- total_text_handler_calls: 0,
109
- total_elapsed_text_handlers: 0.0,
110
- };
111
- handlers.push(handler);
112
- }
113
- handlers
114
- }
115
- };
116
-
117
- if sanitizer.is_none() && handlers.is_empty() {
118
- return Err(magnus::Error::new(
119
- exception::arg_error(),
120
- "Must provide a sanitizer or a handler",
121
- ));
122
- }
123
-
124
- Ok(Self(std::cell::RefCell::new(Rewriter {
125
- sanitizer,
126
- handlers,
127
- total_elapsed: 0.0,
128
- })))
129
- }
130
-
131
- #[allow(clippy::let_unit_value)]
132
- fn scan_parse_args(
133
- args: &[Value],
134
- ) -> Result<
135
- (
136
- Option<Option<WrappedStruct<SelmaSanitizer>>>,
137
- Option<RArray>,
138
- ),
139
- magnus::Error,
140
- > {
141
- let args = scan_args::scan_args(args)?;
142
- let _: () = args.required;
143
- let _: () = args.optional;
144
- let _: () = args.splat;
145
- let _: () = args.trailing;
146
- let _: () = args.block;
147
-
148
- let kwargs = scan_args::get_kwargs::<
149
- _,
150
- (),
151
- (
152
- Option<Option<WrappedStruct<SelmaSanitizer>>>,
153
- Option<RArray>,
154
- ),
155
- (),
156
- >(args.keywords, &[], &["sanitizer", "handlers"])?;
157
- let (rb_sanitizer, rb_handlers) = kwargs.optional;
158
-
159
- Ok((rb_sanitizer, rb_handlers))
160
- }
161
-
162
- /// Perform HTML rewrite sequence.
163
- fn rewrite(&self, html: String) -> Result<String, magnus::Error> {
164
- let sanitized_html = match &self.0.borrow().sanitizer {
165
- None => Ok(html),
166
- Some(sanitizer) => {
167
- let sanitized_html = match Self::perform_sanitization(sanitizer, &html) {
168
- Ok(sanitized_html) => sanitized_html,
169
- Err(err) => return Err(err),
170
- };
171
-
172
- String::from_utf8(sanitized_html)
173
- }
174
- };
175
- let binding = self.0.borrow_mut();
176
- let handlers = &binding.handlers;
177
-
178
- match Self::perform_handler_rewrite(self, handlers, sanitized_html.unwrap()) {
179
- Ok(rewritten_html) => Ok(String::from_utf8(rewritten_html).unwrap()),
180
- Err(err) => Err(err),
181
- }
182
- }
183
-
184
- fn perform_sanitization(
185
- sanitizer: &SelmaSanitizer,
186
- html: &String,
187
- ) -> Result<Vec<u8>, magnus::Error> {
188
- let mut first_pass_html = vec![];
189
- {
190
- let mut document_content_handlers: Vec<DocumentContentHandlers> = vec![];
191
- if !sanitizer.get_allow_doctype() {
192
- document_content_handlers.push(doctype!(|d| {
193
- sanitizer.remove_doctype(d);
194
- Ok(())
195
- }));
196
- }
197
- if !sanitizer.get_allow_comments() {
198
- document_content_handlers.push(doc_comments!(|c| {
199
- sanitizer.remove_comment(c);
200
- Ok(())
201
- }));
202
- }
203
- let mut rewriter = HtmlRewriter::new(
204
- Settings {
205
- document_content_handlers,
206
- element_content_handlers: vec![element!("*", |el| {
207
- sanitizer.try_remove_element(el);
208
- if el.removed() {
209
- return Ok(());
210
- }
211
- match sanitizer.sanitize_attributes(el) {
212
- Ok(_) => Ok(()),
213
- Err(err) => Err(err.to_string().into()),
214
- }
215
- })],
216
- // TODO: allow for MemorySettings to be defined
217
- ..Settings::default()
218
- },
219
- |c: &[u8]| first_pass_html.extend_from_slice(c),
220
- );
221
-
222
- let result = rewriter.write(html.as_bytes());
223
- if result.is_err() {
224
- return Err(magnus::Error::new(
225
- exception::runtime_error(),
226
- format!("Failed to sanitize HTML: {}", result.unwrap_err()),
227
- ));
228
- }
229
- }
230
-
231
- let mut output = vec![];
232
- {
233
- let mut element_content_handlers: Vec<(Cow<Selector>, ElementContentHandlers)> = vec![];
234
- if sanitizer.get_escape_tagfilter() {
235
- element_content_handlers.push(element!(Tag::ESCAPEWORTHY_TAGS_CSS, |el| {
236
- let should_remove = sanitizer.allow_element(el);
237
- if should_remove {
238
- sanitizer.force_remove_element(el);
239
- }
240
-
241
- Ok(())
242
- }));
243
- }
244
-
245
- let mut rewriter = HtmlRewriter::new(
246
- Settings {
247
- element_content_handlers,
248
- ..Settings::default()
249
- },
250
- |c: &[u8]| output.extend_from_slice(c),
251
- );
252
-
253
- let result = rewriter.write(first_pass_html.as_slice());
254
- if result.is_err() {
255
- return Err(magnus::Error::new(
256
- exception::runtime_error(),
257
- format!("Failed to sanitize HTML: {}", result.unwrap_err()),
258
- ));
259
- }
260
- }
261
-
262
- Ok(output)
263
- }
264
-
265
- pub fn perform_handler_rewrite(
266
- &self,
267
- handlers: &[Handler],
268
- html: String,
269
- ) -> Result<Vec<u8>, magnus::Error> {
270
- // TODO: this should ideally be done ahead of time, not on every `#rewrite` call
271
- let mut element_content_handlers: Vec<(Cow<Selector>, ElementContentHandlers)> = vec![];
272
-
273
- handlers.iter().for_each(|handler| {
274
- let element_stack: Rc<RefCell<Vec<String>>> = Rc::new(RefCell::new(vec![]));
275
-
276
- let selector = handler.rb_selector.get_static().unwrap();
277
-
278
- // TODO: test final raise by simulating errors
279
- if selector.match_element().is_some() {
280
- let closure_element_stack = element_stack.clone();
281
-
282
- element_content_handlers.push(element!(
283
- selector.match_element().unwrap(),
284
- move |el| {
285
- match Self::process_element_handlers(
286
- handler.rb_handler,
287
- el,
288
- &closure_element_stack.borrow(),
289
- ) {
290
- Ok(_) => Ok(()),
291
- Err(err) => Err(err.to_string().into()),
292
- }
293
- }
294
- ));
295
- }
296
-
297
- if selector.match_text_within().is_some() {
298
- let closure_element_stack = element_stack.clone();
299
-
300
- element_content_handlers.push(text!(
301
- selector.match_text_within().unwrap(),
302
- move |text| {
303
- let element_stack = closure_element_stack.as_ref().borrow();
304
- if selector.ignore_text_within().is_some() {
305
- // check if current tag is a tag we should be ignoring text within
306
- let head_tag_name = element_stack.last().unwrap().to_string();
307
- if selector
308
- .ignore_text_within()
309
- .unwrap()
310
- .iter()
311
- .any(|f| f == &head_tag_name)
312
- {
313
- return Ok(());
314
- }
315
- }
316
-
317
- match Self::process_text_handlers(handler.rb_handler, text) {
318
- Ok(_) => Ok(()),
319
- Err(err) => Err(err.to_string().into()),
320
- }
321
- }
322
- ));
323
- }
324
-
325
- // we need to check *every* element we iterate over, to create a stack of elements
326
- element_content_handlers.push(element!("*", move |el| {
327
- let tag_name = el.tag_name().to_lowercase();
328
-
329
- // no need to track self-closing tags
330
- if Tag::tag_from_tag_name(&tag_name).self_closing {
331
- return Ok(());
332
- };
333
-
334
- element_stack.as_ref().borrow_mut().push(tag_name);
335
-
336
- let closure_element_stack = element_stack.clone();
337
- el.on_end_tag(move |_end_tag: &mut EndTag| {
338
- let mut stack = closure_element_stack.as_ref().borrow_mut();
339
- stack.pop();
340
- Ok(())
341
- })?;
342
- Ok(())
343
- }));
344
- });
345
-
346
- let mut output = vec![];
347
- {
348
- let mut rewriter = HtmlRewriter::new(
349
- Settings {
350
- element_content_handlers,
351
- ..Settings::default()
352
- },
353
- |c: &[u8]| output.extend_from_slice(c),
354
- );
355
- match rewriter.write(html.as_bytes()) {
356
- Ok(_) => {}
357
- Err(err) => {
358
- return Err(magnus::Error::new(
359
- exception::runtime_error(),
360
- format!("{err:?}"),
361
- ));
362
- }
363
- }
364
- }
365
- Ok(output)
366
- }
367
-
368
- fn process_element_handlers(
369
- rb_handler: Value,
370
- element: &mut Element,
371
- ancestors: &[String],
372
- ) -> Result<(), magnus::Error> {
373
- // if `on_end_tag` function is defined, call it
374
- if rb_handler.respond_to(Self::SELMA_ON_END_TAG, true).unwrap() {
375
- // TODO: error here is an "EndTagError"
376
- element.on_end_tag(move |end_tag| {
377
- let rb_end_tag = SelmaHTMLEndTag::new(end_tag);
378
-
379
- match rb_handler.funcall::<_, _, Value>(Self::SELMA_ON_END_TAG, (rb_end_tag,)) {
380
- Ok(_) => Ok(()),
381
- Err(err) => Err(err.to_string().into()),
382
- }
383
- });
384
- }
385
-
386
- let rb_element = SelmaHTMLElement::new(element, ancestors);
387
- let rb_result =
388
- rb_handler.funcall::<_, _, Value>(Self::SELMA_HANDLE_ELEMENT, (rb_element,));
389
- match rb_result {
390
- Ok(_) => Ok(()),
391
- Err(err) => Err(err),
392
- }
393
- }
394
-
395
- fn process_text_handlers(
396
- rb_handler: Value,
397
- text_chunk: &mut TextChunk,
398
- ) -> Result<(), magnus::Error> {
399
- // prevents missing `handle_text_chunk` function
400
- let content = text_chunk.as_str();
401
-
402
- // seems that sometimes lol-html returns blank text / EOLs?
403
- if content.is_empty() {
404
- return Ok(());
405
- }
406
-
407
- let rb_text_chunk = SelmaHTMLTextChunk::new(text_chunk);
408
- match rb_handler.funcall::<_, _, Value>(Self::SELMA_HANDLE_TEXT_CHUNK, (rb_text_chunk,)) {
409
- Ok(_) => Ok(()),
410
- Err(err) => Err(magnus::Error::new(
411
- exception::runtime_error(),
412
- format!("{err:?}"),
413
- )),
414
- }
415
- }
416
- }
417
-
418
- pub fn init(m_selma: RModule) -> Result<(), magnus::Error> {
419
- let c_rewriter = m_selma
420
- .define_class("Rewriter", Default::default())
421
- .expect("cannot find class Selma::Rewriter");
422
-
423
- c_rewriter.define_singleton_method("new", function!(SelmaRewriter::new, -1))?;
424
- c_rewriter
425
- .define_method("rewrite", method!(SelmaRewriter::rewrite, 1))
426
- .expect("cannot define method `rewrite`");
427
-
428
- Ok(())
429
- }