selma 0.0.6-arm64-darwin → 0.1.0-arm64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,429 +0,0 @@
1
- use lol_html::{
2
- doc_comments, doctype, element,
3
- html_content::{Element, EndTag, TextChunk},
4
- text, DocumentContentHandlers, ElementContentHandlers, HtmlRewriter, Selector, Settings,
5
- };
6
- use magnus::{exception, function, method, scan_args, Module, Object, RArray, RModule, Value};
7
-
8
- use std::{borrow::Cow, cell::RefCell, primitive::str, rc::Rc};
9
-
10
- use crate::{
11
- html::{element::SelmaHTMLElement, end_tag::SelmaHTMLEndTag, text_chunk::SelmaHTMLTextChunk},
12
- sanitizer::SelmaSanitizer,
13
- selector::SelmaSelector,
14
- tags::Tag,
15
- wrapped_struct::WrappedStruct,
16
- };
17
-
18
- #[derive(Clone, Debug)]
19
- pub struct Handler {
20
- rb_handler: Value,
21
- rb_selector: WrappedStruct<SelmaSelector>,
22
-
23
- total_element_handler_calls: usize,
24
- total_elapsed_element_handlers: f64,
25
-
26
- total_text_handler_calls: usize,
27
- total_elapsed_text_handlers: f64,
28
- }
29
-
30
- pub struct Rewriter {
31
- sanitizer: Option<SelmaSanitizer>,
32
- handlers: Vec<Handler>,
33
-
34
- total_elapsed: f64,
35
- }
36
-
37
- #[magnus::wrap(class = "Selma::Rewriter")]
38
- pub struct SelmaRewriter(std::cell::RefCell<Rewriter>);
39
-
40
- /// SAFETY: This is safe because we only access this data when the GVL is held.
41
- unsafe impl Send for SelmaRewriter {}
42
-
43
- impl SelmaRewriter {
44
/// Name of the optional Ruby handler method called when an element's end tag is reached.
const SELMA_ON_END_TAG: &'static str = "on_end_tag";
/// Name of the Ruby handler method called for each element matched by the selector.
const SELMA_HANDLE_ELEMENT: &'static str = "handle_element";
/// Name of the Ruby handler method called for each non-empty text chunk.
const SELMA_HANDLE_TEXT_CHUNK: &'static str = "handle_text_chunk";
47
-
48
- /// @yard
49
- /// @def new(sanitizer: Selma::Sanitizer.new(Selma::Sanitizer::Config::DEFAULT), handlers: [])
50
- /// @param sanitizer [Selma::Sanitizer] The sanitizer which performs the initial cleanup
51
- /// @param handlers [Array<Selma::Selector>] The handlers to use to perform HTML rewriting
52
- /// @return [Selma::Rewriter]
53
- fn new(args: &[Value]) -> Result<Self, magnus::Error> {
54
- let (rb_sanitizer, rb_handlers) = Self::scan_parse_args(args)?;
55
-
56
- let sanitizer = match rb_sanitizer {
57
- None => {
58
- let default_sanitizer = SelmaSanitizer::new(&[])?;
59
- let wrapped_sanitizer = WrappedStruct::from(default_sanitizer);
60
- wrapped_sanitizer.funcall::<&str, (), Value>("setup", ())?;
61
- Some(wrapped_sanitizer.get().unwrap().to_owned())
62
- }
63
- Some(sanitizer_value) => match sanitizer_value {
64
- None => None,
65
- Some(sanitizer) => {
66
- sanitizer.funcall::<&str, (), Value>("setup", ())?;
67
- Some(sanitizer.get().unwrap().to_owned())
68
- }
69
- },
70
- };
71
-
72
- let handlers = match rb_handlers {
73
- None => vec![],
74
- Some(rb_handlers) => {
75
- let mut handlers: Vec<Handler> = vec![];
76
-
77
- for h in rb_handlers.each() {
78
- let rb_handler = h.unwrap();
79
-
80
- // prevents missing #selector from ruining things
81
- if !rb_handler.respond_to("selector", true).unwrap() {
82
- let classname = unsafe { rb_handler.classname() };
83
- return Err(magnus::Error::new(
84
- exception::no_method_error(),
85
- format!(
86
- "Could not call #selector on {classname:?}; is this an object that defines it?",
87
-
88
- ),
89
- ));
90
- }
91
-
92
- let rb_selector: WrappedStruct<SelmaSelector> =
93
- match rb_handler.funcall("selector", ()) {
94
- Err(err) => {
95
- return Err(magnus::Error::new(
96
- exception::type_error(),
97
- format!("Error instantiating selector: {err:?}"),
98
- ));
99
- }
100
- Ok(rb_selector) => rb_selector,
101
- };
102
- let handler = Handler {
103
- rb_handler,
104
- rb_selector,
105
- total_element_handler_calls: 0,
106
- total_elapsed_element_handlers: 0.0,
107
-
108
- total_text_handler_calls: 0,
109
- total_elapsed_text_handlers: 0.0,
110
- };
111
- handlers.push(handler);
112
- }
113
- handlers
114
- }
115
- };
116
-
117
- if sanitizer.is_none() && handlers.is_empty() {
118
- return Err(magnus::Error::new(
119
- exception::arg_error(),
120
- "Must provide a sanitizer or a handler",
121
- ));
122
- }
123
-
124
- Ok(Self(std::cell::RefCell::new(Rewriter {
125
- sanitizer,
126
- handlers,
127
- total_elapsed: 0.0,
128
- })))
129
- }
130
-
131
- #[allow(clippy::let_unit_value)]
132
- fn scan_parse_args(
133
- args: &[Value],
134
- ) -> Result<
135
- (
136
- Option<Option<WrappedStruct<SelmaSanitizer>>>,
137
- Option<RArray>,
138
- ),
139
- magnus::Error,
140
- > {
141
- let args = scan_args::scan_args(args)?;
142
- let _: () = args.required;
143
- let _: () = args.optional;
144
- let _: () = args.splat;
145
- let _: () = args.trailing;
146
- let _: () = args.block;
147
-
148
- let kwargs = scan_args::get_kwargs::<
149
- _,
150
- (),
151
- (
152
- Option<Option<WrappedStruct<SelmaSanitizer>>>,
153
- Option<RArray>,
154
- ),
155
- (),
156
- >(args.keywords, &[], &["sanitizer", "handlers"])?;
157
- let (rb_sanitizer, rb_handlers) = kwargs.optional;
158
-
159
- Ok((rb_sanitizer, rb_handlers))
160
- }
161
-
162
- /// Perform HTML rewrite sequence.
163
- fn rewrite(&self, html: String) -> Result<String, magnus::Error> {
164
- let sanitized_html = match &self.0.borrow().sanitizer {
165
- None => Ok(html),
166
- Some(sanitizer) => {
167
- let sanitized_html = match Self::perform_sanitization(sanitizer, &html) {
168
- Ok(sanitized_html) => sanitized_html,
169
- Err(err) => return Err(err),
170
- };
171
-
172
- String::from_utf8(sanitized_html)
173
- }
174
- };
175
- let binding = self.0.borrow_mut();
176
- let handlers = &binding.handlers;
177
-
178
- match Self::perform_handler_rewrite(self, handlers, sanitized_html.unwrap()) {
179
- Ok(rewritten_html) => Ok(String::from_utf8(rewritten_html).unwrap()),
180
- Err(err) => Err(err),
181
- }
182
- }
183
-
184
- fn perform_sanitization(
185
- sanitizer: &SelmaSanitizer,
186
- html: &String,
187
- ) -> Result<Vec<u8>, magnus::Error> {
188
- let mut first_pass_html = vec![];
189
- {
190
- let mut document_content_handlers: Vec<DocumentContentHandlers> = vec![];
191
- if !sanitizer.get_allow_doctype() {
192
- document_content_handlers.push(doctype!(|d| {
193
- sanitizer.remove_doctype(d);
194
- Ok(())
195
- }));
196
- }
197
- if !sanitizer.get_allow_comments() {
198
- document_content_handlers.push(doc_comments!(|c| {
199
- sanitizer.remove_comment(c);
200
- Ok(())
201
- }));
202
- }
203
- let mut rewriter = HtmlRewriter::new(
204
- Settings {
205
- document_content_handlers,
206
- element_content_handlers: vec![element!("*", |el| {
207
- sanitizer.try_remove_element(el);
208
- if el.removed() {
209
- return Ok(());
210
- }
211
- match sanitizer.sanitize_attributes(el) {
212
- Ok(_) => Ok(()),
213
- Err(err) => Err(err.to_string().into()),
214
- }
215
- })],
216
- // TODO: allow for MemorySettings to be defined
217
- ..Settings::default()
218
- },
219
- |c: &[u8]| first_pass_html.extend_from_slice(c),
220
- );
221
-
222
- let result = rewriter.write(html.as_bytes());
223
- if result.is_err() {
224
- return Err(magnus::Error::new(
225
- exception::runtime_error(),
226
- format!("Failed to sanitize HTML: {}", result.unwrap_err()),
227
- ));
228
- }
229
- }
230
-
231
- let mut output = vec![];
232
- {
233
- let mut element_content_handlers: Vec<(Cow<Selector>, ElementContentHandlers)> = vec![];
234
- if sanitizer.get_escape_tagfilter() {
235
- element_content_handlers.push(element!(Tag::ESCAPEWORTHY_TAGS_CSS, |el| {
236
- let should_remove = sanitizer.allow_element(el);
237
- if should_remove {
238
- sanitizer.force_remove_element(el);
239
- }
240
-
241
- Ok(())
242
- }));
243
- }
244
-
245
- let mut rewriter = HtmlRewriter::new(
246
- Settings {
247
- element_content_handlers,
248
- ..Settings::default()
249
- },
250
- |c: &[u8]| output.extend_from_slice(c),
251
- );
252
-
253
- let result = rewriter.write(first_pass_html.as_slice());
254
- if result.is_err() {
255
- return Err(magnus::Error::new(
256
- exception::runtime_error(),
257
- format!("Failed to sanitize HTML: {}", result.unwrap_err()),
258
- ));
259
- }
260
- }
261
-
262
- Ok(output)
263
- }
264
-
265
- pub fn perform_handler_rewrite(
266
- &self,
267
- handlers: &[Handler],
268
- html: String,
269
- ) -> Result<Vec<u8>, magnus::Error> {
270
- // TODO: this should ideally be done ahead of time, not on every `#rewrite` call
271
- let mut element_content_handlers: Vec<(Cow<Selector>, ElementContentHandlers)> = vec![];
272
-
273
- handlers.iter().for_each(|handler| {
274
- let element_stack: Rc<RefCell<Vec<String>>> = Rc::new(RefCell::new(vec![]));
275
-
276
- let selector = handler.rb_selector.get_static().unwrap();
277
-
278
- // TODO: test final raise by simulating errors
279
- if selector.match_element().is_some() {
280
- let closure_element_stack = element_stack.clone();
281
-
282
- element_content_handlers.push(element!(
283
- selector.match_element().unwrap(),
284
- move |el| {
285
- match Self::process_element_handlers(
286
- handler.rb_handler,
287
- el,
288
- &closure_element_stack.borrow(),
289
- ) {
290
- Ok(_) => Ok(()),
291
- Err(err) => Err(err.to_string().into()),
292
- }
293
- }
294
- ));
295
- }
296
-
297
- if selector.match_text_within().is_some() {
298
- let closure_element_stack = element_stack.clone();
299
-
300
- element_content_handlers.push(text!(
301
- selector.match_text_within().unwrap(),
302
- move |text| {
303
- let element_stack = closure_element_stack.as_ref().borrow();
304
- if selector.ignore_text_within().is_some() {
305
- // check if current tag is a tag we should be ignoring text within
306
- let head_tag_name = element_stack.last().unwrap().to_string();
307
- if selector
308
- .ignore_text_within()
309
- .unwrap()
310
- .iter()
311
- .any(|f| f == &head_tag_name)
312
- {
313
- return Ok(());
314
- }
315
- }
316
-
317
- match Self::process_text_handlers(handler.rb_handler, text) {
318
- Ok(_) => Ok(()),
319
- Err(err) => Err(err.to_string().into()),
320
- }
321
- }
322
- ));
323
- }
324
-
325
- // we need to check *every* element we iterate over, to create a stack of elements
326
- element_content_handlers.push(element!("*", move |el| {
327
- let tag_name = el.tag_name().to_lowercase();
328
-
329
- // no need to track self-closing tags
330
- if Tag::tag_from_tag_name(&tag_name).self_closing {
331
- return Ok(());
332
- };
333
-
334
- element_stack.as_ref().borrow_mut().push(tag_name);
335
-
336
- let closure_element_stack = element_stack.clone();
337
- el.on_end_tag(move |_end_tag: &mut EndTag| {
338
- let mut stack = closure_element_stack.as_ref().borrow_mut();
339
- stack.pop();
340
- Ok(())
341
- })?;
342
- Ok(())
343
- }));
344
- });
345
-
346
- let mut output = vec![];
347
- {
348
- let mut rewriter = HtmlRewriter::new(
349
- Settings {
350
- element_content_handlers,
351
- ..Settings::default()
352
- },
353
- |c: &[u8]| output.extend_from_slice(c),
354
- );
355
- match rewriter.write(html.as_bytes()) {
356
- Ok(_) => {}
357
- Err(err) => {
358
- return Err(magnus::Error::new(
359
- exception::runtime_error(),
360
- format!("{err:?}"),
361
- ));
362
- }
363
- }
364
- }
365
- Ok(output)
366
- }
367
-
368
- fn process_element_handlers(
369
- rb_handler: Value,
370
- element: &mut Element,
371
- ancestors: &[String],
372
- ) -> Result<(), magnus::Error> {
373
- // if `on_end_tag` function is defined, call it
374
- if rb_handler.respond_to(Self::SELMA_ON_END_TAG, true).unwrap() {
375
- // TODO: error here is an "EndTagError"
376
- element.on_end_tag(move |end_tag| {
377
- let rb_end_tag = SelmaHTMLEndTag::new(end_tag);
378
-
379
- match rb_handler.funcall::<_, _, Value>(Self::SELMA_ON_END_TAG, (rb_end_tag,)) {
380
- Ok(_) => Ok(()),
381
- Err(err) => Err(err.to_string().into()),
382
- }
383
- });
384
- }
385
-
386
- let rb_element = SelmaHTMLElement::new(element, ancestors);
387
- let rb_result =
388
- rb_handler.funcall::<_, _, Value>(Self::SELMA_HANDLE_ELEMENT, (rb_element,));
389
- match rb_result {
390
- Ok(_) => Ok(()),
391
- Err(err) => Err(err),
392
- }
393
- }
394
-
395
- fn process_text_handlers(
396
- rb_handler: Value,
397
- text_chunk: &mut TextChunk,
398
- ) -> Result<(), magnus::Error> {
399
- // prevents missing `handle_text_chunk` function
400
- let content = text_chunk.as_str();
401
-
402
- // seems that sometimes lol-html returns blank text / EOLs?
403
- if content.is_empty() {
404
- return Ok(());
405
- }
406
-
407
- let rb_text_chunk = SelmaHTMLTextChunk::new(text_chunk);
408
- match rb_handler.funcall::<_, _, Value>(Self::SELMA_HANDLE_TEXT_CHUNK, (rb_text_chunk,)) {
409
- Ok(_) => Ok(()),
410
- Err(err) => Err(magnus::Error::new(
411
- exception::runtime_error(),
412
- format!("{err:?}"),
413
- )),
414
- }
415
- }
416
- }
417
-
418
- pub fn init(m_selma: RModule) -> Result<(), magnus::Error> {
419
- let c_rewriter = m_selma
420
- .define_class("Rewriter", Default::default())
421
- .expect("cannot find class Selma::Rewriter");
422
-
423
- c_rewriter.define_singleton_method("new", function!(SelmaRewriter::new, -1))?;
424
- c_rewriter
425
- .define_method("rewrite", method!(SelmaRewriter::rewrite, 1))
426
- .expect("cannot define method `rewrite`");
427
-
428
- Ok(())
429
- }