selma 0.2.2 → 0.4.10
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Cargo.lock +129 -124
- data/README.md +133 -25
- data/ext/selma/Cargo.toml +6 -3
- data/ext/selma/src/html/element.rs +32 -27
- data/ext/selma/src/html/end_tag.rs +5 -5
- data/ext/selma/src/html/text_chunk.rs +55 -12
- data/ext/selma/src/native_ref_wrap.rs +30 -33
- data/ext/selma/src/rewriter.rs +299 -139
- data/ext/selma/src/sanitizer.rs +256 -138
- data/lib/selma/config.rb +12 -0
- data/lib/selma/html/element.rb +11 -0
- data/lib/selma/html.rb +2 -0
- data/lib/selma/sanitizer/config/default.rb +1 -1
- data/lib/selma/sanitizer/config/relaxed.rb +1 -0
- data/lib/selma/sanitizer/config.rb +2 -2
- data/lib/selma/sanitizer.rb +0 -77
- data/lib/selma/version.rb +1 -1
- metadata +9 -7
data/ext/selma/src/rewriter.rs
CHANGED
@@ -1,19 +1,31 @@
|
|
1
1
|
use lol_html::{
|
2
2
|
doc_comments, doctype, element,
|
3
3
|
html_content::{Element, TextChunk},
|
4
|
-
text, DocumentContentHandlers, ElementContentHandlers, HtmlRewriter,
|
4
|
+
text, DocumentContentHandlers, ElementContentHandlers, HtmlRewriter, MemorySettings, Selector,
|
5
|
+
Settings,
|
5
6
|
};
|
6
7
|
use magnus::{
|
7
|
-
exception, function,
|
8
|
+
exception, function, gc, method,
|
9
|
+
r_hash::ForEach,
|
10
|
+
scan_args,
|
8
11
|
typed_data::Obj,
|
9
12
|
value::{Opaque, ReprValue},
|
10
|
-
Module, Object, RArray, RModule, Ruby,
|
13
|
+
DataTypeFunctions, Integer, IntoValue, Module, Object, RArray, RHash, RModule, Ruby, Symbol,
|
14
|
+
TypedData, Value,
|
11
15
|
};
|
12
16
|
|
13
|
-
use std::{
|
17
|
+
use std::{
|
18
|
+
borrow::Cow,
|
19
|
+
cell::{Ref, RefCell},
|
20
|
+
mem,
|
21
|
+
ops::Deref,
|
22
|
+
primitive::str,
|
23
|
+
rc::Rc,
|
24
|
+
};
|
14
25
|
|
15
26
|
use crate::{
|
16
27
|
html::{element::SelmaHTMLElement, end_tag::SelmaHTMLEndTag, text_chunk::SelmaHTMLTextChunk},
|
28
|
+
native_ref_wrap::NativeRefWrap,
|
17
29
|
sanitizer::SelmaSanitizer,
|
18
30
|
selector::SelmaSelector,
|
19
31
|
tags::Tag,
|
@@ -30,16 +42,34 @@ pub struct Handler {
|
|
30
42
|
// total_elapsed_text_handlers: f64,
|
31
43
|
}
|
32
44
|
|
45
|
+
struct RewriterOptions {
|
46
|
+
memory_options: MemorySettings,
|
47
|
+
}
|
48
|
+
|
33
49
|
pub struct Rewriter {
|
34
50
|
sanitizer: Option<SelmaSanitizer>,
|
35
51
|
handlers: Vec<Handler>,
|
52
|
+
options: RewriterOptions,
|
36
53
|
// total_elapsed: f64,
|
37
54
|
}
|
38
55
|
|
39
|
-
#[
|
56
|
+
#[derive(TypedData)]
|
57
|
+
#[magnus(class = "Selma::Rewriter", free_immediately, mark)]
|
40
58
|
pub struct SelmaRewriter(std::cell::RefCell<Rewriter>);
|
41
59
|
|
42
|
-
|
60
|
+
impl DataTypeFunctions for SelmaRewriter {
|
61
|
+
fn mark(&self, marker: &gc::Marker) {
|
62
|
+
self.0.borrow().handlers.iter().for_each(|handler| {
|
63
|
+
marker.mark(handler.rb_handler);
|
64
|
+
});
|
65
|
+
}
|
66
|
+
}
|
67
|
+
|
68
|
+
type RewriterValues = (
|
69
|
+
Option<Option<Obj<SelmaSanitizer>>>,
|
70
|
+
Option<RArray>,
|
71
|
+
Option<RHash>,
|
72
|
+
);
|
43
73
|
|
44
74
|
impl SelmaRewriter {
|
45
75
|
const SELMA_ON_END_TAG: &'static str = "on_end_tag";
|
@@ -50,25 +80,20 @@ impl SelmaRewriter {
|
|
50
80
|
/// @def new(sanitizer: Selma::Sanitizer.new(Selma::Sanitizer::Config::DEFAULT), handlers: [])
|
51
81
|
/// @param sanitizer [Selma::Sanitizer] The sanitizer which performs the initial cleanup
|
52
82
|
/// @param handlers [Array<Selma::Selector>] The handlers to use to perform HTML rewriting
|
83
|
+
/// @param options [Hash] Any additional options to pass to the rewriter
|
53
84
|
/// @return [Selma::Rewriter]
|
54
85
|
fn new(args: &[Value]) -> Result<Self, magnus::Error> {
|
55
|
-
let (rb_sanitizer, rb_handlers) = Self::scan_parse_args(args)?;
|
86
|
+
let (rb_sanitizer, rb_handlers, rb_options) = Self::scan_parse_args(args)?;
|
56
87
|
|
57
88
|
let sanitizer = match rb_sanitizer {
|
58
89
|
None => {
|
59
|
-
// no `sanitizer:` provided, use default
|
90
|
+
// no `sanitizer:` kwarg provided, use default
|
60
91
|
let default_sanitizer = SelmaSanitizer::new(&[])?;
|
61
92
|
let wrapped_sanitizer = Obj::wrap(default_sanitizer);
|
62
|
-
wrapped_sanitizer.funcall::<&str, (), Value>("setup", ())?;
|
63
|
-
Some(wrapped_sanitizer.
|
93
|
+
// wrapped_sanitizer.funcall::<&str, (), Value>("setup", ())?;
|
94
|
+
Some(wrapped_sanitizer.deref().to_owned())
|
64
95
|
}
|
65
|
-
Some(sanitizer_value) =>
|
66
|
-
None => None, // no `sanitizer:` provided, use default
|
67
|
-
Some(sanitizer) => {
|
68
|
-
sanitizer.funcall::<&str, (), Value>("setup", ())?;
|
69
|
-
Some(sanitizer.get().to_owned())
|
70
|
-
}
|
71
|
-
},
|
96
|
+
Some(sanitizer_value) => sanitizer_value.map(|sanitizer| sanitizer.deref().to_owned()),
|
72
97
|
};
|
73
98
|
|
74
99
|
let handlers = match rb_handlers {
|
@@ -76,9 +101,7 @@ impl SelmaRewriter {
|
|
76
101
|
Some(rb_handlers) => {
|
77
102
|
let mut handlers: Vec<Handler> = vec![];
|
78
103
|
|
79
|
-
for
|
80
|
-
let rb_handler = h.unwrap();
|
81
|
-
|
104
|
+
for rb_handler in rb_handlers.into_iter() {
|
82
105
|
// prevents missing #selector from ruining things
|
83
106
|
if !rb_handler.respond_to("selector", true).unwrap() {
|
84
107
|
let classname = unsafe { rb_handler.classname() };
|
@@ -122,9 +145,88 @@ impl SelmaRewriter {
|
|
122
145
|
));
|
123
146
|
}
|
124
147
|
|
148
|
+
let mut rewriter_options = RewriterOptions::new();
|
149
|
+
|
150
|
+
match rb_options {
|
151
|
+
None => {}
|
152
|
+
Some(options) => {
|
153
|
+
options.foreach(|key: Symbol, value: RHash| {
|
154
|
+
let key = key.to_string();
|
155
|
+
match key.as_str() {
|
156
|
+
"memory" => {
|
157
|
+
let max_allowed_memory_usage = value.get(Symbol::new("max_allowed_memory_usage"));
|
158
|
+
if max_allowed_memory_usage.is_some() {
|
159
|
+
let max_allowed_memory_usage = max_allowed_memory_usage.unwrap();
|
160
|
+
let max_allowed_memory_usage =
|
161
|
+
Integer::from_value(max_allowed_memory_usage);
|
162
|
+
if max_allowed_memory_usage.is_some() {
|
163
|
+
match max_allowed_memory_usage.unwrap().to_u64() {
|
164
|
+
Ok(max_allowed_memory_usage) => {
|
165
|
+
rewriter_options.memory_options.max_allowed_memory_usage =
|
166
|
+
max_allowed_memory_usage as usize;
|
167
|
+
}
|
168
|
+
Err(_e) => {
|
169
|
+
return Err(magnus::Error::new(
|
170
|
+
exception::arg_error(),
|
171
|
+
"max_allowed_memory_usage must be a positive integer",
|
172
|
+
));
|
173
|
+
}
|
174
|
+
}
|
175
|
+
} else {
|
176
|
+
rewriter_options.memory_options.max_allowed_memory_usage = MemorySettings::default().max_allowed_memory_usage;
|
177
|
+
}
|
178
|
+
}
|
179
|
+
|
180
|
+
let preallocated_parsing_buffer_size = value.get(Symbol::new("preallocated_parsing_buffer_size"));
|
181
|
+
if preallocated_parsing_buffer_size.is_some() {
|
182
|
+
let preallocated_parsing_buffer_size = preallocated_parsing_buffer_size.unwrap();
|
183
|
+
let preallocated_parsing_buffer_size =
|
184
|
+
Integer::from_value(preallocated_parsing_buffer_size);
|
185
|
+
if preallocated_parsing_buffer_size.is_some() {
|
186
|
+
match preallocated_parsing_buffer_size.unwrap().to_u64() {
|
187
|
+
Ok(preallocated_parsing_buffer_size) => {
|
188
|
+
rewriter_options.memory_options.preallocated_parsing_buffer_size =
|
189
|
+
preallocated_parsing_buffer_size as usize;
|
190
|
+
}
|
191
|
+
Err(_e) => {
|
192
|
+
return Err(magnus::Error::new(
|
193
|
+
exception::arg_error(),
|
194
|
+
"preallocated_parsing_buffer_size must be a positive integer",
|
195
|
+
));
|
196
|
+
}
|
197
|
+
}
|
198
|
+
} else {
|
199
|
+
rewriter_options.memory_options.preallocated_parsing_buffer_size = MemorySettings::default().preallocated_parsing_buffer_size;
|
200
|
+
}
|
201
|
+
}
|
202
|
+
}
|
203
|
+
_ => {
|
204
|
+
return Err(magnus::Error::new(
|
205
|
+
exception::arg_error(),
|
206
|
+
format!("Unknown option: {key:?}"),
|
207
|
+
));
|
208
|
+
}
|
209
|
+
}
|
210
|
+
Ok(ForEach::Continue)
|
211
|
+
})?;
|
212
|
+
}
|
213
|
+
}
|
214
|
+
|
215
|
+
if rewriter_options
|
216
|
+
.memory_options
|
217
|
+
.preallocated_parsing_buffer_size
|
218
|
+
> rewriter_options.memory_options.max_allowed_memory_usage
|
219
|
+
{
|
220
|
+
return Err(magnus::Error::new(
|
221
|
+
exception::arg_error(),
|
222
|
+
"max_allowed_memory_usage must be greater than preallocated_parsing_buffer_size",
|
223
|
+
));
|
224
|
+
}
|
225
|
+
|
125
226
|
Ok(Self(std::cell::RefCell::new(Rewriter {
|
126
227
|
sanitizer,
|
127
228
|
handlers,
|
229
|
+
options: rewriter_options,
|
128
230
|
// total_elapsed: 0.0,
|
129
231
|
})))
|
130
232
|
}
|
@@ -141,125 +243,126 @@ impl SelmaRewriter {
|
|
141
243
|
let kwargs = scan_args::get_kwargs::<
|
142
244
|
_,
|
143
245
|
(),
|
144
|
-
(
|
246
|
+
(
|
247
|
+
Option<Option<Obj<SelmaSanitizer>>>,
|
248
|
+
Option<RArray>,
|
249
|
+
Option<RHash>,
|
250
|
+
),
|
145
251
|
(),
|
146
|
-
>(args.keywords, &[], &["sanitizer", "handlers"])?;
|
147
|
-
let (rb_sanitizer, rb_handlers) = kwargs.optional;
|
252
|
+
>(args.keywords, &[], &["sanitizer", "handlers", "options"])?;
|
253
|
+
let (rb_sanitizer, rb_handlers, rb_options) = kwargs.optional;
|
148
254
|
|
149
|
-
Ok((rb_sanitizer, rb_handlers))
|
255
|
+
Ok((rb_sanitizer, rb_handlers, rb_options))
|
150
256
|
}
|
151
257
|
|
152
258
|
/// Perform HTML rewrite sequence.
|
153
259
|
fn rewrite(&self, html: String) -> Result<String, magnus::Error> {
|
154
|
-
let
|
155
|
-
None => Ok(html),
|
156
|
-
Some(sanitizer) => {
|
157
|
-
let sanitized_html = match Self::perform_sanitization(sanitizer, &html) {
|
158
|
-
Ok(sanitized_html) => sanitized_html,
|
159
|
-
Err(err) => return Err(err),
|
160
|
-
};
|
260
|
+
let binding = self.0.borrow();
|
161
261
|
|
162
|
-
|
262
|
+
let mut sanitizer_document_content_handlers: Vec<DocumentContentHandlers> = vec![];
|
263
|
+
let mut sanitizer_element_content_handlers: Vec<(Cow<Selector>, ElementContentHandlers)> =
|
264
|
+
vec![];
|
265
|
+
|
266
|
+
match &binding.sanitizer {
|
267
|
+
None => (),
|
268
|
+
Some(sanitizer) => {
|
269
|
+
if !sanitizer.get_allow_doctype() {
|
270
|
+
sanitizer_document_content_handlers.push(doctype!(|d| {
|
271
|
+
sanitizer.remove_doctype(d);
|
272
|
+
Ok(())
|
273
|
+
}));
|
274
|
+
}
|
275
|
+
if !sanitizer.get_allow_comments() {
|
276
|
+
sanitizer_document_content_handlers.push(doc_comments!(|c| {
|
277
|
+
sanitizer.remove_comment(c);
|
278
|
+
Ok(())
|
279
|
+
}));
|
280
|
+
}
|
281
|
+
sanitizer_element_content_handlers.push(element!("*", |el| {
|
282
|
+
sanitizer.try_remove_element(el);
|
283
|
+
if el.removed() {
|
284
|
+
return Ok(());
|
285
|
+
}
|
286
|
+
// if it was removed, there are no attributes to sanitize
|
287
|
+
match sanitizer.sanitize_attributes(el) {
|
288
|
+
Ok(_) => Ok(()),
|
289
|
+
Err(err) => Err(err.to_string().into()),
|
290
|
+
}
|
291
|
+
}));
|
163
292
|
}
|
164
293
|
};
|
165
|
-
let binding = self.0.borrow_mut();
|
166
|
-
let handlers = &binding.handlers;
|
167
294
|
|
168
|
-
|
169
|
-
|
295
|
+
let handlers: &Vec<Handler> = &binding.handlers;
|
296
|
+
|
297
|
+
match Self::perform_handler_rewrite(
|
298
|
+
self,
|
299
|
+
sanitizer_document_content_handlers,
|
300
|
+
sanitizer_element_content_handlers,
|
301
|
+
handlers,
|
302
|
+
html,
|
303
|
+
) {
|
304
|
+
Ok(rewritten_html) => match &binding.sanitizer {
|
305
|
+
None => match String::from_utf8(rewritten_html) {
|
306
|
+
Ok(output) => Ok(output),
|
307
|
+
Err(err) => Err(magnus::Error::new(
|
308
|
+
exception::runtime_error(),
|
309
|
+
format!("{err:?}"),
|
310
|
+
)),
|
311
|
+
},
|
312
|
+
Some(sanitizer) => {
|
313
|
+
Self::perform_final_sanitization(self, sanitizer, rewritten_html)
|
314
|
+
}
|
315
|
+
},
|
170
316
|
Err(err) => Err(err),
|
171
317
|
}
|
172
318
|
}
|
173
319
|
|
174
|
-
|
320
|
+
// to get rid of some really nasty edge cases with dangerous tags, we perform one more
|
321
|
+
// sanitization pass at the end
|
322
|
+
fn perform_final_sanitization(
|
323
|
+
&self,
|
175
324
|
sanitizer: &SelmaSanitizer,
|
176
|
-
html:
|
177
|
-
) -> Result<
|
178
|
-
|
179
|
-
|
180
|
-
let mut document_content_handlers: Vec<DocumentContentHandlers> = vec![];
|
181
|
-
if !sanitizer.get_allow_doctype() {
|
182
|
-
document_content_handlers.push(doctype!(|d| {
|
183
|
-
sanitizer.remove_doctype(d);
|
184
|
-
Ok(())
|
185
|
-
}));
|
186
|
-
}
|
187
|
-
if !sanitizer.get_allow_comments() {
|
188
|
-
document_content_handlers.push(doc_comments!(|c| {
|
189
|
-
sanitizer.remove_comment(c);
|
190
|
-
Ok(())
|
191
|
-
}));
|
192
|
-
}
|
193
|
-
let mut rewriter = HtmlRewriter::new(
|
194
|
-
Settings {
|
195
|
-
document_content_handlers,
|
196
|
-
element_content_handlers: vec![element!("*", |el| {
|
197
|
-
sanitizer.try_remove_element(el);
|
198
|
-
if el.removed() {
|
199
|
-
return Ok(());
|
200
|
-
}
|
201
|
-
match sanitizer.sanitize_attributes(el) {
|
202
|
-
Ok(_) => Ok(()),
|
203
|
-
Err(err) => Err(err.to_string().into()),
|
204
|
-
}
|
205
|
-
})],
|
206
|
-
// TODO: allow for MemorySettings to be defined
|
207
|
-
..Settings::default()
|
208
|
-
},
|
209
|
-
|c: &[u8]| first_pass_html.extend_from_slice(c),
|
210
|
-
);
|
211
|
-
|
212
|
-
let result = rewriter.write(html.as_bytes());
|
213
|
-
if result.is_err() {
|
214
|
-
return Err(magnus::Error::new(
|
215
|
-
exception::runtime_error(),
|
216
|
-
format!("Failed to sanitize HTML: {}", result.unwrap_err()),
|
217
|
-
));
|
218
|
-
}
|
219
|
-
}
|
220
|
-
|
221
|
-
let mut output = vec![];
|
222
|
-
{
|
223
|
-
let mut element_content_handlers: Vec<(Cow<Selector>, ElementContentHandlers)> = vec![];
|
224
|
-
if sanitizer.get_escape_tagfilter() {
|
225
|
-
element_content_handlers.push(element!(Tag::ESCAPEWORTHY_TAGS_CSS, |el| {
|
226
|
-
let should_remove = sanitizer.allow_element(el);
|
227
|
-
if should_remove {
|
228
|
-
sanitizer.force_remove_element(el);
|
229
|
-
}
|
325
|
+
html: Vec<u8>,
|
326
|
+
) -> Result<String, magnus::Error> {
|
327
|
+
// TODO: this should ideally be done ahead of time on `initialize`, not on every `#rewrite` call
|
328
|
+
let mut element_content_handlers: Vec<(Cow<Selector>, ElementContentHandlers)> = vec![];
|
230
329
|
|
231
|
-
|
232
|
-
|
233
|
-
|
330
|
+
if sanitizer.get_escape_tagfilter() {
|
331
|
+
element_content_handlers.push(element!(Tag::ESCAPEWORTHY_TAGS_CSS, |el| {
|
332
|
+
let should_remove = sanitizer.allow_element(el);
|
333
|
+
if should_remove {
|
334
|
+
sanitizer.force_remove_element(el);
|
335
|
+
}
|
234
336
|
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
..Settings::default()
|
239
|
-
},
|
240
|
-
|c: &[u8]| output.extend_from_slice(c),
|
241
|
-
);
|
337
|
+
Ok(())
|
338
|
+
}));
|
339
|
+
}
|
242
340
|
|
243
|
-
|
244
|
-
|
245
|
-
|
341
|
+
match Self::run_rewrite(self, vec![], element_content_handlers, html.as_slice()) {
|
342
|
+
Ok(rewritten_html) => match String::from_utf8(rewritten_html) {
|
343
|
+
Ok(output) => Ok(output),
|
344
|
+
Err(err) => Err(magnus::Error::new(
|
246
345
|
exception::runtime_error(),
|
247
|
-
format!("
|
248
|
-
))
|
249
|
-
}
|
346
|
+
format!("{err:?}"),
|
347
|
+
)),
|
348
|
+
},
|
349
|
+
Err(err) => Err(err),
|
250
350
|
}
|
251
|
-
|
252
|
-
Ok(output)
|
253
351
|
}
|
254
352
|
|
255
|
-
pub fn perform_handler_rewrite(
|
353
|
+
pub fn perform_handler_rewrite<'a>(
|
256
354
|
&self,
|
257
|
-
|
355
|
+
sanitizer_document_content_handlers: Vec<DocumentContentHandlers<'a>>,
|
356
|
+
sanitizer_element_content_handlers: Vec<(Cow<Selector>, ElementContentHandlers<'a>)>,
|
357
|
+
handlers: &'a [Handler],
|
258
358
|
html: String,
|
259
359
|
) -> Result<Vec<u8>, magnus::Error> {
|
260
|
-
// TODO: this should ideally be done ahead of time
|
360
|
+
// TODO: this should ideally be done ahead of time on `initialize`, not on every `#rewrite` call
|
261
361
|
let mut element_content_handlers: Vec<(Cow<Selector>, ElementContentHandlers)> = vec![];
|
262
362
|
|
363
|
+
// have sanitization happen first
|
364
|
+
element_content_handlers.extend(sanitizer_element_content_handlers);
|
365
|
+
|
263
366
|
handlers.iter().for_each(|handler| {
|
264
367
|
let element_stack: Rc<RefCell<Vec<String>>> = Rc::new(RefCell::new(vec![]));
|
265
368
|
|
@@ -275,7 +378,7 @@ impl SelmaRewriter {
|
|
275
378
|
selector.match_element().unwrap(),
|
276
379
|
move |el| {
|
277
380
|
match Self::process_element_handlers(
|
278
|
-
|
381
|
+
handler,
|
279
382
|
el,
|
280
383
|
&closure_element_stack.borrow(),
|
281
384
|
) {
|
@@ -294,21 +397,19 @@ impl SelmaRewriter {
|
|
294
397
|
move |text| {
|
295
398
|
let element_stack = closure_element_stack.as_ref().borrow();
|
296
399
|
if selector.ignore_text_within().is_some() {
|
297
|
-
// check if current tag is a tag we should be ignoring text within
|
298
|
-
|
400
|
+
// check if current tag is a tag we should be ignoring text within;
|
401
|
+
// also checks if tag is within an ancestry of ignored tags
|
299
402
|
if selector
|
300
403
|
.ignore_text_within()
|
301
404
|
.unwrap()
|
302
405
|
.iter()
|
303
|
-
.any(|
|
406
|
+
.any(|t| element_stack.contains(t))
|
304
407
|
{
|
305
408
|
return Ok(());
|
306
409
|
}
|
307
410
|
}
|
308
411
|
|
309
|
-
|
310
|
-
match Self::process_text_handlers(ruby.get_inner(handler.rb_handler), text)
|
311
|
-
{
|
412
|
+
match Self::process_text_handlers(handler, text) {
|
312
413
|
Ok(_) => Ok(()),
|
313
414
|
Err(err) => Err(err.to_string().into()),
|
314
415
|
}
|
@@ -329,28 +430,46 @@ impl SelmaRewriter {
|
|
329
430
|
|
330
431
|
let closure_element_stack = element_stack.clone();
|
331
432
|
|
332
|
-
el.end_tag_handlers()
|
333
|
-
.
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
433
|
+
if let Some(end_tag_handlers) = el.end_tag_handlers() {
|
434
|
+
end_tag_handlers.push(lol_html::EndTagHandler::into(Box::new(
|
435
|
+
move |_end_tag| {
|
436
|
+
closure_element_stack.as_ref().borrow_mut().pop();
|
437
|
+
Ok(())
|
438
|
+
},
|
439
|
+
)));
|
440
|
+
}
|
339
441
|
|
340
442
|
Ok(())
|
341
443
|
}));
|
342
444
|
});
|
343
445
|
|
446
|
+
Self::run_rewrite(
|
447
|
+
self,
|
448
|
+
sanitizer_document_content_handlers,
|
449
|
+
element_content_handlers,
|
450
|
+
html.as_bytes(),
|
451
|
+
)
|
452
|
+
}
|
453
|
+
|
454
|
+
fn run_rewrite<'a>(
|
455
|
+
&self,
|
456
|
+
document_content_handlers: Vec<DocumentContentHandlers<'a>>,
|
457
|
+
element_content_handlers: Vec<(Cow<Selector>, ElementContentHandlers<'a>)>,
|
458
|
+
html: &[u8],
|
459
|
+
) -> Result<Vec<u8>, magnus::Error> {
|
460
|
+
let binding = &self.0.borrow();
|
344
461
|
let mut output = vec![];
|
345
462
|
{
|
346
463
|
let mut rewriter = HtmlRewriter::new(
|
347
464
|
Settings {
|
465
|
+
document_content_handlers,
|
348
466
|
element_content_handlers,
|
467
|
+
memory_settings: Self::get_memory_options(binding),
|
349
468
|
..Settings::default()
|
350
469
|
},
|
351
470
|
|c: &[u8]| output.extend_from_slice(c),
|
352
471
|
);
|
353
|
-
match rewriter.write(html
|
472
|
+
match rewriter.write(html) {
|
354
473
|
Ok(_) => {}
|
355
474
|
Err(err) => {
|
356
475
|
return Err(magnus::Error::new(
|
@@ -364,10 +483,12 @@ impl SelmaRewriter {
|
|
364
483
|
}
|
365
484
|
|
366
485
|
fn process_element_handlers(
|
367
|
-
|
486
|
+
handler: &Handler,
|
368
487
|
element: &mut Element,
|
369
488
|
ancestors: &[String],
|
370
489
|
) -> Result<(), magnus::Error> {
|
490
|
+
let rb_handler = handler.rb_handler.into_value();
|
491
|
+
|
371
492
|
// if `on_end_tag` function is defined, call it
|
372
493
|
if rb_handler.respond_to(Self::SELMA_ON_END_TAG, true).unwrap() {
|
373
494
|
// TODO: error here is an "EndTagError"
|
@@ -375,38 +496,61 @@ impl SelmaRewriter {
|
|
375
496
|
.end_tag_handlers()
|
376
497
|
.unwrap()
|
377
498
|
.push(Box::new(move |end_tag| {
|
378
|
-
let
|
499
|
+
let (ref_wrap, anchor) = NativeRefWrap::wrap(end_tag);
|
500
|
+
|
501
|
+
let rb_end_tag = SelmaHTMLEndTag::new(ref_wrap);
|
379
502
|
|
380
|
-
|
503
|
+
let result =
|
504
|
+
rb_handler.funcall::<_, _, Value>(Self::SELMA_ON_END_TAG, (rb_end_tag,));
|
505
|
+
|
506
|
+
mem::drop(anchor);
|
507
|
+
|
508
|
+
match result {
|
381
509
|
Ok(_) => Ok(()),
|
382
510
|
Err(err) => Err(err.to_string().into()),
|
383
511
|
}
|
384
512
|
}));
|
385
513
|
}
|
386
514
|
|
387
|
-
let
|
388
|
-
let
|
389
|
-
|
390
|
-
|
515
|
+
let (ref_wrap, anchor) = NativeRefWrap::wrap(element);
|
516
|
+
let rb_element = SelmaHTMLElement::new(ref_wrap, ancestors);
|
517
|
+
let result = rb_handler.funcall::<_, _, Value>(Self::SELMA_HANDLE_ELEMENT, (rb_element,));
|
518
|
+
|
519
|
+
mem::drop(anchor);
|
520
|
+
|
521
|
+
match result {
|
391
522
|
Ok(_) => Ok(()),
|
392
|
-
Err(err) => Err(
|
523
|
+
Err(err) => Err(magnus::Error::new(
|
524
|
+
exception::runtime_error(),
|
525
|
+
format!("{err:?}"),
|
526
|
+
)),
|
393
527
|
}
|
394
528
|
}
|
395
529
|
|
396
530
|
fn process_text_handlers(
|
397
|
-
|
531
|
+
handler: &Handler,
|
398
532
|
text_chunk: &mut TextChunk,
|
399
533
|
) -> Result<(), magnus::Error> {
|
534
|
+
let rb_handler = handler.rb_handler.into_value();
|
535
|
+
|
400
536
|
// prevents missing `handle_text_chunk` function
|
401
537
|
let content = text_chunk.as_str();
|
402
538
|
|
403
|
-
//
|
539
|
+
// lol-html sometimes returns blank text if
|
540
|
+
// last_in_text_node() is true
|
404
541
|
if content.is_empty() {
|
405
542
|
return Ok(());
|
406
543
|
}
|
407
544
|
|
408
|
-
let
|
409
|
-
|
545
|
+
let (ref_wrap, anchor) = NativeRefWrap::wrap(text_chunk);
|
546
|
+
|
547
|
+
let rb_text_chunk = SelmaHTMLTextChunk::new(ref_wrap);
|
548
|
+
let result =
|
549
|
+
rb_handler.funcall::<_, _, Value>(Self::SELMA_HANDLE_TEXT_CHUNK, (rb_text_chunk,));
|
550
|
+
|
551
|
+
mem::drop(anchor);
|
552
|
+
|
553
|
+
match result {
|
410
554
|
Ok(_) => Ok(()),
|
411
555
|
Err(err) => Err(magnus::Error::new(
|
412
556
|
exception::runtime_error(),
|
@@ -414,6 +558,22 @@ impl SelmaRewriter {
|
|
414
558
|
)),
|
415
559
|
}
|
416
560
|
}
|
561
|
+
|
562
|
+
fn get_memory_options(binding: &Ref<Rewriter>) -> MemorySettings {
|
563
|
+
let options = &binding.options.memory_options;
|
564
|
+
MemorySettings {
|
565
|
+
max_allowed_memory_usage: options.max_allowed_memory_usage,
|
566
|
+
preallocated_parsing_buffer_size: options.preallocated_parsing_buffer_size,
|
567
|
+
}
|
568
|
+
}
|
569
|
+
}
|
570
|
+
|
571
|
+
impl RewriterOptions {
|
572
|
+
pub fn new() -> Self {
|
573
|
+
Self {
|
574
|
+
memory_options: MemorySettings::default(),
|
575
|
+
}
|
576
|
+
}
|
417
577
|
}
|
418
578
|
|
419
579
|
pub fn init(m_selma: RModule) -> Result<(), magnus::Error> {
|