selma 0.2.2 → 0.4.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +129 -124
- data/README.md +133 -25
- data/ext/selma/Cargo.toml +6 -3
- data/ext/selma/src/html/element.rs +32 -27
- data/ext/selma/src/html/end_tag.rs +5 -5
- data/ext/selma/src/html/text_chunk.rs +55 -12
- data/ext/selma/src/native_ref_wrap.rs +30 -33
- data/ext/selma/src/rewriter.rs +299 -139
- data/ext/selma/src/sanitizer.rs +256 -138
- data/lib/selma/config.rb +12 -0
- data/lib/selma/html/element.rb +11 -0
- data/lib/selma/html.rb +2 -0
- data/lib/selma/sanitizer/config/default.rb +1 -1
- data/lib/selma/sanitizer/config/relaxed.rb +1 -0
- data/lib/selma/sanitizer/config.rb +2 -2
- data/lib/selma/sanitizer.rb +0 -77
- data/lib/selma/version.rb +1 -1
- metadata +9 -7
data/ext/selma/src/rewriter.rs
CHANGED
@@ -1,19 +1,31 @@
|
|
1
1
|
use lol_html::{
|
2
2
|
doc_comments, doctype, element,
|
3
3
|
html_content::{Element, TextChunk},
|
4
|
-
text, DocumentContentHandlers, ElementContentHandlers, HtmlRewriter,
|
4
|
+
text, DocumentContentHandlers, ElementContentHandlers, HtmlRewriter, MemorySettings, Selector,
|
5
|
+
Settings,
|
5
6
|
};
|
6
7
|
use magnus::{
|
7
|
-
exception, function,
|
8
|
+
exception, function, gc, method,
|
9
|
+
r_hash::ForEach,
|
10
|
+
scan_args,
|
8
11
|
typed_data::Obj,
|
9
12
|
value::{Opaque, ReprValue},
|
10
|
-
Module, Object, RArray, RModule, Ruby,
|
13
|
+
DataTypeFunctions, Integer, IntoValue, Module, Object, RArray, RHash, RModule, Ruby, Symbol,
|
14
|
+
TypedData, Value,
|
11
15
|
};
|
12
16
|
|
13
|
-
use std::{
|
17
|
+
use std::{
|
18
|
+
borrow::Cow,
|
19
|
+
cell::{Ref, RefCell},
|
20
|
+
mem,
|
21
|
+
ops::Deref,
|
22
|
+
primitive::str,
|
23
|
+
rc::Rc,
|
24
|
+
};
|
14
25
|
|
15
26
|
use crate::{
|
16
27
|
html::{element::SelmaHTMLElement, end_tag::SelmaHTMLEndTag, text_chunk::SelmaHTMLTextChunk},
|
28
|
+
native_ref_wrap::NativeRefWrap,
|
17
29
|
sanitizer::SelmaSanitizer,
|
18
30
|
selector::SelmaSelector,
|
19
31
|
tags::Tag,
|
@@ -30,16 +42,34 @@ pub struct Handler {
|
|
30
42
|
// total_elapsed_text_handlers: f64,
|
31
43
|
}
|
32
44
|
|
45
|
+
struct RewriterOptions {
|
46
|
+
memory_options: MemorySettings,
|
47
|
+
}
|
48
|
+
|
33
49
|
pub struct Rewriter {
|
34
50
|
sanitizer: Option<SelmaSanitizer>,
|
35
51
|
handlers: Vec<Handler>,
|
52
|
+
options: RewriterOptions,
|
36
53
|
// total_elapsed: f64,
|
37
54
|
}
|
38
55
|
|
39
|
-
#[
|
56
|
+
#[derive(TypedData)]
|
57
|
+
#[magnus(class = "Selma::Rewriter", free_immediately, mark)]
|
40
58
|
pub struct SelmaRewriter(std::cell::RefCell<Rewriter>);
|
41
59
|
|
42
|
-
|
60
|
+
impl DataTypeFunctions for SelmaRewriter {
|
61
|
+
fn mark(&self, marker: &gc::Marker) {
|
62
|
+
self.0.borrow().handlers.iter().for_each(|handler| {
|
63
|
+
marker.mark(handler.rb_handler);
|
64
|
+
});
|
65
|
+
}
|
66
|
+
}
|
67
|
+
|
68
|
+
type RewriterValues = (
|
69
|
+
Option<Option<Obj<SelmaSanitizer>>>,
|
70
|
+
Option<RArray>,
|
71
|
+
Option<RHash>,
|
72
|
+
);
|
43
73
|
|
44
74
|
impl SelmaRewriter {
|
45
75
|
const SELMA_ON_END_TAG: &'static str = "on_end_tag";
|
@@ -50,25 +80,20 @@ impl SelmaRewriter {
|
|
50
80
|
/// @def new(sanitizer: Selma::Sanitizer.new(Selma::Sanitizer::Config::DEFAULT), handlers: [])
|
51
81
|
/// @param sanitizer [Selma::Sanitizer] The sanitizer which performs the initial cleanup
|
52
82
|
/// @param handlers [Array<Selma::Selector>] The handlers to use to perform HTML rewriting
|
83
|
+
/// @param options [Hash] Any additional options to pass to the rewriter
|
53
84
|
/// @return [Selma::Rewriter]
|
54
85
|
fn new(args: &[Value]) -> Result<Self, magnus::Error> {
|
55
|
-
let (rb_sanitizer, rb_handlers) = Self::scan_parse_args(args)?;
|
86
|
+
let (rb_sanitizer, rb_handlers, rb_options) = Self::scan_parse_args(args)?;
|
56
87
|
|
57
88
|
let sanitizer = match rb_sanitizer {
|
58
89
|
None => {
|
59
|
-
// no `sanitizer:` provided, use default
|
90
|
+
// no `sanitizer:` kwarg provided, use default
|
60
91
|
let default_sanitizer = SelmaSanitizer::new(&[])?;
|
61
92
|
let wrapped_sanitizer = Obj::wrap(default_sanitizer);
|
62
|
-
wrapped_sanitizer.funcall::<&str, (), Value>("setup", ())?;
|
63
|
-
Some(wrapped_sanitizer.
|
93
|
+
// wrapped_sanitizer.funcall::<&str, (), Value>("setup", ())?;
|
94
|
+
Some(wrapped_sanitizer.deref().to_owned())
|
64
95
|
}
|
65
|
-
Some(sanitizer_value) =>
|
66
|
-
None => None, // no `sanitizer:` provided, use default
|
67
|
-
Some(sanitizer) => {
|
68
|
-
sanitizer.funcall::<&str, (), Value>("setup", ())?;
|
69
|
-
Some(sanitizer.get().to_owned())
|
70
|
-
}
|
71
|
-
},
|
96
|
+
Some(sanitizer_value) => sanitizer_value.map(|sanitizer| sanitizer.deref().to_owned()),
|
72
97
|
};
|
73
98
|
|
74
99
|
let handlers = match rb_handlers {
|
@@ -76,9 +101,7 @@ impl SelmaRewriter {
|
|
76
101
|
Some(rb_handlers) => {
|
77
102
|
let mut handlers: Vec<Handler> = vec![];
|
78
103
|
|
79
|
-
for
|
80
|
-
let rb_handler = h.unwrap();
|
81
|
-
|
104
|
+
for rb_handler in rb_handlers.into_iter() {
|
82
105
|
// prevents missing #selector from ruining things
|
83
106
|
if !rb_handler.respond_to("selector", true).unwrap() {
|
84
107
|
let classname = unsafe { rb_handler.classname() };
|
@@ -122,9 +145,88 @@ impl SelmaRewriter {
|
|
122
145
|
));
|
123
146
|
}
|
124
147
|
|
148
|
+
let mut rewriter_options = RewriterOptions::new();
|
149
|
+
|
150
|
+
match rb_options {
|
151
|
+
None => {}
|
152
|
+
Some(options) => {
|
153
|
+
options.foreach(|key: Symbol, value: RHash| {
|
154
|
+
let key = key.to_string();
|
155
|
+
match key.as_str() {
|
156
|
+
"memory" => {
|
157
|
+
let max_allowed_memory_usage = value.get(Symbol::new("max_allowed_memory_usage"));
|
158
|
+
if max_allowed_memory_usage.is_some() {
|
159
|
+
let max_allowed_memory_usage = max_allowed_memory_usage.unwrap();
|
160
|
+
let max_allowed_memory_usage =
|
161
|
+
Integer::from_value(max_allowed_memory_usage);
|
162
|
+
if max_allowed_memory_usage.is_some() {
|
163
|
+
match max_allowed_memory_usage.unwrap().to_u64() {
|
164
|
+
Ok(max_allowed_memory_usage) => {
|
165
|
+
rewriter_options.memory_options.max_allowed_memory_usage =
|
166
|
+
max_allowed_memory_usage as usize;
|
167
|
+
}
|
168
|
+
Err(_e) => {
|
169
|
+
return Err(magnus::Error::new(
|
170
|
+
exception::arg_error(),
|
171
|
+
"max_allowed_memory_usage must be a positive integer",
|
172
|
+
));
|
173
|
+
}
|
174
|
+
}
|
175
|
+
} else {
|
176
|
+
rewriter_options.memory_options.max_allowed_memory_usage = MemorySettings::default().max_allowed_memory_usage;
|
177
|
+
}
|
178
|
+
}
|
179
|
+
|
180
|
+
let preallocated_parsing_buffer_size = value.get(Symbol::new("preallocated_parsing_buffer_size"));
|
181
|
+
if preallocated_parsing_buffer_size.is_some() {
|
182
|
+
let preallocated_parsing_buffer_size = preallocated_parsing_buffer_size.unwrap();
|
183
|
+
let preallocated_parsing_buffer_size =
|
184
|
+
Integer::from_value(preallocated_parsing_buffer_size);
|
185
|
+
if preallocated_parsing_buffer_size.is_some() {
|
186
|
+
match preallocated_parsing_buffer_size.unwrap().to_u64() {
|
187
|
+
Ok(preallocated_parsing_buffer_size) => {
|
188
|
+
rewriter_options.memory_options.preallocated_parsing_buffer_size =
|
189
|
+
preallocated_parsing_buffer_size as usize;
|
190
|
+
}
|
191
|
+
Err(_e) => {
|
192
|
+
return Err(magnus::Error::new(
|
193
|
+
exception::arg_error(),
|
194
|
+
"preallocated_parsing_buffer_size must be a positive integer",
|
195
|
+
));
|
196
|
+
}
|
197
|
+
}
|
198
|
+
} else {
|
199
|
+
rewriter_options.memory_options.preallocated_parsing_buffer_size = MemorySettings::default().preallocated_parsing_buffer_size;
|
200
|
+
}
|
201
|
+
}
|
202
|
+
}
|
203
|
+
_ => {
|
204
|
+
return Err(magnus::Error::new(
|
205
|
+
exception::arg_error(),
|
206
|
+
format!("Unknown option: {key:?}"),
|
207
|
+
));
|
208
|
+
}
|
209
|
+
}
|
210
|
+
Ok(ForEach::Continue)
|
211
|
+
})?;
|
212
|
+
}
|
213
|
+
}
|
214
|
+
|
215
|
+
if rewriter_options
|
216
|
+
.memory_options
|
217
|
+
.preallocated_parsing_buffer_size
|
218
|
+
> rewriter_options.memory_options.max_allowed_memory_usage
|
219
|
+
{
|
220
|
+
return Err(magnus::Error::new(
|
221
|
+
exception::arg_error(),
|
222
|
+
"max_allowed_memory_usage must be greater than preallocated_parsing_buffer_size",
|
223
|
+
));
|
224
|
+
}
|
225
|
+
|
125
226
|
Ok(Self(std::cell::RefCell::new(Rewriter {
|
126
227
|
sanitizer,
|
127
228
|
handlers,
|
229
|
+
options: rewriter_options,
|
128
230
|
// total_elapsed: 0.0,
|
129
231
|
})))
|
130
232
|
}
|
@@ -141,125 +243,126 @@ impl SelmaRewriter {
|
|
141
243
|
let kwargs = scan_args::get_kwargs::<
|
142
244
|
_,
|
143
245
|
(),
|
144
|
-
(
|
246
|
+
(
|
247
|
+
Option<Option<Obj<SelmaSanitizer>>>,
|
248
|
+
Option<RArray>,
|
249
|
+
Option<RHash>,
|
250
|
+
),
|
145
251
|
(),
|
146
|
-
>(args.keywords, &[], &["sanitizer", "handlers"])?;
|
147
|
-
let (rb_sanitizer, rb_handlers) = kwargs.optional;
|
252
|
+
>(args.keywords, &[], &["sanitizer", "handlers", "options"])?;
|
253
|
+
let (rb_sanitizer, rb_handlers, rb_options) = kwargs.optional;
|
148
254
|
|
149
|
-
Ok((rb_sanitizer, rb_handlers))
|
255
|
+
Ok((rb_sanitizer, rb_handlers, rb_options))
|
150
256
|
}
|
151
257
|
|
152
258
|
/// Perform HTML rewrite sequence.
|
153
259
|
fn rewrite(&self, html: String) -> Result<String, magnus::Error> {
|
154
|
-
let
|
155
|
-
None => Ok(html),
|
156
|
-
Some(sanitizer) => {
|
157
|
-
let sanitized_html = match Self::perform_sanitization(sanitizer, &html) {
|
158
|
-
Ok(sanitized_html) => sanitized_html,
|
159
|
-
Err(err) => return Err(err),
|
160
|
-
};
|
260
|
+
let binding = self.0.borrow();
|
161
261
|
|
162
|
-
|
262
|
+
let mut sanitizer_document_content_handlers: Vec<DocumentContentHandlers> = vec![];
|
263
|
+
let mut sanitizer_element_content_handlers: Vec<(Cow<Selector>, ElementContentHandlers)> =
|
264
|
+
vec![];
|
265
|
+
|
266
|
+
match &binding.sanitizer {
|
267
|
+
None => (),
|
268
|
+
Some(sanitizer) => {
|
269
|
+
if !sanitizer.get_allow_doctype() {
|
270
|
+
sanitizer_document_content_handlers.push(doctype!(|d| {
|
271
|
+
sanitizer.remove_doctype(d);
|
272
|
+
Ok(())
|
273
|
+
}));
|
274
|
+
}
|
275
|
+
if !sanitizer.get_allow_comments() {
|
276
|
+
sanitizer_document_content_handlers.push(doc_comments!(|c| {
|
277
|
+
sanitizer.remove_comment(c);
|
278
|
+
Ok(())
|
279
|
+
}));
|
280
|
+
}
|
281
|
+
sanitizer_element_content_handlers.push(element!("*", |el| {
|
282
|
+
sanitizer.try_remove_element(el);
|
283
|
+
if el.removed() {
|
284
|
+
return Ok(());
|
285
|
+
}
|
286
|
+
// if it was removed, there are no attributes to sanitize
|
287
|
+
match sanitizer.sanitize_attributes(el) {
|
288
|
+
Ok(_) => Ok(()),
|
289
|
+
Err(err) => Err(err.to_string().into()),
|
290
|
+
}
|
291
|
+
}));
|
163
292
|
}
|
164
293
|
};
|
165
|
-
let binding = self.0.borrow_mut();
|
166
|
-
let handlers = &binding.handlers;
|
167
294
|
|
168
|
-
|
169
|
-
|
295
|
+
let handlers: &Vec<Handler> = &binding.handlers;
|
296
|
+
|
297
|
+
match Self::perform_handler_rewrite(
|
298
|
+
self,
|
299
|
+
sanitizer_document_content_handlers,
|
300
|
+
sanitizer_element_content_handlers,
|
301
|
+
handlers,
|
302
|
+
html,
|
303
|
+
) {
|
304
|
+
Ok(rewritten_html) => match &binding.sanitizer {
|
305
|
+
None => match String::from_utf8(rewritten_html) {
|
306
|
+
Ok(output) => Ok(output),
|
307
|
+
Err(err) => Err(magnus::Error::new(
|
308
|
+
exception::runtime_error(),
|
309
|
+
format!("{err:?}"),
|
310
|
+
)),
|
311
|
+
},
|
312
|
+
Some(sanitizer) => {
|
313
|
+
Self::perform_final_sanitization(self, sanitizer, rewritten_html)
|
314
|
+
}
|
315
|
+
},
|
170
316
|
Err(err) => Err(err),
|
171
317
|
}
|
172
318
|
}
|
173
319
|
|
174
|
-
|
320
|
+
// to get rid of some really nasty edge cases with dangerous tags, we perform one more
|
321
|
+
// sanitization pass at the end
|
322
|
+
fn perform_final_sanitization(
|
323
|
+
&self,
|
175
324
|
sanitizer: &SelmaSanitizer,
|
176
|
-
html:
|
177
|
-
) -> Result<
|
178
|
-
|
179
|
-
|
180
|
-
let mut document_content_handlers: Vec<DocumentContentHandlers> = vec![];
|
181
|
-
if !sanitizer.get_allow_doctype() {
|
182
|
-
document_content_handlers.push(doctype!(|d| {
|
183
|
-
sanitizer.remove_doctype(d);
|
184
|
-
Ok(())
|
185
|
-
}));
|
186
|
-
}
|
187
|
-
if !sanitizer.get_allow_comments() {
|
188
|
-
document_content_handlers.push(doc_comments!(|c| {
|
189
|
-
sanitizer.remove_comment(c);
|
190
|
-
Ok(())
|
191
|
-
}));
|
192
|
-
}
|
193
|
-
let mut rewriter = HtmlRewriter::new(
|
194
|
-
Settings {
|
195
|
-
document_content_handlers,
|
196
|
-
element_content_handlers: vec![element!("*", |el| {
|
197
|
-
sanitizer.try_remove_element(el);
|
198
|
-
if el.removed() {
|
199
|
-
return Ok(());
|
200
|
-
}
|
201
|
-
match sanitizer.sanitize_attributes(el) {
|
202
|
-
Ok(_) => Ok(()),
|
203
|
-
Err(err) => Err(err.to_string().into()),
|
204
|
-
}
|
205
|
-
})],
|
206
|
-
// TODO: allow for MemorySettings to be defined
|
207
|
-
..Settings::default()
|
208
|
-
},
|
209
|
-
|c: &[u8]| first_pass_html.extend_from_slice(c),
|
210
|
-
);
|
211
|
-
|
212
|
-
let result = rewriter.write(html.as_bytes());
|
213
|
-
if result.is_err() {
|
214
|
-
return Err(magnus::Error::new(
|
215
|
-
exception::runtime_error(),
|
216
|
-
format!("Failed to sanitize HTML: {}", result.unwrap_err()),
|
217
|
-
));
|
218
|
-
}
|
219
|
-
}
|
220
|
-
|
221
|
-
let mut output = vec![];
|
222
|
-
{
|
223
|
-
let mut element_content_handlers: Vec<(Cow<Selector>, ElementContentHandlers)> = vec![];
|
224
|
-
if sanitizer.get_escape_tagfilter() {
|
225
|
-
element_content_handlers.push(element!(Tag::ESCAPEWORTHY_TAGS_CSS, |el| {
|
226
|
-
let should_remove = sanitizer.allow_element(el);
|
227
|
-
if should_remove {
|
228
|
-
sanitizer.force_remove_element(el);
|
229
|
-
}
|
325
|
+
html: Vec<u8>,
|
326
|
+
) -> Result<String, magnus::Error> {
|
327
|
+
// TODO: this should ideally be done ahead of time on `initialize`, not on every `#rewrite` call
|
328
|
+
let mut element_content_handlers: Vec<(Cow<Selector>, ElementContentHandlers)> = vec![];
|
230
329
|
|
231
|
-
|
232
|
-
|
233
|
-
|
330
|
+
if sanitizer.get_escape_tagfilter() {
|
331
|
+
element_content_handlers.push(element!(Tag::ESCAPEWORTHY_TAGS_CSS, |el| {
|
332
|
+
let should_remove = sanitizer.allow_element(el);
|
333
|
+
if should_remove {
|
334
|
+
sanitizer.force_remove_element(el);
|
335
|
+
}
|
234
336
|
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
..Settings::default()
|
239
|
-
},
|
240
|
-
|c: &[u8]| output.extend_from_slice(c),
|
241
|
-
);
|
337
|
+
Ok(())
|
338
|
+
}));
|
339
|
+
}
|
242
340
|
|
243
|
-
|
244
|
-
|
245
|
-
|
341
|
+
match Self::run_rewrite(self, vec![], element_content_handlers, html.as_slice()) {
|
342
|
+
Ok(rewritten_html) => match String::from_utf8(rewritten_html) {
|
343
|
+
Ok(output) => Ok(output),
|
344
|
+
Err(err) => Err(magnus::Error::new(
|
246
345
|
exception::runtime_error(),
|
247
|
-
format!("
|
248
|
-
))
|
249
|
-
}
|
346
|
+
format!("{err:?}"),
|
347
|
+
)),
|
348
|
+
},
|
349
|
+
Err(err) => Err(err),
|
250
350
|
}
|
251
|
-
|
252
|
-
Ok(output)
|
253
351
|
}
|
254
352
|
|
255
|
-
pub fn perform_handler_rewrite(
|
353
|
+
pub fn perform_handler_rewrite<'a>(
|
256
354
|
&self,
|
257
|
-
|
355
|
+
sanitizer_document_content_handlers: Vec<DocumentContentHandlers<'a>>,
|
356
|
+
sanitizer_element_content_handlers: Vec<(Cow<Selector>, ElementContentHandlers<'a>)>,
|
357
|
+
handlers: &'a [Handler],
|
258
358
|
html: String,
|
259
359
|
) -> Result<Vec<u8>, magnus::Error> {
|
260
|
-
// TODO: this should ideally be done ahead of time
|
360
|
+
// TODO: this should ideally be done ahead of time on `initialize`, not on every `#rewrite` call
|
261
361
|
let mut element_content_handlers: Vec<(Cow<Selector>, ElementContentHandlers)> = vec![];
|
262
362
|
|
363
|
+
// have sanitization happen first
|
364
|
+
element_content_handlers.extend(sanitizer_element_content_handlers);
|
365
|
+
|
263
366
|
handlers.iter().for_each(|handler| {
|
264
367
|
let element_stack: Rc<RefCell<Vec<String>>> = Rc::new(RefCell::new(vec![]));
|
265
368
|
|
@@ -275,7 +378,7 @@ impl SelmaRewriter {
|
|
275
378
|
selector.match_element().unwrap(),
|
276
379
|
move |el| {
|
277
380
|
match Self::process_element_handlers(
|
278
|
-
|
381
|
+
handler,
|
279
382
|
el,
|
280
383
|
&closure_element_stack.borrow(),
|
281
384
|
) {
|
@@ -294,21 +397,19 @@ impl SelmaRewriter {
|
|
294
397
|
move |text| {
|
295
398
|
let element_stack = closure_element_stack.as_ref().borrow();
|
296
399
|
if selector.ignore_text_within().is_some() {
|
297
|
-
// check if current tag is a tag we should be ignoring text within
|
298
|
-
|
400
|
+
// check if current tag is a tag we should be ignoring text within;
|
401
|
+
// also checks if tag is within an ancestry of ignored tags
|
299
402
|
if selector
|
300
403
|
.ignore_text_within()
|
301
404
|
.unwrap()
|
302
405
|
.iter()
|
303
|
-
.any(|
|
406
|
+
.any(|t| element_stack.contains(t))
|
304
407
|
{
|
305
408
|
return Ok(());
|
306
409
|
}
|
307
410
|
}
|
308
411
|
|
309
|
-
|
310
|
-
match Self::process_text_handlers(ruby.get_inner(handler.rb_handler), text)
|
311
|
-
{
|
412
|
+
match Self::process_text_handlers(handler, text) {
|
312
413
|
Ok(_) => Ok(()),
|
313
414
|
Err(err) => Err(err.to_string().into()),
|
314
415
|
}
|
@@ -329,28 +430,46 @@ impl SelmaRewriter {
|
|
329
430
|
|
330
431
|
let closure_element_stack = element_stack.clone();
|
331
432
|
|
332
|
-
el.end_tag_handlers()
|
333
|
-
.
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
433
|
+
if let Some(end_tag_handlers) = el.end_tag_handlers() {
|
434
|
+
end_tag_handlers.push(lol_html::EndTagHandler::into(Box::new(
|
435
|
+
move |_end_tag| {
|
436
|
+
closure_element_stack.as_ref().borrow_mut().pop();
|
437
|
+
Ok(())
|
438
|
+
},
|
439
|
+
)));
|
440
|
+
}
|
339
441
|
|
340
442
|
Ok(())
|
341
443
|
}));
|
342
444
|
});
|
343
445
|
|
446
|
+
Self::run_rewrite(
|
447
|
+
self,
|
448
|
+
sanitizer_document_content_handlers,
|
449
|
+
element_content_handlers,
|
450
|
+
html.as_bytes(),
|
451
|
+
)
|
452
|
+
}
|
453
|
+
|
454
|
+
fn run_rewrite<'a>(
|
455
|
+
&self,
|
456
|
+
document_content_handlers: Vec<DocumentContentHandlers<'a>>,
|
457
|
+
element_content_handlers: Vec<(Cow<Selector>, ElementContentHandlers<'a>)>,
|
458
|
+
html: &[u8],
|
459
|
+
) -> Result<Vec<u8>, magnus::Error> {
|
460
|
+
let binding = &self.0.borrow();
|
344
461
|
let mut output = vec![];
|
345
462
|
{
|
346
463
|
let mut rewriter = HtmlRewriter::new(
|
347
464
|
Settings {
|
465
|
+
document_content_handlers,
|
348
466
|
element_content_handlers,
|
467
|
+
memory_settings: Self::get_memory_options(binding),
|
349
468
|
..Settings::default()
|
350
469
|
},
|
351
470
|
|c: &[u8]| output.extend_from_slice(c),
|
352
471
|
);
|
353
|
-
match rewriter.write(html
|
472
|
+
match rewriter.write(html) {
|
354
473
|
Ok(_) => {}
|
355
474
|
Err(err) => {
|
356
475
|
return Err(magnus::Error::new(
|
@@ -364,10 +483,12 @@ impl SelmaRewriter {
|
|
364
483
|
}
|
365
484
|
|
366
485
|
fn process_element_handlers(
|
367
|
-
|
486
|
+
handler: &Handler,
|
368
487
|
element: &mut Element,
|
369
488
|
ancestors: &[String],
|
370
489
|
) -> Result<(), magnus::Error> {
|
490
|
+
let rb_handler = handler.rb_handler.into_value();
|
491
|
+
|
371
492
|
// if `on_end_tag` function is defined, call it
|
372
493
|
if rb_handler.respond_to(Self::SELMA_ON_END_TAG, true).unwrap() {
|
373
494
|
// TODO: error here is an "EndTagError"
|
@@ -375,38 +496,61 @@ impl SelmaRewriter {
|
|
375
496
|
.end_tag_handlers()
|
376
497
|
.unwrap()
|
377
498
|
.push(Box::new(move |end_tag| {
|
378
|
-
let
|
499
|
+
let (ref_wrap, anchor) = NativeRefWrap::wrap(end_tag);
|
500
|
+
|
501
|
+
let rb_end_tag = SelmaHTMLEndTag::new(ref_wrap);
|
379
502
|
|
380
|
-
|
503
|
+
let result =
|
504
|
+
rb_handler.funcall::<_, _, Value>(Self::SELMA_ON_END_TAG, (rb_end_tag,));
|
505
|
+
|
506
|
+
mem::drop(anchor);
|
507
|
+
|
508
|
+
match result {
|
381
509
|
Ok(_) => Ok(()),
|
382
510
|
Err(err) => Err(err.to_string().into()),
|
383
511
|
}
|
384
512
|
}));
|
385
513
|
}
|
386
514
|
|
387
|
-
let
|
388
|
-
let
|
389
|
-
|
390
|
-
|
515
|
+
let (ref_wrap, anchor) = NativeRefWrap::wrap(element);
|
516
|
+
let rb_element = SelmaHTMLElement::new(ref_wrap, ancestors);
|
517
|
+
let result = rb_handler.funcall::<_, _, Value>(Self::SELMA_HANDLE_ELEMENT, (rb_element,));
|
518
|
+
|
519
|
+
mem::drop(anchor);
|
520
|
+
|
521
|
+
match result {
|
391
522
|
Ok(_) => Ok(()),
|
392
|
-
Err(err) => Err(
|
523
|
+
Err(err) => Err(magnus::Error::new(
|
524
|
+
exception::runtime_error(),
|
525
|
+
format!("{err:?}"),
|
526
|
+
)),
|
393
527
|
}
|
394
528
|
}
|
395
529
|
|
396
530
|
fn process_text_handlers(
|
397
|
-
|
531
|
+
handler: &Handler,
|
398
532
|
text_chunk: &mut TextChunk,
|
399
533
|
) -> Result<(), magnus::Error> {
|
534
|
+
let rb_handler = handler.rb_handler.into_value();
|
535
|
+
|
400
536
|
// prevents missing `handle_text_chunk` function
|
401
537
|
let content = text_chunk.as_str();
|
402
538
|
|
403
|
-
//
|
539
|
+
// lol-html sometimes returns blank text if
|
540
|
+
// last_in_text_node() is true
|
404
541
|
if content.is_empty() {
|
405
542
|
return Ok(());
|
406
543
|
}
|
407
544
|
|
408
|
-
let
|
409
|
-
|
545
|
+
let (ref_wrap, anchor) = NativeRefWrap::wrap(text_chunk);
|
546
|
+
|
547
|
+
let rb_text_chunk = SelmaHTMLTextChunk::new(ref_wrap);
|
548
|
+
let result =
|
549
|
+
rb_handler.funcall::<_, _, Value>(Self::SELMA_HANDLE_TEXT_CHUNK, (rb_text_chunk,));
|
550
|
+
|
551
|
+
mem::drop(anchor);
|
552
|
+
|
553
|
+
match result {
|
410
554
|
Ok(_) => Ok(()),
|
411
555
|
Err(err) => Err(magnus::Error::new(
|
412
556
|
exception::runtime_error(),
|
@@ -414,6 +558,22 @@ impl SelmaRewriter {
|
|
414
558
|
)),
|
415
559
|
}
|
416
560
|
}
|
561
|
+
|
562
|
+
fn get_memory_options(binding: &Ref<Rewriter>) -> MemorySettings {
|
563
|
+
let options = &binding.options.memory_options;
|
564
|
+
MemorySettings {
|
565
|
+
max_allowed_memory_usage: options.max_allowed_memory_usage,
|
566
|
+
preallocated_parsing_buffer_size: options.preallocated_parsing_buffer_size,
|
567
|
+
}
|
568
|
+
}
|
569
|
+
}
|
570
|
+
|
571
|
+
impl RewriterOptions {
|
572
|
+
pub fn new() -> Self {
|
573
|
+
Self {
|
574
|
+
memory_options: MemorySettings::default(),
|
575
|
+
}
|
576
|
+
}
|
417
577
|
}
|
418
578
|
|
419
579
|
pub fn init(m_selma: RModule) -> Result<(), magnus::Error> {
|