sanitize 6.1.3 → 7.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/{HISTORY.md → CHANGELOG.md} +32 -14
- data/LICENSE +3 -1
- data/README.md +120 -238
- data/lib/sanitize/config/basic.rb +15 -15
- data/lib/sanitize/config/default.rb +45 -45
- data/lib/sanitize/config/relaxed.rb +136 -32
- data/lib/sanitize/config/restricted.rb +2 -2
- data/lib/sanitize/config.rb +12 -14
- data/lib/sanitize/css.rb +308 -308
- data/lib/sanitize/transformers/clean_cdata.rb +9 -9
- data/lib/sanitize/transformers/clean_comment.rb +9 -9
- data/lib/sanitize/transformers/clean_css.rb +59 -55
- data/lib/sanitize/transformers/clean_doctype.rb +15 -15
- data/lib/sanitize/transformers/clean_element.rb +220 -237
- data/lib/sanitize/version.rb +3 -1
- data/lib/sanitize.rb +38 -38
- data/test/common.rb +4 -3
- data/test/test_clean_comment.rb +26 -25
- data/test/test_clean_css.rb +14 -13
- data/test/test_clean_doctype.rb +21 -20
- data/test/test_clean_element.rb +258 -273
- data/test/test_config.rb +22 -21
- data/test/test_malicious_css.rb +20 -19
- data/test/test_malicious_html.rb +100 -99
- data/test/test_parser.rb +26 -25
- data/test/test_sanitize.rb +70 -69
- data/test/test_sanitize_css.rb +149 -114
- data/test/test_transformers.rb +81 -83
- metadata +14 -43
@@ -1,268 +1,251 @@
|
|
1
|
-
#
|
2
|
-
|
3
|
-
require
|
4
|
-
require
|
5
|
-
|
6
|
-
class Sanitize
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
@attributes
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
if config[:whitespace_elements].is_a?(Set)
|
76
|
-
config[:whitespace_elements].each do |element|
|
77
|
-
@whitespace_elements[element] = {:before => ' ', :after => ' '}
|
78
|
-
end
|
79
|
-
else
|
80
|
-
@whitespace_elements = config[:whitespace_elements]
|
81
|
-
end
|
82
|
-
|
83
|
-
if config[:remove_contents].is_a?(Enumerable)
|
84
|
-
@remove_element_contents.merge(config[:remove_contents].map(&:to_s))
|
85
|
-
else
|
86
|
-
@remove_all_contents = !!config[:remove_contents]
|
87
|
-
end
|
88
|
-
end
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "cgi"
|
4
|
+
require "set"
|
5
|
+
|
6
|
+
class Sanitize
|
7
|
+
module Transformers
|
8
|
+
class CleanElement
|
9
|
+
# Matches a valid HTML5 data attribute name. The unicode ranges included
|
10
|
+
# here are a conservative subset of the full range of characters that are
|
11
|
+
# technically allowed, with the intent of matching the most common
|
12
|
+
# characters used in data attribute names while excluding uncommon or
|
13
|
+
# potentially misleading characters, or characters with the potential to
|
14
|
+
# be normalized into unsafe or confusing forms.
|
15
|
+
#
|
16
|
+
# If you need data attr names with characters that aren't included here
|
17
|
+
# (such as combining marks, full-width characters, or CJK), please
|
18
|
+
# consider creating a custom transformer to validate attributes according
|
19
|
+
# to your needs.
|
20
|
+
#
|
21
|
+
# https://html.spec.whatwg.org/multipage/dom.html#embedding-custom-non-visible-data-with-the-data-*-attributes
|
22
|
+
REGEX_DATA_ATTR = /\Adata-(?!xml)[a-z_][\w.\u00E0-\u00F6\u00F8-\u017F\u01DD-\u02AF-]*\z/u
|
23
|
+
|
24
|
+
# Elements whose content is treated as unescaped text by HTML parsers.
|
25
|
+
UNESCAPED_TEXT_ELEMENTS = Set.new(%w[
|
26
|
+
iframe
|
27
|
+
noembed
|
28
|
+
noframes
|
29
|
+
noscript
|
30
|
+
plaintext
|
31
|
+
script
|
32
|
+
style
|
33
|
+
xmp
|
34
|
+
])
|
35
|
+
|
36
|
+
# Attributes that need additional escaping on `<a>` elements due to unsafe
|
37
|
+
# libxml2 behavior.
|
38
|
+
UNSAFE_LIBXML_ATTRS_A = Set.new(%w[
|
39
|
+
name
|
40
|
+
])
|
41
|
+
|
42
|
+
# Attributes that need additional escaping on all elements due to unsafe
|
43
|
+
# libxml2 behavior.
|
44
|
+
UNSAFE_LIBXML_ATTRS_GLOBAL = Set.new(%w[
|
45
|
+
action
|
46
|
+
href
|
47
|
+
src
|
48
|
+
])
|
49
|
+
|
50
|
+
# Mapping of original characters to escape sequences for characters that
|
51
|
+
# should be escaped in attributes affected by unsafe libxml2 behavior.
|
52
|
+
UNSAFE_LIBXML_ESCAPE_CHARS = {
|
53
|
+
" " => "%20",
|
54
|
+
'"' => "%22"
|
55
|
+
}
|
56
|
+
|
57
|
+
# Regex that matches any single character that needs to be escaped in
|
58
|
+
# attributes affected by unsafe libxml2 behavior.
|
59
|
+
UNSAFE_LIBXML_ESCAPE_REGEX = /[ "]/
|
60
|
+
|
61
|
+
def initialize(config)
|
62
|
+
@add_attributes = config[:add_attributes]
|
63
|
+
@attributes = config[:attributes].dup
|
64
|
+
@elements = config[:elements]
|
65
|
+
@protocols = config[:protocols]
|
66
|
+
@remove_all_contents = false
|
67
|
+
@remove_element_contents = Set.new
|
68
|
+
@whitespace_elements = {}
|
69
|
+
|
70
|
+
@attributes.each do |element_name, attrs|
|
71
|
+
unless element_name == :all
|
72
|
+
@attributes[element_name] = Set.new(attrs).merge(@attributes[:all] || [])
|
73
|
+
end
|
74
|
+
end
|
89
75
|
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
# already been deleted from the document.
|
98
|
-
#
|
99
|
-
# It's important that we not try to reparent the children of a node that has
|
100
|
-
# already been deleted, since that seems to trigger a memory leak in
|
101
|
-
# Nokogiri.
|
102
|
-
unless @elements.include?(name) || node.parent.nil?
|
103
|
-
# Elements like br, div, p, etc. need to be replaced with whitespace in
|
104
|
-
# order to preserve readability.
|
105
|
-
if @whitespace_elements.include?(name)
|
106
|
-
node.add_previous_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:before].to_s, node.document))
|
107
|
-
|
108
|
-
unless node.children.empty?
|
109
|
-
node.add_next_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:after].to_s, node.document))
|
76
|
+
# Backcompat: if :whitespace_elements is a Set, convert it to a hash.
|
77
|
+
if config[:whitespace_elements].is_a?(Set)
|
78
|
+
config[:whitespace_elements].each do |element|
|
79
|
+
@whitespace_elements[element] = {before: " ", after: " "}
|
80
|
+
end
|
81
|
+
else
|
82
|
+
@whitespace_elements = config[:whitespace_elements]
|
110
83
|
end
|
111
|
-
end
|
112
84
|
|
113
|
-
|
114
|
-
|
115
|
-
|
85
|
+
if config[:remove_contents].is_a?(Enumerable)
|
86
|
+
@remove_element_contents.merge(config[:remove_contents].map(&:to_s))
|
87
|
+
else
|
88
|
+
@remove_all_contents = !!config[:remove_contents]
|
116
89
|
end
|
117
90
|
end
|
118
91
|
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
attr_allowlist = @attributes[name] || @attributes[:all]
|
124
|
-
|
125
|
-
if attr_allowlist.nil?
|
126
|
-
# Delete all attributes from elements with no allowlisted attributes.
|
127
|
-
node.attribute_nodes.each {|attr| attr.unlink }
|
128
|
-
else
|
129
|
-
allow_data_attributes = attr_allowlist.include?(:data)
|
92
|
+
def call(env)
|
93
|
+
node = env[:node]
|
94
|
+
return if node.type != Nokogiri::XML::Node::ELEMENT_NODE || env[:is_allowlisted]
|
130
95
|
|
131
|
-
|
132
|
-
node.attribute_nodes.each do |attr|
|
133
|
-
attr_name = attr.name.downcase
|
96
|
+
name = env[:node_name]
|
134
97
|
|
135
|
-
unless
|
136
|
-
|
137
|
-
|
98
|
+
# Delete any element that isn't in the config allowlist, unless the node
|
99
|
+
# has already been deleted from the document.
|
100
|
+
#
|
101
|
+
# It's important that we not try to reparent the children of a node that
|
102
|
+
# has already been deleted, since that seems to trigger a memory leak in
|
103
|
+
# Nokogiri.
|
104
|
+
unless @elements.include?(name) || node.parent.nil?
|
105
|
+
# Elements like br, div, p, etc. need to be replaced with whitespace
|
106
|
+
# in order to preserve readability.
|
107
|
+
if @whitespace_elements.include?(name)
|
108
|
+
node.add_previous_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:before].to_s, node.document))
|
109
|
+
|
110
|
+
unless node.children.empty?
|
111
|
+
node.add_next_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:after].to_s, node.document))
|
112
|
+
end
|
113
|
+
end
|
138
114
|
|
139
|
-
unless
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
next
|
115
|
+
unless node.children.empty?
|
116
|
+
unless @remove_all_contents || @remove_element_contents.include?(name)
|
117
|
+
node.add_previous_sibling(node.children)
|
118
|
+
end
|
144
119
|
end
|
120
|
+
|
121
|
+
node.unlink
|
122
|
+
return
|
145
123
|
end
|
146
124
|
|
147
|
-
|
125
|
+
attr_allowlist = @attributes[name] || @attributes[:all]
|
126
|
+
|
127
|
+
if attr_allowlist.nil?
|
128
|
+
# Delete all attributes from elements with no allowlisted attributes.
|
129
|
+
node.attribute_nodes.each { |attr| attr.unlink }
|
130
|
+
else
|
131
|
+
allow_data_attributes = attr_allowlist.include?(:data)
|
148
132
|
|
149
|
-
|
150
|
-
|
151
|
-
|
133
|
+
# Delete any attribute that isn't allowed on this element.
|
134
|
+
node.attribute_nodes.each do |attr|
|
135
|
+
attr_name = attr.name.downcase
|
152
136
|
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
137
|
+
unless attr_allowlist.include?(attr_name)
|
138
|
+
# The attribute isn't in the allowlist, but may still be allowed
|
139
|
+
# if it's a data attribute.
|
140
|
+
|
141
|
+
unless allow_data_attributes && attr_name.start_with?("data-") && attr_name =~ REGEX_DATA_ATTR
|
142
|
+
# Either the attribute isn't a data attribute or arbitrary data
|
143
|
+
# attributes aren't allowed. Remove the attribute.
|
144
|
+
attr.unlink
|
145
|
+
next
|
146
|
+
end
|
157
147
|
end
|
158
148
|
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
149
|
+
# The attribute is allowed.
|
150
|
+
|
151
|
+
# Remove any attributes that use unacceptable protocols.
|
152
|
+
if @protocols.include?(name) && @protocols[name].include?(attr_name)
|
153
|
+
attr_protocols = @protocols[name][attr_name]
|
154
|
+
|
155
|
+
if attr.value =~ REGEX_PROTOCOL
|
156
|
+
unless attr_protocols.include?($1.downcase)
|
157
|
+
attr.unlink
|
158
|
+
next
|
159
|
+
end
|
160
|
+
|
161
|
+
else
|
162
|
+
unless attr_protocols.include?(:relative)
|
163
|
+
attr.unlink
|
164
|
+
next
|
165
|
+
end
|
166
|
+
end
|
167
|
+
|
168
|
+
# Leading and trailing whitespace around URLs is ignored at parse
|
169
|
+
# time. Stripping it here prevents it from being escaped by the
|
170
|
+
# libxml2 workaround below.
|
171
|
+
attr.value = attr.value.strip
|
163
172
|
end
|
164
|
-
end
|
165
173
|
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
174
|
+
# libxml2 >= 2.9.2 doesn't escape comments within some attributes,
|
175
|
+
# in an attempt to preserve server-side includes. This can result in
|
176
|
+
# XSS since an unescaped double quote can allow an attacker to
|
177
|
+
# inject a non-allowlisted attribute.
|
178
|
+
#
|
179
|
+
# Sanitize works around this by implementing its own escaping for
|
180
|
+
# affected attributes, some of which can exist on any element and
|
181
|
+
# some of which can only exist on `<a>` elements.
|
182
|
+
#
|
183
|
+
# This fix is technically no longer necessary with Nokogumbo >= 2.0
|
184
|
+
# since it no longer uses libxml2's serializer, but it's retained to
|
185
|
+
# avoid breaking use cases where people might be sanitizing
|
186
|
+
# individual Nokogiri nodes and then serializing them manually
|
187
|
+
# without Nokogumbo.
|
188
|
+
#
|
189
|
+
# The relevant libxml2 code is here:
|
190
|
+
# <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
|
191
|
+
if UNSAFE_LIBXML_ATTRS_GLOBAL.include?(attr_name) ||
|
192
|
+
(name == "a" && UNSAFE_LIBXML_ATTRS_A.include?(attr_name))
|
193
|
+
|
194
|
+
attr.value = attr.value.gsub(UNSAFE_LIBXML_ESCAPE_REGEX, UNSAFE_LIBXML_ESCAPE_CHARS)
|
195
|
+
end
|
196
|
+
end
|
170
197
|
end
|
171
198
|
|
172
|
-
#
|
173
|
-
|
174
|
-
|
175
|
-
# non-allowlisted attribute.
|
176
|
-
#
|
177
|
-
# Sanitize works around this by implementing its own escaping for
|
178
|
-
# affected attributes, some of which can exist on any element and some
|
179
|
-
# of which can only exist on `<a>` elements.
|
180
|
-
#
|
181
|
-
# This fix is technically no longer necessary with Nokogumbo >= 2.0
|
182
|
-
# since it no longer uses libxml2's serializer, but it's retained to
|
183
|
-
# avoid breaking use cases where people might be sanitizing individual
|
184
|
-
# Nokogiri nodes and then serializing them manually without Nokogumbo.
|
185
|
-
#
|
186
|
-
# The relevant libxml2 code is here:
|
187
|
-
# <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
|
188
|
-
if UNSAFE_LIBXML_ATTRS_GLOBAL.include?(attr_name) ||
|
189
|
-
(name == 'a' && UNSAFE_LIBXML_ATTRS_A.include?(attr_name))
|
190
|
-
|
191
|
-
attr.value = attr.value.gsub(UNSAFE_LIBXML_ESCAPE_REGEX, UNSAFE_LIBXML_ESCAPE_CHARS)
|
199
|
+
# Add required attributes.
|
200
|
+
if @add_attributes.include?(name)
|
201
|
+
@add_attributes[name].each { |key, val| node[key] = val }
|
192
202
|
end
|
193
|
-
end
|
194
|
-
end
|
195
203
|
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
# can't guarantee that their contents are safe.
|
211
|
-
unless node.namespace.nil?
|
212
|
-
prefix = node.namespace.prefix
|
213
|
-
|
214
|
-
if (prefix == 'math' || prefix == 'svg') && UNESCAPED_TEXT_ELEMENTS.include?(name)
|
215
|
-
node.children.each do |child|
|
216
|
-
if child.type == Nokogiri::XML::Node::TEXT_NODE
|
217
|
-
child.content = CGI.escapeHTML(child.content)
|
204
|
+
# Element-specific special cases.
|
205
|
+
case name
|
206
|
+
|
207
|
+
# If this is an allowlisted iframe that has children, remove all its
|
208
|
+
# children. The HTML standard says iframes shouldn't have content, but
|
209
|
+
# when they do, this content is parsed as text and is serialized
|
210
|
+
# verbatim without being escaped, which is unsafe because legacy
|
211
|
+
# browsers may still render it and execute `<script>` content. So the
|
212
|
+
# safe and correct thing to do is to always remove iframe content.
|
213
|
+
when "iframe"
|
214
|
+
if !node.children.empty?
|
215
|
+
node.children.each do |child|
|
216
|
+
child.unlink
|
217
|
+
end
|
218
218
|
end
|
219
|
-
end
|
220
|
-
end
|
221
|
-
end
|
222
219
|
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
# they do, this content is parsed as text and is serialized verbatim without
|
229
|
-
# being escaped, which is unsafe because legacy browsers may still render it
|
230
|
-
# and execute `<script>` content. So the safe and correct thing to do is to
|
231
|
-
# always remove iframe content.
|
232
|
-
when 'iframe'
|
233
|
-
if !node.children.empty?
|
234
|
-
node.children.each do |child|
|
235
|
-
child.unlink
|
236
|
-
end
|
237
|
-
end
|
220
|
+
# Prevent the use of `<meta>` elements that set a charset other than
|
221
|
+
# UTF-8, since Sanitize's output is always UTF-8.
|
222
|
+
when "meta"
|
223
|
+
if node.has_attribute?("charset") &&
|
224
|
+
node["charset"].downcase != "utf-8"
|
238
225
|
|
239
|
-
|
240
|
-
|
241
|
-
when 'meta'
|
242
|
-
if node.has_attribute?('charset') &&
|
243
|
-
node['charset'].downcase != 'utf-8'
|
226
|
+
node["charset"] = "utf-8"
|
227
|
+
end
|
244
228
|
|
245
|
-
|
246
|
-
|
229
|
+
if node.has_attribute?("http-equiv") &&
|
230
|
+
node.has_attribute?("content") &&
|
231
|
+
node["http-equiv"].downcase == "content-type" &&
|
232
|
+
node["content"].downcase =~ /;\s*charset\s*=\s*(?!utf-8)/
|
247
233
|
|
248
|
-
|
249
|
-
|
250
|
-
node['http-equiv'].downcase == 'content-type' &&
|
251
|
-
node['content'].downcase =~ /;\s*charset\s*=\s*(?!utf-8)/
|
234
|
+
node["content"] = node["content"].gsub(/;\s*charset\s*=.+\z/, ";charset=utf-8")
|
235
|
+
end
|
252
236
|
|
253
|
-
|
237
|
+
# A `<noscript>` element's content is parsed differently in browsers
|
238
|
+
# depending on whether or not scripting is enabled. Since Nokogiri
|
239
|
+
# doesn't support scripting, it always parses `<noscript>` elements as
|
240
|
+
# if scripting is disabled. This results in edge cases where it's not
|
241
|
+
# possible to reliably sanitize the contents of a `<noscript>` element
|
242
|
+
# because Nokogiri can't fully replicate the parsing behavior of a
|
243
|
+
# scripting-enabled browser. The safest thing to do is to simply remove
|
244
|
+
# all `<noscript>` elements.
|
245
|
+
when "noscript"
|
246
|
+
node.unlink
|
247
|
+
end
|
254
248
|
end
|
255
|
-
|
256
|
-
# A `<noscript>` element's content is parsed differently in browsers
|
257
|
-
# depending on whether or not scripting is enabled. Since Nokogiri doesn't
|
258
|
-
# support scripting, it always parses `<noscript>` elements as if scripting
|
259
|
-
# is disabled. This results in edge cases where it's not possible to
|
260
|
-
# reliably sanitize the contents of a `<noscript>` element because Nokogiri
|
261
|
-
# can't fully replicate the parsing behavior of a scripting-enabled browser.
|
262
|
-
# The safest thing to do is to simply remove all `<noscript>` elements.
|
263
|
-
when 'noscript'
|
264
|
-
node.unlink
|
265
249
|
end
|
266
250
|
end
|
267
|
-
|
268
|
-
end; end; end
|
251
|
+
end
|
data/lib/sanitize/version.rb
CHANGED
data/lib/sanitize.rb
CHANGED
@@ -1,20 +1,20 @@
|
|
1
|
-
#
|
2
|
-
|
3
|
-
require
|
4
|
-
require
|
5
|
-
|
6
|
-
require_relative
|
7
|
-
require_relative
|
8
|
-
require_relative
|
9
|
-
require_relative
|
10
|
-
require_relative
|
11
|
-
require_relative
|
12
|
-
require_relative
|
13
|
-
require_relative
|
14
|
-
require_relative
|
15
|
-
require_relative
|
16
|
-
require_relative
|
17
|
-
require_relative
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "nokogiri"
|
4
|
+
require "set"
|
5
|
+
|
6
|
+
require_relative "sanitize/version"
|
7
|
+
require_relative "sanitize/config"
|
8
|
+
require_relative "sanitize/config/default"
|
9
|
+
require_relative "sanitize/config/restricted"
|
10
|
+
require_relative "sanitize/config/basic"
|
11
|
+
require_relative "sanitize/config/relaxed"
|
12
|
+
require_relative "sanitize/css"
|
13
|
+
require_relative "sanitize/transformers/clean_cdata"
|
14
|
+
require_relative "sanitize/transformers/clean_comment"
|
15
|
+
require_relative "sanitize/transformers/clean_css"
|
16
|
+
require_relative "sanitize/transformers/clean_doctype"
|
17
|
+
require_relative "sanitize/transformers/clean_element"
|
18
18
|
|
19
19
|
class Sanitize
|
20
20
|
attr_reader :config
|
@@ -33,12 +33,12 @@ class Sanitize
|
|
33
33
|
# - https://infra.spec.whatwg.org/#noncharacter
|
34
34
|
REGEX_HTML_NON_CHARACTERS = /[\ufdd0-\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}]+/u
|
35
35
|
|
36
|
-
# Matches an attribute value that could be treated by a browser as a URL
|
37
|
-
#
|
38
|
-
#
|
39
|
-
#
|
40
|
-
#
|
41
|
-
REGEX_PROTOCOL = /\A\s*([^\/#]*?)(
|
36
|
+
# Matches an attribute value that could be treated by a browser as a URL with
|
37
|
+
# a protocol prefix, such as "http:" or "javascript:". Any string of zero or
|
38
|
+
# more characters followed by a colon is considered a match, even if the colon
|
39
|
+
# is encoded as an entity and even if it's an incomplete entity (which IE6 and
|
40
|
+
# Opera will still parse).
|
41
|
+
REGEX_PROTOCOL = /\A\s*([^\/#]*?)(?::|&#0*58|&#x0*3a)/i
|
42
42
|
|
43
43
|
# Matches one or more characters that should be stripped from HTML before
|
44
44
|
# parsing. This is a combination of `REGEX_HTML_CONTROL_CHARACTERS` and
|
@@ -99,12 +99,12 @@ class Sanitize
|
|
99
99
|
@transformers << Transformers::CleanElement.new(@config)
|
100
100
|
@transformers << Transformers::CleanComment unless @config[:allow_comments]
|
101
101
|
|
102
|
-
if @config[:elements].include?(
|
102
|
+
if @config[:elements].include?("style")
|
103
103
|
scss = Sanitize::CSS.new(config)
|
104
104
|
@transformers << Transformers::CSS::CleanElement.new(scss)
|
105
105
|
end
|
106
106
|
|
107
|
-
if @config[:attributes].values.any? {|attr| attr.include?(
|
107
|
+
if @config[:attributes].values.any? { |attr| attr.include?("style") }
|
108
108
|
scss ||= Sanitize::CSS.new(config)
|
109
109
|
@transformers << Transformers::CSS::CleanAttribute.new(scss)
|
110
110
|
end
|
@@ -112,7 +112,7 @@ class Sanitize
|
|
112
112
|
@transformers << Transformers::CleanDoctype
|
113
113
|
@transformers << Transformers::CleanCDATA
|
114
114
|
|
115
|
-
@transformer_config = {
|
115
|
+
@transformer_config = {config: @config}
|
116
116
|
end
|
117
117
|
|
118
118
|
# Returns a sanitized copy of the given _html_ document.
|
@@ -121,7 +121,7 @@ class Sanitize
|
|
121
121
|
# error will be raised. If this is undesirable, you should probably use
|
122
122
|
# {#fragment} instead.
|
123
123
|
def document(html)
|
124
|
-
return
|
124
|
+
return "" unless html
|
125
125
|
|
126
126
|
doc = Nokogiri::HTML5.parse(preprocess(html), **@config[:parser_options])
|
127
127
|
node!(doc)
|
@@ -133,7 +133,7 @@ class Sanitize
|
|
133
133
|
|
134
134
|
# Returns a sanitized copy of the given _html_ fragment.
|
135
135
|
def fragment(html)
|
136
|
-
return
|
136
|
+
return "" unless html
|
137
137
|
|
138
138
|
frag = Nokogiri::HTML5.fragment(preprocess(html), **@config[:parser_options])
|
139
139
|
node!(frag)
|
@@ -152,7 +152,7 @@ class Sanitize
|
|
152
152
|
raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)
|
153
153
|
|
154
154
|
if node.is_a?(Nokogiri::XML::Document)
|
155
|
-
unless @config[:elements].include?(
|
155
|
+
unless @config[:elements].include?("html")
|
156
156
|
raise Error, 'When sanitizing a document, "<html>" must be allowlisted.'
|
157
157
|
end
|
158
158
|
end
|
@@ -175,13 +175,13 @@ class Sanitize
|
|
175
175
|
def preprocess(html)
|
176
176
|
html = html.to_s.dup
|
177
177
|
|
178
|
-
unless html.encoding.name ==
|
179
|
-
html.encode!(
|
180
|
-
:
|
181
|
-
:
|
178
|
+
unless html.encoding.name == "UTF-8"
|
179
|
+
html.encode!("UTF-8",
|
180
|
+
invalid: :replace,
|
181
|
+
undef: :replace)
|
182
182
|
end
|
183
183
|
|
184
|
-
html.gsub!(REGEX_UNSUITABLE_CHARS,
|
184
|
+
html.gsub!(REGEX_UNSUITABLE_CHARS, "")
|
185
185
|
html
|
186
186
|
end
|
187
187
|
|
@@ -225,17 +225,17 @@ class Sanitize
|
|
225
225
|
|
226
226
|
child = node.child
|
227
227
|
|
228
|
-
while child
|
228
|
+
while child
|
229
229
|
prev = child.previous_sibling
|
230
230
|
traverse(child, &block)
|
231
231
|
|
232
|
-
if child.parent == node
|
233
|
-
child
|
232
|
+
child = if child.parent == node
|
233
|
+
child.next_sibling
|
234
234
|
else
|
235
235
|
# The child was unlinked or reparented, so traverse the previous node's
|
236
236
|
# next sibling, or the parent's first child if there is no previous
|
237
237
|
# node.
|
238
|
-
|
238
|
+
prev ? prev.next_sibling : node.child
|
239
239
|
end
|
240
240
|
end
|
241
241
|
end
|
data/test/common.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
-
#
|
2
|
-
|
3
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "minitest/autorun"
|
4
|
+
require "sanitize"
|