sanitize 2.1.1 → 6.0.0
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of sanitize might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/HISTORY.md +520 -55
- data/LICENSE +1 -1
- data/README.md +438 -168
- data/lib/sanitize/config/basic.rb +12 -32
- data/lib/sanitize/config/default.rb +118 -0
- data/lib/sanitize/config/relaxed.rb +716 -53
- data/lib/sanitize/config/restricted.rb +3 -23
- data/lib/sanitize/config.rb +53 -79
- data/lib/sanitize/css.rb +348 -0
- data/lib/sanitize/transformers/clean_cdata.rb +3 -3
- data/lib/sanitize/transformers/clean_comment.rb +6 -3
- data/lib/sanitize/transformers/clean_css.rb +57 -0
- data/lib/sanitize/transformers/clean_doctype.rb +19 -0
- data/lib/sanitize/transformers/clean_element.rb +192 -124
- data/lib/sanitize/version.rb +3 -1
- data/lib/sanitize.rb +172 -143
- data/test/common.rb +3 -0
- data/test/test_clean_comment.rb +47 -0
- data/test/test_clean_css.rb +67 -0
- data/test/test_clean_doctype.rb +71 -0
- data/test/test_clean_element.rb +545 -0
- data/test/test_config.rb +65 -0
- data/test/test_malicious_css.rb +42 -0
- data/test/test_malicious_html.rb +235 -0
- data/test/test_parser.rb +75 -0
- data/test/test_sanitize.rb +151 -675
- data/test/test_sanitize_css.rb +424 -0
- data/test/test_transformers.rb +230 -0
- metadata +44 -41
@@ -1,155 +1,223 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'set'
|
4
|
+
|
5
|
+
class Sanitize; module Transformers; class CleanElement
|
6
|
+
|
7
|
+
# Matches a valid HTML5 data attribute name. The unicode ranges included here
|
8
|
+
# are a conservative subset of the full range of characters that are
|
9
|
+
# technically allowed, with the intent of matching the most common characters
|
10
|
+
# used in data attribute names while excluding uncommon or potentially
|
11
|
+
# misleading characters, or characters with the potential to be normalized
|
12
|
+
# into unsafe or confusing forms.
|
13
|
+
#
|
14
|
+
# If you need data attr names with characters that aren't included here (such
|
15
|
+
# as combining marks, full-width characters, or CJK), please consider creating
|
16
|
+
# a custom transformer to validate attributes according to your needs.
|
17
|
+
#
|
18
|
+
# http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#embedding-custom-non-visible-data-with-the-data-*-attributes
|
19
|
+
REGEX_DATA_ATTR = /\Adata-(?!xml)[a-z_][\w.\u00E0-\u00F6\u00F8-\u017F\u01DD-\u02AF-]*\z/u
|
20
|
+
|
21
|
+
# Attributes that need additional escaping on `<a>` elements due to unsafe
|
22
|
+
# libxml2 behavior.
|
23
|
+
UNSAFE_LIBXML_ATTRS_A = Set.new(%w[
|
24
|
+
name
|
25
|
+
])
|
26
|
+
|
27
|
+
# Attributes that need additional escaping on all elements due to unsafe
|
28
|
+
# libxml2 behavior.
|
29
|
+
UNSAFE_LIBXML_ATTRS_GLOBAL = Set.new(%w[
|
30
|
+
action
|
31
|
+
href
|
32
|
+
src
|
33
|
+
])
|
34
|
+
|
35
|
+
# Mapping of original characters to escape sequences for characters that
|
36
|
+
# should be escaped in attributes affected by unsafe libxml2 behavior.
|
37
|
+
UNSAFE_LIBXML_ESCAPE_CHARS = {
|
38
|
+
' ' => '%20',
|
39
|
+
'"' => '%22'
|
40
|
+
}
|
41
|
+
|
42
|
+
# Regex that matches any single character that needs to be escaped in
|
43
|
+
# attributes affected by unsafe libxml2 behavior.
|
44
|
+
UNSAFE_LIBXML_ESCAPE_REGEX = /[ "]/
|
45
|
+
|
46
|
+
def initialize(config)
|
47
|
+
@add_attributes = config[:add_attributes]
|
48
|
+
@attributes = config[:attributes].dup
|
49
|
+
@elements = config[:elements]
|
50
|
+
@protocols = config[:protocols]
|
51
|
+
@remove_all_contents = false
|
52
|
+
@remove_element_contents = Set.new
|
53
|
+
@whitespace_elements = {}
|
54
|
+
|
55
|
+
@attributes.each do |element_name, attrs|
|
56
|
+
unless element_name == :all
|
57
|
+
@attributes[element_name] = Set.new(attrs).merge(@attributes[:all] || [])
|
46
58
|
end
|
47
59
|
end
|
48
60
|
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
61
|
+
# Backcompat: if :whitespace_elements is a Set, convert it to a hash.
|
62
|
+
if config[:whitespace_elements].is_a?(Set)
|
63
|
+
config[:whitespace_elements].each do |element|
|
64
|
+
@whitespace_elements[element] = {:before => ' ', :after => ' '}
|
65
|
+
end
|
66
|
+
else
|
67
|
+
@whitespace_elements = config[:whitespace_elements]
|
68
|
+
end
|
54
69
|
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
70
|
+
if config[:remove_contents].is_a?(Enumerable)
|
71
|
+
@remove_element_contents.merge(config[:remove_contents].map(&:to_s))
|
72
|
+
else
|
73
|
+
@remove_all_contents = !!config[:remove_contents]
|
74
|
+
end
|
75
|
+
end
|
61
76
|
|
62
|
-
|
63
|
-
|
64
|
-
|
77
|
+
def call(env)
|
78
|
+
node = env[:node]
|
79
|
+
return if node.type != Nokogiri::XML::Node::ELEMENT_NODE || env[:is_allowlisted]
|
80
|
+
|
81
|
+
name = env[:node_name]
|
82
|
+
|
83
|
+
# Delete any element that isn't in the config allowlist, unless the node has
|
84
|
+
# already been deleted from the document.
|
85
|
+
#
|
86
|
+
# It's important that we not try to reparent the children of a node that has
|
87
|
+
# already been deleted, since that seems to trigger a memory leak in
|
88
|
+
# Nokogiri.
|
89
|
+
unless @elements.include?(name) || node.parent.nil?
|
90
|
+
# Elements like br, div, p, etc. need to be replaced with whitespace in
|
91
|
+
# order to preserve readability.
|
92
|
+
if @whitespace_elements.include?(name)
|
93
|
+
node.add_previous_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:before].to_s, node.document))
|
94
|
+
|
95
|
+
unless node.children.empty?
|
96
|
+
node.add_next_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:after].to_s, node.document))
|
65
97
|
end
|
98
|
+
end
|
66
99
|
|
100
|
+
unless node.children.empty?
|
67
101
|
unless @remove_all_contents || @remove_element_contents.include?(name)
|
68
|
-
node.
|
102
|
+
node.add_previous_sibling(node.children)
|
69
103
|
end
|
70
|
-
|
71
|
-
node.unlink
|
72
|
-
return
|
73
104
|
end
|
74
105
|
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
106
|
+
node.unlink
|
107
|
+
return
|
108
|
+
end
|
109
|
+
|
110
|
+
attr_allowlist = @attributes[name] || @attributes[:all]
|
111
|
+
|
112
|
+
if attr_allowlist.nil?
|
113
|
+
# Delete all attributes from elements with no allowlisted attributes.
|
114
|
+
node.attribute_nodes.each {|attr| attr.unlink }
|
115
|
+
else
|
116
|
+
allow_data_attributes = attr_allowlist.include?(:data)
|
117
|
+
|
118
|
+
# Delete any attribute that isn't allowed on this element.
|
119
|
+
node.attribute_nodes.each do |attr|
|
120
|
+
attr_name = attr.name.downcase
|
121
|
+
|
122
|
+
unless attr_allowlist.include?(attr_name)
|
123
|
+
# The attribute isn't in the allowlist, but may still be allowed if
|
124
|
+
# it's a data attribute.
|
125
|
+
|
126
|
+
unless allow_data_attributes && attr_name.start_with?('data-') && attr_name =~ REGEX_DATA_ATTR
|
127
|
+
# Either the attribute isn't a data attribute or arbitrary data
|
128
|
+
# attributes aren't allowed. Remove the attribute.
|
129
|
+
attr.unlink
|
130
|
+
next
|
100
131
|
end
|
101
132
|
end
|
102
133
|
|
103
|
-
#
|
104
|
-
if @protocols.has_key?(name)
|
105
|
-
protocol = @protocols[name]
|
134
|
+
# The attribute is allowed.
|
106
135
|
|
107
|
-
|
108
|
-
|
109
|
-
|
136
|
+
# Remove any attributes that use unacceptable protocols.
|
137
|
+
if @protocols.include?(name) && @protocols[name].include?(attr_name)
|
138
|
+
attr_protocols = @protocols[name][attr_name]
|
110
139
|
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
140
|
+
if attr.value =~ REGEX_PROTOCOL
|
141
|
+
unless attr_protocols.include?($1.downcase)
|
142
|
+
attr.unlink
|
143
|
+
next
|
115
144
|
end
|
116
145
|
|
117
|
-
|
146
|
+
else
|
147
|
+
unless attr_protocols.include?(:relative)
|
118
148
|
attr.unlink
|
119
|
-
|
120
|
-
# Leading and trailing whitespace around URLs is ignored at parse
|
121
|
-
# time. Stripping it here prevents it from being escaped by the
|
122
|
-
# libxml2 workaround below.
|
123
|
-
attr.value = attr.value.strip
|
149
|
+
next
|
124
150
|
end
|
125
151
|
end
|
152
|
+
|
153
|
+
# Leading and trailing whitespace around URLs is ignored at parse
|
154
|
+
# time. Stripping it here prevents it from being escaped by the
|
155
|
+
# libxml2 workaround below.
|
156
|
+
attr.value = attr.value.strip
|
126
157
|
end
|
127
|
-
end
|
128
158
|
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
159
|
+
# libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
|
160
|
+
# attempt to preserve server-side includes. This can result in XSS since
|
161
|
+
# an unescaped double quote can allow an attacker to inject a
|
162
|
+
# non-allowlisted attribute.
|
163
|
+
#
|
164
|
+
# Sanitize works around this by implementing its own escaping for
|
165
|
+
# affected attributes, some of which can exist on any element and some
|
166
|
+
# of which can only exist on `<a>` elements.
|
167
|
+
#
|
168
|
+
# This fix is technically no longer necessary with Nokogumbo >= 2.0
|
169
|
+
# since it no longer uses libxml2's serializer, but it's retained to
|
170
|
+
# avoid breaking use cases where people might be sanitizing individual
|
171
|
+
# Nokogiri nodes and then serializing them manually without Nokogumbo.
|
172
|
+
#
|
173
|
+
# The relevant libxml2 code is here:
|
174
|
+
# <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
|
142
175
|
if UNSAFE_LIBXML_ATTRS_GLOBAL.include?(attr_name) ||
|
143
|
-
|
144
|
-
|
176
|
+
(name == 'a' && UNSAFE_LIBXML_ATTRS_A.include?(attr_name))
|
177
|
+
|
178
|
+
attr.value = attr.value.gsub(UNSAFE_LIBXML_ESCAPE_REGEX, UNSAFE_LIBXML_ESCAPE_CHARS)
|
145
179
|
end
|
146
180
|
end
|
181
|
+
end
|
182
|
+
|
183
|
+
# Add required attributes.
|
184
|
+
if @add_attributes.include?(name)
|
185
|
+
@add_attributes[name].each {|key, val| node[key] = val }
|
186
|
+
end
|
187
|
+
|
188
|
+
# Element-specific special cases.
|
189
|
+
case name
|
190
|
+
|
191
|
+
# If this is an allowlisted iframe that has children, remove all its
|
192
|
+
# children. The HTML standard says iframes shouldn't have content, but when
|
193
|
+
# they do, this content is parsed as text and is serialized verbatim without
|
194
|
+
# being escaped, which is unsafe because legacy browsers may still render it
|
195
|
+
# and execute `<script>` content. So the safe and correct thing to do is to
|
196
|
+
# always remove iframe content.
|
197
|
+
when 'iframe'
|
198
|
+
if !node.children.empty?
|
199
|
+
node.children.each do |child|
|
200
|
+
child.unlink
|
201
|
+
end
|
202
|
+
end
|
203
|
+
|
204
|
+
# Prevent the use of `<meta>` elements that set a charset other than UTF-8,
|
205
|
+
# since Sanitize's output is always UTF-8.
|
206
|
+
when 'meta'
|
207
|
+
if node.has_attribute?('charset') &&
|
208
|
+
node['charset'].downcase != 'utf-8'
|
209
|
+
|
210
|
+
node['charset'] = 'utf-8'
|
211
|
+
end
|
212
|
+
|
213
|
+
if node.has_attribute?('http-equiv') &&
|
214
|
+
node.has_attribute?('content') &&
|
215
|
+
node['http-equiv'].downcase == 'content-type' &&
|
216
|
+
node['content'].downcase =~ /;\s*charset\s*=\s*(?!utf-8)/
|
147
217
|
|
148
|
-
|
149
|
-
if @add_attributes.has_key?(name)
|
150
|
-
@add_attributes[name].each {|key, val| node[key] = val }
|
218
|
+
node['content'] = node['content'].gsub(/;\s*charset\s*=.+\z/, ';charset=utf-8')
|
151
219
|
end
|
152
220
|
end
|
153
221
|
end
|
154
222
|
|
155
|
-
end; end
|
223
|
+
end; end; end
|
data/lib/sanitize/version.rb
CHANGED