sanitize 6.1.2 → 7.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/{HISTORY.md → CHANGELOG.md} +40 -14
- data/LICENSE +3 -1
- data/README.md +120 -238
- data/lib/sanitize/config/basic.rb +15 -15
- data/lib/sanitize/config/default.rb +45 -45
- data/lib/sanitize/config/relaxed.rb +136 -32
- data/lib/sanitize/config/restricted.rb +2 -2
- data/lib/sanitize/config.rb +12 -14
- data/lib/sanitize/css.rb +309 -303
- data/lib/sanitize/transformers/clean_cdata.rb +9 -9
- data/lib/sanitize/transformers/clean_comment.rb +9 -9
- data/lib/sanitize/transformers/clean_css.rb +59 -55
- data/lib/sanitize/transformers/clean_doctype.rb +15 -15
- data/lib/sanitize/transformers/clean_element.rb +220 -237
- data/lib/sanitize/version.rb +3 -1
- data/lib/sanitize.rb +38 -38
- data/test/common.rb +4 -3
- data/test/test_clean_comment.rb +26 -25
- data/test/test_clean_css.rb +14 -13
- data/test/test_clean_doctype.rb +21 -20
- data/test/test_clean_element.rb +258 -273
- data/test/test_config.rb +22 -21
- data/test/test_malicious_css.rb +20 -19
- data/test/test_malicious_html.rb +100 -99
- data/test/test_parser.rb +26 -25
- data/test/test_sanitize.rb +70 -69
- data/test/test_sanitize_css.rb +152 -114
- data/test/test_transformers.rb +81 -83
- metadata +14 -43
@@ -1,268 +1,251 @@
|
|
1
|
-
#
|
2
|
-
|
3
|
-
require
|
4
|
-
require
|
5
|
-
|
6
|
-
class Sanitize
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
@attributes
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
if config[:whitespace_elements].is_a?(Set)
|
76
|
-
config[:whitespace_elements].each do |element|
|
77
|
-
@whitespace_elements[element] = {:before => ' ', :after => ' '}
|
78
|
-
end
|
79
|
-
else
|
80
|
-
@whitespace_elements = config[:whitespace_elements]
|
81
|
-
end
|
82
|
-
|
83
|
-
if config[:remove_contents].is_a?(Enumerable)
|
84
|
-
@remove_element_contents.merge(config[:remove_contents].map(&:to_s))
|
85
|
-
else
|
86
|
-
@remove_all_contents = !!config[:remove_contents]
|
87
|
-
end
|
88
|
-
end
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "cgi"
|
4
|
+
require "set"
|
5
|
+
|
6
|
+
class Sanitize
|
7
|
+
module Transformers
|
8
|
+
class CleanElement
|
9
|
+
# Matches a valid HTML5 data attribute name. The unicode ranges included
|
10
|
+
# here are a conservative subset of the full range of characters that are
|
11
|
+
# technically allowed, with the intent of matching the most common
|
12
|
+
# characters used in data attribute names while excluding uncommon or
|
13
|
+
# potentially misleading characters, or characters with the potential to
|
14
|
+
# be normalized into unsafe or confusing forms.
|
15
|
+
#
|
16
|
+
# If you need data attr names with characters that aren't included here
|
17
|
+
# (such as combining marks, full-width characters, or CJK), please
|
18
|
+
# consider creating a custom transformer to validate attributes according
|
19
|
+
# to your needs.
|
20
|
+
#
|
21
|
+
# https://html.spec.whatwg.org/multipage/dom.html#embedding-custom-non-visible-data-with-the-data-*-attributes
|
22
|
+
REGEX_DATA_ATTR = /\Adata-(?!xml)[a-z_][\w.\u00E0-\u00F6\u00F8-\u017F\u01DD-\u02AF-]*\z/u
|
23
|
+
|
24
|
+
# Elements whose content is treated as unescaped text by HTML parsers.
|
25
|
+
UNESCAPED_TEXT_ELEMENTS = Set.new(%w[
|
26
|
+
iframe
|
27
|
+
noembed
|
28
|
+
noframes
|
29
|
+
noscript
|
30
|
+
plaintext
|
31
|
+
script
|
32
|
+
style
|
33
|
+
xmp
|
34
|
+
])
|
35
|
+
|
36
|
+
# Attributes that need additional escaping on `<a>` elements due to unsafe
|
37
|
+
# libxml2 behavior.
|
38
|
+
UNSAFE_LIBXML_ATTRS_A = Set.new(%w[
|
39
|
+
name
|
40
|
+
])
|
41
|
+
|
42
|
+
# Attributes that need additional escaping on all elements due to unsafe
|
43
|
+
# libxml2 behavior.
|
44
|
+
UNSAFE_LIBXML_ATTRS_GLOBAL = Set.new(%w[
|
45
|
+
action
|
46
|
+
href
|
47
|
+
src
|
48
|
+
])
|
49
|
+
|
50
|
+
# Mapping of original characters to escape sequences for characters that
|
51
|
+
# should be escaped in attributes affected by unsafe libxml2 behavior.
|
52
|
+
UNSAFE_LIBXML_ESCAPE_CHARS = {
|
53
|
+
" " => "%20",
|
54
|
+
'"' => "%22"
|
55
|
+
}
|
56
|
+
|
57
|
+
# Regex that matches any single character that needs to be escaped in
|
58
|
+
# attributes affected by unsafe libxml2 behavior.
|
59
|
+
UNSAFE_LIBXML_ESCAPE_REGEX = /[ "]/
|
60
|
+
|
61
|
+
def initialize(config)
|
62
|
+
@add_attributes = config[:add_attributes]
|
63
|
+
@attributes = config[:attributes].dup
|
64
|
+
@elements = config[:elements]
|
65
|
+
@protocols = config[:protocols]
|
66
|
+
@remove_all_contents = false
|
67
|
+
@remove_element_contents = Set.new
|
68
|
+
@whitespace_elements = {}
|
69
|
+
|
70
|
+
@attributes.each do |element_name, attrs|
|
71
|
+
unless element_name == :all
|
72
|
+
@attributes[element_name] = Set.new(attrs).merge(@attributes[:all] || [])
|
73
|
+
end
|
74
|
+
end
|
89
75
|
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
# already been deleted from the document.
|
98
|
-
#
|
99
|
-
# It's important that we not try to reparent the children of a node that has
|
100
|
-
# already been deleted, since that seems to trigger a memory leak in
|
101
|
-
# Nokogiri.
|
102
|
-
unless @elements.include?(name) || node.parent.nil?
|
103
|
-
# Elements like br, div, p, etc. need to be replaced with whitespace in
|
104
|
-
# order to preserve readability.
|
105
|
-
if @whitespace_elements.include?(name)
|
106
|
-
node.add_previous_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:before].to_s, node.document))
|
107
|
-
|
108
|
-
unless node.children.empty?
|
109
|
-
node.add_next_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:after].to_s, node.document))
|
76
|
+
# Backcompat: if :whitespace_elements is a Set, convert it to a hash.
|
77
|
+
if config[:whitespace_elements].is_a?(Set)
|
78
|
+
config[:whitespace_elements].each do |element|
|
79
|
+
@whitespace_elements[element] = {before: " ", after: " "}
|
80
|
+
end
|
81
|
+
else
|
82
|
+
@whitespace_elements = config[:whitespace_elements]
|
110
83
|
end
|
111
|
-
end
|
112
84
|
|
113
|
-
|
114
|
-
|
115
|
-
|
85
|
+
if config[:remove_contents].is_a?(Enumerable)
|
86
|
+
@remove_element_contents.merge(config[:remove_contents].map(&:to_s))
|
87
|
+
else
|
88
|
+
@remove_all_contents = !!config[:remove_contents]
|
116
89
|
end
|
117
90
|
end
|
118
91
|
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
attr_allowlist = @attributes[name] || @attributes[:all]
|
124
|
-
|
125
|
-
if attr_allowlist.nil?
|
126
|
-
# Delete all attributes from elements with no allowlisted attributes.
|
127
|
-
node.attribute_nodes.each {|attr| attr.unlink }
|
128
|
-
else
|
129
|
-
allow_data_attributes = attr_allowlist.include?(:data)
|
92
|
+
def call(env)
|
93
|
+
node = env[:node]
|
94
|
+
return if node.type != Nokogiri::XML::Node::ELEMENT_NODE || env[:is_allowlisted]
|
130
95
|
|
131
|
-
|
132
|
-
node.attribute_nodes.each do |attr|
|
133
|
-
attr_name = attr.name.downcase
|
96
|
+
name = env[:node_name]
|
134
97
|
|
135
|
-
unless
|
136
|
-
|
137
|
-
|
98
|
+
# Delete any element that isn't in the config allowlist, unless the node
|
99
|
+
# has already been deleted from the document.
|
100
|
+
#
|
101
|
+
# It's important that we not try to reparent the children of a node that
|
102
|
+
# has already been deleted, since that seems to trigger a memory leak in
|
103
|
+
# Nokogiri.
|
104
|
+
unless @elements.include?(name) || node.parent.nil?
|
105
|
+
# Elements like br, div, p, etc. need to be replaced with whitespace
|
106
|
+
# in order to preserve readability.
|
107
|
+
if @whitespace_elements.include?(name)
|
108
|
+
node.add_previous_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:before].to_s, node.document))
|
109
|
+
|
110
|
+
unless node.children.empty?
|
111
|
+
node.add_next_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:after].to_s, node.document))
|
112
|
+
end
|
113
|
+
end
|
138
114
|
|
139
|
-
unless
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
next
|
115
|
+
unless node.children.empty?
|
116
|
+
unless @remove_all_contents || @remove_element_contents.include?(name)
|
117
|
+
node.add_previous_sibling(node.children)
|
118
|
+
end
|
144
119
|
end
|
120
|
+
|
121
|
+
node.unlink
|
122
|
+
return
|
145
123
|
end
|
146
124
|
|
147
|
-
|
125
|
+
attr_allowlist = @attributes[name] || @attributes[:all]
|
126
|
+
|
127
|
+
if attr_allowlist.nil?
|
128
|
+
# Delete all attributes from elements with no allowlisted attributes.
|
129
|
+
node.attribute_nodes.each { |attr| attr.unlink }
|
130
|
+
else
|
131
|
+
allow_data_attributes = attr_allowlist.include?(:data)
|
148
132
|
|
149
|
-
|
150
|
-
|
151
|
-
|
133
|
+
# Delete any attribute that isn't allowed on this element.
|
134
|
+
node.attribute_nodes.each do |attr|
|
135
|
+
attr_name = attr.name.downcase
|
152
136
|
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
137
|
+
unless attr_allowlist.include?(attr_name)
|
138
|
+
# The attribute isn't in the allowlist, but may still be allowed
|
139
|
+
# if it's a data attribute.
|
140
|
+
|
141
|
+
unless allow_data_attributes && attr_name.start_with?("data-") && attr_name =~ REGEX_DATA_ATTR
|
142
|
+
# Either the attribute isn't a data attribute or arbitrary data
|
143
|
+
# attributes aren't allowed. Remove the attribute.
|
144
|
+
attr.unlink
|
145
|
+
next
|
146
|
+
end
|
157
147
|
end
|
158
148
|
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
149
|
+
# The attribute is allowed.
|
150
|
+
|
151
|
+
# Remove any attributes that use unacceptable protocols.
|
152
|
+
if @protocols.include?(name) && @protocols[name].include?(attr_name)
|
153
|
+
attr_protocols = @protocols[name][attr_name]
|
154
|
+
|
155
|
+
if attr.value =~ REGEX_PROTOCOL
|
156
|
+
unless attr_protocols.include?($1.downcase)
|
157
|
+
attr.unlink
|
158
|
+
next
|
159
|
+
end
|
160
|
+
|
161
|
+
else
|
162
|
+
unless attr_protocols.include?(:relative)
|
163
|
+
attr.unlink
|
164
|
+
next
|
165
|
+
end
|
166
|
+
end
|
167
|
+
|
168
|
+
# Leading and trailing whitespace around URLs is ignored at parse
|
169
|
+
# time. Stripping it here prevents it from being escaped by the
|
170
|
+
# libxml2 workaround below.
|
171
|
+
attr.value = attr.value.strip
|
163
172
|
end
|
164
|
-
end
|
165
173
|
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
174
|
+
# libxml2 >= 2.9.2 doesn't escape comments within some attributes,
|
175
|
+
# in an attempt to preserve server-side includes. This can result in
|
176
|
+
# XSS since an unescaped double quote can allow an attacker to
|
177
|
+
# inject a non-allowlisted attribute.
|
178
|
+
#
|
179
|
+
# Sanitize works around this by implementing its own escaping for
|
180
|
+
# affected attributes, some of which can exist on any element and
|
181
|
+
# some of which can only exist on `<a>` elements.
|
182
|
+
#
|
183
|
+
# This fix is technically no longer necessary with Nokogumbo >= 2.0
|
184
|
+
# since it no longer uses libxml2's serializer, but it's retained to
|
185
|
+
# avoid breaking use cases where people might be sanitizing
|
186
|
+
# individual Nokogiri nodes and then serializing them manually
|
187
|
+
# without Nokogumbo.
|
188
|
+
#
|
189
|
+
# The relevant libxml2 code is here:
|
190
|
+
# <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
|
191
|
+
if UNSAFE_LIBXML_ATTRS_GLOBAL.include?(attr_name) ||
|
192
|
+
(name == "a" && UNSAFE_LIBXML_ATTRS_A.include?(attr_name))
|
193
|
+
|
194
|
+
attr.value = attr.value.gsub(UNSAFE_LIBXML_ESCAPE_REGEX, UNSAFE_LIBXML_ESCAPE_CHARS)
|
195
|
+
end
|
196
|
+
end
|
170
197
|
end
|
171
198
|
|
172
|
-
#
|
173
|
-
|
174
|
-
|
175
|
-
# non-allowlisted attribute.
|
176
|
-
#
|
177
|
-
# Sanitize works around this by implementing its own escaping for
|
178
|
-
# affected attributes, some of which can exist on any element and some
|
179
|
-
# of which can only exist on `<a>` elements.
|
180
|
-
#
|
181
|
-
# This fix is technically no longer necessary with Nokogumbo >= 2.0
|
182
|
-
# since it no longer uses libxml2's serializer, but it's retained to
|
183
|
-
# avoid breaking use cases where people might be sanitizing individual
|
184
|
-
# Nokogiri nodes and then serializing them manually without Nokogumbo.
|
185
|
-
#
|
186
|
-
# The relevant libxml2 code is here:
|
187
|
-
# <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
|
188
|
-
if UNSAFE_LIBXML_ATTRS_GLOBAL.include?(attr_name) ||
|
189
|
-
(name == 'a' && UNSAFE_LIBXML_ATTRS_A.include?(attr_name))
|
190
|
-
|
191
|
-
attr.value = attr.value.gsub(UNSAFE_LIBXML_ESCAPE_REGEX, UNSAFE_LIBXML_ESCAPE_CHARS)
|
199
|
+
# Add required attributes.
|
200
|
+
if @add_attributes.include?(name)
|
201
|
+
@add_attributes[name].each { |key, val| node[key] = val }
|
192
202
|
end
|
193
|
-
end
|
194
|
-
end
|
195
203
|
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
# can't guarantee that their contents are safe.
|
211
|
-
unless node.namespace.nil?
|
212
|
-
prefix = node.namespace.prefix
|
213
|
-
|
214
|
-
if (prefix == 'math' || prefix == 'svg') && UNESCAPED_TEXT_ELEMENTS.include?(name)
|
215
|
-
node.children.each do |child|
|
216
|
-
if child.type == Nokogiri::XML::Node::TEXT_NODE
|
217
|
-
child.content = CGI.escapeHTML(child.content)
|
204
|
+
# Element-specific special cases.
|
205
|
+
case name
|
206
|
+
|
207
|
+
# If this is an allowlisted iframe that has children, remove all its
|
208
|
+
# children. The HTML standard says iframes shouldn't have content, but
|
209
|
+
# when they do, this content is parsed as text and is serialized
|
210
|
+
# verbatim without being escaped, which is unsafe because legacy
|
211
|
+
# browsers may still render it and execute `<script>` content. So the
|
212
|
+
# safe and correct thing to do is to always remove iframe content.
|
213
|
+
when "iframe"
|
214
|
+
if !node.children.empty?
|
215
|
+
node.children.each do |child|
|
216
|
+
child.unlink
|
217
|
+
end
|
218
218
|
end
|
219
|
-
end
|
220
|
-
end
|
221
|
-
end
|
222
219
|
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
# they do, this content is parsed as text and is serialized verbatim without
|
229
|
-
# being escaped, which is unsafe because legacy browsers may still render it
|
230
|
-
# and execute `<script>` content. So the safe and correct thing to do is to
|
231
|
-
# always remove iframe content.
|
232
|
-
when 'iframe'
|
233
|
-
if !node.children.empty?
|
234
|
-
node.children.each do |child|
|
235
|
-
child.unlink
|
236
|
-
end
|
237
|
-
end
|
220
|
+
# Prevent the use of `<meta>` elements that set a charset other than
|
221
|
+
# UTF-8, since Sanitize's output is always UTF-8.
|
222
|
+
when "meta"
|
223
|
+
if node.has_attribute?("charset") &&
|
224
|
+
node["charset"].downcase != "utf-8"
|
238
225
|
|
239
|
-
|
240
|
-
|
241
|
-
when 'meta'
|
242
|
-
if node.has_attribute?('charset') &&
|
243
|
-
node['charset'].downcase != 'utf-8'
|
226
|
+
node["charset"] = "utf-8"
|
227
|
+
end
|
244
228
|
|
245
|
-
|
246
|
-
|
229
|
+
if node.has_attribute?("http-equiv") &&
|
230
|
+
node.has_attribute?("content") &&
|
231
|
+
node["http-equiv"].downcase == "content-type" &&
|
232
|
+
node["content"].downcase =~ /;\s*charset\s*=\s*(?!utf-8)/
|
247
233
|
|
248
|
-
|
249
|
-
|
250
|
-
node['http-equiv'].downcase == 'content-type' &&
|
251
|
-
node['content'].downcase =~ /;\s*charset\s*=\s*(?!utf-8)/
|
234
|
+
node["content"] = node["content"].gsub(/;\s*charset\s*=.+\z/, ";charset=utf-8")
|
235
|
+
end
|
252
236
|
|
253
|
-
|
237
|
+
# A `<noscript>` element's content is parsed differently in browsers
|
238
|
+
# depending on whether or not scripting is enabled. Since Nokogiri
|
239
|
+
# doesn't support scripting, it always parses `<noscript>` elements as
|
240
|
+
# if scripting is disabled. This results in edge cases where it's not
|
241
|
+
# possible to reliably sanitize the contents of a `<noscript>` element
|
242
|
+
# because Nokogiri can't fully replicate the parsing behavior of a
|
243
|
+
# scripting-enabled browser. The safest thing to do is to simply remove
|
244
|
+
# all `<noscript>` elements.
|
245
|
+
when "noscript"
|
246
|
+
node.unlink
|
247
|
+
end
|
254
248
|
end
|
255
|
-
|
256
|
-
# A `<noscript>` element's content is parsed differently in browsers
|
257
|
-
# depending on whether or not scripting is enabled. Since Nokogiri doesn't
|
258
|
-
# support scripting, it always parses `<noscript>` elements as if scripting
|
259
|
-
# is disabled. This results in edge cases where it's not possible to
|
260
|
-
# reliably sanitize the contents of a `<noscript>` element because Nokogiri
|
261
|
-
# can't fully replicate the parsing behavior of a scripting-enabled browser.
|
262
|
-
# The safest thing to do is to simply remove all `<noscript>` elements.
|
263
|
-
when 'noscript'
|
264
|
-
node.unlink
|
265
249
|
end
|
266
250
|
end
|
267
|
-
|
268
|
-
end; end; end
|
251
|
+
end
|
data/lib/sanitize/version.rb
CHANGED
data/lib/sanitize.rb
CHANGED
@@ -1,20 +1,20 @@
|
|
1
|
-
#
|
2
|
-
|
3
|
-
require
|
4
|
-
require
|
5
|
-
|
6
|
-
require_relative
|
7
|
-
require_relative
|
8
|
-
require_relative
|
9
|
-
require_relative
|
10
|
-
require_relative
|
11
|
-
require_relative
|
12
|
-
require_relative
|
13
|
-
require_relative
|
14
|
-
require_relative
|
15
|
-
require_relative
|
16
|
-
require_relative
|
17
|
-
require_relative
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "nokogiri"
|
4
|
+
require "set"
|
5
|
+
|
6
|
+
require_relative "sanitize/version"
|
7
|
+
require_relative "sanitize/config"
|
8
|
+
require_relative "sanitize/config/default"
|
9
|
+
require_relative "sanitize/config/restricted"
|
10
|
+
require_relative "sanitize/config/basic"
|
11
|
+
require_relative "sanitize/config/relaxed"
|
12
|
+
require_relative "sanitize/css"
|
13
|
+
require_relative "sanitize/transformers/clean_cdata"
|
14
|
+
require_relative "sanitize/transformers/clean_comment"
|
15
|
+
require_relative "sanitize/transformers/clean_css"
|
16
|
+
require_relative "sanitize/transformers/clean_doctype"
|
17
|
+
require_relative "sanitize/transformers/clean_element"
|
18
18
|
|
19
19
|
class Sanitize
|
20
20
|
attr_reader :config
|
@@ -33,12 +33,12 @@ class Sanitize
|
|
33
33
|
# - https://infra.spec.whatwg.org/#noncharacter
|
34
34
|
REGEX_HTML_NON_CHARACTERS = /[\ufdd0-\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}]+/u
|
35
35
|
|
36
|
-
# Matches an attribute value that could be treated by a browser as a URL
|
37
|
-
#
|
38
|
-
#
|
39
|
-
#
|
40
|
-
#
|
41
|
-
REGEX_PROTOCOL = /\A\s*([^\/#]*?)(
|
36
|
+
# Matches an attribute value that could be treated by a browser as a URL with
|
37
|
+
# a protocol prefix, such as "http:" or "javascript:". Any string of zero or
|
38
|
+
# more characters followed by a colon is considered a match, even if the colon
|
39
|
+
# is encoded as an entity and even if it's an incomplete entity (which IE6 and
|
40
|
+
# Opera will still parse).
|
41
|
+
REGEX_PROTOCOL = /\A\s*([^\/#]*?)(?::|&#0*58|&#x0*3a)/i
|
42
42
|
|
43
43
|
# Matches one or more characters that should be stripped from HTML before
|
44
44
|
# parsing. This is a combination of `REGEX_HTML_CONTROL_CHARACTERS` and
|
@@ -99,12 +99,12 @@ class Sanitize
|
|
99
99
|
@transformers << Transformers::CleanElement.new(@config)
|
100
100
|
@transformers << Transformers::CleanComment unless @config[:allow_comments]
|
101
101
|
|
102
|
-
if @config[:elements].include?(
|
102
|
+
if @config[:elements].include?("style")
|
103
103
|
scss = Sanitize::CSS.new(config)
|
104
104
|
@transformers << Transformers::CSS::CleanElement.new(scss)
|
105
105
|
end
|
106
106
|
|
107
|
-
if @config[:attributes].values.any? {|attr| attr.include?(
|
107
|
+
if @config[:attributes].values.any? { |attr| attr.include?("style") }
|
108
108
|
scss ||= Sanitize::CSS.new(config)
|
109
109
|
@transformers << Transformers::CSS::CleanAttribute.new(scss)
|
110
110
|
end
|
@@ -112,7 +112,7 @@ class Sanitize
|
|
112
112
|
@transformers << Transformers::CleanDoctype
|
113
113
|
@transformers << Transformers::CleanCDATA
|
114
114
|
|
115
|
-
@transformer_config = {
|
115
|
+
@transformer_config = {config: @config}
|
116
116
|
end
|
117
117
|
|
118
118
|
# Returns a sanitized copy of the given _html_ document.
|
@@ -121,7 +121,7 @@ class Sanitize
|
|
121
121
|
# error will be raised. If this is undesirable, you should probably use
|
122
122
|
# {#fragment} instead.
|
123
123
|
def document(html)
|
124
|
-
return
|
124
|
+
return "" unless html
|
125
125
|
|
126
126
|
doc = Nokogiri::HTML5.parse(preprocess(html), **@config[:parser_options])
|
127
127
|
node!(doc)
|
@@ -133,7 +133,7 @@ class Sanitize
|
|
133
133
|
|
134
134
|
# Returns a sanitized copy of the given _html_ fragment.
|
135
135
|
def fragment(html)
|
136
|
-
return
|
136
|
+
return "" unless html
|
137
137
|
|
138
138
|
frag = Nokogiri::HTML5.fragment(preprocess(html), **@config[:parser_options])
|
139
139
|
node!(frag)
|
@@ -152,7 +152,7 @@ class Sanitize
|
|
152
152
|
raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)
|
153
153
|
|
154
154
|
if node.is_a?(Nokogiri::XML::Document)
|
155
|
-
unless @config[:elements].include?(
|
155
|
+
unless @config[:elements].include?("html")
|
156
156
|
raise Error, 'When sanitizing a document, "<html>" must be allowlisted.'
|
157
157
|
end
|
158
158
|
end
|
@@ -175,13 +175,13 @@ class Sanitize
|
|
175
175
|
def preprocess(html)
|
176
176
|
html = html.to_s.dup
|
177
177
|
|
178
|
-
unless html.encoding.name ==
|
179
|
-
html.encode!(
|
180
|
-
:
|
181
|
-
:
|
178
|
+
unless html.encoding.name == "UTF-8"
|
179
|
+
html.encode!("UTF-8",
|
180
|
+
invalid: :replace,
|
181
|
+
undef: :replace)
|
182
182
|
end
|
183
183
|
|
184
|
-
html.gsub!(REGEX_UNSUITABLE_CHARS,
|
184
|
+
html.gsub!(REGEX_UNSUITABLE_CHARS, "")
|
185
185
|
html
|
186
186
|
end
|
187
187
|
|
@@ -225,17 +225,17 @@ class Sanitize
|
|
225
225
|
|
226
226
|
child = node.child
|
227
227
|
|
228
|
-
while child
|
228
|
+
while child
|
229
229
|
prev = child.previous_sibling
|
230
230
|
traverse(child, &block)
|
231
231
|
|
232
|
-
if child.parent == node
|
233
|
-
child
|
232
|
+
child = if child.parent == node
|
233
|
+
child.next_sibling
|
234
234
|
else
|
235
235
|
# The child was unlinked or reparented, so traverse the previous node's
|
236
236
|
# next sibling, or the parent's first child if there is no previous
|
237
237
|
# node.
|
238
|
-
|
238
|
+
prev ? prev.next_sibling : node.child
|
239
239
|
end
|
240
240
|
end
|
241
241
|
end
|
data/test/common.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
-
#
|
2
|
-
|
3
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "minitest/autorun"
|
4
|
+
require "sanitize"
|