sanitize 6.1.2 → 7.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,268 +1,251 @@
1
- # encoding: utf-8
2
-
3
- require 'cgi'
4
- require 'set'
5
-
6
- class Sanitize; module Transformers; class CleanElement
7
-
8
- # Matches a valid HTML5 data attribute name. The unicode ranges included here
9
- # are a conservative subset of the full range of characters that are
10
- # technically allowed, with the intent of matching the most common characters
11
- # used in data attribute names while excluding uncommon or potentially
12
- # misleading characters, or characters with the potential to be normalized
13
- # into unsafe or confusing forms.
14
- #
15
- # If you need data attr names with characters that aren't included here (such
16
- # as combining marks, full-width characters, or CJK), please consider creating
17
- # a custom transformer to validate attributes according to your needs.
18
- #
19
- # http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#embedding-custom-non-visible-data-with-the-data-*-attributes
20
- REGEX_DATA_ATTR = /\Adata-(?!xml)[a-z_][\w.\u00E0-\u00F6\u00F8-\u017F\u01DD-\u02AF-]*\z/u
21
-
22
- # Elements whose content is treated as unescaped text by HTML parsers.
23
- UNESCAPED_TEXT_ELEMENTS = Set.new(%w[
24
- iframe
25
- noembed
26
- noframes
27
- noscript
28
- plaintext
29
- script
30
- style
31
- xmp
32
- ])
33
-
34
- # Attributes that need additional escaping on `<a>` elements due to unsafe
35
- # libxml2 behavior.
36
- UNSAFE_LIBXML_ATTRS_A = Set.new(%w[
37
- name
38
- ])
39
-
40
- # Attributes that need additional escaping on all elements due to unsafe
41
- # libxml2 behavior.
42
- UNSAFE_LIBXML_ATTRS_GLOBAL = Set.new(%w[
43
- action
44
- href
45
- src
46
- ])
47
-
48
- # Mapping of original characters to escape sequences for characters that
49
- # should be escaped in attributes affected by unsafe libxml2 behavior.
50
- UNSAFE_LIBXML_ESCAPE_CHARS = {
51
- ' ' => '%20',
52
- '"' => '%22'
53
- }
54
-
55
- # Regex that matches any single character that needs to be escaped in
56
- # attributes affected by unsafe libxml2 behavior.
57
- UNSAFE_LIBXML_ESCAPE_REGEX = /[ "]/
58
-
59
- def initialize(config)
60
- @add_attributes = config[:add_attributes]
61
- @attributes = config[:attributes].dup
62
- @elements = config[:elements]
63
- @protocols = config[:protocols]
64
- @remove_all_contents = false
65
- @remove_element_contents = Set.new
66
- @whitespace_elements = {}
67
-
68
- @attributes.each do |element_name, attrs|
69
- unless element_name == :all
70
- @attributes[element_name] = Set.new(attrs).merge(@attributes[:all] || [])
71
- end
72
- end
73
-
74
- # Backcompat: if :whitespace_elements is a Set, convert it to a hash.
75
- if config[:whitespace_elements].is_a?(Set)
76
- config[:whitespace_elements].each do |element|
77
- @whitespace_elements[element] = {:before => ' ', :after => ' '}
78
- end
79
- else
80
- @whitespace_elements = config[:whitespace_elements]
81
- end
82
-
83
- if config[:remove_contents].is_a?(Enumerable)
84
- @remove_element_contents.merge(config[:remove_contents].map(&:to_s))
85
- else
86
- @remove_all_contents = !!config[:remove_contents]
87
- end
88
- end
1
+ # frozen_string_literal: true
2
+
3
+ require "cgi"
4
+ require "set"
5
+
6
+ class Sanitize
7
+ module Transformers
8
+ class CleanElement
9
+ # Matches a valid HTML5 data attribute name. The unicode ranges included
10
+ # here are a conservative subset of the full range of characters that are
11
+ # technically allowed, with the intent of matching the most common
12
+ # characters used in data attribute names while excluding uncommon or
13
+ # potentially misleading characters, or characters with the potential to
14
+ # be normalized into unsafe or confusing forms.
15
+ #
16
+ # If you need data attr names with characters that aren't included here
17
+ # (such as combining marks, full-width characters, or CJK), please
18
+ # consider creating a custom transformer to validate attributes according
19
+ # to your needs.
20
+ #
21
+ # https://html.spec.whatwg.org/multipage/dom.html#embedding-custom-non-visible-data-with-the-data-*-attributes
22
+ REGEX_DATA_ATTR = /\Adata-(?!xml)[a-z_][\w.\u00E0-\u00F6\u00F8-\u017F\u01DD-\u02AF-]*\z/u
23
+
24
+ # Elements whose content is treated as unescaped text by HTML parsers.
25
+ UNESCAPED_TEXT_ELEMENTS = Set.new(%w[
26
+ iframe
27
+ noembed
28
+ noframes
29
+ noscript
30
+ plaintext
31
+ script
32
+ style
33
+ xmp
34
+ ])
35
+
36
+ # Attributes that need additional escaping on `<a>` elements due to unsafe
37
+ # libxml2 behavior.
38
+ UNSAFE_LIBXML_ATTRS_A = Set.new(%w[
39
+ name
40
+ ])
41
+
42
+ # Attributes that need additional escaping on all elements due to unsafe
43
+ # libxml2 behavior.
44
+ UNSAFE_LIBXML_ATTRS_GLOBAL = Set.new(%w[
45
+ action
46
+ href
47
+ src
48
+ ])
49
+
50
+ # Mapping of original characters to escape sequences for characters that
51
+ # should be escaped in attributes affected by unsafe libxml2 behavior.
52
+ UNSAFE_LIBXML_ESCAPE_CHARS = {
53
+ " " => "%20",
54
+ '"' => "%22"
55
+ }
56
+
57
+ # Regex that matches any single character that needs to be escaped in
58
+ # attributes affected by unsafe libxml2 behavior.
59
+ UNSAFE_LIBXML_ESCAPE_REGEX = /[ "]/
60
+
61
+ def initialize(config)
62
+ @add_attributes = config[:add_attributes]
63
+ @attributes = config[:attributes].dup
64
+ @elements = config[:elements]
65
+ @protocols = config[:protocols]
66
+ @remove_all_contents = false
67
+ @remove_element_contents = Set.new
68
+ @whitespace_elements = {}
69
+
70
+ @attributes.each do |element_name, attrs|
71
+ unless element_name == :all
72
+ @attributes[element_name] = Set.new(attrs).merge(@attributes[:all] || [])
73
+ end
74
+ end
89
75
 
90
- def call(env)
91
- node = env[:node]
92
- return if node.type != Nokogiri::XML::Node::ELEMENT_NODE || env[:is_allowlisted]
93
-
94
- name = env[:node_name]
95
-
96
- # Delete any element that isn't in the config allowlist, unless the node has
97
- # already been deleted from the document.
98
- #
99
- # It's important that we not try to reparent the children of a node that has
100
- # already been deleted, since that seems to trigger a memory leak in
101
- # Nokogiri.
102
- unless @elements.include?(name) || node.parent.nil?
103
- # Elements like br, div, p, etc. need to be replaced with whitespace in
104
- # order to preserve readability.
105
- if @whitespace_elements.include?(name)
106
- node.add_previous_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:before].to_s, node.document))
107
-
108
- unless node.children.empty?
109
- node.add_next_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:after].to_s, node.document))
76
+ # Backcompat: if :whitespace_elements is a Set, convert it to a hash.
77
+ if config[:whitespace_elements].is_a?(Set)
78
+ config[:whitespace_elements].each do |element|
79
+ @whitespace_elements[element] = {before: " ", after: " "}
80
+ end
81
+ else
82
+ @whitespace_elements = config[:whitespace_elements]
110
83
  end
111
- end
112
84
 
113
- unless node.children.empty?
114
- unless @remove_all_contents || @remove_element_contents.include?(name)
115
- node.add_previous_sibling(node.children)
85
+ if config[:remove_contents].is_a?(Enumerable)
86
+ @remove_element_contents.merge(config[:remove_contents].map(&:to_s))
87
+ else
88
+ @remove_all_contents = !!config[:remove_contents]
116
89
  end
117
90
  end
118
91
 
119
- node.unlink
120
- return
121
- end
122
-
123
- attr_allowlist = @attributes[name] || @attributes[:all]
124
-
125
- if attr_allowlist.nil?
126
- # Delete all attributes from elements with no allowlisted attributes.
127
- node.attribute_nodes.each {|attr| attr.unlink }
128
- else
129
- allow_data_attributes = attr_allowlist.include?(:data)
92
+ def call(env)
93
+ node = env[:node]
94
+ return if node.type != Nokogiri::XML::Node::ELEMENT_NODE || env[:is_allowlisted]
130
95
 
131
- # Delete any attribute that isn't allowed on this element.
132
- node.attribute_nodes.each do |attr|
133
- attr_name = attr.name.downcase
96
+ name = env[:node_name]
134
97
 
135
- unless attr_allowlist.include?(attr_name)
136
- # The attribute isn't in the allowlist, but may still be allowed if
137
- # it's a data attribute.
98
+ # Delete any element that isn't in the config allowlist, unless the node
99
+ # has already been deleted from the document.
100
+ #
101
+ # It's important that we not try to reparent the children of a node that
102
+ # has already been deleted, since that seems to trigger a memory leak in
103
+ # Nokogiri.
104
+ unless @elements.include?(name) || node.parent.nil?
105
+ # Elements like br, div, p, etc. need to be replaced with whitespace
106
+ # in order to preserve readability.
107
+ if @whitespace_elements.include?(name)
108
+ node.add_previous_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:before].to_s, node.document))
109
+
110
+ unless node.children.empty?
111
+ node.add_next_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:after].to_s, node.document))
112
+ end
113
+ end
138
114
 
139
- unless allow_data_attributes && attr_name.start_with?('data-') && attr_name =~ REGEX_DATA_ATTR
140
- # Either the attribute isn't a data attribute or arbitrary data
141
- # attributes aren't allowed. Remove the attribute.
142
- attr.unlink
143
- next
115
+ unless node.children.empty?
116
+ unless @remove_all_contents || @remove_element_contents.include?(name)
117
+ node.add_previous_sibling(node.children)
118
+ end
144
119
  end
120
+
121
+ node.unlink
122
+ return
145
123
  end
146
124
 
147
- # The attribute is allowed.
125
+ attr_allowlist = @attributes[name] || @attributes[:all]
126
+
127
+ if attr_allowlist.nil?
128
+ # Delete all attributes from elements with no allowlisted attributes.
129
+ node.attribute_nodes.each { |attr| attr.unlink }
130
+ else
131
+ allow_data_attributes = attr_allowlist.include?(:data)
148
132
 
149
- # Remove any attributes that use unacceptable protocols.
150
- if @protocols.include?(name) && @protocols[name].include?(attr_name)
151
- attr_protocols = @protocols[name][attr_name]
133
+ # Delete any attribute that isn't allowed on this element.
134
+ node.attribute_nodes.each do |attr|
135
+ attr_name = attr.name.downcase
152
136
 
153
- if attr.value =~ REGEX_PROTOCOL
154
- unless attr_protocols.include?($1.downcase)
155
- attr.unlink
156
- next
137
+ unless attr_allowlist.include?(attr_name)
138
+ # The attribute isn't in the allowlist, but may still be allowed
139
+ # if it's a data attribute.
140
+
141
+ unless allow_data_attributes && attr_name.start_with?("data-") && attr_name =~ REGEX_DATA_ATTR
142
+ # Either the attribute isn't a data attribute or arbitrary data
143
+ # attributes aren't allowed. Remove the attribute.
144
+ attr.unlink
145
+ next
146
+ end
157
147
  end
158
148
 
159
- else
160
- unless attr_protocols.include?(:relative)
161
- attr.unlink
162
- next
149
+ # The attribute is allowed.
150
+
151
+ # Remove any attributes that use unacceptable protocols.
152
+ if @protocols.include?(name) && @protocols[name].include?(attr_name)
153
+ attr_protocols = @protocols[name][attr_name]
154
+
155
+ if attr.value =~ REGEX_PROTOCOL
156
+ unless attr_protocols.include?($1.downcase)
157
+ attr.unlink
158
+ next
159
+ end
160
+
161
+ else
162
+ unless attr_protocols.include?(:relative)
163
+ attr.unlink
164
+ next
165
+ end
166
+ end
167
+
168
+ # Leading and trailing whitespace around URLs is ignored at parse
169
+ # time. Stripping it here prevents it from being escaped by the
170
+ # libxml2 workaround below.
171
+ attr.value = attr.value.strip
163
172
  end
164
- end
165
173
 
166
- # Leading and trailing whitespace around URLs is ignored at parse
167
- # time. Stripping it here prevents it from being escaped by the
168
- # libxml2 workaround below.
169
- attr.value = attr.value.strip
174
+ # libxml2 >= 2.9.2 doesn't escape comments within some attributes,
175
+ # in an attempt to preserve server-side includes. This can result in
176
+ # XSS since an unescaped double quote can allow an attacker to
177
+ # inject a non-allowlisted attribute.
178
+ #
179
+ # Sanitize works around this by implementing its own escaping for
180
+ # affected attributes, some of which can exist on any element and
181
+ # some of which can only exist on `<a>` elements.
182
+ #
183
+ # This fix is technically no longer necessary with Nokogumbo >= 2.0
184
+ # since it no longer uses libxml2's serializer, but it's retained to
185
+ # avoid breaking use cases where people might be sanitizing
186
+ # individual Nokogiri nodes and then serializing them manually
187
+ # without Nokogumbo.
188
+ #
189
+ # The relevant libxml2 code is here:
190
+ # <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
191
+ if UNSAFE_LIBXML_ATTRS_GLOBAL.include?(attr_name) ||
192
+ (name == "a" && UNSAFE_LIBXML_ATTRS_A.include?(attr_name))
193
+
194
+ attr.value = attr.value.gsub(UNSAFE_LIBXML_ESCAPE_REGEX, UNSAFE_LIBXML_ESCAPE_CHARS)
195
+ end
196
+ end
170
197
  end
171
198
 
172
- # libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
173
- # attempt to preserve server-side includes. This can result in XSS since
174
- # an unescaped double quote can allow an attacker to inject a
175
- # non-allowlisted attribute.
176
- #
177
- # Sanitize works around this by implementing its own escaping for
178
- # affected attributes, some of which can exist on any element and some
179
- # of which can only exist on `<a>` elements.
180
- #
181
- # This fix is technically no longer necessary with Nokogumbo >= 2.0
182
- # since it no longer uses libxml2's serializer, but it's retained to
183
- # avoid breaking use cases where people might be sanitizing individual
184
- # Nokogiri nodes and then serializing them manually without Nokogumbo.
185
- #
186
- # The relevant libxml2 code is here:
187
- # <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
188
- if UNSAFE_LIBXML_ATTRS_GLOBAL.include?(attr_name) ||
189
- (name == 'a' && UNSAFE_LIBXML_ATTRS_A.include?(attr_name))
190
-
191
- attr.value = attr.value.gsub(UNSAFE_LIBXML_ESCAPE_REGEX, UNSAFE_LIBXML_ESCAPE_CHARS)
199
+ # Add required attributes.
200
+ if @add_attributes.include?(name)
201
+ @add_attributes[name].each { |key, val| node[key] = val }
192
202
  end
193
- end
194
- end
195
203
 
196
- # Add required attributes.
197
- if @add_attributes.include?(name)
198
- @add_attributes[name].each {|key, val| node[key] = val }
199
- end
200
-
201
- # Make a best effort to ensure that text nodes in invalid "unescaped text"
202
- # elements that are inside a math or svg namespace are properly escaped so
203
- # that they don't get parsed as HTML.
204
- #
205
- # Sanitize is explicitly documented as not supporting MathML or SVG, but
206
- # people sometimes allow `<math>` and `<svg>` elements in their custom
207
- # configs without realizing that it's not safe. This workaround makes it
208
- # slightly less unsafe, but you still shouldn't allow `<math>` or `<svg>`
209
- # because Nokogiri doesn't parse them the same way browsers do and Sanitize
210
- # can't guarantee that their contents are safe.
211
- unless node.namespace.nil?
212
- prefix = node.namespace.prefix
213
-
214
- if (prefix == 'math' || prefix == 'svg') && UNESCAPED_TEXT_ELEMENTS.include?(name)
215
- node.children.each do |child|
216
- if child.type == Nokogiri::XML::Node::TEXT_NODE
217
- child.content = CGI.escapeHTML(child.content)
204
+ # Element-specific special cases.
205
+ case name
206
+
207
+ # If this is an allowlisted iframe that has children, remove all its
208
+ # children. The HTML standard says iframes shouldn't have content, but
209
+ # when they do, this content is parsed as text and is serialized
210
+ # verbatim without being escaped, which is unsafe because legacy
211
+ # browsers may still render it and execute `<script>` content. So the
212
+ # safe and correct thing to do is to always remove iframe content.
213
+ when "iframe"
214
+ if !node.children.empty?
215
+ node.children.each do |child|
216
+ child.unlink
217
+ end
218
218
  end
219
- end
220
- end
221
- end
222
219
 
223
- # Element-specific special cases.
224
- case name
225
-
226
- # If this is an allowlisted iframe that has children, remove all its
227
- # children. The HTML standard says iframes shouldn't have content, but when
228
- # they do, this content is parsed as text and is serialized verbatim without
229
- # being escaped, which is unsafe because legacy browsers may still render it
230
- # and execute `<script>` content. So the safe and correct thing to do is to
231
- # always remove iframe content.
232
- when 'iframe'
233
- if !node.children.empty?
234
- node.children.each do |child|
235
- child.unlink
236
- end
237
- end
220
+ # Prevent the use of `<meta>` elements that set a charset other than
221
+ # UTF-8, since Sanitize's output is always UTF-8.
222
+ when "meta"
223
+ if node.has_attribute?("charset") &&
224
+ node["charset"].downcase != "utf-8"
238
225
 
239
- # Prevent the use of `<meta>` elements that set a charset other than UTF-8,
240
- # since Sanitize's output is always UTF-8.
241
- when 'meta'
242
- if node.has_attribute?('charset') &&
243
- node['charset'].downcase != 'utf-8'
226
+ node["charset"] = "utf-8"
227
+ end
244
228
 
245
- node['charset'] = 'utf-8'
246
- end
229
+ if node.has_attribute?("http-equiv") &&
230
+ node.has_attribute?("content") &&
231
+ node["http-equiv"].downcase == "content-type" &&
232
+ node["content"].downcase =~ /;\s*charset\s*=\s*(?!utf-8)/
247
233
 
248
- if node.has_attribute?('http-equiv') &&
249
- node.has_attribute?('content') &&
250
- node['http-equiv'].downcase == 'content-type' &&
251
- node['content'].downcase =~ /;\s*charset\s*=\s*(?!utf-8)/
234
+ node["content"] = node["content"].gsub(/;\s*charset\s*=.+\z/, ";charset=utf-8")
235
+ end
252
236
 
253
- node['content'] = node['content'].gsub(/;\s*charset\s*=.+\z/, ';charset=utf-8')
237
+ # A `<noscript>` element's content is parsed differently in browsers
238
+ # depending on whether or not scripting is enabled. Since Nokogiri
239
+ # doesn't support scripting, it always parses `<noscript>` elements as
240
+ # if scripting is disabled. This results in edge cases where it's not
241
+ # possible to reliably sanitize the contents of a `<noscript>` element
242
+ # because Nokogiri can't fully replicate the parsing behavior of a
243
+ # scripting-enabled browser. The safest thing to do is to simply remove
244
+ # all `<noscript>` elements.
245
+ when "noscript"
246
+ node.unlink
247
+ end
254
248
  end
255
-
256
- # A `<noscript>` element's content is parsed differently in browsers
257
- # depending on whether or not scripting is enabled. Since Nokogiri doesn't
258
- # support scripting, it always parses `<noscript>` elements as if scripting
259
- # is disabled. This results in edge cases where it's not possible to
260
- # reliably sanitize the contents of a `<noscript>` element because Nokogiri
261
- # can't fully replicate the parsing behavior of a scripting-enabled browser.
262
- # The safest thing to do is to simply remove all `<noscript>` elements.
263
- when 'noscript'
264
- node.unlink
265
249
  end
266
250
  end
267
-
268
- end; end; end
251
+ end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class Sanitize
2
- VERSION = '6.1.2'
4
+ VERSION = "7.0.0"
3
5
  end
data/lib/sanitize.rb CHANGED
@@ -1,20 +1,20 @@
1
- # encoding: utf-8
2
-
3
- require 'nokogiri'
4
- require 'set'
5
-
6
- require_relative 'sanitize/version'
7
- require_relative 'sanitize/config'
8
- require_relative 'sanitize/config/default'
9
- require_relative 'sanitize/config/restricted'
10
- require_relative 'sanitize/config/basic'
11
- require_relative 'sanitize/config/relaxed'
12
- require_relative 'sanitize/css'
13
- require_relative 'sanitize/transformers/clean_cdata'
14
- require_relative 'sanitize/transformers/clean_comment'
15
- require_relative 'sanitize/transformers/clean_css'
16
- require_relative 'sanitize/transformers/clean_doctype'
17
- require_relative 'sanitize/transformers/clean_element'
1
+ # frozen_string_literal: true
2
+
3
+ require "nokogiri"
4
+ require "set"
5
+
6
+ require_relative "sanitize/version"
7
+ require_relative "sanitize/config"
8
+ require_relative "sanitize/config/default"
9
+ require_relative "sanitize/config/restricted"
10
+ require_relative "sanitize/config/basic"
11
+ require_relative "sanitize/config/relaxed"
12
+ require_relative "sanitize/css"
13
+ require_relative "sanitize/transformers/clean_cdata"
14
+ require_relative "sanitize/transformers/clean_comment"
15
+ require_relative "sanitize/transformers/clean_css"
16
+ require_relative "sanitize/transformers/clean_doctype"
17
+ require_relative "sanitize/transformers/clean_element"
18
18
 
19
19
  class Sanitize
20
20
  attr_reader :config
@@ -33,12 +33,12 @@ class Sanitize
33
33
  # - https://infra.spec.whatwg.org/#noncharacter
34
34
  REGEX_HTML_NON_CHARACTERS = /[\ufdd0-\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}]+/u
35
35
 
36
- # Matches an attribute value that could be treated by a browser as a URL
37
- # with a protocol prefix, such as "http:" or "javascript:". Any string of zero
38
- # or more characters followed by a colon is considered a match, even if the
39
- # colon is encoded as an entity and even if it's an incomplete entity (which
40
- # IE6 and Opera will still parse).
41
- REGEX_PROTOCOL = /\A\s*([^\/#]*?)(?:\:|&#0*58|&#x0*3a)/i
36
+ # Matches an attribute value that could be treated by a browser as a URL with
37
+ # a protocol prefix, such as "http:" or "javascript:". Any string of zero or
38
+ # more characters followed by a colon is considered a match, even if the colon
39
+ # is encoded as an entity and even if it's an incomplete entity (which IE6 and
40
+ # Opera will still parse).
41
+ REGEX_PROTOCOL = /\A\s*([^\/#]*?)(?::|&#0*58|&#x0*3a)/i
42
42
 
43
43
  # Matches one or more characters that should be stripped from HTML before
44
44
  # parsing. This is a combination of `REGEX_HTML_CONTROL_CHARACTERS` and
@@ -99,12 +99,12 @@ class Sanitize
99
99
  @transformers << Transformers::CleanElement.new(@config)
100
100
  @transformers << Transformers::CleanComment unless @config[:allow_comments]
101
101
 
102
- if @config[:elements].include?('style')
102
+ if @config[:elements].include?("style")
103
103
  scss = Sanitize::CSS.new(config)
104
104
  @transformers << Transformers::CSS::CleanElement.new(scss)
105
105
  end
106
106
 
107
- if @config[:attributes].values.any? {|attr| attr.include?('style') }
107
+ if @config[:attributes].values.any? { |attr| attr.include?("style") }
108
108
  scss ||= Sanitize::CSS.new(config)
109
109
  @transformers << Transformers::CSS::CleanAttribute.new(scss)
110
110
  end
@@ -112,7 +112,7 @@ class Sanitize
112
112
  @transformers << Transformers::CleanDoctype
113
113
  @transformers << Transformers::CleanCDATA
114
114
 
115
- @transformer_config = { config: @config }
115
+ @transformer_config = {config: @config}
116
116
  end
117
117
 
118
118
  # Returns a sanitized copy of the given _html_ document.
@@ -121,7 +121,7 @@ class Sanitize
121
121
  # error will be raised. If this is undesirable, you should probably use
122
122
  # {#fragment} instead.
123
123
  def document(html)
124
- return '' unless html
124
+ return "" unless html
125
125
 
126
126
  doc = Nokogiri::HTML5.parse(preprocess(html), **@config[:parser_options])
127
127
  node!(doc)
@@ -133,7 +133,7 @@ class Sanitize
133
133
 
134
134
  # Returns a sanitized copy of the given _html_ fragment.
135
135
  def fragment(html)
136
- return '' unless html
136
+ return "" unless html
137
137
 
138
138
  frag = Nokogiri::HTML5.fragment(preprocess(html), **@config[:parser_options])
139
139
  node!(frag)
@@ -152,7 +152,7 @@ class Sanitize
152
152
  raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)
153
153
 
154
154
  if node.is_a?(Nokogiri::XML::Document)
155
- unless @config[:elements].include?('html')
155
+ unless @config[:elements].include?("html")
156
156
  raise Error, 'When sanitizing a document, "<html>" must be allowlisted.'
157
157
  end
158
158
  end
@@ -175,13 +175,13 @@ class Sanitize
175
175
  def preprocess(html)
176
176
  html = html.to_s.dup
177
177
 
178
- unless html.encoding.name == 'UTF-8'
179
- html.encode!('UTF-8',
180
- :invalid => :replace,
181
- :undef => :replace)
178
+ unless html.encoding.name == "UTF-8"
179
+ html.encode!("UTF-8",
180
+ invalid: :replace,
181
+ undef: :replace)
182
182
  end
183
183
 
184
- html.gsub!(REGEX_UNSUITABLE_CHARS, '')
184
+ html.gsub!(REGEX_UNSUITABLE_CHARS, "")
185
185
  html
186
186
  end
187
187
 
@@ -225,17 +225,17 @@ class Sanitize
225
225
 
226
226
  child = node.child
227
227
 
228
- while child do
228
+ while child
229
229
  prev = child.previous_sibling
230
230
  traverse(child, &block)
231
231
 
232
- if child.parent == node
233
- child = child.next_sibling
232
+ child = if child.parent == node
233
+ child.next_sibling
234
234
  else
235
235
  # The child was unlinked or reparented, so traverse the previous node's
236
236
  # next sibling, or the parent's first child if there is no previous
237
237
  # node.
238
- child = prev ? prev.next_sibling : node.child
238
+ prev ? prev.next_sibling : node.child
239
239
  end
240
240
  end
241
241
  end
data/test/common.rb CHANGED
@@ -1,3 +1,4 @@
1
- # encoding: utf-8
2
- require 'minitest/autorun'
3
- require_relative '../lib/sanitize'
1
+ # frozen_string_literal: true
2
+
3
+ require "minitest/autorun"
4
+ require "sanitize"