sanitize 6.1.2 → 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,268 +1,251 @@
1
- # encoding: utf-8
2
-
3
- require 'cgi'
4
- require 'set'
5
-
6
- class Sanitize; module Transformers; class CleanElement
7
-
8
- # Matches a valid HTML5 data attribute name. The unicode ranges included here
9
- # are a conservative subset of the full range of characters that are
10
- # technically allowed, with the intent of matching the most common characters
11
- # used in data attribute names while excluding uncommon or potentially
12
- # misleading characters, or characters with the potential to be normalized
13
- # into unsafe or confusing forms.
14
- #
15
- # If you need data attr names with characters that aren't included here (such
16
- # as combining marks, full-width characters, or CJK), please consider creating
17
- # a custom transformer to validate attributes according to your needs.
18
- #
19
- # http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#embedding-custom-non-visible-data-with-the-data-*-attributes
20
- REGEX_DATA_ATTR = /\Adata-(?!xml)[a-z_][\w.\u00E0-\u00F6\u00F8-\u017F\u01DD-\u02AF-]*\z/u
21
-
22
- # Elements whose content is treated as unescaped text by HTML parsers.
23
- UNESCAPED_TEXT_ELEMENTS = Set.new(%w[
24
- iframe
25
- noembed
26
- noframes
27
- noscript
28
- plaintext
29
- script
30
- style
31
- xmp
32
- ])
33
-
34
- # Attributes that need additional escaping on `<a>` elements due to unsafe
35
- # libxml2 behavior.
36
- UNSAFE_LIBXML_ATTRS_A = Set.new(%w[
37
- name
38
- ])
39
-
40
- # Attributes that need additional escaping on all elements due to unsafe
41
- # libxml2 behavior.
42
- UNSAFE_LIBXML_ATTRS_GLOBAL = Set.new(%w[
43
- action
44
- href
45
- src
46
- ])
47
-
48
- # Mapping of original characters to escape sequences for characters that
49
- # should be escaped in attributes affected by unsafe libxml2 behavior.
50
- UNSAFE_LIBXML_ESCAPE_CHARS = {
51
- ' ' => '%20',
52
- '"' => '%22'
53
- }
54
-
55
- # Regex that matches any single character that needs to be escaped in
56
- # attributes affected by unsafe libxml2 behavior.
57
- UNSAFE_LIBXML_ESCAPE_REGEX = /[ "]/
58
-
59
- def initialize(config)
60
- @add_attributes = config[:add_attributes]
61
- @attributes = config[:attributes].dup
62
- @elements = config[:elements]
63
- @protocols = config[:protocols]
64
- @remove_all_contents = false
65
- @remove_element_contents = Set.new
66
- @whitespace_elements = {}
67
-
68
- @attributes.each do |element_name, attrs|
69
- unless element_name == :all
70
- @attributes[element_name] = Set.new(attrs).merge(@attributes[:all] || [])
71
- end
72
- end
73
-
74
- # Backcompat: if :whitespace_elements is a Set, convert it to a hash.
75
- if config[:whitespace_elements].is_a?(Set)
76
- config[:whitespace_elements].each do |element|
77
- @whitespace_elements[element] = {:before => ' ', :after => ' '}
78
- end
79
- else
80
- @whitespace_elements = config[:whitespace_elements]
81
- end
82
-
83
- if config[:remove_contents].is_a?(Enumerable)
84
- @remove_element_contents.merge(config[:remove_contents].map(&:to_s))
85
- else
86
- @remove_all_contents = !!config[:remove_contents]
87
- end
88
- end
1
+ # frozen_string_literal: true
2
+
3
+ require "cgi"
4
+ require "set"
5
+
6
+ class Sanitize
7
+ module Transformers
8
+ class CleanElement
9
+ # Matches a valid HTML5 data attribute name. The unicode ranges included
10
+ # here are a conservative subset of the full range of characters that are
11
+ # technically allowed, with the intent of matching the most common
12
+ # characters used in data attribute names while excluding uncommon or
13
+ # potentially misleading characters, or characters with the potential to
14
+ # be normalized into unsafe or confusing forms.
15
+ #
16
+ # If you need data attr names with characters that aren't included here
17
+ # (such as combining marks, full-width characters, or CJK), please
18
+ # consider creating a custom transformer to validate attributes according
19
+ # to your needs.
20
+ #
21
+ # https://html.spec.whatwg.org/multipage/dom.html#embedding-custom-non-visible-data-with-the-data-*-attributes
22
+ REGEX_DATA_ATTR = /\Adata-(?!xml)[a-z_][\w.\u00E0-\u00F6\u00F8-\u017F\u01DD-\u02AF-]*\z/u
23
+
24
+ # Elements whose content is treated as unescaped text by HTML parsers.
25
+ UNESCAPED_TEXT_ELEMENTS = Set.new(%w[
26
+ iframe
27
+ noembed
28
+ noframes
29
+ noscript
30
+ plaintext
31
+ script
32
+ style
33
+ xmp
34
+ ])
35
+
36
+ # Attributes that need additional escaping on `<a>` elements due to unsafe
37
+ # libxml2 behavior.
38
+ UNSAFE_LIBXML_ATTRS_A = Set.new(%w[
39
+ name
40
+ ])
41
+
42
+ # Attributes that need additional escaping on all elements due to unsafe
43
+ # libxml2 behavior.
44
+ UNSAFE_LIBXML_ATTRS_GLOBAL = Set.new(%w[
45
+ action
46
+ href
47
+ src
48
+ ])
49
+
50
+ # Mapping of original characters to escape sequences for characters that
51
+ # should be escaped in attributes affected by unsafe libxml2 behavior.
52
+ UNSAFE_LIBXML_ESCAPE_CHARS = {
53
+ " " => "%20",
54
+ '"' => "%22"
55
+ }
56
+
57
+ # Regex that matches any single character that needs to be escaped in
58
+ # attributes affected by unsafe libxml2 behavior.
59
+ UNSAFE_LIBXML_ESCAPE_REGEX = /[ "]/
60
+
61
+ def initialize(config)
62
+ @add_attributes = config[:add_attributes]
63
+ @attributes = config[:attributes].dup
64
+ @elements = config[:elements]
65
+ @protocols = config[:protocols]
66
+ @remove_all_contents = false
67
+ @remove_element_contents = Set.new
68
+ @whitespace_elements = {}
69
+
70
+ @attributes.each do |element_name, attrs|
71
+ unless element_name == :all
72
+ @attributes[element_name] = Set.new(attrs).merge(@attributes[:all] || [])
73
+ end
74
+ end
89
75
 
90
- def call(env)
91
- node = env[:node]
92
- return if node.type != Nokogiri::XML::Node::ELEMENT_NODE || env[:is_allowlisted]
93
-
94
- name = env[:node_name]
95
-
96
- # Delete any element that isn't in the config allowlist, unless the node has
97
- # already been deleted from the document.
98
- #
99
- # It's important that we not try to reparent the children of a node that has
100
- # already been deleted, since that seems to trigger a memory leak in
101
- # Nokogiri.
102
- unless @elements.include?(name) || node.parent.nil?
103
- # Elements like br, div, p, etc. need to be replaced with whitespace in
104
- # order to preserve readability.
105
- if @whitespace_elements.include?(name)
106
- node.add_previous_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:before].to_s, node.document))
107
-
108
- unless node.children.empty?
109
- node.add_next_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:after].to_s, node.document))
76
+ # Backcompat: if :whitespace_elements is a Set, convert it to a hash.
77
+ if config[:whitespace_elements].is_a?(Set)
78
+ config[:whitespace_elements].each do |element|
79
+ @whitespace_elements[element] = {before: " ", after: " "}
80
+ end
81
+ else
82
+ @whitespace_elements = config[:whitespace_elements]
110
83
  end
111
- end
112
84
 
113
- unless node.children.empty?
114
- unless @remove_all_contents || @remove_element_contents.include?(name)
115
- node.add_previous_sibling(node.children)
85
+ if config[:remove_contents].is_a?(Enumerable)
86
+ @remove_element_contents.merge(config[:remove_contents].map(&:to_s))
87
+ else
88
+ @remove_all_contents = !!config[:remove_contents]
116
89
  end
117
90
  end
118
91
 
119
- node.unlink
120
- return
121
- end
122
-
123
- attr_allowlist = @attributes[name] || @attributes[:all]
124
-
125
- if attr_allowlist.nil?
126
- # Delete all attributes from elements with no allowlisted attributes.
127
- node.attribute_nodes.each {|attr| attr.unlink }
128
- else
129
- allow_data_attributes = attr_allowlist.include?(:data)
92
+ def call(env)
93
+ node = env[:node]
94
+ return if node.type != Nokogiri::XML::Node::ELEMENT_NODE || env[:is_allowlisted]
130
95
 
131
- # Delete any attribute that isn't allowed on this element.
132
- node.attribute_nodes.each do |attr|
133
- attr_name = attr.name.downcase
96
+ name = env[:node_name]
134
97
 
135
- unless attr_allowlist.include?(attr_name)
136
- # The attribute isn't in the allowlist, but may still be allowed if
137
- # it's a data attribute.
98
+ # Delete any element that isn't in the config allowlist, unless the node
99
+ # has already been deleted from the document.
100
+ #
101
+ # It's important that we not try to reparent the children of a node that
102
+ # has already been deleted, since that seems to trigger a memory leak in
103
+ # Nokogiri.
104
+ unless @elements.include?(name) || node.parent.nil?
105
+ # Elements like br, div, p, etc. need to be replaced with whitespace
106
+ # in order to preserve readability.
107
+ if @whitespace_elements.include?(name)
108
+ node.add_previous_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:before].to_s, node.document))
109
+
110
+ unless node.children.empty?
111
+ node.add_next_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:after].to_s, node.document))
112
+ end
113
+ end
138
114
 
139
- unless allow_data_attributes && attr_name.start_with?('data-') && attr_name =~ REGEX_DATA_ATTR
140
- # Either the attribute isn't a data attribute or arbitrary data
141
- # attributes aren't allowed. Remove the attribute.
142
- attr.unlink
143
- next
115
+ unless node.children.empty?
116
+ unless @remove_all_contents || @remove_element_contents.include?(name)
117
+ node.add_previous_sibling(node.children)
118
+ end
144
119
  end
120
+
121
+ node.unlink
122
+ return
145
123
  end
146
124
 
147
- # The attribute is allowed.
125
+ attr_allowlist = @attributes[name] || @attributes[:all]
126
+
127
+ if attr_allowlist.nil?
128
+ # Delete all attributes from elements with no allowlisted attributes.
129
+ node.attribute_nodes.each { |attr| attr.unlink }
130
+ else
131
+ allow_data_attributes = attr_allowlist.include?(:data)
148
132
 
149
- # Remove any attributes that use unacceptable protocols.
150
- if @protocols.include?(name) && @protocols[name].include?(attr_name)
151
- attr_protocols = @protocols[name][attr_name]
133
+ # Delete any attribute that isn't allowed on this element.
134
+ node.attribute_nodes.each do |attr|
135
+ attr_name = attr.name.downcase
152
136
 
153
- if attr.value =~ REGEX_PROTOCOL
154
- unless attr_protocols.include?($1.downcase)
155
- attr.unlink
156
- next
137
+ unless attr_allowlist.include?(attr_name)
138
+ # The attribute isn't in the allowlist, but may still be allowed
139
+ # if it's a data attribute.
140
+
141
+ unless allow_data_attributes && attr_name.start_with?("data-") && attr_name =~ REGEX_DATA_ATTR
142
+ # Either the attribute isn't a data attribute or arbitrary data
143
+ # attributes aren't allowed. Remove the attribute.
144
+ attr.unlink
145
+ next
146
+ end
157
147
  end
158
148
 
159
- else
160
- unless attr_protocols.include?(:relative)
161
- attr.unlink
162
- next
149
+ # The attribute is allowed.
150
+
151
+ # Remove any attributes that use unacceptable protocols.
152
+ if @protocols.include?(name) && @protocols[name].include?(attr_name)
153
+ attr_protocols = @protocols[name][attr_name]
154
+
155
+ if attr.value =~ REGEX_PROTOCOL
156
+ unless attr_protocols.include?($1.downcase)
157
+ attr.unlink
158
+ next
159
+ end
160
+
161
+ else
162
+ unless attr_protocols.include?(:relative)
163
+ attr.unlink
164
+ next
165
+ end
166
+ end
167
+
168
+ # Leading and trailing whitespace around URLs is ignored at parse
169
+ # time. Stripping it here prevents it from being escaped by the
170
+ # libxml2 workaround below.
171
+ attr.value = attr.value.strip
163
172
  end
164
- end
165
173
 
166
- # Leading and trailing whitespace around URLs is ignored at parse
167
- # time. Stripping it here prevents it from being escaped by the
168
- # libxml2 workaround below.
169
- attr.value = attr.value.strip
174
+ # libxml2 >= 2.9.2 doesn't escape comments within some attributes,
175
+ # in an attempt to preserve server-side includes. This can result in
176
+ # XSS since an unescaped double quote can allow an attacker to
177
+ # inject a non-allowlisted attribute.
178
+ #
179
+ # Sanitize works around this by implementing its own escaping for
180
+ # affected attributes, some of which can exist on any element and
181
+ # some of which can only exist on `<a>` elements.
182
+ #
183
+ # This fix is technically no longer necessary with Nokogumbo >= 2.0
184
+ # since it no longer uses libxml2's serializer, but it's retained to
185
+ # avoid breaking use cases where people might be sanitizing
186
+ # individual Nokogiri nodes and then serializing them manually
187
+ # without Nokogumbo.
188
+ #
189
+ # The relevant libxml2 code is here:
190
+ # <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
191
+ if UNSAFE_LIBXML_ATTRS_GLOBAL.include?(attr_name) ||
192
+ (name == "a" && UNSAFE_LIBXML_ATTRS_A.include?(attr_name))
193
+
194
+ attr.value = attr.value.gsub(UNSAFE_LIBXML_ESCAPE_REGEX, UNSAFE_LIBXML_ESCAPE_CHARS)
195
+ end
196
+ end
170
197
  end
171
198
 
172
- # libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
173
- # attempt to preserve server-side includes. This can result in XSS since
174
- # an unescaped double quote can allow an attacker to inject a
175
- # non-allowlisted attribute.
176
- #
177
- # Sanitize works around this by implementing its own escaping for
178
- # affected attributes, some of which can exist on any element and some
179
- # of which can only exist on `<a>` elements.
180
- #
181
- # This fix is technically no longer necessary with Nokogumbo >= 2.0
182
- # since it no longer uses libxml2's serializer, but it's retained to
183
- # avoid breaking use cases where people might be sanitizing individual
184
- # Nokogiri nodes and then serializing them manually without Nokogumbo.
185
- #
186
- # The relevant libxml2 code is here:
187
- # <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
188
- if UNSAFE_LIBXML_ATTRS_GLOBAL.include?(attr_name) ||
189
- (name == 'a' && UNSAFE_LIBXML_ATTRS_A.include?(attr_name))
190
-
191
- attr.value = attr.value.gsub(UNSAFE_LIBXML_ESCAPE_REGEX, UNSAFE_LIBXML_ESCAPE_CHARS)
199
+ # Add required attributes.
200
+ if @add_attributes.include?(name)
201
+ @add_attributes[name].each { |key, val| node[key] = val }
192
202
  end
193
- end
194
- end
195
203
 
196
- # Add required attributes.
197
- if @add_attributes.include?(name)
198
- @add_attributes[name].each {|key, val| node[key] = val }
199
- end
200
-
201
- # Make a best effort to ensure that text nodes in invalid "unescaped text"
202
- # elements that are inside a math or svg namespace are properly escaped so
203
- # that they don't get parsed as HTML.
204
- #
205
- # Sanitize is explicitly documented as not supporting MathML or SVG, but
206
- # people sometimes allow `<math>` and `<svg>` elements in their custom
207
- # configs without realizing that it's not safe. This workaround makes it
208
- # slightly less unsafe, but you still shouldn't allow `<math>` or `<svg>`
209
- # because Nokogiri doesn't parse them the same way browsers do and Sanitize
210
- # can't guarantee that their contents are safe.
211
- unless node.namespace.nil?
212
- prefix = node.namespace.prefix
213
-
214
- if (prefix == 'math' || prefix == 'svg') && UNESCAPED_TEXT_ELEMENTS.include?(name)
215
- node.children.each do |child|
216
- if child.type == Nokogiri::XML::Node::TEXT_NODE
217
- child.content = CGI.escapeHTML(child.content)
204
+ # Element-specific special cases.
205
+ case name
206
+
207
+ # If this is an allowlisted iframe that has children, remove all its
208
+ # children. The HTML standard says iframes shouldn't have content, but
209
+ # when they do, this content is parsed as text and is serialized
210
+ # verbatim without being escaped, which is unsafe because legacy
211
+ # browsers may still render it and execute `<script>` content. So the
212
+ # safe and correct thing to do is to always remove iframe content.
213
+ when "iframe"
214
+ if !node.children.empty?
215
+ node.children.each do |child|
216
+ child.unlink
217
+ end
218
218
  end
219
- end
220
- end
221
- end
222
219
 
223
- # Element-specific special cases.
224
- case name
225
-
226
- # If this is an allowlisted iframe that has children, remove all its
227
- # children. The HTML standard says iframes shouldn't have content, but when
228
- # they do, this content is parsed as text and is serialized verbatim without
229
- # being escaped, which is unsafe because legacy browsers may still render it
230
- # and execute `<script>` content. So the safe and correct thing to do is to
231
- # always remove iframe content.
232
- when 'iframe'
233
- if !node.children.empty?
234
- node.children.each do |child|
235
- child.unlink
236
- end
237
- end
220
+ # Prevent the use of `<meta>` elements that set a charset other than
221
+ # UTF-8, since Sanitize's output is always UTF-8.
222
+ when "meta"
223
+ if node.has_attribute?("charset") &&
224
+ node["charset"].downcase != "utf-8"
238
225
 
239
- # Prevent the use of `<meta>` elements that set a charset other than UTF-8,
240
- # since Sanitize's output is always UTF-8.
241
- when 'meta'
242
- if node.has_attribute?('charset') &&
243
- node['charset'].downcase != 'utf-8'
226
+ node["charset"] = "utf-8"
227
+ end
244
228
 
245
- node['charset'] = 'utf-8'
246
- end
229
+ if node.has_attribute?("http-equiv") &&
230
+ node.has_attribute?("content") &&
231
+ node["http-equiv"].downcase == "content-type" &&
232
+ node["content"].downcase =~ /;\s*charset\s*=\s*(?!utf-8)/
247
233
 
248
- if node.has_attribute?('http-equiv') &&
249
- node.has_attribute?('content') &&
250
- node['http-equiv'].downcase == 'content-type' &&
251
- node['content'].downcase =~ /;\s*charset\s*=\s*(?!utf-8)/
234
+ node["content"] = node["content"].gsub(/;\s*charset\s*=.+\z/, ";charset=utf-8")
235
+ end
252
236
 
253
- node['content'] = node['content'].gsub(/;\s*charset\s*=.+\z/, ';charset=utf-8')
237
+ # A `<noscript>` element's content is parsed differently in browsers
238
+ # depending on whether or not scripting is enabled. Since Nokogiri
239
+ # doesn't support scripting, it always parses `<noscript>` elements as
240
+ # if scripting is disabled. This results in edge cases where it's not
241
+ # possible to reliably sanitize the contents of a `<noscript>` element
242
+ # because Nokogiri can't fully replicate the parsing behavior of a
243
+ # scripting-enabled browser. The safest thing to do is to simply remove
244
+ # all `<noscript>` elements.
245
+ when "noscript"
246
+ node.unlink
247
+ end
254
248
  end
255
-
256
- # A `<noscript>` element's content is parsed differently in browsers
257
- # depending on whether or not scripting is enabled. Since Nokogiri doesn't
258
- # support scripting, it always parses `<noscript>` elements as if scripting
259
- # is disabled. This results in edge cases where it's not possible to
260
- # reliably sanitize the contents of a `<noscript>` element because Nokogiri
261
- # can't fully replicate the parsing behavior of a scripting-enabled browser.
262
- # The safest thing to do is to simply remove all `<noscript>` elements.
263
- when 'noscript'
264
- node.unlink
265
249
  end
266
250
  end
267
-
268
- end; end; end
251
+ end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class Sanitize
2
- VERSION = '6.1.2'
4
+ VERSION = "7.0.0"
3
5
  end
data/lib/sanitize.rb CHANGED
@@ -1,20 +1,20 @@
1
- # encoding: utf-8
2
-
3
- require 'nokogiri'
4
- require 'set'
5
-
6
- require_relative 'sanitize/version'
7
- require_relative 'sanitize/config'
8
- require_relative 'sanitize/config/default'
9
- require_relative 'sanitize/config/restricted'
10
- require_relative 'sanitize/config/basic'
11
- require_relative 'sanitize/config/relaxed'
12
- require_relative 'sanitize/css'
13
- require_relative 'sanitize/transformers/clean_cdata'
14
- require_relative 'sanitize/transformers/clean_comment'
15
- require_relative 'sanitize/transformers/clean_css'
16
- require_relative 'sanitize/transformers/clean_doctype'
17
- require_relative 'sanitize/transformers/clean_element'
1
+ # frozen_string_literal: true
2
+
3
+ require "nokogiri"
4
+ require "set"
5
+
6
+ require_relative "sanitize/version"
7
+ require_relative "sanitize/config"
8
+ require_relative "sanitize/config/default"
9
+ require_relative "sanitize/config/restricted"
10
+ require_relative "sanitize/config/basic"
11
+ require_relative "sanitize/config/relaxed"
12
+ require_relative "sanitize/css"
13
+ require_relative "sanitize/transformers/clean_cdata"
14
+ require_relative "sanitize/transformers/clean_comment"
15
+ require_relative "sanitize/transformers/clean_css"
16
+ require_relative "sanitize/transformers/clean_doctype"
17
+ require_relative "sanitize/transformers/clean_element"
18
18
 
19
19
  class Sanitize
20
20
  attr_reader :config
@@ -33,12 +33,12 @@ class Sanitize
33
33
  # - https://infra.spec.whatwg.org/#noncharacter
34
34
  REGEX_HTML_NON_CHARACTERS = /[\ufdd0-\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}]+/u
35
35
 
36
- # Matches an attribute value that could be treated by a browser as a URL
37
- # with a protocol prefix, such as "http:" or "javascript:". Any string of zero
38
- # or more characters followed by a colon is considered a match, even if the
39
- # colon is encoded as an entity and even if it's an incomplete entity (which
40
- # IE6 and Opera will still parse).
41
- REGEX_PROTOCOL = /\A\s*([^\/#]*?)(?:\:|&#0*58|&#x0*3a)/i
36
+ # Matches an attribute value that could be treated by a browser as a URL with
37
+ # a protocol prefix, such as "http:" or "javascript:". Any string of zero or
38
+ # more characters followed by a colon is considered a match, even if the colon
39
+ # is encoded as an entity and even if it's an incomplete entity (which IE6 and
40
+ # Opera will still parse).
41
+ REGEX_PROTOCOL = /\A\s*([^\/#]*?)(?::|&#0*58|&#x0*3a)/i
42
42
 
43
43
  # Matches one or more characters that should be stripped from HTML before
44
44
  # parsing. This is a combination of `REGEX_HTML_CONTROL_CHARACTERS` and
@@ -99,12 +99,12 @@ class Sanitize
99
99
  @transformers << Transformers::CleanElement.new(@config)
100
100
  @transformers << Transformers::CleanComment unless @config[:allow_comments]
101
101
 
102
- if @config[:elements].include?('style')
102
+ if @config[:elements].include?("style")
103
103
  scss = Sanitize::CSS.new(config)
104
104
  @transformers << Transformers::CSS::CleanElement.new(scss)
105
105
  end
106
106
 
107
- if @config[:attributes].values.any? {|attr| attr.include?('style') }
107
+ if @config[:attributes].values.any? { |attr| attr.include?("style") }
108
108
  scss ||= Sanitize::CSS.new(config)
109
109
  @transformers << Transformers::CSS::CleanAttribute.new(scss)
110
110
  end
@@ -112,7 +112,7 @@ class Sanitize
112
112
  @transformers << Transformers::CleanDoctype
113
113
  @transformers << Transformers::CleanCDATA
114
114
 
115
- @transformer_config = { config: @config }
115
+ @transformer_config = {config: @config}
116
116
  end
117
117
 
118
118
  # Returns a sanitized copy of the given _html_ document.
@@ -121,7 +121,7 @@ class Sanitize
121
121
  # error will be raised. If this is undesirable, you should probably use
122
122
  # {#fragment} instead.
123
123
  def document(html)
124
- return '' unless html
124
+ return "" unless html
125
125
 
126
126
  doc = Nokogiri::HTML5.parse(preprocess(html), **@config[:parser_options])
127
127
  node!(doc)
@@ -133,7 +133,7 @@ class Sanitize
133
133
 
134
134
  # Returns a sanitized copy of the given _html_ fragment.
135
135
  def fragment(html)
136
- return '' unless html
136
+ return "" unless html
137
137
 
138
138
  frag = Nokogiri::HTML5.fragment(preprocess(html), **@config[:parser_options])
139
139
  node!(frag)
@@ -152,7 +152,7 @@ class Sanitize
152
152
  raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)
153
153
 
154
154
  if node.is_a?(Nokogiri::XML::Document)
155
- unless @config[:elements].include?('html')
155
+ unless @config[:elements].include?("html")
156
156
  raise Error, 'When sanitizing a document, "<html>" must be allowlisted.'
157
157
  end
158
158
  end
@@ -175,13 +175,13 @@ class Sanitize
175
175
  def preprocess(html)
176
176
  html = html.to_s.dup
177
177
 
178
- unless html.encoding.name == 'UTF-8'
179
- html.encode!('UTF-8',
180
- :invalid => :replace,
181
- :undef => :replace)
178
+ unless html.encoding.name == "UTF-8"
179
+ html.encode!("UTF-8",
180
+ invalid: :replace,
181
+ undef: :replace)
182
182
  end
183
183
 
184
- html.gsub!(REGEX_UNSUITABLE_CHARS, '')
184
+ html.gsub!(REGEX_UNSUITABLE_CHARS, "")
185
185
  html
186
186
  end
187
187
 
@@ -225,17 +225,17 @@ class Sanitize
225
225
 
226
226
  child = node.child
227
227
 
228
- while child do
228
+ while child
229
229
  prev = child.previous_sibling
230
230
  traverse(child, &block)
231
231
 
232
- if child.parent == node
233
- child = child.next_sibling
232
+ child = if child.parent == node
233
+ child.next_sibling
234
234
  else
235
235
  # The child was unlinked or reparented, so traverse the previous node's
236
236
  # next sibling, or the parent's first child if there is no previous
237
237
  # node.
238
- child = prev ? prev.next_sibling : node.child
238
+ prev ? prev.next_sibling : node.child
239
239
  end
240
240
  end
241
241
  end
data/test/common.rb CHANGED
@@ -1,3 +1,4 @@
1
- # encoding: utf-8
2
- require 'minitest/autorun'
3
- require_relative '../lib/sanitize'
1
+ # frozen_string_literal: true
2
+
3
+ require "minitest/autorun"
4
+ require "sanitize"