sanitize 2.1.1 → 3.0.0

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of sanitize might be problematic. Click here for more details.

data/lib/sanitize.rb CHANGED
@@ -1,55 +1,24 @@
1
1
  # encoding: utf-8
2
- #--
3
- # Copyright (c) 2013 Ryan Grove <ryan@wonko.com>
4
- #
5
- # Permission is hereby granted, free of charge, to any person obtaining a copy
6
- # of this software and associated documentation files (the 'Software'), to deal
7
- # in the Software without restriction, including without limitation the rights
8
- # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
- # copies of the Software, and to permit persons to whom the Software is
10
- # furnished to do so, subject to the following conditions:
11
- #
12
- # The above copyright notice and this permission notice shall be included in all
13
- # copies or substantial portions of the Software.
14
- #
15
- # THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
- # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
- # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
- # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
- # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
- # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- # SOFTWARE.
22
- #++
23
2
 
3
+ require 'nokogumbo'
24
4
  require 'set'
25
5
 
26
- require 'nokogiri'
27
- require 'sanitize/version'
28
- require 'sanitize/config'
29
- require 'sanitize/config/restricted'
30
- require 'sanitize/config/basic'
31
- require 'sanitize/config/relaxed'
32
- require 'sanitize/transformers/clean_cdata'
33
- require 'sanitize/transformers/clean_comment'
34
- require 'sanitize/transformers/clean_element'
6
+ require_relative 'sanitize/version'
7
+ require_relative 'sanitize/config'
8
+ require_relative 'sanitize/config/default'
9
+ require_relative 'sanitize/config/restricted'
10
+ require_relative 'sanitize/config/basic'
11
+ require_relative 'sanitize/config/relaxed'
12
+ require_relative 'sanitize/css'
13
+ require_relative 'sanitize/transformers/clean_cdata'
14
+ require_relative 'sanitize/transformers/clean_comment'
15
+ require_relative 'sanitize/transformers/clean_css'
16
+ require_relative 'sanitize/transformers/clean_doctype'
17
+ require_relative 'sanitize/transformers/clean_element'
35
18
 
36
19
  class Sanitize
37
20
  attr_reader :config
38
21
 
39
- # Matches a valid HTML5 data attribute name. The unicode ranges included here
40
- # are a conservative subset of the full range of characters that are
41
- # technically allowed, with the intent of matching the most common characters
42
- # used in data attribute names while excluding uncommon or potentially
43
- # misleading characters, or characters with the potential to be normalized
44
- # into unsafe or confusing forms.
45
- #
46
- # If you need data attr names with characters that aren't included here (such
47
- # as combining marks, full-width characters, or CJK), please consider creating
48
- # a custom transformer to validate attributes according to your needs.
49
- #
50
- # http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#embedding-custom-non-visible-data-with-the-data-*-attributes
51
- REGEX_DATA_ATTR = /\Adata-(?!xml)[a-z_][\w.\u00E0-\u00F6\u00F8-\u017F\u01DD-\u02AF-]*\z/u
52
-
53
22
  # Matches an attribute value that could be treated by a browser as a URL
54
23
  # with a protocol prefix, such as "http:" or "javascript:". Any string of zero
55
24
  # or more characters followed by a colon is considered a match, even if the
@@ -57,38 +26,47 @@ class Sanitize
57
26
  # IE6 and Opera will still parse).
58
27
  REGEX_PROTOCOL = /\A([^\/#]*?)(?:\:|&#0*58|&#x0*3a)/i
59
28
 
29
+ # Matches Unicode characters that should be stripped from HTML before passing
30
+ # it to the parser.
31
+ #
32
+ # http://www.w3.org/TR/unicode-xml/#Charlist
33
+ REGEX_UNSUITABLE_CHARS = /[\u0340\u0341\u17a3\u17d3\u2028\u2029\u202a-\u202e\u206a-\u206f\ufff9-\ufffb\ufeff\ufffc\u{1d173}-\u{1d17a}\u{e0000}-\u{e007f}]/u
34
+
60
35
  #--
61
36
  # Class Methods
62
37
  #++
63
38
 
64
- # Returns a sanitized copy of _html_, using the settings in _config_ if
65
- # specified.
66
- def self.clean(html, config = {})
67
- Sanitize.new(config).clean(html)
39
+ # Returns a sanitized copy of the given full _html_ document, using the
40
+ # settings in _config_ if specified.
41
+ #
42
+ # When sanitizing a document, the `<html>` element must be whitelisted or an
43
+ # error will be raised. If this is undesirable, you should probably use
44
+ # {#fragment} instead.
45
+ def self.document(html, config = {})
46
+ Sanitize.new(config).document(html)
68
47
  end
69
48
 
70
- # Performs Sanitize#clean in place, returning _html_, or +nil+ if no changes
71
- # were made.
72
- def self.clean!(html, config = {})
73
- Sanitize.new(config).clean!(html)
49
+ # Returns a sanitized copy of the given _html_ fragment, using the settings in
50
+ # _config_ if specified.
51
+ def self.fragment(html, config = {})
52
+ Sanitize.new(config).fragment(html)
74
53
  end
75
54
 
76
- # Performs a Sanitize#clean using a full-document HTML parser instead of
77
- # the default fragment parser. This will add a DOCTYPE and html tag
78
- # unless they are already present
79
- def self.clean_document(html, config = {})
80
- Sanitize.new(config).clean_document(html)
55
+ # Sanitizes the given `Nokogiri::XML::Node` instance and all its children.
56
+ def self.node!(node, config = {})
57
+ Sanitize.new(config).node!(node)
81
58
  end
82
59
 
83
- # Performs Sanitize#clean_document in place, returning _html_, or +nil+ if no
84
- # changes were made.
85
- def self.clean_document!(html, config = {})
86
- Sanitize.new(config).clean_document!(html)
87
- end
60
+ # Aliases for pre-3.0.0 backcompat.
61
+ class << Sanitize
62
+ # @deprecated Use {.document} instead.
63
+ alias_method :clean_document, :document
64
+
65
+ # @deprecated Use {.fragment} instead.
66
+ alias_method :clean, :fragment
88
67
 
89
- # Sanitizes the specified Nokogiri::XML::Node and all its children.
90
- def self.clean_node!(node, config = {})
91
- Sanitize.new(config).clean_node!(node)
68
+ # @deprecated Use {.node!} instead.
69
+ alias_method :clean_node!, :node!
92
70
  end
93
71
 
94
72
  #--
@@ -97,97 +75,156 @@ class Sanitize
97
75
 
98
76
  # Returns a new Sanitize object initialized with the settings in _config_.
99
77
  def initialize(config = {})
100
- @config = Config::DEFAULT.merge(config)
78
+ @config = Config.merge(Config::DEFAULT, config)
79
+
80
+ @transformers = Array(@config[:transformers].dup)
101
81
 
102
- @transformers = {
103
- :breadth => Array(@config[:transformers_breadth].dup),
104
- :depth => Array(@config[:transformers]) + Array(@config[:transformers_depth])
105
- }
82
+ # Default transformers always run at the end of the chain, after any custom
83
+ # transformers.
84
+ @transformers << Transformers::CleanComment unless @config[:allow_comments]
85
+ @transformers << Transformers::CleanDoctype unless @config[:allow_doctype]
106
86
 
107
- # Default depth transformers. These always run at the end of the chain,
108
- # after any custom transformers.
109
- @transformers[:depth] << Transformers::CleanComment unless @config[:allow_comments]
87
+ if @config[:elements].include?('style')
88
+ scss = Sanitize::CSS.new(config)
89
+ @transformers << Transformers::CSS::CleanElement.new(scss)
90
+ end
91
+
92
+ if @config[:attributes].values.any? {|attr| attr.include?('style') }
93
+ scss ||= Sanitize::CSS.new(config)
94
+ @transformers << Transformers::CSS::CleanAttribute.new(scss)
95
+ end
110
96
 
111
- @transformers[:depth] <<
97
+ @transformers <<
112
98
  Transformers::CleanCDATA <<
113
99
  Transformers::CleanElement.new(@config)
114
100
  end
115
101
 
116
- # Returns a sanitized copy of the given _html_ fragment.
117
- def clean(html)
118
- if html
119
- dupe = html.dup
120
- clean!(dupe) || dupe
121
- end
102
+ # Returns a sanitized copy of the given _html_ document.
103
+ #
104
+ # When sanitizing a document, the `<html>` element must be whitelisted or an
105
+ # error will be raised. If this is undesirable, you should probably use
106
+ # {#fragment} instead.
107
+ def document(html)
108
+ return '' unless html
109
+
110
+ doc = Nokogiri::HTML5.parse(preprocess(html))
111
+ node!(doc)
112
+ to_html(doc)
122
113
  end
123
114
 
124
- # Performs clean in place, returning _html_, or +nil+ if no changes were
125
- # made.
126
- def clean!(html, parser = Nokogiri::HTML::DocumentFragment)
127
- fragment = parser.parse(html)
128
- clean_node!(fragment)
115
+ # @deprecated Use {#document} instead.
116
+ alias_method :clean_document, :document
129
117
 
130
- output_method_params = {:encoding => @config[:output_encoding], :indent => 0}
118
+ # Returns a sanitized copy of the given _html_ fragment.
119
+ def fragment(html)
120
+ return '' unless html
121
+
122
+ html = preprocess(html)
123
+ doc = Nokogiri::HTML5.parse("<html><body>#{html}")
131
124
 
132
- if @config[:output] == :xhtml
133
- output_method = fragment.method(:to_xhtml)
134
- output_method_params[:save_with] = Nokogiri::XML::Node::SaveOptions::AS_XHTML
135
- elsif @config[:output] == :html
136
- output_method = fragment.method(:to_html)
125
+ # Hack to allow fragments containing <body>. Borrowed from
126
+ # Nokogiri::HTML::DocumentFragment.
127
+ if html =~ /\A<body(?:\s|>)/i
128
+ path = '/html/body'
137
129
  else
138
- raise Error, "unsupported output format: #{@config[:output]}"
130
+ path = '/html/body/node()'
139
131
  end
140
132
 
141
- result = output_method.call(output_method_params)
133
+ frag = doc.fragment
134
+ doc.xpath(path).each {|node| frag << node }
142
135
 
143
- return result == html ? nil : html[0, html.length] = result
136
+ node!(frag)
137
+ to_html(frag)
144
138
  end
145
139
 
146
- # Returns a sanitized copy of the given full _html_ document.
147
- def clean_document(html)
148
- unless html.nil?
149
- clean_document!(html.dup) || html
140
+ # @deprecated Use {#fragment} instead.
141
+ alias_method :clean, :fragment
142
+
143
+ # Sanitizes the given `Nokogiri::XML::Node` and all its children, modifying it
144
+ # in place.
145
+ #
146
+ # If _node_ is a `Nokogiri::XML::Document`, the `<html>` element must be
147
+ # whitelisted or an error will be raised.
148
+ def node!(node)
149
+ raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)
150
+
151
+ if node.is_a?(Nokogiri::XML::Document)
152
+ unless @config[:elements].include?('html')
153
+ raise Error, 'When sanitizing a document, "<html>" must be whitelisted.'
154
+ end
150
155
  end
151
- end
152
156
 
153
- # Performs clean_document in place, returning _html_, or +nil+ if no changes
154
- # were made.
155
- def clean_document!(html)
156
- if !@config[:elements].include?('html') && !@config[:remove_contents]
157
- raise 'You must have the HTML element whitelisted to call #clean_document unless remove_contents is set to true'
158
- # otherwise Nokogiri will raise for having multiple root nodes when
159
- # it moves its children to the root document context
157
+ node_whitelist = Set.new
158
+
159
+ traverse(node) do |n|
160
+ transform_node!(n, node_whitelist)
160
161
  end
161
162
 
162
- clean!(html, Nokogiri::HTML::Document)
163
+ node
163
164
  end
164
165
 
165
- # Sanitizes the specified Nokogiri::XML::Node and all its children.
166
- def clean_node!(node)
167
- raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)
166
+ # @deprecated Use {#node!} instead.
167
+ alias_method :clean_node!, :node!
168
168
 
169
- node_whitelist = Set.new
169
+ private
170
170
 
171
- unless @transformers[:breadth].empty?
172
- traverse_breadth(node) {|n| transform_node!(n, node_whitelist, :breadth) }
171
# Returns a cleaned-up UTF-8 copy of _html_ that is ready to be handed to the
# parser.
#
# The input is coerced with `to_s` and duplicated first, so the caller's object
# is never modified; frozen strings are accepted and +nil+ yields an empty
# string. A string in any encoding other than UTF-8 is transcoded to UTF-8,
# with invalid or undefined byte sequences replaced instead of raising. Every
# character matched by REGEX_UNSUITABLE_CHARS is then removed.
def preprocess(html)
  # Work on a private copy. The copy has to be assigned back: a bare
  # `html.to_s.dup` throws the duplicate away and leaves the `encode!` and
  # `gsub!` calls below operating on the caller's own string.
  html = html.to_s.dup

  unless html.encoding.name == 'UTF-8'
    html.encode!('UTF-8',
      :invalid => :replace,
      :undef   => :replace)
  end

  html.gsub!(REGEX_UNSUITABLE_CHARS, '')
  html
end
178
184
 
179
- private
185
+ def to_html(node)
186
+ replace_meta = false
180
187
 
181
- def transform_node!(node, node_whitelist, mode)
182
- @transformers[mode].each do |transformer|
183
- result = transformer.call({
188
+ # Hacky workaround for a libxml2 bug that adds an undesired Content-Type
189
+ # meta tag to all serialized HTML documents.
190
+ #
191
+ # https://github.com/sparklemotion/nokogiri/issues/1008
192
+ if node.type == Nokogiri::XML::Node::DOCUMENT_NODE ||
193
+ node.type == Nokogiri::XML::Node::HTML_DOCUMENT_NODE
194
+
195
+ regex_meta = %r|(<html[^>]*>\s*<head[^>]*>\s*)<meta http-equiv="Content-Type" content="text/html; charset=utf-8">|i
196
+
197
+ # Only replace the content-type meta tag if <meta> isn't whitelisted or
198
+ # the original document didn't actually include a content-type meta tag.
199
+ replace_meta = !@config[:elements].include?('meta') ||
200
+ node.xpath('/html/head/meta[@http-equiv]').none? do |meta|
201
+ meta['http-equiv'].downcase == 'content-type'
202
+ end
203
+ end
204
+
205
+ so = Nokogiri::XML::Node::SaveOptions
206
+
207
+ # Serialize to HTML without any formatting to prevent Nokogiri from adding
208
+ # newlines after certain tags.
209
+ html = node.to_html(
210
+ :encoding => 'utf-8',
211
+ :indent => 0,
212
+ :save_with => so::NO_DECLARATION | so::NO_EMPTY_TAGS | so::AS_HTML
213
+ )
214
+
215
+ html.gsub!(regex_meta, '\1') if replace_meta
216
+ html
217
+ end
218
+
219
+ def transform_node!(node, node_whitelist)
220
+ @transformers.each do |transformer|
221
+ result = transformer.call(
184
222
  :config => @config,
185
223
  :is_whitelisted => node_whitelist.include?(node),
186
224
  :node => node,
187
225
  :node_name => node.name.downcase,
188
- :node_whitelist => node_whitelist,
189
- :traversal_mode => mode
190
- })
226
+ :node_whitelist => node_whitelist
227
+ )
191
228
 
192
229
  if result.is_a?(Hash) && result[:node_whitelist].respond_to?(:each)
193
230
  node_whitelist.merge(result[:node_whitelist])
@@ -197,18 +234,26 @@ class Sanitize
197
234
  node
198
235
  end
199
236
 
200
- # Performs breadth-first traversal, operating first on the root node, then
201
- # traversing downwards.
202
- def traverse_breadth(node, &block)
237
# Walks _node_ and its descendants from the top down: the block is called with
# _node_ itself first, and then each of its children is traversed the same way,
# in order. The block is allowed to unlink or reparent the node it is given.
def traverse(node, &block)
  block.call(node)

  current = node.child

  while current
    # Remember the left-hand neighbour before descending, because +current+ may
    # no longer be attached to +node+ once its traversal returns.
    before = current.previous_sibling
    traverse(current, &block)

    current =
      if current.parent == node
        # Still a child of +node+, so just step to its next sibling.
        current.next_sibling
      elsif before
        # Unlinked or moved: resume with whatever now follows the neighbour
        # that was remembered above...
        before.next_sibling
      else
        # ...or, when there was no such neighbour, with the parent's current
        # first child.
        node.child
      end
  end
end
213
258
 
214
259
  class Error < StandardError; end
@@ -1,86 +1,60 @@
1
- #--
2
- # Copyright (c) 2013 Ryan Grove <ryan@wonko.com>
3
- #
4
- # Permission is hereby granted, free of charge, to any person obtaining a copy
5
- # of this software and associated documentation files (the 'Software'), to deal
6
- # in the Software without restriction, including without limitation the rights
7
- # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8
- # copies of the Software, and to permit persons to whom the Software is
9
- # furnished to do so, subject to the following conditions:
10
- #
11
- # The above copyright notice and this permission notice shall be included in all
12
- # copies or substantial portions of the Software.
13
- #
14
- # THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
- # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
- # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17
- # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
- # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19
- # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20
- # SOFTWARE.
21
- #++
1
+ # encoding: utf-8
2
+
3
+ require 'set'
22
4
 
23
5
  class Sanitize
24
6
  module Config
25
- DEFAULT = {
26
-
27
- # Whether or not to allow HTML comments. Allowing comments is strongly
28
- # discouraged, since IE allows script execution within conditional
29
- # comments.
30
- :allow_comments => false,
31
-
32
- # HTML attributes to add to specific elements. By default, no attributes
33
- # are added.
34
- :add_attributes => {},
35
-
36
- # HTML attributes to allow in specific elements. By default, no attributes
37
- # are allowed. Use the symbol :data to indicate that arbitrary HTML5
38
- # data-* attributes should be allowed.
39
- :attributes => {},
40
-
41
- # HTML elements to allow. By default, no elements are allowed (which means
42
- # that all HTML will be stripped).
43
- :elements => [],
44
-
45
- # Output format. Supported formats are :html and :xhtml. Default is :html.
46
- :output => :html,
47
-
48
- # Character encoding to use for HTML output. Default is 'utf-8'.
49
- :output_encoding => 'utf-8',
50
-
51
- # URL handling protocols to allow in specific attributes. By default, no
52
- # protocols are allowed. Use :relative in place of a protocol if you want
53
- # to allow relative URLs sans protocol.
54
- :protocols => {},
55
-
56
- # If this is true, Sanitize will remove the contents of any filtered
57
- # elements in addition to the elements themselves. By default, Sanitize
58
- # leaves the safe parts of an element's contents behind when the element
59
- # is removed.
60
- #
61
- # If this is an Array of element names, then only the contents of the
62
- # specified elements (when filtered) will be removed, and the contents of
63
- # all other filtered elements will be left behind.
64
- :remove_contents => false,
65
-
66
- # Transformers allow you to filter or alter nodes using custom logic. See
67
- # README.rdoc for details and examples.
68
- :transformers => [],
69
-
70
- # By default, transformers perform depth-first traversal (deepest node
71
- # upward). This setting allows you to specify transformers that should
72
- # perform breadth-first traversal (top node downward).
73
- :transformers_breadth => [],
74
7
 
75
- # Elements which, when removed, should have their contents surrounded by
76
- # space characters to preserve readability. For example,
77
- # `foo<div>bar</div>baz` will become 'foo bar baz' when the <div> is
78
- # removed.
79
- :whitespace_elements => %w[
80
- address article aside blockquote br dd div dl dt footer h1 h2 h3 h4 h5
81
- h6 header hgroup hr li nav ol p pre section ul
82
- ]
8
# Recursively freezes _config_ and returns it.
#
# For a Hash every value is frozen first; for an Array or a Set every member is
# frozen first. The object that was passed in is then frozen itself, so the
# return value is the same (now frozen) object, not a copy.
def self.freeze_config(config)
  case config
  when Hash
    config.each_value { |value| freeze_config(value) }
  when Array, Set
    config.each { |member| freeze_config(member) }
  end

  config.freeze
end
18
+
19
# Builds and returns a new Hash that combines *config* with *other_config*,
# giving *other_config* precedence. Neither argument is modified.
#
# For every key that appears in either Hash:
#
# * If both sides hold a Hash, the two are merged recursively (when the
#   *config* side is empty, a `dup` of the *other_config* Hash is used).
# * Otherwise an Array supplied by *other_config* is stored as a Set, except
#   under the +:transformers+ key, where it stays an Array.
# * Any other value is copied with `dup` when that is possible, or stored
#   as-is when it is not.
#
# Keys found only in *config* are carried over the same way (`dup` when
# possible). Raises ArgumentError when either argument is not a Hash.
#
# This is the safest way to use a built-in Sanitize config as the basis for
# your own custom config.
def self.merge(config, other_config = {})
  raise ArgumentError, 'config must be a Hash' unless Hash === config
  raise ArgumentError, 'other_config must be a Hash' unless Hash === other_config

  all_keys = Set.new(config.keys + other_config.keys)

  all_keys.each_with_object({}) do |key, result|
    base = config[key]

    # Nothing overrides this key, so just carry the base value across.
    unless other_config.has_key?(key)
      result[key] = can_dupe?(base) ? base.dup : base
      next
    end

    incoming = other_config[key]

    result[key] =
      if Hash === base && Hash === incoming
        base.empty? ? incoming.dup : merge(base, incoming)
      elsif Array === incoming && key != :transformers
        Set.new(incoming)
      elsif can_dupe?(incoming)
        incoming.dup
      else
        incoming
      end
  end
end
51
+
52
# Tells whether `dup` may be safely called on _value_.
#
# Returns `false` for `true`, `false`, `nil`, any Numeric and any Symbol, and
# `true` for every other value.
def self.can_dupe?(value)
  return false if true == value || false == value
  return false if value.nil?

  [Numeric, Symbol].none? { |klass| klass === value }
end
57
+ private_class_method :can_dupe?
83
58
 
84
- }
85
59
  end
86
60
  end