sanitize 2.1.1 → 6.0.0

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of sanitize might be problematic. Click here for more details.

data/lib/sanitize.rb CHANGED
@@ -1,94 +1,87 @@
1
1
  # encoding: utf-8
2
- #--
3
- # Copyright (c) 2013 Ryan Grove <ryan@wonko.com>
4
- #
5
- # Permission is hereby granted, free of charge, to any person obtaining a copy
6
- # of this software and associated documentation files (the 'Software'), to deal
7
- # in the Software without restriction, including without limitation the rights
8
- # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
- # copies of the Software, and to permit persons to whom the Software is
10
- # furnished to do so, subject to the following conditions:
11
- #
12
- # The above copyright notice and this permission notice shall be included in all
13
- # copies or substantial portions of the Software.
14
- #
15
- # THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
- # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
- # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
- # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
- # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
- # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- # SOFTWARE.
22
- #++
23
2
 
3
+ require 'nokogiri'
24
4
  require 'set'
25
5
 
26
- require 'nokogiri'
27
- require 'sanitize/version'
28
- require 'sanitize/config'
29
- require 'sanitize/config/restricted'
30
- require 'sanitize/config/basic'
31
- require 'sanitize/config/relaxed'
32
- require 'sanitize/transformers/clean_cdata'
33
- require 'sanitize/transformers/clean_comment'
34
- require 'sanitize/transformers/clean_element'
6
+ require_relative 'sanitize/version'
7
+ require_relative 'sanitize/config'
8
+ require_relative 'sanitize/config/default'
9
+ require_relative 'sanitize/config/restricted'
10
+ require_relative 'sanitize/config/basic'
11
+ require_relative 'sanitize/config/relaxed'
12
+ require_relative 'sanitize/css'
13
+ require_relative 'sanitize/transformers/clean_cdata'
14
+ require_relative 'sanitize/transformers/clean_comment'
15
+ require_relative 'sanitize/transformers/clean_css'
16
+ require_relative 'sanitize/transformers/clean_doctype'
17
+ require_relative 'sanitize/transformers/clean_element'
35
18
 
36
19
  class Sanitize
37
20
  attr_reader :config
38
21
 
39
- # Matches a valid HTML5 data attribute name. The unicode ranges included here
40
- # are a conservative subset of the full range of characters that are
41
- # technically allowed, with the intent of matching the most common characters
42
- # used in data attribute names while excluding uncommon or potentially
43
- # misleading characters, or characters with the potential to be normalized
44
- # into unsafe or confusing forms.
22
+ # Matches one or more control characters that should be removed from HTML
23
+ # before parsing, as defined by the HTML living standard.
45
24
  #
46
- # If you need data attr names with characters that aren't included here (such
47
- # as combining marks, full-width characters, or CJK), please consider creating
48
- # a custom transformer to validate attributes according to your needs.
25
+ # - https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
26
+ # - https://infra.spec.whatwg.org/#control
27
+ REGEX_HTML_CONTROL_CHARACTERS = /[\u0001-\u0008\u000b\u000e-\u001f\u007f-\u009f]+/u
28
+
29
+ # Matches one or more non-characters that should be removed from HTML before
30
+ # parsing, as defined by the HTML living standard.
49
31
  #
50
- # http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#embedding-custom-non-visible-data-with-the-data-*-attributes
51
- REGEX_DATA_ATTR = /\Adata-(?!xml)[a-z_][\w.\u00E0-\u00F6\u00F8-\u017F\u01DD-\u02AF-]*\z/u
32
+ # - https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
33
+ # - https://infra.spec.whatwg.org/#noncharacter
34
+ REGEX_HTML_NON_CHARACTERS = /[\ufdd0-\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}]+/u
52
35
 
53
36
  # Matches an attribute value that could be treated by a browser as a URL
54
37
  # with a protocol prefix, such as "http:" or "javascript:". Any string of zero
55
38
  # or more characters followed by a colon is considered a match, even if the
56
39
  # colon is encoded as an entity and even if it's an incomplete entity (which
57
40
  # IE6 and Opera will still parse).
58
- REGEX_PROTOCOL = /\A([^\/#]*?)(?:\:|&#0*58|&#x0*3a)/i
41
+ REGEX_PROTOCOL = /\A\s*([^\/#]*?)(?:\:|&#0*58|&#x0*3a)/i
42
+
43
+ # Matches one or more characters that should be stripped from HTML before
44
+ # parsing. This is a combination of `REGEX_HTML_CONTROL_CHARACTERS` and
45
+ # `REGEX_HTML_NON_CHARACTERS`.
46
+ #
47
+ # https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
48
+ REGEX_UNSUITABLE_CHARS = /(?:#{REGEX_HTML_CONTROL_CHARACTERS}|#{REGEX_HTML_NON_CHARACTERS})/u
59
49
 
60
50
  #--
61
51
  # Class Methods
62
52
  #++
63
53
 
64
- # Returns a sanitized copy of _html_, using the settings in _config_ if
65
- # specified.
66
- def self.clean(html, config = {})
67
- Sanitize.new(config).clean(html)
54
+ # Returns a sanitized copy of the given full _html_ document, using the
55
+ # settings in _config_ if specified.
56
+ #
57
+ # When sanitizing a document, the `<html>` element must be allowlisted or an
58
+ # error will be raised. If this is undesirable, you should probably use
59
+ # {.fragment} instead.
60
+ def self.document(html, config = {})
61
+ Sanitize.new(config).document(html)
68
62
  end
69
63
 
70
- # Performs Sanitize#clean in place, returning _html_, or +nil+ if no changes
71
- # were made.
72
- def self.clean!(html, config = {})
73
- Sanitize.new(config).clean!(html)
64
+ # Returns a sanitized copy of the given _html_ fragment, using the settings in
65
+ # _config_ if specified.
66
+ def self.fragment(html, config = {})
67
+ Sanitize.new(config).fragment(html)
74
68
  end
75
69
 
76
- # Performs a Sanitize#clean using a full-document HTML parser instead of
77
- # the default fragment parser. This will add a DOCTYPE and html tag
78
- # unless they are already present
79
- def self.clean_document(html, config = {})
80
- Sanitize.new(config).clean_document(html)
70
+ # Sanitizes the given `Nokogiri::XML::Node` instance and all its children.
71
+ def self.node!(node, config = {})
72
+ Sanitize.new(config).node!(node)
81
73
  end
82
74
 
83
- # Performs Sanitize#clean_document in place, returning _html_, or +nil+ if no
84
- # changes were made.
85
- def self.clean_document!(html, config = {})
86
- Sanitize.new(config).clean_document!(html)
87
- end
75
+ # Aliases for pre-3.0.0 backcompat.
76
+ class << Sanitize
77
+ # @deprecated Use {.document} instead.
78
+ alias_method :clean_document, :document
88
79
 
89
- # Sanitizes the specified Nokogiri::XML::Node and all its children.
90
- def self.clean_node!(node, config = {})
91
- Sanitize.new(config).clean_node!(node)
80
+ # @deprecated Use {.fragment} instead.
81
+ alias_method :clean, :fragment
82
+
83
+ # @deprecated Use {.node!} instead.
84
+ alias_method :clean_node!, :node!
92
85
  end
93
86
 
94
87
  #--
@@ -97,118 +90,154 @@ class Sanitize
97
90
 
98
91
  # Returns a new Sanitize object initialized with the settings in _config_.
99
92
  def initialize(config = {})
100
- @config = Config::DEFAULT.merge(config)
101
-
102
- @transformers = {
103
- :breadth => Array(@config[:transformers_breadth].dup),
104
- :depth => Array(@config[:transformers]) + Array(@config[:transformers_depth])
105
- }
93
+ @config = Config.merge(Config::DEFAULT, config)
106
94
 
107
- # Default depth transformers. These always run at the end of the chain,
108
- # after any custom transformers.
109
- @transformers[:depth] << Transformers::CleanComment unless @config[:allow_comments]
95
+ @transformers = Array(@config[:transformers]).dup
110
96
 
111
- @transformers[:depth] <<
112
- Transformers::CleanCDATA <<
113
- Transformers::CleanElement.new(@config)
114
- end
97
+ # Default transformers always run at the end of the chain, after any custom
98
+ # transformers.
99
+ @transformers << Transformers::CleanElement.new(@config)
100
+ @transformers << Transformers::CleanComment unless @config[:allow_comments]
115
101
 
116
- # Returns a sanitized copy of the given _html_ fragment.
117
- def clean(html)
118
- if html
119
- dupe = html.dup
120
- clean!(dupe) || dupe
102
+ if @config[:elements].include?('style')
103
+ scss = Sanitize::CSS.new(config)
104
+ @transformers << Transformers::CSS::CleanElement.new(scss)
121
105
  end
122
- end
123
106
 
124
- # Performs clean in place, returning _html_, or +nil+ if no changes were
125
- # made.
126
- def clean!(html, parser = Nokogiri::HTML::DocumentFragment)
127
- fragment = parser.parse(html)
128
- clean_node!(fragment)
129
-
130
- output_method_params = {:encoding => @config[:output_encoding], :indent => 0}
131
-
132
- if @config[:output] == :xhtml
133
- output_method = fragment.method(:to_xhtml)
134
- output_method_params[:save_with] = Nokogiri::XML::Node::SaveOptions::AS_XHTML
135
- elsif @config[:output] == :html
136
- output_method = fragment.method(:to_html)
137
- else
138
- raise Error, "unsupported output format: #{@config[:output]}"
107
+ if @config[:attributes].values.any? {|attr| attr.include?('style') }
108
+ scss ||= Sanitize::CSS.new(config)
109
+ @transformers << Transformers::CSS::CleanAttribute.new(scss)
139
110
  end
140
111
 
141
- result = output_method.call(output_method_params)
112
+ @transformers << Transformers::CleanDoctype
113
+ @transformers << Transformers::CleanCDATA
142
114
 
143
- return result == html ? nil : html[0, html.length] = result
115
+ @transformer_config = { config: @config }
144
116
  end
145
117
 
146
- # Returns a sanitized copy of the given full _html_ document.
147
- def clean_document(html)
148
- unless html.nil?
149
- clean_document!(html.dup) || html
150
- end
118
+ # Returns a sanitized copy of the given _html_ document.
119
+ #
120
+ # When sanitizing a document, the `<html>` element must be allowlisted or an
121
+ # error will be raised. If this is undesirable, you should probably use
122
+ # {#fragment} instead.
123
+ def document(html)
124
+ return '' unless html
125
+
126
+ doc = Nokogiri::HTML5.parse(preprocess(html), **@config[:parser_options])
127
+ node!(doc)
128
+ to_html(doc)
151
129
  end
152
130
 
153
- # Performs clean_document in place, returning _html_, or +nil+ if no changes
154
- # were made.
155
- def clean_document!(html)
156
- if !@config[:elements].include?('html') && !@config[:remove_contents]
157
- raise 'You must have the HTML element whitelisted to call #clean_document unless remove_contents is set to true'
158
- # otherwise Nokogiri will raise for having multiple root nodes when
159
- # it moves its children to the root document context
160
- end
131
+ # @deprecated Use {#document} instead.
132
+ alias_method :clean_document, :document
133
+
134
+ # Returns a sanitized copy of the given _html_ fragment.
135
+ def fragment(html)
136
+ return '' unless html
161
137
 
162
- clean!(html, Nokogiri::HTML::Document)
138
+ frag = Nokogiri::HTML5.fragment(preprocess(html), **@config[:parser_options])
139
+ node!(frag)
140
+ to_html(frag)
163
141
  end
164
142
 
165
- # Sanitizes the specified Nokogiri::XML::Node and all its children.
166
- def clean_node!(node)
143
+ # @deprecated Use {#fragment} instead.
144
+ alias_method :clean, :fragment
145
+
146
+ # Sanitizes the given `Nokogiri::XML::Node` and all its children, modifying it
147
+ # in place.
148
+ #
149
+ # If _node_ is a `Nokogiri::XML::Document`, the `<html>` element must be
150
+ # allowlisted or an error will be raised.
151
+ def node!(node)
167
152
  raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)
168
153
 
169
- node_whitelist = Set.new
154
+ if node.is_a?(Nokogiri::XML::Document)
155
+ unless @config[:elements].include?('html')
156
+ raise Error, 'When sanitizing a document, "<html>" must be allowlisted.'
157
+ end
158
+ end
170
159
 
171
- unless @transformers[:breadth].empty?
172
- traverse_breadth(node) {|n| transform_node!(n, node_whitelist, :breadth) }
160
+ node_allowlist = Set.new
161
+
162
+ traverse(node) do |n|
163
+ transform_node!(n, node_allowlist)
173
164
  end
174
165
 
175
- traverse_depth(node) {|n| transform_node!(n, node_whitelist, :depth) }
176
166
  node
177
167
  end
178
168
 
169
+ # @deprecated Use {#node!} instead.
170
+ alias_method :clean_node!, :node!
171
+
179
172
  private
180
173
 
181
- def transform_node!(node, node_whitelist, mode)
182
- @transformers[mode].each do |transformer|
183
- result = transformer.call({
184
- :config => @config,
185
- :is_whitelisted => node_whitelist.include?(node),
186
- :node => node,
187
- :node_name => node.name.downcase,
188
- :node_whitelist => node_whitelist,
189
- :traversal_mode => mode
190
- })
191
-
192
- if result.is_a?(Hash) && result[:node_whitelist].respond_to?(:each)
193
- node_whitelist.merge(result[:node_whitelist])
174
+ # Preprocesses HTML before parsing: converts it to UTF-8 if necessary, then removes undesirable Unicode chars.
175
+ def preprocess(html)
176
+ html = html.to_s.dup
177
+
178
+ unless html.encoding.name == 'UTF-8'
179
+ html.encode!('UTF-8',
180
+ :invalid => :replace,
181
+ :undef => :replace)
182
+ end
183
+
184
+ html.gsub!(REGEX_UNSUITABLE_CHARS, '')
185
+ html
186
+ end
187
+
188
+ def to_html(node)
189
+ node.to_html(preserve_newline: true)
190
+ end
191
+
192
+ def transform_node!(node, node_allowlist)
193
+ @transformers.each do |transformer|
194
+ # Since transform_node! may be called in a tight loop to process thousands
195
+ # of items, we can optimize both memory and CPU performance by:
196
+ #
197
+ # 1. Reusing the same config hash for each transformer
198
+ # 2. Directly assigning values to hash instead of using merge!. Not only
199
+ # does merge! need a freshly allocated argument hash on every call, it is also 2.6x slower:
200
+ # https://github.com/JuanitoFatas/fast-ruby#hashmerge-vs-hashmerge-code
201
+ config = @transformer_config
202
+ config[:is_allowlisted] = config[:is_whitelisted] = node_allowlist.include?(node)
203
+ config[:node] = node
204
+ config[:node_name] = node.name.downcase
205
+ config[:node_allowlist] = config[:node_whitelist] = node_allowlist
206
+
207
+ result = transformer.call(**config)
208
+
209
+ if result.is_a?(Hash)
210
+ result_allowlist = result[:node_allowlist] || result[:node_whitelist]
211
+
212
+ if result_allowlist.respond_to?(:each)
213
+ node_allowlist.merge(result_allowlist)
214
+ end
194
215
  end
195
216
  end
196
217
 
197
218
  node
198
219
  end
199
220
 
200
- # Performs breadth-first traversal, operating first on the root node, then
201
- # traversing downwards.
202
- def traverse_breadth(node, &block)
203
- block.call(node)
204
- node.children.each {|child| traverse_breadth(child, &block) }
205
- end
221
+ # Performs top-down traversal of the given node, operating first on the node
222
+ # itself, then traversing each child (if any) in order.
223
+ def traverse(node, &block)
224
+ yield node
206
225
 
207
- # Performs depth-first traversal, operating first on the deepest nodes in the
208
- # document, then traversing upwards to the root.
209
- def traverse_depth(node, &block)
210
- node.children.each {|child| traverse_depth(child, &block) }
211
- block.call(node)
226
+ child = node.child
227
+
228
+ while child do
229
+ prev = child.previous_sibling
230
+ traverse(child, &block)
231
+
232
+ if child.parent == node
233
+ child = child.next_sibling
234
+ else
235
+ # The child was unlinked or reparented, so traverse the previous node's
236
+ # next sibling, or the parent's first child if there is no previous
237
+ # node.
238
+ child = prev ? prev.next_sibling : node.child
239
+ end
240
+ end
212
241
  end
213
242
 
214
243
  class Error < StandardError; end
data/test/common.rb ADDED
@@ -0,0 +1,3 @@
1
+ # encoding: utf-8
2
+ require 'minitest/autorun'
3
+ require_relative '../lib/sanitize'
@@ -0,0 +1,47 @@
1
+ # encoding: utf-8
2
+ require_relative 'common'
3
+
4
+ describe 'Sanitize::Transformers::CleanComment' do
5
+ make_my_diffs_pretty!
6
+ parallelize_me!
7
+
8
+ describe 'when :allow_comments is false' do
9
+ before do
10
+ @s = Sanitize.new(:allow_comments => false, :elements => ['div'])
11
+ end
12
+
13
+ it 'should remove comments' do
14
+ @s.fragment('foo <!-- comment --> bar').must_equal 'foo bar'
15
+ @s.fragment('foo <!-- ').must_equal 'foo '
16
+ @s.fragment('foo <!-- - -> bar').must_equal 'foo '
17
+ @s.fragment("foo <!--\n\n\n\n-->bar").must_equal 'foo bar'
18
+ @s.fragment("foo <!-- <!-- <!-- --> --> -->bar").must_equal 'foo --&gt; --&gt;bar'
19
+ @s.fragment("foo <div <!-- comment -->>bar</div>").must_equal 'foo <div>&gt;bar</div>'
20
+
21
+ # Special case: the comment markup is inside a <script>, which makes it
22
+ # text content and not an actual HTML comment.
23
+ @s.fragment("<script><!-- comment --></script>").must_equal ''
24
+
25
+ Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => false, :elements => ['script'])
26
+ .must_equal '<script><!-- comment --></script>'
27
+ end
28
+ end
29
+
30
+ describe 'when :allow_comments is true' do
31
+ before do
32
+ @s = Sanitize.new(:allow_comments => true, :elements => ['div'])
33
+ end
34
+
35
+ it 'should allow comments' do
36
+ @s.fragment('foo <!-- comment --> bar').must_equal 'foo <!-- comment --> bar'
37
+ @s.fragment('foo <!-- ').must_equal 'foo <!-- -->'
38
+ @s.fragment('foo <!-- - -> bar').must_equal 'foo <!-- - -> bar-->'
39
+ @s.fragment("foo <!--\n\n\n\n-->bar").must_equal "foo <!--\n\n\n\n-->bar"
40
+ @s.fragment("foo <!-- <!-- <!-- --> --> -->bar").must_equal 'foo <!-- <!-- <!-- --> --&gt; --&gt;bar'
41
+ @s.fragment("foo <div <!-- comment -->>bar</div>").must_equal 'foo <div>&gt;bar</div>'
42
+
43
+ Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => true, :elements => ['script'])
44
+ .must_equal '<script><!-- comment --></script>'
45
+ end
46
+ end
47
+ end
@@ -0,0 +1,67 @@
1
+ # encoding: utf-8
2
+ require_relative 'common'
3
+
4
+ describe 'Sanitize::Transformers::CSS::CleanAttribute' do
5
+ make_my_diffs_pretty!
6
+ parallelize_me!
7
+
8
+ before do
9
+ @s = Sanitize.new(Sanitize::Config::RELAXED)
10
+ end
11
+
12
+ it 'should sanitize CSS properties in style attributes' do
13
+ @s.fragment(%[
14
+ <div style="color: #fff; width: expression(alert(1)); /* <-- evil! */"></div>
15
+ ].strip).must_equal %[
16
+ <div style="color: #fff; /* <-- evil! */"></div>
17
+ ].strip
18
+ end
19
+
20
+ it 'should remove the style attribute if the sanitized CSS is empty' do
21
+ @s.fragment('<div style="width: expression(alert(1))"></div>').
22
+ must_equal '<div></div>'
23
+ end
24
+ end
25
+
26
+ describe 'Sanitize::Transformers::CSS::CleanElement' do
27
+ make_my_diffs_pretty!
28
+ parallelize_me!
29
+
30
+ before do
31
+ @s = Sanitize.new(Sanitize::Config::RELAXED)
32
+ end
33
+
34
+ it 'should sanitize CSS stylesheets in <style> elements' do
35
+ html = %[
36
+ <style>@import url(evil.css);
37
+ /* Yay CSS! */
38
+ .foo { color: #fff; }
39
+ #bar { background: url(yay.jpg); bogus: wtf; }
40
+ .evil { width: expression(xss()); }
41
+
42
+ @media screen (max-width:480px) {
43
+ .foo { width: 400px; }
44
+ #bar:not(.baz) { height: 100px; }
45
+ }
46
+ </style>
47
+ ].strip
48
+
49
+ @s.fragment(html).must_equal %[
50
+ <style>
51
+ /* Yay CSS! */
52
+ .foo { color: #fff; }
53
+ #bar { background: url(yay.jpg); }
54
+ .evil { }
55
+
56
+ @media screen (max-width:480px) {
57
+ .foo { width: 400px; }
58
+ #bar:not(.baz) { height: 100px; }
59
+ }
60
+ </style>
61
+ ].strip
62
+ end
63
+
64
+ it 'should remove the <style> element if the sanitized CSS is empty' do
65
+ @s.fragment('<style></style>').must_equal ''
66
+ end
67
+ end
@@ -0,0 +1,71 @@
1
+ # encoding: utf-8
2
+ require_relative 'common'
3
+
4
+ describe 'Sanitize::Transformers::CleanDoctype' do
5
+ make_my_diffs_pretty!
6
+ parallelize_me!
7
+
8
+ describe 'when :allow_doctype is false' do
9
+ before do
10
+ @s = Sanitize.new(:allow_doctype => false, :elements => ['html'])
11
+ end
12
+
13
+ it 'should remove doctype declarations' do
14
+ @s.document('<!DOCTYPE html><html>foo</html>').must_equal "<html>foo</html>"
15
+ @s.fragment('<!DOCTYPE html>foo').must_equal 'foo'
16
+ end
17
+
18
+ it 'should not allow doctype definitions in fragments' do
19
+ @s.fragment('<!DOCTYPE html><html>foo</html>')
20
+ .must_equal "foo"
21
+
22
+ @s.fragment('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
23
+ .must_equal "foo"
24
+
25
+ @s.fragment("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>")
26
+ .must_equal "foo"
27
+ end
28
+ end
29
+
30
+ describe 'when :allow_doctype is true' do
31
+ before do
32
+ @s = Sanitize.new(:allow_doctype => true, :elements => ['html'])
33
+ end
34
+
35
+ it 'should allow doctype declarations in documents' do
36
+ @s.document('<!DOCTYPE html><html>foo</html>')
37
+ .must_equal "<!DOCTYPE html><html>foo</html>"
38
+
39
+ @s.document('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
40
+ .must_equal "<!DOCTYPE html><html>foo</html>"
41
+
42
+ @s.document("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>")
43
+ .must_equal "<!DOCTYPE html><html>foo</html>"
44
+ end
45
+
46
+ it 'should not allow obviously invalid doctype declarations in documents' do
47
+ @s.document('<!DOCTYPE blah blah blah><html>foo</html>')
48
+ .must_equal "<!DOCTYPE html><html>foo</html>"
49
+
50
+ @s.document('<!DOCTYPE blah><html>foo</html>')
51
+ .must_equal "<!DOCTYPE html><html>foo</html>"
52
+
53
+ @s.document('<!DOCTYPE html BLAH "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
54
+ .must_equal "<!DOCTYPE html><html>foo</html>"
55
+
56
+ @s.document('<!whatever><html>foo</html>')
57
+ .must_equal "<html>foo</html>"
58
+ end
59
+
60
+ it 'should not allow doctype definitions in fragments' do
61
+ @s.fragment('<!DOCTYPE html><html>foo</html>')
62
+ .must_equal "foo"
63
+
64
+ @s.fragment('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
65
+ .must_equal "foo"
66
+
67
+ @s.fragment("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>")
68
+ .must_equal "foo"
69
+ end
70
+ end
71
+ end