loofah 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of loofah might be problematic. Click here for more details.
- data.tar.gz.sig +0 -0
- data/CHANGELOG.rdoc +18 -0
- data/MIT-LICENSE.txt +21 -0
- data/Manifest.txt +28 -0
- data/README.rdoc +110 -0
- data/Rakefile +16 -0
- data/TODO.rdoc +9 -0
- data/benchmark/benchmark.rb +72 -0
- data/benchmark/fragment.html +96 -0
- data/benchmark/www.slashdot.com.html +2560 -0
- data/init.rb +2 -0
- data/lib/loofah.rb +197 -0
- data/lib/loofah/active_record.rb +44 -0
- data/lib/loofah/deprecated.rb +38 -0
- data/lib/loofah/html/document.rb +19 -0
- data/lib/loofah/html/document_fragment.rb +30 -0
- data/lib/loofah/html5/scrub.rb +70 -0
- data/lib/loofah/html5/whitelist.rb +170 -0
- data/lib/loofah/scrubber.rb +108 -0
- data/test/helper.rb +8 -0
- data/test/html5/test_deprecated_sanitizer.rb +185 -0
- data/test/html5/test_sanitizer.rb +245 -0
- data/test/html5/testdata/tests1.dat +501 -0
- data/test/test_active_record.rb +71 -0
- data/test/test_api.rb +51 -0
- data/test/test_deprecated_basic.rb +68 -0
- data/test/test_microsofty.rb +91 -0
- data/test/test_scrubber.rb +100 -0
- data/test/test_strip_tags.rb +36 -0
- metadata +148 -0
- metadata.gz.sig +0 -0
@@ -0,0 +1,108 @@
|
|
1
|
+
module Loofah
|
2
|
+
#
|
3
|
+
# Methods that are mixed into Loofah::HTML::Document and Loofah::HTML::DocumentFragment.
|
4
|
+
#
|
5
|
+
module ScrubberInstanceMethods
|
6
|
+
|
7
|
+
#
|
8
|
+
# Clean up the HTML. See Loofah for full usage.
|
9
|
+
#
|
10
|
+
def scrub!(method)
|
11
|
+
case method
|
12
|
+
when :escape, :prune, :whitewash
|
13
|
+
__sanitize_roots.children.each do |node|
|
14
|
+
Scrubber.traverse_conditionally_top_down(node, method.to_sym)
|
15
|
+
end
|
16
|
+
when :strip
|
17
|
+
__sanitize_roots.children.each do |node|
|
18
|
+
Scrubber.traverse_conditionally_bottom_up(node, method.to_sym)
|
19
|
+
end
|
20
|
+
else
|
21
|
+
raise ArgumentError, "unknown sanitize filter '#{method}'"
|
22
|
+
end
|
23
|
+
self
|
24
|
+
end
|
25
|
+
|
26
|
+
#
|
27
|
+
# Returns the HTML markup contained by the fragment or document
|
28
|
+
#
|
29
|
+
def to_s
|
30
|
+
__sanitize_roots.children.to_s
|
31
|
+
end
|
32
|
+
alias :serialize :to_s
|
33
|
+
|
34
|
+
#
|
35
|
+
# Returns a plain-text version of the markup contained by the fragment or document
|
36
|
+
#
|
37
|
+
def text
|
38
|
+
__sanitize_roots.children.inner_text
|
39
|
+
end
|
40
|
+
alias :inner_text :text
|
41
|
+
alias :to_str :text
|
42
|
+
end
|
43
|
+
|
44
|
+
module Scrubber
|
45
|
+
class << self
|
46
|
+
|
47
|
+
def sanitize(node)
|
48
|
+
case node.type
|
49
|
+
when Nokogiri::XML::Node::ELEMENT_NODE
|
50
|
+
if HTML5::HashedWhiteList::ALLOWED_ELEMENTS[node.name]
|
51
|
+
HTML5::Scrub.scrub_attributes node
|
52
|
+
return false
|
53
|
+
end
|
54
|
+
when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE
|
55
|
+
return false
|
56
|
+
end
|
57
|
+
true
|
58
|
+
end
|
59
|
+
|
60
|
+
def escape(node)
|
61
|
+
return false unless sanitize(node)
|
62
|
+
replacement_killer = Nokogiri::XML::Text.new(node.to_s, node.document)
|
63
|
+
node.add_next_sibling replacement_killer
|
64
|
+
node.remove
|
65
|
+
return true
|
66
|
+
end
|
67
|
+
|
68
|
+
def prune(node)
|
69
|
+
return false unless sanitize(node)
|
70
|
+
node.remove
|
71
|
+
return true
|
72
|
+
end
|
73
|
+
|
74
|
+
def strip(node)
|
75
|
+
return false unless sanitize(node)
|
76
|
+
replacement_killer = node.before node.inner_html
|
77
|
+
node.remove
|
78
|
+
return true
|
79
|
+
end
|
80
|
+
|
81
|
+
def whitewash(node)
|
82
|
+
case node.type
|
83
|
+
when Nokogiri::XML::Node::ELEMENT_NODE
|
84
|
+
if HTML5::HashedWhiteList::ALLOWED_ELEMENTS[node.name]
|
85
|
+
node.attributes.each { |attr| node.remove_attribute(attr.first) }
|
86
|
+
return false if node.namespaces.empty?
|
87
|
+
end
|
88
|
+
when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE
|
89
|
+
return false
|
90
|
+
end
|
91
|
+
node.remove
|
92
|
+
return true
|
93
|
+
end
|
94
|
+
|
95
|
+
def traverse_conditionally_top_down(node, method_name)
|
96
|
+
return if send(method_name, node)
|
97
|
+
node.children.each {|j| traverse_conditionally_top_down(j, method_name)}
|
98
|
+
end
|
99
|
+
|
100
|
+
def traverse_conditionally_bottom_up(node, method_name)
|
101
|
+
node.children.each {|j| traverse_conditionally_bottom_up(j, method_name)}
|
102
|
+
return if send(method_name, node)
|
103
|
+
end
|
104
|
+
|
105
|
+
end
|
106
|
+
|
107
|
+
end
|
108
|
+
end
|
data/test/helper.rb
ADDED
@@ -0,0 +1,185 @@
|
|
1
|
+
#
|
2
|
+
# these tests taken from the HTML5 sanitization project and modified for use with Loofah
|
3
|
+
# see the original here: http://code.google.com/p/html5lib/source/browse/ruby/test/test_sanitizer.rb
|
4
|
+
#
|
5
|
+
# license text at the bottom of this file
|
6
|
+
#
|
7
|
+
require File.expand_path(File.join(File.dirname(__FILE__), '..', 'helper'))
|
8
|
+
|
9
|
+
class HTML5TestDeprecatedSanitizer < Test::Unit::TestCase
|
10
|
+
include Loofah
|
11
|
+
|
12
|
+
def sanitize_html stream
|
13
|
+
Loofah.sanitize(stream)
|
14
|
+
end
|
15
|
+
|
16
|
+
def sanitize_doc stream
|
17
|
+
Loofah.sanitize_document(stream)
|
18
|
+
end
|
19
|
+
|
20
|
+
def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
|
21
|
+
# libxml uses double-quotes, so let's swappo-boppo our quotes before comparing.
|
22
|
+
assert_equal htmloutput, sanitize_html(input).gsub(/"/,"'"), input
|
23
|
+
|
24
|
+
doc = sanitize_doc(input).gsub(/"/,"'")
|
25
|
+
assert doc.include?(htmloutput), "#{input}:\n#{doc}\nshould include:\n#{htmloutput}"
|
26
|
+
end
|
27
|
+
|
28
|
+
HTML5::WhiteList::ALLOWED_ELEMENTS.each do |tag_name|
|
29
|
+
define_method "test_should_allow_#{tag_name}_tag" do
|
30
|
+
input = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
|
31
|
+
htmloutput = "<#{tag_name.downcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.downcase}>"
|
32
|
+
xhtmloutput = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
|
33
|
+
rexmloutput = xhtmloutput
|
34
|
+
|
35
|
+
##
|
36
|
+
## these special cases are HTML5-tokenizer-dependent.
|
37
|
+
## libxml2 cleans up HTML differently, and I trust that.
|
38
|
+
##
|
39
|
+
# if %w[caption colgroup optgroup option tbody td tfoot th thead tr].include?(tag_name)
|
40
|
+
# htmloutput = "foo <bad>bar</bad> baz"
|
41
|
+
# xhtmloutput = htmloutput
|
42
|
+
# elsif tag_name == 'col'
|
43
|
+
# htmloutput = "foo <bad>bar</bad> baz"
|
44
|
+
# xhtmloutput = htmloutput
|
45
|
+
# rexmloutput = "<col title='1' />"
|
46
|
+
# elsif tag_name == 'table'
|
47
|
+
# htmloutput = "foo <bad>bar</bad>baz<table title='1'> </table>"
|
48
|
+
# xhtmloutput = htmloutput
|
49
|
+
# elsif tag_name == 'image'
|
50
|
+
# htmloutput = "<image title='1'/>foo <bad>bar</bad> baz"
|
51
|
+
# xhtmloutput = htmloutput
|
52
|
+
# rexmloutput = "<image title='1'>foo <bad>bar</bad> baz</image>"
|
53
|
+
if HTML5::WhiteList::VOID_ELEMENTS.include?(tag_name)
|
54
|
+
if Nokogiri::LIBXML_VERSION <= "2.6.16"
|
55
|
+
htmloutput = "<#{tag_name} title='1'/><p>foo <bad>bar</bad> baz</p>"
|
56
|
+
else
|
57
|
+
htmloutput = "<#{tag_name} title='1'/>foo <bad>bar</bad> baz"
|
58
|
+
end
|
59
|
+
xhtmloutput = htmloutput
|
60
|
+
# htmloutput += '<br/>' if tag_name == 'br'
|
61
|
+
rexmloutput = "<#{tag_name} title='1' />"
|
62
|
+
end
|
63
|
+
check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
##
|
68
|
+
## libxml2 downcases tag names as it parses, so this is unnecessary.
|
69
|
+
##
|
70
|
+
# HTML5::WhiteList::ALLOWED_ELEMENTS.each do |tag_name|
|
71
|
+
# define_method "test_should_forbid_#{tag_name.upcase}_tag" do
|
72
|
+
# input = "<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>"
|
73
|
+
# output = "<#{tag_name.upcase} title=\"1\">foo <bad>bar</bad> baz</#{tag_name.upcase}>"
|
74
|
+
# check_sanitization(input, output, output, output)
|
75
|
+
# end
|
76
|
+
# end
|
77
|
+
|
78
|
+
HTML5::WhiteList::ALLOWED_ATTRIBUTES.each do |attribute_name|
|
79
|
+
next if attribute_name == 'style'
|
80
|
+
next if attribute_name =~ /:/ && Nokogiri::LIBXML_VERSION <= '2.6.16'
|
81
|
+
define_method "test_should_allow_#{attribute_name}_attribute" do
|
82
|
+
input = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
|
83
|
+
output = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
|
84
|
+
htmloutput = "<p #{attribute_name.downcase}='foo'>foo <bad>bar</bad> baz</p>"
|
85
|
+
check_sanitization(input, htmloutput, output, output)
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
##
|
90
|
+
## libxml2 downcases attributes as it parses, so this is unnecessary.
|
91
|
+
##
|
92
|
+
# HTML5::WhiteList::ALLOWED_ATTRIBUTES.each do |attribute_name|
|
93
|
+
# define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
|
94
|
+
# input = "<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>"
|
95
|
+
# output = "<p>foo <bad>bar</bad> baz</p>"
|
96
|
+
# check_sanitization(input, output, output, output)
|
97
|
+
# end
|
98
|
+
# end
|
99
|
+
|
100
|
+
HTML5::WhiteList::ALLOWED_PROTOCOLS.each do |protocol|
|
101
|
+
define_method "test_should_allow_#{protocol}_uris" do
|
102
|
+
input = %(<a href="#{protocol}">foo</a>)
|
103
|
+
output = "<a href='#{protocol}'>foo</a>"
|
104
|
+
check_sanitization(input, output, output, output)
|
105
|
+
end
|
106
|
+
end
|
107
|
+
|
108
|
+
HTML5::WhiteList::ALLOWED_PROTOCOLS.each do |protocol|
|
109
|
+
define_method "test_should_allow_uppercase_#{protocol}_uris" do
|
110
|
+
input = %(<a href="#{protocol.upcase}">foo</a>)
|
111
|
+
output = "<a href='#{protocol.upcase}'>foo</a>"
|
112
|
+
check_sanitization(input, output, output, output)
|
113
|
+
end
|
114
|
+
end
|
115
|
+
|
116
|
+
if false # TODO. should we even care about this? libxml2 punt.
|
117
|
+
def test_should_handle_astral_plane_characters
|
118
|
+
input = "<p>𝒵 𝔸</p>"
|
119
|
+
output = "<p>\360\235\222\265 \360\235\224\270</p>"
|
120
|
+
check_sanitization(input, output, output, output)
|
121
|
+
|
122
|
+
input = "<p><tspan>\360\235\224\270</tspan> a</p>"
|
123
|
+
output = "<p><tspan>\360\235\224\270</tspan> a</p>"
|
124
|
+
check_sanitization(input, output, output, output)
|
125
|
+
end
|
126
|
+
end
|
127
|
+
|
128
|
+
# This affects only NS4. Is it worth fixing?
|
129
|
+
# def test_javascript_includes
|
130
|
+
# input = %(<div size="&{alert('XSS')}">foo</div>)
|
131
|
+
# output = "<div>foo</div>"
|
132
|
+
# check_sanitization(input, output, output, output)
|
133
|
+
# end
|
134
|
+
|
135
|
+
#html5_test_files('sanitizer').each do |filename|
|
136
|
+
# JSON::parse(open(filename).read).each do |test|
|
137
|
+
# define_method "test_#{test['name']}" do
|
138
|
+
# check_sanitization(
|
139
|
+
# test['input'],
|
140
|
+
# test['output'],
|
141
|
+
# test['xhtml'] || test['output'],
|
142
|
+
# test['rexml'] || test['output']
|
143
|
+
# )
|
144
|
+
# end
|
145
|
+
# end
|
146
|
+
#end
|
147
|
+
end
|
148
|
+
|
149
|
+
# <html5_license>
|
150
|
+
#
|
151
|
+
# Copyright (c) 2006-2008 The Authors
|
152
|
+
#
|
153
|
+
# Contributors:
|
154
|
+
# James Graham - jg307@cam.ac.uk
|
155
|
+
# Anne van Kesteren - annevankesteren@gmail.com
|
156
|
+
# Lachlan Hunt - lachlan.hunt@lachy.id.au
|
157
|
+
# Matt McDonald - kanashii@kanashii.ca
|
158
|
+
# Sam Ruby - rubys@intertwingly.net
|
159
|
+
# Ian Hickson (Google) - ian@hixie.ch
|
160
|
+
# Thomas Broyer - t.broyer@ltgt.net
|
161
|
+
# Jacques Distler - distler@golem.ph.utexas.edu
|
162
|
+
# Henri Sivonen - hsivonen@iki.fi
|
163
|
+
# The Mozilla Foundation (contributions from Henri Sivonen since 2008)
|
164
|
+
#
|
165
|
+
# Permission is hereby granted, free of charge, to any person
|
166
|
+
# obtaining a copy of this software and associated documentation files
|
167
|
+
# (the "Software"), to deal in the Software without restriction,
|
168
|
+
# including without limitation the rights to use, copy, modify, merge,
|
169
|
+
# publish, distribute, sublicense, and/or sell copies of the Software,
|
170
|
+
# and to permit persons to whom the Software is furnished to do so,
|
171
|
+
# subject to the following conditions:
|
172
|
+
#
|
173
|
+
# The above copyright notice and this permission notice shall be
|
174
|
+
# included in all copies or substantial portions of the Software.
|
175
|
+
#
|
176
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
177
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
178
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
179
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
180
|
+
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
181
|
+
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
182
|
+
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
183
|
+
# SOFTWARE.
|
184
|
+
#
|
185
|
+
# </html5_license>
|
@@ -0,0 +1,245 @@
|
|
1
|
+
#
|
2
|
+
# these tests taken from the HTML5 sanitization project and modified for use with Loofah
|
3
|
+
# see the original here: http://code.google.com/p/html5lib/source/browse/ruby/test/test_sanitizer.rb
|
4
|
+
#
|
5
|
+
# license text at the bottom of this file
|
6
|
+
#
|
7
|
+
require File.expand_path(File.join(File.dirname(__FILE__), '..', 'helper'))
|
8
|
+
require 'json'
|
9
|
+
|
10
|
+
class Html5TestSanitizer < Test::Unit::TestCase
|
11
|
+
include Loofah
|
12
|
+
|
13
|
+
def sanitize_xhtml stream
|
14
|
+
Loofah.fragment(stream).scrub!(:escape).to_xhtml
|
15
|
+
end
|
16
|
+
|
17
|
+
def sanitize_html stream
|
18
|
+
Loofah.fragment(stream).scrub!(:escape).to_html
|
19
|
+
end
|
20
|
+
|
21
|
+
def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
|
22
|
+
## libxml uses double-quotes, so let's swappo-boppo our quotes before comparing.
|
23
|
+
sane = sanitize_html(input).gsub('"',"'")
|
24
|
+
|
25
|
+
## HTML5's parsers are shit. there's so much inconsistency with what has closing tags, etc, that
|
26
|
+
## it would require a lot of manual hacking to make the tests match libxml's output.
|
27
|
+
## instead, I'm taking the shotgun approach, and trying to match any of the described outputs.
|
28
|
+
assert((htmloutput == sane) || (rexmloutput == sane) || (xhtmloutput == sane), input)
|
29
|
+
end
|
30
|
+
|
31
|
+
HTML5::WhiteList::ALLOWED_ELEMENTS.each do |tag_name|
|
32
|
+
define_method "test_should_allow_#{tag_name}_tag" do
|
33
|
+
input = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
|
34
|
+
htmloutput = "<#{tag_name.downcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.downcase}>"
|
35
|
+
xhtmloutput = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
|
36
|
+
rexmloutput = xhtmloutput
|
37
|
+
|
38
|
+
if %w[caption colgroup optgroup option tbody td tfoot th thead tr].include?(tag_name)
|
39
|
+
htmloutput = "foo <bad>bar</bad> baz"
|
40
|
+
xhtmloutput = htmloutput
|
41
|
+
elsif tag_name == 'col'
|
42
|
+
htmloutput = "<col title='1'>foo <bad>bar</bad> baz"
|
43
|
+
xhtmloutput = htmloutput
|
44
|
+
rexmloutput = "<col title='1' />"
|
45
|
+
elsif tag_name == 'table'
|
46
|
+
htmloutput = "foo <bad>bar</bad>baz<table title='1'> </table>"
|
47
|
+
xhtmloutput = htmloutput
|
48
|
+
elsif tag_name == 'image'
|
49
|
+
htmloutput = "<img title='1'/>foo <bad>bar</bad> baz"
|
50
|
+
xhtmloutput = htmloutput
|
51
|
+
rexmloutput = "<image title='1'>foo <bad>bar</bad> baz</image>"
|
52
|
+
elsif HTML5::WhiteList::VOID_ELEMENTS.include?(tag_name)
|
53
|
+
htmloutput = "<#{tag_name} title='1'>foo <bad>bar</bad> baz"
|
54
|
+
xhtmloutput = htmloutput
|
55
|
+
htmloutput += '<br/>' if tag_name == 'br'
|
56
|
+
rexmloutput = "<#{tag_name} title='1' />"
|
57
|
+
end
|
58
|
+
check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
##
|
63
|
+
## libxml2 downcases elements, so this is moot.
|
64
|
+
##
|
65
|
+
# HTML5::WhiteList::ALLOWED_ELEMENTS.each do |tag_name|
|
66
|
+
# define_method "test_should_forbid_#{tag_name.upcase}_tag" do
|
67
|
+
# input = "<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>"
|
68
|
+
# output = "<#{tag_name.upcase} title=\"1\">foo <bad>bar</bad> baz</#{tag_name.upcase}>"
|
69
|
+
# check_sanitization(input, output, output, output)
|
70
|
+
# end
|
71
|
+
# end
|
72
|
+
|
73
|
+
HTML5::WhiteList::ALLOWED_ATTRIBUTES.each do |attribute_name|
|
74
|
+
next if attribute_name == 'style'
|
75
|
+
define_method "test_should_allow_#{attribute_name}_attribute" do
|
76
|
+
input = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
|
77
|
+
if %w[checked compact disabled ismap multiple nohref noshade nowrap readonly selected].include?(attribute_name)
|
78
|
+
output = "<p #{attribute_name}>foo <bad>bar</bad> baz</p>"
|
79
|
+
htmloutput = "<p #{attribute_name.downcase}>foo <bad>bar</bad> baz</p>"
|
80
|
+
else
|
81
|
+
output = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
|
82
|
+
htmloutput = "<p #{attribute_name.downcase}='foo'>foo <bad>bar</bad> baz</p>"
|
83
|
+
end
|
84
|
+
check_sanitization(input, htmloutput, output, output)
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
##
|
89
|
+
## libxml2 downcases attributes, so this is moot.
|
90
|
+
##
|
91
|
+
# HTML5::WhiteList::ALLOWED_ATTRIBUTES.each do |attribute_name|
|
92
|
+
# define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
|
93
|
+
# input = "<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>"
|
94
|
+
# output = "<p>foo <bad>bar</bad> baz</p>"
|
95
|
+
# check_sanitization(input, output, output, output)
|
96
|
+
# end
|
97
|
+
# end
|
98
|
+
|
99
|
+
HTML5::WhiteList::ALLOWED_PROTOCOLS.each do |protocol|
|
100
|
+
define_method "test_should_allow_#{protocol}_uris" do
|
101
|
+
input = %(<a href="#{protocol}">foo</a>)
|
102
|
+
output = "<a href='#{protocol}'>foo</a>"
|
103
|
+
check_sanitization(input, output, output, output)
|
104
|
+
end
|
105
|
+
end
|
106
|
+
|
107
|
+
HTML5::WhiteList::ALLOWED_PROTOCOLS.each do |protocol|
|
108
|
+
define_method "test_should_allow_uppercase_#{protocol}_uris" do
|
109
|
+
input = %(<a href="#{protocol.upcase}">foo</a>)
|
110
|
+
output = "<a href='#{protocol.upcase}'>foo</a>"
|
111
|
+
check_sanitization(input, output, output, output)
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
115
|
+
HTML5::WhiteList::SVG_ALLOW_LOCAL_HREF.each do |tag_name|
|
116
|
+
next unless HTML5::WhiteList::ALLOWED_ELEMENTS.include?(tag_name)
|
117
|
+
define_method "test_#{tag_name}_should_allow_local_href" do
|
118
|
+
input = %(<#{tag_name} xlink:href="#foo"/>)
|
119
|
+
output = "<#{tag_name.downcase} xlink:href='#foo'></#{tag_name.downcase}>"
|
120
|
+
xhtmloutput = "<#{tag_name} xlink:href='#foo'></#{tag_name}>"
|
121
|
+
check_sanitization(input, output, xhtmloutput, xhtmloutput)
|
122
|
+
end
|
123
|
+
|
124
|
+
define_method "test_#{tag_name}_should_allow_local_href_with_newline" do
|
125
|
+
input = %(<#{tag_name} xlink:href="\n#foo"/>)
|
126
|
+
output = "<#{tag_name.downcase} xlink:href='\n#foo'></#{tag_name.downcase}>"
|
127
|
+
xhtmloutput = "<#{tag_name} xlink:href='\n#foo'></#{tag_name}>"
|
128
|
+
check_sanitization(input, output, xhtmloutput, xhtmloutput)
|
129
|
+
end
|
130
|
+
|
131
|
+
define_method "test_#{tag_name}_should_forbid_nonlocal_href" do
|
132
|
+
input = %(<#{tag_name} xlink:href="http://bad.com/foo"/>)
|
133
|
+
output = "<#{tag_name.downcase}></#{tag_name.downcase}>"
|
134
|
+
xhtmloutput = "<#{tag_name}></#{tag_name}>"
|
135
|
+
check_sanitization(input, output, xhtmloutput, xhtmloutput)
|
136
|
+
end
|
137
|
+
|
138
|
+
define_method "test_#{tag_name}_should_forbid_nonlocal_href_with_newline" do
|
139
|
+
input = %(<#{tag_name} xlink:href="\nhttp://bad.com/foo"/>)
|
140
|
+
output = "<#{tag_name.downcase}></#{tag_name.downcase}>"
|
141
|
+
xhtmloutput = "<#{tag_name}></#{tag_name}>"
|
142
|
+
check_sanitization(input, output, xhtmloutput, xhtmloutput)
|
143
|
+
end
|
144
|
+
end
|
145
|
+
|
146
|
+
def test_should_handle_astral_plane_characters
|
147
|
+
input = "<p>𝒵 𝔸</p>"
|
148
|
+
output = "<p>\360\235\222\265 \360\235\224\270</p>"
|
149
|
+
check_sanitization(input, output, output, output)
|
150
|
+
|
151
|
+
input = "<p><tspan>\360\235\224\270</tspan> a</p>"
|
152
|
+
output = "<p><tspan>\360\235\224\270</tspan> a</p>"
|
153
|
+
check_sanitization(input, output, output, output)
|
154
|
+
end
|
155
|
+
|
156
|
+
# This affects only NS4. Is it worth fixing?
|
157
|
+
# def test_javascript_includes
|
158
|
+
# input = %(<div size="&{alert('XSS')}">foo</div>)
|
159
|
+
# output = "<div>foo</div>"
|
160
|
+
# check_sanitization(input, output, output, output)
|
161
|
+
# end
|
162
|
+
|
163
|
+
##
|
164
|
+
## these tests primarily test the parser logic, not the sanitizer
|
165
|
+
## logic. i call bullshit. we're not writing a test suite for
|
166
|
+
## libxml2 here, so let's rely on the unit tests above to take care
|
167
|
+
## of our valid elements and attributes.
|
168
|
+
##
|
169
|
+
# Dir[File.join(File.dirname(__FILE__), 'testdata', '*.*')].each do |filename|
|
170
|
+
# JSON::parse(open(filename).read).each do |test|
|
171
|
+
# define_method "test_#{test['name']}" do
|
172
|
+
# check_sanitization(
|
173
|
+
# test['input'],
|
174
|
+
# test['output'],
|
175
|
+
# test['xhtml'] || test['output'],
|
176
|
+
# test['rexml'] || test['output']
|
177
|
+
# )
|
178
|
+
# end
|
179
|
+
# end
|
180
|
+
# end
|
181
|
+
|
182
|
+
## added because we don't have any coverage above on SVG_ATTR_VAL_ALLOWS_REF
|
183
|
+
HTML5::WhiteList::SVG_ATTR_VAL_ALLOWS_REF.each do |attr_name|
|
184
|
+
define_method "test_should_allow_uri_refs_in_svg_attribute_#{attr_name}" do
|
185
|
+
input = "<rect fill='url(#foo)' />"
|
186
|
+
output = "<rect fill='url(#foo)'></rect>"
|
187
|
+
check_sanitization(input, output, output, output)
|
188
|
+
end
|
189
|
+
|
190
|
+
define_method "test_absolute_uri_refs_in_svg_attribute_#{attr_name}" do
|
191
|
+
input = "<rect fill='url(http://bad.com/) #fff' />"
|
192
|
+
output = "<rect fill=' #fff'></rect>"
|
193
|
+
check_sanitization(input, output, output, output)
|
194
|
+
end
|
195
|
+
|
196
|
+
define_method "test_uri_ref_with_space_in_svg_attribute_#{attr_name}" do
|
197
|
+
input = "<rect fill='url(\n#foo)' />"
|
198
|
+
rexml = "<rect fill='url(\n#foo)'></rect>"
|
199
|
+
end
|
200
|
+
|
201
|
+
define_method "test_absolute_uri_ref_with_space_in_svg_attribute_#{attr_name}" do
|
202
|
+
input = "<rect fill=\"url(\nhttp://bad.com/)\" />"
|
203
|
+
rexml = "<rect fill=' '></rect>"
|
204
|
+
end
|
205
|
+
end
|
206
|
+
|
207
|
+
end
|
208
|
+
|
209
|
+
# <html5_license>
|
210
|
+
#
|
211
|
+
# Copyright (c) 2006-2008 The Authors
|
212
|
+
#
|
213
|
+
# Contributors:
|
214
|
+
# James Graham - jg307@cam.ac.uk
|
215
|
+
# Anne van Kesteren - annevankesteren@gmail.com
|
216
|
+
# Lachlan Hunt - lachlan.hunt@lachy.id.au
|
217
|
+
# Matt McDonald - kanashii@kanashii.ca
|
218
|
+
# Sam Ruby - rubys@intertwingly.net
|
219
|
+
# Ian Hickson (Google) - ian@hixie.ch
|
220
|
+
# Thomas Broyer - t.broyer@ltgt.net
|
221
|
+
# Jacques Distler - distler@golem.ph.utexas.edu
|
222
|
+
# Henri Sivonen - hsivonen@iki.fi
|
223
|
+
# The Mozilla Foundation (contributions from Henri Sivonen since 2008)
|
224
|
+
#
|
225
|
+
# Permission is hereby granted, free of charge, to any person
|
226
|
+
# obtaining a copy of this software and associated documentation files
|
227
|
+
# (the "Software"), to deal in the Software without restriction,
|
228
|
+
# including without limitation the rights to use, copy, modify, merge,
|
229
|
+
# publish, distribute, sublicense, and/or sell copies of the Software,
|
230
|
+
# and to permit persons to whom the Software is furnished to do so,
|
231
|
+
# subject to the following conditions:
|
232
|
+
#
|
233
|
+
# The above copyright notice and this permission notice shall be
|
234
|
+
# included in all copies or substantial portions of the Software.
|
235
|
+
#
|
236
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
237
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
238
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
239
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
240
|
+
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
241
|
+
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
242
|
+
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
243
|
+
# SOFTWARE.
|
244
|
+
#
|
245
|
+
# </html5_license>
|