sanitize 2.1.1 → 6.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sanitize might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/HISTORY.md +520 -55
- data/LICENSE +1 -1
- data/README.md +438 -168
- data/lib/sanitize/config/basic.rb +12 -32
- data/lib/sanitize/config/default.rb +118 -0
- data/lib/sanitize/config/relaxed.rb +716 -53
- data/lib/sanitize/config/restricted.rb +3 -23
- data/lib/sanitize/config.rb +53 -79
- data/lib/sanitize/css.rb +348 -0
- data/lib/sanitize/transformers/clean_cdata.rb +3 -3
- data/lib/sanitize/transformers/clean_comment.rb +6 -3
- data/lib/sanitize/transformers/clean_css.rb +57 -0
- data/lib/sanitize/transformers/clean_doctype.rb +19 -0
- data/lib/sanitize/transformers/clean_element.rb +192 -124
- data/lib/sanitize/version.rb +3 -1
- data/lib/sanitize.rb +172 -143
- data/test/common.rb +3 -0
- data/test/test_clean_comment.rb +47 -0
- data/test/test_clean_css.rb +67 -0
- data/test/test_clean_doctype.rb +71 -0
- data/test/test_clean_element.rb +545 -0
- data/test/test_config.rb +65 -0
- data/test/test_malicious_css.rb +42 -0
- data/test/test_malicious_html.rb +235 -0
- data/test/test_parser.rb +75 -0
- data/test/test_sanitize.rb +151 -675
- data/test/test_sanitize_css.rb +424 -0
- data/test/test_transformers.rb +230 -0
- metadata +44 -41
@@ -0,0 +1,235 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require_relative 'common'
|
3
|
+
|
4
|
+
# Miscellaneous attempts to sneak maliciously crafted HTML past Sanitize. Many
|
5
|
+
# of these are courtesy of (or inspired by) the OWASP XSS Filter Evasion Cheat
|
6
|
+
# Sheet.
|
7
|
+
#
|
8
|
+
# https://www.owasp.org/index.php/XSS_Filter_Evasion_Cheat_Sheet
|
9
|
+
|
10
|
+
describe 'Malicious HTML' do
|
11
|
+
make_my_diffs_pretty!
|
12
|
+
parallelize_me!
|
13
|
+
|
14
|
+
before do
|
15
|
+
@s = Sanitize.new(Sanitize::Config::RELAXED)
|
16
|
+
end
|
17
|
+
|
18
|
+
describe 'comments' do
|
19
|
+
it 'should not allow script injection via conditional comments' do
|
20
|
+
@s.fragment(%[<!--[if gte IE 4]>\n<script>alert('XSS');</script>\n<![endif]-->]).
|
21
|
+
must_equal ''
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
describe 'interpolation (ERB, PHP, etc.)' do
|
26
|
+
it 'should escape ERB-style tags' do
|
27
|
+
@s.fragment('<% naughty_ruby_code %>').
|
28
|
+
must_equal '&lt;% naughty_ruby_code %&gt;'
|
29
|
+
|
30
|
+
@s.fragment('<%= naughty_ruby_code %>').
|
31
|
+
must_equal '&lt;%= naughty_ruby_code %&gt;'
|
32
|
+
end
|
33
|
+
|
34
|
+
it 'should remove PHP-style tags' do
|
35
|
+
@s.fragment('<? naughtyPHPCode(); ?>').
|
36
|
+
must_equal ''
|
37
|
+
|
38
|
+
@s.fragment('<?= naughtyPHPCode(); ?>').
|
39
|
+
must_equal ''
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
describe '<body>' do
|
44
|
+
it 'should not be possible to inject JS via a malformed event attribute' do
|
45
|
+
@s.document('<html><head></head><body onload!#$%&()*~+-_.,:;?@[/|\\]^`=alert("XSS")></body></html>').
|
46
|
+
must_equal "<html><head></head><body></body></html>"
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
describe '<iframe>' do
|
51
|
+
it 'should not be possible to inject an iframe using an improperly closed tag' do
|
52
|
+
@s.fragment(%[<iframe src=http://ha.ckers.org/scriptlet.html <]).
|
53
|
+
must_equal ''
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
describe '<img>' do
|
58
|
+
it 'should not be possible to inject JS via an unquoted <img> src attribute' do
|
59
|
+
@s.fragment("<img src=javascript:alert('XSS')>").must_equal '<img>'
|
60
|
+
end
|
61
|
+
|
62
|
+
it 'should not be possible to inject JS using grave accents as <img> src delimiters' do
|
63
|
+
@s.fragment("<img src=`javascript:alert('XSS')`>").must_equal '<img>'
|
64
|
+
end
|
65
|
+
|
66
|
+
it 'should not be possible to inject <script> via a malformed <img> tag' do
|
67
|
+
@s.fragment('<img """><script>alert("XSS")</script>">').
|
68
|
+
must_equal '<img>"&gt;'
|
69
|
+
end
|
70
|
+
|
71
|
+
it 'should not be possible to inject protocol-based JS' do
|
72
|
+
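@s.fragment('<img src=&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;>').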
|
73
|
+
must_equal '<img>'
|
74
|
+
|
75
|
+
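@s.fragment('<img src=&#0000106&#0000097&#0000118&#0000097&#0000115&#0000099&#0000114&#0000105&#0000112&#0000116&#0000058&#0000097&#0000108&#0000101&#0000114&#0000116&#0000040&#0000039&#0000088&#0000083&#0000083&#0000039&#0000041>').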
|
76
|
+
must_equal '<img>'
|
77
|
+
|
78
|
+
@s.fragment('<img src=&#x6A&#x61&#x76&#x61&#x73&#x63&#x72&#x69&#x70&#x74&#x3A&#x61&#x6C&#x65&#x72&#x74&#x28&#x27&#x58&#x53&#x53&#x27&#x29>').
|
79
|
+
must_equal '<img>'
|
80
|
+
|
81
|
+
# Encoded tab character.
|
82
|
+
@s.fragment(%[<img src="jav&#x09;ascript:alert('XSS');">]).
|
83
|
+
must_equal '<img>'
|
84
|
+
|
85
|
+
# Encoded newline.
|
86
|
+
@s.fragment(%[<img src="jav&#x0A;ascript:alert('XSS');">]).
|
87
|
+
must_equal '<img>'
|
88
|
+
|
89
|
+
# Encoded carriage return.
|
90
|
+
@s.fragment(%[<img src="jav&#x0D;ascript:alert('XSS');">]).
|
91
|
+
must_equal '<img>'
|
92
|
+
|
93
|
+
# Null byte.
|
94
|
+
@s.fragment(%[<img src=java\0script:alert("XSS")>]).
|
95
|
+
must_equal '<img>'
|
96
|
+
|
97
|
+
# Spaces plus meta char.
|
98
|
+
@s.fragment(%[<img src=" &#14;  javascript:alert('XSS');">]).
|
99
|
+
must_equal '<img>'
|
100
|
+
|
101
|
+
# Mixed spaces and tabs.
|
102
|
+
@s.fragment(%[<img src="j\na v\tascript://alert('XSS');">]).
|
103
|
+
must_equal '<img>'
|
104
|
+
end
|
105
|
+
|
106
|
+
it 'should not be possible to inject protocol-based JS via whitespace' do
|
107
|
+
@s.fragment(%[<img src="jav\tascript:alert('XSS');">]).
|
108
|
+
must_equal '<img>'
|
109
|
+
end
|
110
|
+
|
111
|
+
it 'should not be possible to inject JS using a half-open <img> tag' do
|
112
|
+
@s.fragment(%[<img src="javascript:alert('XSS')"]).
|
113
|
+
must_equal ''
|
114
|
+
end
|
115
|
+
end
|
116
|
+
|
117
|
+
describe '<script>' do
|
118
|
+
it 'should not be possible to inject <script> using a malformed non-alphanumeric tag name' do
|
119
|
+
@s.fragment(%[<script/xss src="http://ha.ckers.org/xss.js">alert(1)</script>]).
|
120
|
+
must_equal ''
|
121
|
+
end
|
122
|
+
|
123
|
+
it 'should not be possible to inject <script> via extraneous open brackets' do
|
124
|
+
@s.fragment(%[<<script>alert("XSS");//<</script>]).
|
125
|
+
must_equal '&lt;'
|
126
|
+
end
|
127
|
+
end
|
128
|
+
|
129
|
+
# libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
|
130
|
+
# attempt to preserve server-side includes. This can result in XSS since an
|
131
|
+
# unescaped double quote can allow an attacker to inject a non-allowlisted
|
132
|
+
# attribute. Sanitize works around this by implementing its own escaping for
|
133
|
+
# affected attributes.
|
134
|
+
#
|
135
|
+
# The relevant libxml2 code is here:
|
136
|
+
# <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
|
137
|
+
describe 'unsafe libxml2 server-side includes in attributes' do
|
138
|
+
using_unpatched_libxml2 = Nokogiri::VersionInfo.instance.libxml2_using_system?
|
139
|
+
|
140
|
+
tag_configs = [
|
141
|
+
{
|
142
|
+
tag_name: 'a',
|
143
|
+
escaped_attrs: %w[ action href src name ],
|
144
|
+
unescaped_attrs: []
|
145
|
+
},
|
146
|
+
|
147
|
+
{
|
148
|
+
tag_name: 'div',
|
149
|
+
escaped_attrs: %w[ action href src ],
|
150
|
+
unescaped_attrs: %w[ name ]
|
151
|
+
}
|
152
|
+
]
|
153
|
+
|
154
|
+
before do
|
155
|
+
@s = Sanitize.new({
|
156
|
+
elements: %w[ a div ],
|
157
|
+
|
158
|
+
attributes: {
|
159
|
+
all: %w[ action href src name ]
|
160
|
+
}
|
161
|
+
})
|
162
|
+
end
|
163
|
+
|
164
|
+
tag_configs.each do |tag_config|
|
165
|
+
tag_name = tag_config[:tag_name]
|
166
|
+
|
167
|
+
tag_config[:escaped_attrs].each do |attr_name|
|
168
|
+
input = %[<#{tag_name} #{attr_name}='examp<!--" onmouseover=alert(1)>-->le.com'>foo</#{tag_name}>]
|
169
|
+
|
170
|
+
it 'should escape unsafe characters in attributes' do
|
171
|
+
skip "behavior should only exist in nokogiri's patched libxml" if using_unpatched_libxml2
|
172
|
+
|
173
|
+
# This uses Nokogumbo's HTML-compliant serializer rather than
|
174
|
+
# libxml2's.
|
175
|
+
@s.fragment(input).
|
176
|
+
must_equal(%[<#{tag_name} #{attr_name}="examp<!--%22%20onmouseover=alert(1)>-->le.com">foo</#{tag_name}>])
|
177
|
+
|
178
|
+
# This uses the not-quite-standards-compliant libxml2 serializer via
|
179
|
+
# Nokogiri, so the output may be a little different as of Nokogiri
|
180
|
+
# 1.10.2 when using Nokogiri's vendored libxml2 due to this patch:
|
181
|
+
# https://github.com/sparklemotion/nokogiri/commit/4852e43cb6039e26d8c51af78621e539cbf46c5d
|
182
|
+
fragment = Nokogiri::HTML.fragment(input)
|
183
|
+
@s.node!(fragment)
|
184
|
+
fragment.to_html.
|
185
|
+
must_equal(%[<#{tag_name} #{attr_name}="examp&lt;!--%22%20onmouseover=alert(1)&gt;--&gt;le.com">foo</#{tag_name}>])
|
186
|
+
end
|
187
|
+
|
188
|
+
it 'should round-trip to the same output' do
|
189
|
+
output = @s.fragment(input)
|
190
|
+
@s.fragment(output).must_equal(output)
|
191
|
+
end
|
192
|
+
end
|
193
|
+
|
194
|
+
tag_config[:unescaped_attrs].each do |attr_name|
|
195
|
+
input = %[<#{tag_name} #{attr_name}='examp<!--" onmouseover=alert(1)>-->le.com'>foo</#{tag_name}>]
|
196
|
+
|
197
|
+
it 'should not escape characters unnecessarily' do
|
198
|
+
skip "behavior should only exist in nokogiri's patched libxml" if using_unpatched_libxml2
|
199
|
+
|
200
|
+
# This uses Nokogumbo's HTML-compliant serializer rather than
|
201
|
+
# libxml2's.
|
202
|
+
@s.fragment(input).
|
203
|
+
must_equal(%[<#{tag_name} #{attr_name}="examp<!--&quot; onmouseover=alert(1)>-->le.com">foo</#{tag_name}>])
|
204
|
+
|
205
|
+
# This uses the not-quite-standards-compliant libxml2 serializer via
|
206
|
+
# Nokogiri, so the output may be a little different as of Nokogiri
|
207
|
+
# 1.10.2 when using Nokogiri's vendored libxml2 due to this patch:
|
208
|
+
# https://github.com/sparklemotion/nokogiri/commit/4852e43cb6039e26d8c51af78621e539cbf46c5d
|
209
|
+
fragment = Nokogiri::HTML.fragment(input)
|
210
|
+
@s.node!(fragment)
|
211
|
+
fragment.to_html.
|
212
|
+
must_equal(%[<#{tag_name} #{attr_name}='examp&lt;!--" onmouseover=alert(1)&gt;--&gt;le.com'>foo</#{tag_name}>])
|
213
|
+
end
|
214
|
+
|
215
|
+
it 'should round-trip to the same output' do
|
216
|
+
output = @s.fragment(input)
|
217
|
+
@s.fragment(output).must_equal(output)
|
218
|
+
end
|
219
|
+
end
|
220
|
+
end
|
221
|
+
end
|
222
|
+
|
223
|
+
# https://github.com/rgrove/sanitize/security/advisories/GHSA-p4x4-rw2p-8j8m
|
224
|
+
describe 'foreign content bypass in relaxed config' do
|
225
|
+
it 'prevents a sanitization bypass via carefully crafted foreign content' do
|
226
|
+
%w[iframe noembed noframes noscript plaintext script style xmp].each do |tag_name|
|
227
|
+
@s.fragment(%[<math><#{tag_name}>/*</#{tag_name}><img src onerror=alert(1)>*/]).
|
228
|
+
must_equal ''
|
229
|
+
|
230
|
+
@s.fragment(%[<svg><#{tag_name}>/*</#{tag_name}><img src onerror=alert(1)>*/]).
|
231
|
+
must_equal ''
|
232
|
+
end
|
233
|
+
end
|
234
|
+
end
|
235
|
+
end
|
data/test/test_parser.rb
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require_relative 'common'
|
3
|
+
|
4
|
+
describe 'Parser' do
|
5
|
+
make_my_diffs_pretty!
|
6
|
+
parallelize_me!
|
7
|
+
|
8
|
+
it 'should translate valid entities into characters' do
|
9
|
+
Sanitize.fragment("&#39;&eacute;&amp;").must_equal("'é&amp;")
|
10
|
+
end
|
11
|
+
|
12
|
+
it 'should translate orphaned ampersands into entities' do
|
13
|
+
Sanitize.fragment('at&t').must_equal('at&amp;t')
|
14
|
+
end
|
15
|
+
|
16
|
+
it 'should not add newlines after tags when serializing a fragment' do
|
17
|
+
Sanitize.fragment("<div>foo\n\n<p>bar</p><div>\nbaz</div></div><div>quux</div>", :elements => ['div', 'p'])
|
18
|
+
.must_equal "<div>foo\n\n<p>bar</p><div>\nbaz</div></div><div>quux</div>"
|
19
|
+
end
|
20
|
+
|
21
|
+
it 'should not have the Nokogiri 1.4.2+ unterminated script/style element bug' do
|
22
|
+
Sanitize.fragment('foo <script>bar').must_equal 'foo '
|
23
|
+
Sanitize.fragment('foo <style>bar').must_equal 'foo '
|
24
|
+
end
|
25
|
+
|
26
|
+
it 'ambiguous non-tag brackets like "1 > 2 and 2 < 1" should be parsed correctly' do
|
27
|
+
Sanitize.fragment('1 > 2 and 2 < 1').must_equal '1 &gt; 2 and 2 &lt; 1'
|
28
|
+
Sanitize.fragment('OMG HAPPY BIRTHDAY! *<:-D').must_equal 'OMG HAPPY BIRTHDAY! *&lt;:-D'
|
29
|
+
end
|
30
|
+
|
31
|
+
describe 'when siblings are added after a node during traversal' do
|
32
|
+
it 'the added siblings should be traversed' do
|
33
|
+
html = %[
|
34
|
+
<div id="one">
|
35
|
+
<div id="one_one">
|
36
|
+
<div id="one_one_one"></div>
|
37
|
+
</div>
|
38
|
+
<div id="one_two"></div>
|
39
|
+
</div>
|
40
|
+
<div id="two">
|
41
|
+
<div id="two_one"><div id="two_one_one"></div></div>
|
42
|
+
<div id="two_two"></div>
|
43
|
+
</div>
|
44
|
+
<div id="three"></div>
|
45
|
+
]
|
46
|
+
|
47
|
+
siblings = []
|
48
|
+
|
49
|
+
Sanitize.fragment(html, :transformers => ->(env) {
|
50
|
+
name = env[:node].name
|
51
|
+
|
52
|
+
if name == 'div'
|
53
|
+
env[:node].add_next_sibling('<b id="added_' + env[:node]['id'] + '">')
|
54
|
+
elsif name == 'b'
|
55
|
+
siblings << env[:node][:id]
|
56
|
+
end
|
57
|
+
|
58
|
+
return {:node_allowlist => [env[:node]]}
|
59
|
+
})
|
60
|
+
|
61
|
+
# All siblings should be traversed, and in the order added.
|
62
|
+
siblings.must_equal [
|
63
|
+
"added_one_one_one",
|
64
|
+
"added_one_one",
|
65
|
+
"added_one_two",
|
66
|
+
"added_one",
|
67
|
+
"added_two_one_one",
|
68
|
+
"added_two_one",
|
69
|
+
"added_two_two",
|
70
|
+
"added_two",
|
71
|
+
"added_three"
|
72
|
+
]
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|