loofah 0.4.2 → 2.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +604 -0
  3. data/MIT-LICENSE.txt +3 -1
  4. data/README.md +410 -0
  5. data/SECURITY.md +18 -0
  6. data/lib/loofah/concerns.rb +207 -0
  7. data/lib/loofah/elements.rb +98 -0
  8. data/lib/loofah/helpers.rb +91 -4
  9. data/lib/loofah/html4/document.rb +17 -0
  10. data/lib/loofah/html4/document_fragment.rb +15 -0
  11. data/lib/loofah/html5/document.rb +17 -0
  12. data/lib/loofah/html5/document_fragment.rb +15 -0
  13. data/lib/loofah/html5/libxml2_workarounds.rb +28 -0
  14. data/lib/loofah/html5/safelist.rb +1058 -0
  15. data/lib/loofah/html5/scrub.rb +211 -40
  16. data/lib/loofah/metahelpers.rb +18 -0
  17. data/lib/loofah/scrubber.rb +31 -13
  18. data/lib/loofah/scrubbers.rb +262 -31
  19. data/lib/loofah/version.rb +6 -0
  20. data/lib/loofah/xml/document.rb +2 -0
  21. data/lib/loofah/xml/document_fragment.rb +6 -9
  22. data/lib/loofah.rb +131 -52
  23. metadata +79 -158
  24. data/CHANGELOG.rdoc +0 -92
  25. data/DEPRECATED.rdoc +0 -12
  26. data/Manifest.txt +0 -34
  27. data/README.rdoc +0 -330
  28. data/Rakefile +0 -61
  29. data/TODO.rdoc +0 -4
  30. data/benchmark/benchmark.rb +0 -149
  31. data/benchmark/fragment.html +0 -96
  32. data/benchmark/helper.rb +0 -73
  33. data/benchmark/www.slashdot.com.html +0 -2560
  34. data/init.rb +0 -1
  35. data/lib/loofah/active_record.rb +0 -62
  36. data/lib/loofah/html/document.rb +0 -22
  37. data/lib/loofah/html/document_fragment.rb +0 -46
  38. data/lib/loofah/html5/whitelist.rb +0 -174
  39. data/lib/loofah/instance_methods.rb +0 -77
  40. data/lib/loofah/xss_foliate.rb +0 -212
  41. data/test/helper.rb +0 -8
  42. data/test/html5/test_sanitizer.rb +0 -248
  43. data/test/test_active_record.rb +0 -146
  44. data/test/test_ad_hoc.rb +0 -272
  45. data/test/test_api.rb +0 -128
  46. data/test/test_helpers.rb +0 -28
  47. data/test/test_scrubber.rb +0 -227
  48. data/test/test_scrubbers.rb +0 -144
  49. data/test/test_xss_foliate.rb +0 -171
  50. data.tar.gz.sig +0 -0
  51. metadata.gz.sig +0 -2
@@ -1,248 +0,0 @@
1
- #
2
- # these tests taken from the HTML5 sanitization project and modified for use with Loofah
3
- # see the original here: http://code.google.com/p/html5lib/source/browse/ruby/test/test_sanitizer.rb
4
- #
5
- # license text at the bottom of this file
6
- #
7
- require File.expand_path(File.join(File.dirname(__FILE__), '..', 'helper'))
8
- require 'json'
9
-
10
- class Html5TestSanitizer < Test::Unit::TestCase
11
- include Loofah
12
-
13
- def sanitize_xhtml stream
14
- Loofah.fragment(stream).scrub!(:escape).to_xhtml
15
- end
16
-
17
- def sanitize_html stream
18
- Loofah.fragment(stream).scrub!(:escape).to_html
19
- end
20
-
21
- def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
22
- ## libxml uses double-quotes, so let's swappo-boppo our quotes before comparing.
23
- sane = sanitize_html(input).gsub('"',"'")
24
-
25
- ## HTML5's parsers are shit. there's so much inconsistency with what has closing tags, etc, that
26
- ## it would require a lot of manual hacking to make the tests match libxml's output.
27
- ## instead, I'm taking the shotgun approach, and trying to match any of the described outputs.
28
- assert((htmloutput == sane) || (rexmloutput == sane) || (xhtmloutput == sane), input)
29
- end
30
-
31
- (HTML5::WhiteList::ALLOWED_ELEMENTS).each do |tag_name|
32
- define_method "test_should_allow_#{tag_name}_tag" do
33
- input = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
34
- htmloutput = "<#{tag_name.downcase} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name.downcase}>"
35
- xhtmloutput = "<#{tag_name} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name}>"
36
- rexmloutput = xhtmloutput
37
-
38
- if %w[caption colgroup optgroup option tbody td tfoot th thead tr].include?(tag_name)
39
- htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt; baz"
40
- xhtmloutput = htmloutput
41
- elsif tag_name == 'col'
42
- htmloutput = "<col title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
43
- xhtmloutput = htmloutput
44
- rexmloutput = "<col title='1' />"
45
- elsif tag_name == 'table'
46
- htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt;baz<table title='1'> </table>"
47
- xhtmloutput = htmloutput
48
- elsif tag_name == 'image'
49
- htmloutput = "<img title='1'/>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
50
- xhtmloutput = htmloutput
51
- rexmloutput = "<image title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</image>"
52
- elsif HTML5::WhiteList::VOID_ELEMENTS.include?(tag_name)
53
- htmloutput = "<#{tag_name} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
54
- xhtmloutput = htmloutput
55
- htmloutput += '<br/>' if tag_name == 'br'
56
- rexmloutput = "<#{tag_name} title='1' />"
57
- end
58
- check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
59
- end
60
- end
61
-
62
- ##
63
- ## libxml2 downcases elements, so this is moot.
64
- ##
65
- # HTML5::WhiteList::ALLOWED_ELEMENTS.each do |tag_name|
66
- # define_method "test_should_forbid_#{tag_name.upcase}_tag" do
67
- # input = "<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>"
68
- # output = "&lt;#{tag_name.upcase} title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/#{tag_name.upcase}&gt;"
69
- # check_sanitization(input, output, output, output)
70
- # end
71
- # end
72
-
73
- HTML5::WhiteList::ALLOWED_ATTRIBUTES.each do |attribute_name|
74
- next if attribute_name == 'style'
75
- define_method "test_should_allow_#{attribute_name}_attribute" do
76
- input = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
77
- if %w[checked compact disabled ismap multiple nohref noshade nowrap readonly selected].include?(attribute_name)
78
- output = "<p #{attribute_name}>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
79
- htmloutput = "<p #{attribute_name.downcase}>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
80
- else
81
- output = "<p #{attribute_name}='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
82
- htmloutput = "<p #{attribute_name.downcase}='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
83
- end
84
- check_sanitization(input, htmloutput, output, output)
85
- end
86
- end
87
-
88
- ##
89
- ## libxml2 downcases attributes, so this is moot.
90
- ##
91
- # HTML5::WhiteList::ALLOWED_ATTRIBUTES.each do |attribute_name|
92
- # define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
93
- # input = "<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>"
94
- # output = "<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
95
- # check_sanitization(input, output, output, output)
96
- # end
97
- # end
98
-
99
- HTML5::WhiteList::ALLOWED_PROTOCOLS.each do |protocol|
100
- define_method "test_should_allow_#{protocol}_uris" do
101
- input = %(<a href="#{protocol}">foo</a>)
102
- output = "<a href='#{protocol}'>foo</a>"
103
- check_sanitization(input, output, output, output)
104
- end
105
- end
106
-
107
- HTML5::WhiteList::ALLOWED_PROTOCOLS.each do |protocol|
108
- define_method "test_should_allow_uppercase_#{protocol}_uris" do
109
- input = %(<a href="#{protocol.upcase}">foo</a>)
110
- output = "<a href='#{protocol.upcase}'>foo</a>"
111
- check_sanitization(input, output, output, output)
112
- end
113
- end
114
-
115
- HTML5::WhiteList::SVG_ALLOW_LOCAL_HREF.each do |tag_name|
116
- next unless HTML5::WhiteList::ALLOWED_ELEMENTS.include?(tag_name)
117
- define_method "test_#{tag_name}_should_allow_local_href" do
118
- input = %(<#{tag_name} xlink:href="#foo"/>)
119
- output = "<#{tag_name.downcase} xlink:href='#foo'></#{tag_name.downcase}>"
120
- xhtmloutput = "<#{tag_name} xlink:href='#foo'></#{tag_name}>"
121
- check_sanitization(input, output, xhtmloutput, xhtmloutput)
122
- end
123
-
124
- define_method "test_#{tag_name}_should_allow_local_href_with_newline" do
125
- input = %(<#{tag_name} xlink:href="\n#foo"/>)
126
- output = "<#{tag_name.downcase} xlink:href='\n#foo'></#{tag_name.downcase}>"
127
- xhtmloutput = "<#{tag_name} xlink:href='\n#foo'></#{tag_name}>"
128
- check_sanitization(input, output, xhtmloutput, xhtmloutput)
129
- end
130
-
131
- define_method "test_#{tag_name}_should_forbid_nonlocal_href" do
132
- input = %(<#{tag_name} xlink:href="http://bad.com/foo"/>)
133
- output = "<#{tag_name.downcase}></#{tag_name.downcase}>"
134
- xhtmloutput = "<#{tag_name}></#{tag_name}>"
135
- check_sanitization(input, output, xhtmloutput, xhtmloutput)
136
- end
137
-
138
- define_method "test_#{tag_name}_should_forbid_nonlocal_href_with_newline" do
139
- input = %(<#{tag_name} xlink:href="\nhttp://bad.com/foo"/>)
140
- output = "<#{tag_name.downcase}></#{tag_name.downcase}>"
141
- xhtmloutput = "<#{tag_name}></#{tag_name}>"
142
- check_sanitization(input, output, xhtmloutput, xhtmloutput)
143
- end
144
- end
145
-
146
- ##
147
- ## as tenderlove says, "care < 0"
148
- ##
149
- # def test_should_handle_astral_plane_characters
150
- # input = "<p>&#x1d4b5; &#x1d538;</p>"
151
- # output = "<p>\360\235\222\265 \360\235\224\270</p>"
152
- # check_sanitization(input, output, output, output)
153
-
154
- # input = "<p><tspan>\360\235\224\270</tspan> a</p>"
155
- # output = "<p><tspan>\360\235\224\270</tspan> a</p>"
156
- # check_sanitization(input, output, output, output)
157
- # end
158
-
159
- # This affects only NS4. Is it worth fixing?
160
- # def test_javascript_includes
161
- # input = %(<div size="&{alert('XSS')}">foo</div>)
162
- # output = "<div>foo</div>"
163
- # check_sanitization(input, output, output, output)
164
- # end
165
-
166
- ##
167
- ## these tests primarily test the parser logic, not the sanitizer
168
- ## logic. i call bullshit. we're not writing a test suite for
169
- ## libxml2 here, so let's rely on the unit tests above to take care
170
- ## of our valid elements and attributes.
171
- ##
172
- # Dir[File.join(File.dirname(__FILE__), 'testdata', '*.*')].each do |filename|
173
- # JSON::parse(open(filename).read).each do |test|
174
- # define_method "test_#{test['name']}" do
175
- # check_sanitization(
176
- # test['input'],
177
- # test['output'],
178
- # test['xhtml'] || test['output'],
179
- # test['rexml'] || test['output']
180
- # )
181
- # end
182
- # end
183
- # end
184
-
185
- ## added because we don't have any coverage above on SVG_ATTR_VAL_ALLOWS_REF
186
- HTML5::WhiteList::SVG_ATTR_VAL_ALLOWS_REF.each do |attr_name|
187
- define_method "test_should_allow_uri_refs_in_svg_attribute_#{attr_name}" do
188
- input = "<rect fill='url(#foo)' />"
189
- output = "<rect fill='url(#foo)'></rect>"
190
- check_sanitization(input, output, output, output)
191
- end
192
-
193
- define_method "test_absolute_uri_refs_in_svg_attribute_#{attr_name}" do
194
- input = "<rect fill='url(http://bad.com/) #fff' />"
195
- output = "<rect fill=' #fff'></rect>"
196
- check_sanitization(input, output, output, output)
197
- end
198
-
199
- define_method "test_uri_ref_with_space_in_svg_attribute_#{attr_name}" do
200
- input = "<rect fill='url(\n#foo)' />"
201
- rexml = "<rect fill='url(\n#foo)'></rect>"
202
- end
203
-
204
- define_method "test_absolute_uri_ref_with_space_in_svg_attribute_#{attr_name}" do
205
- input = "<rect fill=\"url(\nhttp://bad.com/)\" />"
206
- rexml = "<rect fill=' '></rect>"
207
- end
208
- end
209
-
210
- end
211
-
212
- # <html5_license>
213
- #
214
- # Copyright (c) 2006-2008 The Authors
215
- #
216
- # Contributors:
217
- # James Graham - jg307@cam.ac.uk
218
- # Anne van Kesteren - annevankesteren@gmail.com
219
- # Lachlan Hunt - lachlan.hunt@lachy.id.au
220
- # Matt McDonald - kanashii@kanashii.ca
221
- # Sam Ruby - rubys@intertwingly.net
222
- # Ian Hickson (Google) - ian@hixie.ch
223
- # Thomas Broyer - t.broyer@ltgt.net
224
- # Jacques Distler - distler@golem.ph.utexas.edu
225
- # Henri Sivonen - hsivonen@iki.fi
226
- # The Mozilla Foundation (contributions from Henri Sivonen since 2008)
227
- #
228
- # Permission is hereby granted, free of charge, to any person
229
- # obtaining a copy of this software and associated documentation files
230
- # (the "Software"), to deal in the Software without restriction,
231
- # including without limitation the rights to use, copy, modify, merge,
232
- # publish, distribute, sublicense, and/or sell copies of the Software,
233
- # and to permit persons to whom the Software is furnished to do so,
234
- # subject to the following conditions:
235
- #
236
- # The above copyright notice and this permission notice shall be
237
- # included in all copies or substantial portions of the Software.
238
- #
239
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
240
- # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
241
- # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
242
- # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
243
- # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
244
- # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
245
- # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
246
- # SOFTWARE.
247
- #
248
- # </html5_license>
@@ -1,146 +0,0 @@
1
- require File.expand_path(File.join(File.dirname(__FILE__), 'helper'))
2
-
3
- require 'loofah/active_record'
4
-
5
- class TestActiveRecord < Test::Unit::TestCase
6
-
7
- HTML_STRING = "<div>omgwtfbbq</div>"
8
- PLAIN_TEXT = "vanilla text"
9
-
10
- context "with a Post model" do
11
-
12
- setup do
13
- ActsAsFu.build_model(:posts) do
14
- string :plain_text
15
- string :html_string
16
- end
17
- end
18
-
19
- context "scrubbing a single field as a fragment" do
20
- context "using a symbol to indicate the attribute" do
21
- setup do
22
- Post.html_fragment :html_string, :scrub => :prune
23
- assert ! Post.xss_foliated?
24
- @post = Post.new :html_string => HTML_STRING, :plain_text => PLAIN_TEXT
25
- end
26
-
27
- should "scrub the specified field" do
28
- Loofah.expects(:scrub_fragment).with(HTML_STRING, :prune).once
29
- Loofah.expects(:scrub_fragment).with(PLAIN_TEXT, :prune).never
30
- @post.valid?
31
- end
32
-
33
- should "only call scrub_fragment once" do
34
- Loofah.expects(:scrub_fragment).once
35
- @post.valid?
36
- end
37
-
38
- should "generate strings" do
39
- @post.valid?
40
- assert_equal String, @post.html_string.class
41
- assert_equal HTML_STRING, @post.html_string
42
- end
43
- end
44
-
45
- context "using a string to indicate the attribute" do
46
- setup do
47
- Post.html_fragment 'html_string', :scrub => :prune
48
- assert ! Post.xss_foliated?
49
- @post = Post.new :html_string => HTML_STRING, :plain_text => PLAIN_TEXT
50
- end
51
-
52
- should "scrub the specified field" do
53
- Loofah.expects(:scrub_fragment).with(HTML_STRING, :prune).once
54
- Loofah.expects(:scrub_fragment).with(PLAIN_TEXT, :prune).never
55
- @post.valid?
56
- end
57
- end
58
- end
59
-
60
- context "scrubbing a single field as a document" do
61
- context "using a symbol to indicate the attribute" do
62
- setup do
63
- Post.html_document :html_string, :scrub => :strip
64
- @post = Post.new :html_string => HTML_STRING, :plain_text => PLAIN_TEXT
65
- end
66
-
67
- should "scrub the specified field, but not other fields" do
68
- Loofah.expects(:scrub_document).with(HTML_STRING, :strip).once
69
- Loofah.expects(:scrub_document).with(PLAIN_TEXT, :strip).never
70
- @post.valid?
71
- end
72
-
73
- should "only call scrub_document once" do
74
- Loofah.expects(:scrub_document).once
75
- @post.valid?
76
- end
77
-
78
- should "generate strings" do
79
- @post.valid?
80
- assert_equal String, @post.html_string.class
81
- end
82
- end
83
-
84
- context "using a string to indicate the attribute" do
85
- setup do
86
- Post.html_document 'html_string', :scrub => :strip
87
- @post = Post.new :html_string => HTML_STRING, :plain_text => PLAIN_TEXT
88
- end
89
-
90
- should "scrub the specified field, but not other fields" do
91
- Loofah.expects(:scrub_document).with(HTML_STRING, :strip).once
92
- Loofah.expects(:scrub_document).with(PLAIN_TEXT, :strip).never
93
- @post.valid?
94
- end
95
- end
96
- end
97
-
98
- context "not passing any options" do
99
- should "raise ArgumentError" do
100
- assert_raises(ArgumentError) {
101
- Post.html_fragment :foo
102
- }
103
- end
104
- end
105
-
106
- context "not passing :scrub option" do
107
- should "raise ArgumentError" do
108
- assert_raise(ArgumentError) {
109
- Post.html_fragment :foo, :bar => :quux
110
- }
111
- end
112
- end
113
-
114
- context "passing a :scrub option" do
115
- should "not raise ArgumentError" do
116
- assert_nothing_raised {
117
- Post.html_fragment :foo, :scrub => :quux
118
- }
119
- end
120
- end
121
-
122
- context "passing a Scrubber" do
123
- setup do
124
- @called = false
125
- @scrubber = Loofah::Scrubber.new do |node|
126
- @called = true
127
- end
128
- end
129
-
130
- should "not raise ArgumentError" do
131
- assert_nothing_raised {
132
- Post.html_fragment :html_string, :scrub => @scrubber
133
- }
134
- end
135
-
136
- should "scrub properly" do
137
- Post.html_fragment :html_string, :scrub => @scrubber
138
- post = Post.new :html_string => HTML_STRING, :plain_text => PLAIN_TEXT
139
- post.valid?
140
- assert @called
141
- end
142
- end
143
-
144
- end
145
-
146
- end
data/test/test_ad_hoc.rb DELETED
@@ -1,272 +0,0 @@
1
- require File.expand_path(File.join(File.dirname(__FILE__), 'helper'))
2
-
3
- class TestAdHoc < Test::Unit::TestCase
4
-
5
- def test_empty_string_with_escape
6
- assert_equal "", Loofah.scrub_fragment("", :escape).to_xml
7
- end
8
-
9
- def test_empty_string_with_prune
10
- assert_equal Loofah.scrub_document("", :prune).text, ""
11
- end
12
-
13
- def test_xml_document_scrub
14
- xml = Loofah.xml_document <<-EOXML
15
- <root>
16
- <employee deceased='true'>Abraham Lincoln</employee>
17
- <employee deceased='false'>Abe Vigoda</employee>
18
- </root>
19
- EOXML
20
- bring_out_your_dead = Loofah::Scrubber.new do |node|
21
- if node.name == "employee" and node["deceased"] == "true"
22
- node.remove
23
- Loofah::Scrubber::STOP # don't bother with the rest of the subtree
24
- end
25
- end
26
- assert_equal 2, xml.css("employee").length
27
-
28
- xml.scrub!(bring_out_your_dead)
29
-
30
- employees = xml.css "employee"
31
- assert_equal 1, employees.length
32
- assert_equal "Abe Vigoda", employees.first.inner_text
33
- end
34
-
35
- def test_xml_fragment_scrub
36
- xml = Loofah.xml_fragment <<-EOXML
37
- <employee deceased='true'>Abraham Lincoln</employee>
38
- <employee deceased='false'>Abe Vigoda</employee>
39
- EOXML
40
- bring_out_your_dead = Loofah::Scrubber.new do |node|
41
- if node.name == "employee" and node["deceased"] == "true"
42
- node.remove
43
- Loofah::Scrubber::STOP # don't bother with the rest of the subtree
44
- end
45
- end
46
- assert_equal 2, xml.css("employee").length
47
-
48
- xml.scrub!(bring_out_your_dead)
49
-
50
- employees = xml.css "employee"
51
- assert_equal 1, employees.length
52
- assert_equal "Abe Vigoda", employees.first.inner_text
53
- end
54
-
55
- def test_html_fragment_to_s_should_not_include_head_tags
56
- html = Loofah.fragment "<style>foo</style><div>bar</div>"
57
- assert_equal "<div>bar</div>", html.to_s
58
- end
59
-
60
- def test_html_fragment_text_should_not_include_head_tags
61
- html = Loofah.fragment "<style>foo</style><div>bar</div>"
62
- assert_equal "bar", html.text
63
- end
64
-
65
- def test_html_document_text_should_not_include_head_tags
66
- html = Loofah.document "<style>foo</style><div>bar</div>"
67
- assert_equal "bar", html.text
68
- end
69
-
70
- def test_node_scrub_should_only_scrub_subtree
71
- xml = Loofah.document <<-EOHTML
72
- <html><body>
73
- <div class='scrub'>
74
- <script>I should be removed</script>
75
- </div>
76
- <div class='noscrub'>
77
- <script>I should remain</script>
78
- </div>
79
- </body></html>
80
- EOHTML
81
- node = xml.at_css "div.scrub"
82
- node.scrub!(:prune)
83
- assert_contains xml.to_s, /I should remain/
84
- assert_does_not_contain xml.to_s, /I should be removed/
85
- end
86
-
87
- def test_nodeset_scrub_should_only_scrub_subtrees
88
- xml = Loofah.document <<-EOHTML
89
- <html><body>
90
- <div class='scrub'>
91
- <script>I should be removed</script>
92
- </div>
93
- <div class='noscrub'>
94
- <script>I should remain</script>
95
- </div>
96
- <div class='scrub'>
97
- <script>I should also be removed</script>
98
- </div>
99
- </body></html>
100
- EOHTML
101
- node_set = xml.css "div.scrub"
102
- assert_equal 2, node_set.length
103
- node_set.scrub!(:prune)
104
- assert_contains xml.to_s, /I should remain/
105
- assert_does_not_contain xml.to_s, /I should be removed/
106
- assert_does_not_contain xml.to_s, /I should also be removed/
107
- end
108
-
109
- def test_removal_of_illegal_tag
110
- html = <<-HTML
111
- following this there should be no jim tag
112
- <jim>jim</jim>
113
- was there?
114
- HTML
115
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
116
- assert sane.xpath("//jim").empty?
117
- end
118
-
119
- def test_removal_of_illegal_attribute
120
- html = "<p class=bar foo=bar abbr=bar />"
121
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
122
- node = sane.xpath("//p").first
123
- assert node.attributes['class']
124
- assert node.attributes['abbr']
125
- assert_nil node.attributes['foo']
126
- end
127
-
128
- def test_removal_of_illegal_url_in_href
129
- html = <<-HTML
130
- <a href='jimbo://jim.jim/'>this link should have its href removed because of illegal url</a>
131
- <a href='http://jim.jim/'>this link should be fine</a>
132
- HTML
133
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
134
- nodes = sane.xpath("//a")
135
- assert_nil nodes.first.attributes['href']
136
- assert nodes.last.attributes['href']
137
- end
138
-
139
- def test_css_sanitization
140
- html = "<p style='background-color: url(\"http://foo.com/\") ; background-color: #000 ;' />"
141
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
142
- assert_match(/#000/, sane.inner_html)
143
- assert_no_match(/foo\.com/, sane.inner_html)
144
- end
145
-
146
- def test_fragment_with_no_tags
147
- assert_equal "This fragment has no tags.", Loofah.scrub_fragment("This fragment has no tags.", :escape).to_xml
148
- end
149
-
150
- def test_fragment_in_p_tag
151
- assert_equal "<p>This fragment is in a p.</p>", Loofah.scrub_fragment("<p>This fragment is in a p.</p>", :escape).to_xml
152
- end
153
-
154
- def test_fragment_in_p_tag_plus_stuff
155
- assert_equal "<p>This fragment is in a p.</p>foo<strong>bar</strong>", Loofah.scrub_fragment("<p>This fragment is in a p.</p>foo<strong>bar</strong>", :escape).to_xml
156
- end
157
-
158
- def test_fragment_with_text_nodes_leading_and_trailing
159
- assert_equal "text<p>fragment</p>text", Loofah.scrub_fragment("text<p>fragment</p>text", :escape).to_xml
160
- end
161
-
162
- def test_whitewash_on_fragment
163
- html = "safe<frameset rows=\"*\"><frame src=\"http://example.com\"></frameset> <b>description</b>"
164
- whitewashed = Loofah.scrub_document(html, :whitewash).xpath("/html/body/*").to_s
165
- assert_equal "<p>safe</p><b>description</b>", whitewashed.gsub("\n","")
166
- end
167
-
168
- MSWORD_HTML = <<-EOHTML
169
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8"><meta name="ProgId" content="Word.Document"><meta name="Generator" content="Microsoft Word 11"><meta name="Originator" content="Microsoft Word 11"><link rel="File-List" href="file:///C:%5CDOCUME%7E1%5CNICOLE%7E1%5CLOCALS%7E1%5CTemp%5Cmsohtml1%5C01%5Cclip_filelist.xml"><!--[if gte mso 9]><xml>
170
- <w:WordDocument>
171
- <w:View>Normal</w:View>
172
- <w:Zoom>0</w:Zoom>
173
- <w:PunctuationKerning/>
174
- <w:ValidateAgainstSchemas/>
175
- <w:SaveIfXMLInvalid>false</w:SaveIfXMLInvalid>
176
- <w:IgnoreMixedContent>false</w:IgnoreMixedContent>
177
- <w:AlwaysShowPlaceholderText>false</w:AlwaysShowPlaceholderText>
178
- <w:Compatibility>
179
- <w:BreakWrappedTables/>
180
- <w:SnapToGridInCell/>
181
- <w:WrapTextWithPunct/>
182
- <w:UseAsianBreakRules/>
183
- <w:DontGrowAutofit/>
184
- </w:Compatibility>
185
- <w:BrowserLevel>MicrosoftInternetExplorer4</w:BrowserLevel>
186
- </w:WordDocument>
187
- </xml><![endif]--><!--[if gte mso 9]><xml>
188
- <w:LatentStyles DefLockedState="false" LatentStyleCount="156">
189
- </w:LatentStyles>
190
- </xml><![endif]--><style>
191
- <!--
192
- /* Style Definitions */
193
- p.MsoNormal, li.MsoNormal, div.MsoNormal
194
- {mso-style-parent:"";
195
- margin:0in;
196
- margin-bottom:.0001pt;
197
- mso-pagination:widow-orphan;
198
- font-size:12.0pt;
199
- font-family:"Times New Roman";
200
- mso-fareast-font-family:"Times New Roman";}
201
- @page Section1
202
- {size:8.5in 11.0in;
203
- margin:1.0in 1.25in 1.0in 1.25in;
204
- mso-header-margin:.5in;
205
- mso-footer-margin:.5in;
206
- mso-paper-source:0;}
207
- div.Section1
208
- {page:Section1;}
209
- -->
210
- </style><!--[if gte mso 10]>
211
- <style>
212
- /* Style Definitions */
213
- table.MsoNormalTable
214
- {mso-style-name:"Table Normal";
215
- mso-tstyle-rowband-size:0;
216
- mso-tstyle-colband-size:0;
217
- mso-style-noshow:yes;
218
- mso-style-parent:"";
219
- mso-padding-alt:0in 5.4pt 0in 5.4pt;
220
- mso-para-margin:0in;
221
- mso-para-margin-bottom:.0001pt;
222
- mso-pagination:widow-orphan;
223
- font-size:10.0pt;
224
- font-family:"Times New Roman";
225
- mso-ansi-language:#0400;
226
- mso-fareast-language:#0400;
227
- mso-bidi-language:#0400;}
228
- </style>
229
- <![endif]-->
230
-
231
- <p class="MsoNormal">Foo <b style="">BOLD<o:p></o:p></b></p>
232
- EOHTML
233
-
234
- def test_fragment_whitewash_on_microsofty_markup
235
- whitewashed = Loofah.fragment(MSWORD_HTML).scrub!(:whitewash)
236
- assert_equal "<p>Foo <b>BOLD</b></p>", whitewashed.to_s
237
- end
238
-
239
- def test_document_whitewash_on_microsofty_markup
240
- whitewashed = Loofah.document(MSWORD_HTML).scrub!(:whitewash)
241
- assert_contains whitewashed.to_s, %r(<p>Foo <b>BOLD</b></p>)
242
- assert_equal "<p>Foo <b>BOLD</b></p>", whitewashed.xpath("/html/body/*").to_s
243
- end
244
-
245
- def test_return_empty_string_when_nothing_left
246
- assert_equal "", Loofah.scrub_document('<script>test</script>', :prune).text
247
- end
248
-
249
- def test_removal_of_all_tags
250
- html = <<-HTML
251
- What's up <strong>doc</strong>?
252
- HTML
253
- stripped = Loofah.scrub_document(html, :prune).text
254
- assert_equal "What's up doc?".strip, stripped.strip
255
- end
256
-
257
- def test_dont_remove_whitespace
258
- html = "Foo\nBar"
259
- assert_equal html, Loofah.scrub_document(html, :prune).text
260
- end
261
-
262
- def test_dont_remove_whitespace_between_tags
263
- html = "<p>Foo</p>\n<p>Bar</p>"
264
- assert_equal "Foo\nBar", Loofah.scrub_document(html, :prune).text
265
- end
266
-
267
- def test_removal_of_entities
268
- html = "<p>this is &lt; that &quot;&amp;&quot; the other &gt; boo&apos;ya</p>"
269
- assert_equal 'this is < that "&" the other > boo\'ya', Loofah.scrub_document(html, :prune).text
270
- end
271
-
272
- end