loofah 1.0.0 → 2.19.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +489 -0
  3. data/MIT-LICENSE.txt +3 -1
  4. data/README.md +364 -0
  5. data/SECURITY.md +18 -0
  6. data/lib/loofah/elements.rb +88 -11
  7. data/lib/loofah/helpers.rb +76 -2
  8. data/lib/loofah/html/document.rb +1 -0
  9. data/lib/loofah/html/document_fragment.rb +9 -2
  10. data/lib/loofah/html5/libxml2_workarounds.rb +27 -0
  11. data/lib/loofah/html5/safelist.rb +1042 -0
  12. data/lib/loofah/html5/scrub.rb +198 -40
  13. data/lib/loofah/instance_methods.rb +16 -10
  14. data/lib/loofah/metahelpers.rb +9 -10
  15. data/lib/loofah/scrubber.rb +22 -6
  16. data/lib/loofah/scrubbers.rb +96 -16
  17. data/lib/loofah/version.rb +5 -0
  18. data/lib/loofah/xml/document.rb +1 -0
  19. data/lib/loofah/xml/document_fragment.rb +5 -2
  20. data/lib/loofah.rb +38 -25
  21. metadata +159 -172
  22. data/CHANGELOG.rdoc +0 -134
  23. data/Gemfile +0 -1
  24. data/Manifest.txt +0 -34
  25. data/README.rdoc +0 -312
  26. data/Rakefile +0 -53
  27. data/benchmark/benchmark.rb +0 -149
  28. data/benchmark/fragment.html +0 -96
  29. data/benchmark/helper.rb +0 -73
  30. data/benchmark/www.slashdot.com.html +0 -2560
  31. data/lib/loofah/html5/whitelist.rb +0 -168
  32. data/test/helper.rb +0 -7
  33. data/test/html5/test_sanitizer.rb +0 -248
  34. data/test/integration/test_ad_hoc.rb +0 -176
  35. data/test/integration/test_helpers.rb +0 -33
  36. data/test/integration/test_html.rb +0 -51
  37. data/test/integration/test_scrubbers.rb +0 -331
  38. data/test/integration/test_xml.rb +0 -55
  39. data/test/unit/test_api.rb +0 -138
  40. data/test/unit/test_helpers.rb +0 -27
  41. data/test/unit/test_scrubber.rb +0 -229
  42. data/test/unit/test_scrubbers.rb +0 -14
data/lib/loofah/html5/whitelist.rb DELETED
@@ -1,168 +0,0 @@
1
- module Loofah
2
- module HTML5 # :nodoc:
3
- #
4
- # HTML whitelist lifted from HTML5lib sanitizer code:
5
- #
6
- # http://code.google.com/p/html5lib/
7
- #
8
- # <html5_license>
9
- #
10
- # Copyright (c) 2006-2008 The Authors
11
- #
12
- # Contributors:
13
- # James Graham - jg307@cam.ac.uk
14
- # Anne van Kesteren - annevankesteren@gmail.com
15
- # Lachlan Hunt - lachlan.hunt@lachy.id.au
16
- # Matt McDonald - kanashii@kanashii.ca
17
- # Sam Ruby - rubys@intertwingly.net
18
- # Ian Hickson (Google) - ian@hixie.ch
19
- # Thomas Broyer - t.broyer@ltgt.net
20
- # Jacques Distler - distler@golem.ph.utexas.edu
21
- # Henri Sivonen - hsivonen@iki.fi
22
- # The Mozilla Foundation (contributions from Henri Sivonen since 2008)
23
- #
24
- # Permission is hereby granted, free of charge, to any person
25
- # obtaining a copy of this software and associated documentation
26
- # files (the "Software"), to deal in the Software without
27
- # restriction, including without limitation the rights to use, copy,
28
- # modify, merge, publish, distribute, sublicense, and/or sell copies
29
- # of the Software, and to permit persons to whom the Software is
30
- # furnished to do so, subject to the following conditions:
31
- #
32
- # The above copyright notice and this permission notice shall be
33
- # included in all copies or substantial portions of the Software.
34
- #
35
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
36
- # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
37
- # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
38
- # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
39
- # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
40
- # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
41
- # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
42
- # DEALINGS IN THE SOFTWARE.
43
- #
44
- # </html5_license>
45
- module WhiteList
46
- ACCEPTABLE_ELEMENTS = %w[a abbr acronym address area b big blockquote br
47
- button caption center cite code col colgroup dd del dfn dir div dl dt
48
- em fieldset font form h1 h2 h3 h4 h5 h6 hr i img input ins kbd label
49
- legend li map menu ol optgroup option p pre q s samp select small span
50
- strike strong sub sup table tbody td textarea tfoot th thead tr tt u
51
- ul var]
52
-
53
- MATHML_ELEMENTS = %w[annotation annotation-xml maction math merror mfrac
54
- mfenced mi mmultiscripts mn mo mover mpadded mphantom mprescripts mroot mrow
55
- mspace msqrt mstyle msub msubsup msup mtable mtd mtext mtr munder
56
- munderover none semantics]
57
-
58
- SVG_ELEMENTS = %w[a animate animateColor animateMotion animateTransform
59
- circle defs desc ellipse font-face font-face-name font-face-src foreignObject
60
- g glyph hkern linearGradient line marker metadata missing-glyph
61
- mpath path polygon polyline radialGradient rect set stop svg switch
62
- text title tspan use]
63
-
64
- ACCEPTABLE_ATTRIBUTES = %w[abbr accept accept-charset accesskey action
65
- align alt axis border cellpadding cellspacing char charoff charset
66
- checked cite class clear cols colspan color compact coords datetime
67
- dir disabled enctype for frame headers height href hreflang hspace id
68
- ismap label lang longdesc maxlength media method multiple name nohref
69
- noshade nowrap prompt readonly rel rev rows rowspan rules scope
70
- selected shape size span src start style summary tabindex target title
71
- type usemap valign value vspace width xml:lang]
72
-
73
- MATHML_ATTRIBUTES = %w[actiontype align close columnalign columnalign
74
- columnalign columnlines columnspacing columnspan depth display
75
- displaystyle encoding equalcolumns equalrows fence fontstyle fontweight
76
- frame height linethickness lspace mathbackground mathcolor mathvariant
77
- mathvariant maxsize minsize open other rowalign rowalign rowalign rowlines
78
- rowspacing rowspan rspace scriptlevel selection separator separators
79
- stretchy width width xlink:href xlink:show xlink:type xmlns xmlns:xlink]
80
-
81
- SVG_ATTRIBUTES = %w[accent-height accumulate additive alphabetic
82
- arabic-form ascent attributeName attributeType baseProfile bbox begin
83
- by calcMode cap-height class color color-rendering content cx cy d dx
84
- dy descent display dur end fill fill-opacity fill-rule font-family
85
- font-size font-stretch font-style font-variant font-weight from fx fy g1
86
- g2 glyph-name gradientUnits hanging height horiz-adv-x horiz-origin-x id
87
- ideographic k keyPoints keySplines keyTimes lang marker-end
88
- marker-mid marker-start markerHeight markerUnits markerWidth
89
- mathematical max min name offset opacity orient origin
90
- overline-position overline-thickness panose-1 path pathLength points
91
- preserveAspectRatio r refX refY repeatCount repeatDur
92
- requiredExtensions requiredFeatures restart rotate rx ry slope stemh
93
- stemv stop-color stop-opacity strikethrough-position
94
- strikethrough-thickness stroke stroke-dasharray stroke-dashoffset
95
- stroke-linecap stroke-linejoin stroke-miterlimit stroke-opacity
96
- stroke-width systemLanguage target text-anchor to transform type u1
97
- u2 underline-position underline-thickness unicode unicode-range
98
- units-per-em values version viewBox visibility width widths x
99
- x-height x1 x2 xlink:actuate xlink:arcrole xlink:href xlink:role
100
- xlink:show xlink:title xlink:type xml:base xml:lang xml:space xmlns
101
- xmlns:xlink y y1 y2 zoomAndPan]
102
-
103
- ATTR_VAL_IS_URI = %w[href src cite action longdesc xlink:href xml:base]
104
-
105
- SVG_ATTR_VAL_ALLOWS_REF = %w[clip-path color-profile cursor fill
106
- filter marker marker-start marker-mid marker-end mask stroke]
107
-
108
- SVG_ALLOW_LOCAL_HREF = %w[altGlyph animate animateColor animateMotion
109
- animateTransform cursor feImage filter linearGradient pattern
110
- radialGradient textpath tref set use]
111
-
112
- ACCEPTABLE_CSS_PROPERTIES = %w[azimuth background-color
113
- border-bottom-color border-collapse border-color border-left-color
114
- border-right-color border-top-color clear color cursor direction
115
- display elevation float font font-family font-size font-style
116
- font-variant font-weight height letter-spacing line-height overflow
117
- pause pause-after pause-before pitch pitch-range richness speak
118
- speak-header speak-numeral speak-punctuation speech-rate stress
119
- text-align text-decoration text-indent unicode-bidi vertical-align
120
- voice-family volume white-space width]
121
-
122
- ACCEPTABLE_CSS_KEYWORDS = %w[auto aqua black block blue bold both bottom
123
- brown center collapse dashed dotted fuchsia gray green !important
124
- italic left lime maroon medium none navy normal nowrap olive pointer
125
- purple red right solid silver teal top transparent underline white
126
- yellow]
127
-
128
- ACCEPTABLE_SVG_PROPERTIES = %w[fill fill-opacity fill-rule stroke
129
- stroke-width stroke-linecap stroke-linejoin stroke-opacity]
130
-
131
- ACCEPTABLE_PROTOCOLS = %w[ed2k ftp http https irc mailto news gopher nntp
132
- telnet webcal xmpp callto feed urn aim rsync tag ssh sftp rtsp afs]
133
-
134
- # subclasses may define their own versions of these constants
135
- ALLOWED_ELEMENTS = ACCEPTABLE_ELEMENTS + MATHML_ELEMENTS + SVG_ELEMENTS
136
- ALLOWED_ATTRIBUTES = ACCEPTABLE_ATTRIBUTES + MATHML_ATTRIBUTES + SVG_ATTRIBUTES
137
- ALLOWED_CSS_PROPERTIES = ACCEPTABLE_CSS_PROPERTIES
138
- ALLOWED_CSS_KEYWORDS = ACCEPTABLE_CSS_KEYWORDS
139
- ALLOWED_SVG_PROPERTIES = ACCEPTABLE_SVG_PROPERTIES
140
- ALLOWED_PROTOCOLS = ACCEPTABLE_PROTOCOLS
141
-
142
- VOID_ELEMENTS = %w[
143
- base
144
- link
145
- meta
146
- hr
147
- br
148
- img
149
- embed
150
- param
151
- area
152
- col
153
- input
154
- ]
155
-
156
- # additional tags we should consider safe since we have libxml2 fixing up our documents.
157
- TAGS_SAFE_WITH_LIBXML2 = %w[html head body]
158
- ALLOWED_ELEMENTS_WITH_LIBXML2 = ALLOWED_ELEMENTS + TAGS_SAFE_WITH_LIBXML2
159
- end
160
-
161
- #
162
- # The HTML5lib whitelist arrays, transformed into hashes for faster lookup.
163
- #
164
- module HashedWhiteList
165
- include Loofah::MetaHelpers::HashifiedConstants(WhiteList)
166
- end
167
- end
168
- end
data/test/helper.rb DELETED
@@ -1,7 +0,0 @@
1
- require 'rubygems'
2
- require 'test/unit'
3
- require 'shoulda'
4
- require 'mocha'
5
- require File.expand_path(File.join(File.dirname(__FILE__), "..", "lib", "loofah"))
6
-
7
- puts "=> testing with Nokogiri #{Nokogiri::VERSION_INFO.inspect}"
data/test/html5/test_sanitizer.rb DELETED
@@ -1,248 +0,0 @@
1
- #
2
- # these tests taken from the HTML5 sanitization project and modified for use with Loofah
3
- # see the original here: http://code.google.com/p/html5lib/source/browse/ruby/test/test_sanitizer.rb
4
- #
5
- # license text at the bottom of this file
6
- #
7
- require File.expand_path(File.join(File.dirname(__FILE__), '..', 'helper'))
8
- require 'json'
9
-
10
- class Html5TestSanitizer < Test::Unit::TestCase
11
- include Loofah
12
-
13
- def sanitize_xhtml stream
14
- Loofah.fragment(stream).scrub!(:escape).to_xhtml
15
- end
16
-
17
- def sanitize_html stream
18
- Loofah.fragment(stream).scrub!(:escape).to_html
19
- end
20
-
21
- def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
22
- ## libxml uses double-quotes, so let's swappo-boppo our quotes before comparing.
23
- sane = sanitize_html(input).gsub('"',"'")
24
-
25
- ## HTML5's parsers are shit. there's so much inconsistency with what has closing tags, etc, that
26
- ## it would require a lot of manual hacking to make the tests match libxml's output.
27
- ## instead, I'm taking the shotgun approach, and trying to match any of the described outputs.
28
- assert((htmloutput == sane) || (rexmloutput == sane) || (xhtmloutput == sane), input)
29
- end
30
-
31
- (HTML5::WhiteList::ALLOWED_ELEMENTS).each do |tag_name|
32
- define_method "test_should_allow_#{tag_name}_tag" do
33
- input = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
34
- htmloutput = "<#{tag_name.downcase} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name.downcase}>"
35
- xhtmloutput = "<#{tag_name} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name}>"
36
- rexmloutput = xhtmloutput
37
-
38
- if %w[caption colgroup optgroup option tbody td tfoot th thead tr].include?(tag_name)
39
- htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt; baz"
40
- xhtmloutput = htmloutput
41
- elsif tag_name == 'col'
42
- htmloutput = "<col title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
43
- xhtmloutput = htmloutput
44
- rexmloutput = "<col title='1' />"
45
- elsif tag_name == 'table'
46
- htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt;baz<table title='1'> </table>"
47
- xhtmloutput = htmloutput
48
- elsif tag_name == 'image'
49
- htmloutput = "<img title='1'/>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
50
- xhtmloutput = htmloutput
51
- rexmloutput = "<image title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</image>"
52
- elsif HTML5::WhiteList::VOID_ELEMENTS.include?(tag_name)
53
- htmloutput = "<#{tag_name} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
54
- xhtmloutput = htmloutput
55
- htmloutput += '<br/>' if tag_name == 'br'
56
- rexmloutput = "<#{tag_name} title='1' />"
57
- end
58
- check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
59
- end
60
- end
61
-
62
- ##
63
- ## libxml2 downcases elements, so this is moot.
64
- ##
65
- # HTML5::WhiteList::ALLOWED_ELEMENTS.each do |tag_name|
66
- # define_method "test_should_forbid_#{tag_name.upcase}_tag" do
67
- # input = "<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>"
68
- # output = "&lt;#{tag_name.upcase} title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/#{tag_name.upcase}&gt;"
69
- # check_sanitization(input, output, output, output)
70
- # end
71
- # end
72
-
73
- HTML5::WhiteList::ALLOWED_ATTRIBUTES.each do |attribute_name|
74
- next if attribute_name == 'style'
75
- define_method "test_should_allow_#{attribute_name}_attribute" do
76
- input = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
77
- if %w[checked compact disabled ismap multiple nohref noshade nowrap readonly selected].include?(attribute_name)
78
- output = "<p #{attribute_name}>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
79
- htmloutput = "<p #{attribute_name.downcase}>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
80
- else
81
- output = "<p #{attribute_name}='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
82
- htmloutput = "<p #{attribute_name.downcase}='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
83
- end
84
- check_sanitization(input, htmloutput, output, output)
85
- end
86
- end
87
-
88
- ##
89
- ## libxml2 downcases attributes, so this is moot.
90
- ##
91
- # HTML5::WhiteList::ALLOWED_ATTRIBUTES.each do |attribute_name|
92
- # define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
93
- # input = "<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>"
94
- # output = "<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
95
- # check_sanitization(input, output, output, output)
96
- # end
97
- # end
98
-
99
- HTML5::WhiteList::ALLOWED_PROTOCOLS.each do |protocol|
100
- define_method "test_should_allow_#{protocol}_uris" do
101
- input = %(<a href="#{protocol}">foo</a>)
102
- output = "<a href='#{protocol}'>foo</a>"
103
- check_sanitization(input, output, output, output)
104
- end
105
- end
106
-
107
- HTML5::WhiteList::ALLOWED_PROTOCOLS.each do |protocol|
108
- define_method "test_should_allow_uppercase_#{protocol}_uris" do
109
- input = %(<a href="#{protocol.upcase}">foo</a>)
110
- output = "<a href='#{protocol.upcase}'>foo</a>"
111
- check_sanitization(input, output, output, output)
112
- end
113
- end
114
-
115
- HTML5::WhiteList::SVG_ALLOW_LOCAL_HREF.each do |tag_name|
116
- next unless HTML5::WhiteList::ALLOWED_ELEMENTS.include?(tag_name)
117
- define_method "test_#{tag_name}_should_allow_local_href" do
118
- input = %(<#{tag_name} xlink:href="#foo"/>)
119
- output = "<#{tag_name.downcase} xlink:href='#foo'></#{tag_name.downcase}>"
120
- xhtmloutput = "<#{tag_name} xlink:href='#foo'></#{tag_name}>"
121
- check_sanitization(input, output, xhtmloutput, xhtmloutput)
122
- end
123
-
124
- define_method "test_#{tag_name}_should_allow_local_href_with_newline" do
125
- input = %(<#{tag_name} xlink:href="\n#foo"/>)
126
- output = "<#{tag_name.downcase} xlink:href='\n#foo'></#{tag_name.downcase}>"
127
- xhtmloutput = "<#{tag_name} xlink:href='\n#foo'></#{tag_name}>"
128
- check_sanitization(input, output, xhtmloutput, xhtmloutput)
129
- end
130
-
131
- define_method "test_#{tag_name}_should_forbid_nonlocal_href" do
132
- input = %(<#{tag_name} xlink:href="http://bad.com/foo"/>)
133
- output = "<#{tag_name.downcase}></#{tag_name.downcase}>"
134
- xhtmloutput = "<#{tag_name}></#{tag_name}>"
135
- check_sanitization(input, output, xhtmloutput, xhtmloutput)
136
- end
137
-
138
- define_method "test_#{tag_name}_should_forbid_nonlocal_href_with_newline" do
139
- input = %(<#{tag_name} xlink:href="\nhttp://bad.com/foo"/>)
140
- output = "<#{tag_name.downcase}></#{tag_name.downcase}>"
141
- xhtmloutput = "<#{tag_name}></#{tag_name}>"
142
- check_sanitization(input, output, xhtmloutput, xhtmloutput)
143
- end
144
- end
145
-
146
- ##
147
- ## as tenderlove says, "care < 0"
148
- ##
149
- # def test_should_handle_astral_plane_characters
150
- # input = "<p>&#x1d4b5; &#x1d538;</p>"
151
- # output = "<p>\360\235\222\265 \360\235\224\270</p>"
152
- # check_sanitization(input, output, output, output)
153
-
154
- # input = "<p><tspan>\360\235\224\270</tspan> a</p>"
155
- # output = "<p><tspan>\360\235\224\270</tspan> a</p>"
156
- # check_sanitization(input, output, output, output)
157
- # end
158
-
159
- # This affects only NS4. Is it worth fixing?
160
- # def test_javascript_includes
161
- # input = %(<div size="&{alert('XSS')}">foo</div>)
162
- # output = "<div>foo</div>"
163
- # check_sanitization(input, output, output, output)
164
- # end
165
-
166
- ##
167
- ## these tests primarily test the parser logic, not the sanitizer
168
- ## logic. i call bullshit. we're not writing a test suite for
169
- ## libxml2 here, so let's rely on the unit tests above to take care
170
- ## of our valid elements and attributes.
171
- ##
172
- # Dir[File.join(File.dirname(__FILE__), 'testdata', '*.*')].each do |filename|
173
- # JSON::parse(open(filename).read).each do |test|
174
- # define_method "test_#{test['name']}" do
175
- # check_sanitization(
176
- # test['input'],
177
- # test['output'],
178
- # test['xhtml'] || test['output'],
179
- # test['rexml'] || test['output']
180
- # )
181
- # end
182
- # end
183
- # end
184
-
185
- ## added because we don't have any coverage above on SVG_ATTR_VAL_ALLOWS_REF
186
- HTML5::WhiteList::SVG_ATTR_VAL_ALLOWS_REF.each do |attr_name|
187
- define_method "test_should_allow_uri_refs_in_svg_attribute_#{attr_name}" do
188
- input = "<rect fill='url(#foo)' />"
189
- output = "<rect fill='url(#foo)'></rect>"
190
- check_sanitization(input, output, output, output)
191
- end
192
-
193
- define_method "test_absolute_uri_refs_in_svg_attribute_#{attr_name}" do
194
- input = "<rect fill='url(http://bad.com/) #fff' />"
195
- output = "<rect fill=' #fff'></rect>"
196
- check_sanitization(input, output, output, output)
197
- end
198
-
199
- define_method "test_uri_ref_with_space_in_svg_attribute_#{attr_name}" do
200
- input = "<rect fill='url(\n#foo)' />"
201
- rexml = "<rect fill='url(\n#foo)'></rect>"
202
- end
203
-
204
- define_method "test_absolute_uri_ref_with_space_in_svg_attribute_#{attr_name}" do
205
- input = "<rect fill=\"url(\nhttp://bad.com/)\" />"
206
- rexml = "<rect fill=' '></rect>"
207
- end
208
- end
209
-
210
- end
211
-
212
- # <html5_license>
213
- #
214
- # Copyright (c) 2006-2008 The Authors
215
- #
216
- # Contributors:
217
- # James Graham - jg307@cam.ac.uk
218
- # Anne van Kesteren - annevankesteren@gmail.com
219
- # Lachlan Hunt - lachlan.hunt@lachy.id.au
220
- # Matt McDonald - kanashii@kanashii.ca
221
- # Sam Ruby - rubys@intertwingly.net
222
- # Ian Hickson (Google) - ian@hixie.ch
223
- # Thomas Broyer - t.broyer@ltgt.net
224
- # Jacques Distler - distler@golem.ph.utexas.edu
225
- # Henri Sivonen - hsivonen@iki.fi
226
- # The Mozilla Foundation (contributions from Henri Sivonen since 2008)
227
- #
228
- # Permission is hereby granted, free of charge, to any person
229
- # obtaining a copy of this software and associated documentation files
230
- # (the "Software"), to deal in the Software without restriction,
231
- # including without limitation the rights to use, copy, modify, merge,
232
- # publish, distribute, sublicense, and/or sell copies of the Software,
233
- # and to permit persons to whom the Software is furnished to do so,
234
- # subject to the following conditions:
235
- #
236
- # The above copyright notice and this permission notice shall be
237
- # included in all copies or substantial portions of the Software.
238
- #
239
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
240
- # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
241
- # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
242
- # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
243
- # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
244
- # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
245
- # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
246
- # SOFTWARE.
247
- #
248
- # </html5_license>
data/test/integration/test_ad_hoc.rb DELETED
@@ -1,176 +0,0 @@
1
- require File.expand_path(File.join(File.dirname(__FILE__), '..', 'helper'))
2
-
3
- class TestAdHoc < Test::Unit::TestCase
4
-
5
- context "blank input string" do
6
- context "fragment" do
7
- should "return a blank string" do
8
- assert_equal "", Loofah.scrub_fragment("", :prune).to_s
9
- end
10
- end
11
-
12
- context "document" do
13
- should "return a blank string" do
14
- assert_equal "", Loofah.scrub_document("", :prune).root.to_s
15
- end
16
- end
17
- end
18
-
19
- def test_removal_of_illegal_tag
20
- html = <<-HTML
21
- following this there should be no jim tag
22
- <jim>jim</jim>
23
- was there?
24
- HTML
25
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
26
- assert sane.xpath("//jim").empty?
27
- end
28
-
29
- def test_removal_of_illegal_attribute
30
- html = "<p class=bar foo=bar abbr=bar />"
31
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
32
- node = sane.xpath("//p").first
33
- assert node.attributes['class']
34
- assert node.attributes['abbr']
35
- assert_nil node.attributes['foo']
36
- end
37
-
38
- def test_removal_of_illegal_url_in_href
39
- html = <<-HTML
40
- <a href='jimbo://jim.jim/'>this link should have its href removed because of illegal url</a>
41
- <a href='http://jim.jim/'>this link should be fine</a>
42
- HTML
43
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
44
- nodes = sane.xpath("//a")
45
- assert_nil nodes.first.attributes['href']
46
- assert nodes.last.attributes['href']
47
- end
48
-
49
- def test_css_sanitization
50
- html = "<p style='background-color: url(\"http://foo.com/\") ; background-color: #000 ;' />"
51
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
52
- assert_match(/#000/, sane.inner_html)
53
- assert_no_match(/foo\.com/, sane.inner_html)
54
- end
55
-
56
- def test_fragment_with_no_tags
57
- assert_equal "This fragment has no tags.", Loofah.scrub_fragment("This fragment has no tags.", :escape).to_xml
58
- end
59
-
60
- def test_fragment_in_p_tag
61
- assert_equal "<p>This fragment is in a p.</p>", Loofah.scrub_fragment("<p>This fragment is in a p.</p>", :escape).to_xml
62
- end
63
-
64
- def test_fragment_in_p_tag_plus_stuff
65
- assert_equal "<p>This fragment is in a p.</p>foo<strong>bar</strong>", Loofah.scrub_fragment("<p>This fragment is in a p.</p>foo<strong>bar</strong>", :escape).to_xml
66
- end
67
-
68
- def test_fragment_with_text_nodes_leading_and_trailing
69
- assert_equal "text<p>fragment</p>text", Loofah.scrub_fragment("text<p>fragment</p>text", :escape).to_xml
70
- end
71
-
72
- def test_whitewash_on_fragment
73
- html = "safe<frameset rows=\"*\"><frame src=\"http://example.com\"></frameset> <b>description</b>"
74
- whitewashed = Loofah.scrub_document(html, :whitewash).xpath("/html/body/*").to_s
75
- assert_equal "<p>safe</p><b>description</b>", whitewashed.gsub("\n","")
76
- end
77
-
78
- MSWORD_HTML = <<-EOHTML
79
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8"><meta name="ProgId" content="Word.Document"><meta name="Generator" content="Microsoft Word 11"><meta name="Originator" content="Microsoft Word 11"><link rel="File-List" href="file:///C:%5CDOCUME%7E1%5CNICOLE%7E1%5CLOCALS%7E1%5CTemp%5Cmsohtml1%5C01%5Cclip_filelist.xml"><!--[if gte mso 9]><xml>
80
- <w:WordDocument>
81
- <w:View>Normal</w:View>
82
- <w:Zoom>0</w:Zoom>
83
- <w:PunctuationKerning/>
84
- <w:ValidateAgainstSchemas/>
85
- <w:SaveIfXMLInvalid>false</w:SaveIfXMLInvalid>
86
- <w:IgnoreMixedContent>false</w:IgnoreMixedContent>
87
- <w:AlwaysShowPlaceholderText>false</w:AlwaysShowPlaceholderText>
88
- <w:Compatibility>
89
- <w:BreakWrappedTables/>
90
- <w:SnapToGridInCell/>
91
- <w:WrapTextWithPunct/>
92
- <w:UseAsianBreakRules/>
93
- <w:DontGrowAutofit/>
94
- </w:Compatibility>
95
- <w:BrowserLevel>MicrosoftInternetExplorer4</w:BrowserLevel>
96
- </w:WordDocument>
97
- </xml><![endif]--><!--[if gte mso 9]><xml>
98
- <w:LatentStyles DefLockedState="false" LatentStyleCount="156">
99
- </w:LatentStyles>
100
- </xml><![endif]--><style>
101
- <!--
102
- /* Style Definitions */
103
- p.MsoNormal, li.MsoNormal, div.MsoNormal
104
- {mso-style-parent:"";
105
- margin:0in;
106
- margin-bottom:.0001pt;
107
- mso-pagination:widow-orphan;
108
- font-size:12.0pt;
109
- font-family:"Times New Roman";
110
- mso-fareast-font-family:"Times New Roman";}
111
- @page Section1
112
- {size:8.5in 11.0in;
113
- margin:1.0in 1.25in 1.0in 1.25in;
114
- mso-header-margin:.5in;
115
- mso-footer-margin:.5in;
116
- mso-paper-source:0;}
117
- div.Section1
118
- {page:Section1;}
119
- -->
120
- </style><!--[if gte mso 10]>
121
- <style>
122
- /* Style Definitions */
123
- table.MsoNormalTable
124
- {mso-style-name:"Table Normal";
125
- mso-tstyle-rowband-size:0;
126
- mso-tstyle-colband-size:0;
127
- mso-style-noshow:yes;
128
- mso-style-parent:"";
129
- mso-padding-alt:0in 5.4pt 0in 5.4pt;
130
- mso-para-margin:0in;
131
- mso-para-margin-bottom:.0001pt;
132
- mso-pagination:widow-orphan;
133
- font-size:10.0pt;
134
- font-family:"Times New Roman";
135
- mso-ansi-language:#0400;
136
- mso-fareast-language:#0400;
137
- mso-bidi-language:#0400;}
138
- </style>
139
- <![endif]-->
140
-
141
- <p class="MsoNormal">Foo <b style="">BOLD<o:p></o:p></b></p>
142
- EOHTML
143
-
144
- def test_fragment_whitewash_on_microsofty_markup
145
- whitewashed = Loofah.fragment(MSWORD_HTML).scrub!(:whitewash)
146
- assert_equal "<p>Foo <b>BOLD</b></p>", whitewashed.to_s
147
- end
148
-
149
- def test_document_whitewash_on_microsofty_markup
150
- whitewashed = Loofah.document(MSWORD_HTML).scrub!(:whitewash)
151
- assert_contains whitewashed.to_s, %r(<p>Foo <b>BOLD</b></p>)
152
- assert_equal "<p>Foo <b>BOLD</b></p>", whitewashed.xpath("/html/body/*").to_s
153
- end
154
-
155
- def test_return_empty_string_when_nothing_left
156
- assert_equal "", Loofah.scrub_document('<script>test</script>', :prune).text
157
- end
158
-
159
- def test_removal_of_all_tags
160
- html = <<-HTML
161
- What's up <strong>doc</strong>?
162
- HTML
163
- stripped = Loofah.scrub_document(html, :prune).text
164
- assert_equal %Q(What\'s up doc?).strip, stripped.strip
165
- end
166
-
167
- def test_dont_remove_whitespace
168
- html = "Foo\nBar"
169
- assert_equal html, Loofah.scrub_document(html, :prune).text
170
- end
171
-
172
- def test_dont_remove_whitespace_between_tags
173
- html = "<p>Foo</p>\n<p>Bar</p>"
174
- assert_equal "Foo\nBar", Loofah.scrub_document(html, :prune).text
175
- end
176
- end
data/test/integration/test_helpers.rb DELETED
@@ -1,33 +0,0 @@
1
- require File.expand_path(File.join(File.dirname(__FILE__), '..', 'helper'))
2
-
3
- class TestHelpers < Test::Unit::TestCase
4
- context "#strip_tags" do
5
- context "on safe markup" do
6
- should "strip out tags" do
7
- assert_equal "omgwtfbbq!!1!", Loofah::Helpers.strip_tags("<div>omgwtfbbq</div><span>!!1!</span>")
8
- end
9
- end
10
-
11
- context "on hack attack" do
12
- should "strip escape html entities" do
13
- bad_shit = "&lt;script&gt;alert('evil')&lt;/script&gt;"
14
- assert_equal bad_shit, Loofah::Helpers.strip_tags(bad_shit)
15
- end
16
- end
17
- end
18
-
19
- context "#sanitize" do
20
- context "on safe markup" do
21
- should "render the safe html" do
22
- html = "<div>omgwtfbbq</div><span>!!1!</span>"
23
- assert_equal html, Loofah::Helpers.sanitize(html)
24
- end
25
- end
26
-
27
- context "on hack attack" do
28
- should "strip the unsafe tags" do
29
- assert_equal "alert('evil')<span>w00t</span>", Loofah::Helpers.sanitize("<script>alert('evil')</script><span>w00t</span>")
30
- end
31
- end
32
- end
33
- end