loofah 2.1.1 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of loofah might be problematic; review the changes below for details.

@@ -1,5 +1,3 @@
- #encoding: US-ASCII
-
  require 'cgi'
  require 'crass'
 
@@ -8,13 +6,13 @@ module Loofah
  module Scrub
 
  CONTROL_CHARACTERS = /[`\u0000-\u0020\u007f\u0080-\u0101]/
- CSS_KEYWORDISH = /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
+ CSS_KEYWORDISH = /\A(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,3}\.?\d{0,10}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
  CRASS_SEMICOLON = {:node => :semicolon, :raw => ";"}
 
  class << self
 
  def allowed_element? element_name
- ::Loofah::HTML5::WhiteList::ALLOWED_ELEMENTS_WITH_LIBXML2.include? element_name
+ ::Loofah::HTML5::SafeList::ALLOWED_ELEMENTS_WITH_LIBXML2.include? element_name
  end
 
  # alternative implementation of the html5lib attribute scrubbing algorithm
@@ -30,31 +28,31 @@ module Loofah
  next
  end
 
- unless WhiteList::ALLOWED_ATTRIBUTES.include?(attr_name)
+ unless SafeList::ALLOWED_ATTRIBUTES.include?(attr_name)
  attr_node.remove
  next
  end
 
- if WhiteList::ATTR_VAL_IS_URI.include?(attr_name)
+ if SafeList::ATTR_VAL_IS_URI.include?(attr_name)
  # this block lifted nearly verbatim from HTML5 sanitization
  val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS,'').downcase
- if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && ! WhiteList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(WhiteList::PROTOCOL_SEPARATOR)[0])
+ if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && ! SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
  attr_node.remove
  next
- elsif val_unescaped.split(WhiteList::PROTOCOL_SEPARATOR)[0] == 'data'
+ elsif val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0] == 'data'
  # permit only allowed data mediatypes
- mediatype = val_unescaped.split(WhiteList::PROTOCOL_SEPARATOR)[1]
+ mediatype = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[1]
  mediatype, _ = mediatype.split(';')[0..1] if mediatype
- if mediatype && !WhiteList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
+ if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
  attr_node.remove
  next
  end
  end
  end
- if WhiteList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
+ if SafeList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
  attr_node.value = attr_node.value.gsub(/url\s*\(\s*[^#\s][^)]+?\)/m, ' ') if attr_node.value
  end
- if WhiteList::SVG_ALLOW_LOCAL_HREF.include?(node.name) && attr_name == 'xlink:href' && attr_node.value =~ /^\s*[^#\s].*/m
+ if SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) && attr_name == 'xlink:href' && attr_node.value =~ /^\s*[^#\s].*/m
  attr_node.remove
  next
  end
@@ -65,6 +63,8 @@ module Loofah
  node.attribute_nodes.each do |attr_node|
  node.remove_attribute(attr_node.name) if attr_node.value !~ /[^[:space:]]/
  end
+
+ force_correct_attribute_escaping! node
  end
 
  def scrub_css_attribute node
@@ -79,14 +79,14 @@ module Loofah
  style_tree.each do |node|
  next unless node[:node] == :property
  next if node[:children].any? do |child|
- [:url, :bad_url, :function].include? child[:node]
+ [:url, :bad_url].include?(child[:node]) || (child[:node] == :function && !SafeList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase))
  end
  name = node[:name].downcase
- if WhiteList::ALLOWED_CSS_PROPERTIES.include?(name) || WhiteList::ALLOWED_SVG_PROPERTIES.include?(name)
+ if SafeList::ALLOWED_CSS_PROPERTIES.include?(name) || SafeList::ALLOWED_SVG_PROPERTIES.include?(name)
  sanitized_tree << node << CRASS_SEMICOLON
- elsif WhiteList::SHORTHAND_CSS_PROPERTIES.include?(name.split('-').first)
+ elsif SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split('-').first)
  value = node[:value].split.map do |keyword|
- if WhiteList::ALLOWED_CSS_KEYWORDS.include?(keyword) || keyword =~ CSS_KEYWORDISH
+ if SafeList::ALLOWED_CSS_KEYWORDS.include?(keyword) || keyword =~ CSS_KEYWORDISH
  keyword
  end
  end.compact
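
A minimal sketch (not part of the diff) of what the new ALLOWED_CSS_FUNCTIONS check in this hunk does in practice. Per the sanitizer tests added later in this diff, calc() and rgb() are on that safelist while url() is not, so a property using a non-safelisted function is dropped and the resulting empty style attribute is removed:

    require "loofah"

    Loofah.fragment(%q{<span style="width: calc(5%)"></span>}).scrub!(:strip).to_html
    # the style attribute survives; the output still contains "calc(5%)"

    Loofah.fragment(%q{<span style="width: url(data-evil-url)"></span>}).scrub!(:strip).to_html
    # the property is rejected and the now-empty style attribute is removed,
    # leaving a bare <span></span>
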
@@ -100,6 +100,33 @@ module Loofah
 
  Crass::Parser.stringify sanitized_tree
  end
+
+ #
+ # libxml2 >= 2.9.2 fails to escape comments within some attributes.
+ #
+ # see comments about CVE-2018-8048 within the tests for more information
+ #
+ def force_correct_attribute_escaping! node
+ return unless Nokogiri::VersionInfo.instance.libxml2?
+
+ node.attribute_nodes.each do |attr_node|
+ next unless LibxmlWorkarounds::BROKEN_ESCAPING_ATTRIBUTES.include?(attr_node.name)
+
+ tag_name = LibxmlWorkarounds::BROKEN_ESCAPING_ATTRIBUTES_QUALIFYING_TAG[attr_node.name]
+ next unless tag_name.nil? || tag_name == node.name
+
+ #
+ # this block is just like CGI.escape in Ruby 2.4, but
+ # only encodes space and double-quote, to mimic
+ # pre-2.9.2 behavior
+ #
+ encoding = attr_node.value.encoding
+ attr_node.value = attr_node.value.gsub(/[ "]/) do |m|
+ '%' + m.unpack('H2' * m.bytesize).join('%').upcase
+ end.force_encoding(encoding)
+ end
+ end
+
  end
  end
  end
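
A minimal sketch (not part of the diff) of the effect of the force_correct_attribute_escaping! workaround added above, assuming a libxml2-based Nokogiri; the expected values come from the CVE-2018-8048 tests later in this diff (JRuby encodes only the double quote). The point is that a double quote smuggled into an href can no longer break the attribute open when the scrubbed output is serialized and re-parsed:

    require "loofah"

    html     = %q{<a href='examp<!--" unsafeattr=foo()>-->le.com'>test</a>}
    scrubbed = Loofah.fragment(html).scrub!(:prune).to_html
    reparsed = Loofah.fragment(scrubbed)

    # space and double-quote are percent-encoded, so the re-parse sees a single,
    # well-formed href instead of an injected attribute
    reparsed.at_css("a")["href"]  # => "examp<!--%22%20unsafeattr=foo()>-->le.com"
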
@@ -1,7 +1,7 @@
  module Loofah
  #
  # Loofah provides some built-in scrubbers for sanitizing with
- # HTML5lib's whitelist and for accomplishing some common
+ # HTML5lib's safelist and for accomplishing some common
  # transformation tasks.
  #
  #
@@ -99,7 +99,12 @@ module Loofah
 
  def scrub(node)
  return CONTINUE if html5lib_sanitize(node) == CONTINUE
- node.before node.children
+ if node.children.length == 1 && node.children.first.cdata?
+ sanitized_text = Loofah.fragment(node.children.first.to_html).scrub!(:strip).to_html
+ node.before Nokogiri::XML::Text.new(sanitized_text, node.document)
+ else
+ node.before node.children
+ end
  node.remove
  end
  end
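
The new branch in the :strip scrubber above special-cases an element whose only child is a CDATA/text node: instead of hoisting the raw text into the document, it re-scrubs it first. A minimal sketch (not part of the diff) of the effect, mirroring the nested-script tests added later in this diff:

    require "loofah"

    stripped = Loofah.fragment("<script><script src='malicious.js'></script>").scrub!(:strip)

    stripped.xpath("//script").empty?     # => true  (no script element survives)
    stripped.to_html.include?("<script")  # => false (the nested tag is not re-emitted as live markup)
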
@@ -0,0 +1,63 @@
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8"><meta name="ProgId" content="Word.Document"><meta name="Generator" content="Microsoft Word 11"><meta name="Originator" content="Microsoft Word 11"><link rel="File-List" href="file:///C:%5CDOCUME%7E1%5CNICOLE%7E1%5CLOCALS%7E1%5CTemp%5Cmsohtml1%5C01%5Cclip_filelist.xml"><!--[if gte mso 9]><xml>
+ <w:WordDocument>
+ <w:View>Normal</w:View>
+ <w:Zoom>0</w:Zoom>
+ <w:PunctuationKerning/>
+ <w:ValidateAgainstSchemas/>
+ <w:SaveIfXMLInvalid>false</w:SaveIfXMLInvalid>
+ <w:IgnoreMixedContent>false</w:IgnoreMixedContent>
+ <w:AlwaysShowPlaceholderText>false</w:AlwaysShowPlaceholderText>
+ <w:Compatibility>
+ <w:BreakWrappedTables/>
+ <w:SnapToGridInCell/>
+ <w:WrapTextWithPunct/>
+ <w:UseAsianBreakRules/>
+ <w:DontGrowAutofit/>
+ </w:Compatibility>
+ <w:BrowserLevel>MicrosoftInternetExplorer4</w:BrowserLevel>
+ </w:WordDocument>
+ </xml><![endif]--><!--[if gte mso 9]><xml>
+ <w:LatentStyles DefLockedState="false" LatentStyleCount="156">
+ </w:LatentStyles>
+ </xml><![endif]--><style>
+ <!--
+ /* Style Definitions */
+ p.MsoNormal, li.MsoNormal, div.MsoNormal
+ {mso-style-parent:"";
+ margin:0in;
+ margin-bottom:.0001pt;
+ mso-pagination:widow-orphan;
+ font-size:12.0pt;
+ font-family:"Times New Roman";
+ mso-fareast-font-family:"Times New Roman";}
+ @page Section1
+ {size:8.5in 11.0in;
+ margin:1.0in 1.25in 1.0in 1.25in;
+ mso-header-margin:.5in;
+ mso-footer-margin:.5in;
+ mso-paper-source:0;}
+ div.Section1
+ {page:Section1;}
+ -->
+ </style><!--[if gte mso 10]>
+ <style>
+ /* Style Definitions */
+ table.MsoNormalTable
+ {mso-style-name:"Table Normal";
+ mso-tstyle-rowband-size:0;
+ mso-tstyle-colband-size:0;
+ mso-style-noshow:yes;
+ mso-style-parent:"";
+ mso-padding-alt:0in 5.4pt 0in 5.4pt;
+ mso-para-margin:0in;
+ mso-para-margin-bottom:.0001pt;
+ mso-pagination:widow-orphan;
+ font-size:10.0pt;
+ font-family:"Times New Roman";
+ mso-ansi-language:#0400;
+ mso-fareast-language:#0400;
+ mso-bidi-language:#0400;}
+ </style>
+ <![endif]-->
+
+ <p class="MsoNormal">Foo <b style="">BOLD<o:p></o:p></b></p>
@@ -20,9 +20,9 @@ class Html5TestSanitizer < Loofah::TestCase
  def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
  ## libxml uses double-quotes, so let's swappo-boppo our quotes before comparing.
  sane = sanitize_html(input).gsub('"',"'")
- htmloutput.gsub!('"',"'")
- xhtmloutput.gsub!('"',"'")
- rexmloutput.gsub!('"',"'")
+ htmloutput = htmloutput.gsub('"',"'")
+ xhtmloutput = xhtmloutput.gsub('"',"'")
+ rexmloutput = rexmloutput.gsub('"',"'")
 
  ## HTML5's parsers are shit. there's so much inconsistency with what has closing tags, etc, that
  ## it would require a lot of manual hacking to make the tests match libxml's output.
@@ -37,7 +37,7 @@ class Html5TestSanitizer < Loofah::TestCase
  assert_in_delta t0, Time.now, 0.1 # arbitrary seconds
  end
 
- (HTML5::WhiteList::ALLOWED_ELEMENTS).each do |tag_name|
+ (HTML5::SafeList::ALLOWED_ELEMENTS).each do |tag_name|
  define_method "test_should_allow_#{tag_name}_tag" do
  input = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
  htmloutput = "<#{tag_name.downcase} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name.downcase}>"
@@ -58,7 +58,7 @@ class Html5TestSanitizer < Loofah::TestCase
  htmloutput = "<img title='1'/>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
  xhtmloutput = htmloutput
  rexmloutput = "<image title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</image>"
- elsif HTML5::WhiteList::VOID_ELEMENTS.include?(tag_name)
+ elsif HTML5::SafeList::VOID_ELEMENTS.include?(tag_name)
  htmloutput = "<#{tag_name} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
  xhtmloutput = htmloutput
  htmloutput += '<br/>' if tag_name == 'br'
@@ -71,7 +71,7 @@ class Html5TestSanitizer < Loofah::TestCase
  ##
  ## libxml2 downcases elements, so this is moot.
  ##
- # HTML5::WhiteList::ALLOWED_ELEMENTS.each do |tag_name|
+ # HTML5::SafeList::ALLOWED_ELEMENTS.each do |tag_name|
  # define_method "test_should_forbid_#{tag_name.upcase}_tag" do
  # input = "<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>"
  # output = "&lt;#{tag_name.upcase} title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/#{tag_name.upcase}&gt;"
@@ -79,7 +79,7 @@ class Html5TestSanitizer < Loofah::TestCase
  # end
  # end
 
- HTML5::WhiteList::ALLOWED_ATTRIBUTES.each do |attribute_name|
+ HTML5::SafeList::ALLOWED_ATTRIBUTES.each do |attribute_name|
  next if attribute_name == 'style'
  define_method "test_should_allow_#{attribute_name}_attribute" do
  input = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
@@ -110,10 +110,17 @@ class Html5TestSanitizer < Loofah::TestCase
  check_sanitization(input, htmloutput, output, output)
  end
 
+ def test_should_allow_contenteditable
+ input = '<p contenteditable="false">Hi!</p>'
+ output = '<p contenteditable="false">Hi!</p>'
+
+ check_sanitization(input, output, output, output)
+ end
+
  ##
  ## libxml2 downcases attributes, so this is moot.
  ##
- # HTML5::WhiteList::ALLOWED_ATTRIBUTES.each do |attribute_name|
+ # HTML5::SafeList::ALLOWED_ATTRIBUTES.each do |attribute_name|
  # define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
  # input = "<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>"
  # output = "<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
@@ -121,7 +128,7 @@ class Html5TestSanitizer < Loofah::TestCase
  # end
  # end
 
- HTML5::WhiteList::ALLOWED_PROTOCOLS.each do |protocol|
+ HTML5::SafeList::ALLOWED_PROTOCOLS.each do |protocol|
  define_method "test_should_allow_#{protocol}_uris" do
  input = %(<a href="#{protocol}">foo</a>)
  output = "<a href='#{protocol}'>foo</a>"
@@ -129,15 +136,15 @@ class Html5TestSanitizer < Loofah::TestCase
  end
  end
 
- HTML5::WhiteList::ALLOWED_PROTOCOLS.each do |protocol|
+ HTML5::SafeList::ALLOWED_PROTOCOLS.each do |protocol|
  define_method "test_should_allow_uppercase_#{protocol}_uris" do
  input = %(<a href="#{protocol.upcase}">foo</a>)
  output = "<a href='#{protocol.upcase}'>foo</a>"
  check_sanitization(input, output, output, output)
  end
  end
-
- HTML5::WhiteList::ALLOWED_URI_DATA_MEDIATYPES.each do |data_uri_type|
+
+ HTML5::SafeList::ALLOWED_URI_DATA_MEDIATYPES.each do |data_uri_type|
  define_method "test_should_allow_data_#{data_uri_type}_uris" do
  input = %(<a href="data:#{data_uri_type}">foo</a>)
  output = "<a href='data:#{data_uri_type}'>foo</a>"
@@ -149,7 +156,7 @@ class Html5TestSanitizer < Loofah::TestCase
  end
  end
 
- HTML5::WhiteList::ALLOWED_URI_DATA_MEDIATYPES.each do |data_uri_type|
+ HTML5::SafeList::ALLOWED_URI_DATA_MEDIATYPES.each do |data_uri_type|
  define_method "test_should_allow_uppercase_data_#{data_uri_type}_uris" do
  input = %(<a href="DATA:#{data_uri_type.upcase}">foo</a>)
  output = "<a href='DATA:#{data_uri_type.upcase}'>foo</a>"
@@ -172,8 +179,8 @@ class Html5TestSanitizer < Loofah::TestCase
  end
 
 
- HTML5::WhiteList::SVG_ALLOW_LOCAL_HREF.each do |tag_name|
- next unless HTML5::WhiteList::ALLOWED_ELEMENTS.include?(tag_name)
+ HTML5::SafeList::SVG_ALLOW_LOCAL_HREF.each do |tag_name|
+ next unless HTML5::SafeList::ALLOWED_ELEMENTS.include?(tag_name)
  define_method "test_#{tag_name}_should_allow_local_href" do
  input = %(<#{tag_name} xlink:href="#foo"/>)
  output = "<#{tag_name.downcase} xlink:href='#foo'></#{tag_name.downcase}>"
@@ -249,7 +256,7 @@ class Html5TestSanitizer < Loofah::TestCase
  end
 
  ## added because we don't have any coverage above on SVG_ATTR_VAL_ALLOWS_REF
- HTML5::WhiteList::SVG_ATTR_VAL_ALLOWS_REF.each do |attr_name|
+ HTML5::SafeList::SVG_ATTR_VAL_ALLOWS_REF.each do |attr_name|
  define_method "test_should_allow_uri_refs_in_svg_attribute_#{attr_name}" do
  input = "<rect fill='url(#foo)' />"
  output = "<rect fill='url(#foo)'></rect>"
@@ -263,6 +270,12 @@ class Html5TestSanitizer < Loofah::TestCase
  end
  end
 
+ def test_css_list_style
+ html = '<ul style="list-style: none"></ul>'
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
+ assert_match %r/list-style/, sane.inner_html
+ end
+
  def test_css_negative_value_sanitization
  html = "<span style=\"letter-spacing:-0.03em;\">"
  sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
@@ -275,6 +288,44 @@ class Html5TestSanitizer < Loofah::TestCase
  assert_match %r/-0.05em/, sane.inner_html
  end
 
+ def test_css_high_precision_value_shorthand_css_properties
+ html = "<span style=\"margin-left:0.3333333334em;\">"
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
+ assert_match %r/0.3333333334em/, sane.inner_html
+ end
+
+ def test_css_function_sanitization_leaves_safelisted_functions_calc
+ html = "<span style=\"width:calc(5%)\">"
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
+ assert_match %r/calc\(5%\)/, sane.inner_html
+
+ html = "<span style=\"width: calc(5%)\">"
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
+ assert_match %r/calc\(5%\)/, sane.inner_html
+ end
+
+ def test_css_function_sanitization_leaves_safelisted_functions_rgb
+ html = '<span style="color: rgb(255, 0, 0)">'
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
+ assert_match %r/rgb\(255, 0, 0\)/, sane.inner_html
+ end
+
+ def test_css_function_sanitization_leaves_safelisted_list_style_type
+ html = "<ol style='list-style-type:lower-greek;'></ol>"
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
+ assert_match %r/list-style-type:lower-greek/, sane.inner_html
+ end
+
+ def test_css_function_sanitization_strips_style_attributes_with_unsafe_functions
+ html = "<span style=\"width:url(data-evil-url)\">"
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
+ assert_match %r/<span><\/span>/, sane.inner_html
+
+ html = "<span style=\"width: url(data-evil-url)\">"
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
+ assert_match %r/<span><\/span>/, sane.inner_html
+ end
+
  def test_issue_90_slow_regex
  skip("timing tests are hard to make pass and have little regression-testing value")
 
@@ -0,0 +1,10 @@
+ require "helper"
+
+ class UnitHTML5Scrub < Loofah::TestCase
+ include Loofah
+
+ def test_scrub_css
+ assert_equal Loofah::HTML5::Scrub.scrub_css("background: #ABC012"), "background:#ABC012;"
+ assert_equal Loofah::HTML5::Scrub.scrub_css("background: #abc012"), "background:#abc012;"
+ end
+ end
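
This new unit test exists because of the relaxed CSS_KEYWORDISH pattern earlier in the diff: uppercase hex colors now match (#[0-9a-fA-F]+), and numeric values may carry up to ten digits after the decimal point. A minimal sketch (not part of the diff), using values taken from the new tests:

    require "loofah"

    Loofah::HTML5::Scrub.scrub_css("background: #ABC012")
    # => "background:#ABC012;"   (2.1.1 dropped uppercase hex colors)

    Loofah::HTML5::Scrub.scrub_css("margin-left: 0.3333333334em")
    # the high-precision length is retained; the old \d{0,2}\.?\d{0,2}
    # pattern would have rejected it
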
@@ -16,161 +16,189 @@ class IntegrationTestAdHoc < Loofah::TestCase
  end
  end
 
- def test_removal_of_illegal_tag
- html = <<-HTML
+ context "tests" do
+ MSWORD_HTML = File.read(File.join(File.dirname(__FILE__), "..", "assets", "msword.html")).freeze
+
+ def test_removal_of_illegal_tag
+ html = <<-HTML
  following this there should be no jim tag
  <jim>jim</jim>
  was there?
  HTML
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
- assert sane.xpath("//jim").empty?
- end
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
+ assert sane.xpath("//jim").empty?
+ end
 
- def test_removal_of_illegal_attribute
- html = "<p class=bar foo=bar abbr=bar />"
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
- node = sane.xpath("//p").first
- assert node.attributes['class']
- assert node.attributes['abbr']
- assert_nil node.attributes['foo']
- end
+ def test_removal_of_illegal_attribute
+ html = "<p class=bar foo=bar abbr=bar />"
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
+ node = sane.xpath("//p").first
+ assert node.attributes['class']
+ assert node.attributes['abbr']
+ assert_nil node.attributes['foo']
+ end
 
- def test_removal_of_illegal_url_in_href
- html = <<-HTML
+ def test_removal_of_illegal_url_in_href
+ html = <<-HTML
  <a href='jimbo://jim.jim/'>this link should have its href removed because of illegal url</a>
  <a href='http://jim.jim/'>this link should be fine</a>
  HTML
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
- nodes = sane.xpath("//a")
- assert_nil nodes.first.attributes['href']
- assert nodes.last.attributes['href']
- end
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
+ nodes = sane.xpath("//a")
+ assert_nil nodes.first.attributes['href']
+ assert nodes.last.attributes['href']
+ end
 
- def test_css_sanitization
- html = "<p style='background-color: url(\"http://foo.com/\") ; background-color: #000 ;' />"
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
- assert_match %r/#000/, sane.inner_html
- refute_match %r/foo\.com/, sane.inner_html
- end
+ def test_css_sanitization
+ html = "<p style='background-color: url(\"http://foo.com/\") ; background-color: #000 ;' />"
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
+ assert_match %r/#000/, sane.inner_html
+ refute_match %r/foo\.com/, sane.inner_html
+ end
 
- def test_fragment_with_no_tags
- assert_equal "This fragment has no tags.", Loofah.scrub_fragment("This fragment has no tags.", :escape).to_xml
- end
+ def test_fragment_with_no_tags
+ assert_equal "This fragment has no tags.", Loofah.scrub_fragment("This fragment has no tags.", :escape).to_xml
+ end
 
- def test_fragment_in_p_tag
- assert_equal "<p>This fragment is in a p.</p>", Loofah.scrub_fragment("<p>This fragment is in a p.</p>", :escape).to_xml
- end
+ def test_fragment_in_p_tag
+ assert_equal "<p>This fragment is in a p.</p>", Loofah.scrub_fragment("<p>This fragment is in a p.</p>", :escape).to_xml
+ end
 
- def test_fragment_in_p_tag_plus_stuff
- assert_equal "<p>This fragment is in a p.</p>foo<strong>bar</strong>", Loofah.scrub_fragment("<p>This fragment is in a p.</p>foo<strong>bar</strong>", :escape).to_xml
- end
+ def test_fragment_in_p_tag_plus_stuff
+ assert_equal "<p>This fragment is in a p.</p>foo<strong>bar</strong>", Loofah.scrub_fragment("<p>This fragment is in a p.</p>foo<strong>bar</strong>", :escape).to_xml
+ end
 
- def test_fragment_with_text_nodes_leading_and_trailing
- assert_equal "text<p>fragment</p>text", Loofah.scrub_fragment("text<p>fragment</p>text", :escape).to_xml
- end
+ def test_fragment_with_text_nodes_leading_and_trailing
+ assert_equal "text<p>fragment</p>text", Loofah.scrub_fragment("text<p>fragment</p>text", :escape).to_xml
+ end
 
- def test_whitewash_on_fragment
- html = "safe<frameset rows=\"*\"><frame src=\"http://example.com\"></frameset> <b>description</b>"
- whitewashed = Loofah.scrub_document(html, :whitewash).xpath("/html/body/*").to_s
- assert_equal "<p>safe</p><b>description</b>", whitewashed.gsub("\n","")
- end
+ def test_whitewash_on_fragment
+ html = "safe<frameset rows=\"*\"><frame src=\"http://example.com\"></frameset> <b>description</b>"
+ whitewashed = Loofah.scrub_document(html, :whitewash).xpath("/html/body/*").to_s
+ assert_equal "<p>safe</p><b>description</b>", whitewashed.gsub("\n","")
+ end
 
- MSWORD_HTML = <<-EOHTML
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8"><meta name="ProgId" content="Word.Document"><meta name="Generator" content="Microsoft Word 11"><meta name="Originator" content="Microsoft Word 11"><link rel="File-List" href="file:///C:%5CDOCUME%7E1%5CNICOLE%7E1%5CLOCALS%7E1%5CTemp%5Cmsohtml1%5C01%5Cclip_filelist.xml"><!--[if gte mso 9]><xml>
- <w:WordDocument>
- <w:View>Normal</w:View>
- <w:Zoom>0</w:Zoom>
- <w:PunctuationKerning/>
- <w:ValidateAgainstSchemas/>
- <w:SaveIfXMLInvalid>false</w:SaveIfXMLInvalid>
- <w:IgnoreMixedContent>false</w:IgnoreMixedContent>
- <w:AlwaysShowPlaceholderText>false</w:AlwaysShowPlaceholderText>
- <w:Compatibility>
- <w:BreakWrappedTables/>
- <w:SnapToGridInCell/>
- <w:WrapTextWithPunct/>
- <w:UseAsianBreakRules/>
- <w:DontGrowAutofit/>
- </w:Compatibility>
- <w:BrowserLevel>MicrosoftInternetExplorer4</w:BrowserLevel>
- </w:WordDocument>
- </xml><![endif]--><!--[if gte mso 9]><xml>
- <w:LatentStyles DefLockedState="false" LatentStyleCount="156">
- </w:LatentStyles>
- </xml><![endif]--><style>
- <!--
- /* Style Definitions */
- p.MsoNormal, li.MsoNormal, div.MsoNormal
- {mso-style-parent:"";
- margin:0in;
- margin-bottom:.0001pt;
- mso-pagination:widow-orphan;
- font-size:12.0pt;
- font-family:"Times New Roman";
- mso-fareast-font-family:"Times New Roman";}
- @page Section1
- {size:8.5in 11.0in;
- margin:1.0in 1.25in 1.0in 1.25in;
- mso-header-margin:.5in;
- mso-footer-margin:.5in;
- mso-paper-source:0;}
- div.Section1
- {page:Section1;}
- -->
- </style><!--[if gte mso 10]>
- <style>
- /* Style Definitions */
- table.MsoNormalTable
- {mso-style-name:"Table Normal";
- mso-tstyle-rowband-size:0;
- mso-tstyle-colband-size:0;
- mso-style-noshow:yes;
- mso-style-parent:"";
- mso-padding-alt:0in 5.4pt 0in 5.4pt;
- mso-para-margin:0in;
- mso-para-margin-bottom:.0001pt;
- mso-pagination:widow-orphan;
- font-size:10.0pt;
- font-family:"Times New Roman";
- mso-ansi-language:#0400;
- mso-fareast-language:#0400;
- mso-bidi-language:#0400;}
- </style>
- <![endif]-->
-
- <p class="MsoNormal">Foo <b style="">BOLD<o:p></o:p></b></p>
- EOHTML
-
- def test_fragment_whitewash_on_microsofty_markup
- whitewashed = Loofah.fragment(MSWORD_HTML).scrub!(:whitewash)
- assert_equal "<p>Foo <b>BOLD</b></p>", whitewashed.to_s.strip
- end
+ def test_fragment_whitewash_on_microsofty_markup
+ whitewashed = Loofah.fragment(MSWORD_HTML).scrub!(:whitewash)
+ assert_equal "<p>Foo <b>BOLD</b></p>", whitewashed.to_s.strip
+ end
 
- def test_document_whitewash_on_microsofty_markup
- whitewashed = Loofah.document(MSWORD_HTML).scrub!(:whitewash)
- assert_match %r(<p>Foo <b>BOLD</b></p>), whitewashed.to_s
- assert_equal "<p>Foo <b>BOLD</b></p>", whitewashed.xpath("/html/body/*").to_s
- end
+ def test_document_whitewash_on_microsofty_markup
+ whitewashed = Loofah.document(MSWORD_HTML).scrub!(:whitewash)
+ assert_match %r(<p>Foo <b>BOLD</b></p>), whitewashed.to_s
+ assert_equal "<p>Foo <b>BOLD</b></p>", whitewashed.xpath("/html/body/*").to_s
+ end
 
- def test_return_empty_string_when_nothing_left
- assert_equal "", Loofah.scrub_document('<script>test</script>', :prune).text
- end
+ def test_return_empty_string_when_nothing_left
+ assert_equal "", Loofah.scrub_document('<script>test</script>', :prune).text
+ end
 
- def test_removal_of_all_tags
- html = <<-HTML
+ def test_nested_script_cdata_tags_should_be_scrubbed
+ html = "<script><script src='malicious.js'></script>"
+ stripped = Loofah.fragment(html).scrub!(:strip)
+ assert_empty stripped.xpath("//script")
+ refute_match("<script", stripped.to_html)
+ end
+
+ def test_nested_script_cdata_tags_should_be_scrubbed_2
+ html = "<script><script>alert('a');</script></script>"
+ stripped = Loofah.fragment(html).scrub!(:strip)
+ assert_empty stripped.xpath("//script")
+ refute_match("<script", stripped.to_html)
+ end
+
+ def test_removal_of_all_tags
+ html = <<-HTML
  What's up <strong>doc</strong>?
  HTML
- stripped = Loofah.scrub_document(html, :prune).text
- assert_equal %Q(What\'s up doc?).strip, stripped.strip
- end
+ stripped = Loofah.scrub_document(html, :prune).text
+ assert_equal %Q(What\'s up doc?).strip, stripped.strip
+ end
 
- def test_dont_remove_whitespace
- html = "Foo\nBar"
- assert_equal html, Loofah.scrub_document(html, :prune).text
- end
+ def test_dont_remove_whitespace
+ html = "Foo\nBar"
+ assert_equal html, Loofah.scrub_document(html, :prune).text
+ end
+
+ def test_dont_remove_whitespace_between_tags
+ html = "<p>Foo</p>\n<p>Bar</p>"
+ assert_equal "Foo\nBar", Loofah.scrub_document(html, :prune).text
+ end
 
- def test_dont_remove_whitespace_between_tags
- html = "<p>Foo</p>\n<p>Bar</p>"
- assert_equal "Foo\nBar", Loofah.scrub_document(html, :prune).text
+ #
+ # tests for CVE-2018-8048 (see https://github.com/flavorjones/loofah/issues/144)
+ #
+ # libxml2 >= 2.9.2 fails to escape comments within some attributes. It
+ # wants to ensure these comments can be treated as "server-side includes",
+ # but as a result fails to ensure that serialization is well-formed,
+ # resulting in an opportunity for XSS injection of code into a final
+ # re-parsed document (presumably in a browser).
+ #
+ # we'll test this by parsing the HTML, serializing it, then
+ # re-parsing it to ensure there isn't any ambiguity in the output
+ # that might allow code injection into a browser consuming
+ # "sanitized" output.
+ #
+ [
+ #
+ # these tags and attributes are determined by the code at:
+ #
+ # https://git.gnome.org/browse/libxml2/tree/HTMLtree.c?h=v2.9.2#n714
+ #
+ {tag: "a", attr: "href"},
+ {tag: "div", attr: "href"},
+ {tag: "a", attr: "action"},
+ {tag: "div", attr: "action"},
+ {tag: "a", attr: "src"},
+ {tag: "div", attr: "src"},
+ {tag: "a", attr: "name"},
+ #
+ # note that div+name is _not_ affected by the libxml2 issue.
+ # but we test it anyway to ensure our logic isn't modifying
+ # attributes that don't need modifying.
+ #
+ {tag: "div", attr: "name", unescaped: true},
+ ].each do |config|
+
+ define_method "test_uri_escaping_of_#{config[:attr]}_attr_in_#{config[:tag]}_tag" do
+ html = %{<#{config[:tag]} #{config[:attr]}='examp<!--" unsafeattr=foo()>-->le.com'>test</#{config[:tag]}>}
+
+ reparsed = Loofah.fragment(Loofah.fragment(html).scrub!(:prune).to_html)
+ attributes = reparsed.at_css(config[:tag]).attribute_nodes
+
+ assert_equal [config[:attr]], attributes.collect(&:name)
+ if Nokogiri::VersionInfo.instance.libxml2?
+ if config[:unescaped]
+ #
+ # this attribute was emitted wrapped in single-quotes, so a double quote is A-OK.
+ # assert that this attribute's serialization is unaffected.
+ #
+ assert_equal %{examp<!--" unsafeattr=foo()>-->le.com}, attributes.first.value
+ else
+ #
+ # let's match the behavior in libxml < 2.9.2.
+ # test that this attribute's serialization is well-formed and sanitized.
+ #
+ assert_equal %{examp<!--%22%20unsafeattr=foo()>-->le.com}, attributes.first.value
+ end
+ else
+ #
+ # yay for consistency in javaland. move along, nothing to see here.
+ #
+ assert_equal %{examp<!--%22 unsafeattr=foo()>-->le.com}, attributes.first.value
+ end
+ end
+ end
+
+ # see:
+ # - https://github.com/flavorjones/loofah/issues/154
+ # - https://hackerone.com/reports/429267
+ context "xss protection from svg xmlns:xlink animate attribute" do
+ it "sanitizes appropriate attributes" do
+ html = %Q{<svg><a xmlns:xlink=http://www.w3.org/1999/xlink xlink:href=?><circle r=400 /><animate attributeName=xlink:href begin=0 from=javascript:alert(1) to=%26>}
+ sanitized = Loofah.scrub_fragment(html, :escape)
+ assert_nil sanitized.at_css("animate")["from"]
+ end
+ end
  end
  end