loofah 2.1.0 → 2.2.0

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of loofah might be problematic. Click here for more details.

@@ -2,13 +2,88 @@ require 'set'
2
2
 
3
3
  module Loofah
4
4
  module Elements
5
- # Block elements in HTML4
6
- STRICT_BLOCK_LEVEL = Set.new %w[address blockquote center dir div dl
7
- fieldset form h1 h2 h3 h4 h5 h6 hr isindex menu noframes
8
- noscript ol p pre table ul]
5
+ STRICT_BLOCK_LEVEL_HTML4 = Set.new %w[
6
+ address
7
+ blockquote
8
+ center
9
+ dir
10
+ div
11
+ dl
12
+ fieldset
13
+ form
14
+ h1
15
+ h2
16
+ h3
17
+ h4
18
+ h5
19
+ h6
20
+ hr
21
+ isindex
22
+ menu
23
+ noframes
24
+ noscript
25
+ ol
26
+ p
27
+ pre
28
+ table
29
+ ul
30
+ ]
9
31
 
10
- # The following elements may also be considered block-level elements since they may contain block-level elements
11
- LOOSE_BLOCK_LEVEL = Set.new %w[dd dt frameset li tbody td tfoot th thead tr]
32
+ # https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements
33
+ STRICT_BLOCK_LEVEL_HTML5 = Set.new %w[
34
+ address
35
+ article
36
+ aside
37
+ blockquote
38
+ canvas
39
+ dd
40
+ div
41
+ dl
42
+ dt
43
+ fieldset
44
+ figcaption
45
+ figure
46
+ footer
47
+ form
48
+ h1
49
+ h2
50
+ h3
51
+ h4
52
+ h5
53
+ h6
54
+ header
55
+ hgroup
56
+ hr
57
+ li
58
+ main
59
+ nav
60
+ noscript
61
+ ol
62
+ output
63
+ p
64
+ pre
65
+ section
66
+ table
67
+ tfoot
68
+ ul
69
+ video
70
+ ]
71
+
72
+ STRICT_BLOCK_LEVEL = STRICT_BLOCK_LEVEL_HTML4 + STRICT_BLOCK_LEVEL_HTML5
73
+
74
+ # The following elements may also be considered block-level
75
+ # elements since they may contain block-level elements
76
+ LOOSE_BLOCK_LEVEL = Set.new %w[dd
77
+ dt
78
+ frameset
79
+ li
80
+ tbody
81
+ td
82
+ tfoot
83
+ th
84
+ thead
85
+ tr
86
+ ]
12
87
 
13
88
  BLOCK_LEVEL = STRICT_BLOCK_LEVEL + LOOSE_BLOCK_LEVEL
14
89
  end
@@ -44,7 +44,7 @@ module Loofah
44
44
  elsif val_unescaped.split(WhiteList::PROTOCOL_SEPARATOR)[0] == 'data'
45
45
  # permit only allowed data mediatypes
46
46
  mediatype = val_unescaped.split(WhiteList::PROTOCOL_SEPARATOR)[1]
47
- mediatype, base64 = mediatype.split(';')[0..1] if mediatype
47
+ mediatype, _ = mediatype.split(';')[0..1] if mediatype
48
48
  if mediatype && !WhiteList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
49
49
  attr_node.remove
50
50
  next
@@ -79,7 +79,7 @@ module Loofah
79
79
  style_tree.each do |node|
80
80
  next unless node[:node] == :property
81
81
  next if node[:children].any? do |child|
82
- [:url, :bad_url, :function].include? child[:node]
82
+ [:url, :bad_url].include?(child[:node]) || (child[:node] == :function && !WhiteList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase))
83
83
  end
84
84
  name = node[:name].downcase
85
85
  if WhiteList::ALLOWED_CSS_PROPERTIES.include?(name) || WhiteList::ALLOWED_SVG_PROPERTIES.include?(name)
@@ -51,7 +51,7 @@ module Loofah
51
51
  caption center cite code col colgroup command datalist dd del
52
52
  details dfn dir div dl dt em fieldset figcaption figure footer
53
53
  font form h1 h2 h3 h4 h5 h6 header hr i img input ins kbd label
54
- legend li map mark menu meter nav ol output optgroup option p
54
+ legend li main map mark menu meter nav ol output optgroup option p
55
55
  pre q s samp section select small span strike strong sub summary
56
56
  sup table tbody td textarea tfoot th thead time tr tt u ul var
57
57
  video]
@@ -65,7 +65,7 @@ module Loofah
65
65
  circle clipPath defs desc ellipse feGaussianBlur filter font-face
66
66
  font-face-name font-face-src foreignObject
67
67
  g glyph hkern linearGradient line marker mask metadata missing-glyph
68
- mpath path polygon polyline radialGradient rect set stop svg switch
68
+ mpath path polygon polyline radialGradient rect set stop svg switch symbol
69
69
  text textPath title tspan use]
70
70
 
71
71
  ACCEPTABLE_ATTRIBUTES = Set.new %w[abbr accept accept-charset accesskey action
@@ -125,8 +125,8 @@ module Loofah
125
125
  border-bottom-color border-collapse border-color border-left-color
126
126
  border-right-color border-top-color clear color cursor direction
127
127
  display elevation float font font-family font-size font-style
128
- font-variant font-weight height letter-spacing line-height overflow
129
- pause pause-after pause-before pitch pitch-range richness speak
128
+ font-variant font-weight height letter-spacing line-height list-style-type
129
+ overflow pause pause-after pause-before pitch pitch-range richness speak
130
130
  speak-header speak-numeral speak-punctuation speech-rate stress
131
131
  text-align text-decoration text-indent unicode-bidi vertical-align
132
132
  voice-family volume white-space width]
@@ -137,6 +137,8 @@ module Loofah
137
137
  purple red right solid silver teal top transparent underline white
138
138
  yellow]
139
139
 
140
+ ACCEPTABLE_CSS_FUNCTIONS = Set.new %w[calc rgb]
141
+
140
142
  SHORTHAND_CSS_PROPERTIES = Set.new %w[background border margin padding]
141
143
 
142
144
  ACCEPTABLE_SVG_PROPERTIES = Set.new %w[fill fill-opacity fill-rule stroke
@@ -155,6 +157,7 @@ module Loofah
155
157
  ALLOWED_ATTRIBUTES = ACCEPTABLE_ATTRIBUTES + MATHML_ATTRIBUTES + SVG_ATTRIBUTES
156
158
  ALLOWED_CSS_PROPERTIES = ACCEPTABLE_CSS_PROPERTIES
157
159
  ALLOWED_CSS_KEYWORDS = ACCEPTABLE_CSS_KEYWORDS
160
+ ALLOWED_CSS_FUNCTIONS = ACCEPTABLE_CSS_FUNCTIONS
158
161
  ALLOWED_SVG_PROPERTIES = ACCEPTABLE_SVG_PROPERTIES
159
162
  ALLOWED_PROTOCOLS = ACCEPTABLE_PROTOCOLS
160
163
  ALLOWED_URI_DATA_MEDIATYPES = ACCEPTABLE_URI_DATA_MEDIATYPES
@@ -99,7 +99,12 @@ module Loofah
99
99
 
100
100
  def scrub(node)
101
101
  return CONTINUE if html5lib_sanitize(node) == CONTINUE
102
- node.before node.children
102
+ if node.children.length == 1 && node.children.first.cdata?
103
+ sanitized_text = Loofah.fragment(node.children.first.to_html).scrub!(:strip).to_html
104
+ node.before Nokogiri::XML::Text.new(sanitized_text, node.document)
105
+ else
106
+ node.before node.children
107
+ end
103
108
  node.remove
104
109
  end
105
110
  end
@@ -20,9 +20,9 @@ class Html5TestSanitizer < Loofah::TestCase
20
20
  def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
21
21
  ## libxml uses double-quotes, so let's swappo-boppo our quotes before comparing.
22
22
  sane = sanitize_html(input).gsub('"',"'")
23
- htmloutput.gsub!('"',"'")
24
- xhtmloutput.gsub!('"',"'")
25
- rexmloutput.gsub!('"',"'")
23
+ htmloutput = htmloutput.gsub('"',"'")
24
+ xhtmloutput = xhtmloutput.gsub('"',"'")
25
+ rexmloutput = rexmloutput.gsub('"',"'")
26
26
 
27
27
  ## HTML5's parsers are shit. there's so much inconsistency with what has closing tags, etc, that
28
28
  ## it would require a lot of manual hacking to make the tests match libxml's output.
@@ -136,7 +136,7 @@ class Html5TestSanitizer < Loofah::TestCase
136
136
  check_sanitization(input, output, output, output)
137
137
  end
138
138
  end
139
-
139
+
140
140
  HTML5::WhiteList::ALLOWED_URI_DATA_MEDIATYPES.each do |data_uri_type|
141
141
  define_method "test_should_allow_data_#{data_uri_type}_uris" do
142
142
  input = %(<a href="data:#{data_uri_type}">foo</a>)
@@ -275,6 +275,38 @@ class Html5TestSanitizer < Loofah::TestCase
275
275
  assert_match %r/-0.05em/, sane.inner_html
276
276
  end
277
277
 
278
+ def test_css_function_sanitization_leaves_whitelisted_functions_calc
279
+ html = "<span style=\"width:calc(5%)\">"
280
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
281
+ assert_match %r/calc\(5%\)/, sane.inner_html
282
+
283
+ html = "<span style=\"width: calc(5%)\">"
284
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
285
+ assert_match %r/calc\(5%\)/, sane.inner_html
286
+ end
287
+
288
+ def test_css_function_sanitization_leaves_whitelisted_functions_rgb
289
+ html = '<span style="color: rgb(255, 0, 0)">'
290
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
291
+ assert_match %r/rgb\(255, 0, 0\)/, sane.inner_html
292
+ end
293
+
294
+ def test_css_function_sanitization_leaves_whitelisted_list_style_type
295
+ html = "<ol style='list-style-type:lower-greek;'></ol>"
296
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
297
+ assert_match %r/list-style-type:lower-greek/, sane.inner_html
298
+ end
299
+
300
+ def test_css_function_sanitization_strips_style_attributes_with_unsafe_functions
301
+ html = "<span style=\"width:attr(data-evil-attr)\">"
302
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
303
+ assert_match %r/<span><\/span>/, sane.inner_html
304
+
305
+ html = "<span style=\"width: attr(data-evil-attr)\">"
306
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
307
+ assert_match %r/<span><\/span>/, sane.inner_html
308
+ end
309
+
278
310
  def test_issue_90_slow_regex
279
311
  skip("timing tests are hard to make pass and have little regression-testing value")
280
312
 
@@ -16,66 +16,67 @@ class IntegrationTestAdHoc < Loofah::TestCase
16
16
  end
17
17
  end
18
18
 
19
- def test_removal_of_illegal_tag
20
- html = <<-HTML
19
+ context "tests" do
20
+ def test_removal_of_illegal_tag
21
+ html = <<-HTML
21
22
  following this there should be no jim tag
22
23
  <jim>jim</jim>
23
24
  was there?
24
25
  HTML
25
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
26
- assert sane.xpath("//jim").empty?
27
- end
26
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
27
+ assert sane.xpath("//jim").empty?
28
+ end
28
29
 
29
- def test_removal_of_illegal_attribute
30
- html = "<p class=bar foo=bar abbr=bar />"
31
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
32
- node = sane.xpath("//p").first
33
- assert node.attributes['class']
34
- assert node.attributes['abbr']
35
- assert_nil node.attributes['foo']
36
- end
30
+ def test_removal_of_illegal_attribute
31
+ html = "<p class=bar foo=bar abbr=bar />"
32
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
33
+ node = sane.xpath("//p").first
34
+ assert node.attributes['class']
35
+ assert node.attributes['abbr']
36
+ assert_nil node.attributes['foo']
37
+ end
37
38
 
38
- def test_removal_of_illegal_url_in_href
39
- html = <<-HTML
39
+ def test_removal_of_illegal_url_in_href
40
+ html = <<-HTML
40
41
  <a href='jimbo://jim.jim/'>this link should have its href removed because of illegal url</a>
41
42
  <a href='http://jim.jim/'>this link should be fine</a>
42
43
  HTML
43
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
44
- nodes = sane.xpath("//a")
45
- assert_nil nodes.first.attributes['href']
46
- assert nodes.last.attributes['href']
47
- end
44
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
45
+ nodes = sane.xpath("//a")
46
+ assert_nil nodes.first.attributes['href']
47
+ assert nodes.last.attributes['href']
48
+ end
48
49
 
49
- def test_css_sanitization
50
- html = "<p style='background-color: url(\"http://foo.com/\") ; background-color: #000 ;' />"
51
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
52
- assert_match %r/#000/, sane.inner_html
53
- refute_match %r/foo\.com/, sane.inner_html
54
- end
50
+ def test_css_sanitization
51
+ html = "<p style='background-color: url(\"http://foo.com/\") ; background-color: #000 ;' />"
52
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
53
+ assert_match %r/#000/, sane.inner_html
54
+ refute_match %r/foo\.com/, sane.inner_html
55
+ end
55
56
 
56
- def test_fragment_with_no_tags
57
- assert_equal "This fragment has no tags.", Loofah.scrub_fragment("This fragment has no tags.", :escape).to_xml
58
- end
57
+ def test_fragment_with_no_tags
58
+ assert_equal "This fragment has no tags.", Loofah.scrub_fragment("This fragment has no tags.", :escape).to_xml
59
+ end
59
60
 
60
- def test_fragment_in_p_tag
61
- assert_equal "<p>This fragment is in a p.</p>", Loofah.scrub_fragment("<p>This fragment is in a p.</p>", :escape).to_xml
62
- end
61
+ def test_fragment_in_p_tag
62
+ assert_equal "<p>This fragment is in a p.</p>", Loofah.scrub_fragment("<p>This fragment is in a p.</p>", :escape).to_xml
63
+ end
63
64
 
64
- def test_fragment_in_p_tag_plus_stuff
65
- assert_equal "<p>This fragment is in a p.</p>foo<strong>bar</strong>", Loofah.scrub_fragment("<p>This fragment is in a p.</p>foo<strong>bar</strong>", :escape).to_xml
66
- end
65
+ def test_fragment_in_p_tag_plus_stuff
66
+ assert_equal "<p>This fragment is in a p.</p>foo<strong>bar</strong>", Loofah.scrub_fragment("<p>This fragment is in a p.</p>foo<strong>bar</strong>", :escape).to_xml
67
+ end
67
68
 
68
- def test_fragment_with_text_nodes_leading_and_trailing
69
- assert_equal "text<p>fragment</p>text", Loofah.scrub_fragment("text<p>fragment</p>text", :escape).to_xml
70
- end
69
+ def test_fragment_with_text_nodes_leading_and_trailing
70
+ assert_equal "text<p>fragment</p>text", Loofah.scrub_fragment("text<p>fragment</p>text", :escape).to_xml
71
+ end
71
72
 
72
- def test_whitewash_on_fragment
73
- html = "safe<frameset rows=\"*\"><frame src=\"http://example.com\"></frameset> <b>description</b>"
74
- whitewashed = Loofah.scrub_document(html, :whitewash).xpath("/html/body/*").to_s
75
- assert_equal "<p>safe</p><b>description</b>", whitewashed.gsub("\n","")
76
- end
73
+ def test_whitewash_on_fragment
74
+ html = "safe<frameset rows=\"*\"><frame src=\"http://example.com\"></frameset> <b>description</b>"
75
+ whitewashed = Loofah.scrub_document(html, :whitewash).xpath("/html/body/*").to_s
76
+ assert_equal "<p>safe</p><b>description</b>", whitewashed.gsub("\n","")
77
+ end
77
78
 
78
- MSWORD_HTML = <<-EOHTML
79
+ MSWORD_HTML = <<-EOHTML
79
80
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8"><meta name="ProgId" content="Word.Document"><meta name="Generator" content="Microsoft Word 11"><meta name="Originator" content="Microsoft Word 11"><link rel="File-List" href="file:///C:%5CDOCUME%7E1%5CNICOLE%7E1%5CLOCALS%7E1%5CTemp%5Cmsohtml1%5C01%5Cclip_filelist.xml"><!--[if gte mso 9]><xml>
80
81
  <w:WordDocument>
81
82
  <w:View>Normal</w:View>
@@ -141,36 +142,52 @@ mso-bidi-language:#0400;}
141
142
  <p class="MsoNormal">Foo <b style="">BOLD<o:p></o:p></b></p>
142
143
  EOHTML
143
144
 
144
- def test_fragment_whitewash_on_microsofty_markup
145
- whitewashed = Loofah.fragment(MSWORD_HTML).scrub!(:whitewash)
146
- assert_equal "<p>Foo <b>BOLD</b></p>", whitewashed.to_s.strip
147
- end
145
+ def test_fragment_whitewash_on_microsofty_markup
146
+ whitewashed = Loofah.fragment(MSWORD_HTML).scrub!(:whitewash)
147
+ assert_equal "<p>Foo <b>BOLD</b></p>", whitewashed.to_s.strip
148
+ end
148
149
 
149
- def test_document_whitewash_on_microsofty_markup
150
- whitewashed = Loofah.document(MSWORD_HTML).scrub!(:whitewash)
151
- assert_match %r(<p>Foo <b>BOLD</b></p>), whitewashed.to_s
152
- assert_equal "<p>Foo <b>BOLD</b></p>", whitewashed.xpath("/html/body/*").to_s
153
- end
150
+ def test_document_whitewash_on_microsofty_markup
151
+ whitewashed = Loofah.document(MSWORD_HTML).scrub!(:whitewash)
152
+ assert_match %r(<p>Foo <b>BOLD</b></p>), whitewashed.to_s
153
+ assert_equal "<p>Foo <b>BOLD</b></p>", whitewashed.xpath("/html/body/*").to_s
154
+ end
154
155
 
155
- def test_return_empty_string_when_nothing_left
156
- assert_equal "", Loofah.scrub_document('<script>test</script>', :prune).text
157
- end
156
+ def test_return_empty_string_when_nothing_left
157
+ assert_equal "", Loofah.scrub_document('<script>test</script>', :prune).text
158
+ end
159
+
160
+ def test_nested_script_cdata_tags_should_be_scrubbed
161
+ html = "<script><script src='malicious.js'></script>"
162
+ stripped = Loofah.fragment(html).scrub!(:strip)
163
+ assert_empty stripped.xpath("//script")
164
+ refute_match("<script", stripped.to_html)
165
+ end
158
166
 
159
- def test_removal_of_all_tags
160
- html = <<-HTML
167
+ def test_nested_script_cdata_tags_should_be_scrubbed_2
168
+ html = "<script><script>alert('a');</script></script>"
169
+ stripped = Loofah.fragment(html).scrub!(:strip)
170
+ assert_empty stripped.xpath("//script")
171
+ refute_match("<script", stripped.to_html)
172
+ end
173
+
174
+ def test_removal_of_all_tags
175
+ html = <<-HTML
161
176
  What's up <strong>doc</strong>?
162
177
  HTML
163
- stripped = Loofah.scrub_document(html, :prune).text
164
- assert_equal %Q(What\'s up doc?).strip, stripped.strip
165
- end
178
+ stripped = Loofah.scrub_document(html, :prune).text
179
+ assert_equal %Q(What\'s up doc?).strip, stripped.strip
180
+ end
166
181
 
167
- def test_dont_remove_whitespace
168
- html = "Foo\nBar"
169
- assert_equal html, Loofah.scrub_document(html, :prune).text
170
- end
182
+ def test_dont_remove_whitespace
183
+ html = "Foo\nBar"
184
+ assert_equal html, Loofah.scrub_document(html, :prune).text
185
+ end
171
186
 
172
- def test_dont_remove_whitespace_between_tags
173
- html = "<p>Foo</p>\n<p>Bar</p>"
174
- assert_equal "Foo\nBar", Loofah.scrub_document(html, :prune).text
187
+ def test_dont_remove_whitespace_between_tags
188
+ html = "<p>Foo</p>\n<p>Bar</p>"
189
+ assert_equal "Foo\nBar", Loofah.scrub_document(html, :prune).text
190
+ end
175
191
  end
176
192
  end
193
+
@@ -19,11 +19,16 @@ class IntegrationTestHtml < Loofah::TestCase
19
19
  end
20
20
 
21
21
  context "#to_text" do
22
- it "add newlines before and after block elements" do
22
+ it "add newlines before and after html4 block elements" do
23
23
  html = Loofah.fragment "<div>tweedle<h1>beetle</h1>bottle<span>puddle</span>paddle<div>battle</div>muddle</div>"
24
24
  assert_equal "\ntweedle\nbeetle\nbottlepuddlepaddle\nbattle\nmuddle\n", html.to_text
25
25
  end
26
26
 
27
+ it "add newlines before and after html5 block elements" do
28
+ html = Loofah.fragment "<div>tweedle<section>beetle</section>bottle<span>puddle</span>paddle<div>battle</div>muddle</div>"
29
+ assert_equal "\ntweedle\nbeetle\nbottlepuddlepaddle\nbattle\nmuddle\n", html.to_text
30
+ end
31
+
27
32
  it "remove extraneous whitespace" do
28
33
  html = Loofah.fragment "<div>tweedle\n\n\t\n\s\nbeetle</div>"
29
34
  assert_equal "\ntweedle\n\nbeetle\n", html.to_text
@@ -47,11 +52,16 @@ class IntegrationTestHtml < Loofah::TestCase
47
52
  end
48
53
 
49
54
  context "#to_text" do
50
- it "add newlines before and after block elements" do
55
+ it "add newlines before and after html4 block elements" do
51
56
  html = Loofah.document "<div>tweedle<h1>beetle</h1>bottle<span>puddle</span>paddle<div>battle</div>muddle</div>"
52
57
  assert_equal "\ntweedle\nbeetle\nbottlepuddlepaddle\nbattle\nmuddle\n", html.to_text
53
58
  end
54
59
 
60
+ it "add newlines before and after html5 block elements" do
61
+ html = Loofah.document "<div>tweedle<section>beetle</section>bottle<span>puddle</span>paddle<div>battle</div>muddle</div>"
62
+ assert_equal "\ntweedle\nbeetle\nbottlepuddlepaddle\nbattle\nmuddle\n", html.to_text
63
+ end
64
+
55
65
  it "remove extraneous whitespace" do
56
66
  html = Loofah.document "<div>tweedle\n\n\t\n\s\nbeetle</div>"
57
67
  assert_equal "\ntweedle\n\nbeetle\n", html.to_text