loofah 2.2.3 → 2.21.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +269 -31
  3. data/README.md +109 -124
  4. data/lib/loofah/concerns.rb +207 -0
  5. data/lib/loofah/elements.rb +85 -79
  6. data/lib/loofah/helpers.rb +37 -20
  7. data/lib/loofah/{html → html4}/document.rb +6 -7
  8. data/lib/loofah/html4/document_fragment.rb +15 -0
  9. data/lib/loofah/html5/document.rb +17 -0
  10. data/lib/loofah/html5/document_fragment.rb +15 -0
  11. data/lib/loofah/html5/libxml2_workarounds.rb +10 -8
  12. data/lib/loofah/html5/safelist.rb +1055 -0
  13. data/lib/loofah/html5/scrub.rb +153 -58
  14. data/lib/loofah/metahelpers.rb +11 -6
  15. data/lib/loofah/scrubber.rb +22 -15
  16. data/lib/loofah/scrubbers.rb +66 -55
  17. data/lib/loofah/version.rb +6 -0
  18. data/lib/loofah/xml/document.rb +2 -0
  19. data/lib/loofah/xml/document_fragment.rb +4 -7
  20. data/lib/loofah.rb +131 -38
  21. metadata +28 -216
  22. data/.gemtest +0 -0
  23. data/Gemfile +0 -22
  24. data/Manifest.txt +0 -40
  25. data/Rakefile +0 -79
  26. data/benchmark/benchmark.rb +0 -149
  27. data/benchmark/fragment.html +0 -96
  28. data/benchmark/helper.rb +0 -73
  29. data/benchmark/www.slashdot.com.html +0 -2560
  30. data/lib/loofah/html/document_fragment.rb +0 -40
  31. data/lib/loofah/html5/whitelist.rb +0 -186
  32. data/lib/loofah/instance_methods.rb +0 -127
  33. data/test/assets/msword.html +0 -63
  34. data/test/assets/testdata_sanitizer_tests1.dat +0 -502
  35. data/test/helper.rb +0 -18
  36. data/test/html5/test_sanitizer.rb +0 -382
  37. data/test/integration/test_ad_hoc.rb +0 -204
  38. data/test/integration/test_helpers.rb +0 -43
  39. data/test/integration/test_html.rb +0 -72
  40. data/test/integration/test_scrubbers.rb +0 -400
  41. data/test/integration/test_xml.rb +0 -55
  42. data/test/unit/test_api.rb +0 -142
  43. data/test/unit/test_encoding.rb +0 -20
  44. data/test/unit/test_helpers.rb +0 -62
  45. data/test/unit/test_scrubber.rb +0 -229
  46. data/test/unit/test_scrubbers.rb +0 -14
@@ -1,104 +1,160 @@
1
- require 'cgi'
2
- require 'crass'
1
+ # frozen_string_literal: true
2
+
3
+ require "cgi"
4
+ require "crass"
3
5
 
4
6
  module Loofah
5
7
  module HTML5 # :nodoc:
6
8
  module Scrub
7
-
8
9
  CONTROL_CHARACTERS = /[`\u0000-\u0020\u007f\u0080-\u0101]/
9
- CSS_KEYWORDISH = /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
10
- CRASS_SEMICOLON = {:node => :semicolon, :raw => ";"}
10
+ CSS_KEYWORDISH = /\A(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,3}\.?\d{0,10}(ch|cm|r?em|ex|in|lh|mm|pc|pt|px|Q|vmax|vmin|vw|vh|%|,|\))?)\z/ # rubocop:disable Layout/LineLength
11
+ CRASS_SEMICOLON = { node: :semicolon, raw: ";" }
12
+ CSS_IMPORTANT = "!important"
13
+ CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES = /\A(["'])?[^"']+\1\z/
14
+ DATA_ATTRIBUTE_NAME = /\Adata-[\w-]+\z/
11
15
 
12
16
  class << self
13
-
14
- def allowed_element? element_name
15
- ::Loofah::HTML5::WhiteList::ALLOWED_ELEMENTS_WITH_LIBXML2.include? element_name
17
+ def allowed_element?(element_name)
18
+ ::Loofah::HTML5::SafeList::ALLOWED_ELEMENTS_WITH_LIBXML2.include?(element_name)
16
19
  end
17
20
 
18
21
  # alternative implementation of the html5lib attribute scrubbing algorithm
19
- def scrub_attributes node
22
+ def scrub_attributes(node)
20
23
  node.attribute_nodes.each do |attr_node|
21
24
  attr_name = if attr_node.namespace
22
- "#{attr_node.namespace.prefix}:#{attr_node.node_name}"
23
- else
24
- attr_node.node_name
25
- end
25
+ "#{attr_node.namespace.prefix}:#{attr_node.node_name}"
26
+ else
27
+ attr_node.node_name
28
+ end
26
29
 
27
- if attr_name =~ /\Adata-[\w-]+\z/
30
+ if DATA_ATTRIBUTE_NAME.match?(attr_name)
28
31
  next
29
32
  end
30
33
 
31
- unless WhiteList::ALLOWED_ATTRIBUTES.include?(attr_name)
34
+ unless SafeList::ALLOWED_ATTRIBUTES.include?(attr_name)
32
35
  attr_node.remove
33
36
  next
34
37
  end
35
38
 
36
- if WhiteList::ATTR_VAL_IS_URI.include?(attr_name)
37
- # this block lifted nearly verbatim from HTML5 sanitization
38
- val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS,'').downcase
39
- if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && ! WhiteList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(WhiteList::PROTOCOL_SEPARATOR)[0])
40
- attr_node.remove
41
- next
42
- elsif val_unescaped.split(WhiteList::PROTOCOL_SEPARATOR)[0] == 'data'
43
- # permit only allowed data mediatypes
44
- mediatype = val_unescaped.split(WhiteList::PROTOCOL_SEPARATOR)[1]
45
- mediatype, _ = mediatype.split(';')[0..1] if mediatype
46
- if mediatype && !WhiteList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
47
- attr_node.remove
48
- next
49
- end
50
- end
51
- end
52
- if WhiteList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
53
- attr_node.value = attr_node.value.gsub(/url\s*\(\s*[^#\s][^)]+?\)/m, ' ') if attr_node.value
39
+ if SafeList::ATTR_VAL_IS_URI.include?(attr_name)
40
+ next if scrub_uri_attribute(attr_node)
54
41
  end
55
- if WhiteList::SVG_ALLOW_LOCAL_HREF.include?(node.name) && attr_name == 'xlink:href' && attr_node.value =~ /^\s*[^#\s].*/m
56
- attr_node.remove
57
- next
42
+
43
+ if SafeList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
44
+ scrub_attribute_that_allows_local_ref(attr_node)
58
45
  end
46
+
47
+ next unless SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) &&
48
+ attr_name == "xlink:href" &&
49
+ attr_node.value =~ /^\s*[^#\s].*/m
50
+
51
+ attr_node.remove
52
+ next
59
53
  end
60
54
 
61
- scrub_css_attribute node
55
+ scrub_css_attribute(node)
62
56
 
63
57
  node.attribute_nodes.each do |attr_node|
64
- node.remove_attribute(attr_node.name) if attr_node.value !~ /[^[:space:]]/
58
+ if attr_node.value !~ /[^[:space:]]/ && attr_node.name !~ DATA_ATTRIBUTE_NAME
59
+ node.remove_attribute(attr_node.name)
60
+ end
65
61
  end
66
62
 
67
- force_correct_attribute_escaping! node
63
+ force_correct_attribute_escaping!(node)
68
64
  end
69
65
 
70
- def scrub_css_attribute node
71
- style = node.attributes['style']
66
+ def scrub_css_attribute(node)
67
+ style = node.attributes["style"]
72
68
  style.value = scrub_css(style.value) if style
73
69
  end
74
70
 
75
- def scrub_css style
76
- style_tree = Crass.parse_properties style
71
+ def scrub_css(style)
72
+ url_flags = [:url, :bad_url]
73
+ style_tree = Crass.parse_properties(style)
77
74
  sanitized_tree = []
78
75
 
79
76
  style_tree.each do |node|
80
77
  next unless node[:node] == :property
81
78
  next if node[:children].any? do |child|
82
- [:url, :bad_url].include?(child[:node]) || (child[:node] == :function && !WhiteList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase))
79
+ url_flags.include?(child[:node])
83
80
  end
81
+
84
82
  name = node[:name].downcase
85
- if WhiteList::ALLOWED_CSS_PROPERTIES.include?(name) || WhiteList::ALLOWED_SVG_PROPERTIES.include?(name)
86
- sanitized_tree << node << CRASS_SEMICOLON
87
- elsif WhiteList::SHORTHAND_CSS_PROPERTIES.include?(name.split('-').first)
88
- value = node[:value].split.map do |keyword|
89
- if WhiteList::ALLOWED_CSS_KEYWORDS.include?(keyword) || keyword =~ CSS_KEYWORDISH
83
+ next unless SafeList::ALLOWED_CSS_PROPERTIES.include?(name) ||
84
+ SafeList::ALLOWED_SVG_PROPERTIES.include?(name) ||
85
+ SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first)
86
+
87
+ value = node[:children].map do |child|
88
+ case child[:node]
89
+ when :whitespace
90
+ nil
91
+ when :string
92
+ if CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES.match?(child[:raw])
93
+ Crass::Parser.stringify(child)
94
+ end
95
+ when :function
96
+ if SafeList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase)
97
+ Crass::Parser.stringify(child)
98
+ end
99
+ when :ident
100
+ keyword = child[:value]
101
+ if !SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first) ||
102
+ SafeList::ALLOWED_CSS_KEYWORDS.include?(keyword) ||
103
+ (keyword =~ CSS_KEYWORDISH)
90
104
  keyword
91
105
  end
92
- end.compact
93
- unless value.empty?
94
- propstring = sprintf "%s:%s", name, value.join(" ")
95
- sanitized_node = Crass.parse_properties(propstring).first
96
- sanitized_tree << sanitized_node << CRASS_SEMICOLON
106
+ else
107
+ child[:raw]
97
108
  end
98
- end
109
+ end.compact
110
+
111
+ next if value.empty?
112
+
113
+ value << CSS_IMPORTANT if node[:important]
114
+ propstring = format("%s:%s", name, value.join(" "))
115
+ sanitized_node = Crass.parse_properties(propstring).first
116
+ sanitized_tree << sanitized_node << CRASS_SEMICOLON
99
117
  end
100
118
 
101
- Crass::Parser.stringify sanitized_tree
119
+ Crass::Parser.stringify(sanitized_tree)
120
+ end
121
+
122
+ def scrub_attribute_that_allows_local_ref(attr_node)
123
+ return unless attr_node.value
124
+
125
+ nodes = Crass::Parser.new(attr_node.value).parse_component_values
126
+
127
+ values = nodes.map do |node|
128
+ case node[:node]
129
+ when :url
130
+ if node[:value].start_with?("#")
131
+ node[:raw]
132
+ end
133
+ when :hash, :ident, :string
134
+ node[:raw]
135
+ end
136
+ end.compact
137
+
138
+ attr_node.value = values.join(" ")
139
+ end
140
+
141
+ def scrub_uri_attribute(attr_node)
142
+ # this block lifted nearly verbatim from HTML5 sanitization
143
+ val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS, "").downcase
144
+ if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ &&
145
+ !SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
146
+ attr_node.remove
147
+ return true
148
+ elsif val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0] == "data"
149
+ # permit only allowed data mediatypes
150
+ mediatype = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[1]
151
+ mediatype, _ = mediatype.split(";")[0..1] if mediatype
152
+ if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
153
+ attr_node.remove
154
+ return true
155
+ end
156
+ end
157
+ false
102
158
  end
103
159
 
104
160
  #
@@ -106,7 +162,7 @@ module Loofah
106
162
  #
107
163
  # see comments about CVE-2018-8048 within the tests for more information
108
164
  #
109
- def force_correct_attribute_escaping! node
165
+ def force_correct_attribute_escaping!(node)
110
166
  return unless Nokogiri::VersionInfo.instance.libxml2?
111
167
 
112
168
  node.attribute_nodes.each do |attr_node|
@@ -122,11 +178,50 @@ module Loofah
122
178
  #
123
179
  encoding = attr_node.value.encoding
124
180
  attr_node.value = attr_node.value.gsub(/[ "]/) do |m|
125
- '%' + m.unpack('H2' * m.bytesize).join('%').upcase
181
+ "%" + m.unpack("H2" * m.bytesize).join("%").upcase
126
182
  end.force_encoding(encoding)
127
183
  end
128
184
  end
129
185
 
186
+ def cdata_needs_escaping?(node)
187
+ # Nokogiri's HTML4 parser on JRuby doesn't flag the child of a `style` tag as cdata, but it acts that way
188
+ node.cdata? || (Nokogiri.jruby? && node.text? && node.parent.name == "style")
189
+ end
190
+
191
+ def cdata_escape(node)
192
+ escaped_text = escape_tags(node.text)
193
+ if Nokogiri.jruby?
194
+ node.document.create_text_node(escaped_text)
195
+ else
196
+ node.document.create_cdata(escaped_text)
197
+ end
198
+ end
199
+
200
+ TABLE_FOR_ESCAPE_HTML__ = {
201
+ "<" => "&lt;",
202
+ ">" => "&gt;",
203
+ "&" => "&amp;",
204
+ }
205
+
206
+ def escape_tags(string)
207
+ # modified version of CGI.escapeHTML from ruby 3.1
208
+ enc = string.encoding
209
+ if enc.ascii_compatible?
210
+ string = string.b
211
+ string.gsub!(/[<>&]/, TABLE_FOR_ESCAPE_HTML__)
212
+ string.force_encoding(enc)
213
+ else
214
+ if enc.dummy?
215
+ origenc = enc
216
+ enc = Encoding::Converter.asciicompat_encoding(enc)
217
+ string = enc ? string.encode(enc) : string.b
218
+ end
219
+ table = Hash[TABLE_FOR_ESCAPE_HTML__.map { |pair| pair.map { |s| s.encode(enc) } }]
220
+ string = string.gsub(/#{"[<>&]".encode(enc)}/, table)
221
+ string.encode!(origenc) if origenc
222
+ string
223
+ end
224
+ end
130
225
  end
131
226
  end
132
227
  end
@@ -1,11 +1,16 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Loofah
2
4
  module MetaHelpers # :nodoc:
3
- def self.add_downcased_set_members_to_all_set_constants mojule
4
- mojule.constants.each do |constant_sym|
5
- constant = mojule.const_get constant_sym
6
- next unless Set === constant
7
- constant.dup.each do |member|
8
- constant.add member.downcase
5
+ class << self
6
+ def add_downcased_set_members_to_all_set_constants(mojule)
7
+ mojule.constants.each do |constant_sym|
8
+ constant = mojule.const_get(constant_sym)
9
+ next unless Set === constant
10
+
11
+ constant.dup.each do |member|
12
+ constant.add(member.downcase)
13
+ end
9
14
  end
10
15
  end
11
16
  end
@@ -1,8 +1,10 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Loofah
2
4
  #
3
5
  # A RuntimeError raised when Loofah could not find an appropriate scrubber.
4
6
  #
5
- class ScrubberNotFound < RuntimeError ; end
7
+ class ScrubberNotFound < RuntimeError; end
6
8
 
7
9
  #
8
10
  # A Scrubber wraps up a block (or method) that is run on an HTML node (element):
@@ -23,7 +25,7 @@ module Loofah
23
25
  #
24
26
  # This can then be run on a document:
25
27
  #
26
- # Loofah.fragment("<span>foo</span><p>bar</p>").scrub!(span2div).to_s
28
+ # Loofah.html5_fragment("<span>foo</span><p>bar</p>").scrub!(span2div).to_s
27
29
  # # => "<div>foo</div><p>bar</p>"
28
30
  #
29
31
  # Scrubbers can be run on a document in either a top-down traversal (the
@@ -31,12 +33,11 @@ module Loofah
31
33
  # Scrubber::STOP to terminate the traversal of a subtree.
32
34
  #
33
35
  class Scrubber
34
-
35
36
  # Top-down Scrubbers may return CONTINUE to indicate that the subtree should be traversed.
36
37
  CONTINUE = Object.new.freeze
37
38
 
38
39
  # Top-down Scrubbers may return STOP to indicate that the subtree should not be traversed.
39
- STOP = Object.new.freeze
40
+ STOP = Object.new.freeze
40
41
 
41
42
  # When a scrubber is initialized, the :direction may be specified
42
43
  # as :top_down (the default) or :bottom_up.
@@ -64,9 +65,11 @@ module Loofah
64
65
  def initialize(options = {}, &block)
65
66
  direction = options[:direction] || :top_down
66
67
  unless [:top_down, :bottom_up].include?(direction)
67
- raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
68
+ raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
68
69
  end
69
- @direction, @block = direction, block
70
+
71
+ @direction = direction
72
+ @block = block
70
73
  end
71
74
 
72
75
  #
@@ -83,7 +86,7 @@ module Loofah
83
86
  # +scrub+, which will be called for each document node.
84
87
  #
85
88
  def scrub(node)
86
- raise ScrubberNotFound, "No scrub method has been defined on #{self.class.to_s}"
89
+ raise ScrubberNotFound, "No scrub method has been defined on #{self.class}"
87
90
  end
88
91
 
89
92
  #
@@ -91,10 +94,10 @@ module Loofah
91
94
  # If the attribute is set, don't overwrite the existing value
92
95
  #
93
96
  def append_attribute(node, attribute, value)
94
- current_value = node.get_attribute(attribute) || ''
97
+ current_value = node.get_attribute(attribute) || ""
95
98
  current_values = current_value.split(/\s+/)
96
99
  updated_value = current_values | [value]
97
- node.set_attribute(attribute, updated_value.join(' '))
100
+ node.set_attribute(attribute, updated_value.join(" "))
98
101
  end
99
102
 
100
103
  private
@@ -102,11 +105,15 @@ module Loofah
102
105
  def html5lib_sanitize(node)
103
106
  case node.type
104
107
  when Nokogiri::XML::Node::ELEMENT_NODE
105
- if HTML5::Scrub.allowed_element? node.name
106
- HTML5::Scrub.scrub_attributes node
108
+ if HTML5::Scrub.allowed_element?(node.name)
109
+ HTML5::Scrub.scrub_attributes(node)
107
110
  return Scrubber::CONTINUE
108
111
  end
109
112
  when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE
113
+ if HTML5::Scrub.cdata_needs_escaping?(node)
114
+ node.before(HTML5::Scrub.cdata_escape(node))
115
+ return Scrubber::STOP
116
+ end
110
117
  return Scrubber::CONTINUE
111
118
  end
112
119
  Scrubber::STOP
@@ -115,14 +122,14 @@ module Loofah
115
122
  def traverse_conditionally_top_down(node)
116
123
  if block
117
124
  return if block.call(node) == STOP
118
- else
119
- return if scrub(node) == STOP
125
+ elsif scrub(node) == STOP
126
+ return
120
127
  end
121
- node.children.each {|j| traverse_conditionally_top_down(j)}
128
+ node.children.each { |j| traverse_conditionally_top_down(j) }
122
129
  end
123
130
 
124
131
  def traverse_conditionally_bottom_up(node)
125
- node.children.each {|j| traverse_conditionally_bottom_up(j)}
132
+ node.children.each { |j| traverse_conditionally_bottom_up(j) }
126
133
  if block
127
134
  block.call(node)
128
135
  else