loofah 0.4.2 → 2.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +604 -0
  3. data/MIT-LICENSE.txt +3 -1
  4. data/README.md +410 -0
  5. data/SECURITY.md +18 -0
  6. data/lib/loofah/concerns.rb +207 -0
  7. data/lib/loofah/elements.rb +98 -0
  8. data/lib/loofah/helpers.rb +91 -4
  9. data/lib/loofah/html4/document.rb +17 -0
  10. data/lib/loofah/html4/document_fragment.rb +15 -0
  11. data/lib/loofah/html5/document.rb +17 -0
  12. data/lib/loofah/html5/document_fragment.rb +15 -0
  13. data/lib/loofah/html5/libxml2_workarounds.rb +28 -0
  14. data/lib/loofah/html5/safelist.rb +1058 -0
  15. data/lib/loofah/html5/scrub.rb +211 -40
  16. data/lib/loofah/metahelpers.rb +18 -0
  17. data/lib/loofah/scrubber.rb +31 -13
  18. data/lib/loofah/scrubbers.rb +262 -31
  19. data/lib/loofah/version.rb +6 -0
  20. data/lib/loofah/xml/document.rb +2 -0
  21. data/lib/loofah/xml/document_fragment.rb +6 -9
  22. data/lib/loofah.rb +131 -52
  23. metadata +79 -158
  24. data/CHANGELOG.rdoc +0 -92
  25. data/DEPRECATED.rdoc +0 -12
  26. data/Manifest.txt +0 -34
  27. data/README.rdoc +0 -330
  28. data/Rakefile +0 -61
  29. data/TODO.rdoc +0 -4
  30. data/benchmark/benchmark.rb +0 -149
  31. data/benchmark/fragment.html +0 -96
  32. data/benchmark/helper.rb +0 -73
  33. data/benchmark/www.slashdot.com.html +0 -2560
  34. data/init.rb +0 -1
  35. data/lib/loofah/active_record.rb +0 -62
  36. data/lib/loofah/html/document.rb +0 -22
  37. data/lib/loofah/html/document_fragment.rb +0 -46
  38. data/lib/loofah/html5/whitelist.rb +0 -174
  39. data/lib/loofah/instance_methods.rb +0 -77
  40. data/lib/loofah/xss_foliate.rb +0 -212
  41. data/test/helper.rb +0 -8
  42. data/test/html5/test_sanitizer.rb +0 -248
  43. data/test/test_active_record.rb +0 -146
  44. data/test/test_ad_hoc.rb +0 -272
  45. data/test/test_api.rb +0 -128
  46. data/test/test_helpers.rb +0 -28
  47. data/test/test_scrubber.rb +0 -227
  48. data/test/test_scrubbers.rb +0 -144
  49. data/test/test_xss_foliate.rb +0 -171
  50. data.tar.gz.sig +0 -0
  51. metadata.gz.sig +0 -2
@@ -1,70 +1,241 @@
1
- require 'cgi'
1
+ # frozen_string_literal: true
2
+
3
+ require "cgi/escape"
4
+ require "cgi/util" if RUBY_VERSION < "3.5"
5
+ require "crass"
2
6
 
3
7
  module Loofah
4
8
  module HTML5 # :nodoc:
5
9
  module Scrub
10
+ CONTROL_CHARACTERS = /[`\u0000-\u0020\u007f\u0080-\u0101]/
11
+ CSS_KEYWORDISH = /\A(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,3}\.?\d{0,10}(ch|cm|r?em|ex|in|lh|mm|pc|pt|px|Q|vmax|vmin|vw|vh|%|,|\))?)\z/ # rubocop:disable Layout/LineLength
12
+ CRASS_SEMICOLON = { node: :semicolon, raw: ";" }
13
+ CSS_IMPORTANT = "!important"
14
+ CSS_WHITESPACE = " "
15
+ CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES = /\A(["'])?[^"']+\1\z/
16
+ DATA_ATTRIBUTE_NAME = /\Adata-[\w-]+\z/
17
+ URI_PROTOCOL_REGEX = /\A[a-z][a-z0-9+\-.]*:/ # RFC 3986
6
18
 
7
19
  class << self
20
+ def allowed_element?(element_name)
21
+ ::Loofah::HTML5::SafeList::ALLOWED_ELEMENTS_WITH_LIBXML2.include?(element_name)
22
+ end
8
23
 
9
24
  # alternative implementation of the html5lib attribute scrubbing algorithm
10
25
  def scrub_attributes(node)
11
26
  node.attribute_nodes.each do |attr_node|
12
27
  attr_name = if attr_node.namespace
13
- "#{attr_node.namespace.prefix}:#{attr_node.node_name}"
14
- else
15
- attr_node.node_name
16
- end
17
- attr_node.remove unless HashedWhiteList::ALLOWED_ATTRIBUTES[attr_name]
18
- if HashedWhiteList::ATTR_VAL_IS_URI[attr_name]
19
- # this block lifted nearly verbatim from HTML5 sanitization
20
- val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(/`|[\000-\040\177\s]+|\302[\200-\240]/,'').downcase
21
- if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and HashedWhiteList::ALLOWED_PROTOCOLS[val_unescaped.split(':')[0]].nil?
22
- attr_node.remove
23
- end
28
+ "#{attr_node.namespace.prefix}:#{attr_node.node_name}"
29
+ else
30
+ attr_node.node_name
24
31
  end
25
- if HashedWhiteList::SVG_ATTR_VAL_ALLOWS_REF[attr_name]
26
- attr_node.value = attr_node.value.gsub(/url\s*\(\s*[^#\s][^)]+?\)/m, ' ') if attr_node.value
32
+
33
+ if DATA_ATTRIBUTE_NAME.match?(attr_name)
34
+ next
27
35
  end
28
- if HashedWhiteList::SVG_ALLOW_LOCAL_HREF[node.name] && attr_name == 'xlink:href' && attr_node.value =~ /^\s*[^#\s].*/m
36
+
37
+ unless SafeList::ALLOWED_ATTRIBUTES.include?(attr_name)
29
38
  attr_node.remove
39
+ next
40
+ end
41
+
42
+ if SafeList::ATTR_VAL_IS_URI.include?(attr_name)
43
+ next if scrub_uri_attribute(attr_node)
30
44
  end
45
+
46
+ if SafeList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
47
+ scrub_attribute_that_allows_local_ref(attr_node)
48
+ end
49
+
50
+ next unless SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) &&
51
+ attr_name == "xlink:href" &&
52
+ attr_node.value =~ /^\s*[^#\s].*/m
53
+
54
+ attr_node.remove
55
+ next
31
56
  end
32
- if node.attributes['style']
33
- node['style'] = scrub_css(node.attributes['style'])
57
+
58
+ scrub_css_attribute(node)
59
+
60
+ node.attribute_nodes.each do |attr_node|
61
+ if attr_node.value !~ /[^[:space:]]/ && attr_node.name !~ DATA_ATTRIBUTE_NAME
62
+ node.remove_attribute(attr_node.name)
63
+ end
34
64
  end
65
+
66
+ force_correct_attribute_escaping!(node)
67
+ end
68
+
69
+ def scrub_css_attribute(node)
70
+ style = node.attributes["style"]
71
+ style.value = scrub_css(style.value) if style
35
72
  end
36
73
 
37
- # lifted nearly verbatim from html5lib
38
74
  def scrub_css(style)
39
- # disallow urls
40
- style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')
41
-
42
- # gauntlet
43
- return '' unless style =~ /^([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*$/
44
- return '' unless style =~ /^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$/
45
-
46
- clean = []
47
- style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
48
- next if val.empty?
49
- prop.downcase!
50
- if HashedWhiteList::ALLOWED_CSS_PROPERTIES[prop]
51
- clean << "#{prop}: #{val};"
52
- elsif %w[background border margin padding].include?(prop.split('-')[0])
53
- clean << "#{prop}: #{val};" unless val.split().any? do |keyword|
54
- HashedWhiteList::ALLOWED_CSS_KEYWORDS[keyword].nil? and
55
- keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/
75
+ url_flags = [:url, :bad_url]
76
+ style_tree = Crass.parse_properties(style)
77
+ sanitized_tree = []
78
+
79
+ style_tree.each do |node|
80
+ next unless node[:node] == :property
81
+ next if node[:children].any? do |child|
82
+ url_flags.include?(child[:node])
83
+ end
84
+
85
+ name = node[:name].downcase
86
+ next unless SafeList::ALLOWED_CSS_PROPERTIES.include?(name) ||
87
+ SafeList::ALLOWED_SVG_PROPERTIES.include?(name) ||
88
+ SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first)
89
+
90
+ value = node[:children].map do |child|
91
+ case child[:node]
92
+ when :whitespace
93
+ CSS_WHITESPACE
94
+ when :string
95
+ if CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES.match?(child[:raw])
96
+ Crass::Parser.stringify(child)
97
+ end
98
+ when :function
99
+ if SafeList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase)
100
+ Crass::Parser.stringify(child)
101
+ end
102
+ when :ident
103
+ keyword = child[:value]
104
+ if !SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first) ||
105
+ SafeList::ALLOWED_CSS_KEYWORDS.include?(keyword) ||
106
+ (keyword =~ CSS_KEYWORDISH)
107
+ keyword
108
+ end
109
+ else
110
+ child[:raw]
111
+ end
112
+ end.compact.join.strip
113
+
114
+ next if value.empty?
115
+
116
+ value << CSS_WHITESPACE << CSS_IMPORTANT if node[:important]
117
+ propstring = format("%s:%s", name, value)
118
+ sanitized_node = Crass.parse_properties(propstring).first
119
+ sanitized_tree << sanitized_node << CRASS_SEMICOLON
120
+ end
121
+
122
+ Crass::Parser.stringify(sanitized_tree)
123
+ end
124
+
125
+ def scrub_attribute_that_allows_local_ref(attr_node)
126
+ return unless attr_node.value
127
+
128
+ nodes = Crass::Parser.new(attr_node.value).parse_component_values
129
+
130
+ values = nodes.map do |node|
131
+ case node[:node]
132
+ when :url
133
+ if node[:value].start_with?("#")
134
+ node[:raw]
56
135
  end
57
- elsif HashedWhiteList::ALLOWED_SVG_PROPERTIES[prop]
58
- clean << "#{prop}: #{val};"
136
+ when :hash, :ident, :string
137
+ node[:raw]
138
+ end
139
+ end.compact
140
+
141
+ attr_node.value = values.join(" ")
142
+ end
143
+
144
+ # Returns true if the given URI string is safe, false otherwise.
145
+ # This method can be used to validate URI attribute values without
146
+ # requiring a Nokogiri DOM node.
147
+ def allowed_uri?(uri_string)
148
+ # this logic lifted nearly verbatim from HTML5 sanitization
149
+ val_unescaped = CGI.unescapeHTML(uri_string.gsub(CONTROL_CHARACTERS, "")).gsub("&colon;", ":").downcase
150
+ if URI_PROTOCOL_REGEX.match?(val_unescaped)
151
+ protocol = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0]
152
+ return false unless SafeList::ALLOWED_PROTOCOLS.include?(protocol)
153
+
154
+ if protocol == "data"
155
+ # permit only allowed data mediatypes
156
+ mediatype = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[1]
157
+ mediatype, _ = mediatype.split(/[;,]/)[0..1] if mediatype
158
+ return false if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
59
159
  end
60
160
  end
161
+ true
162
+ end
61
163
 
62
- style = clean.join(' ')
164
+ def scrub_uri_attribute(attr_node)
165
+ if allowed_uri?(attr_node.value)
166
+ false
167
+ else
168
+ attr_node.remove
169
+ true
170
+ end
63
171
  end
64
172
 
65
- end
173
+ #
174
+ # libxml2 >= 2.9.2 fails to escape comments within some attributes.
175
+ #
176
+ # see comments about CVE-2018-8048 within the tests for more information
177
+ #
178
+ def force_correct_attribute_escaping!(node)
179
+ return unless Nokogiri::VersionInfo.instance.libxml2?
180
+
181
+ node.attribute_nodes.each do |attr_node|
182
+ next unless LibxmlWorkarounds::BROKEN_ESCAPING_ATTRIBUTES.include?(attr_node.name)
183
+
184
+ tag_name = LibxmlWorkarounds::BROKEN_ESCAPING_ATTRIBUTES_QUALIFYING_TAG[attr_node.name]
185
+ next unless tag_name.nil? || tag_name == node.name
66
186
 
187
+ #
188
+ # this block is just like CGI.escape in Ruby 2.4, but
189
+ # only encodes space and double-quote, to mimic
190
+ # pre-2.9.2 behavior
191
+ #
192
+ encoding = attr_node.value.encoding
193
+ attr_node.value = attr_node.value.gsub(/[ "]/) do |m|
194
+ "%" + m.unpack("H2" * m.bytesize).join("%").upcase
195
+ end.force_encoding(encoding)
196
+ end
197
+ end
198
+
199
+ def cdata_needs_escaping?(node)
200
+ # Nokogiri's HTML4 parser on JRuby doesn't flag the child of a `style` tag as cdata, but it acts that way
201
+ node.cdata? || (Nokogiri.jruby? && node.text? && node.parent.name == "style")
202
+ end
203
+
204
+ def cdata_escape(node)
205
+ escaped_text = escape_tags(node.text)
206
+ if Nokogiri.jruby?
207
+ node.document.create_text_node(escaped_text)
208
+ else
209
+ node.document.create_cdata(escaped_text)
210
+ end
211
+ end
212
+
213
+ TABLE_FOR_ESCAPE_HTML__ = {
214
+ "<" => "&lt;",
215
+ ">" => "&gt;",
216
+ "&" => "&amp;",
217
+ }
218
+
219
+ def escape_tags(string)
220
+ # modified version of CGI.escapeHTML from ruby 3.1
221
+ enc = string.encoding
222
+ if enc.ascii_compatible?
223
+ string = string.b
224
+ string.gsub!(/[<>&]/, TABLE_FOR_ESCAPE_HTML__)
225
+ string.force_encoding(enc)
226
+ else
227
+ if enc.dummy?
228
+ origenc = enc
229
+ enc = Encoding::Converter.asciicompat_encoding(enc)
230
+ string = enc ? string.encode(enc) : string.b
231
+ end
232
+ table = Hash[TABLE_FOR_ESCAPE_HTML__.map { |pair| pair.map { |s| s.encode(enc) } }]
233
+ string = string.gsub(/#{"[<>&]".encode(enc)}/, table)
234
+ string.encode!(origenc) if origenc
235
+ string
236
+ end
237
+ end
238
+ end
67
239
  end
68
240
  end
69
241
  end
70
-
@@ -0,0 +1,18 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Loofah
4
+ module MetaHelpers # :nodoc:
5
+ class << self
6
+ def add_downcased_set_members_to_all_set_constants(mojule)
7
+ mojule.constants.each do |constant_sym|
8
+ constant = mojule.const_get(constant_sym)
9
+ next unless Set === constant
10
+
11
+ constant.dup.each do |member|
12
+ constant.add(member.downcase)
13
+ end
14
+ end
15
+ end
16
+ end
17
+ end
18
+ end
@@ -1,8 +1,10 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Loofah
2
4
  #
3
5
  # A RuntimeError raised when Loofah could not find an appropriate scrubber.
4
6
  #
5
- class ScrubberNotFound < RuntimeError ; end
7
+ class ScrubberNotFound < RuntimeError; end
6
8
 
7
9
  #
8
10
  # A Scrubber wraps up a block (or method) that is run on an HTML node (element):
@@ -23,7 +25,7 @@ module Loofah
23
25
  #
24
26
  # This can then be run on a document:
25
27
  #
26
- # Loofah.fragment("<span>foo</span><p>bar</p>").scrub!(span2div).to_s
28
+ # Loofah.html5_fragment("<span>foo</span><p>bar</p>").scrub!(span2div).to_s
27
29
  # # => "<div>foo</div><p>bar</p>"
28
30
  #
29
31
  # Scrubbers can be run on a document in either a top-down traversal (the
@@ -31,12 +33,11 @@ module Loofah
31
33
  # Scrubber::STOP to terminate the traversal of a subtree.
32
34
  #
33
35
  class Scrubber
34
-
35
36
  # Top-down Scrubbers may return CONTINUE to indicate that the subtree should be traversed.
36
37
  CONTINUE = Object.new.freeze
37
38
 
38
39
  # Top-down Scrubbers may return STOP to indicate that the subtree should not be traversed.
39
- STOP = Object.new.freeze
40
+ STOP = Object.new.freeze
40
41
 
41
42
  # When a scrubber is initialized, the :direction may be specified
42
43
  # as :top_down (the default) or :bottom_up.
@@ -64,9 +65,11 @@ module Loofah
64
65
  def initialize(options = {}, &block)
65
66
  direction = options[:direction] || :top_down
66
67
  unless [:top_down, :bottom_up].include?(direction)
67
- raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
68
+ raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
68
69
  end
69
- @direction, @block = direction, block
70
+
71
+ @direction = direction
72
+ @block = block
70
73
  end
71
74
 
72
75
  #
@@ -83,7 +86,18 @@ module Loofah
83
86
  # +scrub+, which will be called for each document node.
84
87
  #
85
88
  def scrub(node)
86
- raise ScrubberNotFound, "No scrub method has been defined on #{self.class.to_s}"
89
+ raise ScrubberNotFound, "No scrub method has been defined on #{self.class}"
90
+ end
91
+
92
+ #
93
+ # If the attribute is not set, add it
94
+ # If the attribute is set, don't overwrite the existing value
95
+ #
96
+ def append_attribute(node, attribute, value)
97
+ current_value = node.get_attribute(attribute) || ""
98
+ current_values = current_value.split(/\s+/)
99
+ updated_value = current_values | [value]
100
+ node.set_attribute(attribute, updated_value.join(" "))
87
101
  end
88
102
 
89
103
  private
@@ -91,11 +105,15 @@ module Loofah
91
105
  def html5lib_sanitize(node)
92
106
  case node.type
93
107
  when Nokogiri::XML::Node::ELEMENT_NODE
94
- if HTML5::HashedWhiteList::ALLOWED_ELEMENTS_WITH_LIBXML2[node.name]
95
- HTML5::Scrub.scrub_attributes node
108
+ if HTML5::Scrub.allowed_element?(node.name)
109
+ HTML5::Scrub.scrub_attributes(node)
96
110
  return Scrubber::CONTINUE
97
111
  end
98
112
  when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE
113
+ if HTML5::Scrub.cdata_needs_escaping?(node)
114
+ node.before(HTML5::Scrub.cdata_escape(node))
115
+ return Scrubber::STOP
116
+ end
99
117
  return Scrubber::CONTINUE
100
118
  end
101
119
  Scrubber::STOP
@@ -104,14 +122,14 @@ module Loofah
104
122
  def traverse_conditionally_top_down(node)
105
123
  if block
106
124
  return if block.call(node) == STOP
107
- else
108
- return if scrub(node) == STOP
125
+ elsif scrub(node) == STOP
126
+ return
109
127
  end
110
- node.children.each {|j| traverse_conditionally_top_down(j)}
128
+ node.children.each { |j| traverse_conditionally_top_down(j) }
111
129
  end
112
130
 
113
131
  def traverse_conditionally_bottom_up(node)
114
- node.children.each {|j| traverse_conditionally_bottom_up(j)}
132
+ node.children.each { |j| traverse_conditionally_bottom_up(j) }
115
133
  if block
116
134
  block.call(node)
117
135
  else