loofah 2.2.3 → 2.21.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +269 -31
  3. data/README.md +109 -124
  4. data/lib/loofah/concerns.rb +207 -0
  5. data/lib/loofah/elements.rb +85 -79
  6. data/lib/loofah/helpers.rb +37 -20
  7. data/lib/loofah/{html → html4}/document.rb +6 -7
  8. data/lib/loofah/html4/document_fragment.rb +15 -0
  9. data/lib/loofah/html5/document.rb +17 -0
  10. data/lib/loofah/html5/document_fragment.rb +15 -0
  11. data/lib/loofah/html5/libxml2_workarounds.rb +10 -8
  12. data/lib/loofah/html5/safelist.rb +1055 -0
  13. data/lib/loofah/html5/scrub.rb +153 -58
  14. data/lib/loofah/metahelpers.rb +11 -6
  15. data/lib/loofah/scrubber.rb +22 -15
  16. data/lib/loofah/scrubbers.rb +66 -55
  17. data/lib/loofah/version.rb +6 -0
  18. data/lib/loofah/xml/document.rb +2 -0
  19. data/lib/loofah/xml/document_fragment.rb +4 -7
  20. data/lib/loofah.rb +131 -38
  21. metadata +28 -216
  22. data/.gemtest +0 -0
  23. data/Gemfile +0 -22
  24. data/Manifest.txt +0 -40
  25. data/Rakefile +0 -79
  26. data/benchmark/benchmark.rb +0 -149
  27. data/benchmark/fragment.html +0 -96
  28. data/benchmark/helper.rb +0 -73
  29. data/benchmark/www.slashdot.com.html +0 -2560
  30. data/lib/loofah/html/document_fragment.rb +0 -40
  31. data/lib/loofah/html5/whitelist.rb +0 -186
  32. data/lib/loofah/instance_methods.rb +0 -127
  33. data/test/assets/msword.html +0 -63
  34. data/test/assets/testdata_sanitizer_tests1.dat +0 -502
  35. data/test/helper.rb +0 -18
  36. data/test/html5/test_sanitizer.rb +0 -382
  37. data/test/integration/test_ad_hoc.rb +0 -204
  38. data/test/integration/test_helpers.rb +0 -43
  39. data/test/integration/test_html.rb +0 -72
  40. data/test/integration/test_scrubbers.rb +0 -400
  41. data/test/integration/test_xml.rb +0 -55
  42. data/test/unit/test_api.rb +0 -142
  43. data/test/unit/test_encoding.rb +0 -20
  44. data/test/unit/test_helpers.rb +0 -62
  45. data/test/unit/test_scrubber.rb +0 -229
  46. data/test/unit/test_scrubbers.rb +0 -14
@@ -1,104 +1,160 @@
1
- require 'cgi'
2
- require 'crass'
1
+ # frozen_string_literal: true
2
+
3
+ require "cgi"
4
+ require "crass"
3
5
 
4
6
  module Loofah
5
7
  module HTML5 # :nodoc:
6
8
  module Scrub
7
-
8
9
  CONTROL_CHARACTERS = /[`\u0000-\u0020\u007f\u0080-\u0101]/
9
- CSS_KEYWORDISH = /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
10
- CRASS_SEMICOLON = {:node => :semicolon, :raw => ";"}
10
+ CSS_KEYWORDISH = /\A(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,3}\.?\d{0,10}(ch|cm|r?em|ex|in|lh|mm|pc|pt|px|Q|vmax|vmin|vw|vh|%|,|\))?)\z/ # rubocop:disable Layout/LineLength
11
+ CRASS_SEMICOLON = { node: :semicolon, raw: ";" }
12
+ CSS_IMPORTANT = "!important"
13
+ CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES = /\A(["'])?[^"']+\1\z/
14
+ DATA_ATTRIBUTE_NAME = /\Adata-[\w-]+\z/
11
15
 
12
16
  class << self
13
-
14
- def allowed_element? element_name
15
- ::Loofah::HTML5::WhiteList::ALLOWED_ELEMENTS_WITH_LIBXML2.include? element_name
17
+ def allowed_element?(element_name)
18
+ ::Loofah::HTML5::SafeList::ALLOWED_ELEMENTS_WITH_LIBXML2.include?(element_name)
16
19
  end
17
20
 
18
21
  # alternative implementation of the html5lib attribute scrubbing algorithm
19
- def scrub_attributes node
22
+ def scrub_attributes(node)
20
23
  node.attribute_nodes.each do |attr_node|
21
24
  attr_name = if attr_node.namespace
22
- "#{attr_node.namespace.prefix}:#{attr_node.node_name}"
23
- else
24
- attr_node.node_name
25
- end
25
+ "#{attr_node.namespace.prefix}:#{attr_node.node_name}"
26
+ else
27
+ attr_node.node_name
28
+ end
26
29
 
27
- if attr_name =~ /\Adata-[\w-]+\z/
30
+ if DATA_ATTRIBUTE_NAME.match?(attr_name)
28
31
  next
29
32
  end
30
33
 
31
- unless WhiteList::ALLOWED_ATTRIBUTES.include?(attr_name)
34
+ unless SafeList::ALLOWED_ATTRIBUTES.include?(attr_name)
32
35
  attr_node.remove
33
36
  next
34
37
  end
35
38
 
36
- if WhiteList::ATTR_VAL_IS_URI.include?(attr_name)
37
- # this block lifted nearly verbatim from HTML5 sanitization
38
- val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS,'').downcase
39
- if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && ! WhiteList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(WhiteList::PROTOCOL_SEPARATOR)[0])
40
- attr_node.remove
41
- next
42
- elsif val_unescaped.split(WhiteList::PROTOCOL_SEPARATOR)[0] == 'data'
43
- # permit only allowed data mediatypes
44
- mediatype = val_unescaped.split(WhiteList::PROTOCOL_SEPARATOR)[1]
45
- mediatype, _ = mediatype.split(';')[0..1] if mediatype
46
- if mediatype && !WhiteList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
47
- attr_node.remove
48
- next
49
- end
50
- end
51
- end
52
- if WhiteList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
53
- attr_node.value = attr_node.value.gsub(/url\s*\(\s*[^#\s][^)]+?\)/m, ' ') if attr_node.value
39
+ if SafeList::ATTR_VAL_IS_URI.include?(attr_name)
40
+ next if scrub_uri_attribute(attr_node)
54
41
  end
55
- if WhiteList::SVG_ALLOW_LOCAL_HREF.include?(node.name) && attr_name == 'xlink:href' && attr_node.value =~ /^\s*[^#\s].*/m
56
- attr_node.remove
57
- next
42
+
43
+ if SafeList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
44
+ scrub_attribute_that_allows_local_ref(attr_node)
58
45
  end
46
+
47
+ next unless SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) &&
48
+ attr_name == "xlink:href" &&
49
+ attr_node.value =~ /^\s*[^#\s].*/m
50
+
51
+ attr_node.remove
52
+ next
59
53
  end
60
54
 
61
- scrub_css_attribute node
55
+ scrub_css_attribute(node)
62
56
 
63
57
  node.attribute_nodes.each do |attr_node|
64
- node.remove_attribute(attr_node.name) if attr_node.value !~ /[^[:space:]]/
58
+ if attr_node.value !~ /[^[:space:]]/ && attr_node.name !~ DATA_ATTRIBUTE_NAME
59
+ node.remove_attribute(attr_node.name)
60
+ end
65
61
  end
66
62
 
67
- force_correct_attribute_escaping! node
63
+ force_correct_attribute_escaping!(node)
68
64
  end
69
65
 
70
- def scrub_css_attribute node
71
- style = node.attributes['style']
66
+ def scrub_css_attribute(node)
67
+ style = node.attributes["style"]
72
68
  style.value = scrub_css(style.value) if style
73
69
  end
74
70
 
75
- def scrub_css style
76
- style_tree = Crass.parse_properties style
71
+ def scrub_css(style)
72
+ url_flags = [:url, :bad_url]
73
+ style_tree = Crass.parse_properties(style)
77
74
  sanitized_tree = []
78
75
 
79
76
  style_tree.each do |node|
80
77
  next unless node[:node] == :property
81
78
  next if node[:children].any? do |child|
82
- [:url, :bad_url].include?(child[:node]) || (child[:node] == :function && !WhiteList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase))
79
+ url_flags.include?(child[:node])
83
80
  end
81
+
84
82
  name = node[:name].downcase
85
- if WhiteList::ALLOWED_CSS_PROPERTIES.include?(name) || WhiteList::ALLOWED_SVG_PROPERTIES.include?(name)
86
- sanitized_tree << node << CRASS_SEMICOLON
87
- elsif WhiteList::SHORTHAND_CSS_PROPERTIES.include?(name.split('-').first)
88
- value = node[:value].split.map do |keyword|
89
- if WhiteList::ALLOWED_CSS_KEYWORDS.include?(keyword) || keyword =~ CSS_KEYWORDISH
83
+ next unless SafeList::ALLOWED_CSS_PROPERTIES.include?(name) ||
84
+ SafeList::ALLOWED_SVG_PROPERTIES.include?(name) ||
85
+ SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first)
86
+
87
+ value = node[:children].map do |child|
88
+ case child[:node]
89
+ when :whitespace
90
+ nil
91
+ when :string
92
+ if CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES.match?(child[:raw])
93
+ Crass::Parser.stringify(child)
94
+ end
95
+ when :function
96
+ if SafeList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase)
97
+ Crass::Parser.stringify(child)
98
+ end
99
+ when :ident
100
+ keyword = child[:value]
101
+ if !SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first) ||
102
+ SafeList::ALLOWED_CSS_KEYWORDS.include?(keyword) ||
103
+ (keyword =~ CSS_KEYWORDISH)
90
104
  keyword
91
105
  end
92
- end.compact
93
- unless value.empty?
94
- propstring = sprintf "%s:%s", name, value.join(" ")
95
- sanitized_node = Crass.parse_properties(propstring).first
96
- sanitized_tree << sanitized_node << CRASS_SEMICOLON
106
+ else
107
+ child[:raw]
97
108
  end
98
- end
109
+ end.compact
110
+
111
+ next if value.empty?
112
+
113
+ value << CSS_IMPORTANT if node[:important]
114
+ propstring = format("%s:%s", name, value.join(" "))
115
+ sanitized_node = Crass.parse_properties(propstring).first
116
+ sanitized_tree << sanitized_node << CRASS_SEMICOLON
99
117
  end
100
118
 
101
- Crass::Parser.stringify sanitized_tree
119
+ Crass::Parser.stringify(sanitized_tree)
120
+ end
121
+
122
+ def scrub_attribute_that_allows_local_ref(attr_node)
123
+ return unless attr_node.value
124
+
125
+ nodes = Crass::Parser.new(attr_node.value).parse_component_values
126
+
127
+ values = nodes.map do |node|
128
+ case node[:node]
129
+ when :url
130
+ if node[:value].start_with?("#")
131
+ node[:raw]
132
+ end
133
+ when :hash, :ident, :string
134
+ node[:raw]
135
+ end
136
+ end.compact
137
+
138
+ attr_node.value = values.join(" ")
139
+ end
140
+
141
+ def scrub_uri_attribute(attr_node)
142
+ # this block lifted nearly verbatim from HTML5 sanitization
143
+ val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS, "").downcase
144
+ if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ &&
145
+ !SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
146
+ attr_node.remove
147
+ return true
148
+ elsif val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0] == "data"
149
+ # permit only allowed data mediatypes
150
+ mediatype = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[1]
151
+ mediatype, _ = mediatype.split(";")[0..1] if mediatype
152
+ if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
153
+ attr_node.remove
154
+ return true
155
+ end
156
+ end
157
+ false
102
158
  end
103
159
 
104
160
  #
@@ -106,7 +162,7 @@ module Loofah
106
162
  #
107
163
  # see comments about CVE-2018-8048 within the tests for more information
108
164
  #
109
- def force_correct_attribute_escaping! node
165
+ def force_correct_attribute_escaping!(node)
110
166
  return unless Nokogiri::VersionInfo.instance.libxml2?
111
167
 
112
168
  node.attribute_nodes.each do |attr_node|
@@ -122,11 +178,50 @@ module Loofah
122
178
  #
123
179
  encoding = attr_node.value.encoding
124
180
  attr_node.value = attr_node.value.gsub(/[ "]/) do |m|
125
- '%' + m.unpack('H2' * m.bytesize).join('%').upcase
181
+ "%" + m.unpack("H2" * m.bytesize).join("%").upcase
126
182
  end.force_encoding(encoding)
127
183
  end
128
184
  end
129
185
 
186
+ def cdata_needs_escaping?(node)
187
+ # Nokogiri's HTML4 parser on JRuby doesn't flag the child of a `style` tag as cdata, but it acts that way
188
+ node.cdata? || (Nokogiri.jruby? && node.text? && node.parent.name == "style")
189
+ end
190
+
191
+ def cdata_escape(node)
192
+ escaped_text = escape_tags(node.text)
193
+ if Nokogiri.jruby?
194
+ node.document.create_text_node(escaped_text)
195
+ else
196
+ node.document.create_cdata(escaped_text)
197
+ end
198
+ end
199
+
200
+ TABLE_FOR_ESCAPE_HTML__ = {
201
+ "<" => "&lt;",
202
+ ">" => "&gt;",
203
+ "&" => "&amp;",
204
+ }
205
+
206
+ def escape_tags(string)
207
+ # modified version of CGI.escapeHTML from ruby 3.1
208
+ enc = string.encoding
209
+ if enc.ascii_compatible?
210
+ string = string.b
211
+ string.gsub!(/[<>&]/, TABLE_FOR_ESCAPE_HTML__)
212
+ string.force_encoding(enc)
213
+ else
214
+ if enc.dummy?
215
+ origenc = enc
216
+ enc = Encoding::Converter.asciicompat_encoding(enc)
217
+ string = enc ? string.encode(enc) : string.b
218
+ end
219
+ table = Hash[TABLE_FOR_ESCAPE_HTML__.map { |pair| pair.map { |s| s.encode(enc) } }]
220
+ string = string.gsub(/#{"[<>&]".encode(enc)}/, table)
221
+ string.encode!(origenc) if origenc
222
+ string
223
+ end
224
+ end
130
225
  end
131
226
  end
132
227
  end
@@ -1,11 +1,16 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Loofah
2
4
  module MetaHelpers # :nodoc:
3
- def self.add_downcased_set_members_to_all_set_constants mojule
4
- mojule.constants.each do |constant_sym|
5
- constant = mojule.const_get constant_sym
6
- next unless Set === constant
7
- constant.dup.each do |member|
8
- constant.add member.downcase
5
+ class << self
6
+ def add_downcased_set_members_to_all_set_constants(mojule)
7
+ mojule.constants.each do |constant_sym|
8
+ constant = mojule.const_get(constant_sym)
9
+ next unless Set === constant
10
+
11
+ constant.dup.each do |member|
12
+ constant.add(member.downcase)
13
+ end
9
14
  end
10
15
  end
11
16
  end
@@ -1,8 +1,10 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Loofah
2
4
  #
3
5
  # A RuntimeError raised when Loofah could not find an appropriate scrubber.
4
6
  #
5
- class ScrubberNotFound < RuntimeError ; end
7
+ class ScrubberNotFound < RuntimeError; end
6
8
 
7
9
  #
8
10
  # A Scrubber wraps up a block (or method) that is run on an HTML node (element):
@@ -23,7 +25,7 @@ module Loofah
23
25
  #
24
26
  # This can then be run on a document:
25
27
  #
26
- # Loofah.fragment("<span>foo</span><p>bar</p>").scrub!(span2div).to_s
28
+ # Loofah.html5_fragment("<span>foo</span><p>bar</p>").scrub!(span2div).to_s
27
29
  # # => "<div>foo</div><p>bar</p>"
28
30
  #
29
31
  # Scrubbers can be run on a document in either a top-down traversal (the
@@ -31,12 +33,11 @@ module Loofah
31
33
  # Scrubber::STOP to terminate the traversal of a subtree.
32
34
  #
33
35
  class Scrubber
34
-
35
36
  # Top-down Scrubbers may return CONTINUE to indicate that the subtree should be traversed.
36
37
  CONTINUE = Object.new.freeze
37
38
 
38
39
  # Top-down Scrubbers may return STOP to indicate that the subtree should not be traversed.
39
- STOP = Object.new.freeze
40
+ STOP = Object.new.freeze
40
41
 
41
42
  # When a scrubber is initialized, the :direction may be specified
42
43
  # as :top_down (the default) or :bottom_up.
@@ -64,9 +65,11 @@ module Loofah
64
65
  def initialize(options = {}, &block)
65
66
  direction = options[:direction] || :top_down
66
67
  unless [:top_down, :bottom_up].include?(direction)
67
- raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
68
+ raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
68
69
  end
69
- @direction, @block = direction, block
70
+
71
+ @direction = direction
72
+ @block = block
70
73
  end
71
74
 
72
75
  #
@@ -83,7 +86,7 @@ module Loofah
83
86
  # +scrub+, which will be called for each document node.
84
87
  #
85
88
  def scrub(node)
86
- raise ScrubberNotFound, "No scrub method has been defined on #{self.class.to_s}"
89
+ raise ScrubberNotFound, "No scrub method has been defined on #{self.class}"
87
90
  end
88
91
 
89
92
  #
@@ -91,10 +94,10 @@ module Loofah
91
94
  # If the attribute is set, don't overwrite the existing value
92
95
  #
93
96
  def append_attribute(node, attribute, value)
94
- current_value = node.get_attribute(attribute) || ''
97
+ current_value = node.get_attribute(attribute) || ""
95
98
  current_values = current_value.split(/\s+/)
96
99
  updated_value = current_values | [value]
97
- node.set_attribute(attribute, updated_value.join(' '))
100
+ node.set_attribute(attribute, updated_value.join(" "))
98
101
  end
99
102
 
100
103
  private
@@ -102,11 +105,15 @@ module Loofah
102
105
  def html5lib_sanitize(node)
103
106
  case node.type
104
107
  when Nokogiri::XML::Node::ELEMENT_NODE
105
- if HTML5::Scrub.allowed_element? node.name
106
- HTML5::Scrub.scrub_attributes node
108
+ if HTML5::Scrub.allowed_element?(node.name)
109
+ HTML5::Scrub.scrub_attributes(node)
107
110
  return Scrubber::CONTINUE
108
111
  end
109
112
  when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE
113
+ if HTML5::Scrub.cdata_needs_escaping?(node)
114
+ node.before(HTML5::Scrub.cdata_escape(node))
115
+ return Scrubber::STOP
116
+ end
110
117
  return Scrubber::CONTINUE
111
118
  end
112
119
  Scrubber::STOP
@@ -115,14 +122,14 @@ module Loofah
115
122
  def traverse_conditionally_top_down(node)
116
123
  if block
117
124
  return if block.call(node) == STOP
118
- else
119
- return if scrub(node) == STOP
125
+ elsif scrub(node) == STOP
126
+ return
120
127
  end
121
- node.children.each {|j| traverse_conditionally_top_down(j)}
128
+ node.children.each { |j| traverse_conditionally_top_down(j) }
122
129
  end
123
130
 
124
131
  def traverse_conditionally_bottom_up(node)
125
- node.children.each {|j| traverse_conditionally_bottom_up(j)}
132
+ node.children.each { |j| traverse_conditionally_bottom_up(j) }
126
133
  if block
127
134
  block.call(node)
128
135
  else