loofah 2.2.3 → 2.21.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +269 -31
- data/README.md +109 -124
- data/lib/loofah/concerns.rb +207 -0
- data/lib/loofah/elements.rb +85 -79
- data/lib/loofah/helpers.rb +37 -20
- data/lib/loofah/{html → html4}/document.rb +6 -7
- data/lib/loofah/html4/document_fragment.rb +15 -0
- data/lib/loofah/html5/document.rb +17 -0
- data/lib/loofah/html5/document_fragment.rb +15 -0
- data/lib/loofah/html5/libxml2_workarounds.rb +10 -8
- data/lib/loofah/html5/safelist.rb +1055 -0
- data/lib/loofah/html5/scrub.rb +153 -58
- data/lib/loofah/metahelpers.rb +11 -6
- data/lib/loofah/scrubber.rb +22 -15
- data/lib/loofah/scrubbers.rb +66 -55
- data/lib/loofah/version.rb +6 -0
- data/lib/loofah/xml/document.rb +2 -0
- data/lib/loofah/xml/document_fragment.rb +4 -7
- data/lib/loofah.rb +131 -38
- metadata +28 -216
- data/.gemtest +0 -0
- data/Gemfile +0 -22
- data/Manifest.txt +0 -40
- data/Rakefile +0 -79
- data/benchmark/benchmark.rb +0 -149
- data/benchmark/fragment.html +0 -96
- data/benchmark/helper.rb +0 -73
- data/benchmark/www.slashdot.com.html +0 -2560
- data/lib/loofah/html/document_fragment.rb +0 -40
- data/lib/loofah/html5/whitelist.rb +0 -186
- data/lib/loofah/instance_methods.rb +0 -127
- data/test/assets/msword.html +0 -63
- data/test/assets/testdata_sanitizer_tests1.dat +0 -502
- data/test/helper.rb +0 -18
- data/test/html5/test_sanitizer.rb +0 -382
- data/test/integration/test_ad_hoc.rb +0 -204
- data/test/integration/test_helpers.rb +0 -43
- data/test/integration/test_html.rb +0 -72
- data/test/integration/test_scrubbers.rb +0 -400
- data/test/integration/test_xml.rb +0 -55
- data/test/unit/test_api.rb +0 -142
- data/test/unit/test_encoding.rb +0 -20
- data/test/unit/test_helpers.rb +0 -62
- data/test/unit/test_scrubber.rb +0 -229
- data/test/unit/test_scrubbers.rb +0 -14
data/lib/loofah/html5/scrub.rb
CHANGED
@@ -1,104 +1,160 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "cgi"
|
4
|
+
require "crass"
|
3
5
|
|
4
6
|
module Loofah
|
5
7
|
module HTML5 # :nodoc:
|
6
8
|
module Scrub
|
7
|
-
|
8
9
|
CONTROL_CHARACTERS = /[`\u0000-\u0020\u007f\u0080-\u0101]/
|
9
|
-
CSS_KEYWORDISH = /\A(#[0-9a-
|
10
|
-
CRASS_SEMICOLON = {:
|
10
|
+
CSS_KEYWORDISH = /\A(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,3}\.?\d{0,10}(ch|cm|r?em|ex|in|lh|mm|pc|pt|px|Q|vmax|vmin|vw|vh|%|,|\))?)\z/ # rubocop:disable Layout/LineLength
|
11
|
+
CRASS_SEMICOLON = { node: :semicolon, raw: ";" }
|
12
|
+
CSS_IMPORTANT = "!important"
|
13
|
+
CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES = /\A(["'])?[^"']+\1\z/
|
14
|
+
DATA_ATTRIBUTE_NAME = /\Adata-[\w-]+\z/
|
11
15
|
|
12
16
|
class << self
|
13
|
-
|
14
|
-
|
15
|
-
::Loofah::HTML5::WhiteList::ALLOWED_ELEMENTS_WITH_LIBXML2.include? element_name
|
17
|
+
def allowed_element?(element_name)
|
18
|
+
::Loofah::HTML5::SafeList::ALLOWED_ELEMENTS_WITH_LIBXML2.include?(element_name)
|
16
19
|
end
|
17
20
|
|
18
21
|
# alternative implementation of the html5lib attribute scrubbing algorithm
|
19
|
-
def scrub_attributes
|
22
|
+
def scrub_attributes(node)
|
20
23
|
node.attribute_nodes.each do |attr_node|
|
21
24
|
attr_name = if attr_node.namespace
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
25
|
+
"#{attr_node.namespace.prefix}:#{attr_node.node_name}"
|
26
|
+
else
|
27
|
+
attr_node.node_name
|
28
|
+
end
|
26
29
|
|
27
|
-
if attr_name
|
30
|
+
if DATA_ATTRIBUTE_NAME.match?(attr_name)
|
28
31
|
next
|
29
32
|
end
|
30
33
|
|
31
|
-
unless
|
34
|
+
unless SafeList::ALLOWED_ATTRIBUTES.include?(attr_name)
|
32
35
|
attr_node.remove
|
33
36
|
next
|
34
37
|
end
|
35
38
|
|
36
|
-
if
|
37
|
-
|
38
|
-
val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS,'').downcase
|
39
|
-
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && ! WhiteList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(WhiteList::PROTOCOL_SEPARATOR)[0])
|
40
|
-
attr_node.remove
|
41
|
-
next
|
42
|
-
elsif val_unescaped.split(WhiteList::PROTOCOL_SEPARATOR)[0] == 'data'
|
43
|
-
# permit only allowed data mediatypes
|
44
|
-
mediatype = val_unescaped.split(WhiteList::PROTOCOL_SEPARATOR)[1]
|
45
|
-
mediatype, _ = mediatype.split(';')[0..1] if mediatype
|
46
|
-
if mediatype && !WhiteList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
|
47
|
-
attr_node.remove
|
48
|
-
next
|
49
|
-
end
|
50
|
-
end
|
51
|
-
end
|
52
|
-
if WhiteList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
|
53
|
-
attr_node.value = attr_node.value.gsub(/url\s*\(\s*[^#\s][^)]+?\)/m, ' ') if attr_node.value
|
39
|
+
if SafeList::ATTR_VAL_IS_URI.include?(attr_name)
|
40
|
+
next if scrub_uri_attribute(attr_node)
|
54
41
|
end
|
55
|
-
|
56
|
-
|
57
|
-
|
42
|
+
|
43
|
+
if SafeList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
|
44
|
+
scrub_attribute_that_allows_local_ref(attr_node)
|
58
45
|
end
|
46
|
+
|
47
|
+
next unless SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) &&
|
48
|
+
attr_name == "xlink:href" &&
|
49
|
+
attr_node.value =~ /^\s*[^#\s].*/m
|
50
|
+
|
51
|
+
attr_node.remove
|
52
|
+
next
|
59
53
|
end
|
60
54
|
|
61
|
-
scrub_css_attribute
|
55
|
+
scrub_css_attribute(node)
|
62
56
|
|
63
57
|
node.attribute_nodes.each do |attr_node|
|
64
|
-
|
58
|
+
if attr_node.value !~ /[^[:space:]]/ && attr_node.name !~ DATA_ATTRIBUTE_NAME
|
59
|
+
node.remove_attribute(attr_node.name)
|
60
|
+
end
|
65
61
|
end
|
66
62
|
|
67
|
-
force_correct_attribute_escaping!
|
63
|
+
force_correct_attribute_escaping!(node)
|
68
64
|
end
|
69
65
|
|
70
|
-
def scrub_css_attribute
|
71
|
-
style = node.attributes[
|
66
|
+
def scrub_css_attribute(node)
|
67
|
+
style = node.attributes["style"]
|
72
68
|
style.value = scrub_css(style.value) if style
|
73
69
|
end
|
74
70
|
|
75
|
-
def scrub_css
|
76
|
-
|
71
|
+
def scrub_css(style)
|
72
|
+
url_flags = [:url, :bad_url]
|
73
|
+
style_tree = Crass.parse_properties(style)
|
77
74
|
sanitized_tree = []
|
78
75
|
|
79
76
|
style_tree.each do |node|
|
80
77
|
next unless node[:node] == :property
|
81
78
|
next if node[:children].any? do |child|
|
82
|
-
|
79
|
+
url_flags.include?(child[:node])
|
83
80
|
end
|
81
|
+
|
84
82
|
name = node[:name].downcase
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
83
|
+
next unless SafeList::ALLOWED_CSS_PROPERTIES.include?(name) ||
|
84
|
+
SafeList::ALLOWED_SVG_PROPERTIES.include?(name) ||
|
85
|
+
SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first)
|
86
|
+
|
87
|
+
value = node[:children].map do |child|
|
88
|
+
case child[:node]
|
89
|
+
when :whitespace
|
90
|
+
nil
|
91
|
+
when :string
|
92
|
+
if CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES.match?(child[:raw])
|
93
|
+
Crass::Parser.stringify(child)
|
94
|
+
end
|
95
|
+
when :function
|
96
|
+
if SafeList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase)
|
97
|
+
Crass::Parser.stringify(child)
|
98
|
+
end
|
99
|
+
when :ident
|
100
|
+
keyword = child[:value]
|
101
|
+
if !SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first) ||
|
102
|
+
SafeList::ALLOWED_CSS_KEYWORDS.include?(keyword) ||
|
103
|
+
(keyword =~ CSS_KEYWORDISH)
|
90
104
|
keyword
|
91
105
|
end
|
92
|
-
|
93
|
-
|
94
|
-
propstring = sprintf "%s:%s", name, value.join(" ")
|
95
|
-
sanitized_node = Crass.parse_properties(propstring).first
|
96
|
-
sanitized_tree << sanitized_node << CRASS_SEMICOLON
|
106
|
+
else
|
107
|
+
child[:raw]
|
97
108
|
end
|
98
|
-
end
|
109
|
+
end.compact
|
110
|
+
|
111
|
+
next if value.empty?
|
112
|
+
|
113
|
+
value << CSS_IMPORTANT if node[:important]
|
114
|
+
propstring = format("%s:%s", name, value.join(" "))
|
115
|
+
sanitized_node = Crass.parse_properties(propstring).first
|
116
|
+
sanitized_tree << sanitized_node << CRASS_SEMICOLON
|
99
117
|
end
|
100
118
|
|
101
|
-
Crass::Parser.stringify
|
119
|
+
Crass::Parser.stringify(sanitized_tree)
|
120
|
+
end
|
121
|
+
|
122
|
+
def scrub_attribute_that_allows_local_ref(attr_node)
|
123
|
+
return unless attr_node.value
|
124
|
+
|
125
|
+
nodes = Crass::Parser.new(attr_node.value).parse_component_values
|
126
|
+
|
127
|
+
values = nodes.map do |node|
|
128
|
+
case node[:node]
|
129
|
+
when :url
|
130
|
+
if node[:value].start_with?("#")
|
131
|
+
node[:raw]
|
132
|
+
end
|
133
|
+
when :hash, :ident, :string
|
134
|
+
node[:raw]
|
135
|
+
end
|
136
|
+
end.compact
|
137
|
+
|
138
|
+
attr_node.value = values.join(" ")
|
139
|
+
end
|
140
|
+
|
141
|
+
def scrub_uri_attribute(attr_node)
|
142
|
+
# this block lifted nearly verbatim from HTML5 sanitization
|
143
|
+
val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS, "").downcase
|
144
|
+
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ &&
|
145
|
+
!SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
|
146
|
+
attr_node.remove
|
147
|
+
return true
|
148
|
+
elsif val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0] == "data"
|
149
|
+
# permit only allowed data mediatypes
|
150
|
+
mediatype = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[1]
|
151
|
+
mediatype, _ = mediatype.split(";")[0..1] if mediatype
|
152
|
+
if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
|
153
|
+
attr_node.remove
|
154
|
+
return true
|
155
|
+
end
|
156
|
+
end
|
157
|
+
false
|
102
158
|
end
|
103
159
|
|
104
160
|
#
|
@@ -106,7 +162,7 @@ module Loofah
|
|
106
162
|
#
|
107
163
|
# see comments about CVE-2018-8048 within the tests for more information
|
108
164
|
#
|
109
|
-
def force_correct_attribute_escaping!
|
165
|
+
def force_correct_attribute_escaping!(node)
|
110
166
|
return unless Nokogiri::VersionInfo.instance.libxml2?
|
111
167
|
|
112
168
|
node.attribute_nodes.each do |attr_node|
|
@@ -122,11 +178,50 @@ module Loofah
|
|
122
178
|
#
|
123
179
|
encoding = attr_node.value.encoding
|
124
180
|
attr_node.value = attr_node.value.gsub(/[ "]/) do |m|
|
125
|
-
|
181
|
+
"%" + m.unpack("H2" * m.bytesize).join("%").upcase
|
126
182
|
end.force_encoding(encoding)
|
127
183
|
end
|
128
184
|
end
|
129
185
|
|
186
|
+
def cdata_needs_escaping?(node)
|
187
|
+
# Nokogiri's HTML4 parser on JRuby doesn't flag the child of a `style` tag as cdata, but it acts that way
|
188
|
+
node.cdata? || (Nokogiri.jruby? && node.text? && node.parent.name == "style")
|
189
|
+
end
|
190
|
+
|
191
|
+
def cdata_escape(node)
|
192
|
+
escaped_text = escape_tags(node.text)
|
193
|
+
if Nokogiri.jruby?
|
194
|
+
node.document.create_text_node(escaped_text)
|
195
|
+
else
|
196
|
+
node.document.create_cdata(escaped_text)
|
197
|
+
end
|
198
|
+
end
|
199
|
+
|
200
|
+
TABLE_FOR_ESCAPE_HTML__ = {
|
201
|
+
"<" => "&lt;",
|
202
|
+
">" => "&gt;",
|
203
|
+
"&" => "&amp;",
|
204
|
+
}
|
205
|
+
|
206
|
+
def escape_tags(string)
|
207
|
+
# modified version of CGI.escapeHTML from ruby 3.1
|
208
|
+
enc = string.encoding
|
209
|
+
if enc.ascii_compatible?
|
210
|
+
string = string.b
|
211
|
+
string.gsub!(/[<>&]/, TABLE_FOR_ESCAPE_HTML__)
|
212
|
+
string.force_encoding(enc)
|
213
|
+
else
|
214
|
+
if enc.dummy?
|
215
|
+
origenc = enc
|
216
|
+
enc = Encoding::Converter.asciicompat_encoding(enc)
|
217
|
+
string = enc ? string.encode(enc) : string.b
|
218
|
+
end
|
219
|
+
table = Hash[TABLE_FOR_ESCAPE_HTML__.map { |pair| pair.map { |s| s.encode(enc) } }]
|
220
|
+
string = string.gsub(/#{"[<>&]".encode(enc)}/, table)
|
221
|
+
string.encode!(origenc) if origenc
|
222
|
+
string
|
223
|
+
end
|
224
|
+
end
|
130
225
|
end
|
131
226
|
end
|
132
227
|
end
|
data/lib/loofah/metahelpers.rb
CHANGED
@@ -1,11 +1,16 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Loofah
|
2
4
|
module MetaHelpers # :nodoc:
|
3
|
-
|
4
|
-
mojule
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
5
|
+
class << self
|
6
|
+
def add_downcased_set_members_to_all_set_constants(mojule)
|
7
|
+
mojule.constants.each do |constant_sym|
|
8
|
+
constant = mojule.const_get(constant_sym)
|
9
|
+
next unless Set === constant
|
10
|
+
|
11
|
+
constant.dup.each do |member|
|
12
|
+
constant.add(member.downcase)
|
13
|
+
end
|
9
14
|
end
|
10
15
|
end
|
11
16
|
end
|
data/lib/loofah/scrubber.rb
CHANGED
@@ -1,8 +1,10 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Loofah
|
2
4
|
#
|
3
5
|
# A RuntimeError raised when Loofah could not find an appropriate scrubber.
|
4
6
|
#
|
5
|
-
class ScrubberNotFound < RuntimeError
|
7
|
+
class ScrubberNotFound < RuntimeError; end
|
6
8
|
|
7
9
|
#
|
8
10
|
# A Scrubber wraps up a block (or method) that is run on an HTML node (element):
|
@@ -23,7 +25,7 @@ module Loofah
|
|
23
25
|
#
|
24
26
|
# This can then be run on a document:
|
25
27
|
#
|
26
|
-
# Loofah.
|
28
|
+
# Loofah.html5_fragment("<span>foo</span><p>bar</p>").scrub!(span2div).to_s
|
27
29
|
# # => "<div>foo</div><p>bar</p>"
|
28
30
|
#
|
29
31
|
# Scrubbers can be run on a document in either a top-down traversal (the
|
@@ -31,12 +33,11 @@ module Loofah
|
|
31
33
|
# Scrubber::STOP to terminate the traversal of a subtree.
|
32
34
|
#
|
33
35
|
class Scrubber
|
34
|
-
|
35
36
|
# Top-down Scrubbers may return CONTINUE to indicate that the subtree should be traversed.
|
36
37
|
CONTINUE = Object.new.freeze
|
37
38
|
|
38
39
|
# Top-down Scrubbers may return STOP to indicate that the subtree should not be traversed.
|
39
|
-
STOP
|
40
|
+
STOP = Object.new.freeze
|
40
41
|
|
41
42
|
# When a scrubber is initialized, the :direction may be specified
|
42
43
|
# as :top_down (the default) or :bottom_up.
|
@@ -64,9 +65,11 @@ module Loofah
|
|
64
65
|
def initialize(options = {}, &block)
|
65
66
|
direction = options[:direction] || :top_down
|
66
67
|
unless [:top_down, :bottom_up].include?(direction)
|
67
|
-
raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
|
68
|
+
raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
|
68
69
|
end
|
69
|
-
|
70
|
+
|
71
|
+
@direction = direction
|
72
|
+
@block = block
|
70
73
|
end
|
71
74
|
|
72
75
|
#
|
@@ -83,7 +86,7 @@ module Loofah
|
|
83
86
|
# +scrub+, which will be called for each document node.
|
84
87
|
#
|
85
88
|
def scrub(node)
|
86
|
-
raise ScrubberNotFound, "No scrub method has been defined on #{self.class
|
89
|
+
raise ScrubberNotFound, "No scrub method has been defined on #{self.class}"
|
87
90
|
end
|
88
91
|
|
89
92
|
#
|
@@ -91,10 +94,10 @@ module Loofah
|
|
91
94
|
# If the attribute is set, don't overwrite the existing value
|
92
95
|
#
|
93
96
|
def append_attribute(node, attribute, value)
|
94
|
-
current_value = node.get_attribute(attribute) ||
|
97
|
+
current_value = node.get_attribute(attribute) || ""
|
95
98
|
current_values = current_value.split(/\s+/)
|
96
99
|
updated_value = current_values | [value]
|
97
|
-
node.set_attribute(attribute, updated_value.join(
|
100
|
+
node.set_attribute(attribute, updated_value.join(" "))
|
98
101
|
end
|
99
102
|
|
100
103
|
private
|
@@ -102,11 +105,15 @@ module Loofah
|
|
102
105
|
def html5lib_sanitize(node)
|
103
106
|
case node.type
|
104
107
|
when Nokogiri::XML::Node::ELEMENT_NODE
|
105
|
-
if HTML5::Scrub.allowed_element?
|
106
|
-
HTML5::Scrub.scrub_attributes
|
108
|
+
if HTML5::Scrub.allowed_element?(node.name)
|
109
|
+
HTML5::Scrub.scrub_attributes(node)
|
107
110
|
return Scrubber::CONTINUE
|
108
111
|
end
|
109
112
|
when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE
|
113
|
+
if HTML5::Scrub.cdata_needs_escaping?(node)
|
114
|
+
node.before(HTML5::Scrub.cdata_escape(node))
|
115
|
+
return Scrubber::STOP
|
116
|
+
end
|
110
117
|
return Scrubber::CONTINUE
|
111
118
|
end
|
112
119
|
Scrubber::STOP
|
@@ -115,14 +122,14 @@ module Loofah
|
|
115
122
|
def traverse_conditionally_top_down(node)
|
116
123
|
if block
|
117
124
|
return if block.call(node) == STOP
|
118
|
-
|
119
|
-
return
|
125
|
+
elsif scrub(node) == STOP
|
126
|
+
return
|
120
127
|
end
|
121
|
-
node.children.each {|j| traverse_conditionally_top_down(j)}
|
128
|
+
node.children.each { |j| traverse_conditionally_top_down(j) }
|
122
129
|
end
|
123
130
|
|
124
131
|
def traverse_conditionally_bottom_up(node)
|
125
|
-
node.children.each {|j| traverse_conditionally_bottom_up(j)}
|
132
|
+
node.children.each { |j| traverse_conditionally_bottom_up(j) }
|
126
133
|
if block
|
127
134
|
block.call(node)
|
128
135
|
else
|