loofah 2.2.3 → 2.21.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +269 -31
- data/README.md +109 -124
- data/lib/loofah/concerns.rb +207 -0
- data/lib/loofah/elements.rb +85 -79
- data/lib/loofah/helpers.rb +37 -20
- data/lib/loofah/{html → html4}/document.rb +6 -7
- data/lib/loofah/html4/document_fragment.rb +15 -0
- data/lib/loofah/html5/document.rb +17 -0
- data/lib/loofah/html5/document_fragment.rb +15 -0
- data/lib/loofah/html5/libxml2_workarounds.rb +10 -8
- data/lib/loofah/html5/safelist.rb +1055 -0
- data/lib/loofah/html5/scrub.rb +153 -58
- data/lib/loofah/metahelpers.rb +11 -6
- data/lib/loofah/scrubber.rb +22 -15
- data/lib/loofah/scrubbers.rb +66 -55
- data/lib/loofah/version.rb +6 -0
- data/lib/loofah/xml/document.rb +2 -0
- data/lib/loofah/xml/document_fragment.rb +4 -7
- data/lib/loofah.rb +131 -38
- metadata +28 -216
- data/.gemtest +0 -0
- data/Gemfile +0 -22
- data/Manifest.txt +0 -40
- data/Rakefile +0 -79
- data/benchmark/benchmark.rb +0 -149
- data/benchmark/fragment.html +0 -96
- data/benchmark/helper.rb +0 -73
- data/benchmark/www.slashdot.com.html +0 -2560
- data/lib/loofah/html/document_fragment.rb +0 -40
- data/lib/loofah/html5/whitelist.rb +0 -186
- data/lib/loofah/instance_methods.rb +0 -127
- data/test/assets/msword.html +0 -63
- data/test/assets/testdata_sanitizer_tests1.dat +0 -502
- data/test/helper.rb +0 -18
- data/test/html5/test_sanitizer.rb +0 -382
- data/test/integration/test_ad_hoc.rb +0 -204
- data/test/integration/test_helpers.rb +0 -43
- data/test/integration/test_html.rb +0 -72
- data/test/integration/test_scrubbers.rb +0 -400
- data/test/integration/test_xml.rb +0 -55
- data/test/unit/test_api.rb +0 -142
- data/test/unit/test_encoding.rb +0 -20
- data/test/unit/test_helpers.rb +0 -62
- data/test/unit/test_scrubber.rb +0 -229
- data/test/unit/test_scrubbers.rb +0 -14
data/lib/loofah/html5/scrub.rb
CHANGED
@@ -1,104 +1,160 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "cgi"
|
4
|
+
require "crass"
|
3
5
|
|
4
6
|
module Loofah
|
5
7
|
module HTML5 # :nodoc:
|
6
8
|
module Scrub
|
7
|
-
|
8
9
|
CONTROL_CHARACTERS = /[`\u0000-\u0020\u007f\u0080-\u0101]/
|
9
|
-
CSS_KEYWORDISH = /\A(#[0-9a-
|
10
|
-
CRASS_SEMICOLON = {:
|
10
|
+
CSS_KEYWORDISH = /\A(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,3}\.?\d{0,10}(ch|cm|r?em|ex|in|lh|mm|pc|pt|px|Q|vmax|vmin|vw|vh|%|,|\))?)\z/ # rubocop:disable Layout/LineLength
|
11
|
+
CRASS_SEMICOLON = { node: :semicolon, raw: ";" }
|
12
|
+
CSS_IMPORTANT = "!important"
|
13
|
+
CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES = /\A(["'])?[^"']+\1\z/
|
14
|
+
DATA_ATTRIBUTE_NAME = /\Adata-[\w-]+\z/
|
11
15
|
|
12
16
|
class << self
|
13
|
-
|
14
|
-
|
15
|
-
::Loofah::HTML5::WhiteList::ALLOWED_ELEMENTS_WITH_LIBXML2.include? element_name
|
17
|
+
def allowed_element?(element_name)
|
18
|
+
::Loofah::HTML5::SafeList::ALLOWED_ELEMENTS_WITH_LIBXML2.include?(element_name)
|
16
19
|
end
|
17
20
|
|
18
21
|
# alternative implementation of the html5lib attribute scrubbing algorithm
|
19
|
-
def scrub_attributes
|
22
|
+
def scrub_attributes(node)
|
20
23
|
node.attribute_nodes.each do |attr_node|
|
21
24
|
attr_name = if attr_node.namespace
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
25
|
+
"#{attr_node.namespace.prefix}:#{attr_node.node_name}"
|
26
|
+
else
|
27
|
+
attr_node.node_name
|
28
|
+
end
|
26
29
|
|
27
|
-
if attr_name
|
30
|
+
if DATA_ATTRIBUTE_NAME.match?(attr_name)
|
28
31
|
next
|
29
32
|
end
|
30
33
|
|
31
|
-
unless
|
34
|
+
unless SafeList::ALLOWED_ATTRIBUTES.include?(attr_name)
|
32
35
|
attr_node.remove
|
33
36
|
next
|
34
37
|
end
|
35
38
|
|
36
|
-
if
|
37
|
-
|
38
|
-
val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS,'').downcase
|
39
|
-
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && ! WhiteList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(WhiteList::PROTOCOL_SEPARATOR)[0])
|
40
|
-
attr_node.remove
|
41
|
-
next
|
42
|
-
elsif val_unescaped.split(WhiteList::PROTOCOL_SEPARATOR)[0] == 'data'
|
43
|
-
# permit only allowed data mediatypes
|
44
|
-
mediatype = val_unescaped.split(WhiteList::PROTOCOL_SEPARATOR)[1]
|
45
|
-
mediatype, _ = mediatype.split(';')[0..1] if mediatype
|
46
|
-
if mediatype && !WhiteList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
|
47
|
-
attr_node.remove
|
48
|
-
next
|
49
|
-
end
|
50
|
-
end
|
51
|
-
end
|
52
|
-
if WhiteList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
|
53
|
-
attr_node.value = attr_node.value.gsub(/url\s*\(\s*[^#\s][^)]+?\)/m, ' ') if attr_node.value
|
39
|
+
if SafeList::ATTR_VAL_IS_URI.include?(attr_name)
|
40
|
+
next if scrub_uri_attribute(attr_node)
|
54
41
|
end
|
55
|
-
|
56
|
-
|
57
|
-
|
42
|
+
|
43
|
+
if SafeList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
|
44
|
+
scrub_attribute_that_allows_local_ref(attr_node)
|
58
45
|
end
|
46
|
+
|
47
|
+
next unless SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) &&
|
48
|
+
attr_name == "xlink:href" &&
|
49
|
+
attr_node.value =~ /^\s*[^#\s].*/m
|
50
|
+
|
51
|
+
attr_node.remove
|
52
|
+
next
|
59
53
|
end
|
60
54
|
|
61
|
-
scrub_css_attribute
|
55
|
+
scrub_css_attribute(node)
|
62
56
|
|
63
57
|
node.attribute_nodes.each do |attr_node|
|
64
|
-
|
58
|
+
if attr_node.value !~ /[^[:space:]]/ && attr_node.name !~ DATA_ATTRIBUTE_NAME
|
59
|
+
node.remove_attribute(attr_node.name)
|
60
|
+
end
|
65
61
|
end
|
66
62
|
|
67
|
-
force_correct_attribute_escaping!
|
63
|
+
force_correct_attribute_escaping!(node)
|
68
64
|
end
|
69
65
|
|
70
|
-
def scrub_css_attribute
|
71
|
-
style = node.attributes[
|
66
|
+
def scrub_css_attribute(node)
|
67
|
+
style = node.attributes["style"]
|
72
68
|
style.value = scrub_css(style.value) if style
|
73
69
|
end
|
74
70
|
|
75
|
-
def scrub_css
|
76
|
-
|
71
|
+
def scrub_css(style)
|
72
|
+
url_flags = [:url, :bad_url]
|
73
|
+
style_tree = Crass.parse_properties(style)
|
77
74
|
sanitized_tree = []
|
78
75
|
|
79
76
|
style_tree.each do |node|
|
80
77
|
next unless node[:node] == :property
|
81
78
|
next if node[:children].any? do |child|
|
82
|
-
|
79
|
+
url_flags.include?(child[:node])
|
83
80
|
end
|
81
|
+
|
84
82
|
name = node[:name].downcase
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
83
|
+
next unless SafeList::ALLOWED_CSS_PROPERTIES.include?(name) ||
|
84
|
+
SafeList::ALLOWED_SVG_PROPERTIES.include?(name) ||
|
85
|
+
SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first)
|
86
|
+
|
87
|
+
value = node[:children].map do |child|
|
88
|
+
case child[:node]
|
89
|
+
when :whitespace
|
90
|
+
nil
|
91
|
+
when :string
|
92
|
+
if CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES.match?(child[:raw])
|
93
|
+
Crass::Parser.stringify(child)
|
94
|
+
end
|
95
|
+
when :function
|
96
|
+
if SafeList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase)
|
97
|
+
Crass::Parser.stringify(child)
|
98
|
+
end
|
99
|
+
when :ident
|
100
|
+
keyword = child[:value]
|
101
|
+
if !SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first) ||
|
102
|
+
SafeList::ALLOWED_CSS_KEYWORDS.include?(keyword) ||
|
103
|
+
(keyword =~ CSS_KEYWORDISH)
|
90
104
|
keyword
|
91
105
|
end
|
92
|
-
|
93
|
-
|
94
|
-
propstring = sprintf "%s:%s", name, value.join(" ")
|
95
|
-
sanitized_node = Crass.parse_properties(propstring).first
|
96
|
-
sanitized_tree << sanitized_node << CRASS_SEMICOLON
|
106
|
+
else
|
107
|
+
child[:raw]
|
97
108
|
end
|
98
|
-
end
|
109
|
+
end.compact
|
110
|
+
|
111
|
+
next if value.empty?
|
112
|
+
|
113
|
+
value << CSS_IMPORTANT if node[:important]
|
114
|
+
propstring = format("%s:%s", name, value.join(" "))
|
115
|
+
sanitized_node = Crass.parse_properties(propstring).first
|
116
|
+
sanitized_tree << sanitized_node << CRASS_SEMICOLON
|
99
117
|
end
|
100
118
|
|
101
|
-
Crass::Parser.stringify
|
119
|
+
Crass::Parser.stringify(sanitized_tree)
|
120
|
+
end
|
121
|
+
|
122
|
+
def scrub_attribute_that_allows_local_ref(attr_node)
|
123
|
+
return unless attr_node.value
|
124
|
+
|
125
|
+
nodes = Crass::Parser.new(attr_node.value).parse_component_values
|
126
|
+
|
127
|
+
values = nodes.map do |node|
|
128
|
+
case node[:node]
|
129
|
+
when :url
|
130
|
+
if node[:value].start_with?("#")
|
131
|
+
node[:raw]
|
132
|
+
end
|
133
|
+
when :hash, :ident, :string
|
134
|
+
node[:raw]
|
135
|
+
end
|
136
|
+
end.compact
|
137
|
+
|
138
|
+
attr_node.value = values.join(" ")
|
139
|
+
end
|
140
|
+
|
141
|
+
def scrub_uri_attribute(attr_node)
|
142
|
+
# this block lifted nearly verbatim from HTML5 sanitization
|
143
|
+
val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS, "").downcase
|
144
|
+
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ &&
|
145
|
+
!SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
|
146
|
+
attr_node.remove
|
147
|
+
return true
|
148
|
+
elsif val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0] == "data"
|
149
|
+
# permit only allowed data mediatypes
|
150
|
+
mediatype = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[1]
|
151
|
+
mediatype, _ = mediatype.split(";")[0..1] if mediatype
|
152
|
+
if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
|
153
|
+
attr_node.remove
|
154
|
+
return true
|
155
|
+
end
|
156
|
+
end
|
157
|
+
false
|
102
158
|
end
|
103
159
|
|
104
160
|
#
|
@@ -106,7 +162,7 @@ module Loofah
|
|
106
162
|
#
|
107
163
|
# see comments about CVE-2018-8048 within the tests for more information
|
108
164
|
#
|
109
|
-
def force_correct_attribute_escaping!
|
165
|
+
def force_correct_attribute_escaping!(node)
|
110
166
|
return unless Nokogiri::VersionInfo.instance.libxml2?
|
111
167
|
|
112
168
|
node.attribute_nodes.each do |attr_node|
|
@@ -122,11 +178,50 @@ module Loofah
|
|
122
178
|
#
|
123
179
|
encoding = attr_node.value.encoding
|
124
180
|
attr_node.value = attr_node.value.gsub(/[ "]/) do |m|
|
125
|
-
|
181
|
+
"%" + m.unpack("H2" * m.bytesize).join("%").upcase
|
126
182
|
end.force_encoding(encoding)
|
127
183
|
end
|
128
184
|
end
|
129
185
|
|
186
|
+
def cdata_needs_escaping?(node)
|
187
|
+
# Nokogiri's HTML4 parser on JRuby doesn't flag the child of a `style` tag as cdata, but it acts that way
|
188
|
+
node.cdata? || (Nokogiri.jruby? && node.text? && node.parent.name == "style")
|
189
|
+
end
|
190
|
+
|
191
|
+
def cdata_escape(node)
|
192
|
+
escaped_text = escape_tags(node.text)
|
193
|
+
if Nokogiri.jruby?
|
194
|
+
node.document.create_text_node(escaped_text)
|
195
|
+
else
|
196
|
+
node.document.create_cdata(escaped_text)
|
197
|
+
end
|
198
|
+
end
|
199
|
+
|
200
|
+
TABLE_FOR_ESCAPE_HTML__ = {
|
201
|
+
"<" => "&lt;",
|
202
|
+
">" => "&gt;",
|
203
|
+
"&" => "&amp;",
|
204
|
+
}
|
205
|
+
|
206
|
+
def escape_tags(string)
|
207
|
+
# modified version of CGI.escapeHTML from ruby 3.1
|
208
|
+
enc = string.encoding
|
209
|
+
if enc.ascii_compatible?
|
210
|
+
string = string.b
|
211
|
+
string.gsub!(/[<>&]/, TABLE_FOR_ESCAPE_HTML__)
|
212
|
+
string.force_encoding(enc)
|
213
|
+
else
|
214
|
+
if enc.dummy?
|
215
|
+
origenc = enc
|
216
|
+
enc = Encoding::Converter.asciicompat_encoding(enc)
|
217
|
+
string = enc ? string.encode(enc) : string.b
|
218
|
+
end
|
219
|
+
table = Hash[TABLE_FOR_ESCAPE_HTML__.map { |pair| pair.map { |s| s.encode(enc) } }]
|
220
|
+
string = string.gsub(/#{"[<>&]".encode(enc)}/, table)
|
221
|
+
string.encode!(origenc) if origenc
|
222
|
+
string
|
223
|
+
end
|
224
|
+
end
|
130
225
|
end
|
131
226
|
end
|
132
227
|
end
|
data/lib/loofah/metahelpers.rb
CHANGED
@@ -1,11 +1,16 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Loofah
|
2
4
|
module MetaHelpers # :nodoc:
|
3
|
-
|
4
|
-
mojule
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
5
|
+
class << self
|
6
|
+
def add_downcased_set_members_to_all_set_constants(mojule)
|
7
|
+
mojule.constants.each do |constant_sym|
|
8
|
+
constant = mojule.const_get(constant_sym)
|
9
|
+
next unless Set === constant
|
10
|
+
|
11
|
+
constant.dup.each do |member|
|
12
|
+
constant.add(member.downcase)
|
13
|
+
end
|
9
14
|
end
|
10
15
|
end
|
11
16
|
end
|
data/lib/loofah/scrubber.rb
CHANGED
@@ -1,8 +1,10 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Loofah
|
2
4
|
#
|
3
5
|
# A RuntimeError raised when Loofah could not find an appropriate scrubber.
|
4
6
|
#
|
5
|
-
class ScrubberNotFound < RuntimeError
|
7
|
+
class ScrubberNotFound < RuntimeError; end
|
6
8
|
|
7
9
|
#
|
8
10
|
# A Scrubber wraps up a block (or method) that is run on an HTML node (element):
|
@@ -23,7 +25,7 @@ module Loofah
|
|
23
25
|
#
|
24
26
|
# This can then be run on a document:
|
25
27
|
#
|
26
|
-
# Loofah.
|
28
|
+
# Loofah.html5_fragment("<span>foo</span><p>bar</p>").scrub!(span2div).to_s
|
27
29
|
# # => "<div>foo</div><p>bar</p>"
|
28
30
|
#
|
29
31
|
# Scrubbers can be run on a document in either a top-down traversal (the
|
@@ -31,12 +33,11 @@ module Loofah
|
|
31
33
|
# Scrubber::STOP to terminate the traversal of a subtree.
|
32
34
|
#
|
33
35
|
class Scrubber
|
34
|
-
|
35
36
|
# Top-down Scrubbers may return CONTINUE to indicate that the subtree should be traversed.
|
36
37
|
CONTINUE = Object.new.freeze
|
37
38
|
|
38
39
|
# Top-down Scrubbers may return STOP to indicate that the subtree should not be traversed.
|
39
|
-
STOP
|
40
|
+
STOP = Object.new.freeze
|
40
41
|
|
41
42
|
# When a scrubber is initialized, the :direction may be specified
|
42
43
|
# as :top_down (the default) or :bottom_up.
|
@@ -64,9 +65,11 @@ module Loofah
|
|
64
65
|
def initialize(options = {}, &block)
|
65
66
|
direction = options[:direction] || :top_down
|
66
67
|
unless [:top_down, :bottom_up].include?(direction)
|
67
|
-
raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
|
68
|
+
raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
|
68
69
|
end
|
69
|
-
|
70
|
+
|
71
|
+
@direction = direction
|
72
|
+
@block = block
|
70
73
|
end
|
71
74
|
|
72
75
|
#
|
@@ -83,7 +86,7 @@ module Loofah
|
|
83
86
|
# +scrub+, which will be called for each document node.
|
84
87
|
#
|
85
88
|
def scrub(node)
|
86
|
-
raise ScrubberNotFound, "No scrub method has been defined on #{self.class
|
89
|
+
raise ScrubberNotFound, "No scrub method has been defined on #{self.class}"
|
87
90
|
end
|
88
91
|
|
89
92
|
#
|
@@ -91,10 +94,10 @@ module Loofah
|
|
91
94
|
# If the attribute is set, don't overwrite the existing value
|
92
95
|
#
|
93
96
|
def append_attribute(node, attribute, value)
|
94
|
-
current_value = node.get_attribute(attribute) ||
|
97
|
+
current_value = node.get_attribute(attribute) || ""
|
95
98
|
current_values = current_value.split(/\s+/)
|
96
99
|
updated_value = current_values | [value]
|
97
|
-
node.set_attribute(attribute, updated_value.join(
|
100
|
+
node.set_attribute(attribute, updated_value.join(" "))
|
98
101
|
end
|
99
102
|
|
100
103
|
private
|
@@ -102,11 +105,15 @@ module Loofah
|
|
102
105
|
def html5lib_sanitize(node)
|
103
106
|
case node.type
|
104
107
|
when Nokogiri::XML::Node::ELEMENT_NODE
|
105
|
-
if HTML5::Scrub.allowed_element?
|
106
|
-
HTML5::Scrub.scrub_attributes
|
108
|
+
if HTML5::Scrub.allowed_element?(node.name)
|
109
|
+
HTML5::Scrub.scrub_attributes(node)
|
107
110
|
return Scrubber::CONTINUE
|
108
111
|
end
|
109
112
|
when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE
|
113
|
+
if HTML5::Scrub.cdata_needs_escaping?(node)
|
114
|
+
node.before(HTML5::Scrub.cdata_escape(node))
|
115
|
+
return Scrubber::STOP
|
116
|
+
end
|
110
117
|
return Scrubber::CONTINUE
|
111
118
|
end
|
112
119
|
Scrubber::STOP
|
@@ -115,14 +122,14 @@ module Loofah
|
|
115
122
|
def traverse_conditionally_top_down(node)
|
116
123
|
if block
|
117
124
|
return if block.call(node) == STOP
|
118
|
-
|
119
|
-
return
|
125
|
+
elsif scrub(node) == STOP
|
126
|
+
return
|
120
127
|
end
|
121
|
-
node.children.each {|j| traverse_conditionally_top_down(j)}
|
128
|
+
node.children.each { |j| traverse_conditionally_top_down(j) }
|
122
129
|
end
|
123
130
|
|
124
131
|
def traverse_conditionally_bottom_up(node)
|
125
|
-
node.children.each {|j| traverse_conditionally_bottom_up(j)}
|
132
|
+
node.children.each { |j| traverse_conditionally_bottom_up(j) }
|
126
133
|
if block
|
127
134
|
block.call(node)
|
128
135
|
else
|