loofah 2.2.3 → 2.19.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +221 -31
- data/README.md +18 -24
- data/lib/loofah/elements.rb +79 -75
- data/lib/loofah/helpers.rb +18 -7
- data/lib/loofah/html/document.rb +1 -0
- data/lib/loofah/html/document_fragment.rb +4 -2
- data/lib/loofah/html5/libxml2_workarounds.rb +8 -7
- data/lib/loofah/html5/safelist.rb +1042 -0
- data/lib/loofah/html5/scrub.rb +150 -55
- data/lib/loofah/instance_methods.rb +14 -8
- data/lib/loofah/metahelpers.rb +2 -1
- data/lib/loofah/scrubber.rb +12 -7
- data/lib/loofah/scrubbers.rb +21 -19
- data/lib/loofah/version.rb +5 -0
- data/lib/loofah/xml/document.rb +1 -0
- data/lib/loofah/xml/document_fragment.rb +2 -1
- data/lib/loofah.rb +35 -18
- metadata +52 -138
- data/.gemtest +0 -0
- data/Gemfile +0 -22
- data/Manifest.txt +0 -40
- data/Rakefile +0 -79
- data/benchmark/benchmark.rb +0 -149
- data/benchmark/fragment.html +0 -96
- data/benchmark/helper.rb +0 -73
- data/benchmark/www.slashdot.com.html +0 -2560
- data/lib/loofah/html5/whitelist.rb +0 -186
- data/test/assets/msword.html +0 -63
- data/test/assets/testdata_sanitizer_tests1.dat +0 -502
- data/test/helper.rb +0 -18
- data/test/html5/test_sanitizer.rb +0 -382
- data/test/integration/test_ad_hoc.rb +0 -204
- data/test/integration/test_helpers.rb +0 -43
- data/test/integration/test_html.rb +0 -72
- data/test/integration/test_scrubbers.rb +0 -400
- data/test/integration/test_xml.rb +0 -55
- data/test/unit/test_api.rb +0 -142
- data/test/unit/test_encoding.rb +0 -20
- data/test/unit/test_helpers.rb +0 -62
- data/test/unit/test_scrubber.rb +0 -229
- data/test/unit/test_scrubbers.rb +0 -14
data/lib/loofah/html5/scrub.rb
CHANGED
@@ -1,104 +1,160 @@
|
|
1
|
-
|
2
|
-
require
|
1
|
+
# frozen_string_literal: true
|
2
|
+
require "cgi"
|
3
|
+
require "crass"
|
3
4
|
|
4
5
|
module Loofah
|
5
6
|
module HTML5 # :nodoc:
|
6
7
|
module Scrub
|
7
|
-
|
8
8
|
CONTROL_CHARACTERS = /[`\u0000-\u0020\u007f\u0080-\u0101]/
|
9
|
-
CSS_KEYWORDISH = /\A(#[0-9a-
|
10
|
-
CRASS_SEMICOLON = {:
|
9
|
+
CSS_KEYWORDISH = /\A(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,3}\.?\d{0,10}(ch|cm|r?em|ex|in|lh|mm|pc|pt|px|Q|vmax|vmin|vw|vh|%|,|\))?)\z/
|
10
|
+
CRASS_SEMICOLON = { node: :semicolon, raw: ";" }
|
11
|
+
CSS_IMPORTANT = '!important'
|
12
|
+
CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES = /\A(["'])?[^"']+\1\z/
|
13
|
+
DATA_ATTRIBUTE_NAME = /\Adata-[\w-]+\z/
|
11
14
|
|
12
15
|
class << self
|
13
|
-
|
14
|
-
|
15
|
-
::Loofah::HTML5::WhiteList::ALLOWED_ELEMENTS_WITH_LIBXML2.include? element_name
|
16
|
+
def allowed_element?(element_name)
|
17
|
+
::Loofah::HTML5::SafeList::ALLOWED_ELEMENTS_WITH_LIBXML2.include?(element_name)
|
16
18
|
end
|
17
19
|
|
18
20
|
# alternative implementation of the html5lib attribute scrubbing algorithm
|
19
|
-
def scrub_attributes
|
21
|
+
def scrub_attributes(node)
|
20
22
|
node.attribute_nodes.each do |attr_node|
|
21
23
|
attr_name = if attr_node.namespace
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
24
|
+
"#{attr_node.namespace.prefix}:#{attr_node.node_name}"
|
25
|
+
else
|
26
|
+
attr_node.node_name
|
27
|
+
end
|
26
28
|
|
27
|
-
if attr_name =~
|
29
|
+
if attr_name =~ DATA_ATTRIBUTE_NAME
|
28
30
|
next
|
29
31
|
end
|
30
32
|
|
31
|
-
unless
|
33
|
+
unless SafeList::ALLOWED_ATTRIBUTES.include?(attr_name)
|
32
34
|
attr_node.remove
|
33
35
|
next
|
34
36
|
end
|
35
37
|
|
36
|
-
if
|
37
|
-
|
38
|
-
val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS,'').downcase
|
39
|
-
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && ! WhiteList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(WhiteList::PROTOCOL_SEPARATOR)[0])
|
40
|
-
attr_node.remove
|
41
|
-
next
|
42
|
-
elsif val_unescaped.split(WhiteList::PROTOCOL_SEPARATOR)[0] == 'data'
|
43
|
-
# permit only allowed data mediatypes
|
44
|
-
mediatype = val_unescaped.split(WhiteList::PROTOCOL_SEPARATOR)[1]
|
45
|
-
mediatype, _ = mediatype.split(';')[0..1] if mediatype
|
46
|
-
if mediatype && !WhiteList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
|
47
|
-
attr_node.remove
|
48
|
-
next
|
49
|
-
end
|
50
|
-
end
|
38
|
+
if SafeList::ATTR_VAL_IS_URI.include?(attr_name)
|
39
|
+
next if scrub_uri_attribute(attr_node)
|
51
40
|
end
|
52
|
-
|
53
|
-
|
41
|
+
|
42
|
+
if SafeList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
|
43
|
+
scrub_attribute_that_allows_local_ref(attr_node)
|
54
44
|
end
|
55
|
-
|
45
|
+
|
46
|
+
if SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) && attr_name == "xlink:href" && attr_node.value =~ /^\s*[^#\s].*/m
|
56
47
|
attr_node.remove
|
57
48
|
next
|
58
49
|
end
|
59
50
|
end
|
60
51
|
|
61
|
-
scrub_css_attribute
|
52
|
+
scrub_css_attribute(node)
|
62
53
|
|
63
54
|
node.attribute_nodes.each do |attr_node|
|
64
|
-
|
55
|
+
if attr_node.value !~ /[^[:space:]]/ && attr_node.name !~ DATA_ATTRIBUTE_NAME
|
56
|
+
node.remove_attribute(attr_node.name)
|
57
|
+
end
|
65
58
|
end
|
66
59
|
|
67
|
-
force_correct_attribute_escaping!
|
60
|
+
force_correct_attribute_escaping!(node)
|
68
61
|
end
|
69
62
|
|
70
|
-
def scrub_css_attribute
|
71
|
-
style = node.attributes[
|
63
|
+
def scrub_css_attribute(node)
|
64
|
+
style = node.attributes["style"]
|
72
65
|
style.value = scrub_css(style.value) if style
|
73
66
|
end
|
74
67
|
|
75
|
-
def scrub_css
|
76
|
-
style_tree = Crass.parse_properties
|
68
|
+
def scrub_css(style)
|
69
|
+
style_tree = Crass.parse_properties(style)
|
77
70
|
sanitized_tree = []
|
78
71
|
|
79
72
|
style_tree.each do |node|
|
80
73
|
next unless node[:node] == :property
|
81
74
|
next if node[:children].any? do |child|
|
82
|
-
[:url, :bad_url].include?(child[:node])
|
75
|
+
[:url, :bad_url].include?(child[:node])
|
83
76
|
end
|
77
|
+
|
84
78
|
name = node[:name].downcase
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
79
|
+
next unless SafeList::ALLOWED_CSS_PROPERTIES.include?(name) ||
|
80
|
+
SafeList::ALLOWED_SVG_PROPERTIES.include?(name) ||
|
81
|
+
SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first)
|
82
|
+
|
83
|
+
value = node[:children].map do |child|
|
84
|
+
case child[:node]
|
85
|
+
when :whitespace
|
86
|
+
nil
|
87
|
+
when :string
|
88
|
+
if child[:raw] =~ CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES
|
89
|
+
Crass::Parser.stringify(child)
|
90
|
+
else
|
91
|
+
nil
|
92
|
+
end
|
93
|
+
when :function
|
94
|
+
if SafeList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase)
|
95
|
+
Crass::Parser.stringify(child)
|
96
|
+
end
|
97
|
+
when :ident
|
98
|
+
keyword = child[:value]
|
99
|
+
if !SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first) ||
|
100
|
+
SafeList::ALLOWED_CSS_KEYWORDS.include?(keyword) ||
|
101
|
+
(keyword =~ CSS_KEYWORDISH)
|
90
102
|
keyword
|
91
103
|
end
|
92
|
-
|
93
|
-
|
94
|
-
propstring = sprintf "%s:%s", name, value.join(" ")
|
95
|
-
sanitized_node = Crass.parse_properties(propstring).first
|
96
|
-
sanitized_tree << sanitized_node << CRASS_SEMICOLON
|
104
|
+
else
|
105
|
+
child[:raw]
|
97
106
|
end
|
98
|
-
end
|
107
|
+
end.compact
|
108
|
+
|
109
|
+
next if value.empty?
|
110
|
+
value << CSS_IMPORTANT if node[:important]
|
111
|
+
propstring = format("%s:%s", name, value.join(" "))
|
112
|
+
sanitized_node = Crass.parse_properties(propstring).first
|
113
|
+
sanitized_tree << sanitized_node << CRASS_SEMICOLON
|
99
114
|
end
|
100
115
|
|
101
|
-
Crass::Parser.stringify
|
116
|
+
Crass::Parser.stringify(sanitized_tree)
|
117
|
+
end
|
118
|
+
|
119
|
+
def scrub_attribute_that_allows_local_ref(attr_node)
|
120
|
+
return unless attr_node.value
|
121
|
+
|
122
|
+
nodes = Crass::Parser.new(attr_node.value).parse_component_values
|
123
|
+
|
124
|
+
values = nodes.map do |node|
|
125
|
+
case node[:node]
|
126
|
+
when :url
|
127
|
+
if node[:value].start_with?("#")
|
128
|
+
node[:raw]
|
129
|
+
else
|
130
|
+
nil
|
131
|
+
end
|
132
|
+
when :hash, :ident, :string
|
133
|
+
node[:raw]
|
134
|
+
else
|
135
|
+
nil
|
136
|
+
end
|
137
|
+
end.compact
|
138
|
+
|
139
|
+
attr_node.value = values.join(" ")
|
140
|
+
end
|
141
|
+
|
142
|
+
def scrub_uri_attribute(attr_node)
|
143
|
+
# this block lifted nearly verbatim from HTML5 sanitization
|
144
|
+
val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS, "").downcase
|
145
|
+
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && !SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
|
146
|
+
attr_node.remove
|
147
|
+
return true
|
148
|
+
elsif val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0] == "data"
|
149
|
+
# permit only allowed data mediatypes
|
150
|
+
mediatype = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[1]
|
151
|
+
mediatype, _ = mediatype.split(";")[0..1] if mediatype
|
152
|
+
if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
|
153
|
+
attr_node.remove
|
154
|
+
return true
|
155
|
+
end
|
156
|
+
end
|
157
|
+
false
|
102
158
|
end
|
103
159
|
|
104
160
|
#
|
@@ -106,7 +162,7 @@ module Loofah
|
|
106
162
|
#
|
107
163
|
# see comments about CVE-2018-8048 within the tests for more information
|
108
164
|
#
|
109
|
-
def force_correct_attribute_escaping!
|
165
|
+
def force_correct_attribute_escaping!(node)
|
110
166
|
return unless Nokogiri::VersionInfo.instance.libxml2?
|
111
167
|
|
112
168
|
node.attribute_nodes.each do |attr_node|
|
@@ -122,11 +178,50 @@ module Loofah
|
|
122
178
|
#
|
123
179
|
encoding = attr_node.value.encoding
|
124
180
|
attr_node.value = attr_node.value.gsub(/[ "]/) do |m|
|
125
|
-
|
181
|
+
"%" + m.unpack("H2" * m.bytesize).join("%").upcase
|
126
182
|
end.force_encoding(encoding)
|
127
183
|
end
|
128
184
|
end
|
129
185
|
|
186
|
+
def cdata_needs_escaping?(node)
|
187
|
+
# Nokogiri's HTML4 parser on JRuby doesn't flag the child of a `style` or `script` tag as cdata, but it acts that way
|
188
|
+
node.cdata? || (Nokogiri.jruby? && node.text? && (node.parent.name == "style" || node.parent.name == "script"))
|
189
|
+
end
|
190
|
+
|
191
|
+
def cdata_escape(node)
|
192
|
+
escaped_text = escape_tags(node.text)
|
193
|
+
if Nokogiri.jruby?
|
194
|
+
node.document.create_text_node(escaped_text)
|
195
|
+
else
|
196
|
+
node.document.create_cdata(escaped_text)
|
197
|
+
end
|
198
|
+
end
|
199
|
+
|
200
|
+
TABLE_FOR_ESCAPE_HTML__ = {
|
201
|
+
'<' => '&lt;',
|
202
|
+
'>' => '&gt;',
|
203
|
+
'&' => '&amp;',
|
204
|
+
}
|
205
|
+
|
206
|
+
def escape_tags(string)
|
207
|
+
# modified version of CGI.escapeHTML from ruby 3.1
|
208
|
+
enc = string.encoding
|
209
|
+
unless enc.ascii_compatible?
|
210
|
+
if enc.dummy?
|
211
|
+
origenc = enc
|
212
|
+
enc = Encoding::Converter.asciicompat_encoding(enc)
|
213
|
+
string = enc ? string.encode(enc) : string.b
|
214
|
+
end
|
215
|
+
table = Hash[TABLE_FOR_ESCAPE_HTML__.map {|pair|pair.map {|s|s.encode(enc)}}]
|
216
|
+
string = string.gsub(/#{"[<>&]".encode(enc)}/, table)
|
217
|
+
string.encode!(origenc) if origenc
|
218
|
+
string
|
219
|
+
else
|
220
|
+
string = string.b
|
221
|
+
string.gsub!(/[<>&]/, TABLE_FOR_ESCAPE_HTML__)
|
222
|
+
string.force_encoding(enc)
|
223
|
+
end
|
224
|
+
end
|
130
225
|
end
|
131
226
|
end
|
132
227
|
end
|
@@ -1,3 +1,4 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
module Loofah
|
2
3
|
#
|
3
4
|
# Mixes +scrub!+ into Document, DocumentFragment, Node and NodeSet.
|
@@ -91,28 +92,33 @@ module Loofah
|
|
91
92
|
# # decidedly not ok for browser:
|
92
93
|
# frag.text(:encode_special_chars => false) # => "<script>alert('EVIL');</script>"
|
93
94
|
#
|
94
|
-
def text(options={})
|
95
|
-
result = serialize_root
|
95
|
+
def text(options = {})
|
96
|
+
result = if serialize_root
|
97
|
+
serialize_root.children.reject(&:comment?).map(&:inner_text).join("")
|
98
|
+
else
|
99
|
+
""
|
100
|
+
end
|
96
101
|
if options[:encode_special_chars] == false
|
97
102
|
result # possibly dangerous if rendered in a browser
|
98
103
|
else
|
99
104
|
encode_special_chars result
|
100
105
|
end
|
101
106
|
end
|
107
|
+
|
102
108
|
alias :inner_text :text
|
103
|
-
alias :to_str
|
109
|
+
alias :to_str :text
|
104
110
|
|
105
111
|
#
|
106
112
|
# Returns a plain-text version of the markup contained by the
|
107
113
|
# fragment, with HTML entities encoded.
|
108
114
|
#
|
109
|
-
# This method is slower than #
|
110
|
-
# whitespace around block elements.
|
115
|
+
# This method is slower than #text, but is clever about
|
116
|
+
# whitespace around block elements and line break elements.
|
111
117
|
#
|
112
|
-
# Loofah.document("<h1>Title</h1><div>Content</div>").to_text
|
113
|
-
# # => "\nTitle\n\nContent\n"
|
118
|
+
# Loofah.document("<h1>Title</h1><div>Content<br>Next line</div>").to_text
|
119
|
+
# # => "\nTitle\n\nContent\nNext line\n"
|
114
120
|
#
|
115
|
-
def to_text(options={})
|
121
|
+
def to_text(options = {})
|
116
122
|
Loofah.remove_extraneous_whitespace self.dup.scrub!(:newline_block_elements).text(options)
|
117
123
|
end
|
118
124
|
end
|
data/lib/loofah/metahelpers.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
module Loofah
|
2
3
|
module MetaHelpers # :nodoc:
|
3
|
-
def self.add_downcased_set_members_to_all_set_constants
|
4
|
+
def self.add_downcased_set_members_to_all_set_constants(mojule)
|
4
5
|
mojule.constants.each do |constant_sym|
|
5
6
|
constant = mojule.const_get constant_sym
|
6
7
|
next unless Set === constant
|
data/lib/loofah/scrubber.rb
CHANGED
@@ -1,8 +1,9 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
module Loofah
|
2
3
|
#
|
3
4
|
# A RuntimeError raised when Loofah could not find an appropriate scrubber.
|
4
5
|
#
|
5
|
-
class ScrubberNotFound < RuntimeError
|
6
|
+
class ScrubberNotFound < RuntimeError; end
|
6
7
|
|
7
8
|
#
|
8
9
|
# A Scrubber wraps up a block (or method) that is run on an HTML node (element):
|
@@ -36,7 +37,7 @@ module Loofah
|
|
36
37
|
CONTINUE = Object.new.freeze
|
37
38
|
|
38
39
|
# Top-down Scrubbers may return STOP to indicate that the subtree should not be traversed.
|
39
|
-
STOP
|
40
|
+
STOP = Object.new.freeze
|
40
41
|
|
41
42
|
# When a scrubber is initialized, the :direction may be specified
|
42
43
|
# as :top_down (the default) or :bottom_up.
|
@@ -64,7 +65,7 @@ module Loofah
|
|
64
65
|
def initialize(options = {}, &block)
|
65
66
|
direction = options[:direction] || :top_down
|
66
67
|
unless [:top_down, :bottom_up].include?(direction)
|
67
|
-
raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
|
68
|
+
raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
|
68
69
|
end
|
69
70
|
@direction, @block = direction, block
|
70
71
|
end
|
@@ -91,10 +92,10 @@ module Loofah
|
|
91
92
|
# If the attribute is set, don't overwrite the existing value
|
92
93
|
#
|
93
94
|
def append_attribute(node, attribute, value)
|
94
|
-
current_value = node.get_attribute(attribute) ||
|
95
|
+
current_value = node.get_attribute(attribute) || ""
|
95
96
|
current_values = current_value.split(/\s+/)
|
96
97
|
updated_value = current_values | [value]
|
97
|
-
node.set_attribute(attribute, updated_value.join(
|
98
|
+
node.set_attribute(attribute, updated_value.join(" "))
|
98
99
|
end
|
99
100
|
|
100
101
|
private
|
@@ -107,6 +108,10 @@ module Loofah
|
|
107
108
|
return Scrubber::CONTINUE
|
108
109
|
end
|
109
110
|
when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE
|
111
|
+
if HTML5::Scrub.cdata_needs_escaping?(node)
|
112
|
+
node.before(HTML5::Scrub.cdata_escape(node))
|
113
|
+
return Scrubber::STOP
|
114
|
+
end
|
110
115
|
return Scrubber::CONTINUE
|
111
116
|
end
|
112
117
|
Scrubber::STOP
|
@@ -118,11 +123,11 @@ module Loofah
|
|
118
123
|
else
|
119
124
|
return if scrub(node) == STOP
|
120
125
|
end
|
121
|
-
node.children.each {|j| traverse_conditionally_top_down(j)}
|
126
|
+
node.children.each { |j| traverse_conditionally_top_down(j) }
|
122
127
|
end
|
123
128
|
|
124
129
|
def traverse_conditionally_bottom_up(node)
|
125
|
-
node.children.each {|j| traverse_conditionally_bottom_up(j)}
|
130
|
+
node.children.each { |j| traverse_conditionally_bottom_up(j) }
|
126
131
|
if block
|
127
132
|
block.call(node)
|
128
133
|
else
|
data/lib/loofah/scrubbers.rb
CHANGED
@@ -1,7 +1,8 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
module Loofah
|
2
3
|
#
|
3
4
|
# Loofah provides some built-in scrubbers for sanitizing with
|
4
|
-
# HTML5lib's
|
5
|
+
# HTML5lib's safelist and for accomplishing some common
|
5
6
|
# transformation tasks.
|
6
7
|
#
|
7
8
|
#
|
@@ -99,13 +100,9 @@ module Loofah
|
|
99
100
|
|
100
101
|
def scrub(node)
|
101
102
|
return CONTINUE if html5lib_sanitize(node) == CONTINUE
|
102
|
-
|
103
|
-
sanitized_text = Loofah.fragment(node.children.first.to_html).scrub!(:strip).to_html
|
104
|
-
node.before Nokogiri::XML::Text.new(sanitized_text, node.document)
|
105
|
-
else
|
106
|
-
node.before node.children
|
107
|
-
end
|
103
|
+
node.before(node.children)
|
108
104
|
node.remove
|
105
|
+
return STOP
|
109
106
|
end
|
110
107
|
end
|
111
108
|
|
@@ -205,8 +202,8 @@ module Loofah
|
|
205
202
|
end
|
206
203
|
|
207
204
|
def scrub(node)
|
208
|
-
return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name ==
|
209
|
-
append_attribute(node,
|
205
|
+
return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
|
206
|
+
append_attribute(node, "rel", "nofollow")
|
210
207
|
return STOP
|
211
208
|
end
|
212
209
|
end
|
@@ -226,8 +223,8 @@ module Loofah
|
|
226
223
|
end
|
227
224
|
|
228
225
|
def scrub(node)
|
229
|
-
return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name ==
|
230
|
-
append_attribute(node,
|
226
|
+
return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
|
227
|
+
append_attribute(node, "rel", "noopener")
|
231
228
|
return STOP
|
232
229
|
end
|
233
230
|
end
|
@@ -239,8 +236,13 @@ module Loofah
|
|
239
236
|
end
|
240
237
|
|
241
238
|
def scrub(node)
|
242
|
-
return CONTINUE unless Loofah::Elements::
|
243
|
-
|
239
|
+
return CONTINUE unless Loofah::Elements::LINEBREAKERS.include?(node.name)
|
240
|
+
replacement = if Loofah::Elements::INLINE_LINE_BREAK.include?(node.name)
|
241
|
+
"\n"
|
242
|
+
else
|
243
|
+
"\n#{node.content}\n"
|
244
|
+
end
|
245
|
+
node.add_next_sibling Nokogiri::XML::Text.new(replacement, node.document)
|
244
246
|
node.remove
|
245
247
|
end
|
246
248
|
end
|
@@ -267,7 +269,7 @@ module Loofah
|
|
267
269
|
|
268
270
|
def scrub(node)
|
269
271
|
if node.type == Nokogiri::XML::Node::TEXT_NODE || node.type == Nokogiri::XML::Node::CDATA_SECTION_NODE
|
270
|
-
node.content = node.content.gsub(/\u2028|\u2029/,
|
272
|
+
node.content = node.content.gsub(/\u2028|\u2029/, "")
|
271
273
|
end
|
272
274
|
CONTINUE
|
273
275
|
end
|
@@ -277,14 +279,14 @@ module Loofah
|
|
277
279
|
# A hash that maps a symbol (like +:prune+) to the appropriate Scrubber (Loofah::Scrubbers::Prune).
|
278
280
|
#
|
279
281
|
MAP = {
|
280
|
-
:escape
|
281
|
-
:prune
|
282
|
+
:escape => Escape,
|
283
|
+
:prune => Prune,
|
282
284
|
:whitewash => Whitewash,
|
283
|
-
:strip
|
284
|
-
:nofollow
|
285
|
+
:strip => Strip,
|
286
|
+
:nofollow => NoFollow,
|
285
287
|
:noopener => NoOpener,
|
286
288
|
:newline_block_elements => NewlineBlockElements,
|
287
|
-
:unprintable => Unprintable
|
289
|
+
:unprintable => Unprintable,
|
288
290
|
}
|
289
291
|
|
290
292
|
#
|
data/lib/loofah/xml/document.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
module Loofah
|
2
3
|
module XML # :nodoc:
|
3
4
|
#
|
@@ -12,7 +13,7 @@ module Loofah
|
|
12
13
|
# constructor. Applications should use Loofah.fragment to
|
13
14
|
# parse a fragment.
|
14
15
|
#
|
15
|
-
def parse
|
16
|
+
def parse(tags)
|
16
17
|
doc = Loofah::XML::Document.new
|
17
18
|
doc.encoding = tags.encoding.name if tags.respond_to?(:encoding)
|
18
19
|
self.new(doc, tags)
|
data/lib/loofah.rb
CHANGED
@@ -1,22 +1,24 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
$LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__))) unless $LOAD_PATH.include?(File.expand_path(File.dirname(__FILE__)))
|
2
3
|
|
3
|
-
require
|
4
|
+
require "nokogiri"
|
4
5
|
|
5
|
-
|
6
|
-
|
6
|
+
require_relative "loofah/version"
|
7
|
+
require_relative "loofah/metahelpers"
|
8
|
+
require_relative "loofah/elements"
|
7
9
|
|
8
|
-
|
9
|
-
|
10
|
-
|
10
|
+
require_relative "loofah/html5/safelist"
|
11
|
+
require_relative "loofah/html5/libxml2_workarounds"
|
12
|
+
require_relative "loofah/html5/scrub"
|
11
13
|
|
12
|
-
|
13
|
-
|
14
|
+
require_relative "loofah/scrubber"
|
15
|
+
require_relative "loofah/scrubbers"
|
14
16
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
17
|
+
require_relative "loofah/instance_methods"
|
18
|
+
require_relative "loofah/xml/document"
|
19
|
+
require_relative "loofah/xml/document_fragment"
|
20
|
+
require_relative "loofah/html/document"
|
21
|
+
require_relative "loofah/html/document_fragment"
|
20
22
|
|
21
23
|
# == Strings and IO Objects as Input
|
22
24
|
#
|
@@ -27,14 +29,11 @@ require 'loofah/html/document_fragment'
|
|
27
29
|
# quantities of docs.
|
28
30
|
#
|
29
31
|
module Loofah
|
30
|
-
# The version of Loofah you are using
|
31
|
-
VERSION = '2.2.3'
|
32
|
-
|
33
32
|
class << self
|
34
33
|
# Shortcut for Loofah::HTML::Document.parse
|
35
34
|
# This method accepts the same parameters as Nokogiri::HTML::Document.parse
|
36
35
|
def document(*args, &block)
|
37
|
-
Loofah::HTML::Document.parse(*args, &block)
|
36
|
+
remove_comments_before_html_element Loofah::HTML::Document.parse(*args, &block)
|
38
37
|
end
|
39
38
|
|
40
39
|
# Shortcut for Loofah::HTML::DocumentFragment.parse
|
@@ -77,7 +76,25 @@ module Loofah
|
|
77
76
|
|
78
77
|
# A helper to remove extraneous whitespace from text-ified HTML
|
79
78
|
def remove_extraneous_whitespace(string)
|
80
|
-
string.gsub(/\n\s*\n\s*\n/,"\n\n")
|
79
|
+
string.gsub(/\n\s*\n\s*\n/, "\n\n")
|
80
|
+
end
|
81
|
+
|
82
|
+
private
|
83
|
+
|
84
|
+
# remove comments that exist outside of the HTML element.
|
85
|
+
#
|
86
|
+
# these comments are allowed by the HTML spec:
|
87
|
+
#
|
88
|
+
# https://www.w3.org/TR/html401/struct/global.html#h-7.1
|
89
|
+
#
|
90
|
+
# but are not scrubbed by Loofah because these nodes don't meet
|
91
|
+
# the contract that scrubbers expect of a node (e.g., it can be
|
92
|
+
# replaced, sibling and children nodes can be created).
|
93
|
+
def remove_comments_before_html_element(doc)
|
94
|
+
doc.children.each do |child|
|
95
|
+
child.unlink if child.comment?
|
96
|
+
end
|
97
|
+
doc
|
81
98
|
end
|
82
99
|
end
|
83
100
|
end
|