loofah 1.0.0 → 2.19.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +489 -0
- data/MIT-LICENSE.txt +3 -1
- data/README.md +364 -0
- data/SECURITY.md +18 -0
- data/lib/loofah/elements.rb +88 -11
- data/lib/loofah/helpers.rb +76 -2
- data/lib/loofah/html/document.rb +1 -0
- data/lib/loofah/html/document_fragment.rb +9 -2
- data/lib/loofah/html5/libxml2_workarounds.rb +27 -0
- data/lib/loofah/html5/safelist.rb +1042 -0
- data/lib/loofah/html5/scrub.rb +198 -40
- data/lib/loofah/instance_methods.rb +16 -10
- data/lib/loofah/metahelpers.rb +9 -10
- data/lib/loofah/scrubber.rb +22 -6
- data/lib/loofah/scrubbers.rb +96 -16
- data/lib/loofah/version.rb +5 -0
- data/lib/loofah/xml/document.rb +1 -0
- data/lib/loofah/xml/document_fragment.rb +5 -2
- data/lib/loofah.rb +38 -25
- metadata +159 -172
- data/CHANGELOG.rdoc +0 -134
- data/Gemfile +0 -1
- data/Manifest.txt +0 -34
- data/README.rdoc +0 -312
- data/Rakefile +0 -53
- data/benchmark/benchmark.rb +0 -149
- data/benchmark/fragment.html +0 -96
- data/benchmark/helper.rb +0 -73
- data/benchmark/www.slashdot.com.html +0 -2560
- data/lib/loofah/html5/whitelist.rb +0 -168
- data/test/helper.rb +0 -7
- data/test/html5/test_sanitizer.rb +0 -248
- data/test/integration/test_ad_hoc.rb +0 -176
- data/test/integration/test_helpers.rb +0 -33
- data/test/integration/test_html.rb +0 -51
- data/test/integration/test_scrubbers.rb +0 -331
- data/test/integration/test_xml.rb +0 -55
- data/test/unit/test_api.rb +0 -138
- data/test/unit/test_helpers.rb +0 -27
- data/test/unit/test_scrubber.rb +0 -229
- data/test/unit/test_scrubbers.rb +0 -14
data/lib/loofah/html5/scrub.rb
CHANGED
@@ -1,70 +1,228 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
require "cgi"
|
3
|
+
require "crass"
|
2
4
|
|
3
5
|
module Loofah
|
4
6
|
module HTML5 # :nodoc:
|
5
7
|
module Scrub
|
8
|
+
CONTROL_CHARACTERS = /[`\u0000-\u0020\u007f\u0080-\u0101]/
|
9
|
+
CSS_KEYWORDISH = /\A(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,3}\.?\d{0,10}(ch|cm|r?em|ex|in|lh|mm|pc|pt|px|Q|vmax|vmin|vw|vh|%|,|\))?)\z/
|
10
|
+
CRASS_SEMICOLON = { node: :semicolon, raw: ";" }
|
11
|
+
CSS_IMPORTANT = '!important'
|
12
|
+
CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES = /\A(["'])?[^"']+\1\z/
|
13
|
+
DATA_ATTRIBUTE_NAME = /\Adata-[\w-]+\z/
|
6
14
|
|
7
15
|
class << self
|
16
|
+
def allowed_element?(element_name)
|
17
|
+
::Loofah::HTML5::SafeList::ALLOWED_ELEMENTS_WITH_LIBXML2.include?(element_name)
|
18
|
+
end
|
8
19
|
|
9
20
|
# alternative implementation of the html5lib attribute scrubbing algorithm
|
10
21
|
def scrub_attributes(node)
|
11
22
|
node.attribute_nodes.each do |attr_node|
|
12
23
|
attr_name = if attr_node.namespace
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
end
|
17
|
-
attr_node.remove unless HashedWhiteList::ALLOWED_ATTRIBUTES[attr_name]
|
18
|
-
if HashedWhiteList::ATTR_VAL_IS_URI[attr_name]
|
19
|
-
# this block lifted nearly verbatim from HTML5 sanitization
|
20
|
-
val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(/`|[\000-\040\177\s]+|\302[\200-\240]/,'').downcase
|
21
|
-
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and HashedWhiteList::ALLOWED_PROTOCOLS[val_unescaped.split(':')[0]].nil?
|
22
|
-
attr_node.remove
|
23
|
-
end
|
24
|
+
"#{attr_node.namespace.prefix}:#{attr_node.node_name}"
|
25
|
+
else
|
26
|
+
attr_node.node_name
|
24
27
|
end
|
25
|
-
|
26
|
-
|
28
|
+
|
29
|
+
if attr_name =~ DATA_ATTRIBUTE_NAME
|
30
|
+
next
|
27
31
|
end
|
28
|
-
|
32
|
+
|
33
|
+
unless SafeList::ALLOWED_ATTRIBUTES.include?(attr_name)
|
29
34
|
attr_node.remove
|
35
|
+
next
|
36
|
+
end
|
37
|
+
|
38
|
+
if SafeList::ATTR_VAL_IS_URI.include?(attr_name)
|
39
|
+
next if scrub_uri_attribute(attr_node)
|
40
|
+
end
|
41
|
+
|
42
|
+
if SafeList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
|
43
|
+
scrub_attribute_that_allows_local_ref(attr_node)
|
44
|
+
end
|
45
|
+
|
46
|
+
if SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) && attr_name == "xlink:href" && attr_node.value =~ /^\s*[^#\s].*/m
|
47
|
+
attr_node.remove
|
48
|
+
next
|
30
49
|
end
|
31
50
|
end
|
32
|
-
|
33
|
-
|
51
|
+
|
52
|
+
scrub_css_attribute(node)
|
53
|
+
|
54
|
+
node.attribute_nodes.each do |attr_node|
|
55
|
+
if attr_node.value !~ /[^[:space:]]/ && attr_node.name !~ DATA_ATTRIBUTE_NAME
|
56
|
+
node.remove_attribute(attr_node.name)
|
57
|
+
end
|
34
58
|
end
|
59
|
+
|
60
|
+
force_correct_attribute_escaping!(node)
|
61
|
+
end
|
62
|
+
|
63
|
+
def scrub_css_attribute(node)
|
64
|
+
style = node.attributes["style"]
|
65
|
+
style.value = scrub_css(style.value) if style
|
35
66
|
end
|
36
67
|
|
37
|
-
# lifted nearly verbatim from html5lib
|
38
68
|
def scrub_css(style)
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
69
|
+
style_tree = Crass.parse_properties(style)
|
70
|
+
sanitized_tree = []
|
71
|
+
|
72
|
+
style_tree.each do |node|
|
73
|
+
next unless node[:node] == :property
|
74
|
+
next if node[:children].any? do |child|
|
75
|
+
[:url, :bad_url].include?(child[:node])
|
76
|
+
end
|
77
|
+
|
78
|
+
name = node[:name].downcase
|
79
|
+
next unless SafeList::ALLOWED_CSS_PROPERTIES.include?(name) ||
|
80
|
+
SafeList::ALLOWED_SVG_PROPERTIES.include?(name) ||
|
81
|
+
SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first)
|
82
|
+
|
83
|
+
value = node[:children].map do |child|
|
84
|
+
case child[:node]
|
85
|
+
when :whitespace
|
86
|
+
nil
|
87
|
+
when :string
|
88
|
+
if child[:raw] =~ CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES
|
89
|
+
Crass::Parser.stringify(child)
|
90
|
+
else
|
91
|
+
nil
|
92
|
+
end
|
93
|
+
when :function
|
94
|
+
if SafeList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase)
|
95
|
+
Crass::Parser.stringify(child)
|
96
|
+
end
|
97
|
+
when :ident
|
98
|
+
keyword = child[:value]
|
99
|
+
if !SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first) ||
|
100
|
+
SafeList::ALLOWED_CSS_KEYWORDS.include?(keyword) ||
|
101
|
+
(keyword =~ CSS_KEYWORDISH)
|
102
|
+
keyword
|
103
|
+
end
|
104
|
+
else
|
105
|
+
child[:raw]
|
106
|
+
end
|
107
|
+
end.compact
|
108
|
+
|
109
|
+
next if value.empty?
|
110
|
+
value << CSS_IMPORTANT if node[:important]
|
111
|
+
propstring = format("%s:%s", name, value.join(" "))
|
112
|
+
sanitized_node = Crass.parse_properties(propstring).first
|
113
|
+
sanitized_tree << sanitized_node << CRASS_SEMICOLON
|
114
|
+
end
|
115
|
+
|
116
|
+
Crass::Parser.stringify(sanitized_tree)
|
117
|
+
end
|
118
|
+
|
119
|
+
def scrub_attribute_that_allows_local_ref(attr_node)
|
120
|
+
return unless attr_node.value
|
121
|
+
|
122
|
+
nodes = Crass::Parser.new(attr_node.value).parse_component_values
|
123
|
+
|
124
|
+
values = nodes.map do |node|
|
125
|
+
case node[:node]
|
126
|
+
when :url
|
127
|
+
if node[:value].start_with?("#")
|
128
|
+
node[:raw]
|
129
|
+
else
|
130
|
+
nil
|
56
131
|
end
|
57
|
-
|
58
|
-
|
132
|
+
when :hash, :ident, :string
|
133
|
+
node[:raw]
|
134
|
+
else
|
135
|
+
nil
|
59
136
|
end
|
137
|
+
end.compact
|
138
|
+
|
139
|
+
attr_node.value = values.join(" ")
|
140
|
+
end
|
141
|
+
|
142
|
+
def scrub_uri_attribute(attr_node)
|
143
|
+
# this block lifted nearly verbatim from HTML5 sanitization
|
144
|
+
val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS, "").downcase
|
145
|
+
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && !SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
|
146
|
+
attr_node.remove
|
147
|
+
return true
|
148
|
+
elsif val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0] == "data"
|
149
|
+
# permit only allowed data mediatypes
|
150
|
+
mediatype = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[1]
|
151
|
+
mediatype, _ = mediatype.split(";")[0..1] if mediatype
|
152
|
+
if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
|
153
|
+
attr_node.remove
|
154
|
+
return true
|
155
|
+
end
|
156
|
+
end
|
157
|
+
false
|
158
|
+
end
|
159
|
+
|
160
|
+
#
|
161
|
+
# libxml2 >= 2.9.2 fails to escape comments within some attributes.
|
162
|
+
#
|
163
|
+
# see comments about CVE-2018-8048 within the tests for more information
|
164
|
+
#
|
165
|
+
def force_correct_attribute_escaping!(node)
|
166
|
+
return unless Nokogiri::VersionInfo.instance.libxml2?
|
167
|
+
|
168
|
+
node.attribute_nodes.each do |attr_node|
|
169
|
+
next unless LibxmlWorkarounds::BROKEN_ESCAPING_ATTRIBUTES.include?(attr_node.name)
|
170
|
+
|
171
|
+
tag_name = LibxmlWorkarounds::BROKEN_ESCAPING_ATTRIBUTES_QUALIFYING_TAG[attr_node.name]
|
172
|
+
next unless tag_name.nil? || tag_name == node.name
|
173
|
+
|
174
|
+
#
|
175
|
+
# this block is just like CGI.escape in Ruby 2.4, but
|
176
|
+
# only encodes space and double-quote, to mimic
|
177
|
+
# pre-2.9.2 behavior
|
178
|
+
#
|
179
|
+
encoding = attr_node.value.encoding
|
180
|
+
attr_node.value = attr_node.value.gsub(/[ "]/) do |m|
|
181
|
+
"%" + m.unpack("H2" * m.bytesize).join("%").upcase
|
182
|
+
end.force_encoding(encoding)
|
60
183
|
end
|
184
|
+
end
|
61
185
|
|
62
|
-
|
186
|
+
def cdata_needs_escaping?(node)
|
187
|
+
# Nokogiri's HTML4 parser on JRuby doesn't flag the child of a `style` or `script` tag as cdata, but it acts that way
|
188
|
+
node.cdata? || (Nokogiri.jruby? && node.text? && (node.parent.name == "style" || node.parent.name == "script"))
|
63
189
|
end
|
64
190
|
|
65
|
-
|
191
|
+
def cdata_escape(node)
|
192
|
+
escaped_text = escape_tags(node.text)
|
193
|
+
if Nokogiri.jruby?
|
194
|
+
node.document.create_text_node(escaped_text)
|
195
|
+
else
|
196
|
+
node.document.create_cdata(escaped_text)
|
197
|
+
end
|
198
|
+
end
|
66
199
|
|
200
|
+
TABLE_FOR_ESCAPE_HTML__ = {
|
201
|
+
'<' => '&lt;',
|
202
|
+
'>' => '&gt;',
|
203
|
+
'&' => '&amp;',
|
204
|
+
}
|
205
|
+
|
206
|
+
def escape_tags(string)
|
207
|
+
# modified version of CGI.escapeHTML from ruby 3.1
|
208
|
+
enc = string.encoding
|
209
|
+
unless enc.ascii_compatible?
|
210
|
+
if enc.dummy?
|
211
|
+
origenc = enc
|
212
|
+
enc = Encoding::Converter.asciicompat_encoding(enc)
|
213
|
+
string = enc ? string.encode(enc) : string.b
|
214
|
+
end
|
215
|
+
table = Hash[TABLE_FOR_ESCAPE_HTML__.map {|pair|pair.map {|s|s.encode(enc)}}]
|
216
|
+
string = string.gsub(/#{"[<>&]".encode(enc)}/, table)
|
217
|
+
string.encode!(origenc) if origenc
|
218
|
+
string
|
219
|
+
else
|
220
|
+
string = string.b
|
221
|
+
string.gsub!(/[<>&]/, TABLE_FOR_ESCAPE_HTML__)
|
222
|
+
string.force_encoding(enc)
|
223
|
+
end
|
224
|
+
end
|
225
|
+
end
|
67
226
|
end
|
68
227
|
end
|
69
228
|
end
|
70
|
-
|
data/lib/loofah/instance_methods.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
module Loofah
|
2
3
|
#
|
3
4
|
# Mixes +scrub!+ into Document, DocumentFragment, Node and NodeSet.
|
@@ -41,7 +42,7 @@ module Loofah
|
|
41
42
|
when Nokogiri::XML::Document
|
42
43
|
scrubber.traverse(root) if root
|
43
44
|
when Nokogiri::XML::DocumentFragment
|
44
|
-
children.
|
45
|
+
children.scrub! scrubber
|
45
46
|
else
|
46
47
|
scrubber.traverse(self)
|
47
48
|
end
|
@@ -91,29 +92,34 @@ module Loofah
|
|
91
92
|
# # decidedly not ok for browser:
|
92
93
|
# frag.text(:encode_special_chars => false) # => "<script>alert('EVIL');</script>"
|
93
94
|
#
|
94
|
-
def text(options={})
|
95
|
-
result = serialize_root
|
95
|
+
def text(options = {})
|
96
|
+
result = if serialize_root
|
97
|
+
serialize_root.children.reject(&:comment?).map(&:inner_text).join("")
|
98
|
+
else
|
99
|
+
""
|
100
|
+
end
|
96
101
|
if options[:encode_special_chars] == false
|
97
102
|
result # possibly dangerous if rendered in a browser
|
98
103
|
else
|
99
104
|
encode_special_chars result
|
100
105
|
end
|
101
106
|
end
|
107
|
+
|
102
108
|
alias :inner_text :text
|
103
|
-
alias :to_str
|
109
|
+
alias :to_str :text
|
104
110
|
|
105
111
|
#
|
106
112
|
# Returns a plain-text version of the markup contained by the
|
107
113
|
# fragment, with HTML entities encoded.
|
108
114
|
#
|
109
|
-
# This method is slower than #
|
110
|
-
# whitespace around block elements.
|
115
|
+
# This method is slower than #text, but is clever about
|
116
|
+
# whitespace around block elements and line break elements.
|
111
117
|
#
|
112
|
-
# Loofah.document("<h1>Title</h1><div>Content</div>").to_text
|
113
|
-
# # => "\nTitle\n\nContent\n"
|
118
|
+
# Loofah.document("<h1>Title</h1><div>Content<br>Next line</div>").to_text
|
119
|
+
# # => "\nTitle\n\nContent\nNext line\n"
|
114
120
|
#
|
115
|
-
def to_text(options={})
|
116
|
-
Loofah
|
121
|
+
def to_text(options = {})
|
122
|
+
Loofah.remove_extraneous_whitespace self.dup.scrub!(:newline_block_elements).text(options)
|
117
123
|
end
|
118
124
|
end
|
119
125
|
|
data/lib/loofah/metahelpers.rb
CHANGED
@@ -1,15 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
module Loofah
|
2
|
-
module MetaHelpers
|
3
|
-
def self.
|
4
|
-
|
5
|
-
|
6
|
-
next unless
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
CODE
|
3
|
+
module MetaHelpers # :nodoc:
|
4
|
+
def self.add_downcased_set_members_to_all_set_constants(mojule)
|
5
|
+
mojule.constants.each do |constant_sym|
|
6
|
+
constant = mojule.const_get constant_sym
|
7
|
+
next unless Set === constant
|
8
|
+
constant.dup.each do |member|
|
9
|
+
constant.add member.downcase
|
10
|
+
end
|
11
11
|
end
|
12
|
-
hashed_module
|
13
12
|
end
|
14
13
|
end
|
15
14
|
end
|
data/lib/loofah/scrubber.rb
CHANGED
@@ -1,8 +1,9 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
module Loofah
|
2
3
|
#
|
3
4
|
# A RuntimeError raised when Loofah could not find an appropriate scrubber.
|
4
5
|
#
|
5
|
-
class ScrubberNotFound < RuntimeError
|
6
|
+
class ScrubberNotFound < RuntimeError; end
|
6
7
|
|
7
8
|
#
|
8
9
|
# A Scrubber wraps up a block (or method) that is run on an HTML node (element):
|
@@ -36,7 +37,7 @@ module Loofah
|
|
36
37
|
CONTINUE = Object.new.freeze
|
37
38
|
|
38
39
|
# Top-down Scrubbers may return STOP to indicate that the subtree should not be traversed.
|
39
|
-
STOP
|
40
|
+
STOP = Object.new.freeze
|
40
41
|
|
41
42
|
# When a scrubber is initialized, the :direction may be specified
|
42
43
|
# as :top_down (the default) or :bottom_up.
|
@@ -64,7 +65,7 @@ module Loofah
|
|
64
65
|
def initialize(options = {}, &block)
|
65
66
|
direction = options[:direction] || :top_down
|
66
67
|
unless [:top_down, :bottom_up].include?(direction)
|
67
|
-
raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
|
68
|
+
raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
|
68
69
|
end
|
69
70
|
@direction, @block = direction, block
|
70
71
|
end
|
@@ -86,16 +87,31 @@ module Loofah
|
|
86
87
|
raise ScrubberNotFound, "No scrub method has been defined on #{self.class.to_s}"
|
87
88
|
end
|
88
89
|
|
90
|
+
#
|
91
|
+
# If the attribute is not set, add it
|
92
|
+
# If the attribute is set, don't overwrite the existing value
|
93
|
+
#
|
94
|
+
def append_attribute(node, attribute, value)
|
95
|
+
current_value = node.get_attribute(attribute) || ""
|
96
|
+
current_values = current_value.split(/\s+/)
|
97
|
+
updated_value = current_values | [value]
|
98
|
+
node.set_attribute(attribute, updated_value.join(" "))
|
99
|
+
end
|
100
|
+
|
89
101
|
private
|
90
102
|
|
91
103
|
def html5lib_sanitize(node)
|
92
104
|
case node.type
|
93
105
|
when Nokogiri::XML::Node::ELEMENT_NODE
|
94
|
-
if HTML5::
|
106
|
+
if HTML5::Scrub.allowed_element? node.name
|
95
107
|
HTML5::Scrub.scrub_attributes node
|
96
108
|
return Scrubber::CONTINUE
|
97
109
|
end
|
98
110
|
when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE
|
111
|
+
if HTML5::Scrub.cdata_needs_escaping?(node)
|
112
|
+
node.before(HTML5::Scrub.cdata_escape(node))
|
113
|
+
return Scrubber::STOP
|
114
|
+
end
|
99
115
|
return Scrubber::CONTINUE
|
100
116
|
end
|
101
117
|
Scrubber::STOP
|
@@ -107,11 +123,11 @@ module Loofah
|
|
107
123
|
else
|
108
124
|
return if scrub(node) == STOP
|
109
125
|
end
|
110
|
-
node.children.each {|j| traverse_conditionally_top_down(j)}
|
126
|
+
node.children.each { |j| traverse_conditionally_top_down(j) }
|
111
127
|
end
|
112
128
|
|
113
129
|
def traverse_conditionally_bottom_up(node)
|
114
|
-
node.children.each {|j| traverse_conditionally_bottom_up(j)}
|
130
|
+
node.children.each { |j| traverse_conditionally_bottom_up(j) }
|
115
131
|
if block
|
116
132
|
block.call(node)
|
117
133
|
else
|
data/lib/loofah/scrubbers.rb
CHANGED
@@ -1,7 +1,8 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
module Loofah
|
2
3
|
#
|
3
4
|
# Loofah provides some built-in scrubbers for sanitizing with
|
4
|
-
# HTML5lib's
|
5
|
+
# HTML5lib's safelist and for accomplishing some common
|
5
6
|
# transformation tasks.
|
6
7
|
#
|
7
8
|
#
|
@@ -58,6 +59,30 @@ module Loofah
|
|
58
59
|
# Loofah.fragment(link_farmers_markup).scrub!(:nofollow)
|
59
60
|
# => "ohai! <a href='http://www.myswarmysite.com/' rel="nofollow">I like your blog post</a>"
|
60
61
|
#
|
62
|
+
#
|
63
|
+
# === Loofah::Scrubbers::NoOpener / scrub!(:noopener)
|
64
|
+
#
|
65
|
+
# +:noopener+ adds a rel="noopener" attribute to all links
|
66
|
+
#
|
67
|
+
# link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
|
68
|
+
# Loofah.fragment(link_farmers_markup).scrub!(:noopener)
|
69
|
+
# => "ohai! <a href='http://www.myswarmysite.com/' rel="noopener">I like your blog post</a>"
|
70
|
+
#
|
71
|
+
#
|
72
|
+
# === Loofah::Scrubbers::Unprintable / scrub!(:unprintable)
|
73
|
+
#
|
74
|
+
# +:unprintable+ removes unprintable Unicode characters.
|
75
|
+
#
|
76
|
+
# markup = "<p>Some text with an unprintable character at the end\u2028</p>"
|
77
|
+
# Loofah.fragment(markup).scrub!(:unprintable)
|
78
|
+
# => "<p>Some text with an unprintable character at the end</p>"
|
79
|
+
#
|
80
|
+
# You may not be able to see the unprintable character in the above example, but there is a
|
81
|
+
# U+2028 character right before the closing </p> tag. These characters can cause issues if
|
82
|
+
# the content is ever parsed by JavaScript - more information here:
|
83
|
+
#
|
84
|
+
# http://timelessrepo.com/json-isnt-a-javascript-subset
|
85
|
+
#
|
61
86
|
module Scrubbers
|
62
87
|
#
|
63
88
|
# === scrub!(:strip)
|
@@ -75,8 +100,9 @@ module Loofah
|
|
75
100
|
|
76
101
|
def scrub(node)
|
77
102
|
return CONTINUE if html5lib_sanitize(node) == CONTINUE
|
78
|
-
node.before
|
103
|
+
node.before(node.children)
|
79
104
|
node.remove
|
105
|
+
return STOP
|
80
106
|
end
|
81
107
|
end
|
82
108
|
|
@@ -117,8 +143,7 @@ module Loofah
|
|
117
143
|
|
118
144
|
def scrub(node)
|
119
145
|
return CONTINUE if html5lib_sanitize(node) == CONTINUE
|
120
|
-
|
121
|
-
node.add_next_sibling replacement_killer
|
146
|
+
node.add_next_sibling Nokogiri::XML::Text.new(node.to_s, node.document)
|
122
147
|
node.remove
|
123
148
|
return STOP
|
124
149
|
end
|
@@ -150,7 +175,7 @@ module Loofah
|
|
150
175
|
def scrub(node)
|
151
176
|
case node.type
|
152
177
|
when Nokogiri::XML::Node::ELEMENT_NODE
|
153
|
-
if HTML5::
|
178
|
+
if HTML5::Scrub.allowed_element? node.name
|
154
179
|
node.attributes.each { |attr| node.remove_attribute(attr.first) }
|
155
180
|
return CONTINUE if node.namespaces.empty?
|
156
181
|
end
|
@@ -177,9 +202,30 @@ module Loofah
|
|
177
202
|
end
|
178
203
|
|
179
204
|
def scrub(node)
|
180
|
-
return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name ==
|
181
|
-
node
|
182
|
-
return STOP
|
205
|
+
return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
|
206
|
+
append_attribute(node, "rel", "nofollow")
|
207
|
+
return STOP
|
208
|
+
end
|
209
|
+
end
|
210
|
+
|
211
|
+
#
|
212
|
+
# === scrub!(:noopener)
|
213
|
+
#
|
214
|
+
# +:noopener+ adds a rel="noopener" attribute to all links
|
215
|
+
#
|
216
|
+
# link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
|
217
|
+
# Loofah.fragment(link_farmers_markup).scrub!(:noopener)
|
218
|
+
# => "ohai! <a href='http://www.myswarmysite.com/' rel="noopener">I like your blog post</a>"
|
219
|
+
#
|
220
|
+
class NoOpener < Scrubber
|
221
|
+
def initialize
|
222
|
+
@direction = :top_down
|
223
|
+
end
|
224
|
+
|
225
|
+
def scrub(node)
|
226
|
+
return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
|
227
|
+
append_attribute(node, "rel", "noopener")
|
228
|
+
return STOP
|
183
229
|
end
|
184
230
|
end
|
185
231
|
|
@@ -190,23 +236,57 @@ module Loofah
|
|
190
236
|
end
|
191
237
|
|
192
238
|
def scrub(node)
|
193
|
-
return CONTINUE unless Loofah::
|
194
|
-
|
195
|
-
|
239
|
+
return CONTINUE unless Loofah::Elements::LINEBREAKERS.include?(node.name)
|
240
|
+
replacement = if Loofah::Elements::INLINE_LINE_BREAK.include?(node.name)
|
241
|
+
"\n"
|
242
|
+
else
|
243
|
+
"\n#{node.content}\n"
|
244
|
+
end
|
245
|
+
node.add_next_sibling Nokogiri::XML::Text.new(replacement, node.document)
|
196
246
|
node.remove
|
197
247
|
end
|
198
248
|
end
|
199
249
|
|
250
|
+
#
|
251
|
+
# === scrub!(:unprintable)
|
252
|
+
#
|
253
|
+
# +:unprintable+ removes unprintable Unicode characters.
|
254
|
+
#
|
255
|
+
# markup = "<p>Some text with an unprintable character at the end\u2028</p>"
|
256
|
+
# Loofah.fragment(markup).scrub!(:unprintable)
|
257
|
+
# => "<p>Some text with an unprintable character at the end</p>"
|
258
|
+
#
|
259
|
+
# You may not be able to see the unprintable character in the above example, but there is a
|
260
|
+
# U+2028 character right before the closing </p> tag. These characters can cause issues if
|
261
|
+
# the content is ever parsed by JavaScript - more information here:
|
262
|
+
#
|
263
|
+
# http://timelessrepo.com/json-isnt-a-javascript-subset
|
264
|
+
#
|
265
|
+
class Unprintable < Scrubber
|
266
|
+
def initialize
|
267
|
+
@direction = :top_down
|
268
|
+
end
|
269
|
+
|
270
|
+
def scrub(node)
|
271
|
+
if node.type == Nokogiri::XML::Node::TEXT_NODE || node.type == Nokogiri::XML::Node::CDATA_SECTION_NODE
|
272
|
+
node.content = node.content.gsub(/\u2028|\u2029/, "")
|
273
|
+
end
|
274
|
+
CONTINUE
|
275
|
+
end
|
276
|
+
end
|
277
|
+
|
200
278
|
#
|
201
279
|
# A hash that maps a symbol (like +:prune+) to the appropriate Scrubber (Loofah::Scrubbers::Prune).
|
202
280
|
#
|
203
281
|
MAP = {
|
204
|
-
:escape
|
205
|
-
:prune
|
282
|
+
:escape => Escape,
|
283
|
+
:prune => Prune,
|
206
284
|
:whitewash => Whitewash,
|
207
|
-
:strip
|
208
|
-
:nofollow
|
209
|
-
:
|
285
|
+
:strip => Strip,
|
286
|
+
:nofollow => NoFollow,
|
287
|
+
:noopener => NoOpener,
|
288
|
+
:newline_block_elements => NewlineBlockElements,
|
289
|
+
:unprintable => Unprintable,
|
210
290
|
}
|
211
291
|
|
212
292
|
#
|
data/lib/loofah/xml/document.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
module Loofah
|
2
3
|
module XML # :nodoc:
|
3
4
|
#
|
@@ -12,8 +13,10 @@ module Loofah
|
|
12
13
|
# constructor. Applications should use Loofah.fragment to
|
13
14
|
# parse a fragment.
|
14
15
|
#
|
15
|
-
def parse
|
16
|
-
|
16
|
+
def parse(tags)
|
17
|
+
doc = Loofah::XML::Document.new
|
18
|
+
doc.encoding = tags.encoding.name if tags.respond_to?(:encoding)
|
19
|
+
self.new(doc, tags)
|
17
20
|
end
|
18
21
|
end
|
19
22
|
end
|