loofah 1.0.0 → 2.19.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +489 -0
- data/MIT-LICENSE.txt +3 -1
- data/README.md +364 -0
- data/SECURITY.md +18 -0
- data/lib/loofah/elements.rb +88 -11
- data/lib/loofah/helpers.rb +76 -2
- data/lib/loofah/html/document.rb +1 -0
- data/lib/loofah/html/document_fragment.rb +9 -2
- data/lib/loofah/html5/libxml2_workarounds.rb +27 -0
- data/lib/loofah/html5/safelist.rb +1042 -0
- data/lib/loofah/html5/scrub.rb +198 -40
- data/lib/loofah/instance_methods.rb +16 -10
- data/lib/loofah/metahelpers.rb +9 -10
- data/lib/loofah/scrubber.rb +22 -6
- data/lib/loofah/scrubbers.rb +96 -16
- data/lib/loofah/version.rb +5 -0
- data/lib/loofah/xml/document.rb +1 -0
- data/lib/loofah/xml/document_fragment.rb +5 -2
- data/lib/loofah.rb +38 -25
- metadata +159 -172
- data/CHANGELOG.rdoc +0 -134
- data/Gemfile +0 -1
- data/Manifest.txt +0 -34
- data/README.rdoc +0 -312
- data/Rakefile +0 -53
- data/benchmark/benchmark.rb +0 -149
- data/benchmark/fragment.html +0 -96
- data/benchmark/helper.rb +0 -73
- data/benchmark/www.slashdot.com.html +0 -2560
- data/lib/loofah/html5/whitelist.rb +0 -168
- data/test/helper.rb +0 -7
- data/test/html5/test_sanitizer.rb +0 -248
- data/test/integration/test_ad_hoc.rb +0 -176
- data/test/integration/test_helpers.rb +0 -33
- data/test/integration/test_html.rb +0 -51
- data/test/integration/test_scrubbers.rb +0 -331
- data/test/integration/test_xml.rb +0 -55
- data/test/unit/test_api.rb +0 -138
- data/test/unit/test_helpers.rb +0 -27
- data/test/unit/test_scrubber.rb +0 -229
- data/test/unit/test_scrubbers.rb +0 -14
data/lib/loofah/html5/scrub.rb
CHANGED
@@ -1,70 +1,228 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
require "cgi"
|
3
|
+
require "crass"
|
2
4
|
|
3
5
|
module Loofah
|
4
6
|
module HTML5 # :nodoc:
|
5
7
|
module Scrub
|
8
|
+
CONTROL_CHARACTERS = /[`\u0000-\u0020\u007f\u0080-\u0101]/
|
9
|
+
CSS_KEYWORDISH = /\A(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,3}\.?\d{0,10}(ch|cm|r?em|ex|in|lh|mm|pc|pt|px|Q|vmax|vmin|vw|vh|%|,|\))?)\z/
|
10
|
+
CRASS_SEMICOLON = { node: :semicolon, raw: ";" }
|
11
|
+
CSS_IMPORTANT = '!important'
|
12
|
+
CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES = /\A(["'])?[^"']+\1\z/
|
13
|
+
DATA_ATTRIBUTE_NAME = /\Adata-[\w-]+\z/
|
6
14
|
|
7
15
|
class << self
|
16
|
+
def allowed_element?(element_name)
|
17
|
+
::Loofah::HTML5::SafeList::ALLOWED_ELEMENTS_WITH_LIBXML2.include?(element_name)
|
18
|
+
end
|
8
19
|
|
9
20
|
# alternative implementation of the html5lib attribute scrubbing algorithm
|
10
21
|
def scrub_attributes(node)
|
11
22
|
node.attribute_nodes.each do |attr_node|
|
12
23
|
attr_name = if attr_node.namespace
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
end
|
17
|
-
attr_node.remove unless HashedWhiteList::ALLOWED_ATTRIBUTES[attr_name]
|
18
|
-
if HashedWhiteList::ATTR_VAL_IS_URI[attr_name]
|
19
|
-
# this block lifted nearly verbatim from HTML5 sanitization
|
20
|
-
val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(/`|[\000-\040\177\s]+|\302[\200-\240]/,'').downcase
|
21
|
-
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and HashedWhiteList::ALLOWED_PROTOCOLS[val_unescaped.split(':')[0]].nil?
|
22
|
-
attr_node.remove
|
23
|
-
end
|
24
|
+
"#{attr_node.namespace.prefix}:#{attr_node.node_name}"
|
25
|
+
else
|
26
|
+
attr_node.node_name
|
24
27
|
end
|
25
|
-
|
26
|
-
|
28
|
+
|
29
|
+
if attr_name =~ DATA_ATTRIBUTE_NAME
|
30
|
+
next
|
27
31
|
end
|
28
|
-
|
32
|
+
|
33
|
+
unless SafeList::ALLOWED_ATTRIBUTES.include?(attr_name)
|
29
34
|
attr_node.remove
|
35
|
+
next
|
36
|
+
end
|
37
|
+
|
38
|
+
if SafeList::ATTR_VAL_IS_URI.include?(attr_name)
|
39
|
+
next if scrub_uri_attribute(attr_node)
|
40
|
+
end
|
41
|
+
|
42
|
+
if SafeList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
|
43
|
+
scrub_attribute_that_allows_local_ref(attr_node)
|
44
|
+
end
|
45
|
+
|
46
|
+
if SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) && attr_name == "xlink:href" && attr_node.value =~ /^\s*[^#\s].*/m
|
47
|
+
attr_node.remove
|
48
|
+
next
|
30
49
|
end
|
31
50
|
end
|
32
|
-
|
33
|
-
|
51
|
+
|
52
|
+
scrub_css_attribute(node)
|
53
|
+
|
54
|
+
node.attribute_nodes.each do |attr_node|
|
55
|
+
if attr_node.value !~ /[^[:space:]]/ && attr_node.name !~ DATA_ATTRIBUTE_NAME
|
56
|
+
node.remove_attribute(attr_node.name)
|
57
|
+
end
|
34
58
|
end
|
59
|
+
|
60
|
+
force_correct_attribute_escaping!(node)
|
61
|
+
end
|
62
|
+
|
63
|
+
def scrub_css_attribute(node)
|
64
|
+
style = node.attributes["style"]
|
65
|
+
style.value = scrub_css(style.value) if style
|
35
66
|
end
|
36
67
|
|
37
|
-
# lifted nearly verbatim from html5lib
|
38
68
|
def scrub_css(style)
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
69
|
+
style_tree = Crass.parse_properties(style)
|
70
|
+
sanitized_tree = []
|
71
|
+
|
72
|
+
style_tree.each do |node|
|
73
|
+
next unless node[:node] == :property
|
74
|
+
next if node[:children].any? do |child|
|
75
|
+
[:url, :bad_url].include?(child[:node])
|
76
|
+
end
|
77
|
+
|
78
|
+
name = node[:name].downcase
|
79
|
+
next unless SafeList::ALLOWED_CSS_PROPERTIES.include?(name) ||
|
80
|
+
SafeList::ALLOWED_SVG_PROPERTIES.include?(name) ||
|
81
|
+
SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first)
|
82
|
+
|
83
|
+
value = node[:children].map do |child|
|
84
|
+
case child[:node]
|
85
|
+
when :whitespace
|
86
|
+
nil
|
87
|
+
when :string
|
88
|
+
if child[:raw] =~ CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES
|
89
|
+
Crass::Parser.stringify(child)
|
90
|
+
else
|
91
|
+
nil
|
92
|
+
end
|
93
|
+
when :function
|
94
|
+
if SafeList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase)
|
95
|
+
Crass::Parser.stringify(child)
|
96
|
+
end
|
97
|
+
when :ident
|
98
|
+
keyword = child[:value]
|
99
|
+
if !SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first) ||
|
100
|
+
SafeList::ALLOWED_CSS_KEYWORDS.include?(keyword) ||
|
101
|
+
(keyword =~ CSS_KEYWORDISH)
|
102
|
+
keyword
|
103
|
+
end
|
104
|
+
else
|
105
|
+
child[:raw]
|
106
|
+
end
|
107
|
+
end.compact
|
108
|
+
|
109
|
+
next if value.empty?
|
110
|
+
value << CSS_IMPORTANT if node[:important]
|
111
|
+
propstring = format("%s:%s", name, value.join(" "))
|
112
|
+
sanitized_node = Crass.parse_properties(propstring).first
|
113
|
+
sanitized_tree << sanitized_node << CRASS_SEMICOLON
|
114
|
+
end
|
115
|
+
|
116
|
+
Crass::Parser.stringify(sanitized_tree)
|
117
|
+
end
|
118
|
+
|
119
|
+
def scrub_attribute_that_allows_local_ref(attr_node)
|
120
|
+
return unless attr_node.value
|
121
|
+
|
122
|
+
nodes = Crass::Parser.new(attr_node.value).parse_component_values
|
123
|
+
|
124
|
+
values = nodes.map do |node|
|
125
|
+
case node[:node]
|
126
|
+
when :url
|
127
|
+
if node[:value].start_with?("#")
|
128
|
+
node[:raw]
|
129
|
+
else
|
130
|
+
nil
|
56
131
|
end
|
57
|
-
|
58
|
-
|
132
|
+
when :hash, :ident, :string
|
133
|
+
node[:raw]
|
134
|
+
else
|
135
|
+
nil
|
59
136
|
end
|
137
|
+
end.compact
|
138
|
+
|
139
|
+
attr_node.value = values.join(" ")
|
140
|
+
end
|
141
|
+
|
142
|
+
def scrub_uri_attribute(attr_node)
|
143
|
+
# this block lifted nearly verbatim from HTML5 sanitization
|
144
|
+
val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS, "").downcase
|
145
|
+
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && !SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
|
146
|
+
attr_node.remove
|
147
|
+
return true
|
148
|
+
elsif val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0] == "data"
|
149
|
+
# permit only allowed data mediatypes
|
150
|
+
mediatype = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[1]
|
151
|
+
mediatype, _ = mediatype.split(";")[0..1] if mediatype
|
152
|
+
if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
|
153
|
+
attr_node.remove
|
154
|
+
return true
|
155
|
+
end
|
156
|
+
end
|
157
|
+
false
|
158
|
+
end
|
159
|
+
|
160
|
+
#
|
161
|
+
# libxml2 >= 2.9.2 fails to escape comments within some attributes.
|
162
|
+
#
|
163
|
+
# see comments about CVE-2018-8048 within the tests for more information
|
164
|
+
#
|
165
|
+
def force_correct_attribute_escaping!(node)
|
166
|
+
return unless Nokogiri::VersionInfo.instance.libxml2?
|
167
|
+
|
168
|
+
node.attribute_nodes.each do |attr_node|
|
169
|
+
next unless LibxmlWorkarounds::BROKEN_ESCAPING_ATTRIBUTES.include?(attr_node.name)
|
170
|
+
|
171
|
+
tag_name = LibxmlWorkarounds::BROKEN_ESCAPING_ATTRIBUTES_QUALIFYING_TAG[attr_node.name]
|
172
|
+
next unless tag_name.nil? || tag_name == node.name
|
173
|
+
|
174
|
+
#
|
175
|
+
# this block is just like CGI.escape in Ruby 2.4, but
|
176
|
+
# only encodes space and double-quote, to mimic
|
177
|
+
# pre-2.9.2 behavior
|
178
|
+
#
|
179
|
+
encoding = attr_node.value.encoding
|
180
|
+
attr_node.value = attr_node.value.gsub(/[ "]/) do |m|
|
181
|
+
"%" + m.unpack("H2" * m.bytesize).join("%").upcase
|
182
|
+
end.force_encoding(encoding)
|
60
183
|
end
|
184
|
+
end
|
61
185
|
|
62
|
-
|
186
|
+
def cdata_needs_escaping?(node)
|
187
|
+
# Nokogiri's HTML4 parser on JRuby doesn't flag the child of a `style` or `script` tag as cdata, but it acts that way
|
188
|
+
node.cdata? || (Nokogiri.jruby? && node.text? && (node.parent.name == "style" || node.parent.name == "script"))
|
63
189
|
end
|
64
190
|
|
65
|
-
|
191
|
+
def cdata_escape(node)
|
192
|
+
escaped_text = escape_tags(node.text)
|
193
|
+
if Nokogiri.jruby?
|
194
|
+
node.document.create_text_node(escaped_text)
|
195
|
+
else
|
196
|
+
node.document.create_cdata(escaped_text)
|
197
|
+
end
|
198
|
+
end
|
66
199
|
|
200
|
+
TABLE_FOR_ESCAPE_HTML__ = {
|
201
|
+
'<' => '&lt;',
|
202
|
+
'>' => '&gt;',
|
203
|
+
'&' => '&amp;',
|
204
|
+
}
|
205
|
+
|
206
|
+
def escape_tags(string)
|
207
|
+
# modified version of CGI.escapeHTML from ruby 3.1
|
208
|
+
enc = string.encoding
|
209
|
+
unless enc.ascii_compatible?
|
210
|
+
if enc.dummy?
|
211
|
+
origenc = enc
|
212
|
+
enc = Encoding::Converter.asciicompat_encoding(enc)
|
213
|
+
string = enc ? string.encode(enc) : string.b
|
214
|
+
end
|
215
|
+
table = Hash[TABLE_FOR_ESCAPE_HTML__.map {|pair|pair.map {|s|s.encode(enc)}}]
|
216
|
+
string = string.gsub(/#{"[<>&]".encode(enc)}/, table)
|
217
|
+
string.encode!(origenc) if origenc
|
218
|
+
string
|
219
|
+
else
|
220
|
+
string = string.b
|
221
|
+
string.gsub!(/[<>&]/, TABLE_FOR_ESCAPE_HTML__)
|
222
|
+
string.force_encoding(enc)
|
223
|
+
end
|
224
|
+
end
|
225
|
+
end
|
67
226
|
end
|
68
227
|
end
|
69
228
|
end
|
70
|
-
|
data/lib/loofah/instance_methods.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
module Loofah
|
2
3
|
#
|
3
4
|
# Mixes +scrub!+ into Document, DocumentFragment, Node and NodeSet.
|
@@ -41,7 +42,7 @@ module Loofah
|
|
41
42
|
when Nokogiri::XML::Document
|
42
43
|
scrubber.traverse(root) if root
|
43
44
|
when Nokogiri::XML::DocumentFragment
|
44
|
-
children.
|
45
|
+
children.scrub! scrubber
|
45
46
|
else
|
46
47
|
scrubber.traverse(self)
|
47
48
|
end
|
@@ -91,29 +92,34 @@ module Loofah
|
|
91
92
|
# # decidedly not ok for browser:
|
92
93
|
# frag.text(:encode_special_chars => false) # => "<script>alert('EVIL');</script>"
|
93
94
|
#
|
94
|
-
def text(options={})
|
95
|
-
result = serialize_root
|
95
|
+
def text(options = {})
|
96
|
+
result = if serialize_root
|
97
|
+
serialize_root.children.reject(&:comment?).map(&:inner_text).join("")
|
98
|
+
else
|
99
|
+
""
|
100
|
+
end
|
96
101
|
if options[:encode_special_chars] == false
|
97
102
|
result # possibly dangerous if rendered in a browser
|
98
103
|
else
|
99
104
|
encode_special_chars result
|
100
105
|
end
|
101
106
|
end
|
107
|
+
|
102
108
|
alias :inner_text :text
|
103
|
-
alias :to_str
|
109
|
+
alias :to_str :text
|
104
110
|
|
105
111
|
#
|
106
112
|
# Returns a plain-text version of the markup contained by the
|
107
113
|
# fragment, with HTML entities encoded.
|
108
114
|
#
|
109
|
-
# This method is slower than #
|
110
|
-
# whitespace around block elements.
|
115
|
+
# This method is slower than #text, but is clever about
|
116
|
+
# whitespace around block elements and line break elements.
|
111
117
|
#
|
112
|
-
# Loofah.document("<h1>Title</h1><div>Content</div>").to_text
|
113
|
-
# # => "\nTitle\n\nContent\n"
|
118
|
+
# Loofah.document("<h1>Title</h1><div>Content<br>Next line</div>").to_text
|
119
|
+
# # => "\nTitle\n\nContent\nNext line\n"
|
114
120
|
#
|
115
|
-
def to_text(options={})
|
116
|
-
Loofah
|
121
|
+
def to_text(options = {})
|
122
|
+
Loofah.remove_extraneous_whitespace self.dup.scrub!(:newline_block_elements).text(options)
|
117
123
|
end
|
118
124
|
end
|
119
125
|
|
data/lib/loofah/metahelpers.rb
CHANGED
@@ -1,15 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
module Loofah
|
2
|
-
module MetaHelpers
|
3
|
-
def self.
|
4
|
-
|
5
|
-
|
6
|
-
next unless
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
CODE
|
3
|
+
module MetaHelpers # :nodoc:
|
4
|
+
def self.add_downcased_set_members_to_all_set_constants(mojule)
|
5
|
+
mojule.constants.each do |constant_sym|
|
6
|
+
constant = mojule.const_get constant_sym
|
7
|
+
next unless Set === constant
|
8
|
+
constant.dup.each do |member|
|
9
|
+
constant.add member.downcase
|
10
|
+
end
|
11
11
|
end
|
12
|
-
hashed_module
|
13
12
|
end
|
14
13
|
end
|
15
14
|
end
|
data/lib/loofah/scrubber.rb
CHANGED
@@ -1,8 +1,9 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
module Loofah
|
2
3
|
#
|
3
4
|
# A RuntimeError raised when Loofah could not find an appropriate scrubber.
|
4
5
|
#
|
5
|
-
class ScrubberNotFound < RuntimeError
|
6
|
+
class ScrubberNotFound < RuntimeError; end
|
6
7
|
|
7
8
|
#
|
8
9
|
# A Scrubber wraps up a block (or method) that is run on an HTML node (element):
|
@@ -36,7 +37,7 @@ module Loofah
|
|
36
37
|
CONTINUE = Object.new.freeze
|
37
38
|
|
38
39
|
# Top-down Scrubbers may return STOP to indicate that the subtree should not be traversed.
|
39
|
-
STOP
|
40
|
+
STOP = Object.new.freeze
|
40
41
|
|
41
42
|
# When a scrubber is initialized, the :direction may be specified
|
42
43
|
# as :top_down (the default) or :bottom_up.
|
@@ -64,7 +65,7 @@ module Loofah
|
|
64
65
|
def initialize(options = {}, &block)
|
65
66
|
direction = options[:direction] || :top_down
|
66
67
|
unless [:top_down, :bottom_up].include?(direction)
|
67
|
-
raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
|
68
|
+
raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
|
68
69
|
end
|
69
70
|
@direction, @block = direction, block
|
70
71
|
end
|
@@ -86,16 +87,31 @@ module Loofah
|
|
86
87
|
raise ScrubberNotFound, "No scrub method has been defined on #{self.class.to_s}"
|
87
88
|
end
|
88
89
|
|
90
|
+
#
|
91
|
+
# If the attribute is not set, add it
|
92
|
+
# If the attribute is set, don't overwrite the existing value
|
93
|
+
#
|
94
|
+
def append_attribute(node, attribute, value)
|
95
|
+
current_value = node.get_attribute(attribute) || ""
|
96
|
+
current_values = current_value.split(/\s+/)
|
97
|
+
updated_value = current_values | [value]
|
98
|
+
node.set_attribute(attribute, updated_value.join(" "))
|
99
|
+
end
|
100
|
+
|
89
101
|
private
|
90
102
|
|
91
103
|
def html5lib_sanitize(node)
|
92
104
|
case node.type
|
93
105
|
when Nokogiri::XML::Node::ELEMENT_NODE
|
94
|
-
if HTML5::
|
106
|
+
if HTML5::Scrub.allowed_element? node.name
|
95
107
|
HTML5::Scrub.scrub_attributes node
|
96
108
|
return Scrubber::CONTINUE
|
97
109
|
end
|
98
110
|
when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE
|
111
|
+
if HTML5::Scrub.cdata_needs_escaping?(node)
|
112
|
+
node.before(HTML5::Scrub.cdata_escape(node))
|
113
|
+
return Scrubber::STOP
|
114
|
+
end
|
99
115
|
return Scrubber::CONTINUE
|
100
116
|
end
|
101
117
|
Scrubber::STOP
|
@@ -107,11 +123,11 @@ module Loofah
|
|
107
123
|
else
|
108
124
|
return if scrub(node) == STOP
|
109
125
|
end
|
110
|
-
node.children.each {|j| traverse_conditionally_top_down(j)}
|
126
|
+
node.children.each { |j| traverse_conditionally_top_down(j) }
|
111
127
|
end
|
112
128
|
|
113
129
|
def traverse_conditionally_bottom_up(node)
|
114
|
-
node.children.each {|j| traverse_conditionally_bottom_up(j)}
|
130
|
+
node.children.each { |j| traverse_conditionally_bottom_up(j) }
|
115
131
|
if block
|
116
132
|
block.call(node)
|
117
133
|
else
|
data/lib/loofah/scrubbers.rb
CHANGED
@@ -1,7 +1,8 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
module Loofah
|
2
3
|
#
|
3
4
|
# Loofah provides some built-in scrubbers for sanitizing with
|
4
|
-
# HTML5lib's
|
5
|
+
# HTML5lib's safelist and for accomplishing some common
|
5
6
|
# transformation tasks.
|
6
7
|
#
|
7
8
|
#
|
@@ -58,6 +59,30 @@ module Loofah
|
|
58
59
|
# Loofah.fragment(link_farmers_markup).scrub!(:nofollow)
|
59
60
|
# => "ohai! <a href='http://www.myswarmysite.com/' rel="nofollow">I like your blog post</a>"
|
60
61
|
#
|
62
|
+
#
|
63
|
+
# === Loofah::Scrubbers::NoOpener / scrub!(:noopener)
|
64
|
+
#
|
65
|
+
# +:noopener+ adds a rel="noopener" attribute to all links
|
66
|
+
#
|
67
|
+
# link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
|
68
|
+
# Loofah.fragment(link_farmers_markup).scrub!(:noopener)
|
69
|
+
# => "ohai! <a href='http://www.myswarmysite.com/' rel="noopener">I like your blog post</a>"
|
70
|
+
#
|
71
|
+
#
|
72
|
+
# === Loofah::Scrubbers::Unprintable / scrub!(:unprintable)
|
73
|
+
#
|
74
|
+
# +:unprintable+ removes unprintable Unicode characters.
|
75
|
+
#
|
76
|
+
# markup = "<p>Some text with an unprintable character at the end\u2028</p>"
|
77
|
+
# Loofah.fragment(markup).scrub!(:unprintable)
|
78
|
+
# => "<p>Some text with an unprintable character at the end</p>"
|
79
|
+
#
|
80
|
+
# You may not be able to see the unprintable character in the above example, but there is a
|
81
|
+
# U+2028 character right before the closing </p> tag. These characters can cause issues if
|
82
|
+
# the content is ever parsed by JavaScript - more information here:
|
83
|
+
#
|
84
|
+
# http://timelessrepo.com/json-isnt-a-javascript-subset
|
85
|
+
#
|
61
86
|
module Scrubbers
|
62
87
|
#
|
63
88
|
# === scrub!(:strip)
|
@@ -75,8 +100,9 @@ module Loofah
|
|
75
100
|
|
76
101
|
def scrub(node)
|
77
102
|
return CONTINUE if html5lib_sanitize(node) == CONTINUE
|
78
|
-
node.before
|
103
|
+
node.before(node.children)
|
79
104
|
node.remove
|
105
|
+
return STOP
|
80
106
|
end
|
81
107
|
end
|
82
108
|
|
@@ -117,8 +143,7 @@ module Loofah
|
|
117
143
|
|
118
144
|
def scrub(node)
|
119
145
|
return CONTINUE if html5lib_sanitize(node) == CONTINUE
|
120
|
-
|
121
|
-
node.add_next_sibling replacement_killer
|
146
|
+
node.add_next_sibling Nokogiri::XML::Text.new(node.to_s, node.document)
|
122
147
|
node.remove
|
123
148
|
return STOP
|
124
149
|
end
|
@@ -150,7 +175,7 @@ module Loofah
|
|
150
175
|
def scrub(node)
|
151
176
|
case node.type
|
152
177
|
when Nokogiri::XML::Node::ELEMENT_NODE
|
153
|
-
if HTML5::
|
178
|
+
if HTML5::Scrub.allowed_element? node.name
|
154
179
|
node.attributes.each { |attr| node.remove_attribute(attr.first) }
|
155
180
|
return CONTINUE if node.namespaces.empty?
|
156
181
|
end
|
@@ -177,9 +202,30 @@ module Loofah
|
|
177
202
|
end
|
178
203
|
|
179
204
|
def scrub(node)
|
180
|
-
return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name ==
|
181
|
-
node
|
182
|
-
return STOP
|
205
|
+
return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
|
206
|
+
append_attribute(node, "rel", "nofollow")
|
207
|
+
return STOP
|
208
|
+
end
|
209
|
+
end
|
210
|
+
|
211
|
+
#
|
212
|
+
# === scrub!(:noopener)
|
213
|
+
#
|
214
|
+
# +:noopener+ adds a rel="noopener" attribute to all links
|
215
|
+
#
|
216
|
+
# link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
|
217
|
+
# Loofah.fragment(link_farmers_markup).scrub!(:noopener)
|
218
|
+
# => "ohai! <a href='http://www.myswarmysite.com/' rel="noopener">I like your blog post</a>"
|
219
|
+
#
|
220
|
+
class NoOpener < Scrubber
|
221
|
+
def initialize
|
222
|
+
@direction = :top_down
|
223
|
+
end
|
224
|
+
|
225
|
+
def scrub(node)
|
226
|
+
return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
|
227
|
+
append_attribute(node, "rel", "noopener")
|
228
|
+
return STOP
|
183
229
|
end
|
184
230
|
end
|
185
231
|
|
@@ -190,23 +236,57 @@ module Loofah
|
|
190
236
|
end
|
191
237
|
|
192
238
|
def scrub(node)
|
193
|
-
return CONTINUE unless Loofah::
|
194
|
-
|
195
|
-
|
239
|
+
return CONTINUE unless Loofah::Elements::LINEBREAKERS.include?(node.name)
|
240
|
+
replacement = if Loofah::Elements::INLINE_LINE_BREAK.include?(node.name)
|
241
|
+
"\n"
|
242
|
+
else
|
243
|
+
"\n#{node.content}\n"
|
244
|
+
end
|
245
|
+
node.add_next_sibling Nokogiri::XML::Text.new(replacement, node.document)
|
196
246
|
node.remove
|
197
247
|
end
|
198
248
|
end
|
199
249
|
|
250
|
+
#
|
251
|
+
# === scrub!(:unprintable)
|
252
|
+
#
|
253
|
+
# +:unprintable+ removes unprintable Unicode characters.
|
254
|
+
#
|
255
|
+
# markup = "<p>Some text with an unprintable character at the end\u2028</p>"
|
256
|
+
# Loofah.fragment(markup).scrub!(:unprintable)
|
257
|
+
# => "<p>Some text with an unprintable character at the end</p>"
|
258
|
+
#
|
259
|
+
# You may not be able to see the unprintable character in the above example, but there is a
|
260
|
+
# U+2028 character right before the closing </p> tag. These characters can cause issues if
|
261
|
+
# the content is ever parsed by JavaScript - more information here:
|
262
|
+
#
|
263
|
+
# http://timelessrepo.com/json-isnt-a-javascript-subset
|
264
|
+
#
|
265
|
+
class Unprintable < Scrubber
|
266
|
+
def initialize
|
267
|
+
@direction = :top_down
|
268
|
+
end
|
269
|
+
|
270
|
+
def scrub(node)
|
271
|
+
if node.type == Nokogiri::XML::Node::TEXT_NODE || node.type == Nokogiri::XML::Node::CDATA_SECTION_NODE
|
272
|
+
node.content = node.content.gsub(/\u2028|\u2029/, "")
|
273
|
+
end
|
274
|
+
CONTINUE
|
275
|
+
end
|
276
|
+
end
|
277
|
+
|
200
278
|
#
|
201
279
|
# A hash that maps a symbol (like +:prune+) to the appropriate Scrubber (Loofah::Scrubbers::Prune).
|
202
280
|
#
|
203
281
|
MAP = {
|
204
|
-
:escape
|
205
|
-
:prune
|
282
|
+
:escape => Escape,
|
283
|
+
:prune => Prune,
|
206
284
|
:whitewash => Whitewash,
|
207
|
-
:strip
|
208
|
-
:nofollow
|
209
|
-
:
|
285
|
+
:strip => Strip,
|
286
|
+
:nofollow => NoFollow,
|
287
|
+
:noopener => NoOpener,
|
288
|
+
:newline_block_elements => NewlineBlockElements,
|
289
|
+
:unprintable => Unprintable,
|
210
290
|
}
|
211
291
|
|
212
292
|
#
|
data/lib/loofah/xml/document.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
module Loofah
|
2
3
|
module XML # :nodoc:
|
3
4
|
#
|
data/lib/loofah/xml/document_fragment.rb
CHANGED
@@ -12,8 +13,10 @@ module Loofah
|
|
12
13
|
# constructor. Applications should use Loofah.fragment to
|
13
14
|
# parse a fragment.
|
14
15
|
#
|
15
|
-
def parse
|
16
|
-
|
16
|
+
def parse(tags)
|
17
|
+
doc = Loofah::XML::Document.new
|
18
|
+
doc.encoding = tags.encoding.name if tags.respond_to?(:encoding)
|
19
|
+
self.new(doc, tags)
|
17
20
|
end
|
18
21
|
end
|
19
22
|
end
|