loofah 2.19.0 → 2.23.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +100 -0
- data/README.md +157 -114
- data/lib/loofah/concerns.rb +207 -0
- data/lib/loofah/elements.rb +78 -76
- data/lib/loofah/helpers.rb +21 -15
- data/lib/loofah/{html → html4}/document.rb +5 -7
- data/lib/loofah/html4/document_fragment.rb +15 -0
- data/lib/loofah/html5/document.rb +17 -0
- data/lib/loofah/html5/document_fragment.rb +15 -0
- data/lib/loofah/html5/libxml2_workarounds.rb +7 -6
- data/lib/loofah/html5/safelist.rb +940 -925
- data/lib/loofah/html5/scrub.rb +105 -34
- data/lib/loofah/metahelpers.rb +10 -6
- data/lib/loofah/scrubber.rb +14 -8
- data/lib/loofah/scrubbers.rb +121 -48
- data/lib/loofah/version.rb +2 -1
- data/lib/loofah/xml/document.rb +1 -0
- data/lib/loofah/xml/document_fragment.rb +2 -6
- data/lib/loofah.rb +116 -43
- metadata +20 -122
- data/lib/loofah/html/document_fragment.rb +0 -42
- data/lib/loofah/instance_methods.rb +0 -133
data/lib/loofah/html5/scrub.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
|
+
|
2
3
|
require "cgi"
|
3
4
|
require "crass"
|
4
5
|
|
@@ -6,9 +7,10 @@ module Loofah
|
|
6
7
|
module HTML5 # :nodoc:
|
7
8
|
module Scrub
|
8
9
|
CONTROL_CHARACTERS = /[`\u0000-\u0020\u007f\u0080-\u0101]/
|
9
|
-
CSS_KEYWORDISH = /\A(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,3}\.?\d{0,10}(ch|cm|r?em|ex|in|lh|mm|pc|pt|px|Q|vmax|vmin|vw|vh|%|,|\))?)\z/
|
10
|
+
CSS_KEYWORDISH = /\A(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,3}\.?\d{0,10}(ch|cm|r?em|ex|in|lh|mm|pc|pt|px|Q|vmax|vmin|vw|vh|%|,|\))?)\z/ # rubocop:disable Layout/LineLength
|
10
11
|
CRASS_SEMICOLON = { node: :semicolon, raw: ";" }
|
11
|
-
CSS_IMPORTANT =
|
12
|
+
CSS_IMPORTANT = "!important"
|
13
|
+
CSS_WHITESPACE = " "
|
12
14
|
CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES = /\A(["'])?[^"']+\1\z/
|
13
15
|
DATA_ATTRIBUTE_NAME = /\Adata-[\w-]+\z/
|
14
16
|
|
@@ -26,7 +28,7 @@ module Loofah
|
|
26
28
|
attr_node.node_name
|
27
29
|
end
|
28
30
|
|
29
|
-
if attr_name
|
31
|
+
if DATA_ATTRIBUTE_NAME.match?(attr_name)
|
30
32
|
next
|
31
33
|
end
|
32
34
|
|
@@ -36,28 +38,19 @@ module Loofah
|
|
36
38
|
end
|
37
39
|
|
38
40
|
if SafeList::ATTR_VAL_IS_URI.include?(attr_name)
|
39
|
-
|
40
|
-
val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS, "").downcase
|
41
|
-
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && !SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
|
42
|
-
attr_node.remove
|
43
|
-
next
|
44
|
-
elsif val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0] == "data"
|
45
|
-
# permit only allowed data mediatypes
|
46
|
-
mediatype = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[1]
|
47
|
-
mediatype, _ = mediatype.split(";")[0..1] if mediatype
|
48
|
-
if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
|
49
|
-
attr_node.remove
|
50
|
-
next
|
51
|
-
end
|
52
|
-
end
|
41
|
+
next if scrub_uri_attribute(attr_node)
|
53
42
|
end
|
43
|
+
|
54
44
|
if SafeList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
|
55
|
-
attr_node
|
56
|
-
end
|
57
|
-
if SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) && attr_name == "xlink:href" && attr_node.value =~ /^\s*[^#\s].*/m
|
58
|
-
attr_node.remove
|
59
|
-
next
|
45
|
+
scrub_attribute_that_allows_local_ref(attr_node)
|
60
46
|
end
|
47
|
+
|
48
|
+
next unless SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) &&
|
49
|
+
attr_name == "xlink:href" &&
|
50
|
+
attr_node.value =~ /^\s*[^#\s].*/m
|
51
|
+
|
52
|
+
attr_node.remove
|
53
|
+
next
|
61
54
|
end
|
62
55
|
|
63
56
|
scrub_css_attribute(node)
|
@@ -77,29 +70,28 @@ module Loofah
|
|
77
70
|
end
|
78
71
|
|
79
72
|
def scrub_css(style)
|
73
|
+
url_flags = [:url, :bad_url]
|
80
74
|
style_tree = Crass.parse_properties(style)
|
81
75
|
sanitized_tree = []
|
82
76
|
|
83
77
|
style_tree.each do |node|
|
84
78
|
next unless node[:node] == :property
|
85
79
|
next if node[:children].any? do |child|
|
86
|
-
|
80
|
+
url_flags.include?(child[:node])
|
87
81
|
end
|
88
82
|
|
89
83
|
name = node[:name].downcase
|
90
84
|
next unless SafeList::ALLOWED_CSS_PROPERTIES.include?(name) ||
|
91
|
-
|
92
|
-
|
85
|
+
SafeList::ALLOWED_SVG_PROPERTIES.include?(name) ||
|
86
|
+
SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first)
|
93
87
|
|
94
88
|
value = node[:children].map do |child|
|
95
89
|
case child[:node]
|
96
90
|
when :whitespace
|
97
|
-
|
91
|
+
CSS_WHITESPACE
|
98
92
|
when :string
|
99
|
-
if child[:raw]
|
93
|
+
if CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES.match?(child[:raw])
|
100
94
|
Crass::Parser.stringify(child)
|
101
|
-
else
|
102
|
-
nil
|
103
95
|
end
|
104
96
|
when :function
|
105
97
|
if SafeList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase)
|
@@ -108,18 +100,19 @@ module Loofah
|
|
108
100
|
when :ident
|
109
101
|
keyword = child[:value]
|
110
102
|
if !SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first) ||
|
111
|
-
|
112
|
-
|
103
|
+
SafeList::ALLOWED_CSS_KEYWORDS.include?(keyword) ||
|
104
|
+
(keyword =~ CSS_KEYWORDISH)
|
113
105
|
keyword
|
114
106
|
end
|
115
107
|
else
|
116
108
|
child[:raw]
|
117
109
|
end
|
118
|
-
end.compact
|
110
|
+
end.compact.join.strip
|
119
111
|
|
120
112
|
next if value.empty?
|
121
|
-
|
122
|
-
|
113
|
+
|
114
|
+
value << CSS_WHITESPACE << CSS_IMPORTANT if node[:important]
|
115
|
+
propstring = format("%s:%s", name, value)
|
123
116
|
sanitized_node = Crass.parse_properties(propstring).first
|
124
117
|
sanitized_tree << sanitized_node << CRASS_SEMICOLON
|
125
118
|
end
|
@@ -127,6 +120,44 @@ module Loofah
|
|
127
120
|
Crass::Parser.stringify(sanitized_tree)
|
128
121
|
end
|
129
122
|
|
123
|
+
def scrub_attribute_that_allows_local_ref(attr_node)
|
124
|
+
return unless attr_node.value
|
125
|
+
|
126
|
+
nodes = Crass::Parser.new(attr_node.value).parse_component_values
|
127
|
+
|
128
|
+
values = nodes.map do |node|
|
129
|
+
case node[:node]
|
130
|
+
when :url
|
131
|
+
if node[:value].start_with?("#")
|
132
|
+
node[:raw]
|
133
|
+
end
|
134
|
+
when :hash, :ident, :string
|
135
|
+
node[:raw]
|
136
|
+
end
|
137
|
+
end.compact
|
138
|
+
|
139
|
+
attr_node.value = values.join(" ")
|
140
|
+
end
|
141
|
+
|
142
|
+
def scrub_uri_attribute(attr_node)
|
143
|
+
# this block lifted nearly verbatim from HTML5 sanitization
|
144
|
+
val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS, "").downcase
|
145
|
+
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ &&
|
146
|
+
!SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
|
147
|
+
attr_node.remove
|
148
|
+
return true
|
149
|
+
elsif val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0] == "data"
|
150
|
+
# permit only allowed data mediatypes
|
151
|
+
mediatype = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[1]
|
152
|
+
mediatype, _ = mediatype.split(";")[0..1] if mediatype
|
153
|
+
if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
|
154
|
+
attr_node.remove
|
155
|
+
return true
|
156
|
+
end
|
157
|
+
end
|
158
|
+
false
|
159
|
+
end
|
160
|
+
|
130
161
|
#
|
131
162
|
# libxml2 >= 2.9.2 fails to escape comments within some attributes.
|
132
163
|
#
|
@@ -152,6 +183,46 @@ module Loofah
|
|
152
183
|
end.force_encoding(encoding)
|
153
184
|
end
|
154
185
|
end
|
186
|
+
|
187
|
+
def cdata_needs_escaping?(node)
|
188
|
+
# Nokogiri's HTML4 parser on JRuby doesn't flag the child of a `style` tag as cdata, but it acts that way
|
189
|
+
node.cdata? || (Nokogiri.jruby? && node.text? && node.parent.name == "style")
|
190
|
+
end
|
191
|
+
|
192
|
+
def cdata_escape(node)
|
193
|
+
escaped_text = escape_tags(node.text)
|
194
|
+
if Nokogiri.jruby?
|
195
|
+
node.document.create_text_node(escaped_text)
|
196
|
+
else
|
197
|
+
node.document.create_cdata(escaped_text)
|
198
|
+
end
|
199
|
+
end
|
200
|
+
|
201
|
+
TABLE_FOR_ESCAPE_HTML__ = {
|
202
|
+
"<" => "&lt;",
|
203
|
+
">" => "&gt;",
|
204
|
+
"&" => "&amp;",
|
205
|
+
}
|
206
|
+
|
207
|
+
def escape_tags(string)
|
208
|
+
# modified version of CGI.escapeHTML from ruby 3.1
|
209
|
+
enc = string.encoding
|
210
|
+
if enc.ascii_compatible?
|
211
|
+
string = string.b
|
212
|
+
string.gsub!(/[<>&]/, TABLE_FOR_ESCAPE_HTML__)
|
213
|
+
string.force_encoding(enc)
|
214
|
+
else
|
215
|
+
if enc.dummy?
|
216
|
+
origenc = enc
|
217
|
+
enc = Encoding::Converter.asciicompat_encoding(enc)
|
218
|
+
string = enc ? string.encode(enc) : string.b
|
219
|
+
end
|
220
|
+
table = Hash[TABLE_FOR_ESCAPE_HTML__.map { |pair| pair.map { |s| s.encode(enc) } }]
|
221
|
+
string = string.gsub(/#{"[<>&]".encode(enc)}/, table)
|
222
|
+
string.encode!(origenc) if origenc
|
223
|
+
string
|
224
|
+
end
|
225
|
+
end
|
155
226
|
end
|
156
227
|
end
|
157
228
|
end
|
data/lib/loofah/metahelpers.rb
CHANGED
@@ -1,12 +1,16 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
|
+
|
2
3
|
module Loofah
|
3
4
|
module MetaHelpers # :nodoc:
|
4
|
-
|
5
|
-
mojule
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
5
|
+
class << self
|
6
|
+
def add_downcased_set_members_to_all_set_constants(mojule)
|
7
|
+
mojule.constants.each do |constant_sym|
|
8
|
+
constant = mojule.const_get(constant_sym)
|
9
|
+
next unless Set === constant
|
10
|
+
|
11
|
+
constant.dup.each do |member|
|
12
|
+
constant.add(member.downcase)
|
13
|
+
end
|
10
14
|
end
|
11
15
|
end
|
12
16
|
end
|
data/lib/loofah/scrubber.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
|
+
|
2
3
|
module Loofah
|
3
4
|
#
|
4
5
|
# A RuntimeError raised when Loofah could not find an appropriate scrubber.
|
@@ -24,7 +25,7 @@ module Loofah
|
|
24
25
|
#
|
25
26
|
# This can then be run on a document:
|
26
27
|
#
|
27
|
-
# Loofah.
|
28
|
+
# Loofah.html5_fragment("<span>foo</span><p>bar</p>").scrub!(span2div).to_s
|
28
29
|
# # => "<div>foo</div><p>bar</p>"
|
29
30
|
#
|
30
31
|
# Scrubbers can be run on a document in either a top-down traversal (the
|
@@ -32,7 +33,6 @@ module Loofah
|
|
32
33
|
# Scrubber::STOP to terminate the traversal of a subtree.
|
33
34
|
#
|
34
35
|
class Scrubber
|
35
|
-
|
36
36
|
# Top-down Scrubbers may return CONTINUE to indicate that the subtree should be traversed.
|
37
37
|
CONTINUE = Object.new.freeze
|
38
38
|
|
@@ -67,7 +67,9 @@ module Loofah
|
|
67
67
|
unless [:top_down, :bottom_up].include?(direction)
|
68
68
|
raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
|
69
69
|
end
|
70
|
-
|
70
|
+
|
71
|
+
@direction = direction
|
72
|
+
@block = block
|
71
73
|
end
|
72
74
|
|
73
75
|
#
|
@@ -84,7 +86,7 @@ module Loofah
|
|
84
86
|
# +scrub+, which will be called for each document node.
|
85
87
|
#
|
86
88
|
def scrub(node)
|
87
|
-
raise ScrubberNotFound, "No scrub method has been defined on #{self.class
|
89
|
+
raise ScrubberNotFound, "No scrub method has been defined on #{self.class}"
|
88
90
|
end
|
89
91
|
|
90
92
|
#
|
@@ -103,11 +105,15 @@ module Loofah
|
|
103
105
|
def html5lib_sanitize(node)
|
104
106
|
case node.type
|
105
107
|
when Nokogiri::XML::Node::ELEMENT_NODE
|
106
|
-
if HTML5::Scrub.allowed_element?
|
107
|
-
HTML5::Scrub.scrub_attributes
|
108
|
+
if HTML5::Scrub.allowed_element?(node.name)
|
109
|
+
HTML5::Scrub.scrub_attributes(node)
|
108
110
|
return Scrubber::CONTINUE
|
109
111
|
end
|
110
112
|
when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE
|
113
|
+
if HTML5::Scrub.cdata_needs_escaping?(node)
|
114
|
+
node.before(HTML5::Scrub.cdata_escape(node))
|
115
|
+
return Scrubber::STOP
|
116
|
+
end
|
111
117
|
return Scrubber::CONTINUE
|
112
118
|
end
|
113
119
|
Scrubber::STOP
|
@@ -116,8 +122,8 @@ module Loofah
|
|
116
122
|
def traverse_conditionally_top_down(node)
|
117
123
|
if block
|
118
124
|
return if block.call(node) == STOP
|
119
|
-
|
120
|
-
return
|
125
|
+
elsif scrub(node) == STOP
|
126
|
+
return
|
121
127
|
end
|
122
128
|
node.children.each { |j| traverse_conditionally_top_down(j) }
|
123
129
|
end
|
data/lib/loofah/scrubbers.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
|
+
|
2
3
|
module Loofah
|
3
4
|
#
|
4
5
|
# Loofah provides some built-in scrubbers for sanitizing with
|
@@ -11,7 +12,7 @@ module Loofah
|
|
11
12
|
# +:strip+ removes unknown/unsafe tags, but leaves behind the pristine contents:
|
12
13
|
#
|
13
14
|
# unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
|
14
|
-
# Loofah.
|
15
|
+
# Loofah.html5_fragment(unsafe_html).scrub!(:strip)
|
15
16
|
# => "ohai! <div>div is safe</div> but foo is <b>not</b>"
|
16
17
|
#
|
17
18
|
#
|
@@ -20,7 +21,7 @@ module Loofah
|
|
20
21
|
# +:prune+ removes unknown/unsafe tags and their contents (including their subtrees):
|
21
22
|
#
|
22
23
|
# unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
|
23
|
-
# Loofah.
|
24
|
+
# Loofah.html5_fragment(unsafe_html).scrub!(:prune)
|
24
25
|
# => "ohai! <div>div is safe</div> "
|
25
26
|
#
|
26
27
|
#
|
@@ -29,7 +30,7 @@ module Loofah
|
|
29
30
|
# +:escape+ performs HTML entity escaping on the unknown/unsafe tags:
|
30
31
|
#
|
31
32
|
# unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
|
32
|
-
# Loofah.
|
33
|
+
# Loofah.html5_fragment(unsafe_html).scrub!(:escape)
|
33
34
|
# => "ohai! <div>div is safe</div> &lt;foo&gt;but foo is &lt;b&gt;not&lt;/b&gt;&lt;/foo&gt;"
|
34
35
|
#
|
35
36
|
#
|
@@ -41,7 +42,7 @@ module Loofah
|
|
41
42
|
# layer of paint on top of the HTML input to make it look nice.
|
42
43
|
#
|
43
44
|
# messy_markup = "ohai! <div id='foo' class='bar' style='margin: 10px'>div with attributes</div>"
|
44
|
-
# Loofah.
|
45
|
+
# Loofah.html5_fragment(messy_markup).scrub!(:whitewash)
|
45
46
|
# => "ohai! <div>div with attributes</div>"
|
46
47
|
#
|
47
48
|
# One use case for this scrubber is to clean up HTML that was
|
@@ -56,25 +57,42 @@ module Loofah
|
|
56
57
|
# +:nofollow+ adds a rel="nofollow" attribute to all links
|
57
58
|
#
|
58
59
|
# link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
|
59
|
-
# Loofah.
|
60
|
+
# Loofah.html5_fragment(link_farmers_markup).scrub!(:nofollow)
|
60
61
|
# => "ohai! <a href='http://www.myswarmysite.com/' rel="nofollow">I like your blog post</a>"
|
61
62
|
#
|
62
63
|
#
|
64
|
+
# === Loofah::Scrubbers::TargetBlank / scrub!(:targetblank)
|
65
|
+
#
|
66
|
+
# +:targetblank+ adds a target="_blank" attribute to all links
|
67
|
+
#
|
68
|
+
# link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
|
69
|
+
# Loofah.html5_fragment(link_farmers_markup).scrub!(:targetblank)
|
70
|
+
# => "ohai! <a href='http://www.myswarmysite.com/' target="_blank">I like your blog post</a>"
|
71
|
+
#
|
72
|
+
#
|
63
73
|
# === Loofah::Scrubbers::NoOpener / scrub!(:noopener)
|
64
74
|
#
|
65
75
|
# +:noopener+ adds a rel="noopener" attribute to all links
|
66
76
|
#
|
67
77
|
# link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
|
68
|
-
# Loofah.
|
78
|
+
# Loofah.html5_fragment(link_farmers_markup).scrub!(:noopener)
|
69
79
|
# => "ohai! <a href='http://www.myswarmysite.com/' rel="noopener">I like your blog post</a>"
|
70
80
|
#
|
81
|
+
# === Loofah::Scrubbers::NoReferrer / scrub!(:noreferrer)
|
82
|
+
#
|
83
|
+
# +:noreferrer+ adds a rel="noreferrer" attribute to all links
|
84
|
+
#
|
85
|
+
# link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
|
86
|
+
# Loofah.html5_fragment(link_farmers_markup).scrub!(:noreferrer)
|
87
|
+
# => "ohai! <a href='http://www.myswarmysite.com/' rel="noreferrer">I like your blog post</a>"
|
88
|
+
#
|
71
89
|
#
|
72
90
|
# === Loofah::Scrubbers::Unprintable / scrub!(:unprintable)
|
73
91
|
#
|
74
92
|
# +:unprintable+ removes unprintable Unicode characters.
|
75
93
|
#
|
76
94
|
# markup = "<p>Some text with an unprintable character at the end\u2028</p>"
|
77
|
-
# Loofah.
|
95
|
+
# Loofah.html5_fragment(markup).scrub!(:unprintable)
|
78
96
|
# => "<p>Some text with an unprintable character at the end</p>"
|
79
97
|
#
|
80
98
|
# You may not be able to see the unprintable character in the above example, but there is a
|
@@ -90,23 +108,20 @@ module Loofah
|
|
90
108
|
# +:strip+ removes unknown/unsafe tags, but leaves behind the pristine contents:
|
91
109
|
#
|
92
110
|
# unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
|
93
|
-
# Loofah.
|
111
|
+
# Loofah.html5_fragment(unsafe_html).scrub!(:strip)
|
94
112
|
# => "ohai! <div>div is safe</div> but foo is <b>not</b>"
|
95
113
|
#
|
96
114
|
class Strip < Scrubber
|
97
|
-
def initialize
|
115
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
98
116
|
@direction = :bottom_up
|
99
117
|
end
|
100
118
|
|
101
119
|
def scrub(node)
|
102
120
|
return CONTINUE if html5lib_sanitize(node) == CONTINUE
|
103
|
-
|
104
|
-
|
105
|
-
node.before Nokogiri::XML::Text.new(sanitized_text, node.document)
|
106
|
-
else
|
107
|
-
node.before node.children
|
108
|
-
end
|
121
|
+
|
122
|
+
node.before(node.children)
|
109
123
|
node.remove
|
124
|
+
STOP
|
110
125
|
end
|
111
126
|
end
|
112
127
|
|
@@ -116,18 +131,19 @@ module Loofah
|
|
116
131
|
# +:prune+ removes unknown/unsafe tags and their contents (including their subtrees):
|
117
132
|
#
|
118
133
|
# unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
|
119
|
-
# Loofah.
|
134
|
+
# Loofah.html5_fragment(unsafe_html).scrub!(:prune)
|
120
135
|
# => "ohai! <div>div is safe</div> "
|
121
136
|
#
|
122
137
|
class Prune < Scrubber
|
123
|
-
def initialize
|
138
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
124
139
|
@direction = :top_down
|
125
140
|
end
|
126
141
|
|
127
142
|
def scrub(node)
|
128
143
|
return CONTINUE if html5lib_sanitize(node) == CONTINUE
|
144
|
+
|
129
145
|
node.remove
|
130
|
-
|
146
|
+
STOP
|
131
147
|
end
|
132
148
|
end
|
133
149
|
|
@@ -137,19 +153,20 @@ module Loofah
|
|
137
153
|
# +:escape+ performs HTML entity escaping on the unknown/unsafe tags:
|
138
154
|
#
|
139
155
|
# unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
|
140
|
-
# Loofah.
|
156
|
+
# Loofah.html5_fragment(unsafe_html).scrub!(:escape)
|
141
157
|
# => "ohai! <div>div is safe</div> &lt;foo&gt;but foo is &lt;b&gt;not&lt;/b&gt;&lt;/foo&gt;"
|
142
158
|
#
|
143
159
|
class Escape < Scrubber
|
144
|
-
def initialize
|
160
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
145
161
|
@direction = :top_down
|
146
162
|
end
|
147
163
|
|
148
164
|
def scrub(node)
|
149
165
|
return CONTINUE if html5lib_sanitize(node) == CONTINUE
|
150
|
-
|
166
|
+
|
167
|
+
node.add_next_sibling(Nokogiri::XML::Text.new(node.to_s, node.document))
|
151
168
|
node.remove
|
152
|
-
|
169
|
+
STOP
|
153
170
|
end
|
154
171
|
end
|
155
172
|
|
@@ -162,7 +179,7 @@ module Loofah
|
|
162
179
|
# layer of paint on top of the HTML input to make it look nice.
|
163
180
|
#
|
164
181
|
# messy_markup = "ohai! <div id='foo' class='bar' style='margin: 10px'>div with attributes</div>"
|
165
|
-
# Loofah.
|
182
|
+
# Loofah.html5_fragment(messy_markup).scrub!(:whitewash)
|
166
183
|
# => "ohai! <div>div with attributes</div>"
|
167
184
|
#
|
168
185
|
# One use case for this scrubber is to clean up HTML that was
|
@@ -172,14 +189,14 @@ module Loofah
|
|
172
189
|
# Certainly not me.
|
173
190
|
#
|
174
191
|
class Whitewash < Scrubber
|
175
|
-
def initialize
|
192
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
176
193
|
@direction = :top_down
|
177
194
|
end
|
178
195
|
|
179
196
|
def scrub(node)
|
180
197
|
case node.type
|
181
198
|
when Nokogiri::XML::Node::ELEMENT_NODE
|
182
|
-
if HTML5::Scrub.allowed_element?
|
199
|
+
if HTML5::Scrub.allowed_element?(node.name)
|
183
200
|
node.attributes.each { |attr| node.remove_attribute(attr.first) }
|
184
201
|
return CONTINUE if node.namespaces.empty?
|
185
202
|
end
|
@@ -197,18 +214,46 @@ module Loofah
|
|
197
214
|
# +:nofollow+ adds a rel="nofollow" attribute to all links
|
198
215
|
#
|
199
216
|
# link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
|
200
|
-
# Loofah.
|
217
|
+
# Loofah.html5_fragment(link_farmers_markup).scrub!(:nofollow)
|
201
218
|
# => "ohai! <a href='http://www.myswarmysite.com/' rel="nofollow">I like your blog post</a>"
|
202
219
|
#
|
203
220
|
class NoFollow < Scrubber
|
204
|
-
def initialize
|
221
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
205
222
|
@direction = :top_down
|
206
223
|
end
|
207
224
|
|
208
225
|
def scrub(node)
|
209
226
|
return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
|
227
|
+
|
210
228
|
append_attribute(node, "rel", "nofollow")
|
211
|
-
|
229
|
+
STOP
|
230
|
+
end
|
231
|
+
end
|
232
|
+
|
233
|
+
#
|
234
|
+
# === scrub!(:targetblank)
|
235
|
+
#
|
236
|
+
# +:targetblank+ adds a target="_blank" attribute to all links.
|
237
|
+
# If there is a target already set, replaces it with target="_blank".
|
238
|
+
#
|
239
|
+
# link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
|
240
|
+
# Loofah.html5_fragment(link_farmers_markup).scrub!(:targetblank)
|
241
|
+
# => "ohai! <a href='http://www.myswarmysite.com/' target="_blank">I like your blog post</a>"
|
242
|
+
#
|
243
|
+
# On modern browsers, setting target="_blank" on anchor elements implicitly provides the same
|
244
|
+
# behavior as setting rel="noopener".
|
245
|
+
#
|
246
|
+
class TargetBlank < Scrubber
|
247
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
248
|
+
@direction = :top_down
|
249
|
+
end
|
250
|
+
|
251
|
+
def scrub(node)
|
252
|
+
return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
|
253
|
+
|
254
|
+
node.set_attribute("target", "_blank")
|
255
|
+
|
256
|
+
STOP
|
212
257
|
end
|
213
258
|
end
|
214
259
|
|
@@ -218,35 +263,59 @@ module Loofah
|
|
218
263
|
# +:noopener+ adds a rel="noopener" attribute to all links
|
219
264
|
#
|
220
265
|
# link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
|
221
|
-
# Loofah.
|
266
|
+
# Loofah.html5_fragment(link_farmers_markup).scrub!(:noopener)
|
222
267
|
# => "ohai! <a href='http://www.myswarmysite.com/' rel="noopener">I like your blog post</a>"
|
223
268
|
#
|
224
269
|
class NoOpener < Scrubber
|
225
|
-
def initialize
|
270
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
226
271
|
@direction = :top_down
|
227
272
|
end
|
228
273
|
|
229
274
|
def scrub(node)
|
230
275
|
return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
|
276
|
+
|
231
277
|
append_attribute(node, "rel", "noopener")
|
232
|
-
|
278
|
+
STOP
|
279
|
+
end
|
280
|
+
end
|
281
|
+
|
282
|
+
#
|
283
|
+
# === scrub!(:noreferrer)
|
284
|
+
#
|
285
|
+
# +:noreferrer+ adds a rel="noreferrer" attribute to all links
|
286
|
+
#
|
287
|
+
# link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
|
288
|
+
# Loofah.html5_fragment(link_farmers_markup).scrub!(:noreferrer)
|
289
|
+
# => "ohai! <a href='http://www.myswarmysite.com/' rel="noreferrer">I like your blog post</a>"
|
290
|
+
#
|
291
|
+
class NoReferrer < Scrubber
|
292
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
293
|
+
@direction = :top_down
|
294
|
+
end
|
295
|
+
|
296
|
+
def scrub(node)
|
297
|
+
return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
|
298
|
+
|
299
|
+
append_attribute(node, "rel", "noreferrer")
|
300
|
+
STOP
|
233
301
|
end
|
234
302
|
end
|
235
303
|
|
236
304
|
# This class probably isn't useful publicly, but is used for #to_text's current implementation
|
237
305
|
class NewlineBlockElements < Scrubber # :nodoc:
|
238
|
-
def initialize
|
306
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
239
307
|
@direction = :bottom_up
|
240
308
|
end
|
241
309
|
|
242
310
|
def scrub(node)
|
243
311
|
return CONTINUE unless Loofah::Elements::LINEBREAKERS.include?(node.name)
|
312
|
+
|
244
313
|
replacement = if Loofah::Elements::INLINE_LINE_BREAK.include?(node.name)
|
245
314
|
"\n"
|
246
315
|
else
|
247
316
|
"\n#{node.content}\n"
|
248
317
|
end
|
249
|
-
node.add_next_sibling
|
318
|
+
node.add_next_sibling(Nokogiri::XML::Text.new(replacement, node.document))
|
250
319
|
node.remove
|
251
320
|
end
|
252
321
|
end
|
@@ -257,7 +326,7 @@ module Loofah
|
|
257
326
|
# +:unprintable+ removes unprintable Unicode characters.
|
258
327
|
#
|
259
328
|
# markup = "<p>Some text with an unprintable character at the end\u2028</p>"
|
260
|
-
# Loofah.
|
329
|
+
# Loofah.html5_fragment(markup).scrub!(:unprintable)
|
261
330
|
# => "<p>Some text with an unprintable character at the end</p>"
|
262
331
|
#
|
263
332
|
# You may not be able to see the unprintable character in the above example, but there is a
|
@@ -267,7 +336,7 @@ module Loofah
|
|
267
336
|
# http://timelessrepo.com/json-isnt-a-javascript-subset
|
268
337
|
#
|
269
338
|
class Unprintable < Scrubber
|
270
|
-
def initialize
|
339
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
271
340
|
@direction = :top_down
|
272
341
|
end
|
273
342
|
|
@@ -283,21 +352,25 @@ module Loofah
|
|
283
352
|
# A hash that maps a symbol (like +:prune+) to the appropriate Scrubber (Loofah::Scrubbers::Prune).
|
284
353
|
#
|
285
354
|
MAP = {
|
286
|
-
:
|
287
|
-
:
|
288
|
-
:
|
289
|
-
:
|
290
|
-
:
|
291
|
-
:
|
292
|
-
:
|
293
|
-
:
|
355
|
+
escape: Escape,
|
356
|
+
prune: Prune,
|
357
|
+
whitewash: Whitewash,
|
358
|
+
strip: Strip,
|
359
|
+
nofollow: NoFollow,
|
360
|
+
noopener: NoOpener,
|
361
|
+
noreferrer: NoReferrer,
|
362
|
+
targetblank: TargetBlank,
|
363
|
+
newline_block_elements: NewlineBlockElements,
|
364
|
+
unprintable: Unprintable,
|
294
365
|
}
|
295
366
|
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
367
|
+
class << self
|
368
|
+
#
|
369
|
+
# Returns an array of symbols representing the built-in scrubbers
|
370
|
+
#
|
371
|
+
def scrubber_symbols
|
372
|
+
MAP.keys
|
373
|
+
end
|
301
374
|
end
|
302
375
|
end
|
303
376
|
end
|