loofah 2.19.0 → 2.23.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +100 -0
- data/README.md +157 -114
- data/lib/loofah/concerns.rb +207 -0
- data/lib/loofah/elements.rb +78 -76
- data/lib/loofah/helpers.rb +21 -15
- data/lib/loofah/{html → html4}/document.rb +5 -7
- data/lib/loofah/html4/document_fragment.rb +15 -0
- data/lib/loofah/html5/document.rb +17 -0
- data/lib/loofah/html5/document_fragment.rb +15 -0
- data/lib/loofah/html5/libxml2_workarounds.rb +7 -6
- data/lib/loofah/html5/safelist.rb +940 -925
- data/lib/loofah/html5/scrub.rb +105 -34
- data/lib/loofah/metahelpers.rb +10 -6
- data/lib/loofah/scrubber.rb +14 -8
- data/lib/loofah/scrubbers.rb +121 -48
- data/lib/loofah/version.rb +2 -1
- data/lib/loofah/xml/document.rb +1 -0
- data/lib/loofah/xml/document_fragment.rb +2 -6
- data/lib/loofah.rb +116 -43
- metadata +20 -122
- data/lib/loofah/html/document_fragment.rb +0 -42
- data/lib/loofah/instance_methods.rb +0 -133
data/lib/loofah/html5/scrub.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
|
+
|
2
3
|
require "cgi"
|
3
4
|
require "crass"
|
4
5
|
|
@@ -6,9 +7,10 @@ module Loofah
|
|
6
7
|
module HTML5 # :nodoc:
|
7
8
|
module Scrub
|
8
9
|
CONTROL_CHARACTERS = /[`\u0000-\u0020\u007f\u0080-\u0101]/
|
9
|
-
CSS_KEYWORDISH = /\A(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,3}\.?\d{0,10}(ch|cm|r?em|ex|in|lh|mm|pc|pt|px|Q|vmax|vmin|vw|vh|%|,|\))?)\z/
|
10
|
+
CSS_KEYWORDISH = /\A(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,3}\.?\d{0,10}(ch|cm|r?em|ex|in|lh|mm|pc|pt|px|Q|vmax|vmin|vw|vh|%|,|\))?)\z/ # rubocop:disable Layout/LineLength
|
10
11
|
CRASS_SEMICOLON = { node: :semicolon, raw: ";" }
|
11
|
-
CSS_IMPORTANT =
|
12
|
+
CSS_IMPORTANT = "!important"
|
13
|
+
CSS_WHITESPACE = " "
|
12
14
|
CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES = /\A(["'])?[^"']+\1\z/
|
13
15
|
DATA_ATTRIBUTE_NAME = /\Adata-[\w-]+\z/
|
14
16
|
|
@@ -26,7 +28,7 @@ module Loofah
|
|
26
28
|
attr_node.node_name
|
27
29
|
end
|
28
30
|
|
29
|
-
if attr_name
|
31
|
+
if DATA_ATTRIBUTE_NAME.match?(attr_name)
|
30
32
|
next
|
31
33
|
end
|
32
34
|
|
@@ -36,28 +38,19 @@ module Loofah
|
|
36
38
|
end
|
37
39
|
|
38
40
|
if SafeList::ATTR_VAL_IS_URI.include?(attr_name)
|
39
|
-
|
40
|
-
val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS, "").downcase
|
41
|
-
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && !SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
|
42
|
-
attr_node.remove
|
43
|
-
next
|
44
|
-
elsif val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0] == "data"
|
45
|
-
# permit only allowed data mediatypes
|
46
|
-
mediatype = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[1]
|
47
|
-
mediatype, _ = mediatype.split(";")[0..1] if mediatype
|
48
|
-
if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
|
49
|
-
attr_node.remove
|
50
|
-
next
|
51
|
-
end
|
52
|
-
end
|
41
|
+
next if scrub_uri_attribute(attr_node)
|
53
42
|
end
|
43
|
+
|
54
44
|
if SafeList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
|
55
|
-
attr_node
|
56
|
-
end
|
57
|
-
if SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) && attr_name == "xlink:href" && attr_node.value =~ /^\s*[^#\s].*/m
|
58
|
-
attr_node.remove
|
59
|
-
next
|
45
|
+
scrub_attribute_that_allows_local_ref(attr_node)
|
60
46
|
end
|
47
|
+
|
48
|
+
next unless SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) &&
|
49
|
+
attr_name == "xlink:href" &&
|
50
|
+
attr_node.value =~ /^\s*[^#\s].*/m
|
51
|
+
|
52
|
+
attr_node.remove
|
53
|
+
next
|
61
54
|
end
|
62
55
|
|
63
56
|
scrub_css_attribute(node)
|
@@ -77,29 +70,28 @@ module Loofah
|
|
77
70
|
end
|
78
71
|
|
79
72
|
def scrub_css(style)
|
73
|
+
url_flags = [:url, :bad_url]
|
80
74
|
style_tree = Crass.parse_properties(style)
|
81
75
|
sanitized_tree = []
|
82
76
|
|
83
77
|
style_tree.each do |node|
|
84
78
|
next unless node[:node] == :property
|
85
79
|
next if node[:children].any? do |child|
|
86
|
-
|
80
|
+
url_flags.include?(child[:node])
|
87
81
|
end
|
88
82
|
|
89
83
|
name = node[:name].downcase
|
90
84
|
next unless SafeList::ALLOWED_CSS_PROPERTIES.include?(name) ||
|
91
|
-
|
92
|
-
|
85
|
+
SafeList::ALLOWED_SVG_PROPERTIES.include?(name) ||
|
86
|
+
SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first)
|
93
87
|
|
94
88
|
value = node[:children].map do |child|
|
95
89
|
case child[:node]
|
96
90
|
when :whitespace
|
97
|
-
|
91
|
+
CSS_WHITESPACE
|
98
92
|
when :string
|
99
|
-
if child[:raw]
|
93
|
+
if CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES.match?(child[:raw])
|
100
94
|
Crass::Parser.stringify(child)
|
101
|
-
else
|
102
|
-
nil
|
103
95
|
end
|
104
96
|
when :function
|
105
97
|
if SafeList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase)
|
@@ -108,18 +100,19 @@ module Loofah
|
|
108
100
|
when :ident
|
109
101
|
keyword = child[:value]
|
110
102
|
if !SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first) ||
|
111
|
-
|
112
|
-
|
103
|
+
SafeList::ALLOWED_CSS_KEYWORDS.include?(keyword) ||
|
104
|
+
(keyword =~ CSS_KEYWORDISH)
|
113
105
|
keyword
|
114
106
|
end
|
115
107
|
else
|
116
108
|
child[:raw]
|
117
109
|
end
|
118
|
-
end.compact
|
110
|
+
end.compact.join.strip
|
119
111
|
|
120
112
|
next if value.empty?
|
121
|
-
|
122
|
-
|
113
|
+
|
114
|
+
value << CSS_WHITESPACE << CSS_IMPORTANT if node[:important]
|
115
|
+
propstring = format("%s:%s", name, value)
|
123
116
|
sanitized_node = Crass.parse_properties(propstring).first
|
124
117
|
sanitized_tree << sanitized_node << CRASS_SEMICOLON
|
125
118
|
end
|
@@ -127,6 +120,44 @@ module Loofah
|
|
127
120
|
Crass::Parser.stringify(sanitized_tree)
|
128
121
|
end
|
129
122
|
|
123
|
+
def scrub_attribute_that_allows_local_ref(attr_node)
|
124
|
+
return unless attr_node.value
|
125
|
+
|
126
|
+
nodes = Crass::Parser.new(attr_node.value).parse_component_values
|
127
|
+
|
128
|
+
values = nodes.map do |node|
|
129
|
+
case node[:node]
|
130
|
+
when :url
|
131
|
+
if node[:value].start_with?("#")
|
132
|
+
node[:raw]
|
133
|
+
end
|
134
|
+
when :hash, :ident, :string
|
135
|
+
node[:raw]
|
136
|
+
end
|
137
|
+
end.compact
|
138
|
+
|
139
|
+
attr_node.value = values.join(" ")
|
140
|
+
end
|
141
|
+
|
142
|
+
def scrub_uri_attribute(attr_node)
|
143
|
+
# this block lifted nearly verbatim from HTML5 sanitization
|
144
|
+
val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS, "").downcase
|
145
|
+
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ &&
|
146
|
+
!SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
|
147
|
+
attr_node.remove
|
148
|
+
return true
|
149
|
+
elsif val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0] == "data"
|
150
|
+
# permit only allowed data mediatypes
|
151
|
+
mediatype = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[1]
|
152
|
+
mediatype, _ = mediatype.split(";")[0..1] if mediatype
|
153
|
+
if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
|
154
|
+
attr_node.remove
|
155
|
+
return true
|
156
|
+
end
|
157
|
+
end
|
158
|
+
false
|
159
|
+
end
|
160
|
+
|
130
161
|
#
|
131
162
|
# libxml2 >= 2.9.2 fails to escape comments within some attributes.
|
132
163
|
#
|
@@ -152,6 +183,46 @@ module Loofah
|
|
152
183
|
end.force_encoding(encoding)
|
153
184
|
end
|
154
185
|
end
|
186
|
+
|
187
|
+
def cdata_needs_escaping?(node)
|
188
|
+
# Nokogiri's HTML4 parser on JRuby doesn't flag the child of a `style` tag as cdata, but it acts that way
|
189
|
+
node.cdata? || (Nokogiri.jruby? && node.text? && node.parent.name == "style")
|
190
|
+
end
|
191
|
+
|
192
|
+
def cdata_escape(node)
|
193
|
+
escaped_text = escape_tags(node.text)
|
194
|
+
if Nokogiri.jruby?
|
195
|
+
node.document.create_text_node(escaped_text)
|
196
|
+
else
|
197
|
+
node.document.create_cdata(escaped_text)
|
198
|
+
end
|
199
|
+
end
|
200
|
+
|
201
|
+
TABLE_FOR_ESCAPE_HTML__ = {
|
202
|
+
"<" => "&lt;",
|
203
|
+
">" => "&gt;",
|
204
|
+
"&" => "&amp;",
|
205
|
+
}
|
206
|
+
|
207
|
+
def escape_tags(string)
|
208
|
+
# modified version of CGI.escapeHTML from ruby 3.1
|
209
|
+
enc = string.encoding
|
210
|
+
if enc.ascii_compatible?
|
211
|
+
string = string.b
|
212
|
+
string.gsub!(/[<>&]/, TABLE_FOR_ESCAPE_HTML__)
|
213
|
+
string.force_encoding(enc)
|
214
|
+
else
|
215
|
+
if enc.dummy?
|
216
|
+
origenc = enc
|
217
|
+
enc = Encoding::Converter.asciicompat_encoding(enc)
|
218
|
+
string = enc ? string.encode(enc) : string.b
|
219
|
+
end
|
220
|
+
table = Hash[TABLE_FOR_ESCAPE_HTML__.map { |pair| pair.map { |s| s.encode(enc) } }]
|
221
|
+
string = string.gsub(/#{"[<>&]".encode(enc)}/, table)
|
222
|
+
string.encode!(origenc) if origenc
|
223
|
+
string
|
224
|
+
end
|
225
|
+
end
|
155
226
|
end
|
156
227
|
end
|
157
228
|
end
|
data/lib/loofah/metahelpers.rb
CHANGED
@@ -1,12 +1,16 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
|
+
|
2
3
|
module Loofah
|
3
4
|
module MetaHelpers # :nodoc:
|
4
|
-
|
5
|
-
mojule
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
5
|
+
class << self
|
6
|
+
def add_downcased_set_members_to_all_set_constants(mojule)
|
7
|
+
mojule.constants.each do |constant_sym|
|
8
|
+
constant = mojule.const_get(constant_sym)
|
9
|
+
next unless Set === constant
|
10
|
+
|
11
|
+
constant.dup.each do |member|
|
12
|
+
constant.add(member.downcase)
|
13
|
+
end
|
10
14
|
end
|
11
15
|
end
|
12
16
|
end
|
data/lib/loofah/scrubber.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
|
+
|
2
3
|
module Loofah
|
3
4
|
#
|
4
5
|
# A RuntimeError raised when Loofah could not find an appropriate scrubber.
|
@@ -24,7 +25,7 @@ module Loofah
|
|
24
25
|
#
|
25
26
|
# This can then be run on a document:
|
26
27
|
#
|
27
|
-
# Loofah.
|
28
|
+
# Loofah.html5_fragment("<span>foo</span><p>bar</p>").scrub!(span2div).to_s
|
28
29
|
# # => "<div>foo</div><p>bar</p>"
|
29
30
|
#
|
30
31
|
# Scrubbers can be run on a document in either a top-down traversal (the
|
@@ -32,7 +33,6 @@ module Loofah
|
|
32
33
|
# Scrubber::STOP to terminate the traversal of a subtree.
|
33
34
|
#
|
34
35
|
class Scrubber
|
35
|
-
|
36
36
|
# Top-down Scrubbers may return CONTINUE to indicate that the subtree should be traversed.
|
37
37
|
CONTINUE = Object.new.freeze
|
38
38
|
|
@@ -67,7 +67,9 @@ module Loofah
|
|
67
67
|
unless [:top_down, :bottom_up].include?(direction)
|
68
68
|
raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
|
69
69
|
end
|
70
|
-
|
70
|
+
|
71
|
+
@direction = direction
|
72
|
+
@block = block
|
71
73
|
end
|
72
74
|
|
73
75
|
#
|
@@ -84,7 +86,7 @@ module Loofah
|
|
84
86
|
# +scrub+, which will be called for each document node.
|
85
87
|
#
|
86
88
|
def scrub(node)
|
87
|
-
raise ScrubberNotFound, "No scrub method has been defined on #{self.class
|
89
|
+
raise ScrubberNotFound, "No scrub method has been defined on #{self.class}"
|
88
90
|
end
|
89
91
|
|
90
92
|
#
|
@@ -103,11 +105,15 @@ module Loofah
|
|
103
105
|
def html5lib_sanitize(node)
|
104
106
|
case node.type
|
105
107
|
when Nokogiri::XML::Node::ELEMENT_NODE
|
106
|
-
if HTML5::Scrub.allowed_element?
|
107
|
-
HTML5::Scrub.scrub_attributes
|
108
|
+
if HTML5::Scrub.allowed_element?(node.name)
|
109
|
+
HTML5::Scrub.scrub_attributes(node)
|
108
110
|
return Scrubber::CONTINUE
|
109
111
|
end
|
110
112
|
when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE
|
113
|
+
if HTML5::Scrub.cdata_needs_escaping?(node)
|
114
|
+
node.before(HTML5::Scrub.cdata_escape(node))
|
115
|
+
return Scrubber::STOP
|
116
|
+
end
|
111
117
|
return Scrubber::CONTINUE
|
112
118
|
end
|
113
119
|
Scrubber::STOP
|
@@ -116,8 +122,8 @@ module Loofah
|
|
116
122
|
def traverse_conditionally_top_down(node)
|
117
123
|
if block
|
118
124
|
return if block.call(node) == STOP
|
119
|
-
|
120
|
-
return
|
125
|
+
elsif scrub(node) == STOP
|
126
|
+
return
|
121
127
|
end
|
122
128
|
node.children.each { |j| traverse_conditionally_top_down(j) }
|
123
129
|
end
|
data/lib/loofah/scrubbers.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
|
+
|
2
3
|
module Loofah
|
3
4
|
#
|
4
5
|
# Loofah provides some built-in scrubbers for sanitizing with
|
@@ -11,7 +12,7 @@ module Loofah
|
|
11
12
|
# +:strip+ removes unknown/unsafe tags, but leaves behind the pristine contents:
|
12
13
|
#
|
13
14
|
# unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
|
14
|
-
# Loofah.
|
15
|
+
# Loofah.html5_fragment(unsafe_html).scrub!(:strip)
|
15
16
|
# => "ohai! <div>div is safe</div> but foo is <b>not</b>"
|
16
17
|
#
|
17
18
|
#
|
@@ -20,7 +21,7 @@ module Loofah
|
|
20
21
|
# +:prune+ removes unknown/unsafe tags and their contents (including their subtrees):
|
21
22
|
#
|
22
23
|
# unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
|
23
|
-
# Loofah.
|
24
|
+
# Loofah.html5_fragment(unsafe_html).scrub!(:prune)
|
24
25
|
# => "ohai! <div>div is safe</div> "
|
25
26
|
#
|
26
27
|
#
|
@@ -29,7 +30,7 @@ module Loofah
|
|
29
30
|
# +:escape+ performs HTML entity escaping on the unknown/unsafe tags:
|
30
31
|
#
|
31
32
|
# unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
|
32
|
-
# Loofah.
|
33
|
+
# Loofah.html5_fragment(unsafe_html).scrub!(:escape)
|
33
34
|
# => "ohai! <div>div is safe</div> &lt;foo&gt;but foo is &lt;b&gt;not&lt;/b&gt;&lt;/foo&gt;"
|
34
35
|
#
|
35
36
|
#
|
@@ -41,7 +42,7 @@ module Loofah
|
|
41
42
|
# layer of paint on top of the HTML input to make it look nice.
|
42
43
|
#
|
43
44
|
# messy_markup = "ohai! <div id='foo' class='bar' style='margin: 10px'>div with attributes</div>"
|
44
|
-
# Loofah.
|
45
|
+
# Loofah.html5_fragment(messy_markup).scrub!(:whitewash)
|
45
46
|
# => "ohai! <div>div with attributes</div>"
|
46
47
|
#
|
47
48
|
# One use case for this scrubber is to clean up HTML that was
|
@@ -56,25 +57,42 @@ module Loofah
|
|
56
57
|
# +:nofollow+ adds a rel="nofollow" attribute to all links
|
57
58
|
#
|
58
59
|
# link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
|
59
|
-
# Loofah.
|
60
|
+
# Loofah.html5_fragment(link_farmers_markup).scrub!(:nofollow)
|
60
61
|
# => "ohai! <a href='http://www.myswarmysite.com/' rel="nofollow">I like your blog post</a>"
|
61
62
|
#
|
62
63
|
#
|
64
|
+
# === Loofah::Scrubbers::TargetBlank / scrub!(:targetblank)
|
65
|
+
#
|
66
|
+
# +:targetblank+ adds a target="_blank" attribute to all links
|
67
|
+
#
|
68
|
+
# link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
|
69
|
+
# Loofah.html5_fragment(link_farmers_markup).scrub!(:targetblank)
|
70
|
+
# => "ohai! <a href='http://www.myswarmysite.com/' target="_blank">I like your blog post</a>"
|
71
|
+
#
|
72
|
+
#
|
63
73
|
# === Loofah::Scrubbers::NoOpener / scrub!(:noopener)
|
64
74
|
#
|
65
75
|
# +:noopener+ adds a rel="noopener" attribute to all links
|
66
76
|
#
|
67
77
|
# link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
|
68
|
-
# Loofah.
|
78
|
+
# Loofah.html5_fragment(link_farmers_markup).scrub!(:noopener)
|
69
79
|
# => "ohai! <a href='http://www.myswarmysite.com/' rel="noopener">I like your blog post</a>"
|
70
80
|
#
|
81
|
+
# === Loofah::Scrubbers::NoReferrer / scrub!(:noreferrer)
|
82
|
+
#
|
83
|
+
# +:noreferrer+ adds a rel="noreferrer" attribute to all links
|
84
|
+
#
|
85
|
+
# link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
|
86
|
+
# Loofah.html5_fragment(link_farmers_markup).scrub!(:noreferrer)
|
87
|
+
# => "ohai! <a href='http://www.myswarmysite.com/' rel="noreferrer">I like your blog post</a>"
|
88
|
+
#
|
71
89
|
#
|
72
90
|
# === Loofah::Scrubbers::Unprintable / scrub!(:unprintable)
|
73
91
|
#
|
74
92
|
# +:unprintable+ removes unprintable Unicode characters.
|
75
93
|
#
|
76
94
|
# markup = "<p>Some text with an unprintable character at the end\u2028</p>"
|
77
|
-
# Loofah.
|
95
|
+
# Loofah.html5_fragment(markup).scrub!(:unprintable)
|
78
96
|
# => "<p>Some text with an unprintable character at the end</p>"
|
79
97
|
#
|
80
98
|
# You may not be able to see the unprintable character in the above example, but there is a
|
@@ -90,23 +108,20 @@ module Loofah
|
|
90
108
|
# +:strip+ removes unknown/unsafe tags, but leaves behind the pristine contents:
|
91
109
|
#
|
92
110
|
# unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
|
93
|
-
# Loofah.
|
111
|
+
# Loofah.html5_fragment(unsafe_html).scrub!(:strip)
|
94
112
|
# => "ohai! <div>div is safe</div> but foo is <b>not</b>"
|
95
113
|
#
|
96
114
|
class Strip < Scrubber
|
97
|
-
def initialize
|
115
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
98
116
|
@direction = :bottom_up
|
99
117
|
end
|
100
118
|
|
101
119
|
def scrub(node)
|
102
120
|
return CONTINUE if html5lib_sanitize(node) == CONTINUE
|
103
|
-
|
104
|
-
|
105
|
-
node.before Nokogiri::XML::Text.new(sanitized_text, node.document)
|
106
|
-
else
|
107
|
-
node.before node.children
|
108
|
-
end
|
121
|
+
|
122
|
+
node.before(node.children)
|
109
123
|
node.remove
|
124
|
+
STOP
|
110
125
|
end
|
111
126
|
end
|
112
127
|
|
@@ -116,18 +131,19 @@ module Loofah
|
|
116
131
|
# +:prune+ removes unknown/unsafe tags and their contents (including their subtrees):
|
117
132
|
#
|
118
133
|
# unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
|
119
|
-
# Loofah.
|
134
|
+
# Loofah.html5_fragment(unsafe_html).scrub!(:prune)
|
120
135
|
# => "ohai! <div>div is safe</div> "
|
121
136
|
#
|
122
137
|
class Prune < Scrubber
|
123
|
-
def initialize
|
138
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
124
139
|
@direction = :top_down
|
125
140
|
end
|
126
141
|
|
127
142
|
def scrub(node)
|
128
143
|
return CONTINUE if html5lib_sanitize(node) == CONTINUE
|
144
|
+
|
129
145
|
node.remove
|
130
|
-
|
146
|
+
STOP
|
131
147
|
end
|
132
148
|
end
|
133
149
|
|
@@ -137,19 +153,20 @@ module Loofah
|
|
137
153
|
# +:escape+ performs HTML entity escaping on the unknown/unsafe tags:
|
138
154
|
#
|
139
155
|
# unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
|
140
|
-
# Loofah.
|
156
|
+
# Loofah.html5_fragment(unsafe_html).scrub!(:escape)
|
141
157
|
# => "ohai! <div>div is safe</div> &lt;foo&gt;but foo is &lt;b&gt;not&lt;/b&gt;&lt;/foo&gt;"
|
142
158
|
#
|
143
159
|
class Escape < Scrubber
|
144
|
-
def initialize
|
160
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
145
161
|
@direction = :top_down
|
146
162
|
end
|
147
163
|
|
148
164
|
def scrub(node)
|
149
165
|
return CONTINUE if html5lib_sanitize(node) == CONTINUE
|
150
|
-
|
166
|
+
|
167
|
+
node.add_next_sibling(Nokogiri::XML::Text.new(node.to_s, node.document))
|
151
168
|
node.remove
|
152
|
-
|
169
|
+
STOP
|
153
170
|
end
|
154
171
|
end
|
155
172
|
|
@@ -162,7 +179,7 @@ module Loofah
|
|
162
179
|
# layer of paint on top of the HTML input to make it look nice.
|
163
180
|
#
|
164
181
|
# messy_markup = "ohai! <div id='foo' class='bar' style='margin: 10px'>div with attributes</div>"
|
165
|
-
# Loofah.
|
182
|
+
# Loofah.html5_fragment(messy_markup).scrub!(:whitewash)
|
166
183
|
# => "ohai! <div>div with attributes</div>"
|
167
184
|
#
|
168
185
|
# One use case for this scrubber is to clean up HTML that was
|
@@ -172,14 +189,14 @@ module Loofah
|
|
172
189
|
# Certainly not me.
|
173
190
|
#
|
174
191
|
class Whitewash < Scrubber
|
175
|
-
def initialize
|
192
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
176
193
|
@direction = :top_down
|
177
194
|
end
|
178
195
|
|
179
196
|
def scrub(node)
|
180
197
|
case node.type
|
181
198
|
when Nokogiri::XML::Node::ELEMENT_NODE
|
182
|
-
if HTML5::Scrub.allowed_element?
|
199
|
+
if HTML5::Scrub.allowed_element?(node.name)
|
183
200
|
node.attributes.each { |attr| node.remove_attribute(attr.first) }
|
184
201
|
return CONTINUE if node.namespaces.empty?
|
185
202
|
end
|
@@ -197,18 +214,46 @@ module Loofah
|
|
197
214
|
# +:nofollow+ adds a rel="nofollow" attribute to all links
|
198
215
|
#
|
199
216
|
# link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
|
200
|
-
# Loofah.
|
217
|
+
# Loofah.html5_fragment(link_farmers_markup).scrub!(:nofollow)
|
201
218
|
# => "ohai! <a href='http://www.myswarmysite.com/' rel="nofollow">I like your blog post</a>"
|
202
219
|
#
|
203
220
|
class NoFollow < Scrubber
|
204
|
-
def initialize
|
221
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
205
222
|
@direction = :top_down
|
206
223
|
end
|
207
224
|
|
208
225
|
def scrub(node)
|
209
226
|
return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
|
227
|
+
|
210
228
|
append_attribute(node, "rel", "nofollow")
|
211
|
-
|
229
|
+
STOP
|
230
|
+
end
|
231
|
+
end
|
232
|
+
|
233
|
+
#
|
234
|
+
# === scrub!(:targetblank)
|
235
|
+
#
|
236
|
+
# +:targetblank+ adds a target="_blank" attribute to all links.
|
237
|
+
# If there is a target already set, replaces it with target="_blank".
|
238
|
+
#
|
239
|
+
# link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
|
240
|
+
# Loofah.html5_fragment(link_farmers_markup).scrub!(:targetblank)
|
241
|
+
# => "ohai! <a href='http://www.myswarmysite.com/' target="_blank">I like your blog post</a>"
|
242
|
+
#
|
243
|
+
# On modern browsers, setting target="_blank" on anchor elements implicitly provides the same
|
244
|
+
# behavior as setting rel="noopener".
|
245
|
+
#
|
246
|
+
class TargetBlank < Scrubber
|
247
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
248
|
+
@direction = :top_down
|
249
|
+
end
|
250
|
+
|
251
|
+
def scrub(node)
|
252
|
+
return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
|
253
|
+
|
254
|
+
node.set_attribute("target", "_blank")
|
255
|
+
|
256
|
+
STOP
|
212
257
|
end
|
213
258
|
end
|
214
259
|
|
@@ -218,35 +263,59 @@ module Loofah
|
|
218
263
|
# +:noopener+ adds a rel="noopener" attribute to all links
|
219
264
|
#
|
220
265
|
# link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
|
221
|
-
# Loofah.
|
266
|
+
# Loofah.html5_fragment(link_farmers_markup).scrub!(:noopener)
|
222
267
|
# => "ohai! <a href='http://www.myswarmysite.com/' rel="noopener">I like your blog post</a>"
|
223
268
|
#
|
224
269
|
class NoOpener < Scrubber
|
225
|
-
def initialize
|
270
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
226
271
|
@direction = :top_down
|
227
272
|
end
|
228
273
|
|
229
274
|
def scrub(node)
|
230
275
|
return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
|
276
|
+
|
231
277
|
append_attribute(node, "rel", "noopener")
|
232
|
-
|
278
|
+
STOP
|
279
|
+
end
|
280
|
+
end
|
281
|
+
|
282
|
+
#
|
283
|
+
# === scrub!(:noreferrer)
|
284
|
+
#
|
285
|
+
# +:noreferrer+ adds a rel="noreferrer" attribute to all links
|
286
|
+
#
|
287
|
+
# link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
|
288
|
+
# Loofah.html5_fragment(link_farmers_markup).scrub!(:noreferrer)
|
289
|
+
# => "ohai! <a href='http://www.myswarmysite.com/' rel="noreferrer">I like your blog post</a>"
|
290
|
+
#
|
291
|
+
class NoReferrer < Scrubber
|
292
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
293
|
+
@direction = :top_down
|
294
|
+
end
|
295
|
+
|
296
|
+
def scrub(node)
|
297
|
+
return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
|
298
|
+
|
299
|
+
append_attribute(node, "rel", "noreferrer")
|
300
|
+
STOP
|
233
301
|
end
|
234
302
|
end
|
235
303
|
|
236
304
|
# This class probably isn't useful publicly, but is used for #to_text's current implementation
|
237
305
|
class NewlineBlockElements < Scrubber # :nodoc:
|
238
|
-
def initialize
|
306
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
239
307
|
@direction = :bottom_up
|
240
308
|
end
|
241
309
|
|
242
310
|
def scrub(node)
|
243
311
|
return CONTINUE unless Loofah::Elements::LINEBREAKERS.include?(node.name)
|
312
|
+
|
244
313
|
replacement = if Loofah::Elements::INLINE_LINE_BREAK.include?(node.name)
|
245
314
|
"\n"
|
246
315
|
else
|
247
316
|
"\n#{node.content}\n"
|
248
317
|
end
|
249
|
-
node.add_next_sibling
|
318
|
+
node.add_next_sibling(Nokogiri::XML::Text.new(replacement, node.document))
|
250
319
|
node.remove
|
251
320
|
end
|
252
321
|
end
|
@@ -257,7 +326,7 @@ module Loofah
|
|
257
326
|
# +:unprintable+ removes unprintable Unicode characters.
|
258
327
|
#
|
259
328
|
# markup = "<p>Some text with an unprintable character at the end\u2028</p>"
|
260
|
-
# Loofah.
|
329
|
+
# Loofah.html5_fragment(markup).scrub!(:unprintable)
|
261
330
|
# => "<p>Some text with an unprintable character at the end</p>"
|
262
331
|
#
|
263
332
|
# You may not be able to see the unprintable character in the above example, but there is a
|
@@ -267,7 +336,7 @@ module Loofah
|
|
267
336
|
# http://timelessrepo.com/json-isnt-a-javascript-subset
|
268
337
|
#
|
269
338
|
class Unprintable < Scrubber
|
270
|
-
def initialize
|
339
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
271
340
|
@direction = :top_down
|
272
341
|
end
|
273
342
|
|
@@ -283,21 +352,25 @@ module Loofah
|
|
283
352
|
# A hash that maps a symbol (like +:prune+) to the appropriate Scrubber (Loofah::Scrubbers::Prune).
|
284
353
|
#
|
285
354
|
MAP = {
|
286
|
-
:
|
287
|
-
:
|
288
|
-
:
|
289
|
-
:
|
290
|
-
:
|
291
|
-
:
|
292
|
-
:
|
293
|
-
:
|
355
|
+
escape: Escape,
|
356
|
+
prune: Prune,
|
357
|
+
whitewash: Whitewash,
|
358
|
+
strip: Strip,
|
359
|
+
nofollow: NoFollow,
|
360
|
+
noopener: NoOpener,
|
361
|
+
noreferrer: NoReferrer,
|
362
|
+
targetblank: TargetBlank,
|
363
|
+
newline_block_elements: NewlineBlockElements,
|
364
|
+
unprintable: Unprintable,
|
294
365
|
}
|
295
366
|
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
367
|
+
class << self
|
368
|
+
#
|
369
|
+
# Returns an array of symbols representing the built-in scrubbers
|
370
|
+
#
|
371
|
+
def scrubber_symbols
|
372
|
+
MAP.keys
|
373
|
+
end
|
301
374
|
end
|
302
375
|
end
|
303
376
|
end
|