loofah 2.19.1 → 2.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +102 -0
- data/README.md +161 -115
- data/lib/loofah/concerns.rb +207 -0
- data/lib/loofah/elements.rb +78 -76
- data/lib/loofah/helpers.rb +21 -15
- data/lib/loofah/{html → html4}/document.rb +5 -7
- data/lib/loofah/html4/document_fragment.rb +15 -0
- data/lib/loofah/html5/document.rb +17 -0
- data/lib/loofah/html5/document_fragment.rb +15 -0
- data/lib/loofah/html5/libxml2_workarounds.rb +7 -6
- data/lib/loofah/html5/safelist.rb +940 -924
- data/lib/loofah/html5/scrub.rb +36 -35
- data/lib/loofah/metahelpers.rb +10 -6
- data/lib/loofah/scrubber.rb +10 -8
- data/lib/loofah/scrubbers.rb +174 -43
- data/lib/loofah/version.rb +2 -1
- data/lib/loofah/xml/document.rb +1 -0
- data/lib/loofah/xml/document_fragment.rb +2 -6
- data/lib/loofah.rb +116 -43
- metadata +18 -122
- data/lib/loofah/html/document_fragment.rb +0 -42
- data/lib/loofah/instance_methods.rb +0 -133
data/lib/loofah/html5/scrub.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
|
+
|
2
3
|
require "cgi"
|
3
4
|
require "crass"
|
4
5
|
|
@@ -6,9 +7,10 @@ module Loofah
|
|
6
7
|
module HTML5 # :nodoc:
|
7
8
|
module Scrub
|
8
9
|
CONTROL_CHARACTERS = /[`\u0000-\u0020\u007f\u0080-\u0101]/
|
9
|
-
CSS_KEYWORDISH = /\A(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,3}\.?\d{0,10}(ch|cm|r?em|ex|in|lh|mm|pc|pt|px|Q|vmax|vmin|vw|vh|%|,|\))?)\z/
|
10
|
+
CSS_KEYWORDISH = /\A(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,3}\.?\d{0,10}(ch|cm|r?em|ex|in|lh|mm|pc|pt|px|Q|vmax|vmin|vw|vh|%|,|\))?)\z/ # rubocop:disable Layout/LineLength
|
10
11
|
CRASS_SEMICOLON = { node: :semicolon, raw: ";" }
|
11
|
-
CSS_IMPORTANT =
|
12
|
+
CSS_IMPORTANT = "!important"
|
13
|
+
CSS_WHITESPACE = " "
|
12
14
|
CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES = /\A(["'])?[^"']+\1\z/
|
13
15
|
DATA_ATTRIBUTE_NAME = /\Adata-[\w-]+\z/
|
14
16
|
|
@@ -26,7 +28,7 @@ module Loofah
|
|
26
28
|
attr_node.node_name
|
27
29
|
end
|
28
30
|
|
29
|
-
if attr_name
|
31
|
+
if DATA_ATTRIBUTE_NAME.match?(attr_name)
|
30
32
|
next
|
31
33
|
end
|
32
34
|
|
@@ -43,10 +45,12 @@ module Loofah
|
|
43
45
|
scrub_attribute_that_allows_local_ref(attr_node)
|
44
46
|
end
|
45
47
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
48
|
+
next unless SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) &&
|
49
|
+
attr_name == "xlink:href" &&
|
50
|
+
attr_node.value =~ /^\s*[^#\s].*/m
|
51
|
+
|
52
|
+
attr_node.remove
|
53
|
+
next
|
50
54
|
end
|
51
55
|
|
52
56
|
scrub_css_attribute(node)
|
@@ -66,29 +70,28 @@ module Loofah
|
|
66
70
|
end
|
67
71
|
|
68
72
|
def scrub_css(style)
|
73
|
+
url_flags = [:url, :bad_url]
|
69
74
|
style_tree = Crass.parse_properties(style)
|
70
75
|
sanitized_tree = []
|
71
76
|
|
72
77
|
style_tree.each do |node|
|
73
78
|
next unless node[:node] == :property
|
74
79
|
next if node[:children].any? do |child|
|
75
|
-
|
80
|
+
url_flags.include?(child[:node])
|
76
81
|
end
|
77
82
|
|
78
83
|
name = node[:name].downcase
|
79
84
|
next unless SafeList::ALLOWED_CSS_PROPERTIES.include?(name) ||
|
80
|
-
|
81
|
-
|
85
|
+
SafeList::ALLOWED_SVG_PROPERTIES.include?(name) ||
|
86
|
+
SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first)
|
82
87
|
|
83
88
|
value = node[:children].map do |child|
|
84
89
|
case child[:node]
|
85
90
|
when :whitespace
|
86
|
-
|
91
|
+
CSS_WHITESPACE
|
87
92
|
when :string
|
88
|
-
if child[:raw]
|
93
|
+
if CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES.match?(child[:raw])
|
89
94
|
Crass::Parser.stringify(child)
|
90
|
-
else
|
91
|
-
nil
|
92
95
|
end
|
93
96
|
when :function
|
94
97
|
if SafeList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase)
|
@@ -97,18 +100,19 @@ module Loofah
|
|
97
100
|
when :ident
|
98
101
|
keyword = child[:value]
|
99
102
|
if !SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first) ||
|
100
|
-
|
101
|
-
|
103
|
+
SafeList::ALLOWED_CSS_KEYWORDS.include?(keyword) ||
|
104
|
+
(keyword =~ CSS_KEYWORDISH)
|
102
105
|
keyword
|
103
106
|
end
|
104
107
|
else
|
105
108
|
child[:raw]
|
106
109
|
end
|
107
|
-
end.compact
|
110
|
+
end.compact.join.strip
|
108
111
|
|
109
112
|
next if value.empty?
|
110
|
-
|
111
|
-
|
113
|
+
|
114
|
+
value << CSS_WHITESPACE << CSS_IMPORTANT if node[:important]
|
115
|
+
propstring = format("%s:%s", name, value)
|
112
116
|
sanitized_node = Crass.parse_properties(propstring).first
|
113
117
|
sanitized_tree << sanitized_node << CRASS_SEMICOLON
|
114
118
|
end
|
@@ -126,13 +130,9 @@ module Loofah
|
|
126
130
|
when :url
|
127
131
|
if node[:value].start_with?("#")
|
128
132
|
node[:raw]
|
129
|
-
else
|
130
|
-
nil
|
131
133
|
end
|
132
134
|
when :hash, :ident, :string
|
133
135
|
node[:raw]
|
134
|
-
else
|
135
|
-
nil
|
136
136
|
end
|
137
137
|
end.compact
|
138
138
|
|
@@ -142,7 +142,8 @@ module Loofah
|
|
142
142
|
def scrub_uri_attribute(attr_node)
|
143
143
|
# this block lifted nearly verbatim from HTML5 sanitization
|
144
144
|
val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS, "").downcase
|
145
|
-
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ &&
|
145
|
+
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ &&
|
146
|
+
!SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
|
146
147
|
attr_node.remove
|
147
148
|
return true
|
148
149
|
elsif val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0] == "data"
|
@@ -184,8 +185,8 @@ module Loofah
|
|
184
185
|
end
|
185
186
|
|
186
187
|
def cdata_needs_escaping?(node)
|
187
|
-
# Nokogiri's HTML4 parser on JRuby doesn't flag the child of a `style`
|
188
|
-
node.cdata? || (Nokogiri.jruby? && node.text? &&
|
188
|
+
# Nokogiri's HTML4 parser on JRuby doesn't flag the child of a `style` tag as cdata, but it acts that way
|
189
|
+
node.cdata? || (Nokogiri.jruby? && node.text? && node.parent.name == "style")
|
189
190
|
end
|
190
191
|
|
191
192
|
def cdata_escape(node)
|
@@ -198,28 +199,28 @@ module Loofah
|
|
198
199
|
end
|
199
200
|
|
200
201
|
TABLE_FOR_ESCAPE_HTML__ = {
|
201
|
-
|
202
|
-
|
203
|
-
|
202
|
+
"<" => "&lt;",
|
203
|
+
">" => "&gt;",
|
204
|
+
"&" => "&amp;",
|
204
205
|
}
|
205
206
|
|
206
207
|
def escape_tags(string)
|
207
208
|
# modified version of CGI.escapeHTML from ruby 3.1
|
208
209
|
enc = string.encoding
|
209
|
-
|
210
|
+
if enc.ascii_compatible?
|
211
|
+
string = string.b
|
212
|
+
string.gsub!(/[<>&]/, TABLE_FOR_ESCAPE_HTML__)
|
213
|
+
string.force_encoding(enc)
|
214
|
+
else
|
210
215
|
if enc.dummy?
|
211
216
|
origenc = enc
|
212
217
|
enc = Encoding::Converter.asciicompat_encoding(enc)
|
213
218
|
string = enc ? string.encode(enc) : string.b
|
214
219
|
end
|
215
|
-
table = Hash[TABLE_FOR_ESCAPE_HTML__.map {|pair|pair.map {|s|s.encode(enc)}}]
|
220
|
+
table = Hash[TABLE_FOR_ESCAPE_HTML__.map { |pair| pair.map { |s| s.encode(enc) } }]
|
216
221
|
string = string.gsub(/#{"[<>&]".encode(enc)}/, table)
|
217
222
|
string.encode!(origenc) if origenc
|
218
223
|
string
|
219
|
-
else
|
220
|
-
string = string.b
|
221
|
-
string.gsub!(/[<>&]/, TABLE_FOR_ESCAPE_HTML__)
|
222
|
-
string.force_encoding(enc)
|
223
224
|
end
|
224
225
|
end
|
225
226
|
end
|
data/lib/loofah/metahelpers.rb
CHANGED
@@ -1,12 +1,16 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
|
+
|
2
3
|
module Loofah
|
3
4
|
module MetaHelpers # :nodoc:
|
4
|
-
|
5
|
-
mojule
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
5
|
+
class << self
|
6
|
+
def add_downcased_set_members_to_all_set_constants(mojule)
|
7
|
+
mojule.constants.each do |constant_sym|
|
8
|
+
constant = mojule.const_get(constant_sym)
|
9
|
+
next unless Set === constant
|
10
|
+
|
11
|
+
constant.dup.each do |member|
|
12
|
+
constant.add(member.downcase)
|
13
|
+
end
|
10
14
|
end
|
11
15
|
end
|
12
16
|
end
|
data/lib/loofah/scrubber.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
|
+
|
2
3
|
module Loofah
|
3
4
|
#
|
4
5
|
# A RuntimeError raised when Loofah could not find an appropriate scrubber.
|
@@ -24,7 +25,7 @@ module Loofah
|
|
24
25
|
#
|
25
26
|
# This can then be run on a document:
|
26
27
|
#
|
27
|
-
# Loofah.
|
28
|
+
# Loofah.html5_fragment("<span>foo</span><p>bar</p>").scrub!(span2div).to_s
|
28
29
|
# # => "<div>foo</div><p>bar</p>"
|
29
30
|
#
|
30
31
|
# Scrubbers can be run on a document in either a top-down traversal (the
|
@@ -32,7 +33,6 @@ module Loofah
|
|
32
33
|
# Scrubber::STOP to terminate the traversal of a subtree.
|
33
34
|
#
|
34
35
|
class Scrubber
|
35
|
-
|
36
36
|
# Top-down Scrubbers may return CONTINUE to indicate that the subtree should be traversed.
|
37
37
|
CONTINUE = Object.new.freeze
|
38
38
|
|
@@ -67,7 +67,9 @@ module Loofah
|
|
67
67
|
unless [:top_down, :bottom_up].include?(direction)
|
68
68
|
raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
|
69
69
|
end
|
70
|
-
|
70
|
+
|
71
|
+
@direction = direction
|
72
|
+
@block = block
|
71
73
|
end
|
72
74
|
|
73
75
|
#
|
@@ -84,7 +86,7 @@ module Loofah
|
|
84
86
|
# +scrub+, which will be called for each document node.
|
85
87
|
#
|
86
88
|
def scrub(node)
|
87
|
-
raise ScrubberNotFound, "No scrub method has been defined on #{self.class
|
89
|
+
raise ScrubberNotFound, "No scrub method has been defined on #{self.class}"
|
88
90
|
end
|
89
91
|
|
90
92
|
#
|
@@ -103,8 +105,8 @@ module Loofah
|
|
103
105
|
def html5lib_sanitize(node)
|
104
106
|
case node.type
|
105
107
|
when Nokogiri::XML::Node::ELEMENT_NODE
|
106
|
-
if HTML5::Scrub.allowed_element?
|
107
|
-
HTML5::Scrub.scrub_attributes
|
108
|
+
if HTML5::Scrub.allowed_element?(node.name)
|
109
|
+
HTML5::Scrub.scrub_attributes(node)
|
108
110
|
return Scrubber::CONTINUE
|
109
111
|
end
|
110
112
|
when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE
|
@@ -120,8 +122,8 @@ module Loofah
|
|
120
122
|
def traverse_conditionally_top_down(node)
|
121
123
|
if block
|
122
124
|
return if block.call(node) == STOP
|
123
|
-
|
124
|
-
return
|
125
|
+
elsif scrub(node) == STOP
|
126
|
+
return
|
125
127
|
end
|
126
128
|
node.children.each { |j| traverse_conditionally_top_down(j) }
|
127
129
|
end
|
data/lib/loofah/scrubbers.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
|
+
|
2
3
|
module Loofah
|
3
4
|
#
|
4
5
|
# Loofah provides some built-in scrubbers for sanitizing with
|
@@ -11,7 +12,7 @@ module Loofah
|
|
11
12
|
# +:strip+ removes unknown/unsafe tags, but leaves behind the pristine contents:
|
12
13
|
#
|
13
14
|
# unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
|
14
|
-
# Loofah.
|
15
|
+
# Loofah.html5_fragment(unsafe_html).scrub!(:strip)
|
15
16
|
# => "ohai! <div>div is safe</div> but foo is <b>not</b>"
|
16
17
|
#
|
17
18
|
#
|
@@ -20,7 +21,7 @@ module Loofah
|
|
20
21
|
# +:prune+ removes unknown/unsafe tags and their contents (including their subtrees):
|
21
22
|
#
|
22
23
|
# unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
|
23
|
-
# Loofah.
|
24
|
+
# Loofah.html5_fragment(unsafe_html).scrub!(:prune)
|
24
25
|
# => "ohai! <div>div is safe</div> "
|
25
26
|
#
|
26
27
|
#
|
@@ -29,7 +30,7 @@ module Loofah
|
|
29
30
|
# +:escape+ performs HTML entity escaping on the unknown/unsafe tags:
|
30
31
|
#
|
31
32
|
# unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
|
32
|
-
# Loofah.
|
33
|
+
# Loofah.html5_fragment(unsafe_html).scrub!(:escape)
|
33
34
|
# => "ohai! <div>div is safe</div> &lt;foo&gt;but foo is &lt;b&gt;not&lt;/b&gt;&lt;/foo&gt;"
|
34
35
|
#
|
35
36
|
#
|
@@ -41,7 +42,7 @@ module Loofah
|
|
41
42
|
# layer of paint on top of the HTML input to make it look nice.
|
42
43
|
#
|
43
44
|
# messy_markup = "ohai! <div id='foo' class='bar' style='margin: 10px'>div with attributes</div>"
|
44
|
-
# Loofah.
|
45
|
+
# Loofah.html5_fragment(messy_markup).scrub!(:whitewash)
|
45
46
|
# => "ohai! <div>div with attributes</div>"
|
46
47
|
#
|
47
48
|
# One use case for this scrubber is to clean up HTML that was
|
@@ -56,25 +57,42 @@ module Loofah
|
|
56
57
|
# +:nofollow+ adds a rel="nofollow" attribute to all links
|
57
58
|
#
|
58
59
|
# link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
|
59
|
-
# Loofah.
|
60
|
+
# Loofah.html5_fragment(link_farmers_markup).scrub!(:nofollow)
|
60
61
|
# => "ohai! <a href='http://www.myswarmysite.com/' rel="nofollow">I like your blog post</a>"
|
61
62
|
#
|
62
63
|
#
|
64
|
+
# === Loofah::Scrubbers::TargetBlank / scrub!(:targetblank)
|
65
|
+
#
|
66
|
+
# +:targetblank+ adds a target="_blank" attribute to all links
|
67
|
+
#
|
68
|
+
# link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
|
69
|
+
# Loofah.html5_fragment(link_farmers_markup).scrub!(:targetblank)
|
70
|
+
# => "ohai! <a href='http://www.myswarmysite.com/' target="_blank">I like your blog post</a>"
|
71
|
+
#
|
72
|
+
#
|
63
73
|
# === Loofah::Scrubbers::NoOpener / scrub!(:noopener)
|
64
74
|
#
|
65
75
|
# +:noopener+ adds a rel="noopener" attribute to all links
|
66
76
|
#
|
67
77
|
# link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
|
68
|
-
# Loofah.
|
78
|
+
# Loofah.html5_fragment(link_farmers_markup).scrub!(:noopener)
|
69
79
|
# => "ohai! <a href='http://www.myswarmysite.com/' rel="noopener">I like your blog post</a>"
|
70
80
|
#
|
81
|
+
# === Loofah::Scrubbers::NoReferrer / scrub!(:noreferrer)
|
82
|
+
#
|
83
|
+
# +:noreferrer+ adds a rel="noreferrer" attribute to all links
|
84
|
+
#
|
85
|
+
# link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
|
86
|
+
# Loofah.html5_fragment(link_farmers_markup).scrub!(:noreferrer)
|
87
|
+
# => "ohai! <a href='http://www.myswarmysite.com/' rel="noreferrer">I like your blog post</a>"
|
88
|
+
#
|
71
89
|
#
|
72
90
|
# === Loofah::Scrubbers::Unprintable / scrub!(:unprintable)
|
73
91
|
#
|
74
92
|
# +:unprintable+ removes unprintable Unicode characters.
|
75
93
|
#
|
76
94
|
# markup = "<p>Some text with an unprintable character at the end\u2028</p>"
|
77
|
-
# Loofah.
|
95
|
+
# Loofah.html5_fragment(markup).scrub!(:unprintable)
|
78
96
|
# => "<p>Some text with an unprintable character at the end</p>"
|
79
97
|
#
|
80
98
|
# You may not be able to see the unprintable character in the above example, but there is a
|
@@ -90,19 +108,20 @@ module Loofah
|
|
90
108
|
# +:strip+ removes unknown/unsafe tags, but leaves behind the pristine contents:
|
91
109
|
#
|
92
110
|
# unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
|
93
|
-
# Loofah.
|
111
|
+
# Loofah.html5_fragment(unsafe_html).scrub!(:strip)
|
94
112
|
# => "ohai! <div>div is safe</div> but foo is <b>not</b>"
|
95
113
|
#
|
96
114
|
class Strip < Scrubber
|
97
|
-
def initialize
|
115
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
98
116
|
@direction = :bottom_up
|
99
117
|
end
|
100
118
|
|
101
119
|
def scrub(node)
|
102
120
|
return CONTINUE if html5lib_sanitize(node) == CONTINUE
|
121
|
+
|
103
122
|
node.before(node.children)
|
104
123
|
node.remove
|
105
|
-
|
124
|
+
STOP
|
106
125
|
end
|
107
126
|
end
|
108
127
|
|
@@ -112,18 +131,19 @@ module Loofah
|
|
112
131
|
# +:prune+ removes unknown/unsafe tags and their contents (including their subtrees):
|
113
132
|
#
|
114
133
|
# unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
|
115
|
-
# Loofah.
|
134
|
+
# Loofah.html5_fragment(unsafe_html).scrub!(:prune)
|
116
135
|
# => "ohai! <div>div is safe</div> "
|
117
136
|
#
|
118
137
|
class Prune < Scrubber
|
119
|
-
def initialize
|
138
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
120
139
|
@direction = :top_down
|
121
140
|
end
|
122
141
|
|
123
142
|
def scrub(node)
|
124
143
|
return CONTINUE if html5lib_sanitize(node) == CONTINUE
|
144
|
+
|
125
145
|
node.remove
|
126
|
-
|
146
|
+
STOP
|
127
147
|
end
|
128
148
|
end
|
129
149
|
|
@@ -133,19 +153,20 @@ module Loofah
|
|
133
153
|
# +:escape+ performs HTML entity escaping on the unknown/unsafe tags:
|
134
154
|
#
|
135
155
|
# unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
|
136
|
-
# Loofah.
|
156
|
+
# Loofah.html5_fragment(unsafe_html).scrub!(:escape)
|
137
157
|
# => "ohai! <div>div is safe</div> &lt;foo&gt;but foo is &lt;b&gt;not&lt;/b&gt;&lt;/foo&gt;"
|
138
158
|
#
|
139
159
|
class Escape < Scrubber
|
140
|
-
def initialize
|
160
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
141
161
|
@direction = :top_down
|
142
162
|
end
|
143
163
|
|
144
164
|
def scrub(node)
|
145
165
|
return CONTINUE if html5lib_sanitize(node) == CONTINUE
|
146
|
-
|
166
|
+
|
167
|
+
node.add_next_sibling(Nokogiri::XML::Text.new(node.to_s, node.document))
|
147
168
|
node.remove
|
148
|
-
|
169
|
+
STOP
|
149
170
|
end
|
150
171
|
end
|
151
172
|
|
@@ -158,7 +179,7 @@ module Loofah
|
|
158
179
|
# layer of paint on top of the HTML input to make it look nice.
|
159
180
|
#
|
160
181
|
# messy_markup = "ohai! <div id='foo' class='bar' style='margin: 10px'>div with attributes</div>"
|
161
|
-
# Loofah.
|
182
|
+
# Loofah.html5_fragment(messy_markup).scrub!(:whitewash)
|
162
183
|
# => "ohai! <div>div with attributes</div>"
|
163
184
|
#
|
164
185
|
# One use case for this scrubber is to clean up HTML that was
|
@@ -168,14 +189,14 @@ module Loofah
|
|
168
189
|
# Certainly not me.
|
169
190
|
#
|
170
191
|
class Whitewash < Scrubber
|
171
|
-
def initialize
|
192
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
172
193
|
@direction = :top_down
|
173
194
|
end
|
174
195
|
|
175
196
|
def scrub(node)
|
176
197
|
case node.type
|
177
198
|
when Nokogiri::XML::Node::ELEMENT_NODE
|
178
|
-
if HTML5::Scrub.allowed_element?
|
199
|
+
if HTML5::Scrub.allowed_element?(node.name)
|
179
200
|
node.attributes.each { |attr| node.remove_attribute(attr.first) }
|
180
201
|
return CONTINUE if node.namespaces.empty?
|
181
202
|
end
|
@@ -193,18 +214,48 @@ module Loofah
|
|
193
214
|
# +:nofollow+ adds a rel="nofollow" attribute to all links
|
194
215
|
#
|
195
216
|
# link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
|
196
|
-
# Loofah.
|
217
|
+
# Loofah.html5_fragment(link_farmers_markup).scrub!(:nofollow)
|
197
218
|
# => "ohai! <a href='http://www.myswarmysite.com/' rel="nofollow">I like your blog post</a>"
|
198
219
|
#
|
199
220
|
class NoFollow < Scrubber
|
200
|
-
def initialize
|
221
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
201
222
|
@direction = :top_down
|
202
223
|
end
|
203
224
|
|
204
225
|
def scrub(node)
|
205
226
|
return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
|
227
|
+
|
206
228
|
append_attribute(node, "rel", "nofollow")
|
207
|
-
|
229
|
+
STOP
|
230
|
+
end
|
231
|
+
end
|
232
|
+
|
233
|
+
#
|
234
|
+
# === scrub!(:targetblank)
|
235
|
+
#
|
236
|
+
# +:targetblank+ adds a target="_blank" attribute to all links.
|
237
|
+
# If there is a target already set, replaces it with target="_blank".
|
238
|
+
#
|
239
|
+
# link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
|
240
|
+
# Loofah.html5_fragment(link_farmers_markup).scrub!(:targetblank)
|
241
|
+
# => "ohai! <a href='http://www.myswarmysite.com/' target="_blank">I like your blog post</a>"
|
242
|
+
#
|
243
|
+
# On modern browsers, setting target="_blank" on anchor elements implicitly provides the same
|
244
|
+
# behavior as setting rel="noopener".
|
245
|
+
#
|
246
|
+
class TargetBlank < Scrubber
|
247
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
248
|
+
@direction = :top_down
|
249
|
+
end
|
250
|
+
|
251
|
+
def scrub(node)
|
252
|
+
return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
|
253
|
+
|
254
|
+
href = node["href"]
|
255
|
+
|
256
|
+
node.set_attribute("target", "_blank") if href && href[0] != "#"
|
257
|
+
|
258
|
+
STOP
|
208
259
|
end
|
209
260
|
end
|
210
261
|
|
@@ -214,35 +265,59 @@ module Loofah
|
|
214
265
|
# +:noopener+ adds a rel="noopener" attribute to all links
|
215
266
|
#
|
216
267
|
# link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
|
217
|
-
# Loofah.
|
268
|
+
# Loofah.html5_fragment(link_farmers_markup).scrub!(:noopener)
|
218
269
|
# => "ohai! <a href='http://www.myswarmysite.com/' rel="noopener">I like your blog post</a>"
|
219
270
|
#
|
220
271
|
class NoOpener < Scrubber
|
221
|
-
def initialize
|
272
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
222
273
|
@direction = :top_down
|
223
274
|
end
|
224
275
|
|
225
276
|
def scrub(node)
|
226
277
|
return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
|
278
|
+
|
227
279
|
append_attribute(node, "rel", "noopener")
|
228
|
-
|
280
|
+
STOP
|
281
|
+
end
|
282
|
+
end
|
283
|
+
|
284
|
+
#
|
285
|
+
# === scrub!(:noreferrer)
|
286
|
+
#
|
287
|
+
# +:noreferrer+ adds a rel="noreferrer" attribute to all links
|
288
|
+
#
|
289
|
+
# link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
|
290
|
+
# Loofah.html5_fragment(link_farmers_markup).scrub!(:noreferrer)
|
291
|
+
# => "ohai! <a href='http://www.myswarmysite.com/' rel="noreferrer">I like your blog post</a>"
|
292
|
+
#
|
293
|
+
class NoReferrer < Scrubber
|
294
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
295
|
+
@direction = :top_down
|
296
|
+
end
|
297
|
+
|
298
|
+
def scrub(node)
|
299
|
+
return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
|
300
|
+
|
301
|
+
append_attribute(node, "rel", "noreferrer")
|
302
|
+
STOP
|
229
303
|
end
|
230
304
|
end
|
231
305
|
|
232
306
|
# This class probably isn't useful publicly, but is used for #to_text's current implemention
|
233
307
|
class NewlineBlockElements < Scrubber # :nodoc:
|
234
|
-
def initialize
|
308
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
235
309
|
@direction = :bottom_up
|
236
310
|
end
|
237
311
|
|
238
312
|
def scrub(node)
|
239
313
|
return CONTINUE unless Loofah::Elements::LINEBREAKERS.include?(node.name)
|
314
|
+
|
240
315
|
replacement = if Loofah::Elements::INLINE_LINE_BREAK.include?(node.name)
|
241
316
|
"\n"
|
242
317
|
else
|
243
318
|
"\n#{node.content}\n"
|
244
319
|
end
|
245
|
-
node.add_next_sibling
|
320
|
+
node.add_next_sibling(Nokogiri::XML::Text.new(replacement, node.document))
|
246
321
|
node.remove
|
247
322
|
end
|
248
323
|
end
|
@@ -253,7 +328,7 @@ module Loofah
|
|
253
328
|
# +:unprintable+ removes unprintable Unicode characters.
|
254
329
|
#
|
255
330
|
# markup = "<p>Some text with an unprintable character at the end\u2028</p>"
|
256
|
-
# Loofah.
|
331
|
+
# Loofah.html5_fragment(markup).scrub!(:unprintable)
|
257
332
|
# => "<p>Some text with an unprintable character at the end</p>"
|
258
333
|
#
|
259
334
|
# You may not be able to see the unprintable character in the above example, but there is a
|
@@ -263,7 +338,7 @@ module Loofah
|
|
263
338
|
# http://timelessrepo.com/json-isnt-a-javascript-subset
|
264
339
|
#
|
265
340
|
class Unprintable < Scrubber
|
266
|
-
def initialize
|
341
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
267
342
|
@direction = :top_down
|
268
343
|
end
|
269
344
|
|
@@ -275,25 +350,81 @@ module Loofah
|
|
275
350
|
end
|
276
351
|
end
|
277
352
|
|
353
|
+
#
|
354
|
+
# === scrub!(:double_breakpoint)
|
355
|
+
#
|
356
|
+
# +:double_breakpoint+ replaces double-break tags with closing/opening paragraph tags.
|
357
|
+
#
|
358
|
+
# markup = "<p>Some text here in a logical paragraph.<br><br>Some more text, apparently a second paragraph.</p>"
|
359
|
+
# Loofah.html5_fragment(markup).scrub!(:double_breakpoint)
|
360
|
+
# => "<p>Some text here in a logical paragraph.</p><p>Some more text, apparently a second paragraph.</p>"
|
361
|
+
#
|
362
|
+
class DoubleBreakpoint < Scrubber
|
363
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
364
|
+
@direction = :top_down
|
365
|
+
end
|
366
|
+
|
367
|
+
def scrub(node)
|
368
|
+
return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "p")
|
369
|
+
|
370
|
+
paragraph_with_break_point_nodes = node.xpath("//p[br[following-sibling::br]]")
|
371
|
+
|
372
|
+
paragraph_with_break_point_nodes.each do |paragraph_node|
|
373
|
+
new_paragraph = paragraph_node.add_previous_sibling("<p>").first
|
374
|
+
|
375
|
+
paragraph_node.children.each do |child|
|
376
|
+
remove_blank_text_nodes(child)
|
377
|
+
end
|
378
|
+
|
379
|
+
paragraph_node.children.each do |child|
|
380
|
+
# already unlinked
|
381
|
+
next if child.parent.nil?
|
382
|
+
|
383
|
+
if child.name == "br" && child.next_sibling.name == "br"
|
384
|
+
new_paragraph = paragraph_node.add_previous_sibling("<p>").first
|
385
|
+
child.next_sibling.unlink
|
386
|
+
child.unlink
|
387
|
+
else
|
388
|
+
child.parent = new_paragraph
|
389
|
+
end
|
390
|
+
end
|
391
|
+
|
392
|
+
paragraph_node.unlink
|
393
|
+
end
|
394
|
+
|
395
|
+
CONTINUE
|
396
|
+
end
|
397
|
+
|
398
|
+
private
|
399
|
+
|
400
|
+
def remove_blank_text_nodes(node)
|
401
|
+
node.unlink if node.text? && node.blank?
|
402
|
+
end
|
403
|
+
end
|
278
404
|
#
|
279
405
|
# A hash that maps a symbol (like +:prune+) to the appropriate Scrubber (Loofah::Scrubbers::Prune).
|
280
406
|
#
|
281
407
|
MAP = {
|
282
|
-
:
|
283
|
-
:
|
284
|
-
:
|
285
|
-
:
|
286
|
-
:
|
287
|
-
:
|
288
|
-
:
|
289
|
-
:
|
408
|
+
escape: Escape,
|
409
|
+
prune: Prune,
|
410
|
+
whitewash: Whitewash,
|
411
|
+
strip: Strip,
|
412
|
+
nofollow: NoFollow,
|
413
|
+
noopener: NoOpener,
|
414
|
+
noreferrer: NoReferrer,
|
415
|
+
targetblank: TargetBlank,
|
416
|
+
newline_block_elements: NewlineBlockElements,
|
417
|
+
unprintable: Unprintable,
|
418
|
+
double_breakpoint: DoubleBreakpoint,
|
290
419
|
}
|
291
420
|
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
421
|
+
class << self
|
422
|
+
#
|
423
|
+
# Returns an array of symbols representing the built-in scrubbers
|
424
|
+
#
|
425
|
+
def scrubber_symbols
|
426
|
+
MAP.keys
|
427
|
+
end
|
297
428
|
end
|
298
429
|
end
|
299
430
|
end
|