loofah 2.3.1
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of loofah might be problematic. Click here for more details.
- checksums.yaml +7 -0
- data/.gemtest +0 -0
- data/CHANGELOG.md +336 -0
- data/Gemfile +22 -0
- data/MIT-LICENSE.txt +23 -0
- data/Manifest.txt +41 -0
- data/README.md +363 -0
- data/Rakefile +81 -0
- data/SECURITY.md +18 -0
- data/benchmark/benchmark.rb +149 -0
- data/benchmark/fragment.html +96 -0
- data/benchmark/helper.rb +73 -0
- data/benchmark/www.slashdot.com.html +2560 -0
- data/lib/loofah.rb +83 -0
- data/lib/loofah/elements.rb +92 -0
- data/lib/loofah/helpers.rb +103 -0
- data/lib/loofah/html/document.rb +18 -0
- data/lib/loofah/html/document_fragment.rb +40 -0
- data/lib/loofah/html5/libxml2_workarounds.rb +26 -0
- data/lib/loofah/html5/safelist.rb +796 -0
- data/lib/loofah/html5/scrub.rb +133 -0
- data/lib/loofah/instance_methods.rb +127 -0
- data/lib/loofah/metahelpers.rb +13 -0
- data/lib/loofah/scrubber.rb +133 -0
- data/lib/loofah/scrubbers.rb +297 -0
- data/lib/loofah/xml/document.rb +13 -0
- data/lib/loofah/xml/document_fragment.rb +23 -0
- data/test/assets/msword.html +63 -0
- data/test/assets/testdata_sanitizer_tests1.dat +502 -0
- data/test/helper.rb +18 -0
- data/test/html5/test_sanitizer.rb +401 -0
- data/test/html5/test_scrub.rb +10 -0
- data/test/integration/test_ad_hoc.rb +220 -0
- data/test/integration/test_helpers.rb +43 -0
- data/test/integration/test_html.rb +72 -0
- data/test/integration/test_scrubbers.rb +400 -0
- data/test/integration/test_xml.rb +55 -0
- data/test/unit/test_api.rb +142 -0
- data/test/unit/test_encoding.rb +20 -0
- data/test/unit/test_helpers.rb +62 -0
- data/test/unit/test_scrubber.rb +229 -0
- data/test/unit/test_scrubbers.rb +14 -0
- metadata +287 -0
@@ -0,0 +1,133 @@
|
|
1
|
+
require 'cgi'
|
2
|
+
require 'crass'
|
3
|
+
|
4
|
+
module Loofah
|
5
|
+
module HTML5 # :nodoc:
|
6
|
+
module Scrub
|
7
|
+
|
8
|
+
# Characters deleted from a URI attribute value before its protocol is
# inspected: the backtick, U+0000..U+0020, U+007F and U+0080..U+0101.
CONTROL_CHARACTERS = /[`\u0000-\u0020\u007f\u0080-\u0101]/

# Matches a whole token that is a hex color, an rgb(...) triple, or a short
# number optionally followed by a unit, "%", "," or ")".
CSS_KEYWORDISH = /\A(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,3}\.?\d{0,10}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/

# Crass node for a ";" separator, appended after each property that
# scrub_css keeps.
CRASS_SEMICOLON = {:node => :semicolon, :raw => ";"}
|
11
|
+
|
12
|
+
class << self
|
13
|
+
|
14
|
+
# Reports whether +element_name+ is a member of
# SafeList::ALLOWED_ELEMENTS_WITH_LIBXML2.
def allowed_element?(element_name)
  safelisted_elements = ::Loofah::HTML5::SafeList::ALLOWED_ELEMENTS_WITH_LIBXML2
  safelisted_elements.include?(element_name)
end
|
17
|
+
|
18
|
+
# alternative implementation of the html5lib attribute scrubbing algorithm
|
19
|
+
# Sanitizes every attribute of +node+ in place.
#
# First pass, per attribute (the name is prefixed with "<ns prefix>:" when
# the attribute has a namespace):
# * names matching data-* are skipped;
# * names missing from SafeList::ALLOWED_ATTRIBUTES are removed;
# * for names in SafeList::ATTR_VAL_IS_URI the value is HTML-unescaped,
#   stripped of CONTROL_CHARACTERS and downcased; the attribute is removed
#   when that value starts with a scheme that is not in
#   SafeList::ALLOWED_PROTOCOLS, or when it is a data: URI whose media type
#   is not in SafeList::ALLOWED_URI_DATA_MEDIATYPES;
# * for names in SafeList::SVG_ATTR_VAL_ALLOWS_REF every url(...) whose
#   target does not begin with "#" is replaced by a single space;
# * an xlink:href on an element listed in SafeList::SVG_ALLOW_LOCAL_HREF is
#   removed when something other than "#" is the first non-space character
#   of the value or of a line inside it.
#
# Afterwards scrub_css_attribute is called, attributes whose value holds no
# non-space character are removed, and force_correct_attribute_escaping!
# is applied (its result is the return value).
def scrub_attributes(node)
  node.attribute_nodes.each do |attr_node|
    namespace = attr_node.namespace
    attr_name = namespace ? "#{namespace.prefix}:#{attr_node.node_name}" : attr_node.node_name

    next if attr_name =~ /\Adata-[\w-]+\z/

    unless SafeList::ALLOWED_ATTRIBUTES.include?(attr_name)
      attr_node.remove
      next
    end

    if SafeList::ATTR_VAL_IS_URI.include?(attr_name)
      # this block lifted nearly verbatim from HTML5 sanitization
      val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS, '').downcase
      uri_parts = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)
      scheme = uri_parts[0]
      looks_like_scheme = val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/

      if looks_like_scheme && !SafeList::ALLOWED_PROTOCOLS.include?(scheme)
        attr_node.remove
        next
      elsif scheme == 'data'
        # permit only allowed data mediatypes
        mediatype = uri_parts[1]
        mediatype = mediatype.split(';').first if mediatype
        if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
          attr_node.remove
          next
        end
      end
    end

    if SafeList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name) && attr_node.value
      attr_node.value = attr_node.value.gsub(/url\s*\(\s*[^#\s][^)]+?\)/m, ' ')
    end

    non_local_href = SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) &&
                     attr_name == 'xlink:href' &&
                     attr_node.value =~ /^\s*[^#\s].*/m
    attr_node.remove if non_local_href
  end

  scrub_css_attribute(node)

  # drop attributes that ended up empty or whitespace-only
  node.attribute_nodes.each do |attr_node|
    node.remove_attribute(attr_node.name) unless attr_node.value =~ /[^[:space:]]/
  end

  force_correct_attribute_escaping!(node)
end
|
69
|
+
|
70
|
+
# Replaces the value of +node+'s style attribute with the result of
# scrub_css. Does nothing (and returns nil) when there is no style attribute.
def scrub_css_attribute(node)
  style_attr = node.attributes['style']
  return unless style_attr

  style_attr.value = scrub_css(style_attr.value)
end
|
74
|
+
|
75
|
+
# Returns a sanitized copy of the CSS declaration list +style+.
#
# The string is parsed with Crass.parse_properties. Only :property nodes are
# considered, and a property is dropped outright when any of its children is
# a :url / :bad_url node or a :function whose downcased name is not in
# SafeList::ALLOWED_CSS_FUNCTIONS. A surviving property is kept verbatim if
# its downcased name is in ALLOWED_CSS_PROPERTIES or ALLOWED_SVG_PROPERTIES.
# If instead the part of the name before the first "-" is in
# SHORTHAND_CSS_PROPERTIES, only the whitespace-separated value tokens that
# are in ALLOWED_CSS_KEYWORDS or match CSS_KEYWORDISH are kept, and the
# property is re-parsed from them (or dropped if none remain). Every kept
# property is followed by CRASS_SEMICOLON before the tree is stringified.
def scrub_css(style)
  kept = []

  Crass.parse_properties(style).each do |decl|
    next unless decl[:node] == :property

    unsafe = decl[:children].any? do |child|
      kind = child[:node]
      kind == :url || kind == :bad_url ||
        (kind == :function && !SafeList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase))
    end
    next if unsafe

    prop = decl[:name].downcase
    if SafeList::ALLOWED_CSS_PROPERTIES.include?(prop) || SafeList::ALLOWED_SVG_PROPERTIES.include?(prop)
      kept.push(decl, CRASS_SEMICOLON)
    elsif SafeList::SHORTHAND_CSS_PROPERTIES.include?(prop.split('-').first)
      safe_tokens = decl[:value].split.select do |keyword|
        SafeList::ALLOWED_CSS_KEYWORDS.include?(keyword) || keyword =~ CSS_KEYWORDISH
      end
      next if safe_tokens.empty?

      rebuilt = Crass.parse_properties("#{prop}:#{safe_tokens.join(' ')}").first
      kept.push(rebuilt, CRASS_SEMICOLON)
    end
  end

  Crass::Parser.stringify(kept)
end
|
103
|
+
|
104
|
+
#
|
105
|
+
# libxml2 >= 2.9.2 fails to escape comments within some attributes.
|
106
|
+
#
|
107
|
+
# see comments about CVE-2018-8048 within the tests for more information
|
108
|
+
#
|
109
|
+
# Percent-encodes spaces (%20) and double quotes (%22) in the values of the
# attributes named in LibxmlWorkarounds::BROKEN_ESCAPING_ATTRIBUTES.
#
# Nothing is done unless Nokogiri reports that it is using libxml2. When
# BROKEN_ESCAPING_ATTRIBUTES_QUALIFYING_TAG has an entry for the attribute,
# the attribute is only rewritten on an element with that name. The
# rewritten value is forced back to the encoding of the original value.
def force_correct_attribute_escaping!(node)
  return unless Nokogiri::VersionInfo.instance.libxml2?

  node.attribute_nodes.each do |attribute|
    attr_name = attribute.name
    next unless LibxmlWorkarounds::BROKEN_ESCAPING_ATTRIBUTES.include?(attr_name)

    qualifying_tag = LibxmlWorkarounds::BROKEN_ESCAPING_ATTRIBUTES_QUALIFYING_TAG[attr_name]
    next if !qualifying_tag.nil? && qualifying_tag != node.name

    # just like CGI.escape in Ruby 2.4, but only space and double-quote are
    # encoded, to mimic pre-2.9.2 libxml2 behavior
    raw_value = attribute.value
    escaped = raw_value.gsub(/[ "]/) do |char|
      '%' + char.unpack('H2' * char.bytesize).join('%').upcase
    end
    attribute.value = escaped.force_encoding(raw_value.encoding)
  end
end
|
129
|
+
|
130
|
+
end
|
131
|
+
end
|
132
|
+
end
|
133
|
+
end
|
@@ -0,0 +1,127 @@
|
|
1
|
+
module Loofah
|
2
|
+
#
|
3
|
+
# Mixes +scrub!+ into Document, DocumentFragment, Node and NodeSet.
|
4
|
+
#
|
5
|
+
# Traverse the document or fragment, invoking the +scrubber+ on
|
6
|
+
# each node.
|
7
|
+
#
|
8
|
+
# +scrubber+ must either be one of the symbols representing the
|
9
|
+
# built-in scrubbers (see Scrubbers), or a Scrubber instance.
|
10
|
+
#
|
11
|
+
# span2div = Loofah::Scrubber.new do |node|
|
12
|
+
# node.name = "div" if node.name == "span"
|
13
|
+
# end
|
14
|
+
# Loofah.fragment("<span>foo</span><p>bar</p>").scrub!(span2div).to_s
|
15
|
+
# # => "<div>foo</div><p>bar</p>"
|
16
|
+
#
|
17
|
+
# or
|
18
|
+
#
|
19
|
+
# unsafe_html = "ohai! <div>div is safe</div> <script>but script is not</script>"
|
20
|
+
# Loofah.fragment(unsafe_html).scrub!(:strip).to_s
|
21
|
+
# # => "ohai! <div>div is safe</div> "
|
22
|
+
#
|
23
|
+
# Note that this method is called implicitly from
|
24
|
+
# Loofah.scrub_fragment and Loofah.scrub_document.
|
25
|
+
#
|
26
|
+
# Please see Scrubber for more information on implementation and traversal, and
|
27
|
+
# README.md for more example usage.
|
28
|
+
#
|
29
|
+
module ScrubBehavior
|
30
|
+
module Node # :nodoc:
|
31
|
+
# Resolves +scrubber+ through ScrubBehavior.resolve_scrubber and runs it
# over the receiver, then returns the receiver.
#
# A Document is traversed from its root (skipped when there is no root), a
# DocumentFragment delegates to its children's scrub!, and any other node
# is traversed directly.
def scrub!(scrubber)
  # All three cases live in one method on purpose: nokogiri decides whether
  # to decorate based on whether the module name has already been included,
  # and documents are decorated just like their nodes.
  scrubber = ScrubBehavior.resolve_scrubber(scrubber)

  if is_a?(Nokogiri::XML::Document)
    scrubber.traverse(root) if root
  elsif is_a?(Nokogiri::XML::DocumentFragment)
    children.scrub!(scrubber)
  else
    scrubber.traverse(self)
  end

  self
end
|
50
|
+
end
|
51
|
+
|
52
|
+
module NodeSet # :nodoc:
|
53
|
+
# Calls scrub!(scrubber) on each member of the set, then returns the set.
def scrub!(scrubber)
  each do |member|
    member.scrub!(scrubber)
  end
  self
end
|
57
|
+
end
|
58
|
+
|
59
|
+
# Turns +scrubber+ into a Loofah::Scrubber instance: a key of Scrubbers::MAP
# is replaced by a new instance of the mapped class, anything else is used
# as given. Raises Loofah::ScrubberNotFound when the result is not a
# Loofah::Scrubber.
def ScrubBehavior.resolve_scrubber(scrubber) # :nodoc:
  builtin = Scrubbers::MAP[scrubber]
  scrubber = builtin.new if builtin
  return scrubber if scrubber.is_a?(Loofah::Scrubber)

  raise Loofah::ScrubberNotFound, "not a Scrubber or a scrubber name: #{scrubber.inspect}"
end
|
66
|
+
end
|
67
|
+
|
68
|
+
#
|
69
|
+
# Overrides +text+ in HTML::Document and HTML::DocumentFragment,
|
70
|
+
# and mixes in +to_text+.
|
71
|
+
#
|
72
|
+
module TextBehavior
|
73
|
+
#
|
74
|
+
# Returns a plain-text version of the markup contained by the document,
|
75
|
+
# with HTML entities encoded.
|
76
|
+
#
|
77
|
+
# This method is significantly faster than #to_text, but isn't
|
78
|
+
# clever about whitespace around block elements.
|
79
|
+
#
|
80
|
+
# Loofah.document("<h1>Title</h1><div>Content</div>").text
|
81
|
+
# # => "TitleContent"
|
82
|
+
#
|
83
|
+
# By default, the returned text will have HTML entities
|
84
|
+
# escaped. If you want unescaped entities, and you understand
|
85
|
+
# that the result is unsafe to render in a browser, then you
|
86
|
+
# can pass an argument as shown:
|
87
|
+
#
|
88
|
+
# frag = Loofah.fragment("<script>alert('EVIL');</script>")
|
89
|
+
# # ok for browser:
|
90
|
+
# frag.text # => "&lt;script&gt;alert('EVIL');&lt;/script&gt;"
|
91
|
+
# # decidedly not ok for browser:
|
92
|
+
# frag.text(:encode_special_chars => false) # => "<script>alert('EVIL');</script>"
|
93
|
+
#
|
94
|
+
# Returns the inner text of serialize_root's children, or "" if reading it
# raises a StandardError. The text is passed through encode_special_chars
# unless options[:encode_special_chars] is exactly false.
def text(options={})
  raw_text = begin
    serialize_root.children.inner_text
  rescue StandardError
    ""
  end

  # possibly dangerous if rendered in a browser
  return raw_text if options[:encode_special_chars] == false

  encode_special_chars(raw_text)
end
|
102
|
+
alias :inner_text :text
|
103
|
+
alias :to_str :text
|
104
|
+
|
105
|
+
#
|
106
|
+
# Returns a plain-text version of the markup contained by the
|
107
|
+
# fragment, with HTML entities encoded.
|
108
|
+
#
|
109
|
+
# This method is slower than #text, but is clever about
|
110
|
+
# whitespace around block elements.
|
111
|
+
#
|
112
|
+
# Loofah.document("<h1>Title</h1><div>Content</div>").to_text
|
113
|
+
# # => "\nTitle\n\nContent\n"
|
114
|
+
#
|
115
|
+
# Scrubs a copy of the receiver with :newline_block_elements, takes its
# #text (forwarding +options+), and returns that string after
# Loofah.remove_extraneous_whitespace. The receiver itself is not scrubbed.
def to_text(options={})
  scrubbed_copy = dup.scrub!(:newline_block_elements)
  Loofah.remove_extraneous_whitespace(scrubbed_copy.text(options))
end
|
118
|
+
end
|
119
|
+
|
120
|
+
module DocumentDecorator # :nodoc:
|
121
|
+
# Runs the inherited initializer with the same arguments and block, then
# appends ScrubBehavior::Node and ScrubBehavior::NodeSet to this document's
# decorator lists for Nokogiri::XML::Node and Nokogiri::XML::NodeSet.
def initialize(*args, &block)
  super
  decorators(Nokogiri::XML::Node) << ScrubBehavior::Node
  decorators(Nokogiri::XML::NodeSet) << ScrubBehavior::NodeSet
end
|
126
|
+
end
|
127
|
+
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
module Loofah
|
2
|
+
module MetaHelpers # :nodoc:
|
3
|
+
# For every constant of +mojule+ whose value is a Set, adds the downcased
# form of each current member to that same Set. Other constants are ignored.
def self.add_downcased_set_members_to_all_set_constants(mojule)
  mojule.constants.each do |name|
    candidate = mojule.const_get(name)
    next unless Set === candidate

    # iterate over a snapshot, since the set is modified while we go
    candidate.to_a.each { |member| candidate << member.downcase }
  end
end
|
12
|
+
end
|
13
|
+
end
|
@@ -0,0 +1,133 @@
|
|
1
|
+
module Loofah
|
2
|
+
#
|
3
|
+
# A RuntimeError raised when Loofah could not find an appropriate scrubber.
|
4
|
+
#
|
5
|
+
class ScrubberNotFound < RuntimeError
end
|
6
|
+
|
7
|
+
#
|
8
|
+
# A Scrubber wraps up a block (or method) that is run on an HTML node (element):
|
9
|
+
#
|
10
|
+
# # change all <span> tags to <div> tags
|
11
|
+
# span2div = Loofah::Scrubber.new do |node|
|
12
|
+
# node.name = "div" if node.name == "span"
|
13
|
+
# end
|
14
|
+
#
|
15
|
+
# Alternatively, this scrubber could have been implemented as:
|
16
|
+
#
|
17
|
+
# class Span2Div < Loofah::Scrubber
|
18
|
+
# def scrub(node)
|
19
|
+
# node.name = "div" if node.name == "span"
|
20
|
+
# end
|
21
|
+
# end
|
22
|
+
# span2div = Span2Div.new
|
23
|
+
#
|
24
|
+
# This can then be run on a document:
|
25
|
+
#
|
26
|
+
# Loofah.fragment("<span>foo</span><p>bar</p>").scrub!(span2div).to_s
|
27
|
+
# # => "<div>foo</div><p>bar</p>"
|
28
|
+
#
|
29
|
+
# Scrubbers can be run on a document in either a top-down traversal (the
|
30
|
+
# default) or bottom-up. Top-down scrubbers can optionally return
|
31
|
+
# Scrubber::STOP to terminate the traversal of a subtree.
|
32
|
+
#
|
33
|
+
class Scrubber
|
34
|
+
|
35
|
+
# Top-down Scrubbers may return CONTINUE to indicate that the subtree should be traversed.
|
36
|
+
CONTINUE = Object.new.freeze
|
37
|
+
|
38
|
+
# Top-down Scrubbers may return STOP to indicate that the subtree should not be traversed.
|
39
|
+
STOP = Object.new.freeze
|
40
|
+
|
41
|
+
# When a scrubber is initialized, the :direction may be specified
|
42
|
+
# as :top_down (the default) or :bottom_up.
|
43
|
+
attr_reader :direction
|
44
|
+
|
45
|
+
# When a scrubber is initialized, the optional block is saved as
|
46
|
+
# :block. Note that, if no block is passed, then the +scrub+
|
47
|
+
# method is assumed to have been implemented.
|
48
|
+
attr_reader :block
|
49
|
+
|
50
|
+
#
|
51
|
+
# Options may include
|
52
|
+
# :direction => :top_down (the default)
|
53
|
+
# or
|
54
|
+
# :direction => :bottom_up
|
55
|
+
#
|
56
|
+
# For top_down traversals, if the block returns
|
57
|
+
# Loofah::Scrubber::STOP, then the traversal will be terminated
|
58
|
+
# for the current node's subtree.
|
59
|
+
#
|
60
|
+
# Alternatively, a Scrubber may inherit from Loofah::Scrubber,
|
61
|
+
# and implement +scrub+, which is slightly faster than using a
|
62
|
+
# block.
|
63
|
+
#
|
64
|
+
# Stores the traversal direction and the optional block.
#
# options[:direction] defaults to :top_down when missing (or falsy) and
# must be :top_down or :bottom_up; any other value raises ArgumentError.
def initialize(options = {}, &block)
  chosen = options[:direction]
  chosen = :top_down unless chosen

  if ![:top_down, :bottom_up].include?(chosen)
    raise ArgumentError, "direction #{chosen} must be one of :top_down or :bottom_up"
  end

  @direction = chosen
  @block = block
end
|
71
|
+
|
72
|
+
#
|
73
|
+
# Calling +traverse+ will cause the document to be traversed by
|
74
|
+
# either the lambda passed to the initializer or the +scrub+
|
75
|
+
# method, in the direction specified at +new+ time.
|
76
|
+
#
|
77
|
+
# Walks the tree rooted at +node+: bottom-up when the scrubber's direction
# is :bottom_up, top-down for any other direction.
def traverse(node)
  if direction == :bottom_up
    traverse_conditionally_bottom_up(node)
  else
    traverse_conditionally_top_down(node)
  end
end
|
80
|
+
|
81
|
+
#
|
82
|
+
# When +new+ is not passed a block, the class may implement
|
83
|
+
# +scrub+, which will be called for each document node.
|
84
|
+
#
|
85
|
+
# Default implementation: always raises ScrubberNotFound, naming the
# receiver's class. Subclasses that are created without a block are
# expected to override this.
def scrub(node)
  message = "No scrub method has been defined on #{self.class}"
  raise ScrubberNotFound, message
end
|
88
|
+
|
89
|
+
#
|
90
|
+
# If the attribute is not set, add it
|
91
|
+
# If the attribute is set, don't overwrite the existing value
|
92
|
+
#
|
93
|
+
# Adds +value+ to the whitespace-separated token list stored in +attribute+
# on +node+.
#
# A missing attribute is treated as empty. Existing tokens are kept (in
# order, without duplicates) and +value+ is appended only if it is not
# already present. The tokens are written back joined by single spaces.
#
# Leading and trailing whitespace in the existing value is ignored, so a
# value such as " noopener" no longer produces an empty first token and a
# stray leading space in the rewritten attribute.
def append_attribute(node, attribute, value)
  current_value = node.get_attribute(attribute) || ''
  # split(' ') splits on runs of whitespace and ignores leading whitespace,
  # unlike split(/\s+/), which returns "" as the first token in that case
  current_values = current_value.split(' ')
  updated_value = current_values | [value]
  node.set_attribute(attribute, updated_value.join(' '))
end
|
99
|
+
|
100
|
+
private
|
101
|
+
|
102
|
+
# Decides whether +node+ may stay in the document.
#
# Returns Scrubber::CONTINUE for text and CDATA nodes, and for element
# nodes whose name passes HTML5::Scrub.allowed_element? (after running
# HTML5::Scrub.scrub_attributes on them). Returns Scrubber::STOP for
# everything else.
def html5lib_sanitize(node)
  node_type = node.type

  if node_type == Nokogiri::XML::Node::ELEMENT_NODE
    return Scrubber::STOP unless HTML5::Scrub.allowed_element?(node.name)

    HTML5::Scrub.scrub_attributes(node)
    Scrubber::CONTINUE
  elsif node_type == Nokogiri::XML::Node::TEXT_NODE ||
        node_type == Nokogiri::XML::Node::CDATA_SECTION_NODE
    Scrubber::CONTINUE
  else
    Scrubber::STOP
  end
end
|
114
|
+
|
115
|
+
# Visits +node+ first (with the stored block if there is one, otherwise
# with #scrub) and then recurses into its children, unless the visit
# returned STOP, in which case the subtree is skipped.
def traverse_conditionally_top_down(node)
  verdict = block ? block.call(node) : scrub(node)
  return if verdict == STOP

  node.children.each { |child| traverse_conditionally_top_down(child) }
end
|
123
|
+
|
124
|
+
# Recurses into the children of +node+ first, then visits +node+ itself
# with the stored block if there is one, otherwise with #scrub. The result
# of that visit is returned; it does not affect the traversal.
def traverse_conditionally_bottom_up(node)
  node.children.each { |child| traverse_conditionally_bottom_up(child) }
  block ? block.call(node) : scrub(node)
end
|
132
|
+
end
|
133
|
+
end
|
@@ -0,0 +1,297 @@
|
|
1
|
+
module Loofah
|
2
|
+
#
|
3
|
+
# Loofah provides some built-in scrubbers for sanitizing with
|
4
|
+
# HTML5lib's safelist and for accomplishing some common
|
5
|
+
# transformation tasks.
|
6
|
+
#
|
7
|
+
#
|
8
|
+
# === Loofah::Scrubbers::Strip / scrub!(:strip)
|
9
|
+
#
|
10
|
+
# +:strip+ removes unknown/unsafe tags, but leaves behind the pristine contents:
|
11
|
+
#
|
12
|
+
# unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
|
13
|
+
# Loofah.fragment(unsafe_html).scrub!(:strip)
|
14
|
+
# => "ohai! <div>div is safe</div> but foo is <b>not</b>"
|
15
|
+
#
|
16
|
+
#
|
17
|
+
# === Loofah::Scrubbers::Prune / scrub!(:prune)
|
18
|
+
#
|
19
|
+
# +:prune+ removes unknown/unsafe tags and their contents (including their subtrees):
|
20
|
+
#
|
21
|
+
# unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
|
22
|
+
# Loofah.fragment(unsafe_html).scrub!(:prune)
|
23
|
+
# => "ohai! <div>div is safe</div> "
|
24
|
+
#
|
25
|
+
#
|
26
|
+
# === Loofah::Scrubbers::Escape / scrub!(:escape)
|
27
|
+
#
|
28
|
+
# +:escape+ performs HTML entity escaping on the unknown/unsafe tags:
|
29
|
+
#
|
30
|
+
# unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
|
31
|
+
# Loofah.fragment(unsafe_html).scrub!(:escape)
|
32
|
+
# => "ohai! <div>div is safe</div> &lt;foo&gt;but foo is &lt;b&gt;not&lt;/b&gt;&lt;/foo&gt;"
|
33
|
+
#
|
34
|
+
#
|
35
|
+
# === Loofah::Scrubbers::Whitewash / scrub!(:whitewash)
|
36
|
+
#
|
37
|
+
# +:whitewash+ removes all comments, styling and attributes in
|
38
|
+
# addition to doing markup-fixer-uppery and pruning unsafe tags. I
|
39
|
+
# like to call this "whitewashing", since it's like putting a new
|
40
|
+
# layer of paint on top of the HTML input to make it look nice.
|
41
|
+
#
|
42
|
+
# messy_markup = "ohai! <div id='foo' class='bar' style='margin: 10px'>div with attributes</div>"
|
43
|
+
# Loofah.fragment(messy_markup).scrub!(:whitewash)
|
44
|
+
# => "ohai! <div>div with attributes</div>"
|
45
|
+
#
|
46
|
+
# One use case for this scrubber is to clean up HTML that was
|
47
|
+
# cut-and-pasted from Microsoft Word into a WYSIWYG editor or a
|
48
|
+
# rich text editor. Microsoft's software is famous for injecting
|
49
|
+
# all kinds of cruft into its HTML output. Who needs that crap?
|
50
|
+
# Certainly not me.
|
51
|
+
#
|
52
|
+
#
|
53
|
+
# === Loofah::Scrubbers::NoFollow / scrub!(:nofollow)
|
54
|
+
#
|
55
|
+
# +:nofollow+ adds a rel="nofollow" attribute to all links
|
56
|
+
#
|
57
|
+
# link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
|
58
|
+
# Loofah.fragment(link_farmers_markup).scrub!(:nofollow)
|
59
|
+
# => "ohai! <a href='http://www.myswarmysite.com/' rel="nofollow">I like your blog post</a>"
|
60
|
+
#
|
61
|
+
#
|
62
|
+
# === Loofah::Scrubbers::NoOpener / scrub!(:noopener)
|
63
|
+
#
|
64
|
+
# +:noopener+ adds a rel="noopener" attribute to all links
|
65
|
+
#
|
66
|
+
# link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
|
67
|
+
# Loofah.fragment(link_farmers_markup).scrub!(:noopener)
|
68
|
+
# => "ohai! <a href='http://www.myswarmysite.com/' rel="noopener">I like your blog post</a>"
|
69
|
+
#
|
70
|
+
#
|
71
|
+
# === Loofah::Scrubbers::Unprintable / scrub!(:unprintable)
|
72
|
+
#
|
73
|
+
# +:unprintable+ removes unprintable Unicode characters.
|
74
|
+
#
|
75
|
+
# markup = "<p>Some text with an unprintable character at the end\u2028</p>"
|
76
|
+
# Loofah.fragment(markup).scrub!(:unprintable)
|
77
|
+
# => "<p>Some text with an unprintable character at the end</p>"
|
78
|
+
#
|
79
|
+
# You may not be able to see the unprintable character in the above example, but there is a
|
80
|
+
# U+2028 character right before the closing </p> tag. These characters can cause issues if
|
81
|
+
# the content is ever parsed by JavaScript - more information here:
|
82
|
+
#
|
83
|
+
# http://timelessrepo.com/json-isnt-a-javascript-subset
|
84
|
+
#
|
85
|
+
module Scrubbers
|
86
|
+
#
|
87
|
+
# === scrub!(:strip)
|
88
|
+
#
|
89
|
+
# +:strip+ removes unknown/unsafe tags, but leaves behind the pristine contents:
|
90
|
+
#
|
91
|
+
# unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
|
92
|
+
# Loofah.fragment(unsafe_html).scrub!(:strip)
|
93
|
+
# => "ohai! <div>div is safe</div> but foo is <b>not</b>"
|
94
|
+
#
|
95
|
+
class Strip < Scrubber
  # Sets the traversal direction to :bottom_up.
  def initialize
    @direction = :bottom_up
  end

  # Returns CONTINUE when html5lib_sanitize accepts +node+. Otherwise the
  # node's children are moved in front of it and the node is removed. When
  # the only child is a CDATA section, that child's HTML is re-parsed as a
  # fragment, stripped, and inserted as a text node instead.
  def scrub(node)
    verdict = html5lib_sanitize(node)
    return CONTINUE if verdict == CONTINUE

    offspring = node.children
    if offspring.length == 1 && offspring.first.cdata?
      stripped_html = Loofah.fragment(offspring.first.to_html).scrub!(:strip).to_html
      node.before(Nokogiri::XML::Text.new(stripped_html, node.document))
    else
      node.before(offspring)
    end

    node.remove
  end
end
|
111
|
+
|
112
|
+
#
|
113
|
+
# === scrub!(:prune)
|
114
|
+
#
|
115
|
+
# +:prune+ removes unknown/unsafe tags and their contents (including their subtrees):
|
116
|
+
#
|
117
|
+
# unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
|
118
|
+
# Loofah.fragment(unsafe_html).scrub!(:prune)
|
119
|
+
# => "ohai! <div>div is safe</div> "
|
120
|
+
#
|
121
|
+
class Prune < Scrubber
  # Sets the traversal direction to :top_down.
  def initialize
    @direction = :top_down
  end

  # Returns CONTINUE when html5lib_sanitize accepts +node+; otherwise
  # removes the node (children included) and returns STOP.
  def scrub(node)
    if html5lib_sanitize(node) == CONTINUE
      CONTINUE
    else
      node.remove
      STOP
    end
  end
end
|
132
|
+
|
133
|
+
#
|
134
|
+
# === scrub!(:escape)
|
135
|
+
#
|
136
|
+
# +:escape+ performs HTML entity escaping on the unknown/unsafe tags:
|
137
|
+
#
|
138
|
+
# unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
|
139
|
+
# Loofah.fragment(unsafe_html).scrub!(:escape)
|
140
|
+
# => "ohai! <div>div is safe</div> &lt;foo&gt;but foo is &lt;b&gt;not&lt;/b&gt;&lt;/foo&gt;"
|
141
|
+
#
|
142
|
+
class Escape < Scrubber
  # Sets the traversal direction to :top_down.
  def initialize
    @direction = :top_down
  end

  # Returns CONTINUE when html5lib_sanitize accepts +node+. Otherwise a text
  # node holding the node's markup (node.to_s) is inserted right after it,
  # the node itself is removed, and STOP is returned.
  def scrub(node)
    return CONTINUE if html5lib_sanitize(node) == CONTINUE

    markup_as_text = Nokogiri::XML::Text.new(node.to_s, node.document)
    node.add_next_sibling(markup_as_text)
    node.remove
    STOP
  end
end
|
154
|
+
|
155
|
+
#
|
156
|
+
# === scrub!(:whitewash)
|
157
|
+
#
|
158
|
+
# +:whitewash+ removes all comments, styling and attributes in
|
159
|
+
# addition to doing markup-fixer-uppery and pruning unsafe tags. I
|
160
|
+
# like to call this "whitewashing", since it's like putting a new
|
161
|
+
# layer of paint on top of the HTML input to make it look nice.
|
162
|
+
#
|
163
|
+
# messy_markup = "ohai! <div id='foo' class='bar' style='margin: 10px'>div with attributes</div>"
|
164
|
+
# Loofah.fragment(messy_markup).scrub!(:whitewash)
|
165
|
+
# => "ohai! <div>div with attributes</div>"
|
166
|
+
#
|
167
|
+
# One use case for this scrubber is to clean up HTML that was
|
168
|
+
# cut-and-pasted from Microsoft Word into a WYSIWYG editor or a
|
169
|
+
# rich text editor. Microsoft's software is famous for injecting
|
170
|
+
# all kinds of cruft into its HTML output. Who needs that crap?
|
171
|
+
# Certainly not me.
|
172
|
+
#
|
173
|
+
class Whitewash < Scrubber
  # Sets the traversal direction to :top_down.
  def initialize
    @direction = :top_down
  end

  # Text and CDATA nodes are kept (CONTINUE). An allowed element has all of
  # its attributes removed and is kept if it declares no namespaces. Every
  # other node, including an allowed element that does declare namespaces,
  # is removed and STOP is returned.
  def scrub(node)
    node_type = node.type

    if node_type == Nokogiri::XML::Node::ELEMENT_NODE
      if HTML5::Scrub.allowed_element?(node.name)
        node.attributes.each_key { |attr_name| node.remove_attribute(attr_name) }
        return CONTINUE if node.namespaces.empty?
      end
    elsif node_type == Nokogiri::XML::Node::TEXT_NODE ||
          node_type == Nokogiri::XML::Node::CDATA_SECTION_NODE
      return CONTINUE
    end

    node.remove
    STOP
  end
end
|
192
|
+
|
193
|
+
#
|
194
|
+
# === scrub!(:nofollow)
|
195
|
+
#
|
196
|
+
# +:nofollow+ adds a rel="nofollow" attribute to all links
|
197
|
+
#
|
198
|
+
# link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
|
199
|
+
# Loofah.fragment(link_farmers_markup).scrub!(:nofollow)
|
200
|
+
# => "ohai! <a href='http://www.myswarmysite.com/' rel="nofollow">I like your blog post</a>"
|
201
|
+
#
|
202
|
+
class NoFollow < Scrubber
  # Sets the traversal direction to :top_down.
  def initialize
    @direction = :top_down
  end

  # Adds "nofollow" to the rel attribute of <a> elements and returns STOP
  # for them; every other node just gets CONTINUE.
  def scrub(node)
    is_link = node.type == Nokogiri::XML::Node::ELEMENT_NODE && node.name == 'a'
    return CONTINUE unless is_link

    append_attribute(node, 'rel', 'nofollow')
    STOP
  end
end
|
213
|
+
|
214
|
+
#
|
215
|
+
# === scrub!(:noopener)
|
216
|
+
#
|
217
|
+
# +:noopener+ adds a rel="noopener" attribute to all links
|
218
|
+
#
|
219
|
+
# link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
|
220
|
+
# Loofah.fragment(link_farmers_markup).scrub!(:noopener)
|
221
|
+
# => "ohai! <a href='http://www.myswarmysite.com/' rel="noopener">I like your blog post</a>"
|
222
|
+
#
|
223
|
+
class NoOpener < Scrubber
  # Sets the traversal direction to :top_down.
  def initialize
    @direction = :top_down
  end

  # Adds "noopener" to the rel attribute of <a> elements and returns STOP
  # for them; every other node just gets CONTINUE.
  def scrub(node)
    is_link = node.type == Nokogiri::XML::Node::ELEMENT_NODE && node.name == 'a'
    return CONTINUE unless is_link

    append_attribute(node, 'rel', 'noopener')
    STOP
  end
end
|
234
|
+
|
235
|
+
# This class probably isn't useful publicly, but is used for #to_text's current implementation
|
236
|
+
class NewlineBlockElements < Scrubber # :nodoc:
  # Sets the traversal direction to :bottom_up.
  def initialize
    @direction = :bottom_up
  end

  # Nodes whose name is not in Loofah::Elements::BLOCK_LEVEL get CONTINUE.
  # A block-level node is replaced by a text node containing its content
  # wrapped in a leading and a trailing newline.
  def scrub(node)
    return CONTINUE unless Loofah::Elements::BLOCK_LEVEL.include?(node.name)

    padded_text = Nokogiri::XML::Text.new("\n#{node.content}\n", node.document)
    node.add_next_sibling(padded_text)
    node.remove
  end
end
|
247
|
+
|
248
|
+
#
|
249
|
+
# === scrub!(:unprintable)
|
250
|
+
#
|
251
|
+
# +:unprintable+ removes unprintable Unicode characters.
|
252
|
+
#
|
253
|
+
# markup = "<p>Some text with an unprintable character at the end\u2028</p>"
|
254
|
+
# Loofah.fragment(markup).scrub!(:unprintable)
|
255
|
+
# => "<p>Some text with an unprintable character at the end</p>"
|
256
|
+
#
|
257
|
+
# You may not be able to see the unprintable character in the above example, but there is a
|
258
|
+
# U+2028 character right before the closing </p> tag. These characters can cause issues if
|
259
|
+
# the content is ever parsed by JavaScript - more information here:
|
260
|
+
#
|
261
|
+
# http://timelessrepo.com/json-isnt-a-javascript-subset
|
262
|
+
#
|
263
|
+
class Unprintable < Scrubber
  # Sets the traversal direction to :top_down.
  def initialize
    @direction = :top_down
  end

  # Deletes U+2028 and U+2029 from the content of text and CDATA nodes.
  # Always returns CONTINUE.
  def scrub(node)
    textual_types = [Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE]
    node.content = node.content.gsub(/\u2028|\u2029/, '') if textual_types.include?(node.type)
    CONTINUE
  end
end
|
275
|
+
|
276
|
+
#
|
277
|
+
# A hash that maps a symbol (like +:prune+) to the appropriate Scrubber (Loofah::Scrubbers::Prune).
|
278
|
+
#
|
279
|
+
MAP = {
  :escape                 => Escape,
  :prune                  => Prune,
  :whitewash              => Whitewash,
  :strip                  => Strip,
  :nofollow               => NoFollow,
  :noopener               => NoOpener,
  :newline_block_elements => NewlineBlockElements,
  :unprintable            => Unprintable
}
|
289
|
+
|
290
|
+
#
|
291
|
+
# Returns an array of symbols representing the built-in scrubbers
|
292
|
+
#
|
293
|
+
# Lists the keys of MAP, in MAP's own order.
def self.scrubber_symbols
  MAP.each_key.to_a
end
|
296
|
+
end
|
297
|
+
end
|