loofah 2.3.1

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of loofah might be problematic. Click here for more details.

Files changed (43) hide show
  1. checksums.yaml +7 -0
  2. data/.gemtest +0 -0
  3. data/CHANGELOG.md +336 -0
  4. data/Gemfile +22 -0
  5. data/MIT-LICENSE.txt +23 -0
  6. data/Manifest.txt +41 -0
  7. data/README.md +363 -0
  8. data/Rakefile +81 -0
  9. data/SECURITY.md +18 -0
  10. data/benchmark/benchmark.rb +149 -0
  11. data/benchmark/fragment.html +96 -0
  12. data/benchmark/helper.rb +73 -0
  13. data/benchmark/www.slashdot.com.html +2560 -0
  14. data/lib/loofah.rb +83 -0
  15. data/lib/loofah/elements.rb +92 -0
  16. data/lib/loofah/helpers.rb +103 -0
  17. data/lib/loofah/html/document.rb +18 -0
  18. data/lib/loofah/html/document_fragment.rb +40 -0
  19. data/lib/loofah/html5/libxml2_workarounds.rb +26 -0
  20. data/lib/loofah/html5/safelist.rb +796 -0
  21. data/lib/loofah/html5/scrub.rb +133 -0
  22. data/lib/loofah/instance_methods.rb +127 -0
  23. data/lib/loofah/metahelpers.rb +13 -0
  24. data/lib/loofah/scrubber.rb +133 -0
  25. data/lib/loofah/scrubbers.rb +297 -0
  26. data/lib/loofah/xml/document.rb +13 -0
  27. data/lib/loofah/xml/document_fragment.rb +23 -0
  28. data/test/assets/msword.html +63 -0
  29. data/test/assets/testdata_sanitizer_tests1.dat +502 -0
  30. data/test/helper.rb +18 -0
  31. data/test/html5/test_sanitizer.rb +401 -0
  32. data/test/html5/test_scrub.rb +10 -0
  33. data/test/integration/test_ad_hoc.rb +220 -0
  34. data/test/integration/test_helpers.rb +43 -0
  35. data/test/integration/test_html.rb +72 -0
  36. data/test/integration/test_scrubbers.rb +400 -0
  37. data/test/integration/test_xml.rb +55 -0
  38. data/test/unit/test_api.rb +142 -0
  39. data/test/unit/test_encoding.rb +20 -0
  40. data/test/unit/test_helpers.rb +62 -0
  41. data/test/unit/test_scrubber.rb +229 -0
  42. data/test/unit/test_scrubbers.rb +14 -0
  43. metadata +287 -0
require 'cgi'
require 'crass'

module Loofah
  module HTML5 # :nodoc:
    #
    # Safelist-based scrubbing helpers. Checks elements, attributes and
    # CSS against the sets defined in Loofah::HTML5::SafeList.
    #
    module Scrub

      # Matches backtick, C0 control characters and space, DEL, and
      # \u0080-\u0101. These are stripped from URI attribute values
      # before the protocol check so they cannot be used to disguise a
      # disallowed scheme.
      CONTROL_CHARACTERS = /[`\u0000-\u0020\u007f\u0080-\u0101]/
      # Matches values that look like harmless CSS terms: hex colors,
      # rgb() colors, and short numbers with an optional unit suffix.
      CSS_KEYWORDISH = /\A(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,3}\.?\d{0,10}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
      # A pre-built Crass AST node for a ";" declaration terminator,
      # appended after each property kept by #scrub_css.
      CRASS_SEMICOLON = {:node => :semicolon, :raw => ";"}

      class << self

        # Returns true if +element_name+ is on the safelist of elements
        # allowed to remain in the document.
        def allowed_element? element_name
          ::Loofah::HTML5::SafeList::ALLOWED_ELEMENTS_WITH_LIBXML2.include? element_name
        end

        # alternative implementation of the html5lib attribute scrubbing algorithm
        #
        # Mutates +node+ in place: removes attributes that are not on
        # the safelist, sanitizes URI-valued and SVG-reference-valued
        # attributes, scrubs the style attribute, drops whitespace-only
        # attribute values, and applies the libxml2 escaping workaround.
        def scrub_attributes node
          node.attribute_nodes.each do |attr_node|
            # reconstruct the qualified name ("prefix:name") for
            # namespaced attributes such as xlink:href
            attr_name = if attr_node.namespace
                          "#{attr_node.namespace.prefix}:#{attr_node.node_name}"
                        else
                          attr_node.node_name
                        end

            # data-* attributes are always allowed through
            if attr_name =~ /\Adata-[\w-]+\z/
              next
            end

            unless SafeList::ALLOWED_ATTRIBUTES.include?(attr_name)
              attr_node.remove
              next
            end

            if SafeList::ATTR_VAL_IS_URI.include?(attr_name)
              # this block lifted nearly verbatim from HTML5 sanitization
              val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS,'').downcase
              if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && ! SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
                # the value has a scheme, and the scheme is not allowed
                attr_node.remove
                next
              elsif val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0] == 'data'
                # permit only allowed data mediatypes
                mediatype = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[1]
                mediatype, _ = mediatype.split(';')[0..1] if mediatype
                if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
                  attr_node.remove
                  next
                end
              end
            end
            if SafeList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
              # blank out url(...) references except local fragment
              # references (those starting with "#")
              attr_node.value = attr_node.value.gsub(/url\s*\(\s*[^#\s][^)]+?\)/m, ' ') if attr_node.value
            end
            if SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) && attr_name == 'xlink:href' && attr_node.value =~ /^\s*[^#\s].*/m
              # only local (fragment) xlink:href values are allowed
              attr_node.remove
              next
            end
          end

          scrub_css_attribute node

          # drop attributes whose (possibly just-scrubbed) value is
          # empty or whitespace-only
          node.attribute_nodes.each do |attr_node|
            node.remove_attribute(attr_node.name) if attr_node.value !~ /[^[:space:]]/
          end

          force_correct_attribute_escaping! node
        end

        # Replaces the style attribute's value, if present, with a
        # sanitized version (see #scrub_css).
        def scrub_css_attribute node
          style = node.attributes['style']
          style.value = scrub_css(style.value) if style
        end

        # Parses +style+ as CSS declarations with Crass and returns a
        # string containing only safelisted properties whose values are
        # free of url()/disallowed-function tokens.
        def scrub_css style
          style_tree = Crass.parse_properties style
          sanitized_tree = []

          style_tree.each do |node|
            next unless node[:node] == :property
            # reject any declaration containing a url()/bad-url token or
            # a function call that is not explicitly allowed
            next if node[:children].any? do |child|
              [:url, :bad_url].include?(child[:node]) || (child[:node] == :function && !SafeList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase))
            end
            name = node[:name].downcase
            if SafeList::ALLOWED_CSS_PROPERTIES.include?(name) || SafeList::ALLOWED_SVG_PROPERTIES.include?(name)
              sanitized_tree << node << CRASS_SEMICOLON
            elsif SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split('-').first)
              # shorthand property: keep only keywords that are allowed
              # or that look like innocuous values (colors/lengths)
              value = node[:value].split.map do |keyword|
                if SafeList::ALLOWED_CSS_KEYWORDS.include?(keyword) || keyword =~ CSS_KEYWORDISH
                  keyword
                end
              end.compact
              unless value.empty?
                # re-parse the filtered declaration to get a clean node
                propstring = sprintf "%s:%s", name, value.join(" ")
                sanitized_node = Crass.parse_properties(propstring).first
                sanitized_tree << sanitized_node << CRASS_SEMICOLON
              end
            end
          end

          Crass::Parser.stringify sanitized_tree
        end

        #
        # libxml2 >= 2.9.2 fails to escape comments within some attributes.
        #
        # see comments about CVE-2018-8048 within the tests for more information
        #
        def force_correct_attribute_escaping! node
          return unless Nokogiri::VersionInfo.instance.libxml2?

          node.attribute_nodes.each do |attr_node|
            next unless LibxmlWorkarounds::BROKEN_ESCAPING_ATTRIBUTES.include?(attr_node.name)

            # some affected attributes only need the workaround on a
            # specific tag; nil means "applies to any tag"
            tag_name = LibxmlWorkarounds::BROKEN_ESCAPING_ATTRIBUTES_QUALIFYING_TAG[attr_node.name]
            next unless tag_name.nil? || tag_name == node.name

            #
            # this block is just like CGI.escape in Ruby 2.4, but
            # only encodes space and double-quote, to mimic
            # pre-2.9.2 behavior
            #
            encoding = attr_node.value.encoding
            attr_node.value = attr_node.value.gsub(/[ "]/) do |m|
              '%' + m.unpack('H2' * m.bytesize).join('%').upcase
            end.force_encoding(encoding)
          end
        end

      end
    end
  end
end
module Loofah
  #
  # Mixes +scrub!+ into Document, DocumentFragment, Node and NodeSet.
  #
  # The +scrubber+ argument must be either a symbol naming one of the
  # built-in scrubbers (see Scrubbers) or a Scrubber instance:
  #
  #   span2div = Loofah::Scrubber.new do |node|
  #     node.name = "div" if node.name == "span"
  #   end
  #   Loofah.fragment("<span>foo</span><p>bar</p>").scrub!(span2div).to_s
  #   # => "<div>foo</div><p>bar</p>"
  #
  # or
  #
  #   unsafe_html = "ohai! <div>div is safe</div> <script>but script is not</script>"
  #   Loofah.fragment(unsafe_html).scrub!(:strip).to_s
  #   # => "ohai! <div>div is safe</div> "
  #
  # Note that this method is called implicitly by Loofah.scrub_fragment
  # and Loofah.scrub_document. See Scrubber for details on traversal.
  #
  module ScrubBehavior
    module Node # :nodoc:
      # Scrub this node (or document, or fragment) in place and return self.
      #
      # A single method handles all three receiver types because
      # nokogiri decorates (or not) based on whether the module has
      # already been included, and documents get decorated just like
      # their constituent nodes — so all the logic lives in one module.
      def scrub!(scrubber)
        scrubber = ScrubBehavior.resolve_scrubber(scrubber)
        if is_a?(Nokogiri::XML::Document)
          # a document's traversal starts at its root element, if any
          scrubber.traverse(root) if root
        elsif is_a?(Nokogiri::XML::DocumentFragment)
          # a fragment has no single root; scrub each top-level child
          children.scrub!(scrubber)
        else
          scrubber.traverse(self)
        end
        self
      end
    end

    module NodeSet # :nodoc:
      # Scrub every node in the set; returns the set itself.
      def scrub!(scrubber)
        each { |member| member.scrub!(scrubber) }
        self
      end
    end

    # Map a symbol to an instance of its built-in scrubber class, or
    # pass a Scrubber instance through unchanged; anything else raises
    # Loofah::ScrubberNotFound.
    def ScrubBehavior.resolve_scrubber(scrubber) # :nodoc:
      builtin = Scrubbers::MAP[scrubber]
      scrubber = builtin.new if builtin
      return scrubber if scrubber.is_a?(Loofah::Scrubber)
      raise Loofah::ScrubberNotFound, "not a Scrubber or a scrubber name: #{scrubber.inspect}"
    end
  end

  #
  # Overrides +text+ in HTML::Document and HTML::DocumentFragment,
  # and mixes in +to_text+.
  #
  module TextBehavior
    #
    # Returns a plain-text version of the markup contained by the
    # document, with HTML entities encoded.
    #
    # Significantly faster than #to_text, but not clever about
    # whitespace around block elements:
    #
    #   Loofah.document("<h1>Title</h1><div>Content</div>").text
    #   # => "TitleContent"
    #
    # HTML entities are escaped by default. Pass
    # +:encode_special_chars => false+ to get unescaped entities — with
    # the understanding that the result is unsafe to render in a browser:
    #
    #   frag = Loofah.fragment("&lt;script&gt;alert('EVIL');&lt;/script&gt;")
    #   # ok for browser:
    #   frag.text # => "&lt;script&gt;alert('EVIL');&lt;/script&gt;"
    #   # decidedly not ok for browser:
    #   frag.text(:encode_special_chars => false) # => "<script>alert('EVIL');</script>"
    #
    def text(options={})
      plain = begin
        serialize_root.children.inner_text
      rescue
        # no serializable root — treat as empty text
        ""
      end
      return plain if options[:encode_special_chars] == false # possibly dangerous if rendered in a browser
      encode_special_chars plain
    end
    alias :inner_text :text
    alias :to_str :text

    #
    # Returns a plain-text version of the markup contained by the
    # fragment, with HTML entities encoded.
    #
    # Slower than #text, but clever about whitespace around block
    # elements:
    #
    #   Loofah.document("<h1>Title</h1><div>Content</div>").to_text
    #   # => "\nTitle\n\nContent\n"
    #
    def to_text(options={})
      newline_scrubbed = dup.scrub!(:newline_block_elements)
      Loofah.remove_extraneous_whitespace newline_scrubbed.text(options)
    end
  end

  module DocumentDecorator # :nodoc:
    # Register the scrub decorators so every node and node set created
    # by this document responds to +scrub!+.
    def initialize(*args, &block)
      super
      decorators(Nokogiri::XML::Node) << ScrubBehavior::Node
      decorators(Nokogiri::XML::NodeSet) << ScrubBehavior::NodeSet
    end
  end
end
module Loofah
  # Internal helpers used while building safelist constants.
  module MetaHelpers # :nodoc:
    # For every Set constant defined on +mod+, add a lowercased copy of
    # each member to that same set (members must respond to +downcase+).
    # Mutates the sets in place; non-Set constants are left untouched.
    def self.add_downcased_set_members_to_all_set_constants mod
      mod.constants.each do |const_name|
        value = mod.const_get(const_name)
        next unless value.is_a?(Set)
        # union the set with its own downcased members
        value.merge(value.map(&:downcase))
      end
    end
  end
end
module Loofah
  #
  # A RuntimeError raised when Loofah could not find an appropriate scrubber.
  #
  class ScrubberNotFound < RuntimeError ; end

  #
  # A Scrubber wraps up a callback that is run against each HTML node
  # (element) in a document. The callback may be supplied as a block:
  #
  #   # change all <span> tags to <div> tags
  #   span2div = Loofah::Scrubber.new do |node|
  #     node.name = "div" if node.name == "span"
  #   end
  #
  # or by subclassing and implementing +scrub+:
  #
  #   class Span2Div < Loofah::Scrubber
  #     def scrub(node)
  #       node.name = "div" if node.name == "span"
  #     end
  #   end
  #   span2div = Span2Div.new
  #
  # Either form can then be run on a document:
  #
  #   Loofah.fragment("<span>foo</span><p>bar</p>").scrub!(span2div).to_s
  #   # => "<div>foo</div><p>bar</p>"
  #
  # Traversal is top-down by default, or bottom-up on request. A
  # top-down scrubber may return Scrubber::STOP to terminate traversal
  # of the current node's subtree.
  #
  class Scrubber

    # Top-down Scrubbers may return CONTINUE to indicate that the subtree should be traversed.
    CONTINUE = Object.new.freeze

    # Top-down Scrubbers may return STOP to indicate that the subtree should not be traversed.
    STOP = Object.new.freeze

    # The traversal direction chosen at initialization time,
    # :top_down (the default) or :bottom_up.
    attr_reader :direction

    # The optional block saved at initialization time. When no block is
    # given, the +scrub+ method is assumed to have been implemented.
    attr_reader :block

    #
    # Options may include
    #   :direction => :top_down (the default)
    # or
    #   :direction => :bottom_up
    #
    # For top_down traversals, if the callback returns
    # Loofah::Scrubber::STOP, traversal of the current node's subtree
    # is terminated.
    #
    # Alternatively, subclass Loofah::Scrubber and implement +scrub+,
    # which is slightly faster than using a block.
    #
    def initialize(options = {}, &block)
      chosen = options[:direction] || :top_down
      unless [:top_down, :bottom_up].include?(chosen)
        raise ArgumentError, "direction #{chosen} must be one of :top_down or :bottom_up"
      end
      @direction = chosen
      @block = block
    end

    #
    # Traverse +node+ and its subtree, invoking either the block passed
    # to the initializer or the +scrub+ method on each node, in the
    # direction specified at +new+ time.
    #
    def traverse(node)
      if direction == :bottom_up
        traverse_conditionally_bottom_up(node)
      else
        traverse_conditionally_top_down(node)
      end
    end

    #
    # Fallback callback, invoked per node when +new+ was not passed a
    # block. Subclasses are expected to override this.
    #
    def scrub(node)
      raise ScrubberNotFound, "No scrub method has been defined on #{self.class}"
    end

    #
    # Add +value+ to the space-separated token list stored in
    # +attribute+ on +node+. Existing tokens are preserved, and +value+
    # is not duplicated if already present.
    #
    def append_attribute(node, attribute, value)
      existing = node.get_attribute(attribute) || ''
      tokens = existing.split(/\s+/) | [value] # array union keeps each token once
      node.set_attribute(attribute, tokens.join(' '))
    end

    private

    # Shared safelist check: text and CDATA always pass; safelisted
    # elements get their attributes scrubbed and pass; anything else
    # signals STOP so the caller can remove/escape it.
    def html5lib_sanitize(node)
      case node.type
      when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE
        return Scrubber::CONTINUE
      when Nokogiri::XML::Node::ELEMENT_NODE
        if HTML5::Scrub.allowed_element? node.name
          HTML5::Scrub.scrub_attributes node
          return Scrubber::CONTINUE
        end
      end
      Scrubber::STOP
    end

    # Visit the node first; recurse into its children unless the
    # callback returned STOP.
    def traverse_conditionally_top_down(node)
      outcome = block ? block.call(node) : scrub(node)
      return if outcome == STOP
      node.children.each { |child| traverse_conditionally_top_down(child) }
    end

    # Recurse into the children first, then visit the node; the
    # callback's return value is ignored (STOP has no meaning here).
    def traverse_conditionally_bottom_up(node)
      node.children.each { |child| traverse_conditionally_bottom_up(child) }
      block ? block.call(node) : scrub(node)
    end
  end
end
module Loofah
  #
  # Loofah provides some built-in scrubbers for sanitizing with
  # HTML5lib's safelist and for accomplishing some common
  # transformation tasks.
  #
  #
  # === Loofah::Scrubbers::Strip / scrub!(:strip)
  #
  # +:strip+ removes unknown/unsafe tags, but leaves behind the pristine contents:
  #
  #    unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
  #    Loofah.fragment(unsafe_html).scrub!(:strip)
  #    => "ohai! <div>div is safe</div> but foo is <b>not</b>"
  #
  #
  # === Loofah::Scrubbers::Prune / scrub!(:prune)
  #
  # +:prune+ removes unknown/unsafe tags and their contents (including their subtrees):
  #
  #    unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
  #    Loofah.fragment(unsafe_html).scrub!(:prune)
  #    => "ohai! <div>div is safe</div> "
  #
  #
  # === Loofah::Scrubbers::Escape / scrub!(:escape)
  #
  # +:escape+ performs HTML entity escaping on the unknown/unsafe tags:
  #
  #    unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
  #    Loofah.fragment(unsafe_html).scrub!(:escape)
  #    => "ohai! <div>div is safe</div> &lt;foo&gt;but foo is &lt;b&gt;not&lt;/b&gt;&lt;/foo&gt;"
  #
  #
  # === Loofah::Scrubbers::Whitewash / scrub!(:whitewash)
  #
  # +:whitewash+ removes all comments, styling and attributes in
  # addition to doing markup-fixer-uppery and pruning unsafe tags. I
  # like to call this "whitewashing", since it's like putting a new
  # layer of paint on top of the HTML input to make it look nice.
  #
  #    messy_markup = "ohai! <div id='foo' class='bar' style='margin: 10px'>div with attributes</div>"
  #    Loofah.fragment(messy_markup).scrub!(:whitewash)
  #    => "ohai! <div>div with attributes</div>"
  #
  # One use case for this scrubber is to clean up HTML that was
  # cut-and-pasted from Microsoft Word into a WYSIWYG editor or a
  # rich text editor. Microsoft's software is famous for injecting
  # all kinds of cruft into its HTML output. Who needs that crap?
  # Certainly not me.
  #
  #
  # === Loofah::Scrubbers::NoFollow / scrub!(:nofollow)
  #
  # +:nofollow+ adds a rel="nofollow" attribute to all links
  #
  #    link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
  #    Loofah.fragment(link_farmers_markup).scrub!(:nofollow)
  #    => "ohai! <a href='http://www.myswarmysite.com/' rel="nofollow">I like your blog post</a>"
  #
  #
  # === Loofah::Scrubbers::NoOpener / scrub!(:noopener)
  #
  # +:noopener+ adds a rel="noopener" attribute to all links
  #
  #    link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
  #    Loofah.fragment(link_farmers_markup).scrub!(:noopener)
  #    => "ohai! <a href='http://www.myswarmysite.com/' rel="noopener">I like your blog post</a>"
  #
  #
  # === Loofah::Scrubbers::Unprintable / scrub!(:unprintable)
  #
  # +:unprintable+ removes unprintable Unicode characters.
  #
  #    markup = "<p>Some text with an unprintable character at the end\u2028</p>"
  #    Loofah.fragment(markup).scrub!(:unprintable)
  #    => "<p>Some text with an unprintable character at the end</p>"
  #
  # You may not be able to see the unprintable character in the above example, but there is a
  # U+2028 character right before the closing </p> tag. These characters can cause issues if
  # the content is ever parsed by JavaScript - more information here:
  #
  #    http://timelessrepo.com/json-isnt-a-javascript-subset
  #
  module Scrubbers
    #
    # === scrub!(:strip)
    #
    # +:strip+ removes unknown/unsafe tags, but leaves behind the pristine contents:
    #
    #    unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
    #    Loofah.fragment(unsafe_html).scrub!(:strip)
    #    => "ohai! <div>div is safe</div> but foo is <b>not</b>"
    #
    class Strip < Scrubber
      def initialize
        # note: sets @direction directly rather than calling super, so
        # no block is stored and no direction validation occurs
        @direction = :bottom_up
      end

      def scrub(node)
        return CONTINUE if html5lib_sanitize(node) == CONTINUE
        # unsafe node: hoist its contents into its place, then drop it.
        # A lone CDATA child is re-parsed and re-scrubbed as markup so
        # its raw text cannot reintroduce unsanitized tags.
        if node.children.length == 1 && node.children.first.cdata?
          sanitized_text = Loofah.fragment(node.children.first.to_html).scrub!(:strip).to_html
          node.before Nokogiri::XML::Text.new(sanitized_text, node.document)
        else
          node.before node.children
        end
        node.remove
      end
    end

    #
    # === scrub!(:prune)
    #
    # +:prune+ removes unknown/unsafe tags and their contents (including their subtrees):
    #
    #    unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
    #    Loofah.fragment(unsafe_html).scrub!(:prune)
    #    => "ohai! <div>div is safe</div> "
    #
    class Prune < Scrubber
      def initialize
        # note: sets @direction directly rather than calling super
        @direction = :top_down
      end

      def scrub(node)
        return CONTINUE if html5lib_sanitize(node) == CONTINUE
        # unsafe node: remove it along with its entire subtree
        node.remove
        return STOP
      end
    end

    #
    # === scrub!(:escape)
    #
    # +:escape+ performs HTML entity escaping on the unknown/unsafe tags:
    #
    #    unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
    #    Loofah.fragment(unsafe_html).scrub!(:escape)
    #    => "ohai! <div>div is safe</div> &lt;foo&gt;but foo is &lt;b&gt;not&lt;/b&gt;&lt;/foo&gt;"
    #
    class Escape < Scrubber
      def initialize
        # note: sets @direction directly rather than calling super
        @direction = :top_down
      end

      def scrub(node)
        return CONTINUE if html5lib_sanitize(node) == CONTINUE
        # unsafe node: replace it with a text node of its serialized
        # markup (serialization of a text node entity-escapes it)
        node.add_next_sibling Nokogiri::XML::Text.new(node.to_s, node.document)
        node.remove
        return STOP
      end
    end

    #
    # === scrub!(:whitewash)
    #
    # +:whitewash+ removes all comments, styling and attributes in
    # addition to doing markup-fixer-uppery and pruning unsafe tags. I
    # like to call this "whitewashing", since it's like putting a new
    # layer of paint on top of the HTML input to make it look nice.
    #
    #    messy_markup = "ohai! <div id='foo' class='bar' style='margin: 10px'>div with attributes</div>"
    #    Loofah.fragment(messy_markup).scrub!(:whitewash)
    #    => "ohai! <div>div with attributes</div>"
    #
    # One use case for this scrubber is to clean up HTML that was
    # cut-and-pasted from Microsoft Word into a WYSIWYG editor or a
    # rich text editor. Microsoft's software is famous for injecting
    # all kinds of cruft into its HTML output. Who needs that crap?
    # Certainly not me.
    #
    class Whitewash < Scrubber
      def initialize
        # note: sets @direction directly rather than calling super
        @direction = :top_down
      end

      def scrub(node)
        case node.type
        when Nokogiri::XML::Node::ELEMENT_NODE
          if HTML5::Scrub.allowed_element? node.name
            # strip every attribute; keep the element only when it
            # carries no namespace declarations
            node.attributes.each { |attr| node.remove_attribute(attr.first) }
            return CONTINUE if node.namespaces.empty?
          end
        when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE
          return CONTINUE
        end
        # everything else (comments, unsafe/namespaced elements, etc.)
        # is removed along with its subtree
        node.remove
        STOP
      end
    end

    #
    # === scrub!(:nofollow)
    #
    # +:nofollow+ adds a rel="nofollow" attribute to all links
    #
    #    link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
    #    Loofah.fragment(link_farmers_markup).scrub!(:nofollow)
    #    => "ohai! <a href='http://www.myswarmysite.com/' rel="nofollow">I like your blog post</a>"
    #
    class NoFollow < Scrubber
      def initialize
        # note: sets @direction directly rather than calling super
        @direction = :top_down
      end

      def scrub(node)
        return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == 'a')
        append_attribute(node, 'rel', 'nofollow')
        # returning STOP means the anchor's subtree is not descended into
        return STOP
      end
    end

    #
    # === scrub!(:noopener)
    #
    # +:noopener+ adds a rel="noopener" attribute to all links
    #
    #    link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
    #    Loofah.fragment(link_farmers_markup).scrub!(:noopener)
    #    => "ohai! <a href='http://www.myswarmysite.com/' rel="noopener">I like your blog post</a>"
    #
    class NoOpener < Scrubber
      def initialize
        # note: sets @direction directly rather than calling super
        @direction = :top_down
      end

      def scrub(node)
        return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == 'a')
        append_attribute(node, 'rel', 'noopener')
        # returning STOP means the anchor's subtree is not descended into
        return STOP
      end
    end

    # This class probably isn't useful publicly, but is used for #to_text's current implemention
    class NewlineBlockElements < Scrubber # :nodoc:
      def initialize
        # note: sets @direction directly rather than calling super
        @direction = :bottom_up
      end

      def scrub(node)
        return CONTINUE unless Loofah::Elements::BLOCK_LEVEL.include?(node.name)
        # replace each block-level element with its text content
        # wrapped in newlines, so #text preserves block boundaries
        node.add_next_sibling Nokogiri::XML::Text.new("\n#{node.content}\n", node.document)
        node.remove
      end
    end

    #
    # === scrub!(:unprintable)
    #
    # +:unprintable+ removes unprintable Unicode characters.
    #
    #    markup = "<p>Some text with an unprintable character at the end\u2028</p>"
    #    Loofah.fragment(markup).scrub!(:unprintable)
    #    => "<p>Some text with an unprintable character at the end</p>"
    #
    # You may not be able to see the unprintable character in the above example, but there is a
    # U+2028 character right before the closing </p> tag. These characters can cause issues if
    # the content is ever parsed by JavaScript - more information here:
    #
    #    http://timelessrepo.com/json-isnt-a-javascript-subset
    #
    class Unprintable < Scrubber
      def initialize
        # note: sets @direction directly rather than calling super
        @direction = :top_down
      end

      def scrub(node)
        # strip U+2028 (line separator) and U+2029 (paragraph
        # separator) from text and CDATA content
        if node.type == Nokogiri::XML::Node::TEXT_NODE || node.type == Nokogiri::XML::Node::CDATA_SECTION_NODE
          node.content = node.content.gsub(/\u2028|\u2029/, '')
        end
        CONTINUE
      end
    end

    #
    # A hash that maps a symbol (like +:prune+) to the appropriate Scrubber (Loofah::Scrubbers::Prune).
    #
    MAP = {
      :escape => Escape,
      :prune => Prune,
      :whitewash => Whitewash,
      :strip => Strip,
      :nofollow => NoFollow,
      :noopener => NoOpener,
      :newline_block_elements => NewlineBlockElements,
      :unprintable => Unprintable
    }

    #
    # Returns an array of symbols representing the built-in scrubbers
    #
    def self.scrubber_symbols
      MAP.keys
    end
  end
end