loofah 1.0.0 → 2.19.1

Files changed (42)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +489 -0
  3. data/MIT-LICENSE.txt +3 -1
  4. data/README.md +364 -0
  5. data/SECURITY.md +18 -0
  6. data/lib/loofah/elements.rb +88 -11
  7. data/lib/loofah/helpers.rb +76 -2
  8. data/lib/loofah/html/document.rb +1 -0
  9. data/lib/loofah/html/document_fragment.rb +9 -2
  10. data/lib/loofah/html5/libxml2_workarounds.rb +27 -0
  11. data/lib/loofah/html5/safelist.rb +1042 -0
  12. data/lib/loofah/html5/scrub.rb +198 -40
  13. data/lib/loofah/instance_methods.rb +16 -10
  14. data/lib/loofah/metahelpers.rb +9 -10
  15. data/lib/loofah/scrubber.rb +22 -6
  16. data/lib/loofah/scrubbers.rb +96 -16
  17. data/lib/loofah/version.rb +5 -0
  18. data/lib/loofah/xml/document.rb +1 -0
  19. data/lib/loofah/xml/document_fragment.rb +5 -2
  20. data/lib/loofah.rb +38 -25
  21. metadata +159 -172
  22. data/CHANGELOG.rdoc +0 -134
  23. data/Gemfile +0 -1
  24. data/Manifest.txt +0 -34
  25. data/README.rdoc +0 -312
  26. data/Rakefile +0 -53
  27. data/benchmark/benchmark.rb +0 -149
  28. data/benchmark/fragment.html +0 -96
  29. data/benchmark/helper.rb +0 -73
  30. data/benchmark/www.slashdot.com.html +0 -2560
  31. data/lib/loofah/html5/whitelist.rb +0 -168
  32. data/test/helper.rb +0 -7
  33. data/test/html5/test_sanitizer.rb +0 -248
  34. data/test/integration/test_ad_hoc.rb +0 -176
  35. data/test/integration/test_helpers.rb +0 -33
  36. data/test/integration/test_html.rb +0 -51
  37. data/test/integration/test_scrubbers.rb +0 -331
  38. data/test/integration/test_xml.rb +0 -55
  39. data/test/unit/test_api.rb +0 -138
  40. data/test/unit/test_helpers.rb +0 -27
  41. data/test/unit/test_scrubber.rb +0 -229
  42. data/test/unit/test_scrubbers.rb +0 -14
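If you manage loofah directly with Bundler, one way to pull in this release is a pessimistic version constraint; this Gemfile sketch is a suggestion, not something mandated by the diff below:

    # Gemfile
    source "https://rubygems.org"

    gem "loofah", "~> 2.19", ">= 2.19.1"
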
data/lib/loofah/html5/scrub.rb
@@ -1,70 +1,228 @@
-require 'cgi'
+# frozen_string_literal: true
+require "cgi"
+require "crass"
 
 module Loofah
   module HTML5 # :nodoc:
     module Scrub
+      CONTROL_CHARACTERS = /[`\u0000-\u0020\u007f\u0080-\u0101]/
+      CSS_KEYWORDISH = /\A(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,3}\.?\d{0,10}(ch|cm|r?em|ex|in|lh|mm|pc|pt|px|Q|vmax|vmin|vw|vh|%|,|\))?)\z/
+      CRASS_SEMICOLON = { node: :semicolon, raw: ";" }
+      CSS_IMPORTANT = '!important'
+      CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES = /\A(["'])?[^"']+\1\z/
+      DATA_ATTRIBUTE_NAME = /\Adata-[\w-]+\z/
 
       class << self
+        def allowed_element?(element_name)
+          ::Loofah::HTML5::SafeList::ALLOWED_ELEMENTS_WITH_LIBXML2.include?(element_name)
+        end
 
         # alternative implementation of the html5lib attribute scrubbing algorithm
         def scrub_attributes(node)
           node.attribute_nodes.each do |attr_node|
             attr_name = if attr_node.namespace
-                          "#{attr_node.namespace.prefix}:#{attr_node.node_name}"
-                        else
-                          attr_node.node_name
-                        end
-            attr_node.remove unless HashedWhiteList::ALLOWED_ATTRIBUTES[attr_name]
-            if HashedWhiteList::ATTR_VAL_IS_URI[attr_name]
-              # this block lifted nearly verbatim from HTML5 sanitization
-              val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(/`|[\000-\040\177\s]+|\302[\200-\240]/,'').downcase
-              if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and HashedWhiteList::ALLOWED_PROTOCOLS[val_unescaped.split(':')[0]].nil?
-                attr_node.remove
-              end
+              "#{attr_node.namespace.prefix}:#{attr_node.node_name}"
+            else
+              attr_node.node_name
             end
-            if HashedWhiteList::SVG_ATTR_VAL_ALLOWS_REF[attr_name]
-              attr_node.value = attr_node.value.gsub(/url\s*\(\s*[^#\s][^)]+?\)/m, ' ') if attr_node.value
+
+            if attr_name =~ DATA_ATTRIBUTE_NAME
+              next
             end
-            if HashedWhiteList::SVG_ALLOW_LOCAL_HREF[node.name] && attr_name == 'xlink:href' && attr_node.value =~ /^\s*[^#\s].*/m
+
+            unless SafeList::ALLOWED_ATTRIBUTES.include?(attr_name)
               attr_node.remove
+              next
+            end
+
+            if SafeList::ATTR_VAL_IS_URI.include?(attr_name)
+              next if scrub_uri_attribute(attr_node)
+            end
+
+            if SafeList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
+              scrub_attribute_that_allows_local_ref(attr_node)
+            end
+
+            if SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) && attr_name == "xlink:href" && attr_node.value =~ /^\s*[^#\s].*/m
+              attr_node.remove
+              next
             end
           end
-          if node.attributes['style']
-            node['style'] = scrub_css(node.attributes['style'])
+
+          scrub_css_attribute(node)
+
+          node.attribute_nodes.each do |attr_node|
+            if attr_node.value !~ /[^[:space:]]/ && attr_node.name !~ DATA_ATTRIBUTE_NAME
+              node.remove_attribute(attr_node.name)
+            end
           end
+
+          force_correct_attribute_escaping!(node)
+        end
+
+        def scrub_css_attribute(node)
+          style = node.attributes["style"]
+          style.value = scrub_css(style.value) if style
         end
 
-        # lifted nearly verbatim from html5lib
         def scrub_css(style)
-          # disallow urls
-          style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')
-
-          # gauntlet
-          return '' unless style =~ /^([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*$/
-          return '' unless style =~ /^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$/
-
-          clean = []
-          style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
-            next if val.empty?
-            prop.downcase!
-            if HashedWhiteList::ALLOWED_CSS_PROPERTIES[prop]
-              clean << "#{prop}: #{val};"
-            elsif %w[background border margin padding].include?(prop.split('-')[0])
-              clean << "#{prop}: #{val};" unless val.split().any? do |keyword|
-                HashedWhiteList::ALLOWED_CSS_KEYWORDS[keyword].nil? and
-                keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/
+          style_tree = Crass.parse_properties(style)
+          sanitized_tree = []
+
+          style_tree.each do |node|
+            next unless node[:node] == :property
+            next if node[:children].any? do |child|
+              [:url, :bad_url].include?(child[:node])
+            end
+
+            name = node[:name].downcase
+            next unless SafeList::ALLOWED_CSS_PROPERTIES.include?(name) ||
+              SafeList::ALLOWED_SVG_PROPERTIES.include?(name) ||
+              SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first)
+
+            value = node[:children].map do |child|
+              case child[:node]
+              when :whitespace
+                nil
+              when :string
+                if child[:raw] =~ CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES
+                  Crass::Parser.stringify(child)
+                else
+                  nil
+                end
+              when :function
+                if SafeList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase)
+                  Crass::Parser.stringify(child)
+                end
+              when :ident
+                keyword = child[:value]
+                if !SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first) ||
+                  SafeList::ALLOWED_CSS_KEYWORDS.include?(keyword) ||
+                  (keyword =~ CSS_KEYWORDISH)
+                  keyword
+                end
+              else
+                child[:raw]
+              end
+            end.compact
+
+            next if value.empty?
+            value << CSS_IMPORTANT if node[:important]
+            propstring = format("%s:%s", name, value.join(" "))
+            sanitized_node = Crass.parse_properties(propstring).first
+            sanitized_tree << sanitized_node << CRASS_SEMICOLON
+          end
+
+          Crass::Parser.stringify(sanitized_tree)
+        end
+
+        def scrub_attribute_that_allows_local_ref(attr_node)
+          return unless attr_node.value
+
+          nodes = Crass::Parser.new(attr_node.value).parse_component_values
+
+          values = nodes.map do |node|
+            case node[:node]
+            when :url
+              if node[:value].start_with?("#")
+                node[:raw]
+              else
+                nil
               end
-            elsif HashedWhiteList::ALLOWED_SVG_PROPERTIES[prop]
-              clean << "#{prop}: #{val};"
+            when :hash, :ident, :string
+              node[:raw]
+            else
+              nil
             end
+          end.compact
+
+          attr_node.value = values.join(" ")
+        end
+
+        def scrub_uri_attribute(attr_node)
+          # this block lifted nearly verbatim from HTML5 sanitization
+          val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS, "").downcase
+          if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && !SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
+            attr_node.remove
+            return true
+          elsif val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0] == "data"
+            # permit only allowed data mediatypes
+            mediatype = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[1]
+            mediatype, _ = mediatype.split(";")[0..1] if mediatype
+            if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
+              attr_node.remove
+              return true
+            end
+          end
+          false
+        end
+
+        #
+        # libxml2 >= 2.9.2 fails to escape comments within some attributes.
+        #
+        # see comments about CVE-2018-8048 within the tests for more information
+        #
+        def force_correct_attribute_escaping!(node)
+          return unless Nokogiri::VersionInfo.instance.libxml2?
+
+          node.attribute_nodes.each do |attr_node|
+            next unless LibxmlWorkarounds::BROKEN_ESCAPING_ATTRIBUTES.include?(attr_node.name)
+
+            tag_name = LibxmlWorkarounds::BROKEN_ESCAPING_ATTRIBUTES_QUALIFYING_TAG[attr_node.name]
+            next unless tag_name.nil? || tag_name == node.name
+
+            #
+            # this block is just like CGI.escape in Ruby 2.4, but
+            # only encodes space and double-quote, to mimic
+            # pre-2.9.2 behavior
+            #
+            encoding = attr_node.value.encoding
+            attr_node.value = attr_node.value.gsub(/[ "]/) do |m|
+              "%" + m.unpack("H2" * m.bytesize).join("%").upcase
+            end.force_encoding(encoding)
           end
+        end
 
-          style = clean.join(' ')
+        def cdata_needs_escaping?(node)
+          # Nokogiri's HTML4 parser on JRuby doesn't flag the child of a `style` or `script` tag as cdata, but it acts that way
+          node.cdata? || (Nokogiri.jruby? && node.text? && (node.parent.name == "style" || node.parent.name == "script"))
         end
 
-      end
+        def cdata_escape(node)
+          escaped_text = escape_tags(node.text)
+          if Nokogiri.jruby?
+            node.document.create_text_node(escaped_text)
+          else
+            node.document.create_cdata(escaped_text)
+          end
+        end
 
+        TABLE_FOR_ESCAPE_HTML__ = {
+          '<' => '&lt;',
+          '>' => '&gt;',
+          '&' => '&amp;',
+        }
+
+        def escape_tags(string)
+          # modified version of CGI.escapeHTML from ruby 3.1
+          enc = string.encoding
+          unless enc.ascii_compatible?
+            if enc.dummy?
+              origenc = enc
+              enc = Encoding::Converter.asciicompat_encoding(enc)
+              string = enc ? string.encode(enc) : string.b
+            end
+            table = Hash[TABLE_FOR_ESCAPE_HTML__.map {|pair|pair.map {|s|s.encode(enc)}}]
+            string = string.gsub(/#{"[<>&]".encode(enc)}/, table)
+            string.encode!(origenc) if origenc
+            string
+          else
+            string = string.b
+            string.gsub!(/[<>&]/, TABLE_FOR_ESCAPE_HTML__)
+            string.force_encoding(enc)
+          end
+        end
+      end
     end
   end
 end
-
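
The rewritten scrub_css above parses inline styles with Crass and drops any property whose value contains a url()/bad_url token or a non-safelisted keyword or function. A minimal sketch of exercising it through the public API; the exact output strings are illustrative, not quoted from the gem's test suite:

    require "loofah"

    unsafe = %q{<span style="color: red; background: url(http://evil.example/x.png)">hi</span>}
    # :strip runs the html5lib-style sanitization, which calls scrub_attributes
    # and therefore scrub_css on the style attribute
    Loofah.fragment(unsafe).scrub!(:strip).to_s
    # => expected along the lines of: <span style="color:red;">hi</span>

    # the CSS scrubber can also be called directly
    Loofah::HTML5::Scrub.scrub_css("background: url(http://evil.example/x.png); color: red")
    # => "color:red;"
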
data/lib/loofah/instance_methods.rb
@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 module Loofah
   #
   # Mixes +scrub!+ into Document, DocumentFragment, Node and NodeSet.
@@ -41,7 +42,7 @@ module Loofah
       when Nokogiri::XML::Document
         scrubber.traverse(root) if root
       when Nokogiri::XML::DocumentFragment
-        children.each { |node| node.scrub!(scrubber) } # TODO: children.scrub! once Nokogiri 1.4.2 is out
+        children.scrub! scrubber
       else
         scrubber.traverse(self)
       end
@@ -91,29 +92,34 @@ module Loofah
    #    # decidedly not ok for browser:
    #    frag.text(:encode_special_chars => false) # => "<script>alert('EVIL');</script>"
    #
-    def text(options={})
-      result = serialize_root.children.inner_text rescue ""
+    def text(options = {})
+      result = if serialize_root
+        serialize_root.children.reject(&:comment?).map(&:inner_text).join("")
+      else
+        ""
+      end
       if options[:encode_special_chars] == false
        result # possibly dangerous if rendered in a browser
      else
        encode_special_chars result
      end
    end
+
    alias :inner_text :text
-    alias :to_str     :text
+    alias :to_str :text
 
    #
    # Returns a plain-text version of the markup contained by the
    # fragment, with HTML entities encoded.
    #
-    # This method is slower than #to_text, but is clever about
-    # whitespace around block elements.
+    # This method is slower than #text, but is clever about
+    # whitespace around block elements and line break elements.
    #
-    #    Loofah.document("<h1>Title</h1><div>Content</div>").to_text
-    #    # => "\nTitle\n\nContent\n"
+    #    Loofah.document("<h1>Title</h1><div>Content<br>Next line</div>").to_text
+    #    # => "\nTitle\n\nContent\nNext line\n"
    #
-    def to_text(options={})
-      Loofah::Helpers.remove_extraneous_whitespace self.dup.scrub!(:newline_block_elements).text(options)
+    def to_text(options = {})
+      Loofah.remove_extraneous_whitespace self.dup.scrub!(:newline_block_elements).text(options)
    end
  end
 
data/lib/loofah/metahelpers.rb
@@ -1,15 +1,14 @@
+# frozen_string_literal: true
 module Loofah
-  module MetaHelpers
-    def self.HashifiedConstants(orig_module)
-      hashed_module = Module.new
-      orig_module.constants.each do |constant|
-        next unless orig_module.module_eval("#{constant}").is_a?(Array)
-        hashed_module.module_eval <<-CODE
-          #{constant} = {}
-          #{orig_module.name}::#{constant}.each { |c| #{constant}[c] = true ; #{constant}[c.downcase] = true }
-        CODE
+  module MetaHelpers # :nodoc:
+    def self.add_downcased_set_members_to_all_set_constants(mojule)
+      mojule.constants.each do |constant_sym|
+        constant = mojule.const_get constant_sym
+        next unless Set === constant
+        constant.dup.each do |member|
+          constant.add member.downcase
+        end
       end
-      hashed_module
     end
   end
 end
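
The replacement helper no longer builds hash-backed modules; it mutates every Set constant on the module it is given so that a downcased copy of each member is also present. A hypothetical illustration (MetaHelpers is an internal, :nodoc: helper, and the Example module with its THINGS constant is made up for this sketch):

    require "set"
    require "loofah"

    module Example
      THINGS = Set.new(["Foo", "BAR"])
    end

    # adds "foo" and "bar" to the existing set, in place
    Loofah::MetaHelpers.add_downcased_set_members_to_all_set_constants(Example)
    Example::THINGS.include?("foo") # => true
    Example::THINGS.include?("bar") # => true
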
data/lib/loofah/scrubber.rb
@@ -1,8 +1,9 @@
+# frozen_string_literal: true
 module Loofah
   #
   # A RuntimeError raised when Loofah could not find an appropriate scrubber.
   #
-  class ScrubberNotFound < RuntimeError ; end
+  class ScrubberNotFound < RuntimeError; end
 
   #
   # A Scrubber wraps up a block (or method) that is run on an HTML node (element):
@@ -36,7 +37,7 @@ module Loofah
     CONTINUE = Object.new.freeze
 
     # Top-down Scrubbers may return STOP to indicate that the subtree should not be traversed.
-    STOP     = Object.new.freeze
+    STOP = Object.new.freeze
 
     # When a scrubber is initialized, the :direction may be specified
     # as :top_down (the default) or :bottom_up.
@@ -64,7 +65,7 @@ module Loofah
     def initialize(options = {}, &block)
       direction = options[:direction] || :top_down
       unless [:top_down, :bottom_up].include?(direction)
-        raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
+        raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
       end
       @direction, @block = direction, block
     end
@@ -86,16 +87,31 @@ module Loofah
       raise ScrubberNotFound, "No scrub method has been defined on #{self.class.to_s}"
     end
 
+    #
+    # If the attribute is not set, add it
+    # If the attribute is set, don't overwrite the existing value
+    #
+    def append_attribute(node, attribute, value)
+      current_value = node.get_attribute(attribute) || ""
+      current_values = current_value.split(/\s+/)
+      updated_value = current_values | [value]
+      node.set_attribute(attribute, updated_value.join(" "))
+    end
+
     private
 
     def html5lib_sanitize(node)
       case node.type
       when Nokogiri::XML::Node::ELEMENT_NODE
-        if HTML5::HashedWhiteList::ALLOWED_ELEMENTS_WITH_LIBXML2[node.name]
+        if HTML5::Scrub.allowed_element? node.name
          HTML5::Scrub.scrub_attributes node
          return Scrubber::CONTINUE
        end
       when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE
+        if HTML5::Scrub.cdata_needs_escaping?(node)
+          node.before(HTML5::Scrub.cdata_escape(node))
+          return Scrubber::STOP
+        end
        return Scrubber::CONTINUE
       end
       Scrubber::STOP
@@ -107,11 +123,11 @@ module Loofah
       else
        return if scrub(node) == STOP
      end
-      node.children.each {|j| traverse_conditionally_top_down(j)}
+      node.children.each { |j| traverse_conditionally_top_down(j) }
     end
 
     def traverse_conditionally_bottom_up(node)
-      node.children.each {|j| traverse_conditionally_bottom_up(j)}
+      node.children.each { |j| traverse_conditionally_bottom_up(j) }
       if block
        block.call(node)
      else
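
Because append_attribute is defined before the private keyword, custom Scrubber subclasses can reuse it to merge space-separated rel-style values instead of overwriting them. A minimal sketch; the RelSponsored class and the "sponsored" value are illustrative, not part of Loofah:

    require "loofah"

    class RelSponsored < Loofah::Scrubber
      def initialize
        @direction = :top_down
      end

      def scrub(node)
        return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
        # merges "sponsored" into any existing rel value rather than replacing it
        append_attribute(node, "rel", "sponsored")
        STOP
      end
    end

    Loofah.fragment("<a href='https://example.com/' rel='nofollow'>hi</a>").scrub!(RelSponsored.new).to_s
    # => expected along the lines of:
    # <a href="https://example.com/" rel="nofollow sponsored">hi</a>
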
data/lib/loofah/scrubbers.rb
@@ -1,7 +1,8 @@
+# frozen_string_literal: true
 module Loofah
   #
   # Loofah provides some built-in scrubbers for sanitizing with
-  # HTML5lib's whitelist and for accomplishing some common
+  # HTML5lib's safelist and for accomplishing some common
   # transformation tasks.
   #
   #
@@ -58,6 +59,30 @@ module Loofah
   #     Loofah.fragment(link_farmers_markup).scrub!(:nofollow)
   #     => "ohai! <a href='http://www.myswarmysite.com/' rel="nofollow">I like your blog post</a>"
   #
+  #
+  #  === Loofah::Scrubbers::NoOpener / scrub!(:noopener)
+  #
+  #  +:noopener+ adds a rel="noopener" attribute to all links
+  #
+  #     link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
+  #     Loofah.fragment(link_farmers_markup).scrub!(:noopener)
+  #     => "ohai! <a href='http://www.myswarmysite.com/' rel="noopener">I like your blog post</a>"
+  #
+  #
+  #  === Loofah::Scrubbers::Unprintable / scrub!(:unprintable)
+  #
+  #  +:unprintable+ removes unprintable Unicode characters.
+  #
+  #     markup = "<p>Some text with an unprintable character at the end\u2028</p>"
+  #     Loofah.fragment(markup).scrub!(:unprintable)
+  #     => "<p>Some text with an unprintable character at the end</p>"
+  #
+  #  You may not be able to see the unprintable character in the above example, but there is a
+  #  U+2028 character right before the closing </p> tag. These characters can cause issues if
+  #  the content is ever parsed by JavaScript - more information here:
+  #
+  #     http://timelessrepo.com/json-isnt-a-javascript-subset
+  #
   module Scrubbers
     #
     # === scrub!(:strip)
@@ -75,8 +100,9 @@ module Loofah
 
       def scrub(node)
         return CONTINUE if html5lib_sanitize(node) == CONTINUE
-        node.before node.inner_html
+        node.before(node.children)
         node.remove
+        return STOP
       end
     end
 
@@ -117,8 +143,7 @@ module Loofah
 
       def scrub(node)
         return CONTINUE if html5lib_sanitize(node) == CONTINUE
-        replacement_killer = Nokogiri::XML::Text.new(node.to_s, node.document)
-        node.add_next_sibling replacement_killer
+        node.add_next_sibling Nokogiri::XML::Text.new(node.to_s, node.document)
         node.remove
         return STOP
       end
@@ -150,7 +175,7 @@ module Loofah
       def scrub(node)
         case node.type
         when Nokogiri::XML::Node::ELEMENT_NODE
-          if HTML5::HashedWhiteList::ALLOWED_ELEMENTS_WITH_LIBXML2[node.name]
+          if HTML5::Scrub.allowed_element? node.name
            node.attributes.each { |attr| node.remove_attribute(attr.first) }
            return CONTINUE if node.namespaces.empty?
          end
@@ -177,9 +202,30 @@ module Loofah
       end
 
       def scrub(node)
-        return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == 'a')
-        node.set_attribute('rel', 'nofollow')
-        return STOP
+        return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
+        append_attribute(node, "rel", "nofollow")
+        return STOP
+      end
+    end
+
+    #
+    # === scrub!(:noopener)
+    #
+    # +:noopener+ adds a rel="noopener" attribute to all links
+    #
+    #     link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
+    #     Loofah.fragment(link_farmers_markup).scrub!(:noopener)
+    #     => "ohai! <a href='http://www.myswarmysite.com/' rel="noopener">I like your blog post</a>"
+    #
+    class NoOpener < Scrubber
+      def initialize
+        @direction = :top_down
+      end
+
+      def scrub(node)
+        return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
+        append_attribute(node, "rel", "noopener")
+        return STOP
       end
     end
 
@@ -190,23 +236,57 @@ module Loofah
       end
 
       def scrub(node)
-        return CONTINUE unless Loofah::HashedElements::BLOCK_LEVEL[node.name]
-        replacement_killer = Nokogiri::XML::Text.new("\n#{node.content}\n", node.document)
-        node.add_next_sibling replacement_killer
+        return CONTINUE unless Loofah::Elements::LINEBREAKERS.include?(node.name)
+        replacement = if Loofah::Elements::INLINE_LINE_BREAK.include?(node.name)
+          "\n"
+        else
+          "\n#{node.content}\n"
+        end
+        node.add_next_sibling Nokogiri::XML::Text.new(replacement, node.document)
         node.remove
       end
     end
 
+    #
+    # === scrub!(:unprintable)
+    #
+    # +:unprintable+ removes unprintable Unicode characters.
+    #
+    #     markup = "<p>Some text with an unprintable character at the end\u2028</p>"
+    #     Loofah.fragment(markup).scrub!(:unprintable)
+    #     => "<p>Some text with an unprintable character at the end</p>"
+    #
+    # You may not be able to see the unprintable character in the above example, but there is a
+    # U+2028 character right before the closing </p> tag. These characters can cause issues if
+    # the content is ever parsed by JavaScript - more information here:
+    #
+    #     http://timelessrepo.com/json-isnt-a-javascript-subset
+    #
+    class Unprintable < Scrubber
+      def initialize
+        @direction = :top_down
+      end
+
+      def scrub(node)
+        if node.type == Nokogiri::XML::Node::TEXT_NODE || node.type == Nokogiri::XML::Node::CDATA_SECTION_NODE
+          node.content = node.content.gsub(/\u2028|\u2029/, "")
+        end
+        CONTINUE
+      end
+    end
+
     #
     # A hash that maps a symbol (like +:prune+) to the appropriate Scrubber (Loofah::Scrubbers::Prune).
     #
     MAP = {
-      :escape    => Escape,
-      :prune     => Prune,
+      :escape => Escape,
+      :prune => Prune,
       :whitewash => Whitewash,
-      :strip     => Strip,
-      :nofollow  => NoFollow,
-      :newline_block_elements => NewlineBlockElements
+      :strip => Strip,
+      :nofollow => NoFollow,
+      :noopener => NoOpener,
+      :newline_block_elements => NewlineBlockElements,
+      :unprintable => Unprintable,
     }
 
     #
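
The new scrubbers compose with the existing ones: scrub! returns the fragment, so calls can be chained, and append_attribute merges rel values rather than clobbering them. A short usage sketch; the output string is illustrative:

    require "loofah"

    html = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>\u2028"
    frag = Loofah.fragment(html)
    frag.scrub!(:nofollow).scrub!(:noopener).scrub!(:unprintable)
    frag.to_s
    # => expected along the lines of:
    # "ohai! <a href=\"http://www.myswarmysite.com/\" rel=\"nofollow noopener\">I like your blog post</a>"
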
data/lib/loofah/version.rb
@@ -0,0 +1,5 @@
+# frozen_string_literal: true
+module Loofah
+  # The version of Loofah you are using
+  VERSION = "2.19.1"
+end

data/lib/loofah/xml/document.rb
@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 module Loofah
   module XML # :nodoc:
     #

data/lib/loofah/xml/document_fragment.rb
@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 module Loofah
   module XML # :nodoc:
     #
@@ -12,8 +13,10 @@ module Loofah
        # constructor. Applications should use Loofah.fragment to
        # parse a fragment.
        #
-        def parse tags
-          self.new(Loofah::XML::Document.new, tags)
+        def parse(tags)
+          doc = Loofah::XML::Document.new
+          doc.encoding = tags.encoding.name if tags.respond_to?(:encoding)
+          self.new(doc, tags)
        end
      end
    end
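
The parse change above copies the input string's encoding onto the backing XML document instead of always defaulting it. A minimal sketch; the returned encoding value is what the change implies, not a quoted test result:

    require "loofah"

    xml = "<widgets><widget>façade</widget></widgets>".encode(Encoding::ISO_8859_1)
    frag = Loofah.xml_fragment(xml)
    frag.document.encoding # => "ISO-8859-1"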