loofah 1.0.0 → 2.19.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +489 -0
  3. data/MIT-LICENSE.txt +3 -1
  4. data/README.md +364 -0
  5. data/SECURITY.md +18 -0
  6. data/lib/loofah/elements.rb +88 -11
  7. data/lib/loofah/helpers.rb +76 -2
  8. data/lib/loofah/html/document.rb +1 -0
  9. data/lib/loofah/html/document_fragment.rb +9 -2
  10. data/lib/loofah/html5/libxml2_workarounds.rb +27 -0
  11. data/lib/loofah/html5/safelist.rb +1042 -0
  12. data/lib/loofah/html5/scrub.rb +198 -40
  13. data/lib/loofah/instance_methods.rb +16 -10
  14. data/lib/loofah/metahelpers.rb +9 -10
  15. data/lib/loofah/scrubber.rb +22 -6
  16. data/lib/loofah/scrubbers.rb +96 -16
  17. data/lib/loofah/version.rb +5 -0
  18. data/lib/loofah/xml/document.rb +1 -0
  19. data/lib/loofah/xml/document_fragment.rb +5 -2
  20. data/lib/loofah.rb +38 -25
  21. metadata +159 -172
  22. data/CHANGELOG.rdoc +0 -134
  23. data/Gemfile +0 -1
  24. data/Manifest.txt +0 -34
  25. data/README.rdoc +0 -312
  26. data/Rakefile +0 -53
  27. data/benchmark/benchmark.rb +0 -149
  28. data/benchmark/fragment.html +0 -96
  29. data/benchmark/helper.rb +0 -73
  30. data/benchmark/www.slashdot.com.html +0 -2560
  31. data/lib/loofah/html5/whitelist.rb +0 -168
  32. data/test/helper.rb +0 -7
  33. data/test/html5/test_sanitizer.rb +0 -248
  34. data/test/integration/test_ad_hoc.rb +0 -176
  35. data/test/integration/test_helpers.rb +0 -33
  36. data/test/integration/test_html.rb +0 -51
  37. data/test/integration/test_scrubbers.rb +0 -331
  38. data/test/integration/test_xml.rb +0 -55
  39. data/test/unit/test_api.rb +0 -138
  40. data/test/unit/test_helpers.rb +0 -27
  41. data/test/unit/test_scrubber.rb +0 -229
  42. data/test/unit/test_scrubbers.rb +0 -14
data/lib/loofah/html5/scrub.rb
@@ -1,70 +1,228 @@
- require 'cgi'
+ # frozen_string_literal: true
+ require "cgi"
+ require "crass"

  module Loofah
    module HTML5 # :nodoc:
      module Scrub
+       CONTROL_CHARACTERS = /[`\u0000-\u0020\u007f\u0080-\u0101]/
+       CSS_KEYWORDISH = /\A(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,3}\.?\d{0,10}(ch|cm|r?em|ex|in|lh|mm|pc|pt|px|Q|vmax|vmin|vw|vh|%|,|\))?)\z/
+       CRASS_SEMICOLON = { node: :semicolon, raw: ";" }
+       CSS_IMPORTANT = '!important'
+       CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES = /\A(["'])?[^"']+\1\z/
+       DATA_ATTRIBUTE_NAME = /\Adata-[\w-]+\z/

        class << self
+         def allowed_element?(element_name)
+           ::Loofah::HTML5::SafeList::ALLOWED_ELEMENTS_WITH_LIBXML2.include?(element_name)
+         end

          # alternative implementation of the html5lib attribute scrubbing algorithm
          def scrub_attributes(node)
            node.attribute_nodes.each do |attr_node|
              attr_name = if attr_node.namespace
-                           "#{attr_node.namespace.prefix}:#{attr_node.node_name}"
-                         else
-                           attr_node.node_name
-                         end
-             attr_node.remove unless HashedWhiteList::ALLOWED_ATTRIBUTES[attr_name]
-             if HashedWhiteList::ATTR_VAL_IS_URI[attr_name]
-               # this block lifted nearly verbatim from HTML5 sanitization
-               val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(/`|[\000-\040\177\s]+|\302[\200-\240]/,'').downcase
-               if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and HashedWhiteList::ALLOWED_PROTOCOLS[val_unescaped.split(':')[0]].nil?
-                 attr_node.remove
-               end
+               "#{attr_node.namespace.prefix}:#{attr_node.node_name}"
+             else
+               attr_node.node_name
              end
-             if HashedWhiteList::SVG_ATTR_VAL_ALLOWS_REF[attr_name]
-               attr_node.value = attr_node.value.gsub(/url\s*\(\s*[^#\s][^)]+?\)/m, ' ') if attr_node.value
+
+             if attr_name =~ DATA_ATTRIBUTE_NAME
+               next
              end
-             if HashedWhiteList::SVG_ALLOW_LOCAL_HREF[node.name] && attr_name == 'xlink:href' && attr_node.value =~ /^\s*[^#\s].*/m
+
+             unless SafeList::ALLOWED_ATTRIBUTES.include?(attr_name)
                attr_node.remove
+               next
+             end
+
+             if SafeList::ATTR_VAL_IS_URI.include?(attr_name)
+               next if scrub_uri_attribute(attr_node)
+             end
+
+             if SafeList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
+               scrub_attribute_that_allows_local_ref(attr_node)
+             end
+
+             if SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) && attr_name == "xlink:href" && attr_node.value =~ /^\s*[^#\s].*/m
+               attr_node.remove
+               next
              end
            end
-           if node.attributes['style']
-             node['style'] = scrub_css(node.attributes['style'])
+
+           scrub_css_attribute(node)
+
+           node.attribute_nodes.each do |attr_node|
+             if attr_node.value !~ /[^[:space:]]/ && attr_node.name !~ DATA_ATTRIBUTE_NAME
+               node.remove_attribute(attr_node.name)
+             end
            end
+
+           force_correct_attribute_escaping!(node)
+         end
+
+         def scrub_css_attribute(node)
+           style = node.attributes["style"]
+           style.value = scrub_css(style.value) if style
          end

-         # lifted nearly verbatim from html5lib
          def scrub_css(style)
-           # disallow urls
-           style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')
-
-           # gauntlet
-           return '' unless style =~ /^([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*$/
-           return '' unless style =~ /^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$/
-
-           clean = []
-           style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
-             next if val.empty?
-             prop.downcase!
-             if HashedWhiteList::ALLOWED_CSS_PROPERTIES[prop]
-               clean << "#{prop}: #{val};"
-             elsif %w[background border margin padding].include?(prop.split('-')[0])
-               clean << "#{prop}: #{val};" unless val.split().any? do |keyword|
-                 HashedWhiteList::ALLOWED_CSS_KEYWORDS[keyword].nil? and
-                 keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/
+           style_tree = Crass.parse_properties(style)
+           sanitized_tree = []
+
+           style_tree.each do |node|
+             next unless node[:node] == :property
+             next if node[:children].any? do |child|
+               [:url, :bad_url].include?(child[:node])
+             end
+
+             name = node[:name].downcase
+             next unless SafeList::ALLOWED_CSS_PROPERTIES.include?(name) ||
+               SafeList::ALLOWED_SVG_PROPERTIES.include?(name) ||
+               SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first)
+
+             value = node[:children].map do |child|
+               case child[:node]
+               when :whitespace
+                 nil
+               when :string
+                 if child[:raw] =~ CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES
+                   Crass::Parser.stringify(child)
+                 else
+                   nil
+                 end
+               when :function
+                 if SafeList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase)
+                   Crass::Parser.stringify(child)
+                 end
+               when :ident
+                 keyword = child[:value]
+                 if !SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first) ||
+                   SafeList::ALLOWED_CSS_KEYWORDS.include?(keyword) ||
+                   (keyword =~ CSS_KEYWORDISH)
+                   keyword
+                 end
+               else
+                 child[:raw]
+               end
+             end.compact
+
+             next if value.empty?
+             value << CSS_IMPORTANT if node[:important]
+             propstring = format("%s:%s", name, value.join(" "))
+             sanitized_node = Crass.parse_properties(propstring).first
+             sanitized_tree << sanitized_node << CRASS_SEMICOLON
+           end
+
+           Crass::Parser.stringify(sanitized_tree)
+         end
+
+         def scrub_attribute_that_allows_local_ref(attr_node)
+           return unless attr_node.value
+
+           nodes = Crass::Parser.new(attr_node.value).parse_component_values
+
+           values = nodes.map do |node|
+             case node[:node]
+             when :url
+               if node[:value].start_with?("#")
+                 node[:raw]
+               else
+                 nil
                end
-             elsif HashedWhiteList::ALLOWED_SVG_PROPERTIES[prop]
-               clean << "#{prop}: #{val};"
+             when :hash, :ident, :string
+               node[:raw]
+             else
+               nil
             end
+           end.compact
+
+           attr_node.value = values.join(" ")
+         end
+
+         def scrub_uri_attribute(attr_node)
+           # this block lifted nearly verbatim from HTML5 sanitization
+           val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS, "").downcase
+           if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && !SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
+             attr_node.remove
+             return true
+           elsif val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0] == "data"
+             # permit only allowed data mediatypes
+             mediatype = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[1]
+             mediatype, _ = mediatype.split(";")[0..1] if mediatype
+             if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
+               attr_node.remove
+               return true
+             end
+           end
+           false
+         end
+
+         #
+         # libxml2 >= 2.9.2 fails to escape comments within some attributes.
+         #
+         # see comments about CVE-2018-8048 within the tests for more information
+         #
+         def force_correct_attribute_escaping!(node)
+           return unless Nokogiri::VersionInfo.instance.libxml2?
+
+           node.attribute_nodes.each do |attr_node|
+             next unless LibxmlWorkarounds::BROKEN_ESCAPING_ATTRIBUTES.include?(attr_node.name)
+
+             tag_name = LibxmlWorkarounds::BROKEN_ESCAPING_ATTRIBUTES_QUALIFYING_TAG[attr_node.name]
+             next unless tag_name.nil? || tag_name == node.name
+
+             #
+             # this block is just like CGI.escape in Ruby 2.4, but
+             # only encodes space and double-quote, to mimic
+             # pre-2.9.2 behavior
+             #
+             encoding = attr_node.value.encoding
+             attr_node.value = attr_node.value.gsub(/[ "]/) do |m|
+               "%" + m.unpack("H2" * m.bytesize).join("%").upcase
+             end.force_encoding(encoding)
            end
+         end

-           style = clean.join(' ')
+         def cdata_needs_escaping?(node)
+           # Nokogiri's HTML4 parser on JRuby doesn't flag the child of a `style` or `script` tag as cdata, but it acts that way
+           node.cdata? || (Nokogiri.jruby? && node.text? && (node.parent.name == "style" || node.parent.name == "script"))
          end

-       end
+         def cdata_escape(node)
+           escaped_text = escape_tags(node.text)
+           if Nokogiri.jruby?
+             node.document.create_text_node(escaped_text)
+           else
+             node.document.create_cdata(escaped_text)
+           end
+         end

+         TABLE_FOR_ESCAPE_HTML__ = {
+           '<' => '&lt;',
+           '>' => '&gt;',
+           '&' => '&amp;',
+         }
+
+         def escape_tags(string)
+           # modified version of CGI.escapeHTML from ruby 3.1
+           enc = string.encoding
+           unless enc.ascii_compatible?
+             if enc.dummy?
+               origenc = enc
+               enc = Encoding::Converter.asciicompat_encoding(enc)
+               string = enc ? string.encode(enc) : string.b
+             end
+             table = Hash[TABLE_FOR_ESCAPE_HTML__.map {|pair|pair.map {|s|s.encode(enc)}}]
+             string = string.gsub(/#{"[<>&]".encode(enc)}/, table)
+             string.encode!(origenc) if origenc
+             string
+           else
+             string = string.b
+             string.gsub!(/[<>&]/, TABLE_FOR_ESCAPE_HTML__)
+             string.force_encoding(enc)
+           end
+         end
+       end
      end
    end
  end
-
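A minimal sketch of how the Crass-based scrub_css above behaves when it is reached through the :strip scrubber (the output shown is approximate and illustrative, not taken from the gem's tests): a property whose value contains a url() token is dropped because :url nodes are rejected, while a safelisted property survives and is re-serialized.

    require "loofah"

    html = %q{<div style="background-color: #f00; background-image: url(http://example.com/x.png)">hi</div>}
    puts Loofah.fragment(html).scrub!(:strip).to_s
    # => roughly: <div style="background-color:#f00;">hi</div>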
data/lib/loofah/instance_methods.rb
@@ -1,3 +1,4 @@
+ # frozen_string_literal: true
  module Loofah
    #
    # Mixes +scrub!+ into Document, DocumentFragment, Node and NodeSet.
@@ -41,7 +42,7 @@ module Loofah
        when Nokogiri::XML::Document
          scrubber.traverse(root) if root
        when Nokogiri::XML::DocumentFragment
-         children.each { |node| node.scrub!(scrubber) } # TODO: children.scrub! once Nokogiri 1.4.2 is out
+         children.scrub! scrubber
        else
          scrubber.traverse(self)
        end
@@ -91,29 +92,34 @@ module Loofah
    #    # decidedly not ok for browser:
    #    frag.text(:encode_special_chars => false) # => "<script>alert('EVIL');</script>"
    #
-   def text(options={})
-     result = serialize_root.children.inner_text rescue ""
+   def text(options = {})
+     result = if serialize_root
+       serialize_root.children.reject(&:comment?).map(&:inner_text).join("")
+     else
+       ""
+     end
      if options[:encode_special_chars] == false
        result # possibly dangerous if rendered in a browser
      else
        encode_special_chars result
      end
    end
+
    alias :inner_text :text
-   alias :to_str :text
+   alias :to_str :text

    #
    # Returns a plain-text version of the markup contained by the
    # fragment, with HTML entities encoded.
    #
-   # This method is slower than #to_text, but is clever about
-   # whitespace around block elements.
+   # This method is slower than #text, but is clever about
+   # whitespace around block elements and line break elements.
    #
-   #   Loofah.document("<h1>Title</h1><div>Content</div>").to_text
-   #   # => "\nTitle\n\nContent\n"
+   #   Loofah.document("<h1>Title</h1><div>Content<br>Next line</div>").to_text
+   #   # => "\nTitle\n\nContent\nNext line\n"
    #
-   def to_text(options={})
-     Loofah::Helpers.remove_extraneous_whitespace self.dup.scrub!(:newline_block_elements).text(options)
+   def to_text(options = {})
+     Loofah.remove_extraneous_whitespace self.dup.scrub!(:newline_block_elements).text(options)
    end
  end

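An illustrative (approximate) usage sketch for the reworked #text and #to_text above: #text now ignores top-level comment nodes, and #to_text emits a newline for <br> through the :newline_block_elements scrubber.

    require "loofah"

    frag = Loofah.fragment("<div>Hello<br>world</div><!-- hidden -->")
    frag.text     # => "Helloworld"  (the comment text is no longer included)
    frag.to_text  # => roughly "\nHello\nworld\n" after extraneous whitespace is removed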
data/lib/loofah/metahelpers.rb
@@ -1,15 +1,14 @@
+ # frozen_string_literal: true
  module Loofah
-   module MetaHelpers
-     def self.HashifiedConstants(orig_module)
-       hashed_module = Module.new
-       orig_module.constants.each do |constant|
-         next unless orig_module.module_eval("#{constant}").is_a?(Array)
-         hashed_module.module_eval <<-CODE
-           #{constant} = {}
-           #{orig_module.name}::#{constant}.each { |c| #{constant}[c] = true ; #{constant}[c.downcase] = true }
-         CODE
+   module MetaHelpers # :nodoc:
+     def self.add_downcased_set_members_to_all_set_constants(mojule)
+       mojule.constants.each do |constant_sym|
+         constant = mojule.const_get constant_sym
+         next unless Set === constant
+         constant.dup.each do |member|
+           constant.add member.downcase
+         end
        end
-       hashed_module
      end
    end
  end
data/lib/loofah/scrubber.rb
@@ -1,8 +1,9 @@
+ # frozen_string_literal: true
  module Loofah
    #
    # A RuntimeError raised when Loofah could not find an appropriate scrubber.
    #
-   class ScrubberNotFound < RuntimeError ; end
+   class ScrubberNotFound < RuntimeError; end

    #
    # A Scrubber wraps up a block (or method) that is run on an HTML node (element):
@@ -36,7 +37,7 @@ module Loofah
      CONTINUE = Object.new.freeze

      # Top-down Scrubbers may return STOP to indicate that the subtree should not be traversed.
-     STOP = Object.new.freeze
+     STOP = Object.new.freeze

      # When a scrubber is initialized, the :direction may be specified
      # as :top_down (the default) or :bottom_up.
@@ -64,7 +65,7 @@ module Loofah
      def initialize(options = {}, &block)
        direction = options[:direction] || :top_down
        unless [:top_down, :bottom_up].include?(direction)
-         raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
+         raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
        end
        @direction, @block = direction, block
      end
@@ -86,16 +87,31 @@ module Loofah
        raise ScrubberNotFound, "No scrub method has been defined on #{self.class.to_s}"
      end

+     #
+     #  If the attribute is not set, add it
+     #  If the attribute is set, don't overwrite the existing value
+     #
+     def append_attribute(node, attribute, value)
+       current_value = node.get_attribute(attribute) || ""
+       current_values = current_value.split(/\s+/)
+       updated_value = current_values | [value]
+       node.set_attribute(attribute, updated_value.join(" "))
+     end
+
      private

      def html5lib_sanitize(node)
        case node.type
        when Nokogiri::XML::Node::ELEMENT_NODE
-         if HTML5::HashedWhiteList::ALLOWED_ELEMENTS_WITH_LIBXML2[node.name]
+         if HTML5::Scrub.allowed_element? node.name
            HTML5::Scrub.scrub_attributes node
            return Scrubber::CONTINUE
          end
        when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE
+         if HTML5::Scrub.cdata_needs_escaping?(node)
+           node.before(HTML5::Scrub.cdata_escape(node))
+           return Scrubber::STOP
+         end
          return Scrubber::CONTINUE
        end
        Scrubber::STOP
@@ -107,11 +123,11 @@ module Loofah
        else
          return if scrub(node) == STOP
        end
-       node.children.each {|j| traverse_conditionally_top_down(j)}
+       node.children.each { |j| traverse_conditionally_top_down(j) }
      end

      def traverse_conditionally_bottom_up(node)
-       node.children.each {|j| traverse_conditionally_bottom_up(j)}
+       node.children.each { |j| traverse_conditionally_bottom_up(j) }
        if block
          block.call(node)
        else
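The new Scrubber#append_attribute above unions a value into any existing attribute value instead of overwriting it. A hedged sketch of using it from a custom scrubber follows; the NoReferrer class and its output are hypothetical, not part of the gem.

    require "loofah"

    # Hypothetical custom scrubber built on the append_attribute helper shown above.
    class NoReferrer < Loofah::Scrubber
      def scrub(node)
        return CONTINUE unless node.type == Nokogiri::XML::Node::ELEMENT_NODE && node.name == "a"
        append_attribute(node, "rel", "noreferrer")
        STOP
      end
    end

    html = %q{<a href="https://example.com/" rel="nofollow">link</a>}
    puts Loofah.fragment(html).scrub!(NoReferrer.new).to_s
    # the existing "nofollow" is preserved; rel becomes roughly "nofollow noreferrer"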
data/lib/loofah/scrubbers.rb
@@ -1,7 +1,8 @@
+ # frozen_string_literal: true
  module Loofah
    #
    # Loofah provides some built-in scrubbers for sanitizing with
-   # HTML5lib's whitelist and for accomplishing some common
+   # HTML5lib's safelist and for accomplishing some common
    # transformation tasks.
    #
    #
@@ -58,6 +59,30 @@ module Loofah
    #    Loofah.fragment(link_farmers_markup).scrub!(:nofollow)
    #    => "ohai! <a href='http://www.myswarmysite.com/' rel="nofollow">I like your blog post</a>"
    #
+   #
+   #  === Loofah::Scrubbers::NoOpener / scrub!(:noopener)
+   #
+   #  +:noopener+ adds a rel="noopener" attribute to all links
+   #
+   #    link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
+   #    Loofah.fragment(link_farmers_markup).scrub!(:noopener)
+   #    => "ohai! <a href='http://www.myswarmysite.com/' rel="noopener">I like your blog post</a>"
+   #
+   #
+   #  === Loofah::Scrubbers::Unprintable / scrub!(:unprintable)
+   #
+   #  +:unprintable+ removes unprintable Unicode characters.
+   #
+   #    markup = "<p>Some text with an unprintable character at the end\u2028</p>"
+   #    Loofah.fragment(markup).scrub!(:unprintable)
+   #    => "<p>Some text with an unprintable character at the end</p>"
+   #
+   #  You may not be able to see the unprintable character in the above example, but there is a
+   #  U+2028 character right before the closing </p> tag. These characters can cause issues if
+   #  the content is ever parsed by JavaScript - more information here:
+   #
+   #    http://timelessrepo.com/json-isnt-a-javascript-subset
+   #
    module Scrubbers
      #
      #  === scrub!(:strip)
@@ -75,8 +100,9 @@

      def scrub(node)
        return CONTINUE if html5lib_sanitize(node) == CONTINUE
-       node.before node.inner_html
+       node.before(node.children)
        node.remove
+       return STOP
      end
    end

@@ -117,8 +143,7 @@

      def scrub(node)
        return CONTINUE if html5lib_sanitize(node) == CONTINUE
-       replacement_killer = Nokogiri::XML::Text.new(node.to_s, node.document)
-       node.add_next_sibling replacement_killer
+       node.add_next_sibling Nokogiri::XML::Text.new(node.to_s, node.document)
        node.remove
        return STOP
      end
@@ -150,7 +175,7 @@
      def scrub(node)
        case node.type
        when Nokogiri::XML::Node::ELEMENT_NODE
-         if HTML5::HashedWhiteList::ALLOWED_ELEMENTS_WITH_LIBXML2[node.name]
+         if HTML5::Scrub.allowed_element? node.name
            node.attributes.each { |attr| node.remove_attribute(attr.first) }
            return CONTINUE if node.namespaces.empty?
          end
@@ -177,9 +202,30 @@
      end

      def scrub(node)
-       return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == 'a')
-       node.set_attribute('rel', 'nofollow')
-       return STOP
+       return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
+       append_attribute(node, "rel", "nofollow")
+       return STOP
+     end
+   end
+
+   #
+   #  === scrub!(:noopener)
+   #
+   #  +:noopener+ adds a rel="noopener" attribute to all links
+   #
+   #    link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
+   #    Loofah.fragment(link_farmers_markup).scrub!(:noopener)
+   #    => "ohai! <a href='http://www.myswarmysite.com/' rel="noopener">I like your blog post</a>"
+   #
+   class NoOpener < Scrubber
+     def initialize
+       @direction = :top_down
+     end
+
+     def scrub(node)
+       return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
+       append_attribute(node, "rel", "noopener")
+       return STOP
      end
    end

@@ -190,23 +236,57 @@
      end

      def scrub(node)
-       return CONTINUE unless Loofah::HashedElements::BLOCK_LEVEL[node.name]
-       replacement_killer = Nokogiri::XML::Text.new("\n#{node.content}\n", node.document)
-       node.add_next_sibling replacement_killer
+       return CONTINUE unless Loofah::Elements::LINEBREAKERS.include?(node.name)
+       replacement = if Loofah::Elements::INLINE_LINE_BREAK.include?(node.name)
+         "\n"
+       else
+         "\n#{node.content}\n"
+       end
+       node.add_next_sibling Nokogiri::XML::Text.new(replacement, node.document)
        node.remove
      end
    end

+   #
+   #  === scrub!(:unprintable)
+   #
+   #  +:unprintable+ removes unprintable Unicode characters.
+   #
+   #    markup = "<p>Some text with an unprintable character at the end\u2028</p>"
+   #    Loofah.fragment(markup).scrub!(:unprintable)
+   #    => "<p>Some text with an unprintable character at the end</p>"
+   #
+   #  You may not be able to see the unprintable character in the above example, but there is a
+   #  U+2028 character right before the closing </p> tag. These characters can cause issues if
+   #  the content is ever parsed by JavaScript - more information here:
+   #
+   #    http://timelessrepo.com/json-isnt-a-javascript-subset
+   #
+   class Unprintable < Scrubber
+     def initialize
+       @direction = :top_down
+     end
+
+     def scrub(node)
+       if node.type == Nokogiri::XML::Node::TEXT_NODE || node.type == Nokogiri::XML::Node::CDATA_SECTION_NODE
+         node.content = node.content.gsub(/\u2028|\u2029/, "")
+       end
+       CONTINUE
+     end
+   end
+
    #
    # A hash that maps a symbol (like +:prune+) to the appropriate Scrubber (Loofah::Scrubbers::Prune).
    #
    MAP = {
-     :escape => Escape,
-     :prune => Prune,
+     :escape => Escape,
+     :prune => Prune,
      :whitewash => Whitewash,
-     :strip => Strip,
-     :nofollow => NoFollow,
-     :newline_block_elements => NewlineBlockElements
+     :strip => Strip,
+     :nofollow => NoFollow,
+     :noopener => NoOpener,
+     :newline_block_elements => NewlineBlockElements,
+     :unprintable => Unprintable,
    }

data/lib/loofah/version.rb
@@ -0,0 +1,5 @@
+ # frozen_string_literal: true
+ module Loofah
+   # The version of Loofah you are using
+   VERSION = "2.19.1"
+ end
data/lib/loofah/xml/document.rb
@@ -1,3 +1,4 @@
+ # frozen_string_literal: true
  module Loofah
    module XML # :nodoc:
      #
data/lib/loofah/xml/document_fragment.rb
@@ -1,3 +1,4 @@
+ # frozen_string_literal: true
  module Loofah
    module XML # :nodoc:
      #
@@ -12,8 +13,10 @@ module Loofah
          # constructor. Applications should use Loofah.fragment to
          # parse a fragment.
          #
-         def parse tags
-           self.new(Loofah::XML::Document.new, tags)
+         def parse(tags)
+           doc = Loofah::XML::Document.new
+           doc.encoding = tags.encoding.name if tags.respond_to?(:encoding)
+           self.new(doc, tags)
          end
        end
      end
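A small sketch (illustrative only) of the encoding-preserving parse above: when the input string responds to #encoding, the backing XML document is now seeded with that encoding before parsing.

    require "loofah"

    xml = "<name>café</name>".encode(Encoding::UTF_8)
    frag = Loofah.xml_fragment(xml)
    puts frag.to_xml # should round-trip as UTF-8 rather than a parser default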