loofah 2.2.3 → 2.19.0

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of loofah might be problematic. Click here for more details.

Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +212 -31
  3. data/README.md +18 -24
  4. data/lib/loofah/elements.rb +79 -75
  5. data/lib/loofah/helpers.rb +18 -7
  6. data/lib/loofah/html/document.rb +1 -0
  7. data/lib/loofah/html/document_fragment.rb +4 -2
  8. data/lib/loofah/html5/libxml2_workarounds.rb +8 -7
  9. data/lib/loofah/html5/safelist.rb +1043 -0
  10. data/lib/loofah/html5/scrub.rb +73 -48
  11. data/lib/loofah/instance_methods.rb +14 -8
  12. data/lib/loofah/metahelpers.rb +2 -1
  13. data/lib/loofah/scrubber.rb +8 -7
  14. data/lib/loofah/scrubbers.rb +19 -13
  15. data/lib/loofah/version.rb +5 -0
  16. data/lib/loofah/xml/document.rb +1 -0
  17. data/lib/loofah/xml/document_fragment.rb +2 -1
  18. data/lib/loofah.rb +35 -18
  19. metadata +52 -138
  20. data/.gemtest +0 -0
  21. data/Gemfile +0 -22
  22. data/Manifest.txt +0 -40
  23. data/Rakefile +0 -79
  24. data/benchmark/benchmark.rb +0 -149
  25. data/benchmark/fragment.html +0 -96
  26. data/benchmark/helper.rb +0 -73
  27. data/benchmark/www.slashdot.com.html +0 -2560
  28. data/lib/loofah/html5/whitelist.rb +0 -186
  29. data/test/assets/msword.html +0 -63
  30. data/test/assets/testdata_sanitizer_tests1.dat +0 -502
  31. data/test/helper.rb +0 -18
  32. data/test/html5/test_sanitizer.rb +0 -382
  33. data/test/integration/test_ad_hoc.rb +0 -204
  34. data/test/integration/test_helpers.rb +0 -43
  35. data/test/integration/test_html.rb +0 -72
  36. data/test/integration/test_scrubbers.rb +0 -400
  37. data/test/integration/test_xml.rb +0 -55
  38. data/test/unit/test_api.rb +0 -142
  39. data/test/unit/test_encoding.rb +0 -20
  40. data/test/unit/test_helpers.rb +0 -62
  41. data/test/unit/test_scrubber.rb +0 -229
  42. data/test/unit/test_scrubbers.rb +0 -14
@@ -1,104 +1,130 @@
1
- require 'cgi'
2
- require 'crass'
1
+ # frozen_string_literal: true
2
+ require "cgi"
3
+ require "crass"
3
4
 
4
5
  module Loofah
5
6
  module HTML5 # :nodoc:
6
7
  module Scrub
7
-
8
8
  CONTROL_CHARACTERS = /[`\u0000-\u0020\u007f\u0080-\u0101]/
9
- CSS_KEYWORDISH = /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
10
- CRASS_SEMICOLON = {:node => :semicolon, :raw => ";"}
9
+ CSS_KEYWORDISH = /\A(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,3}\.?\d{0,10}(ch|cm|r?em|ex|in|lh|mm|pc|pt|px|Q|vmax|vmin|vw|vh|%|,|\))?)\z/
10
+ CRASS_SEMICOLON = { node: :semicolon, raw: ";" }
11
+ CSS_IMPORTANT = '!important'
12
+ CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES = /\A(["'])?[^"']+\1\z/
13
+ DATA_ATTRIBUTE_NAME = /\Adata-[\w-]+\z/
11
14
 
12
15
  class << self
13
-
14
- def allowed_element? element_name
15
- ::Loofah::HTML5::WhiteList::ALLOWED_ELEMENTS_WITH_LIBXML2.include? element_name
16
+ def allowed_element?(element_name)
17
+ ::Loofah::HTML5::SafeList::ALLOWED_ELEMENTS_WITH_LIBXML2.include?(element_name)
16
18
  end
17
19
 
18
20
  # alternative implementation of the html5lib attribute scrubbing algorithm
19
- def scrub_attributes node
21
+ def scrub_attributes(node)
20
22
  node.attribute_nodes.each do |attr_node|
21
23
  attr_name = if attr_node.namespace
22
- "#{attr_node.namespace.prefix}:#{attr_node.node_name}"
23
- else
24
- attr_node.node_name
25
- end
24
+ "#{attr_node.namespace.prefix}:#{attr_node.node_name}"
25
+ else
26
+ attr_node.node_name
27
+ end
26
28
 
27
- if attr_name =~ /\Adata-[\w-]+\z/
29
+ if attr_name =~ DATA_ATTRIBUTE_NAME
28
30
  next
29
31
  end
30
32
 
31
- unless WhiteList::ALLOWED_ATTRIBUTES.include?(attr_name)
33
+ unless SafeList::ALLOWED_ATTRIBUTES.include?(attr_name)
32
34
  attr_node.remove
33
35
  next
34
36
  end
35
37
 
36
- if WhiteList::ATTR_VAL_IS_URI.include?(attr_name)
38
+ if SafeList::ATTR_VAL_IS_URI.include?(attr_name)
37
39
  # this block lifted nearly verbatim from HTML5 sanitization
38
- val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS,'').downcase
39
- if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && ! WhiteList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(WhiteList::PROTOCOL_SEPARATOR)[0])
40
+ val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS, "").downcase
41
+ if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && !SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
40
42
  attr_node.remove
41
43
  next
42
- elsif val_unescaped.split(WhiteList::PROTOCOL_SEPARATOR)[0] == 'data'
44
+ elsif val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0] == "data"
43
45
  # permit only allowed data mediatypes
44
- mediatype = val_unescaped.split(WhiteList::PROTOCOL_SEPARATOR)[1]
45
- mediatype, _ = mediatype.split(';')[0..1] if mediatype
46
- if mediatype && !WhiteList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
46
+ mediatype = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[1]
47
+ mediatype, _ = mediatype.split(";")[0..1] if mediatype
48
+ if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
47
49
  attr_node.remove
48
50
  next
49
51
  end
50
52
  end
51
53
  end
52
- if WhiteList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
53
- attr_node.value = attr_node.value.gsub(/url\s*\(\s*[^#\s][^)]+?\)/m, ' ') if attr_node.value
54
+ if SafeList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
55
+ attr_node.value = attr_node.value.gsub(/url\s*\(\s*[^#\s][^)]+?\)/m, " ") if attr_node.value
54
56
  end
55
- if WhiteList::SVG_ALLOW_LOCAL_HREF.include?(node.name) && attr_name == 'xlink:href' && attr_node.value =~ /^\s*[^#\s].*/m
57
+ if SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) && attr_name == "xlink:href" && attr_node.value =~ /^\s*[^#\s].*/m
56
58
  attr_node.remove
57
59
  next
58
60
  end
59
61
  end
60
62
 
61
- scrub_css_attribute node
63
+ scrub_css_attribute(node)
62
64
 
63
65
  node.attribute_nodes.each do |attr_node|
64
- node.remove_attribute(attr_node.name) if attr_node.value !~ /[^[:space:]]/
66
+ if attr_node.value !~ /[^[:space:]]/ && attr_node.name !~ DATA_ATTRIBUTE_NAME
67
+ node.remove_attribute(attr_node.name)
68
+ end
65
69
  end
66
70
 
67
- force_correct_attribute_escaping! node
71
+ force_correct_attribute_escaping!(node)
68
72
  end
69
73
 
70
- def scrub_css_attribute node
71
- style = node.attributes['style']
74
+ def scrub_css_attribute(node)
75
+ style = node.attributes["style"]
72
76
  style.value = scrub_css(style.value) if style
73
77
  end
74
78
 
75
- def scrub_css style
76
- style_tree = Crass.parse_properties style
79
+ def scrub_css(style)
80
+ style_tree = Crass.parse_properties(style)
77
81
  sanitized_tree = []
78
82
 
79
83
  style_tree.each do |node|
80
84
  next unless node[:node] == :property
81
85
  next if node[:children].any? do |child|
82
- [:url, :bad_url].include?(child[:node]) || (child[:node] == :function && !WhiteList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase))
86
+ [:url, :bad_url].include?(child[:node])
83
87
  end
88
+
84
89
  name = node[:name].downcase
85
- if WhiteList::ALLOWED_CSS_PROPERTIES.include?(name) || WhiteList::ALLOWED_SVG_PROPERTIES.include?(name)
86
- sanitized_tree << node << CRASS_SEMICOLON
87
- elsif WhiteList::SHORTHAND_CSS_PROPERTIES.include?(name.split('-').first)
88
- value = node[:value].split.map do |keyword|
89
- if WhiteList::ALLOWED_CSS_KEYWORDS.include?(keyword) || keyword =~ CSS_KEYWORDISH
90
+ next unless SafeList::ALLOWED_CSS_PROPERTIES.include?(name) ||
91
+ SafeList::ALLOWED_SVG_PROPERTIES.include?(name) ||
92
+ SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first)
93
+
94
+ value = node[:children].map do |child|
95
+ case child[:node]
96
+ when :whitespace
97
+ nil
98
+ when :string
99
+ if child[:raw] =~ CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES
100
+ Crass::Parser.stringify(child)
101
+ else
102
+ nil
103
+ end
104
+ when :function
105
+ if SafeList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase)
106
+ Crass::Parser.stringify(child)
107
+ end
108
+ when :ident
109
+ keyword = child[:value]
110
+ if !SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first) ||
111
+ SafeList::ALLOWED_CSS_KEYWORDS.include?(keyword) ||
112
+ (keyword =~ CSS_KEYWORDISH)
90
113
  keyword
91
114
  end
92
- end.compact
93
- unless value.empty?
94
- propstring = sprintf "%s:%s", name, value.join(" ")
95
- sanitized_node = Crass.parse_properties(propstring).first
96
- sanitized_tree << sanitized_node << CRASS_SEMICOLON
115
+ else
116
+ child[:raw]
97
117
  end
98
- end
118
+ end.compact
119
+
120
+ next if value.empty?
121
+ value << CSS_IMPORTANT if node[:important]
122
+ propstring = format("%s:%s", name, value.join(" "))
123
+ sanitized_node = Crass.parse_properties(propstring).first
124
+ sanitized_tree << sanitized_node << CRASS_SEMICOLON
99
125
  end
100
126
 
101
- Crass::Parser.stringify sanitized_tree
127
+ Crass::Parser.stringify(sanitized_tree)
102
128
  end
103
129
 
104
130
  #
@@ -106,7 +132,7 @@ module Loofah
106
132
  #
107
133
  # see comments about CVE-2018-8048 within the tests for more information
108
134
  #
109
- def force_correct_attribute_escaping! node
135
+ def force_correct_attribute_escaping!(node)
110
136
  return unless Nokogiri::VersionInfo.instance.libxml2?
111
137
 
112
138
  node.attribute_nodes.each do |attr_node|
@@ -122,11 +148,10 @@ module Loofah
122
148
  #
123
149
  encoding = attr_node.value.encoding
124
150
  attr_node.value = attr_node.value.gsub(/[ "]/) do |m|
125
- '%' + m.unpack('H2' * m.bytesize).join('%').upcase
151
+ "%" + m.unpack("H2" * m.bytesize).join("%").upcase
126
152
  end.force_encoding(encoding)
127
153
  end
128
154
  end
129
-
130
155
  end
131
156
  end
132
157
  end
@@ -1,3 +1,4 @@
1
+ # frozen_string_literal: true
1
2
  module Loofah
2
3
  #
3
4
  # Mixes +scrub!+ into Document, DocumentFragment, Node and NodeSet.
@@ -91,28 +92,33 @@ module Loofah
91
92
  # # decidedly not ok for browser:
92
93
  # frag.text(:encode_special_chars => false) # => "<script>alert('EVIL');</script>"
93
94
  #
94
- def text(options={})
95
- result = serialize_root.children.inner_text rescue ""
95
+ def text(options = {})
96
+ result = if serialize_root
97
+ serialize_root.children.reject(&:comment?).map(&:inner_text).join("")
98
+ else
99
+ ""
100
+ end
96
101
  if options[:encode_special_chars] == false
97
102
  result # possibly dangerous if rendered in a browser
98
103
  else
99
104
  encode_special_chars result
100
105
  end
101
106
  end
107
+
102
108
  alias :inner_text :text
103
- alias :to_str :text
109
+ alias :to_str :text
104
110
 
105
111
  #
106
112
  # Returns a plain-text version of the markup contained by the
107
113
  # fragment, with HTML entities encoded.
108
114
  #
109
- # This method is slower than #to_text, but is clever about
110
- # whitespace around block elements.
115
+ # This method is slower than #text, but is clever about
116
+ # whitespace around block elements and line break elements.
111
117
  #
112
- # Loofah.document("<h1>Title</h1><div>Content</div>").to_text
113
- # # => "\nTitle\n\nContent\n"
118
+ # Loofah.document("<h1>Title</h1><div>Content<br>Next line</div>").to_text
119
+ # # => "\nTitle\n\nContent\nNext line\n"
114
120
  #
115
- def to_text(options={})
121
+ def to_text(options = {})
116
122
  Loofah.remove_extraneous_whitespace self.dup.scrub!(:newline_block_elements).text(options)
117
123
  end
118
124
  end
@@ -1,6 +1,7 @@
1
+ # frozen_string_literal: true
1
2
  module Loofah
2
3
  module MetaHelpers # :nodoc:
3
- def self.add_downcased_set_members_to_all_set_constants mojule
4
+ def self.add_downcased_set_members_to_all_set_constants(mojule)
4
5
  mojule.constants.each do |constant_sym|
5
6
  constant = mojule.const_get constant_sym
6
7
  next unless Set === constant
@@ -1,8 +1,9 @@
1
+ # frozen_string_literal: true
1
2
  module Loofah
2
3
  #
3
4
  # A RuntimeError raised when Loofah could not find an appropriate scrubber.
4
5
  #
5
- class ScrubberNotFound < RuntimeError ; end
6
+ class ScrubberNotFound < RuntimeError; end
6
7
 
7
8
  #
8
9
  # A Scrubber wraps up a block (or method) that is run on an HTML node (element):
@@ -36,7 +37,7 @@ module Loofah
36
37
  CONTINUE = Object.new.freeze
37
38
 
38
39
  # Top-down Scrubbers may return STOP to indicate that the subtree should not be traversed.
39
- STOP = Object.new.freeze
40
+ STOP = Object.new.freeze
40
41
 
41
42
  # When a scrubber is initialized, the :direction may be specified
42
43
  # as :top_down (the default) or :bottom_up.
@@ -64,7 +65,7 @@ module Loofah
64
65
  def initialize(options = {}, &block)
65
66
  direction = options[:direction] || :top_down
66
67
  unless [:top_down, :bottom_up].include?(direction)
67
- raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
68
+ raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
68
69
  end
69
70
  @direction, @block = direction, block
70
71
  end
@@ -91,10 +92,10 @@ module Loofah
91
92
  # If the attribute is set, don't overwrite the existing value
92
93
  #
93
94
  def append_attribute(node, attribute, value)
94
- current_value = node.get_attribute(attribute) || ''
95
+ current_value = node.get_attribute(attribute) || ""
95
96
  current_values = current_value.split(/\s+/)
96
97
  updated_value = current_values | [value]
97
- node.set_attribute(attribute, updated_value.join(' '))
98
+ node.set_attribute(attribute, updated_value.join(" "))
98
99
  end
99
100
 
100
101
  private
@@ -118,11 +119,11 @@ module Loofah
118
119
  else
119
120
  return if scrub(node) == STOP
120
121
  end
121
- node.children.each {|j| traverse_conditionally_top_down(j)}
122
+ node.children.each { |j| traverse_conditionally_top_down(j) }
122
123
  end
123
124
 
124
125
  def traverse_conditionally_bottom_up(node)
125
- node.children.each {|j| traverse_conditionally_bottom_up(j)}
126
+ node.children.each { |j| traverse_conditionally_bottom_up(j) }
126
127
  if block
127
128
  block.call(node)
128
129
  else
@@ -1,7 +1,8 @@
1
+ # frozen_string_literal: true
1
2
  module Loofah
2
3
  #
3
4
  # Loofah provides some built-in scrubbers for sanitizing with
4
- # HTML5lib's whitelist and for accomplishing some common
5
+ # HTML5lib's safelist and for accomplishing some common
5
6
  # transformation tasks.
6
7
  #
7
8
  #
@@ -205,8 +206,8 @@ module Loofah
205
206
  end
206
207
 
207
208
  def scrub(node)
208
- return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == 'a')
209
- append_attribute(node, 'rel', 'nofollow')
209
+ return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
210
+ append_attribute(node, "rel", "nofollow")
210
211
  return STOP
211
212
  end
212
213
  end
@@ -226,8 +227,8 @@ module Loofah
226
227
  end
227
228
 
228
229
  def scrub(node)
229
- return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == 'a')
230
- append_attribute(node, 'rel', 'noopener')
230
+ return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
231
+ append_attribute(node, "rel", "noopener")
231
232
  return STOP
232
233
  end
233
234
  end
@@ -239,8 +240,13 @@ module Loofah
239
240
  end
240
241
 
241
242
  def scrub(node)
242
- return CONTINUE unless Loofah::Elements::BLOCK_LEVEL.include?(node.name)
243
- node.add_next_sibling Nokogiri::XML::Text.new("\n#{node.content}\n", node.document)
243
+ return CONTINUE unless Loofah::Elements::LINEBREAKERS.include?(node.name)
244
+ replacement = if Loofah::Elements::INLINE_LINE_BREAK.include?(node.name)
245
+ "\n"
246
+ else
247
+ "\n#{node.content}\n"
248
+ end
249
+ node.add_next_sibling Nokogiri::XML::Text.new(replacement, node.document)
244
250
  node.remove
245
251
  end
246
252
  end
@@ -267,7 +273,7 @@ module Loofah
267
273
 
268
274
  def scrub(node)
269
275
  if node.type == Nokogiri::XML::Node::TEXT_NODE || node.type == Nokogiri::XML::Node::CDATA_SECTION_NODE
270
- node.content = node.content.gsub(/\u2028|\u2029/, '')
276
+ node.content = node.content.gsub(/\u2028|\u2029/, "")
271
277
  end
272
278
  CONTINUE
273
279
  end
@@ -277,14 +283,14 @@ module Loofah
277
283
  # A hash that maps a symbol (like +:prune+) to the appropriate Scrubber (Loofah::Scrubbers::Prune).
278
284
  #
279
285
  MAP = {
280
- :escape => Escape,
281
- :prune => Prune,
286
+ :escape => Escape,
287
+ :prune => Prune,
282
288
  :whitewash => Whitewash,
283
- :strip => Strip,
284
- :nofollow => NoFollow,
289
+ :strip => Strip,
290
+ :nofollow => NoFollow,
285
291
  :noopener => NoOpener,
286
292
  :newline_block_elements => NewlineBlockElements,
287
- :unprintable => Unprintable
293
+ :unprintable => Unprintable,
288
294
  }
289
295
 
290
296
  #
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+ module Loofah
3
+ # The version of Loofah you are using
4
+ VERSION = "2.19.0"
5
+ end
@@ -1,3 +1,4 @@
1
+ # frozen_string_literal: true
1
2
  module Loofah
2
3
  module XML # :nodoc:
3
4
  #
@@ -1,3 +1,4 @@
1
+ # frozen_string_literal: true
1
2
  module Loofah
2
3
  module XML # :nodoc:
3
4
  #
@@ -12,7 +13,7 @@ module Loofah
12
13
  # constructor. Applications should use Loofah.fragment to
13
14
  # parse a fragment.
14
15
  #
15
- def parse tags
16
+ def parse(tags)
16
17
  doc = Loofah::XML::Document.new
17
18
  doc.encoding = tags.encoding.name if tags.respond_to?(:encoding)
18
19
  self.new(doc, tags)
data/lib/loofah.rb CHANGED
@@ -1,22 +1,24 @@
1
+ # frozen_string_literal: true
1
2
  $LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__))) unless $LOAD_PATH.include?(File.expand_path(File.dirname(__FILE__)))
2
3
 
3
- require 'nokogiri'
4
+ require "nokogiri"
4
5
 
5
- require 'loofah/metahelpers'
6
- require 'loofah/elements'
6
+ require_relative "loofah/version"
7
+ require_relative "loofah/metahelpers"
8
+ require_relative "loofah/elements"
7
9
 
8
- require 'loofah/html5/whitelist'
9
- require 'loofah/html5/libxml2_workarounds'
10
- require 'loofah/html5/scrub'
10
+ require_relative "loofah/html5/safelist"
11
+ require_relative "loofah/html5/libxml2_workarounds"
12
+ require_relative "loofah/html5/scrub"
11
13
 
12
- require 'loofah/scrubber'
13
- require 'loofah/scrubbers'
14
+ require_relative "loofah/scrubber"
15
+ require_relative "loofah/scrubbers"
14
16
 
15
- require 'loofah/instance_methods'
16
- require 'loofah/xml/document'
17
- require 'loofah/xml/document_fragment'
18
- require 'loofah/html/document'
19
- require 'loofah/html/document_fragment'
17
+ require_relative "loofah/instance_methods"
18
+ require_relative "loofah/xml/document"
19
+ require_relative "loofah/xml/document_fragment"
20
+ require_relative "loofah/html/document"
21
+ require_relative "loofah/html/document_fragment"
20
22
 
21
23
  # == Strings and IO Objects as Input
22
24
  #
@@ -27,14 +29,11 @@ require 'loofah/html/document_fragment'
27
29
  # quantities of docs.
28
30
  #
29
31
  module Loofah
30
- # The version of Loofah you are using
31
- VERSION = '2.2.3'
32
-
33
32
  class << self
34
33
  # Shortcut for Loofah::HTML::Document.parse
35
34
  # This method accepts the same parameters as Nokogiri::HTML::Document.parse
36
35
  def document(*args, &block)
37
- Loofah::HTML::Document.parse(*args, &block)
36
+ remove_comments_before_html_element Loofah::HTML::Document.parse(*args, &block)
38
37
  end
39
38
 
40
39
  # Shortcut for Loofah::HTML::DocumentFragment.parse
@@ -77,7 +76,25 @@ module Loofah
77
76
 
78
77
  # A helper to remove extraneous whitespace from text-ified HTML
79
78
  def remove_extraneous_whitespace(string)
80
- string.gsub(/\n\s*\n\s*\n/,"\n\n")
79
+ string.gsub(/\n\s*\n\s*\n/, "\n\n")
80
+ end
81
+
82
+ private
83
+
84
+ # remove comments that exist outside of the HTML element.
85
+ #
86
+ # these comments are allowed by the HTML spec:
87
+ #
88
+ # https://www.w3.org/TR/html401/struct/global.html#h-7.1
89
+ #
90
+ # but are not scrubbed by Loofah because these nodes don't meet
91
+ # the contract that scrubbers expect of a node (e.g., it can be
92
+ # replaced, sibling and children nodes can be created).
93
+ def remove_comments_before_html_element(doc)
94
+ doc.children.each do |child|
95
+ child.unlink if child.comment?
96
+ end
97
+ doc
81
98
  end
82
99
  end
83
100
  end