loofah 2.3.1 → 2.19.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +193 -40
  3. data/README.md +13 -12
  4. data/lib/loofah/elements.rb +79 -75
  5. data/lib/loofah/helpers.rb +5 -4
  6. data/lib/loofah/html/document.rb +1 -0
  7. data/lib/loofah/html/document_fragment.rb +4 -2
  8. data/lib/loofah/html5/libxml2_workarounds.rb +8 -7
  9. data/lib/loofah/html5/safelist.rb +273 -27
  10. data/lib/loofah/html5/scrub.rb +147 -52
  11. data/lib/loofah/instance_methods.rb +14 -8
  12. data/lib/loofah/metahelpers.rb +2 -1
  13. data/lib/loofah/scrubber.rb +12 -7
  14. data/lib/loofah/scrubbers.rb +20 -18
  15. data/lib/loofah/version.rb +5 -0
  16. data/lib/loofah/xml/document.rb +1 -0
  17. data/lib/loofah/xml/document_fragment.rb +2 -1
  18. data/lib/loofah.rb +33 -16
  19. metadata +45 -125
  20. data/.gemtest +0 -0
  21. data/Gemfile +0 -22
  22. data/Manifest.txt +0 -41
  23. data/Rakefile +0 -81
  24. data/benchmark/benchmark.rb +0 -149
  25. data/benchmark/fragment.html +0 -96
  26. data/benchmark/helper.rb +0 -73
  27. data/benchmark/www.slashdot.com.html +0 -2560
  28. data/test/assets/msword.html +0 -63
  29. data/test/assets/testdata_sanitizer_tests1.dat +0 -502
  30. data/test/helper.rb +0 -18
  31. data/test/html5/test_sanitizer.rb +0 -401
  32. data/test/html5/test_scrub.rb +0 -10
  33. data/test/integration/test_ad_hoc.rb +0 -220
  34. data/test/integration/test_helpers.rb +0 -43
  35. data/test/integration/test_html.rb +0 -72
  36. data/test/integration/test_scrubbers.rb +0 -400
  37. data/test/integration/test_xml.rb +0 -55
  38. data/test/unit/test_api.rb +0 -142
  39. data/test/unit/test_encoding.rb +0 -20
  40. data/test/unit/test_helpers.rb +0 -62
  41. data/test/unit/test_scrubber.rb +0 -229
  42. data/test/unit/test_scrubbers.rb +0 -14
@@ -1,30 +1,32 @@
1
- require 'cgi'
2
- require 'crass'
1
+ # frozen_string_literal: true
2
+ require "cgi"
3
+ require "crass"
3
4
 
4
5
  module Loofah
5
6
  module HTML5 # :nodoc:
6
7
  module Scrub
7
-
8
8
  CONTROL_CHARACTERS = /[`\u0000-\u0020\u007f\u0080-\u0101]/
9
- CSS_KEYWORDISH = /\A(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,3}\.?\d{0,10}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
10
- CRASS_SEMICOLON = {:node => :semicolon, :raw => ";"}
9
+ CSS_KEYWORDISH = /\A(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,3}\.?\d{0,10}(ch|cm|r?em|ex|in|lh|mm|pc|pt|px|Q|vmax|vmin|vw|vh|%|,|\))?)\z/
10
+ CRASS_SEMICOLON = { node: :semicolon, raw: ";" }
11
+ CSS_IMPORTANT = '!important'
12
+ CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES = /\A(["'])?[^"']+\1\z/
13
+ DATA_ATTRIBUTE_NAME = /\Adata-[\w-]+\z/
11
14
 
12
15
  class << self
13
-
14
- def allowed_element? element_name
15
- ::Loofah::HTML5::SafeList::ALLOWED_ELEMENTS_WITH_LIBXML2.include? element_name
16
+ def allowed_element?(element_name)
17
+ ::Loofah::HTML5::SafeList::ALLOWED_ELEMENTS_WITH_LIBXML2.include?(element_name)
16
18
  end
17
19
 
18
20
  # alternative implementation of the html5lib attribute scrubbing algorithm
19
- def scrub_attributes node
21
+ def scrub_attributes(node)
20
22
  node.attribute_nodes.each do |attr_node|
21
23
  attr_name = if attr_node.namespace
22
- "#{attr_node.namespace.prefix}:#{attr_node.node_name}"
23
- else
24
- attr_node.node_name
25
- end
24
+ "#{attr_node.namespace.prefix}:#{attr_node.node_name}"
25
+ else
26
+ attr_node.node_name
27
+ end
26
28
 
27
- if attr_name =~ /\Adata-[\w-]+\z/
29
+ if attr_name =~ DATA_ATTRIBUTE_NAME
28
30
  next
29
31
  end
30
32
 
@@ -34,71 +36,125 @@ module Loofah
34
36
  end
35
37
 
36
38
  if SafeList::ATTR_VAL_IS_URI.include?(attr_name)
37
- # this block lifted nearly verbatim from HTML5 sanitization
38
- val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS,'').downcase
39
- if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && ! SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
40
- attr_node.remove
41
- next
42
- elsif val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0] == 'data'
43
- # permit only allowed data mediatypes
44
- mediatype = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[1]
45
- mediatype, _ = mediatype.split(';')[0..1] if mediatype
46
- if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
47
- attr_node.remove
48
- next
49
- end
50
- end
39
+ next if scrub_uri_attribute(attr_node)
51
40
  end
41
+
52
42
  if SafeList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
53
- attr_node.value = attr_node.value.gsub(/url\s*\(\s*[^#\s][^)]+?\)/m, ' ') if attr_node.value
43
+ scrub_attribute_that_allows_local_ref(attr_node)
54
44
  end
55
- if SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) && attr_name == 'xlink:href' && attr_node.value =~ /^\s*[^#\s].*/m
45
+
46
+ if SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) && attr_name == "xlink:href" && attr_node.value =~ /^\s*[^#\s].*/m
56
47
  attr_node.remove
57
48
  next
58
49
  end
59
50
  end
60
51
 
61
- scrub_css_attribute node
52
+ scrub_css_attribute(node)
62
53
 
63
54
  node.attribute_nodes.each do |attr_node|
64
- node.remove_attribute(attr_node.name) if attr_node.value !~ /[^[:space:]]/
55
+ if attr_node.value !~ /[^[:space:]]/ && attr_node.name !~ DATA_ATTRIBUTE_NAME
56
+ node.remove_attribute(attr_node.name)
57
+ end
65
58
  end
66
59
 
67
- force_correct_attribute_escaping! node
60
+ force_correct_attribute_escaping!(node)
68
61
  end
69
62
 
70
- def scrub_css_attribute node
71
- style = node.attributes['style']
63
+ def scrub_css_attribute(node)
64
+ style = node.attributes["style"]
72
65
  style.value = scrub_css(style.value) if style
73
66
  end
74
67
 
75
- def scrub_css style
76
- style_tree = Crass.parse_properties style
68
+ def scrub_css(style)
69
+ style_tree = Crass.parse_properties(style)
77
70
  sanitized_tree = []
78
71
 
79
72
  style_tree.each do |node|
80
73
  next unless node[:node] == :property
81
74
  next if node[:children].any? do |child|
82
- [:url, :bad_url].include?(child[:node]) || (child[:node] == :function && !SafeList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase))
75
+ [:url, :bad_url].include?(child[:node])
83
76
  end
77
+
84
78
  name = node[:name].downcase
85
- if SafeList::ALLOWED_CSS_PROPERTIES.include?(name) || SafeList::ALLOWED_SVG_PROPERTIES.include?(name)
86
- sanitized_tree << node << CRASS_SEMICOLON
87
- elsif SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split('-').first)
88
- value = node[:value].split.map do |keyword|
89
- if SafeList::ALLOWED_CSS_KEYWORDS.include?(keyword) || keyword =~ CSS_KEYWORDISH
79
+ next unless SafeList::ALLOWED_CSS_PROPERTIES.include?(name) ||
80
+ SafeList::ALLOWED_SVG_PROPERTIES.include?(name) ||
81
+ SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first)
82
+
83
+ value = node[:children].map do |child|
84
+ case child[:node]
85
+ when :whitespace
86
+ nil
87
+ when :string
88
+ if child[:raw] =~ CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES
89
+ Crass::Parser.stringify(child)
90
+ else
91
+ nil
92
+ end
93
+ when :function
94
+ if SafeList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase)
95
+ Crass::Parser.stringify(child)
96
+ end
97
+ when :ident
98
+ keyword = child[:value]
99
+ if !SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first) ||
100
+ SafeList::ALLOWED_CSS_KEYWORDS.include?(keyword) ||
101
+ (keyword =~ CSS_KEYWORDISH)
90
102
  keyword
91
103
  end
92
- end.compact
93
- unless value.empty?
94
- propstring = sprintf "%s:%s", name, value.join(" ")
95
- sanitized_node = Crass.parse_properties(propstring).first
96
- sanitized_tree << sanitized_node << CRASS_SEMICOLON
104
+ else
105
+ child[:raw]
97
106
  end
98
- end
107
+ end.compact
108
+
109
+ next if value.empty?
110
+ value << CSS_IMPORTANT if node[:important]
111
+ propstring = format("%s:%s", name, value.join(" "))
112
+ sanitized_node = Crass.parse_properties(propstring).first
113
+ sanitized_tree << sanitized_node << CRASS_SEMICOLON
99
114
  end
100
115
 
101
- Crass::Parser.stringify sanitized_tree
116
+ Crass::Parser.stringify(sanitized_tree)
117
+ end
118
+
119
+ def scrub_attribute_that_allows_local_ref(attr_node)
120
+ return unless attr_node.value
121
+
122
+ nodes = Crass::Parser.new(attr_node.value).parse_component_values
123
+
124
+ values = nodes.map do |node|
125
+ case node[:node]
126
+ when :url
127
+ if node[:value].start_with?("#")
128
+ node[:raw]
129
+ else
130
+ nil
131
+ end
132
+ when :hash, :ident, :string
133
+ node[:raw]
134
+ else
135
+ nil
136
+ end
137
+ end.compact
138
+
139
+ attr_node.value = values.join(" ")
140
+ end
141
+
142
+ def scrub_uri_attribute(attr_node)
143
+ # this block lifted nearly verbatim from HTML5 sanitization
144
+ val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS, "").downcase
145
+ if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && !SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
146
+ attr_node.remove
147
+ return true
148
+ elsif val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0] == "data"
149
+ # permit only allowed data mediatypes
150
+ mediatype = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[1]
151
+ mediatype, _ = mediatype.split(";")[0..1] if mediatype
152
+ if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
153
+ attr_node.remove
154
+ return true
155
+ end
156
+ end
157
+ false
102
158
  end
103
159
 
104
160
  #
@@ -106,7 +162,7 @@ module Loofah
106
162
  #
107
163
  # see comments about CVE-2018-8048 within the tests for more information
108
164
  #
109
- def force_correct_attribute_escaping! node
165
+ def force_correct_attribute_escaping!(node)
110
166
  return unless Nokogiri::VersionInfo.instance.libxml2?
111
167
 
112
168
  node.attribute_nodes.each do |attr_node|
@@ -122,11 +178,50 @@ module Loofah
122
178
  #
123
179
  encoding = attr_node.value.encoding
124
180
  attr_node.value = attr_node.value.gsub(/[ "]/) do |m|
125
- '%' + m.unpack('H2' * m.bytesize).join('%').upcase
181
+ "%" + m.unpack("H2" * m.bytesize).join("%").upcase
126
182
  end.force_encoding(encoding)
127
183
  end
128
184
  end
129
185
 
186
+ def cdata_needs_escaping?(node)
187
+ # Nokogiri's HTML4 parser on JRuby doesn't flag the child of a `style` or `script` tag as cdata, but it acts that way
188
+ node.cdata? || (Nokogiri.jruby? && node.text? && (node.parent.name == "style" || node.parent.name == "script"))
189
+ end
190
+
191
+ def cdata_escape(node)
192
+ escaped_text = escape_tags(node.text)
193
+ if Nokogiri.jruby?
194
+ node.document.create_text_node(escaped_text)
195
+ else
196
+ node.document.create_cdata(escaped_text)
197
+ end
198
+ end
199
+
200
+ TABLE_FOR_ESCAPE_HTML__ = {
201
+ '<' => '&lt;',
202
+ '>' => '&gt;',
203
+ '&' => '&amp;',
204
+ }
205
+
206
+ def escape_tags(string)
207
+ # modified version of CGI.escapeHTML from ruby 3.1
208
+ enc = string.encoding
209
+ unless enc.ascii_compatible?
210
+ if enc.dummy?
211
+ origenc = enc
212
+ enc = Encoding::Converter.asciicompat_encoding(enc)
213
+ string = enc ? string.encode(enc) : string.b
214
+ end
215
+ table = Hash[TABLE_FOR_ESCAPE_HTML__.map {|pair|pair.map {|s|s.encode(enc)}}]
216
+ string = string.gsub(/#{"[<>&]".encode(enc)}/, table)
217
+ string.encode!(origenc) if origenc
218
+ string
219
+ else
220
+ string = string.b
221
+ string.gsub!(/[<>&]/, TABLE_FOR_ESCAPE_HTML__)
222
+ string.force_encoding(enc)
223
+ end
224
+ end
130
225
  end
131
226
  end
132
227
  end
@@ -1,3 +1,4 @@
1
+ # frozen_string_literal: true
1
2
  module Loofah
2
3
  #
3
4
  # Mixes +scrub!+ into Document, DocumentFragment, Node and NodeSet.
@@ -91,28 +92,33 @@ module Loofah
91
92
  # # decidedly not ok for browser:
92
93
  # frag.text(:encode_special_chars => false) # => "<script>alert('EVIL');</script>"
93
94
  #
94
- def text(options={})
95
- result = serialize_root.children.inner_text rescue ""
95
+ def text(options = {})
96
+ result = if serialize_root
97
+ serialize_root.children.reject(&:comment?).map(&:inner_text).join("")
98
+ else
99
+ ""
100
+ end
96
101
  if options[:encode_special_chars] == false
97
102
  result # possibly dangerous if rendered in a browser
98
103
  else
99
104
  encode_special_chars result
100
105
  end
101
106
  end
107
+
102
108
  alias :inner_text :text
103
- alias :to_str :text
109
+ alias :to_str :text
104
110
 
105
111
  #
106
112
  # Returns a plain-text version of the markup contained by the
107
113
  # fragment, with HTML entities encoded.
108
114
  #
109
- # This method is slower than #to_text, but is clever about
110
- # whitespace around block elements.
115
+ # This method is slower than #text, but is clever about
116
+ # whitespace around block elements and line break elements.
111
117
  #
112
- # Loofah.document("<h1>Title</h1><div>Content</div>").to_text
113
- # # => "\nTitle\n\nContent\n"
118
+ # Loofah.document("<h1>Title</h1><div>Content<br>Next line</div>").to_text
119
+ # # => "\nTitle\n\nContent\nNext line\n"
114
120
  #
115
- def to_text(options={})
121
+ def to_text(options = {})
116
122
  Loofah.remove_extraneous_whitespace self.dup.scrub!(:newline_block_elements).text(options)
117
123
  end
118
124
  end
@@ -1,6 +1,7 @@
1
+ # frozen_string_literal: true
1
2
  module Loofah
2
3
  module MetaHelpers # :nodoc:
3
- def self.add_downcased_set_members_to_all_set_constants mojule
4
+ def self.add_downcased_set_members_to_all_set_constants(mojule)
4
5
  mojule.constants.each do |constant_sym|
5
6
  constant = mojule.const_get constant_sym
6
7
  next unless Set === constant
@@ -1,8 +1,9 @@
1
+ # frozen_string_literal: true
1
2
  module Loofah
2
3
  #
3
4
  # A RuntimeError raised when Loofah could not find an appropriate scrubber.
4
5
  #
5
- class ScrubberNotFound < RuntimeError ; end
6
+ class ScrubberNotFound < RuntimeError; end
6
7
 
7
8
  #
8
9
  # A Scrubber wraps up a block (or method) that is run on an HTML node (element):
@@ -36,7 +37,7 @@ module Loofah
36
37
  CONTINUE = Object.new.freeze
37
38
 
38
39
  # Top-down Scrubbers may return STOP to indicate that the subtree should not be traversed.
39
- STOP = Object.new.freeze
40
+ STOP = Object.new.freeze
40
41
 
41
42
  # When a scrubber is initialized, the :direction may be specified
42
43
  # as :top_down (the default) or :bottom_up.
@@ -64,7 +65,7 @@ module Loofah
64
65
  def initialize(options = {}, &block)
65
66
  direction = options[:direction] || :top_down
66
67
  unless [:top_down, :bottom_up].include?(direction)
67
- raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
68
+ raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
68
69
  end
69
70
  @direction, @block = direction, block
70
71
  end
@@ -91,10 +92,10 @@ module Loofah
91
92
  # If the attribute is set, don't overwrite the existing value
92
93
  #
93
94
  def append_attribute(node, attribute, value)
94
- current_value = node.get_attribute(attribute) || ''
95
+ current_value = node.get_attribute(attribute) || ""
95
96
  current_values = current_value.split(/\s+/)
96
97
  updated_value = current_values | [value]
97
- node.set_attribute(attribute, updated_value.join(' '))
98
+ node.set_attribute(attribute, updated_value.join(" "))
98
99
  end
99
100
 
100
101
  private
@@ -107,6 +108,10 @@ module Loofah
107
108
  return Scrubber::CONTINUE
108
109
  end
109
110
  when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE
111
+ if HTML5::Scrub.cdata_needs_escaping?(node)
112
+ node.before(HTML5::Scrub.cdata_escape(node))
113
+ return Scrubber::STOP
114
+ end
110
115
  return Scrubber::CONTINUE
111
116
  end
112
117
  Scrubber::STOP
@@ -118,11 +123,11 @@ module Loofah
118
123
  else
119
124
  return if scrub(node) == STOP
120
125
  end
121
- node.children.each {|j| traverse_conditionally_top_down(j)}
126
+ node.children.each { |j| traverse_conditionally_top_down(j) }
122
127
  end
123
128
 
124
129
  def traverse_conditionally_bottom_up(node)
125
- node.children.each {|j| traverse_conditionally_bottom_up(j)}
130
+ node.children.each { |j| traverse_conditionally_bottom_up(j) }
126
131
  if block
127
132
  block.call(node)
128
133
  else
@@ -1,3 +1,4 @@
1
+ # frozen_string_literal: true
1
2
  module Loofah
2
3
  #
3
4
  # Loofah provides some built-in scrubbers for sanitizing with
@@ -99,13 +100,9 @@ module Loofah
99
100
 
100
101
  def scrub(node)
101
102
  return CONTINUE if html5lib_sanitize(node) == CONTINUE
102
- if node.children.length == 1 && node.children.first.cdata?
103
- sanitized_text = Loofah.fragment(node.children.first.to_html).scrub!(:strip).to_html
104
- node.before Nokogiri::XML::Text.new(sanitized_text, node.document)
105
- else
106
- node.before node.children
107
- end
103
+ node.before(node.children)
108
104
  node.remove
105
+ return STOP
109
106
  end
110
107
  end
111
108
 
@@ -205,8 +202,8 @@ module Loofah
205
202
  end
206
203
 
207
204
  def scrub(node)
208
- return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == 'a')
209
- append_attribute(node, 'rel', 'nofollow')
205
+ return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
206
+ append_attribute(node, "rel", "nofollow")
210
207
  return STOP
211
208
  end
212
209
  end
@@ -226,8 +223,8 @@ module Loofah
226
223
  end
227
224
 
228
225
  def scrub(node)
229
- return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == 'a')
230
- append_attribute(node, 'rel', 'noopener')
226
+ return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
227
+ append_attribute(node, "rel", "noopener")
231
228
  return STOP
232
229
  end
233
230
  end
@@ -239,8 +236,13 @@ module Loofah
239
236
  end
240
237
 
241
238
  def scrub(node)
242
- return CONTINUE unless Loofah::Elements::BLOCK_LEVEL.include?(node.name)
243
- node.add_next_sibling Nokogiri::XML::Text.new("\n#{node.content}\n", node.document)
239
+ return CONTINUE unless Loofah::Elements::LINEBREAKERS.include?(node.name)
240
+ replacement = if Loofah::Elements::INLINE_LINE_BREAK.include?(node.name)
241
+ "\n"
242
+ else
243
+ "\n#{node.content}\n"
244
+ end
245
+ node.add_next_sibling Nokogiri::XML::Text.new(replacement, node.document)
244
246
  node.remove
245
247
  end
246
248
  end
@@ -267,7 +269,7 @@ module Loofah
267
269
 
268
270
  def scrub(node)
269
271
  if node.type == Nokogiri::XML::Node::TEXT_NODE || node.type == Nokogiri::XML::Node::CDATA_SECTION_NODE
270
- node.content = node.content.gsub(/\u2028|\u2029/, '')
272
+ node.content = node.content.gsub(/\u2028|\u2029/, "")
271
273
  end
272
274
  CONTINUE
273
275
  end
@@ -277,14 +279,14 @@ module Loofah
277
279
  # A hash that maps a symbol (like +:prune+) to the appropriate Scrubber (Loofah::Scrubbers::Prune).
278
280
  #
279
281
  MAP = {
280
- :escape => Escape,
281
- :prune => Prune,
282
+ :escape => Escape,
283
+ :prune => Prune,
282
284
  :whitewash => Whitewash,
283
- :strip => Strip,
284
- :nofollow => NoFollow,
285
+ :strip => Strip,
286
+ :nofollow => NoFollow,
285
287
  :noopener => NoOpener,
286
288
  :newline_block_elements => NewlineBlockElements,
287
- :unprintable => Unprintable
289
+ :unprintable => Unprintable,
288
290
  }
289
291
 
290
292
  #
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+ module Loofah
3
+ # The version of Loofah you are using
4
+ VERSION = "2.19.1"
5
+ end
@@ -1,3 +1,4 @@
1
+ # frozen_string_literal: true
1
2
  module Loofah
2
3
  module XML # :nodoc:
3
4
  #
@@ -1,3 +1,4 @@
1
+ # frozen_string_literal: true
1
2
  module Loofah
2
3
  module XML # :nodoc:
3
4
  #
@@ -12,7 +13,7 @@ module Loofah
12
13
  # constructor. Applications should use Loofah.fragment to
13
14
  # parse a fragment.
14
15
  #
15
- def parse tags
16
+ def parse(tags)
16
17
  doc = Loofah::XML::Document.new
17
18
  doc.encoding = tags.encoding.name if tags.respond_to?(:encoding)
18
19
  self.new(doc, tags)
data/lib/loofah.rb CHANGED
@@ -1,22 +1,24 @@
1
+ # frozen_string_literal: true
1
2
  $LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__))) unless $LOAD_PATH.include?(File.expand_path(File.dirname(__FILE__)))
2
3
 
3
4
  require "nokogiri"
4
5
 
5
- require "loofah/metahelpers"
6
- require "loofah/elements"
6
+ require_relative "loofah/version"
7
+ require_relative "loofah/metahelpers"
8
+ require_relative "loofah/elements"
7
9
 
8
- require "loofah/html5/safelist"
9
- require "loofah/html5/libxml2_workarounds"
10
- require "loofah/html5/scrub"
10
+ require_relative "loofah/html5/safelist"
11
+ require_relative "loofah/html5/libxml2_workarounds"
12
+ require_relative "loofah/html5/scrub"
11
13
 
12
- require "loofah/scrubber"
13
- require "loofah/scrubbers"
14
+ require_relative "loofah/scrubber"
15
+ require_relative "loofah/scrubbers"
14
16
 
15
- require "loofah/instance_methods"
16
- require "loofah/xml/document"
17
- require "loofah/xml/document_fragment"
18
- require "loofah/html/document"
19
- require "loofah/html/document_fragment"
17
+ require_relative "loofah/instance_methods"
18
+ require_relative "loofah/xml/document"
19
+ require_relative "loofah/xml/document_fragment"
20
+ require_relative "loofah/html/document"
21
+ require_relative "loofah/html/document_fragment"
20
22
 
21
23
  # == Strings and IO Objects as Input
22
24
  #
@@ -27,14 +29,11 @@ require "loofah/html/document_fragment"
27
29
  # quantities of docs.
28
30
  #
29
31
  module Loofah
30
- # The version of Loofah you are using
31
- VERSION = "2.3.1"
32
-
33
32
  class << self
34
33
  # Shortcut for Loofah::HTML::Document.parse
35
34
  # This method accepts the same parameters as Nokogiri::HTML::Document.parse
36
35
  def document(*args, &block)
37
- Loofah::HTML::Document.parse(*args, &block)
36
+ remove_comments_before_html_element Loofah::HTML::Document.parse(*args, &block)
38
37
  end
39
38
 
40
39
  # Shortcut for Loofah::HTML::DocumentFragment.parse
@@ -79,5 +78,23 @@ module Loofah
79
78
  def remove_extraneous_whitespace(string)
80
79
  string.gsub(/\n\s*\n\s*\n/, "\n\n")
81
80
  end
81
+
82
+ private
83
+
84
+ # remove comments that exist outside of the HTML element.
85
+ #
86
+ # these comments are allowed by the HTML spec:
87
+ #
88
+ # https://www.w3.org/TR/html401/struct/global.html#h-7.1
89
+ #
90
+ # but are not scrubbed by Loofah because these nodes don't meet
91
+ # the contract that scrubbers expect of a node (e.g., it can be
92
+ # replaced, sibling and children nodes can be created).
93
+ def remove_comments_before_html_element(doc)
94
+ doc.children.each do |child|
95
+ child.unlink if child.comment?
96
+ end
97
+ doc
98
+ end
82
99
  end
83
100
  end