loofah 2.3.1 → 2.19.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +193 -40
  3. data/README.md +13 -12
  4. data/lib/loofah/elements.rb +79 -75
  5. data/lib/loofah/helpers.rb +5 -4
  6. data/lib/loofah/html/document.rb +1 -0
  7. data/lib/loofah/html/document_fragment.rb +4 -2
  8. data/lib/loofah/html5/libxml2_workarounds.rb +8 -7
  9. data/lib/loofah/html5/safelist.rb +273 -27
  10. data/lib/loofah/html5/scrub.rb +147 -52
  11. data/lib/loofah/instance_methods.rb +14 -8
  12. data/lib/loofah/metahelpers.rb +2 -1
  13. data/lib/loofah/scrubber.rb +12 -7
  14. data/lib/loofah/scrubbers.rb +20 -18
  15. data/lib/loofah/version.rb +5 -0
  16. data/lib/loofah/xml/document.rb +1 -0
  17. data/lib/loofah/xml/document_fragment.rb +2 -1
  18. data/lib/loofah.rb +33 -16
  19. metadata +45 -125
  20. data/.gemtest +0 -0
  21. data/Gemfile +0 -22
  22. data/Manifest.txt +0 -41
  23. data/Rakefile +0 -81
  24. data/benchmark/benchmark.rb +0 -149
  25. data/benchmark/fragment.html +0 -96
  26. data/benchmark/helper.rb +0 -73
  27. data/benchmark/www.slashdot.com.html +0 -2560
  28. data/test/assets/msword.html +0 -63
  29. data/test/assets/testdata_sanitizer_tests1.dat +0 -502
  30. data/test/helper.rb +0 -18
  31. data/test/html5/test_sanitizer.rb +0 -401
  32. data/test/html5/test_scrub.rb +0 -10
  33. data/test/integration/test_ad_hoc.rb +0 -220
  34. data/test/integration/test_helpers.rb +0 -43
  35. data/test/integration/test_html.rb +0 -72
  36. data/test/integration/test_scrubbers.rb +0 -400
  37. data/test/integration/test_xml.rb +0 -55
  38. data/test/unit/test_api.rb +0 -142
  39. data/test/unit/test_encoding.rb +0 -20
  40. data/test/unit/test_helpers.rb +0 -62
  41. data/test/unit/test_scrubber.rb +0 -229
  42. data/test/unit/test_scrubbers.rb +0 -14
data/lib/loofah/html5/scrub.rb CHANGED
@@ -1,30 +1,32 @@
-require 'cgi'
-require 'crass'
+# frozen_string_literal: true
+require "cgi"
+require "crass"

 module Loofah
   module HTML5 # :nodoc:
     module Scrub
-
       CONTROL_CHARACTERS = /[`\u0000-\u0020\u007f\u0080-\u0101]/
-      CSS_KEYWORDISH = /\A(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,3}\.?\d{0,10}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
-      CRASS_SEMICOLON = {:node => :semicolon, :raw => ";"}
+      CSS_KEYWORDISH = /\A(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,3}\.?\d{0,10}(ch|cm|r?em|ex|in|lh|mm|pc|pt|px|Q|vmax|vmin|vw|vh|%|,|\))?)\z/
+      CRASS_SEMICOLON = { node: :semicolon, raw: ";" }
+      CSS_IMPORTANT = '!important'
+      CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES = /\A(["'])?[^"']+\1\z/
+      DATA_ATTRIBUTE_NAME = /\Adata-[\w-]+\z/

       class << self
-
-        def allowed_element? element_name
-          ::Loofah::HTML5::SafeList::ALLOWED_ELEMENTS_WITH_LIBXML2.include? element_name
+        def allowed_element?(element_name)
+          ::Loofah::HTML5::SafeList::ALLOWED_ELEMENTS_WITH_LIBXML2.include?(element_name)
         end

         # alternative implementation of the html5lib attribute scrubbing algorithm
-        def scrub_attributes node
+        def scrub_attributes(node)
           node.attribute_nodes.each do |attr_node|
             attr_name = if attr_node.namespace
-                          "#{attr_node.namespace.prefix}:#{attr_node.node_name}"
-                        else
-                          attr_node.node_name
-                        end
+              "#{attr_node.namespace.prefix}:#{attr_node.node_name}"
+            else
+              attr_node.node_name
+            end

-            if attr_name =~ /\Adata-[\w-]+\z/
+            if attr_name =~ DATA_ATTRIBUTE_NAME
               next
             end

@@ -34,71 +36,125 @@ module Loofah
             end

             if SafeList::ATTR_VAL_IS_URI.include?(attr_name)
-              # this block lifted nearly verbatim from HTML5 sanitization
-              val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS,'').downcase
-              if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && ! SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
-                attr_node.remove
-                next
-              elsif val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0] == 'data'
-                # permit only allowed data mediatypes
-                mediatype = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[1]
-                mediatype, _ = mediatype.split(';')[0..1] if mediatype
-                if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
-                  attr_node.remove
-                  next
-                end
-              end
+              next if scrub_uri_attribute(attr_node)
             end
+
             if SafeList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
-              attr_node.value = attr_node.value.gsub(/url\s*\(\s*[^#\s][^)]+?\)/m, ' ') if attr_node.value
+              scrub_attribute_that_allows_local_ref(attr_node)
             end
-            if SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) && attr_name == 'xlink:href' && attr_node.value =~ /^\s*[^#\s].*/m
+
+            if SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) && attr_name == "xlink:href" && attr_node.value =~ /^\s*[^#\s].*/m
               attr_node.remove
               next
             end
           end

-          scrub_css_attribute node
+          scrub_css_attribute(node)

           node.attribute_nodes.each do |attr_node|
-            node.remove_attribute(attr_node.name) if attr_node.value !~ /[^[:space:]]/
+            if attr_node.value !~ /[^[:space:]]/ && attr_node.name !~ DATA_ATTRIBUTE_NAME
+              node.remove_attribute(attr_node.name)
+            end
           end

-          force_correct_attribute_escaping! node
+          force_correct_attribute_escaping!(node)
         end

-        def scrub_css_attribute node
-          style = node.attributes['style']
+        def scrub_css_attribute(node)
+          style = node.attributes["style"]
           style.value = scrub_css(style.value) if style
         end

-        def scrub_css style
-          style_tree = Crass.parse_properties style
+        def scrub_css(style)
+          style_tree = Crass.parse_properties(style)
           sanitized_tree = []

           style_tree.each do |node|
             next unless node[:node] == :property
             next if node[:children].any? do |child|
-              [:url, :bad_url].include?(child[:node]) || (child[:node] == :function && !SafeList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase))
+              [:url, :bad_url].include?(child[:node])
             end
+
             name = node[:name].downcase
-            if SafeList::ALLOWED_CSS_PROPERTIES.include?(name) || SafeList::ALLOWED_SVG_PROPERTIES.include?(name)
-              sanitized_tree << node << CRASS_SEMICOLON
-            elsif SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split('-').first)
-              value = node[:value].split.map do |keyword|
-                if SafeList::ALLOWED_CSS_KEYWORDS.include?(keyword) || keyword =~ CSS_KEYWORDISH
+            next unless SafeList::ALLOWED_CSS_PROPERTIES.include?(name) ||
+                SafeList::ALLOWED_SVG_PROPERTIES.include?(name) ||
+                SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first)
+
+            value = node[:children].map do |child|
+              case child[:node]
+              when :whitespace
+                nil
+              when :string
+                if child[:raw] =~ CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES
+                  Crass::Parser.stringify(child)
+                else
+                  nil
+                end
+              when :function
+                if SafeList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase)
+                  Crass::Parser.stringify(child)
+                end
+              when :ident
+                keyword = child[:value]
+                if !SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first) ||
+                    SafeList::ALLOWED_CSS_KEYWORDS.include?(keyword) ||
+                    (keyword =~ CSS_KEYWORDISH)
                   keyword
                 end
-              end.compact
-              unless value.empty?
-                propstring = sprintf "%s:%s", name, value.join(" ")
-                sanitized_node = Crass.parse_properties(propstring).first
-                sanitized_tree << sanitized_node << CRASS_SEMICOLON
+              else
+                child[:raw]
               end
-            end
+            end.compact
+
+            next if value.empty?
+            value << CSS_IMPORTANT if node[:important]
+            propstring = format("%s:%s", name, value.join(" "))
+            sanitized_node = Crass.parse_properties(propstring).first
+            sanitized_tree << sanitized_node << CRASS_SEMICOLON
           end

-          Crass::Parser.stringify sanitized_tree
+          Crass::Parser.stringify(sanitized_tree)
+        end
+
+        def scrub_attribute_that_allows_local_ref(attr_node)
+          return unless attr_node.value
+
+          nodes = Crass::Parser.new(attr_node.value).parse_component_values
+
+          values = nodes.map do |node|
+            case node[:node]
+            when :url
+              if node[:value].start_with?("#")
+                node[:raw]
+              else
+                nil
+              end
+            when :hash, :ident, :string
+              node[:raw]
+            else
+              nil
+            end
+          end.compact
+
+          attr_node.value = values.join(" ")
+        end
+
+        def scrub_uri_attribute(attr_node)
+          # this block lifted nearly verbatim from HTML5 sanitization
+          val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS, "").downcase
+          if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && !SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
+            attr_node.remove
+            return true
+          elsif val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0] == "data"
+            # permit only allowed data mediatypes
+            mediatype = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[1]
+            mediatype, _ = mediatype.split(";")[0..1] if mediatype
+            if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
+              attr_node.remove
+              return true
+            end
+          end
+          false
         end

         #
@@ -106,7 +162,7 @@ module Loofah
         #
         # see comments about CVE-2018-8048 within the tests for more information
         #
-        def force_correct_attribute_escaping! node
+        def force_correct_attribute_escaping!(node)
           return unless Nokogiri::VersionInfo.instance.libxml2?

           node.attribute_nodes.each do |attr_node|
@@ -122,11 +178,50 @@ module Loofah
             #
             encoding = attr_node.value.encoding
             attr_node.value = attr_node.value.gsub(/[ "]/) do |m|
-              '%' + m.unpack('H2' * m.bytesize).join('%').upcase
+              "%" + m.unpack("H2" * m.bytesize).join("%").upcase
             end.force_encoding(encoding)
           end
         end

+        def cdata_needs_escaping?(node)
+          # Nokogiri's HTML4 parser on JRuby doesn't flag the child of a `style` or `script` tag as cdata, but it acts that way
+          node.cdata? || (Nokogiri.jruby? && node.text? && (node.parent.name == "style" || node.parent.name == "script"))
+        end
+
+        def cdata_escape(node)
+          escaped_text = escape_tags(node.text)
+          if Nokogiri.jruby?
+            node.document.create_text_node(escaped_text)
+          else
+            node.document.create_cdata(escaped_text)
+          end
+        end
+
+        TABLE_FOR_ESCAPE_HTML__ = {
+          '<' => '&lt;',
+          '>' => '&gt;',
+          '&' => '&amp;',
+        }
+
+        def escape_tags(string)
+          # modified version of CGI.escapeHTML from ruby 3.1
+          enc = string.encoding
+          unless enc.ascii_compatible?
+            if enc.dummy?
+              origenc = enc
+              enc = Encoding::Converter.asciicompat_encoding(enc)
+              string = enc ? string.encode(enc) : string.b
+            end
+            table = Hash[TABLE_FOR_ESCAPE_HTML__.map {|pair|pair.map {|s|s.encode(enc)}}]
+            string = string.gsub(/#{"[<>&]".encode(enc)}/, table)
+            string.encode!(origenc) if origenc
+            string
+          else
+            string = string.b
+            string.gsub!(/[<>&]/, TABLE_FOR_ESCAPE_HTML__)
+            string.force_encoding(enc)
+          end
+        end
       end
     end
   end
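As a companion to the reworked CSS scrubbing above, here is a small usage sketch (not part of the diff). It assumes loofah ~> 2.19 is installed; the inline comments describe expected behavior rather than captured output.

    require "loofah"

    # Disallowed url() tokens should be dropped from inline styles, while an
    # allowed property using a viewport unit (vw, newly matched by CSS_KEYWORDISH)
    # and an !important flag should survive.
    dirty = '<p style="background: url(http://example.com/x.png); width: 50vw !important">hi</p>'
    puts Loofah.fragment(dirty).scrub!(:strip).to_html

    # scrub_css can also be called directly on a style declaration string.
    puts Loofah::HTML5::Scrub.scrub_css("width: 50vw !important; background: url(http://example.com/x.png)")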
data/lib/loofah/instance_methods.rb CHANGED
@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 module Loofah
   #
   # Mixes +scrub!+ into Document, DocumentFragment, Node and NodeSet.
@@ -91,28 +92,33 @@ module Loofah
    #   # decidedly not ok for browser:
    #   frag.text(:encode_special_chars => false) # => "<script>alert('EVIL');</script>"
    #
-    def text(options={})
-      result = serialize_root.children.inner_text rescue ""
+    def text(options = {})
+      result = if serialize_root
+        serialize_root.children.reject(&:comment?).map(&:inner_text).join("")
+      else
+        ""
+      end
      if options[:encode_special_chars] == false
        result # possibly dangerous if rendered in a browser
      else
        encode_special_chars result
      end
    end
+
    alias :inner_text :text
-    alias :to_str     :text
+    alias :to_str :text

    #
    # Returns a plain-text version of the markup contained by the
    # fragment, with HTML entities encoded.
    #
-    # This method is slower than #to_text, but is clever about
-    # whitespace around block elements.
+    # This method is slower than #text, but is clever about
+    # whitespace around block elements and line break elements.
    #
-    #   Loofah.document("<h1>Title</h1><div>Content</div>").to_text
-    #   # => "\nTitle\n\nContent\n"
+    #   Loofah.document("<h1>Title</h1><div>Content<br>Next line</div>").to_text
+    #   # => "\nTitle\n\nContent\nNext line\n"
    #
-    def to_text(options={})
+    def to_text(options = {})
      Loofah.remove_extraneous_whitespace self.dup.scrub!(:newline_block_elements).text(options)
    end
  end
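A brief sketch of the two behavior changes above (illustrative, assuming loofah ~> 2.19; the outputs noted in comments are expectations, not captured output):

    require "loofah"

    # Top-level comment nodes are now rejected when serializing #text,
    # so the comment body should not leak into the plain-text result.
    puts Loofah.fragment("before<!-- secret -->after").text

    # #to_text now also treats <br> as a line break (via :newline_block_elements).
    puts Loofah.document("<h1>Title</h1><div>Content<br>Next line</div>").to_text
    # expected roughly: "\nTitle\n\nContent\nNext line\n"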
data/lib/loofah/metahelpers.rb CHANGED
@@ -1,6 +1,7 @@
+# frozen_string_literal: true
 module Loofah
   module MetaHelpers # :nodoc:
-    def self.add_downcased_set_members_to_all_set_constants mojule
+    def self.add_downcased_set_members_to_all_set_constants(mojule)
      mojule.constants.each do |constant_sym|
        constant = mojule.const_get constant_sym
        next unless Set === constant
data/lib/loofah/scrubber.rb CHANGED
@@ -1,8 +1,9 @@
+# frozen_string_literal: true
 module Loofah
   #
   # A RuntimeError raised when Loofah could not find an appropriate scrubber.
   #
-  class ScrubberNotFound < RuntimeError ; end
+  class ScrubberNotFound < RuntimeError; end

   #
   # A Scrubber wraps up a block (or method) that is run on an HTML node (element):
@@ -36,7 +37,7 @@ module Loofah
    CONTINUE = Object.new.freeze

    # Top-down Scrubbers may return STOP to indicate that the subtree should not be traversed.
-    STOP     = Object.new.freeze
+    STOP = Object.new.freeze

    # When a scrubber is initialized, the :direction may be specified
    # as :top_down (the default) or :bottom_up.
@@ -64,7 +65,7 @@ module Loofah
    def initialize(options = {}, &block)
      direction = options[:direction] || :top_down
      unless [:top_down, :bottom_up].include?(direction)
-        raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
+        raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
      end
      @direction, @block = direction, block
    end
@@ -91,10 +92,10 @@ module Loofah
    # If the attribute is set, don't overwrite the existing value
    #
    def append_attribute(node, attribute, value)
-      current_value = node.get_attribute(attribute) || ''
+      current_value = node.get_attribute(attribute) || ""
      current_values = current_value.split(/\s+/)
      updated_value = current_values | [value]
-      node.set_attribute(attribute, updated_value.join(' '))
+      node.set_attribute(attribute, updated_value.join(" "))
    end

    private
@@ -107,6 +108,10 @@ module Loofah
          return Scrubber::CONTINUE
        end
      when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE
+        if HTML5::Scrub.cdata_needs_escaping?(node)
+          node.before(HTML5::Scrub.cdata_escape(node))
+          return Scrubber::STOP
+        end
        return Scrubber::CONTINUE
      end
      Scrubber::STOP
@@ -118,11 +123,11 @@ module Loofah
      else
        return if scrub(node) == STOP
      end
-      node.children.each {|j| traverse_conditionally_top_down(j)}
+      node.children.each { |j| traverse_conditionally_top_down(j) }
    end

    def traverse_conditionally_bottom_up(node)
-      node.children.each {|j| traverse_conditionally_bottom_up(j)}
+      node.children.each { |j| traverse_conditionally_bottom_up(j) }
      if block
        block.call(node)
      else
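As a reminder of how the Scrubber class touched above is used, here is a minimal sketch (not part of the diff; the scrubber name and markup are illustrative):

    require "loofah"

    # A custom bottom-up scrubber that removes style attributes from elements.
    # Returning Scrubber::STOP from a top-down scrubber would skip a node's subtree.
    strip_styles = Loofah::Scrubber.new(direction: :bottom_up) do |node|
      node.remove_attribute("style") if node.element?
    end

    puts Loofah.fragment('<p style="color:red"><b style="x:y">hi</b></p>').scrub!(strip_styles).to_html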
data/lib/loofah/scrubbers.rb CHANGED
@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 module Loofah
   #
   # Loofah provides some built-in scrubbers for sanitizing with
@@ -99,13 +100,9 @@ module Loofah

      def scrub(node)
        return CONTINUE if html5lib_sanitize(node) == CONTINUE
-        if node.children.length == 1 && node.children.first.cdata?
-          sanitized_text = Loofah.fragment(node.children.first.to_html).scrub!(:strip).to_html
-          node.before Nokogiri::XML::Text.new(sanitized_text, node.document)
-        else
-          node.before node.children
-        end
+        node.before(node.children)
        node.remove
+        return STOP
      end
    end

@@ -205,8 +202,8 @@ module Loofah
      end

      def scrub(node)
-        return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == 'a')
-        append_attribute(node, 'rel', 'nofollow')
+        return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
+        append_attribute(node, "rel", "nofollow")
        return STOP
      end
    end
@@ -226,8 +223,8 @@ module Loofah
      end

      def scrub(node)
-        return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == 'a')
-        append_attribute(node, 'rel', 'noopener')
+        return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
+        append_attribute(node, "rel", "noopener")
        return STOP
      end
    end
@@ -239,8 +236,13 @@ module Loofah
      end

      def scrub(node)
-        return CONTINUE unless Loofah::Elements::BLOCK_LEVEL.include?(node.name)
-        node.add_next_sibling Nokogiri::XML::Text.new("\n#{node.content}\n", node.document)
+        return CONTINUE unless Loofah::Elements::LINEBREAKERS.include?(node.name)
+        replacement = if Loofah::Elements::INLINE_LINE_BREAK.include?(node.name)
+          "\n"
+        else
+          "\n#{node.content}\n"
+        end
+        node.add_next_sibling Nokogiri::XML::Text.new(replacement, node.document)
        node.remove
      end
    end
@@ -267,7 +269,7 @@ module Loofah

      def scrub(node)
        if node.type == Nokogiri::XML::Node::TEXT_NODE || node.type == Nokogiri::XML::Node::CDATA_SECTION_NODE
-          node.content = node.content.gsub(/\u2028|\u2029/, '')
+          node.content = node.content.gsub(/\u2028|\u2029/, "")
        end
        CONTINUE
      end
@@ -277,14 +279,14 @@ module Loofah
    # A hash that maps a symbol (like +:prune+) to the appropriate Scrubber (Loofah::Scrubbers::Prune).
    #
    MAP = {
-      :escape => Escape,
-      :prune => Prune,
+      :escape => Escape,
+      :prune => Prune,
      :whitewash => Whitewash,
-      :strip => Strip,
-      :nofollow => NoFollow,
+      :strip => Strip,
+      :nofollow => NoFollow,
      :noopener => NoOpener,
      :newline_block_elements => NewlineBlockElements,
-      :unprintable => Unprintable
+      :unprintable => Unprintable,
    }

    #
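The built-in scrubbers changed above are invoked through the MAP symbols; a short usage sketch (assuming loofah ~> 2.19, expected attributes noted in comments):

    require "loofah"

    link = '<a href="https://example.com/">example</a>'

    # :nofollow and :noopener append to any existing rel value via append_attribute.
    puts Loofah.fragment(link).scrub!(:nofollow).to_html   # rel="nofollow" expected
    puts Loofah.fragment(link).scrub!(:noopener).to_html   # rel="noopener" expected

    # :unprintable strips Unicode line/paragraph separators from text nodes.
    puts Loofah.fragment("a\u2028b").scrub!(:unprintable).text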
data/lib/loofah/version.rb ADDED
@@ -0,0 +1,5 @@
+# frozen_string_literal: true
+module Loofah
+  # The version of Loofah you are using
+  VERSION = "2.19.1"
+end
data/lib/loofah/xml/document.rb CHANGED
@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 module Loofah
   module XML # :nodoc:
     #
data/lib/loofah/xml/document_fragment.rb CHANGED
@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 module Loofah
   module XML # :nodoc:
     #
@@ -12,7 +13,7 @@ module Loofah
        # constructor. Applications should use Loofah.fragment to
        # parse a fragment.
        #
-        def parse tags
+        def parse(tags)
          doc = Loofah::XML::Document.new
          doc.encoding = tags.encoding.name if tags.respond_to?(:encoding)
          self.new(doc, tags)
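XML fragments are parsed through this class via Loofah.xml_fragment; a minimal sketch of typical usage (element names are illustrative):

    require "loofah"

    xml = Loofah.xml_fragment("<widget><price>5</price><internal_id>42</internal_id></widget>")
    remove_ids = Loofah::Scrubber.new { |node| node.remove if node.name == "internal_id" }
    puts xml.scrub!(remove_ids).to_xml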
data/lib/loofah.rb CHANGED
@@ -1,22 +1,24 @@
+# frozen_string_literal: true
 $LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__))) unless $LOAD_PATH.include?(File.expand_path(File.dirname(__FILE__)))

 require "nokogiri"

-require "loofah/metahelpers"
-require "loofah/elements"
+require_relative "loofah/version"
+require_relative "loofah/metahelpers"
+require_relative "loofah/elements"

-require "loofah/html5/safelist"
-require "loofah/html5/libxml2_workarounds"
-require "loofah/html5/scrub"
+require_relative "loofah/html5/safelist"
+require_relative "loofah/html5/libxml2_workarounds"
+require_relative "loofah/html5/scrub"

-require "loofah/scrubber"
-require "loofah/scrubbers"
+require_relative "loofah/scrubber"
+require_relative "loofah/scrubbers"

-require "loofah/instance_methods"
-require "loofah/xml/document"
-require "loofah/xml/document_fragment"
-require "loofah/html/document"
-require "loofah/html/document_fragment"
+require_relative "loofah/instance_methods"
+require_relative "loofah/xml/document"
+require_relative "loofah/xml/document_fragment"
+require_relative "loofah/html/document"
+require_relative "loofah/html/document_fragment"

 # == Strings and IO Objects as Input
 #
@@ -27,14 +29,11 @@ require "loofah/html/document_fragment"
 # quantities of docs.
 #
 module Loofah
-  # The version of Loofah you are using
-  VERSION = "2.3.1"
-
   class << self
     # Shortcut for Loofah::HTML::Document.parse
     # This method accepts the same parameters as Nokogiri::HTML::Document.parse
     def document(*args, &block)
-      Loofah::HTML::Document.parse(*args, &block)
+      remove_comments_before_html_element Loofah::HTML::Document.parse(*args, &block)
     end

     # Shortcut for Loofah::HTML::DocumentFragment.parse
@@ -79,5 +78,23 @@ module Loofah
     def remove_extraneous_whitespace(string)
       string.gsub(/\n\s*\n\s*\n/, "\n\n")
     end
+
+    private
+
+    # remove comments that exist outside of the HTML element.
+    #
+    # these comments are allowed by the HTML spec:
+    #
+    # https://www.w3.org/TR/html401/struct/global.html#h-7.1
+    #
+    # but are not scrubbed by Loofah because these nodes don't meet
+    # the contract that scrubbers expect of a node (e.g., it can be
+    # replaced, sibling and children nodes can be created).
+    def remove_comments_before_html_element(doc)
+      doc.children.each do |child|
+        child.unlink if child.comment?
+      end
+      doc
+    end
   end
 end
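Finally, a sketch of the new comment-removal behavior in Loofah.document (illustrative; exact serialization depends on the underlying Nokogiri version):

    require "loofah"

    # Comments that precede the <html> element are unlinked at parse time,
    # so they should not reappear in the serialized document.
    doc = Loofah.document("<!-- tracking comment --><html><body><p>hi</p></body></html>")
    puts doc.to_html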