loofah 2.2.3 → 2.19.0
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of loofah might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/CHANGELOG.md +212 -31
- data/README.md +18 -24
- data/lib/loofah/elements.rb +79 -75
- data/lib/loofah/helpers.rb +18 -7
- data/lib/loofah/html/document.rb +1 -0
- data/lib/loofah/html/document_fragment.rb +4 -2
- data/lib/loofah/html5/libxml2_workarounds.rb +8 -7
- data/lib/loofah/html5/safelist.rb +1043 -0
- data/lib/loofah/html5/scrub.rb +73 -48
- data/lib/loofah/instance_methods.rb +14 -8
- data/lib/loofah/metahelpers.rb +2 -1
- data/lib/loofah/scrubber.rb +8 -7
- data/lib/loofah/scrubbers.rb +19 -13
- data/lib/loofah/version.rb +5 -0
- data/lib/loofah/xml/document.rb +1 -0
- data/lib/loofah/xml/document_fragment.rb +2 -1
- data/lib/loofah.rb +35 -18
- metadata +52 -138
- data/.gemtest +0 -0
- data/Gemfile +0 -22
- data/Manifest.txt +0 -40
- data/Rakefile +0 -79
- data/benchmark/benchmark.rb +0 -149
- data/benchmark/fragment.html +0 -96
- data/benchmark/helper.rb +0 -73
- data/benchmark/www.slashdot.com.html +0 -2560
- data/lib/loofah/html5/whitelist.rb +0 -186
- data/test/assets/msword.html +0 -63
- data/test/assets/testdata_sanitizer_tests1.dat +0 -502
- data/test/helper.rb +0 -18
- data/test/html5/test_sanitizer.rb +0 -382
- data/test/integration/test_ad_hoc.rb +0 -204
- data/test/integration/test_helpers.rb +0 -43
- data/test/integration/test_html.rb +0 -72
- data/test/integration/test_scrubbers.rb +0 -400
- data/test/integration/test_xml.rb +0 -55
- data/test/unit/test_api.rb +0 -142
- data/test/unit/test_encoding.rb +0 -20
- data/test/unit/test_helpers.rb +0 -62
- data/test/unit/test_scrubber.rb +0 -229
- data/test/unit/test_scrubbers.rb +0 -14
data/lib/loofah/html5/scrub.rb
CHANGED
@@ -1,104 +1,130 @@
|
|
1
|
-
|
2
|
-
require
|
1
|
+
# frozen_string_literal: true
|
2
|
+
require "cgi"
|
3
|
+
require "crass"
|
3
4
|
|
4
5
|
module Loofah
|
5
6
|
module HTML5 # :nodoc:
|
6
7
|
module Scrub
|
7
|
-
|
8
8
|
CONTROL_CHARACTERS = /[`\u0000-\u0020\u007f\u0080-\u0101]/
|
9
|
-
CSS_KEYWORDISH = /\A(#[0-9a-
|
10
|
-
CRASS_SEMICOLON = {:
|
9
|
+
CSS_KEYWORDISH = /\A(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,3}\.?\d{0,10}(ch|cm|r?em|ex|in|lh|mm|pc|pt|px|Q|vmax|vmin|vw|vh|%|,|\))?)\z/
|
10
|
+
CRASS_SEMICOLON = { node: :semicolon, raw: ";" }
|
11
|
+
CSS_IMPORTANT = '!important'
|
12
|
+
CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES = /\A(["'])?[^"']+\1\z/
|
13
|
+
DATA_ATTRIBUTE_NAME = /\Adata-[\w-]+\z/
|
11
14
|
|
12
15
|
class << self
|
13
|
-
|
14
|
-
|
15
|
-
::Loofah::HTML5::WhiteList::ALLOWED_ELEMENTS_WITH_LIBXML2.include? element_name
|
16
|
+
def allowed_element?(element_name)
|
17
|
+
::Loofah::HTML5::SafeList::ALLOWED_ELEMENTS_WITH_LIBXML2.include?(element_name)
|
16
18
|
end
|
17
19
|
|
18
20
|
# alternative implementation of the html5lib attribute scrubbing algorithm
|
19
|
-
def scrub_attributes
|
21
|
+
def scrub_attributes(node)
|
20
22
|
node.attribute_nodes.each do |attr_node|
|
21
23
|
attr_name = if attr_node.namespace
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
24
|
+
"#{attr_node.namespace.prefix}:#{attr_node.node_name}"
|
25
|
+
else
|
26
|
+
attr_node.node_name
|
27
|
+
end
|
26
28
|
|
27
|
-
if attr_name =~
|
29
|
+
if attr_name =~ DATA_ATTRIBUTE_NAME
|
28
30
|
next
|
29
31
|
end
|
30
32
|
|
31
|
-
unless
|
33
|
+
unless SafeList::ALLOWED_ATTRIBUTES.include?(attr_name)
|
32
34
|
attr_node.remove
|
33
35
|
next
|
34
36
|
end
|
35
37
|
|
36
|
-
if
|
38
|
+
if SafeList::ATTR_VAL_IS_URI.include?(attr_name)
|
37
39
|
# this block lifted nearly verbatim from HTML5 sanitization
|
38
|
-
val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS,
|
39
|
-
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && !
|
40
|
+
val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS, "").downcase
|
41
|
+
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && !SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
|
40
42
|
attr_node.remove
|
41
43
|
next
|
42
|
-
elsif val_unescaped.split(
|
44
|
+
elsif val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0] == "data"
|
43
45
|
# permit only allowed data mediatypes
|
44
|
-
mediatype = val_unescaped.split(
|
45
|
-
mediatype, _ = mediatype.split(
|
46
|
-
if mediatype && !
|
46
|
+
mediatype = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[1]
|
47
|
+
mediatype, _ = mediatype.split(";")[0..1] if mediatype
|
48
|
+
if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
|
47
49
|
attr_node.remove
|
48
50
|
next
|
49
51
|
end
|
50
52
|
end
|
51
53
|
end
|
52
|
-
if
|
53
|
-
attr_node.value = attr_node.value.gsub(/url\s*\(\s*[^#\s][^)]+?\)/m,
|
54
|
+
if SafeList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
|
55
|
+
attr_node.value = attr_node.value.gsub(/url\s*\(\s*[^#\s][^)]+?\)/m, " ") if attr_node.value
|
54
56
|
end
|
55
|
-
if
|
57
|
+
if SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) && attr_name == "xlink:href" && attr_node.value =~ /^\s*[^#\s].*/m
|
56
58
|
attr_node.remove
|
57
59
|
next
|
58
60
|
end
|
59
61
|
end
|
60
62
|
|
61
|
-
scrub_css_attribute
|
63
|
+
scrub_css_attribute(node)
|
62
64
|
|
63
65
|
node.attribute_nodes.each do |attr_node|
|
64
|
-
|
66
|
+
if attr_node.value !~ /[^[:space:]]/ && attr_node.name !~ DATA_ATTRIBUTE_NAME
|
67
|
+
node.remove_attribute(attr_node.name)
|
68
|
+
end
|
65
69
|
end
|
66
70
|
|
67
|
-
force_correct_attribute_escaping!
|
71
|
+
force_correct_attribute_escaping!(node)
|
68
72
|
end
|
69
73
|
|
70
|
-
def scrub_css_attribute
|
71
|
-
style = node.attributes[
|
74
|
+
def scrub_css_attribute(node)
|
75
|
+
style = node.attributes["style"]
|
72
76
|
style.value = scrub_css(style.value) if style
|
73
77
|
end
|
74
78
|
|
75
|
-
def scrub_css
|
76
|
-
style_tree = Crass.parse_properties
|
79
|
+
def scrub_css(style)
|
80
|
+
style_tree = Crass.parse_properties(style)
|
77
81
|
sanitized_tree = []
|
78
82
|
|
79
83
|
style_tree.each do |node|
|
80
84
|
next unless node[:node] == :property
|
81
85
|
next if node[:children].any? do |child|
|
82
|
-
[:url, :bad_url].include?(child[:node])
|
86
|
+
[:url, :bad_url].include?(child[:node])
|
83
87
|
end
|
88
|
+
|
84
89
|
name = node[:name].downcase
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
+
next unless SafeList::ALLOWED_CSS_PROPERTIES.include?(name) ||
|
91
|
+
SafeList::ALLOWED_SVG_PROPERTIES.include?(name) ||
|
92
|
+
SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first)
|
93
|
+
|
94
|
+
value = node[:children].map do |child|
|
95
|
+
case child[:node]
|
96
|
+
when :whitespace
|
97
|
+
nil
|
98
|
+
when :string
|
99
|
+
if child[:raw] =~ CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES
|
100
|
+
Crass::Parser.stringify(child)
|
101
|
+
else
|
102
|
+
nil
|
103
|
+
end
|
104
|
+
when :function
|
105
|
+
if SafeList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase)
|
106
|
+
Crass::Parser.stringify(child)
|
107
|
+
end
|
108
|
+
when :ident
|
109
|
+
keyword = child[:value]
|
110
|
+
if !SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first) ||
|
111
|
+
SafeList::ALLOWED_CSS_KEYWORDS.include?(keyword) ||
|
112
|
+
(keyword =~ CSS_KEYWORDISH)
|
90
113
|
keyword
|
91
114
|
end
|
92
|
-
|
93
|
-
|
94
|
-
propstring = sprintf "%s:%s", name, value.join(" ")
|
95
|
-
sanitized_node = Crass.parse_properties(propstring).first
|
96
|
-
sanitized_tree << sanitized_node << CRASS_SEMICOLON
|
115
|
+
else
|
116
|
+
child[:raw]
|
97
117
|
end
|
98
|
-
end
|
118
|
+
end.compact
|
119
|
+
|
120
|
+
next if value.empty?
|
121
|
+
value << CSS_IMPORTANT if node[:important]
|
122
|
+
propstring = format("%s:%s", name, value.join(" "))
|
123
|
+
sanitized_node = Crass.parse_properties(propstring).first
|
124
|
+
sanitized_tree << sanitized_node << CRASS_SEMICOLON
|
99
125
|
end
|
100
126
|
|
101
|
-
Crass::Parser.stringify
|
127
|
+
Crass::Parser.stringify(sanitized_tree)
|
102
128
|
end
|
103
129
|
|
104
130
|
#
|
@@ -106,7 +132,7 @@ module Loofah
|
|
106
132
|
#
|
107
133
|
# see comments about CVE-2018-8048 within the tests for more information
|
108
134
|
#
|
109
|
-
def force_correct_attribute_escaping!
|
135
|
+
def force_correct_attribute_escaping!(node)
|
110
136
|
return unless Nokogiri::VersionInfo.instance.libxml2?
|
111
137
|
|
112
138
|
node.attribute_nodes.each do |attr_node|
|
@@ -122,11 +148,10 @@ module Loofah
|
|
122
148
|
#
|
123
149
|
encoding = attr_node.value.encoding
|
124
150
|
attr_node.value = attr_node.value.gsub(/[ "]/) do |m|
|
125
|
-
|
151
|
+
"%" + m.unpack("H2" * m.bytesize).join("%").upcase
|
126
152
|
end.force_encoding(encoding)
|
127
153
|
end
|
128
154
|
end
|
129
|
-
|
130
155
|
end
|
131
156
|
end
|
132
157
|
end
|
data/lib/loofah/instance_methods.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
module Loofah
|
2
3
|
#
|
3
4
|
# Mixes +scrub!+ into Document, DocumentFragment, Node and NodeSet.
|
@@ -91,28 +92,33 @@ module Loofah
|
|
91
92
|
# # decidedly not ok for browser:
|
92
93
|
# frag.text(:encode_special_chars => false) # => "<script>alert('EVIL');</script>"
|
93
94
|
#
|
94
|
-
def text(options={})
|
95
|
-
result = serialize_root
|
95
|
+
def text(options = {})
|
96
|
+
result = if serialize_root
|
97
|
+
serialize_root.children.reject(&:comment?).map(&:inner_text).join("")
|
98
|
+
else
|
99
|
+
""
|
100
|
+
end
|
96
101
|
if options[:encode_special_chars] == false
|
97
102
|
result # possibly dangerous if rendered in a browser
|
98
103
|
else
|
99
104
|
encode_special_chars result
|
100
105
|
end
|
101
106
|
end
|
107
|
+
|
102
108
|
alias :inner_text :text
|
103
|
-
alias :to_str
|
109
|
+
alias :to_str :text
|
104
110
|
|
105
111
|
#
|
106
112
|
# Returns a plain-text version of the markup contained by the
|
107
113
|
# fragment, with HTML entities encoded.
|
108
114
|
#
|
109
|
-
# This method is slower than #
|
110
|
-
# whitespace around block elements.
|
115
|
+
# This method is slower than #text, but is clever about
|
116
|
+
# whitespace around block elements and line break elements.
|
111
117
|
#
|
112
|
-
# Loofah.document("<h1>Title</h1><div>Content</div>").to_text
|
113
|
-
# # => "\nTitle\n\nContent\n"
|
118
|
+
# Loofah.document("<h1>Title</h1><div>Content<br>Next line</div>").to_text
|
119
|
+
# # => "\nTitle\n\nContent\nNext line\n"
|
114
120
|
#
|
115
|
-
def to_text(options={})
|
121
|
+
def to_text(options = {})
|
116
122
|
Loofah.remove_extraneous_whitespace self.dup.scrub!(:newline_block_elements).text(options)
|
117
123
|
end
|
118
124
|
end
|
data/lib/loofah/metahelpers.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
module Loofah
|
2
3
|
module MetaHelpers # :nodoc:
|
3
|
-
def self.add_downcased_set_members_to_all_set_constants
|
4
|
+
def self.add_downcased_set_members_to_all_set_constants(mojule)
|
4
5
|
mojule.constants.each do |constant_sym|
|
5
6
|
constant = mojule.const_get constant_sym
|
6
7
|
next unless Set === constant
|
data/lib/loofah/scrubber.rb
CHANGED
@@ -1,8 +1,9 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
module Loofah
|
2
3
|
#
|
3
4
|
# A RuntimeError raised when Loofah could not find an appropriate scrubber.
|
4
5
|
#
|
5
|
-
class ScrubberNotFound < RuntimeError
|
6
|
+
class ScrubberNotFound < RuntimeError; end
|
6
7
|
|
7
8
|
#
|
8
9
|
# A Scrubber wraps up a block (or method) that is run on an HTML node (element):
|
@@ -36,7 +37,7 @@ module Loofah
|
|
36
37
|
CONTINUE = Object.new.freeze
|
37
38
|
|
38
39
|
# Top-down Scrubbers may return STOP to indicate that the subtree should not be traversed.
|
39
|
-
STOP
|
40
|
+
STOP = Object.new.freeze
|
40
41
|
|
41
42
|
# When a scrubber is initialized, the :direction may be specified
|
42
43
|
# as :top_down (the default) or :bottom_up.
|
@@ -64,7 +65,7 @@ module Loofah
|
|
64
65
|
def initialize(options = {}, &block)
|
65
66
|
direction = options[:direction] || :top_down
|
66
67
|
unless [:top_down, :bottom_up].include?(direction)
|
67
|
-
raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
|
68
|
+
raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
|
68
69
|
end
|
69
70
|
@direction, @block = direction, block
|
70
71
|
end
|
@@ -91,10 +92,10 @@ module Loofah
|
|
91
92
|
# If the attribute is set, don't overwrite the existing value
|
92
93
|
#
|
93
94
|
def append_attribute(node, attribute, value)
|
94
|
-
current_value = node.get_attribute(attribute) ||
|
95
|
+
current_value = node.get_attribute(attribute) || ""
|
95
96
|
current_values = current_value.split(/\s+/)
|
96
97
|
updated_value = current_values | [value]
|
97
|
-
node.set_attribute(attribute, updated_value.join(
|
98
|
+
node.set_attribute(attribute, updated_value.join(" "))
|
98
99
|
end
|
99
100
|
|
100
101
|
private
|
@@ -118,11 +119,11 @@ module Loofah
|
|
118
119
|
else
|
119
120
|
return if scrub(node) == STOP
|
120
121
|
end
|
121
|
-
node.children.each {|j| traverse_conditionally_top_down(j)}
|
122
|
+
node.children.each { |j| traverse_conditionally_top_down(j) }
|
122
123
|
end
|
123
124
|
|
124
125
|
def traverse_conditionally_bottom_up(node)
|
125
|
-
node.children.each {|j| traverse_conditionally_bottom_up(j)}
|
126
|
+
node.children.each { |j| traverse_conditionally_bottom_up(j) }
|
126
127
|
if block
|
127
128
|
block.call(node)
|
128
129
|
else
|
data/lib/loofah/scrubbers.rb
CHANGED
@@ -1,7 +1,8 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
module Loofah
|
2
3
|
#
|
3
4
|
# Loofah provides some built-in scrubbers for sanitizing with
|
4
|
-
# HTML5lib's
|
5
|
+
# HTML5lib's safelist and for accomplishing some common
|
5
6
|
# transformation tasks.
|
6
7
|
#
|
7
8
|
#
|
@@ -205,8 +206,8 @@ module Loofah
|
|
205
206
|
end
|
206
207
|
|
207
208
|
def scrub(node)
|
208
|
-
return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name ==
|
209
|
-
append_attribute(node,
|
209
|
+
return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
|
210
|
+
append_attribute(node, "rel", "nofollow")
|
210
211
|
return STOP
|
211
212
|
end
|
212
213
|
end
|
@@ -226,8 +227,8 @@ module Loofah
|
|
226
227
|
end
|
227
228
|
|
228
229
|
def scrub(node)
|
229
|
-
return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name ==
|
230
|
-
append_attribute(node,
|
230
|
+
return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
|
231
|
+
append_attribute(node, "rel", "noopener")
|
231
232
|
return STOP
|
232
233
|
end
|
233
234
|
end
|
@@ -239,8 +240,13 @@ module Loofah
|
|
239
240
|
end
|
240
241
|
|
241
242
|
def scrub(node)
|
242
|
-
return CONTINUE unless Loofah::Elements::
|
243
|
-
|
243
|
+
return CONTINUE unless Loofah::Elements::LINEBREAKERS.include?(node.name)
|
244
|
+
replacement = if Loofah::Elements::INLINE_LINE_BREAK.include?(node.name)
|
245
|
+
"\n"
|
246
|
+
else
|
247
|
+
"\n#{node.content}\n"
|
248
|
+
end
|
249
|
+
node.add_next_sibling Nokogiri::XML::Text.new(replacement, node.document)
|
244
250
|
node.remove
|
245
251
|
end
|
246
252
|
end
|
@@ -267,7 +273,7 @@ module Loofah
|
|
267
273
|
|
268
274
|
def scrub(node)
|
269
275
|
if node.type == Nokogiri::XML::Node::TEXT_NODE || node.type == Nokogiri::XML::Node::CDATA_SECTION_NODE
|
270
|
-
node.content = node.content.gsub(/\u2028|\u2029/,
|
276
|
+
node.content = node.content.gsub(/\u2028|\u2029/, "")
|
271
277
|
end
|
272
278
|
CONTINUE
|
273
279
|
end
|
@@ -277,14 +283,14 @@ module Loofah
|
|
277
283
|
# A hash that maps a symbol (like +:prune+) to the appropriate Scrubber (Loofah::Scrubbers::Prune).
|
278
284
|
#
|
279
285
|
MAP = {
|
280
|
-
:escape
|
281
|
-
:prune
|
286
|
+
:escape => Escape,
|
287
|
+
:prune => Prune,
|
282
288
|
:whitewash => Whitewash,
|
283
|
-
:strip
|
284
|
-
:nofollow
|
289
|
+
:strip => Strip,
|
290
|
+
:nofollow => NoFollow,
|
285
291
|
:noopener => NoOpener,
|
286
292
|
:newline_block_elements => NewlineBlockElements,
|
287
|
-
:unprintable => Unprintable
|
293
|
+
:unprintable => Unprintable,
|
288
294
|
}
|
289
295
|
|
290
296
|
#
|
data/lib/loofah/xml/document.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
module Loofah
|
2
3
|
module XML # :nodoc:
|
3
4
|
#
|
data/lib/loofah/xml/document_fragment.rb
CHANGED
@@ -12,7 +13,7 @@ module Loofah
|
|
12
13
|
# constructor. Applications should use Loofah.fragment to
|
13
14
|
# parse a fragment.
|
14
15
|
#
|
15
|
-
def parse
|
16
|
+
def parse(tags)
|
16
17
|
doc = Loofah::XML::Document.new
|
17
18
|
doc.encoding = tags.encoding.name if tags.respond_to?(:encoding)
|
18
19
|
self.new(doc, tags)
|
data/lib/loofah.rb
CHANGED
@@ -1,22 +1,24 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
$LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__))) unless $LOAD_PATH.include?(File.expand_path(File.dirname(__FILE__)))
|
2
3
|
|
3
|
-
require
|
4
|
+
require "nokogiri"
|
4
5
|
|
5
|
-
|
6
|
-
|
6
|
+
require_relative "loofah/version"
|
7
|
+
require_relative "loofah/metahelpers"
|
8
|
+
require_relative "loofah/elements"
|
7
9
|
|
8
|
-
|
9
|
-
|
10
|
-
|
10
|
+
require_relative "loofah/html5/safelist"
|
11
|
+
require_relative "loofah/html5/libxml2_workarounds"
|
12
|
+
require_relative "loofah/html5/scrub"
|
11
13
|
|
12
|
-
|
13
|
-
|
14
|
+
require_relative "loofah/scrubber"
|
15
|
+
require_relative "loofah/scrubbers"
|
14
16
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
17
|
+
require_relative "loofah/instance_methods"
|
18
|
+
require_relative "loofah/xml/document"
|
19
|
+
require_relative "loofah/xml/document_fragment"
|
20
|
+
require_relative "loofah/html/document"
|
21
|
+
require_relative "loofah/html/document_fragment"
|
20
22
|
|
21
23
|
# == Strings and IO Objects as Input
|
22
24
|
#
|
@@ -27,14 +29,11 @@ require 'loofah/html/document_fragment'
|
|
27
29
|
# quantities of docs.
|
28
30
|
#
|
29
31
|
module Loofah
|
30
|
-
# The version of Loofah you are using
|
31
|
-
VERSION = '2.2.3'
|
32
|
-
|
33
32
|
class << self
|
34
33
|
# Shortcut for Loofah::HTML::Document.parse
|
35
34
|
# This method accepts the same parameters as Nokogiri::HTML::Document.parse
|
36
35
|
def document(*args, &block)
|
37
|
-
Loofah::HTML::Document.parse(*args, &block)
|
36
|
+
remove_comments_before_html_element Loofah::HTML::Document.parse(*args, &block)
|
38
37
|
end
|
39
38
|
|
40
39
|
# Shortcut for Loofah::HTML::DocumentFragment.parse
|
@@ -77,7 +76,25 @@ module Loofah
|
|
77
76
|
|
78
77
|
# A helper to remove extraneous whitespace from text-ified HTML
|
79
78
|
def remove_extraneous_whitespace(string)
|
80
|
-
string.gsub(/\n\s*\n\s*\n/,"\n\n")
|
79
|
+
string.gsub(/\n\s*\n\s*\n/, "\n\n")
|
80
|
+
end
|
81
|
+
|
82
|
+
private
|
83
|
+
|
84
|
+
# remove comments that exist outside of the HTML element.
|
85
|
+
#
|
86
|
+
# these comments are allowed by the HTML spec:
|
87
|
+
#
|
88
|
+
# https://www.w3.org/TR/html401/struct/global.html#h-7.1
|
89
|
+
#
|
90
|
+
# but are not scrubbed by Loofah because these nodes don't meet
|
91
|
+
# the contract that scrubbers expect of a node (e.g., it can be
|
92
|
+
# replaced, sibling and children nodes can be created).
|
93
|
+
def remove_comments_before_html_element(doc)
|
94
|
+
doc.children.each do |child|
|
95
|
+
child.unlink if child.comment?
|
96
|
+
end
|
97
|
+
doc
|
81
98
|
end
|
82
99
|
end
|
83
100
|
end
|