loofah 2.19.1 → 2.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,207 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Loofah
4
+ #
5
+ # Mixes +scrub!+ into Document, DocumentFragment, Node and NodeSet.
6
+ #
7
+ # Traverse the document or fragment, invoking the +scrubber+ on each node.
8
+ #
9
+ # +scrubber+ must either be one of the symbols representing the built-in scrubbers (see
10
+ # Scrubbers), or a Scrubber instance.
11
+ #
12
+ # span2div = Loofah::Scrubber.new do |node|
13
+ # node.name = "div" if node.name == "span"
14
+ # end
15
+ # Loofah.html5_fragment("<span>foo</span><p>bar</p>").scrub!(span2div).to_s
16
+ # # => "<div>foo</div><p>bar</p>"
17
+ #
18
+ # or
19
+ #
20
+ # unsafe_html = "ohai! <div>div is safe</div> <script>but script is not</script>"
21
+ # Loofah.html5_fragment(unsafe_html).scrub!(:strip).to_s
22
+ # # => "ohai! <div>div is safe</div> "
23
+ #
24
+ # Note that this method is called implicitly from the shortcuts Loofah.scrub_html5_fragment et
25
+ # al.
26
+ #
27
+ # Please see Scrubber for more information on implementation and traversal, and README.rdoc for
28
+ # more example usage.
29
+ #
30
+ module ScrubBehavior
31
+ module Node # :nodoc:
32
+ def scrub!(scrubber)
33
+ #
34
+ # yes. this should be three separate methods. but nokogiri decorates (or not) based on
35
+ # whether the module name has already been included. and since documents get decorated just
36
+ # like their constituent nodes, we need to jam all the logic into a single module.
37
+ #
38
+ scrubber = ScrubBehavior.resolve_scrubber(scrubber)
39
+ case self
40
+ when Nokogiri::XML::Document
41
+ scrubber.traverse(root) if root
42
+ when Nokogiri::XML::DocumentFragment
43
+ children.scrub!(scrubber)
44
+ else
45
+ scrubber.traverse(self)
46
+ end
47
+ self
48
+ end
49
+ end
50
+
51
+ module NodeSet # :nodoc:
52
+ def scrub!(scrubber)
53
+ each { |node| node.scrub!(scrubber) }
54
+ self
55
+ end
56
+ end
57
+
58
+ class << self
59
+ def resolve_scrubber(scrubber) # :nodoc:
60
+ scrubber = Scrubbers::MAP[scrubber].new if Scrubbers::MAP[scrubber]
61
+ unless scrubber.is_a?(Loofah::Scrubber)
62
+ raise Loofah::ScrubberNotFound, "not a Scrubber or a scrubber name: #{scrubber.inspect}"
63
+ end
64
+
65
+ scrubber
66
+ end
67
+ end
68
+ end
69
+
70
+ #
71
+ # Overrides +text+ in Document and DocumentFragment classes, and mixes in +to_text+.
72
+ #
73
+ module TextBehavior
74
+ #
75
+ # Returns a plain-text version of the markup contained by the document, with HTML entities
76
+ # encoded.
77
+ #
78
+ # This method is significantly faster than #to_text, but isn't clever about whitespace around
79
+ # block elements.
80
+ #
81
+ # Loofah.html5_document("<h1>Title</h1><div>Content</div>").text
82
+ # # => "TitleContent"
83
+ #
84
+ # By default, the returned text will have HTML entities escaped. If you want unescaped
85
+ # entities, and you understand that the result is unsafe to render in a browser, then you can
86
+ # pass an argument as shown:
87
+ #
88
+ # frag = Loofah.html5_fragment("&lt;script&gt;alert('EVIL');&lt;/script&gt;")
89
+ # # ok for browser:
90
+ # frag.text # => "&lt;script&gt;alert('EVIL');&lt;/script&gt;"
91
+ # # decidedly not ok for browser:
92
+ # frag.text(:encode_special_chars => false) # => "<script>alert('EVIL');</script>"
93
+ #
94
+ def text(options = {})
95
+ result = if serialize_root
96
+ serialize_root.children.reject(&:comment?).map(&:inner_text).join("")
97
+ else
98
+ ""
99
+ end
100
+ if options[:encode_special_chars] == false
101
+ result # possibly dangerous if rendered in a browser
102
+ else
103
+ encode_special_chars(result)
104
+ end
105
+ end
106
+
107
+ alias_method :inner_text, :text
108
+ alias_method :to_str, :text
109
+
110
+ #
111
+ # Returns a plain-text version of the markup contained by the fragment, with HTML entities
112
+ # encoded.
113
+ #
114
+ # This method is slower than #text, but is clever about whitespace around block elements and
115
+ # line break elements.
116
+ #
117
+ # Loofah.html5_document("<h1>Title</h1><div>Content<br>Next line</div>").to_text
118
+ # # => "\nTitle\n\nContent\nNext line\n"
119
+ #
120
+ def to_text(options = {})
121
+ Loofah.remove_extraneous_whitespace(dup.scrub!(:newline_block_elements).text(options))
122
+ end
123
+ end
124
+
125
+ module DocumentDecorator # :nodoc:
126
+ def initialize(*args, &block)
127
+ super
128
+ decorators(Nokogiri::XML::Node) << ScrubBehavior::Node
129
+ decorators(Nokogiri::XML::NodeSet) << ScrubBehavior::NodeSet
130
+ end
131
+ end
132
+
133
+ module HtmlDocumentBehavior # :nodoc:
134
+ module ClassMethods
135
+ def parse(*args, &block)
136
+ remove_comments_before_html_element(super)
137
+ end
138
+
139
+ private
140
+
141
+ # remove comments that exist outside of the HTML element.
142
+ #
143
+ # these comments are allowed by the HTML spec:
144
+ #
145
+ # https://www.w3.org/TR/html401/struct/global.html#h-7.1
146
+ #
147
+ # but are not scrubbed by Loofah because these nodes don't meet
148
+ # the contract that scrubbers expect of a node (e.g., it can be
149
+ # replaced, sibling and children nodes can be created).
150
+ def remove_comments_before_html_element(doc)
151
+ doc.children.each do |child|
152
+ child.unlink if child.comment?
153
+ end
154
+ doc
155
+ end
156
+ end
157
+
158
+ class << self
159
+ def included(base)
160
+ base.extend(ClassMethods)
161
+ end
162
+ end
163
+
164
+ def serialize_root
165
+ at_xpath("/html/body")
166
+ end
167
+ end
168
+
169
+ module HtmlFragmentBehavior # :nodoc:
170
+ module ClassMethods
171
+ def parse(tags, encoding = nil)
172
+ doc = document_klass.new
173
+
174
+ encoding ||= tags.respond_to?(:encoding) ? tags.encoding.name : "UTF-8"
175
+ doc.encoding = encoding
176
+
177
+ new(doc, tags)
178
+ end
179
+
180
+ def document_klass
181
+ @document_klass ||= if Loofah.html5_support? && self == Loofah::HTML5::DocumentFragment
182
+ Loofah::HTML5::Document
183
+ elsif self == Loofah::HTML4::DocumentFragment
184
+ Loofah::HTML4::Document
185
+ else
186
+ raise ArgumentError, "unexpected class: #{self}"
187
+ end
188
+ end
189
+ end
190
+
191
+ class << self
192
+ def included(base)
193
+ base.extend(ClassMethods)
194
+ end
195
+ end
196
+
197
+ def to_s
198
+ serialize_root.children.to_s
199
+ end
200
+
201
+ alias_method :serialize, :to_s
202
+
203
+ def serialize_root
204
+ at_xpath("./body") || self
205
+ end
206
+ end
207
+ end
@@ -1,88 +1,90 @@
1
1
  # frozen_string_literal: true
2
+
2
3
  require "set"
3
4
 
4
5
  module Loofah
5
6
  module Elements
6
- STRICT_BLOCK_LEVEL_HTML4 = Set.new %w[
7
- address
8
- blockquote
9
- center
10
- dir
11
- div
12
- dl
13
- fieldset
14
- form
15
- h1
16
- h2
17
- h3
18
- h4
19
- h5
20
- h6
21
- hr
22
- isindex
23
- menu
24
- noframes
25
- noscript
26
- ol
27
- p
28
- pre
29
- table
30
- ul
31
- ]
7
+ STRICT_BLOCK_LEVEL_HTML4 = Set.new([
8
+ "address",
9
+ "blockquote",
10
+ "center",
11
+ "dir",
12
+ "div",
13
+ "dl",
14
+ "fieldset",
15
+ "form",
16
+ "h1",
17
+ "h2",
18
+ "h3",
19
+ "h4",
20
+ "h5",
21
+ "h6",
22
+ "hr",
23
+ "isindex",
24
+ "menu",
25
+ "noframes",
26
+ "noscript",
27
+ "ol",
28
+ "p",
29
+ "pre",
30
+ "table",
31
+ "ul",
32
+ ])
32
33
 
33
34
  # https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements
34
- STRICT_BLOCK_LEVEL_HTML5 = Set.new %w[
35
- address
36
- article
37
- aside
38
- blockquote
39
- canvas
40
- dd
41
- div
42
- dl
43
- dt
44
- fieldset
45
- figcaption
46
- figure
47
- footer
48
- form
49
- h1
50
- h2
51
- h3
52
- h4
53
- h5
54
- h6
55
- header
56
- hgroup
57
- hr
58
- li
59
- main
60
- nav
61
- noscript
62
- ol
63
- output
64
- p
65
- pre
66
- section
67
- table
68
- tfoot
69
- ul
70
- video
71
- ]
35
+ STRICT_BLOCK_LEVEL_HTML5 = Set.new([
36
+ "address",
37
+ "article",
38
+ "aside",
39
+ "blockquote",
40
+ "canvas",
41
+ "dd",
42
+ "div",
43
+ "dl",
44
+ "dt",
45
+ "fieldset",
46
+ "figcaption",
47
+ "figure",
48
+ "footer",
49
+ "form",
50
+ "h1",
51
+ "h2",
52
+ "h3",
53
+ "h4",
54
+ "h5",
55
+ "h6",
56
+ "header",
57
+ "hgroup",
58
+ "hr",
59
+ "li",
60
+ "main",
61
+ "nav",
62
+ "noscript",
63
+ "ol",
64
+ "output",
65
+ "p",
66
+ "pre",
67
+ "section",
68
+ "table",
69
+ "tfoot",
70
+ "ul",
71
+ "video",
72
+ ])
72
73
 
73
74
  # The following elements may also be considered block-level
74
75
  # elements since they may contain block-level elements
75
- LOOSE_BLOCK_LEVEL = Set.new %w[dd
76
- dt
77
- frameset
78
- li
79
- tbody
80
- td
81
- tfoot
82
- th
83
- thead
84
- tr
85
- ]
76
+ LOOSE_BLOCK_LEVEL = Set.new([
77
+ "dd",
78
+ "dt",
79
+ "frameset",
80
+ "li",
81
+ "tbody",
82
+ "td",
83
+ "tfoot",
84
+ "th",
85
+ "thead",
86
+ "tr",
87
+ ])
86
88
 
87
89
  # Elements that aren't block but should generate a newline in #to_text
88
90
  INLINE_LINE_BREAK = Set.new(["br"])
@@ -92,5 +94,5 @@ module Loofah
92
94
  LINEBREAKERS = BLOCK_LEVEL + INLINE_LINE_BREAK
93
95
  end
94
96
 
95
- ::Loofah::MetaHelpers.add_downcased_set_members_to_all_set_constants ::Loofah::Elements
97
+ ::Loofah::MetaHelpers.add_downcased_set_members_to_all_set_constants(::Loofah::Elements)
96
98
  end
@@ -1,43 +1,47 @@
1
1
  # frozen_string_literal: true
2
+
2
3
  module Loofah
3
4
  module Helpers
4
5
  class << self
5
6
  #
6
7
  # A replacement for Rails's built-in +strip_tags+ helper.
7
8
  #
8
- # Loofah::Helpers.strip_tags("<div>Hello <b>there</b></div>") # => "Hello there"
9
+ # Loofah::Helpers.strip_tags("<div>Hello <b>there</b></div>") # => "Hello there"
9
10
  #
10
11
  def strip_tags(string_or_io)
11
- Loofah.fragment(string_or_io).text
12
+ Loofah.html4_fragment(string_or_io).text
12
13
  end
13
14
 
14
15
  #
15
16
  # A replacement for Rails's built-in +sanitize+ helper.
16
17
  #
17
- # Loofah::Helpers.sanitize("<script src=http://ha.ckers.org/xss.js></script>") # => "&lt;script src=\"http://ha.ckers.org/xss.js\"&gt;&lt;/script&gt;"
18
+ # Loofah::Helpers.sanitize("<script src=http://ha.ckers.org/xss.js></script>")
19
+ # # => "&lt;script src=\"http://ha.ckers.org/xss.js\"&gt;&lt;/script&gt;"
18
20
  #
19
21
  def sanitize(string_or_io)
20
- loofah_fragment = Loofah.fragment(string_or_io)
22
+ loofah_fragment = Loofah.html4_fragment(string_or_io)
21
23
  loofah_fragment.scrub!(:strip)
22
- loofah_fragment.xpath("./form").each { |form| form.remove }
24
+ loofah_fragment.xpath("./form").each(&:remove)
23
25
  loofah_fragment.to_s
24
26
  end
25
27
 
26
28
  #
27
29
  # A replacement for Rails's built-in +sanitize_css+ helper.
28
30
  #
29
- # Loofah::Helpers.sanitize_css("display:block;background-image:url(http://www.ragingplatypus.com/i/cam-full.jpg)") # => "display: block;"
31
+ # Loofah::Helpers.sanitize_css("display:block;background-image:url(http://example.com/foo.jpg)")
32
+ # # => "display: block;"
30
33
  #
31
34
  def sanitize_css(style_string)
32
- ::Loofah::HTML5::Scrub.scrub_css style_string
35
+ ::Loofah::HTML5::Scrub.scrub_css(style_string)
33
36
  end
34
37
 
35
38
  #
36
- # A helper to remove extraneous whitespace from text-ified HTML
39
+ # A helper to remove extraneous whitespace from text-ified HTML.
40
+ #
37
41
  # TODO: remove this in a future major-point-release.
38
42
  #
39
43
  def remove_extraneous_whitespace(string)
40
- Loofah.remove_extraneous_whitespace string
44
+ Loofah.remove_extraneous_whitespace(string)
41
45
  end
42
46
  end
43
47
 
@@ -52,7 +56,7 @@ module Loofah
52
56
  end
53
57
 
54
58
  def white_list_sanitizer
55
- warn "warning: white_list_sanitizer is deprecated, please use safe_list_sanitizer instead."
59
+ warn("warning: white_list_sanitizer is deprecated, please use safe_list_sanitizer instead.")
56
60
  safe_list_sanitizer
57
61
  end
58
62
  end
@@ -62,7 +66,8 @@ module Loofah
62
66
  #
63
67
  # To use by default, call this in an application initializer:
64
68
  #
65
- # ActionView::Helpers::SanitizeHelper.full_sanitizer = ::Loofah::Helpers::ActionView::FullSanitizer.new
69
+ # ActionView::Helpers::SanitizeHelper.full_sanitizer = \
70
+ # Loofah::Helpers::ActionView::FullSanitizer.new
66
71
  #
67
72
  # Or, to generally opt-in to Loofah's view sanitizers:
68
73
  #
@@ -70,7 +75,7 @@ module Loofah
70
75
  #
71
76
  class FullSanitizer
72
77
  def sanitize(html, *args)
73
- Loofah::Helpers.strip_tags html
78
+ Loofah::Helpers.strip_tags(html)
74
79
  end
75
80
  end
76
81
 
@@ -79,7 +84,8 @@ module Loofah
79
84
  #
80
85
  # To use by default, call this in an application initializer:
81
86
  #
82
- # ActionView::Helpers::SanitizeHelper.safe_list_sanitizer = ::Loofah::Helpers::ActionView::SafeListSanitizer.new
87
+ # ActionView::Helpers::SanitizeHelper.safe_list_sanitizer = \
88
+ # Loofah::Helpers::ActionView::SafeListSanitizer.new
83
89
  #
84
90
  # Or, to generally opt-in to Loofah's view sanitizers:
85
91
  #
@@ -87,11 +93,11 @@ module Loofah
87
93
  #
88
94
  class SafeListSanitizer
89
95
  def sanitize(html, *args)
90
- Loofah::Helpers.sanitize html
96
+ Loofah::Helpers.sanitize(html)
91
97
  end
92
98
 
93
99
  def sanitize_css(style_string, *args)
94
- Loofah::Helpers.sanitize_css style_string
100
+ Loofah::Helpers.sanitize_css(style_string)
95
101
  end
96
102
  end
97
103
 
@@ -1,19 +1,17 @@
1
1
  # frozen_string_literal: true
2
+
2
3
  module Loofah
3
- module HTML # :nodoc:
4
+ module HTML4 # :nodoc:
4
5
  #
5
- # Subclass of Nokogiri::HTML::Document.
6
+ # Subclass of Nokogiri::HTML4::Document.
6
7
  #
7
8
  # See Loofah::ScrubBehavior and Loofah::TextBehavior for additional methods.
8
9
  #
9
- class Document < Nokogiri::HTML::Document
10
+ class Document < Nokogiri::HTML4::Document
10
11
  include Loofah::ScrubBehavior::Node
11
12
  include Loofah::DocumentDecorator
12
13
  include Loofah::TextBehavior
13
-
14
- def serialize_root
15
- at_xpath("/html/body")
16
- end
14
+ include Loofah::HtmlDocumentBehavior
17
15
  end
18
16
  end
19
17
  end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Loofah
4
+ module HTML4 # :nodoc:
5
+ #
6
+ # Subclass of Nokogiri::HTML4::DocumentFragment.
7
+ #
8
+ # See Loofah::ScrubBehavior and Loofah::TextBehavior for additional methods.
9
+ #
10
+ class DocumentFragment < Nokogiri::HTML4::DocumentFragment
11
+ include Loofah::TextBehavior
12
+ include Loofah::HtmlFragmentBehavior
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Loofah
4
+ module HTML5 # :nodoc:
5
+ #
6
+ # Subclass of Nokogiri::HTML5::Document.
7
+ #
8
+ # See Loofah::ScrubBehavior and Loofah::TextBehavior for additional methods.
9
+ #
10
+ class Document < Nokogiri::HTML5::Document
11
+ include Loofah::ScrubBehavior::Node
12
+ include Loofah::DocumentDecorator
13
+ include Loofah::TextBehavior
14
+ include Loofah::HtmlDocumentBehavior
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Loofah
4
+ module HTML5 # :nodoc:
5
+ #
6
+ # Subclass of Nokogiri::HTML5::DocumentFragment.
7
+ #
8
+ # See Loofah::ScrubBehavior and Loofah::TextBehavior for additional methods.
9
+ #
10
+ class DocumentFragment < Nokogiri::HTML5::DocumentFragment
11
+ include Loofah::TextBehavior
12
+ include Loofah::HtmlFragmentBehavior
13
+ end
14
+ end
15
+ end
@@ -1,5 +1,6 @@
1
1
  # coding: utf-8
2
2
  # frozen_string_literal: true
3
+
3
4
  require "set"
4
5
 
5
6
  module Loofah
@@ -16,12 +17,12 @@ module Loofah
16
17
  #
17
18
  # see comments about CVE-2018-8048 within the tests for more information
18
19
  #
19
- BROKEN_ESCAPING_ATTRIBUTES = Set.new %w[
20
- href
21
- action
22
- src
23
- name
24
- ]
20
+ BROKEN_ESCAPING_ATTRIBUTES = Set.new([
21
+ "href",
22
+ "action",
23
+ "src",
24
+ "name",
25
+ ])
25
26
  BROKEN_ESCAPING_ATTRIBUTES_QUALIFYING_TAG = { "name" => "a" }
26
27
  end
27
28
  end