loofah 2.20.0 → 2.21.0.rc1

Sign up to get free protection for your applications and to get access to all the features.
data/lib/loofah.rb CHANGED
@@ -1,8 +1,24 @@
1
1
  # frozen_string_literal: true
2
- $LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__))) unless $LOAD_PATH.include?(File.expand_path(File.dirname(__FILE__)))
3
2
 
4
3
  require "nokogiri"
5
4
 
5
module Loofah
  class << self
    # Reports whether Loofah can offer HTML5 parsing with the currently loaded Nokogiri.
    #
    # Two conditions must both hold:
    #
    # - Nokogiri::VERSION is >= 1.14.0. Loofah can only support HTML5 from that version on
    #   because it requires the subclassing fix from
    #   https://github.com/sparklemotion/nokogiri/pull/2534
    # - Nokogiri.uses_gumbo? is true.
    #
    # The result is computed on the first call and cached, so later calls return the cached
    # value without consulting Nokogiri again.
    #
    # Returns true or false.
    def html5_support?
      # A separate "already computed" flag is used instead of `||=` so that a cached `false`
      # is not recomputed on every call.
      unless @html5_support_set
        @html5_support = (
          # `>=` (not `>`): 1.14.0 itself satisfies the documented requirement. The version
          # check runs first, so `uses_gumbo?` is only called when the version check passes.
          Gem::Version.new(Nokogiri::VERSION) >= Gem::Version.new("1.14.0") &&
          Nokogiri.uses_gumbo?
        )
        @html5_support_set = true
      end
      @html5_support
    end
  end
end
21
+
6
22
  require_relative "loofah/version"
7
23
  require_relative "loofah/metahelpers"
8
24
  require_relative "loofah/elements"
@@ -14,51 +30,129 @@ require_relative "loofah/html5/scrub"
14
30
  require_relative "loofah/scrubber"
15
31
  require_relative "loofah/scrubbers"
16
32
 
17
- require_relative "loofah/instance_methods"
33
+ require_relative "loofah/concerns"
18
34
  require_relative "loofah/xml/document"
19
35
  require_relative "loofah/xml/document_fragment"
20
- require_relative "loofah/html/document"
21
- require_relative "loofah/html/document_fragment"
36
+ require_relative "loofah/html4/document"
37
+ require_relative "loofah/html4/document_fragment"
38
+
39
# Load the HTML5 document classes only when Loofah.html5_support? is true. This is the same
# predicate that decides whether the Loofah.html5_* shortcut methods delegate to
# Loofah::HTML5 or raise NotImplementedError, so the loaded classes and the public entry
# points always agree. The predicate checks the Nokogiri version before it calls
# Nokogiri.uses_gumbo?, so no separate respond_to? guard is needed here.
if Loofah.html5_support?
  require_relative "loofah/html5/document"
  require_relative "loofah/html5/document_fragment"
end
22
43
 
23
44
  # == Strings and IO Objects as Input
24
45
  #
25
- # Loofah.document and Loofah.fragment accept any IO object in addition
26
- # to accepting a string. That IO object could be a file, or a socket,
27
- # or a StringIO, or anything that responds to +read+ and
28
- # +close+. Which makes it particularly easy to sanitize mass
29
- # quantities of docs.
46
+ # The following methods accept any IO object in addition to accepting a string:
47
+ #
48
+ # - Loofah.html4_document
49
+ # - Loofah.html4_fragment
50
+ # - Loofah.scrub_html4_document
51
+ # - Loofah.scrub_html4_fragment
52
+ #
53
+ # - Loofah.html5_document
54
+ # - Loofah.html5_fragment
55
+ # - Loofah.scrub_html5_document
56
+ # - Loofah.scrub_html5_fragment
57
+ #
58
+ # - Loofah.xml_document
59
+ # - Loofah.xml_fragment
60
+ # - Loofah.scrub_xml_document
61
+ # - Loofah.scrub_xml_fragment
62
+ #
63
+ # - Loofah.document
64
+ # - Loofah.fragment
65
+ # - Loofah.scrub_document
66
+ # - Loofah.scrub_fragment
67
+ #
68
+ # That IO object could be a file, or a socket, or a StringIO, or anything that responds to +read+
69
+ # and +close+.
30
70
  #
31
71
  module Loofah
72
+ # Alias for Loofah::HTML4
73
+ HTML = HTML4
74
+
32
75
  class << self
33
- # Shortcut for Loofah::HTML::Document.parse
34
- # This method accepts the same parameters as Nokogiri::HTML::Document.parse
35
- def document(*args, &block)
36
- remove_comments_before_html_element Loofah::HTML::Document.parse(*args, &block)
76
+ # Shortcut for Loofah::HTML4::Document.parse(*args, &block)
77
+ #
78
+ # This method accepts the same parameters as Nokogiri::HTML4::Document.parse
79
+ def html4_document(*args, &block)
80
+ Loofah::HTML4::Document.parse(*args, &block)
37
81
  end
38
82
 
39
- # Shortcut for Loofah::HTML::DocumentFragment.parse
40
- # This method accepts the same parameters as Nokogiri::HTML::DocumentFragment.parse
41
- def fragment(*args, &block)
42
- Loofah::HTML::DocumentFragment.parse(*args, &block)
83
+ # Shortcut for Loofah::HTML4::DocumentFragment.parse(*args, &block)
84
+ #
85
+ # This method accepts the same parameters as Nokogiri::HTML4::DocumentFragment.parse
86
+ def html4_fragment(*args, &block)
87
+ Loofah::HTML4::DocumentFragment.parse(*args, &block)
43
88
  end
44
89
 
45
- # Shortcut for Loofah.fragment(string_or_io).scrub!(method)
46
- def scrub_fragment(string_or_io, method)
47
- Loofah.fragment(string_or_io).scrub!(method)
90
+ # Shortcut for Loofah::HTML4::Document.parse(string_or_io).scrub!(method)
91
+ def scrub_html4_document(string_or_io, method)
92
+ Loofah::HTML4::Document.parse(string_or_io).scrub!(method)
48
93
  end
49
94
 
50
- # Shortcut for Loofah.document(string_or_io).scrub!(method)
51
- def scrub_document(string_or_io, method)
52
- Loofah.document(string_or_io).scrub!(method)
95
+ # Shortcut for Loofah::HTML4::DocumentFragment.parse(string_or_io).scrub!(method)
96
+ def scrub_html4_fragment(string_or_io, method)
97
+ Loofah::HTML4::DocumentFragment.parse(string_or_io).scrub!(method)
53
98
  end
54
99
 
55
- # Shortcut for Loofah::XML::Document.parse
100
if Loofah.html5_support?
  # Builds an HTML5 document by handing every argument (and the optional block) straight to
  # Loofah::HTML5::Document.parse.
  #
  # This method accepts the same parameters as Nokogiri::HTML5::Document.parse
  def html5_document(*args, &block)
    Loofah::HTML5::Document.parse(*args, &block)
  end

  # Builds an HTML5 fragment by handing every argument (and the optional block) straight to
  # Loofah::HTML5::DocumentFragment.parse.
  #
  # This method accepts the same parameters as Nokogiri::HTML5::DocumentFragment.parse
  def html5_fragment(*args, &block)
    Loofah::HTML5::DocumentFragment.parse(*args, &block)
  end

  # Parses +string_or_io+ as an HTML5 document, then scrubs it in place with +method+.
  # Returns whatever +scrub!+ returns.
  def scrub_html5_document(string_or_io, method)
    parsed = Loofah::HTML5::Document.parse(string_or_io)
    parsed.scrub!(method)
  end

  # Parses +string_or_io+ as an HTML5 fragment, then scrubs it in place with +method+.
  # Returns whatever +scrub!+ returns.
  def scrub_html5_fragment(string_or_io, method)
    parsed = Loofah::HTML5::DocumentFragment.parse(string_or_io)
    parsed.scrub!(method)
  end
else
  # Without HTML5 support the same four entry points still exist, with the same signatures,
  # but every one of them raises NotImplementedError.

  def html5_document(*args, &block)
    raise(NotImplementedError, "HTML5 is not supported by your version of Nokogiri")
  end

  def html5_fragment(*args, &block)
    raise(NotImplementedError, "HTML5 is not supported by your version of Nokogiri")
  end

  def scrub_html5_document(string_or_io, method)
    raise(NotImplementedError, "HTML5 is not supported by your version of Nokogiri")
  end

  def scrub_html5_fragment(string_or_io, method)
    raise(NotImplementedError, "HTML5 is not supported by your version of Nokogiri")
  end
end
141
+
142
+ alias_method :document, :html4_document
143
+ alias_method :fragment, :html4_fragment
144
+ alias_method :scrub_document, :scrub_html4_document
145
+ alias_method :scrub_fragment, :scrub_html4_fragment
146
+
147
+ # Shortcut for Loofah::XML::Document.parse(*args, &block)
148
+ #
56
149
  # This method accepts the same parameters as Nokogiri::XML::Document.parse
57
150
  def xml_document(*args, &block)
58
151
  Loofah::XML::Document.parse(*args, &block)
59
152
  end
60
153
 
61
- # Shortcut for Loofah::XML::DocumentFragment.parse
154
+ # Shortcut for Loofah::XML::DocumentFragment.parse(*args, &block)
155
+ #
62
156
  # This method accepts the same parameters as Nokogiri::XML::DocumentFragment.parse
63
157
  def xml_fragment(*args, &block)
64
158
  Loofah::XML::DocumentFragment.parse(*args, &block)
@@ -78,23 +172,5 @@ module Loofah
78
172
  def remove_extraneous_whitespace(string)
79
173
  string.gsub(/\n\s*\n\s*\n/, "\n\n")
80
174
  end
81
-
82
- private
83
-
84
- # remove comments that exist outside of the HTML element.
85
- #
86
- # these comments are allowed by the HTML spec:
87
- #
88
- # https://www.w3.org/TR/html401/struct/global.html#h-7.1
89
- #
90
- # but are not scrubbed by Loofah because these nodes don't meet
91
- # the contract that scrubbers expect of a node (e.g., it can be
92
- # replaced, sibling and children nodes can be created).
93
- def remove_comments_before_html_element(doc)
94
- doc.children.each do |child|
95
- child.unlink if child.comment?
96
- end
97
- doc
98
- end
99
175
  end
100
176
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: loofah
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.20.0
4
+ version: 2.21.0.rc1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mike Dalessio
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2023-04-01 00:00:00.000000000 Z
12
+ date: 2023-04-03 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: crass
@@ -39,102 +39,12 @@ dependencies:
39
39
  - - ">="
40
40
  - !ruby/object:Gem::Version
41
41
  version: 1.5.9
42
- - !ruby/object:Gem::Dependency
43
- name: hoe-markdown
44
- requirement: !ruby/object:Gem::Requirement
45
- requirements:
46
- - - "~>"
47
- - !ruby/object:Gem::Version
48
- version: '1.3'
49
- type: :development
50
- prerelease: false
51
- version_requirements: !ruby/object:Gem::Requirement
52
- requirements:
53
- - - "~>"
54
- - !ruby/object:Gem::Version
55
- version: '1.3'
56
- - !ruby/object:Gem::Dependency
57
- name: json
58
- requirement: !ruby/object:Gem::Requirement
59
- requirements:
60
- - - "~>"
61
- - !ruby/object:Gem::Version
62
- version: '2.2'
63
- type: :development
64
- prerelease: false
65
- version_requirements: !ruby/object:Gem::Requirement
66
- requirements:
67
- - - "~>"
68
- - !ruby/object:Gem::Version
69
- version: '2.2'
70
- - !ruby/object:Gem::Dependency
71
- name: minitest
72
- requirement: !ruby/object:Gem::Requirement
73
- requirements:
74
- - - "~>"
75
- - !ruby/object:Gem::Version
76
- version: '5.14'
77
- type: :development
78
- prerelease: false
79
- version_requirements: !ruby/object:Gem::Requirement
80
- requirements:
81
- - - "~>"
82
- - !ruby/object:Gem::Version
83
- version: '5.14'
84
- - !ruby/object:Gem::Dependency
85
- name: rake
86
- requirement: !ruby/object:Gem::Requirement
87
- requirements:
88
- - - "~>"
89
- - !ruby/object:Gem::Version
90
- version: '13.0'
91
- type: :development
92
- prerelease: false
93
- version_requirements: !ruby/object:Gem::Requirement
94
- requirements:
95
- - - "~>"
96
- - !ruby/object:Gem::Version
97
- version: '13.0'
98
- - !ruby/object:Gem::Dependency
99
- name: rdoc
100
- requirement: !ruby/object:Gem::Requirement
101
- requirements:
102
- - - ">="
103
- - !ruby/object:Gem::Version
104
- version: '4.0'
105
- - - "<"
106
- - !ruby/object:Gem::Version
107
- version: '7'
108
- type: :development
109
- prerelease: false
110
- version_requirements: !ruby/object:Gem::Requirement
111
- requirements:
112
- - - ">="
113
- - !ruby/object:Gem::Version
114
- version: '4.0'
115
- - - "<"
116
- - !ruby/object:Gem::Version
117
- version: '7'
118
- - !ruby/object:Gem::Dependency
119
- name: rubocop
120
- requirement: !ruby/object:Gem::Requirement
121
- requirements:
122
- - - "~>"
123
- - !ruby/object:Gem::Version
124
- version: '1.1'
125
- type: :development
126
- prerelease: false
127
- version_requirements: !ruby/object:Gem::Requirement
128
- requirements:
129
- - - "~>"
130
- - !ruby/object:Gem::Version
131
- version: '1.1'
132
- description: |-
133
- Loofah is a general library for manipulating and transforming HTML/XML documents and fragments, built on top of Nokogiri.
134
-
135
- Loofah excels at HTML sanitization (XSS prevention). It includes some nice HTML sanitizers, which are based on HTML5lib's safelist, so it most likely won't make your codes less secure. (These statements have not been evaluated by Netexperts.)
42
+ description: |
43
+ Loofah is a general library for manipulating and transforming HTML/XML documents and fragments,
44
+ built on top of Nokogiri.
136
45
 
137
- ActiveRecord extensions for sanitization are available in the [`loofah-activerecord` gem](https://github.com/flavorjones/loofah-activerecord).
46
+ Loofah also includes some HTML sanitizers based on `html5lib`'s safelist, which are a specific
47
+ application of the general transformation functionality.
138
48
  email:
139
49
  - mike.dalessio@gmail.com
140
50
  - bryan@brynary.com
@@ -147,14 +57,16 @@ files:
147
57
  - README.md
148
58
  - SECURITY.md
149
59
  - lib/loofah.rb
60
+ - lib/loofah/concerns.rb
150
61
  - lib/loofah/elements.rb
151
62
  - lib/loofah/helpers.rb
152
- - lib/loofah/html/document.rb
153
- - lib/loofah/html/document_fragment.rb
63
+ - lib/loofah/html4/document.rb
64
+ - lib/loofah/html4/document_fragment.rb
65
+ - lib/loofah/html5/document.rb
66
+ - lib/loofah/html5/document_fragment.rb
154
67
  - lib/loofah/html5/libxml2_workarounds.rb
155
68
  - lib/loofah/html5/safelist.rb
156
69
  - lib/loofah/html5/scrub.rb
157
- - lib/loofah/instance_methods.rb
158
70
  - lib/loofah/metahelpers.rb
159
71
  - lib/loofah/scrubber.rb
160
72
  - lib/loofah/scrubbers.rb
@@ -181,13 +93,13 @@ required_ruby_version: !ruby/object:Gem::Requirement
181
93
  version: '0'
182
94
  required_rubygems_version: !ruby/object:Gem::Requirement
183
95
  requirements:
184
- - - ">="
96
+ - - ">"
185
97
  - !ruby/object:Gem::Version
186
- version: '0'
98
+ version: 1.3.1
187
99
  requirements: []
188
100
  rubygems_version: 3.4.10
189
101
  signing_key:
190
102
  specification_version: 4
191
103
  summary: Loofah is a general library for manipulating and transforming HTML/XML documents
192
- and fragments, built on top of Nokogiri
104
+ and fragments, built on top of Nokogiri.
193
105
  test_files: []
@@ -1,42 +0,0 @@
1
- # frozen_string_literal: true
2
- module Loofah
3
- module HTML # :nodoc:
4
- #
5
- # Subclass of Nokogiri::HTML::DocumentFragment.
6
- #
7
- # See Loofah::ScrubBehavior and Loofah::TextBehavior for additional methods.
8
- #
9
- class DocumentFragment < Nokogiri::HTML::DocumentFragment
10
- include Loofah::TextBehavior
11
-
12
- class << self
13
- #
14
- # Overridden Nokogiri::HTML::DocumentFragment
15
- # constructor. Applications should use Loofah.fragment to
16
- # parse a fragment.
17
- #
18
- def parse(tags, encoding = nil)
19
- doc = Loofah::HTML::Document.new
20
-
21
- encoding ||= tags.respond_to?(:encoding) ? tags.encoding.name : "UTF-8"
22
- doc.encoding = encoding
23
-
24
- new(doc, tags)
25
- end
26
- end
27
-
28
- #
29
- # Returns the HTML markup contained by the fragment
30
- #
31
- def to_s
32
- serialize_root.children.to_s
33
- end
34
-
35
- alias :serialize :to_s
36
-
37
- def serialize_root
38
- at_xpath("./body") || self
39
- end
40
- end
41
- end
42
- end
@@ -1,133 +0,0 @@
1
- # frozen_string_literal: true
2
- module Loofah
3
- #
4
- # Mixes +scrub!+ into Document, DocumentFragment, Node and NodeSet.
5
- #
6
- # Traverse the document or fragment, invoking the +scrubber+ on
7
- # each node.
8
- #
9
- # +scrubber+ must either be one of the symbols representing the
10
- # built-in scrubbers (see Scrubbers), or a Scrubber instance.
11
- #
12
- # span2div = Loofah::Scrubber.new do |node|
13
- # node.name = "div" if node.name == "span"
14
- # end
15
- # Loofah.fragment("<span>foo</span><p>bar</p>").scrub!(span2div).to_s
16
- # # => "<div>foo</div><p>bar</p>"
17
- #
18
- # or
19
- #
20
- # unsafe_html = "ohai! <div>div is safe</div> <script>but script is not</script>"
21
- # Loofah.fragment(unsafe_html).scrub!(:strip).to_s
22
- # # => "ohai! <div>div is safe</div> "
23
- #
24
- # Note that this method is called implicitly from
25
- # Loofah.scrub_fragment and Loofah.scrub_document.
26
- #
27
- # Please see Scrubber for more information on implementation and traversal, and
28
- # README.rdoc for more example usage.
29
- #
30
- module ScrubBehavior
31
- module Node # :nodoc:
32
- def scrub!(scrubber)
33
- #
34
- # yes. this should be three separate methods. but nokogiri
35
- # decorates (or not) based on whether the module name has
36
- # already been included. and since documents get decorated
37
- # just like their constituent nodes, we need to jam all the
38
- # logic into a single module.
39
- #
40
- scrubber = ScrubBehavior.resolve_scrubber(scrubber)
41
- case self
42
- when Nokogiri::XML::Document
43
- scrubber.traverse(root) if root
44
- when Nokogiri::XML::DocumentFragment
45
- children.scrub! scrubber
46
- else
47
- scrubber.traverse(self)
48
- end
49
- self
50
- end
51
- end
52
-
53
- module NodeSet # :nodoc:
54
- def scrub!(scrubber)
55
- each { |node| node.scrub!(scrubber) }
56
- self
57
- end
58
- end
59
-
60
- def ScrubBehavior.resolve_scrubber(scrubber) # :nodoc:
61
- scrubber = Scrubbers::MAP[scrubber].new if Scrubbers::MAP[scrubber]
62
- unless scrubber.is_a?(Loofah::Scrubber)
63
- raise Loofah::ScrubberNotFound, "not a Scrubber or a scrubber name: #{scrubber.inspect}"
64
- end
65
- scrubber
66
- end
67
- end
68
-
69
- #
70
- # Overrides +text+ in HTML::Document and HTML::DocumentFragment,
71
- # and mixes in +to_text+.
72
- #
73
- module TextBehavior
74
- #
75
- # Returns a plain-text version of the markup contained by the document,
76
- # with HTML entities encoded.
77
- #
78
- # This method is significantly faster than #to_text, but isn't
79
- # clever about whitespace around block elements.
80
- #
81
- # Loofah.document("<h1>Title</h1><div>Content</div>").text
82
- # # => "TitleContent"
83
- #
84
- # By default, the returned text will have HTML entities
85
- # escaped. If you want unescaped entities, and you understand
86
- # that the result is unsafe to render in a browser, then you
87
- # can pass an argument as shown:
88
- #
89
- # frag = Loofah.fragment("&lt;script&gt;alert('EVIL');&lt;/script&gt;")
90
- # # ok for browser:
91
- # frag.text # => "&lt;script&gt;alert('EVIL');&lt;/script&gt;"
92
- # # decidedly not ok for browser:
93
- # frag.text(:encode_special_chars => false) # => "<script>alert('EVIL');</script>"
94
- #
95
- def text(options = {})
96
- result = if serialize_root
97
- serialize_root.children.reject(&:comment?).map(&:inner_text).join("")
98
- else
99
- ""
100
- end
101
- if options[:encode_special_chars] == false
102
- result # possibly dangerous if rendered in a browser
103
- else
104
- encode_special_chars result
105
- end
106
- end
107
-
108
- alias :inner_text :text
109
- alias :to_str :text
110
-
111
- #
112
- # Returns a plain-text version of the markup contained by the
113
- # fragment, with HTML entities encoded.
114
- #
115
- # This method is slower than #text, but is clever about
116
- # whitespace around block elements and line break elements.
117
- #
118
- # Loofah.document("<h1>Title</h1><div>Content<br>Next line</div>").to_text
119
- # # => "\nTitle\n\nContent\nNext line\n"
120
- #
121
- def to_text(options = {})
122
- Loofah.remove_extraneous_whitespace self.dup.scrub!(:newline_block_elements).text(options)
123
- end
124
- end
125
-
126
- module DocumentDecorator # :nodoc:
127
- def initialize(*args, &block)
128
- super
129
- self.decorators(Nokogiri::XML::Node) << ScrubBehavior::Node
130
- self.decorators(Nokogiri::XML::NodeSet) << ScrubBehavior::NodeSet
131
- end
132
- end
133
- end