sanitize 4.6.6 → 5.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sanitize might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/HISTORY.md +147 -16
- data/README.md +61 -41
- data/lib/sanitize.rb +37 -61
- data/lib/sanitize/config/default.rb +10 -4
- data/lib/sanitize/css.rb +2 -2
- data/lib/sanitize/transformers/clean_comment.rb +1 -1
- data/lib/sanitize/transformers/clean_css.rb +3 -3
- data/lib/sanitize/transformers/clean_doctype.rb +1 -1
- data/lib/sanitize/transformers/clean_element.rb +54 -13
- data/lib/sanitize/version.rb +1 -1
- data/test/common.rb +0 -31
- data/test/test_clean_comment.rb +1 -5
- data/test/test_clean_css.rb +1 -1
- data/test/test_clean_doctype.rb +8 -8
- data/test/test_clean_element.rb +121 -26
- data/test/test_malicious_html.rb +50 -7
- data/test/test_parser.rb +3 -32
- data/test/test_sanitize.rb +103 -18
- data/test/test_sanitize_css.rb +43 -16
- data/test/test_transformers.rb +29 -23
- metadata +16 -18
- data/test/test_unicode.rb +0 -95
    
        data/lib/sanitize.rb
    CHANGED
    
    | @@ -19,6 +19,20 @@ require_relative 'sanitize/transformers/clean_element' | |
| 19 19 | 
             
            class Sanitize
         | 
| 20 20 | 
             
              attr_reader :config
         | 
| 21 21 |  | 
| 22 | 
            +
              # Matches one or more control characters that should be removed from HTML
         | 
| 23 | 
            +
              # before parsing, as defined by the HTML living standard.
         | 
| 24 | 
            +
              #
         | 
| 25 | 
            +
              # -   https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
         | 
| 26 | 
            +
              # -   https://infra.spec.whatwg.org/#control
         | 
| 27 | 
            +
              REGEX_HTML_CONTROL_CHARACTERS = /[\u0001-\u0008\u000b\u000e-\u001f\u007f-\u009f]+/u
         | 
| 28 | 
            +
             | 
| 29 | 
            +
              # Matches one or more non-characters that should be removed from HTML before
         | 
| 30 | 
            +
              # parsing, as defined by the HTML living standard.
         | 
| 31 | 
            +
              #
         | 
| 32 | 
            +
              # -   https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
         | 
| 33 | 
            +
              # -   https://infra.spec.whatwg.org/#noncharacter
         | 
| 34 | 
            +
              REGEX_HTML_NON_CHARACTERS = /[\ufdd0-\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}]+/u
         | 
| 35 | 
            +
             | 
| 22 36 | 
             
              # Matches an attribute value that could be treated by a browser as a URL
         | 
| 23 37 | 
             
              # with a protocol prefix, such as "http:" or "javascript:". Any string of zero
         | 
| 24 38 | 
             
              # or more characters followed by a colon is considered a match, even if the
         | 
| @@ -26,11 +40,12 @@ class Sanitize | |
| 26 40 | 
             
              # IE6 and Opera will still parse).
         | 
| 27 41 | 
             
              REGEX_PROTOCOL = /\A\s*([^\/#]*?)(?:\:|&#0*58|&#x0*3a)/i
         | 
| 28 42 |  | 
| 29 | 
            -
              # Matches  | 
| 30 | 
            -
              #  | 
| 43 | 
            +
              # Matches one or more characters that should be stripped from HTML before
         | 
| 44 | 
            +
              # parsing. This is a combination of `REGEX_HTML_CONTROL_CHARACTERS` and
         | 
| 45 | 
            +
              # `REGEX_HTML_NON_CHARACTERS`.
         | 
| 31 46 | 
             
              #
         | 
| 32 | 
            -
              #  | 
| 33 | 
            -
              REGEX_UNSUITABLE_CHARS = / | 
| 47 | 
            +
              # https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
         | 
| 48 | 
            +
              REGEX_UNSUITABLE_CHARS = /(?:#{REGEX_HTML_CONTROL_CHARACTERS}|#{REGEX_HTML_NON_CHARACTERS})/u
         | 
| 34 49 |  | 
| 35 50 | 
             
              #--
         | 
| 36 51 | 
             
              # Class Methods
         | 
| @@ -39,7 +54,7 @@ class Sanitize | |
| 39 54 | 
             
              # Returns a sanitized copy of the given full _html_ document, using the
         | 
| 40 55 | 
             
              # settings in _config_ if specified.
         | 
| 41 56 | 
             
              #
         | 
| 42 | 
            -
              # When sanitizing a document, the `<html>` element must be  | 
| 57 | 
            +
              # When sanitizing a document, the `<html>` element must be allowlisted or an
         | 
| 43 58 | 
             
              # error will be raised. If this is undesirable, you should probably use
         | 
| 44 59 | 
             
              # {#fragment} instead.
         | 
| 45 60 | 
             
              def self.document(html, config = {})
         | 
| @@ -102,13 +117,13 @@ class Sanitize | |
| 102 117 |  | 
| 103 118 | 
             
              # Returns a sanitized copy of the given _html_ document.
         | 
| 104 119 | 
             
              #
         | 
| 105 | 
            -
              # When sanitizing a document, the `<html>` element must be  | 
| 120 | 
            +
              # When sanitizing a document, the `<html>` element must be allowlisted or an
         | 
| 106 121 | 
             
              # error will be raised. If this is undesirable, you should probably use
         | 
| 107 122 | 
             
              # {#fragment} instead.
         | 
| 108 123 | 
             
              def document(html)
         | 
| 109 124 | 
             
                return '' unless html
         | 
| 110 125 |  | 
| 111 | 
            -
                doc = Nokogiri::HTML5.parse(preprocess(html))
         | 
| 126 | 
            +
                doc = Nokogiri::HTML5.parse(preprocess(html), **@config[:parser_options])
         | 
| 112 127 | 
             
                node!(doc)
         | 
| 113 128 | 
             
                to_html(doc)
         | 
| 114 129 | 
             
              end
         | 
| @@ -120,20 +135,7 @@ class Sanitize | |
| 120 135 | 
             
              def fragment(html)
         | 
| 121 136 | 
             
                return '' unless html
         | 
| 122 137 |  | 
| 123 | 
            -
                 | 
| 124 | 
            -
                doc  = Nokogiri::HTML5.parse("<html><body>#{html}")
         | 
| 125 | 
            -
             | 
| 126 | 
            -
                # Hack to allow fragments containing <body>. Borrowed from
         | 
| 127 | 
            -
                # Nokogiri::HTML::DocumentFragment.
         | 
| 128 | 
            -
                if html =~ /\A<body(?:\s|>)/i
         | 
| 129 | 
            -
                  path = '/html/body'
         | 
| 130 | 
            -
                else
         | 
| 131 | 
            -
                  path = '/html/body/node()'
         | 
| 132 | 
            -
                end
         | 
| 133 | 
            -
             | 
| 134 | 
            -
                frag = doc.fragment
         | 
| 135 | 
            -
                frag << doc.xpath(path)
         | 
| 136 | 
            -
             | 
| 138 | 
            +
                frag = Nokogiri::HTML5.fragment(preprocess(html), **@config[:parser_options])
         | 
| 137 139 | 
             
                node!(frag)
         | 
| 138 140 | 
             
                to_html(frag)
         | 
| 139 141 | 
             
              end
         | 
| @@ -145,20 +147,20 @@ class Sanitize | |
| 145 147 | 
             
              # in place.
         | 
| 146 148 | 
             
              #
         | 
| 147 149 | 
             
              # If _node_ is a `Nokogiri::XML::Document`, the `<html>` element must be
         | 
| 148 | 
            -
              #  | 
| 150 | 
            +
              # allowlisted or an error will be raised.
         | 
| 149 151 | 
             
              def node!(node)
         | 
| 150 152 | 
             
                raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)
         | 
| 151 153 |  | 
| 152 154 | 
             
                if node.is_a?(Nokogiri::XML::Document)
         | 
| 153 155 | 
             
                  unless @config[:elements].include?('html')
         | 
| 154 | 
            -
                    raise Error, 'When sanitizing a document, "<html>" must be  | 
| 156 | 
            +
                    raise Error, 'When sanitizing a document, "<html>" must be allowlisted.'
         | 
| 155 157 | 
             
                  end
         | 
| 156 158 | 
             
                end
         | 
| 157 159 |  | 
| 158 | 
            -
                 | 
| 160 | 
            +
                node_allowlist = Set.new
         | 
| 159 161 |  | 
| 160 162 | 
             
                traverse(node) do |n|
         | 
| 161 | 
            -
                  transform_node!(n,  | 
| 163 | 
            +
                  transform_node!(n, node_allowlist)
         | 
| 162 164 | 
             
                end
         | 
| 163 165 |  | 
| 164 166 | 
             
                node
         | 
| @@ -184,40 +186,10 @@ class Sanitize | |
| 184 186 | 
             
              end
         | 
| 185 187 |  | 
| 186 188 | 
             
              def to_html(node)
         | 
| 187 | 
            -
                 | 
| 188 | 
            -
             | 
| 189 | 
            -
                # Hacky workaround for a libxml2 bug that adds an undesired Content-Type
         | 
| 190 | 
            -
                # meta tag to all serialized HTML documents.
         | 
| 191 | 
            -
                #
         | 
| 192 | 
            -
                # https://github.com/sparklemotion/nokogiri/issues/1008
         | 
| 193 | 
            -
                if node.type == Nokogiri::XML::Node::DOCUMENT_NODE ||
         | 
| 194 | 
            -
                    node.type == Nokogiri::XML::Node::HTML_DOCUMENT_NODE
         | 
| 195 | 
            -
             | 
| 196 | 
            -
                  regex_meta   = %r|(<html[^>]*>\s*<head[^>]*>\s*)<meta http-equiv="Content-Type" content="text/html; charset=utf-8">|i
         | 
| 197 | 
            -
             | 
| 198 | 
            -
                  # Only replace the content-type meta tag if <meta> isn't whitelisted or
         | 
| 199 | 
            -
                  # the original document didn't actually include a content-type meta tag.
         | 
| 200 | 
            -
                  replace_meta = !@config[:elements].include?('meta') ||
         | 
| 201 | 
            -
                    node.xpath('/html/head/meta[@http-equiv]').none? do |meta|
         | 
| 202 | 
            -
                      meta['http-equiv'].casecmp('content-type').zero?
         | 
| 203 | 
            -
                    end
         | 
| 204 | 
            -
                end
         | 
| 205 | 
            -
             | 
| 206 | 
            -
                so = Nokogiri::XML::Node::SaveOptions
         | 
| 207 | 
            -
             | 
| 208 | 
            -
                # Serialize to HTML without any formatting to prevent Nokogiri from adding
         | 
| 209 | 
            -
                # newlines after certain tags.
         | 
| 210 | 
            -
                html = node.to_html(
         | 
| 211 | 
            -
                  :encoding  => 'utf-8',
         | 
| 212 | 
            -
                  :indent    => 0,
         | 
| 213 | 
            -
                  :save_with => so::NO_DECLARATION | so::NO_EMPTY_TAGS | so::AS_HTML
         | 
| 214 | 
            -
                )
         | 
| 215 | 
            -
             | 
| 216 | 
            -
                html.gsub!(regex_meta, '\1') if replace_meta
         | 
| 217 | 
            -
                html
         | 
| 189 | 
            +
                node.to_html(preserve_newline: true)
         | 
| 218 190 | 
             
              end
         | 
| 219 191 |  | 
| 220 | 
            -
              def transform_node!(node,  | 
| 192 | 
            +
              def transform_node!(node, node_allowlist)
         | 
| 221 193 | 
             
                @transformers.each do |transformer|
         | 
| 222 194 | 
             
                  # Since transform_node! may be called in a tight loop to process thousands
         | 
| 223 195 | 
             
                  # of items, we can optimize both memory and CPU performance by:
         | 
| @@ -227,15 +199,19 @@ class Sanitize | |
| 227 199 | 
             
                  # does merge! create a new hash, it is also 2.6x slower:
         | 
| 228 200 | 
             
                  # https://github.com/JuanitoFatas/fast-ruby#hashmerge-vs-hashmerge-code
         | 
| 229 201 | 
             
                  config = @transformer_config
         | 
| 230 | 
            -
                  config[:is_whitelisted] =  | 
| 202 | 
            +
                  config[:is_allowlisted] = config[:is_whitelisted] = node_allowlist.include?(node)
         | 
| 231 203 | 
             
                  config[:node] = node
         | 
| 232 204 | 
             
                  config[:node_name] = node.name.downcase
         | 
| 233 | 
            -
                  config[:node_whitelist] =  | 
| 205 | 
            +
                  config[:node_allowlist] = config[:node_whitelist] = node_allowlist
         | 
| 234 206 |  | 
| 235 207 | 
             
                  result = transformer.call(config)
         | 
| 236 208 |  | 
| 237 | 
            -
                  if result.is_a?(Hash) | 
| 238 | 
            -
                     | 
| 209 | 
            +
                  if result.is_a?(Hash)
         | 
| 210 | 
            +
                    result_allowlist = result[:node_allowlist] || result[:node_whitelist]
         | 
| 211 | 
            +
             | 
| 212 | 
            +
                    if result_allowlist.respond_to?(:each)
         | 
| 213 | 
            +
                      node_allowlist.merge(result_allowlist)
         | 
| 214 | 
            +
                    end
         | 
| 239 215 | 
             
                  end
         | 
| 240 216 | 
             
                end
         | 
| 241 217 |  | 
| @@ -56,6 +56,10 @@ class Sanitize | |
| 56 56 | 
             
                  # that all HTML will be stripped).
         | 
| 57 57 | 
             
                  :elements => [],
         | 
| 58 58 |  | 
| 59 | 
            +
                  # HTML parsing options to pass to Nokogumbo.
         | 
| 60 | 
            +
                  # https://github.com/rubys/nokogumbo/tree/v2.0.1#parsing-options
         | 
| 61 | 
            +
                  :parser_options => {},
         | 
| 62 | 
            +
             | 
| 59 63 | 
             
                  # URL handling protocols to allow in specific attributes. By default, no
         | 
| 60 64 | 
             
                  # protocols are allowed. Use :relative in place of a protocol if you want
         | 
| 61 65 | 
             
                  # to allow relative URLs sans protocol.
         | 
| @@ -66,10 +70,12 @@ class Sanitize | |
| 66 70 | 
             
                  # leaves the safe parts of an element's contents behind when the element
         | 
| 67 71 | 
             
                  # is removed.
         | 
| 68 72 | 
             
                  #
         | 
| 69 | 
            -
                  # If this is an Array of element names, then only the contents of | 
| 70 | 
            -
                  # specified elements (when filtered) will be removed, and the contents | 
| 71 | 
            -
                  # all other filtered elements will be left behind.
         | 
| 72 | 
            -
                  :remove_contents =>  | 
| 73 | 
            +
                  # If this is an Array or Set of element names, then only the contents of
         | 
| 74 | 
            +
                  # the specified elements (when filtered) will be removed, and the contents
         | 
| 75 | 
            +
                  # of all other filtered elements will be left behind.
         | 
| 76 | 
            +
                  :remove_contents => %w[
         | 
| 77 | 
            +
                    iframe math noembed noframes noscript plaintext script style svg xmp
         | 
| 78 | 
            +
                  ],
         | 
| 73 79 |  | 
| 74 80 | 
             
                  # Transformers allow you to filter or alter nodes using custom logic. See
         | 
| 75 81 | 
             
                  # README.md for details and examples.
         | 
    
        data/lib/sanitize/css.rb
    CHANGED
    
    | @@ -175,7 +175,7 @@ class Sanitize; class CSS | |
| 175 175 | 
             
                    next prop
         | 
| 176 176 |  | 
| 177 177 | 
             
                  when :semicolon
         | 
| 178 | 
            -
                    # Only preserve the semicolon if it was preceded by  | 
| 178 | 
            +
                    # Only preserve the semicolon if it was preceded by an allowlisted
         | 
| 179 179 | 
             
                    # property. Otherwise, omit it in order to prevent redundant semicolons.
         | 
| 180 180 | 
             
                    if preceded_by_property
         | 
| 181 181 | 
             
                      preceded_by_property = false
         | 
| @@ -296,7 +296,7 @@ class Sanitize; class CSS | |
| 296 296 | 
             
              end
         | 
| 297 297 |  | 
| 298 298 | 
             
              # Returns `true` if the given node (which may be of type `:url` or
         | 
| 299 | 
            -
              # `:function`, since the CSS syntax can produce both) uses  | 
| 299 | 
            +
              # `:function`, since the CSS syntax can produce both) uses an allowlisted
         | 
| 300 300 | 
             
              # protocol.
         | 
| 301 301 | 
             
              def valid_url?(node)
         | 
| 302 302 | 
             
                type = node[:node]
         | 
| @@ -1,6 +1,6 @@ | |
| 1 1 | 
             
            class Sanitize; module Transformers; module CSS
         | 
| 2 2 |  | 
| 3 | 
            -
            # Enforces a CSS  | 
| 3 | 
            +
            # Enforces a CSS allowlist on the contents of `style` attributes.
         | 
| 4 4 | 
             
            class CleanAttribute
         | 
| 5 5 | 
             
              def initialize(sanitizer_or_config)
         | 
| 6 6 | 
             
                if Sanitize::CSS === sanitizer_or_config
         | 
| @@ -14,7 +14,7 @@ class CleanAttribute | |
| 14 14 | 
             
                node = env[:node]
         | 
| 15 15 |  | 
| 16 16 | 
             
                return unless node.type == Nokogiri::XML::Node::ELEMENT_NODE &&
         | 
| 17 | 
            -
                    node.key?('style') && !env[: | 
| 17 | 
            +
                    node.key?('style') && !env[:is_allowlisted]
         | 
| 18 18 |  | 
| 19 19 | 
             
                attr = node.attribute('style')
         | 
| 20 20 | 
             
                css  = @scss.properties(attr.value)
         | 
| @@ -27,7 +27,7 @@ class CleanAttribute | |
| 27 27 | 
             
              end
         | 
| 28 28 | 
             
            end
         | 
| 29 29 |  | 
| 30 | 
            -
            # Enforces a CSS  | 
| 30 | 
            +
            # Enforces a CSS allowlist on the contents of `<style>` elements.
         | 
| 31 31 | 
             
            class CleanElement
         | 
| 32 32 | 
             
              def initialize(sanitizer_or_config)
         | 
| 33 33 | 
             
                if Sanitize::CSS === sanitizer_or_config
         | 
| @@ -67,7 +67,7 @@ class Sanitize; module Transformers; class CleanElement | |
| 67 67 | 
             
                  @whitespace_elements = config[:whitespace_elements]
         | 
| 68 68 | 
             
                end
         | 
| 69 69 |  | 
| 70 | 
            -
                if config[:remove_contents].is_a?( | 
| 70 | 
            +
                if config[:remove_contents].is_a?(Enumerable)
         | 
| 71 71 | 
             
                  @remove_element_contents.merge(config[:remove_contents].map(&:to_s))
         | 
| 72 72 | 
             
                else
         | 
| 73 73 | 
             
                  @remove_all_contents = !!config[:remove_contents]
         | 
| @@ -76,11 +76,11 @@ class Sanitize; module Transformers; class CleanElement | |
| 76 76 |  | 
| 77 77 | 
             
              def call(env)
         | 
| 78 78 | 
             
                node = env[:node]
         | 
| 79 | 
            -
                return if node.type != Nokogiri::XML::Node::ELEMENT_NODE || env[: | 
| 79 | 
            +
                return if node.type != Nokogiri::XML::Node::ELEMENT_NODE || env[:is_allowlisted]
         | 
| 80 80 |  | 
| 81 81 | 
             
                name = env[:node_name]
         | 
| 82 82 |  | 
| 83 | 
            -
                # Delete any element that isn't in the config  | 
| 83 | 
            +
                # Delete any element that isn't in the config allowlist, unless the node has
         | 
| 84 84 | 
             
                # already been deleted from the document.
         | 
| 85 85 | 
             
                #
         | 
| 86 86 | 
             
                # It's important that we not try to reparent the children of a node that has
         | 
| @@ -97,28 +97,30 @@ class Sanitize; module Transformers; class CleanElement | |
| 97 97 | 
             
                    end
         | 
| 98 98 | 
             
                  end
         | 
| 99 99 |  | 
| 100 | 
            -
                  unless  | 
| 101 | 
            -
                     | 
| 100 | 
            +
                  unless node.children.empty?
         | 
| 101 | 
            +
                    unless @remove_all_contents || @remove_element_contents.include?(name)
         | 
| 102 | 
            +
                      node.add_previous_sibling(node.children)
         | 
| 103 | 
            +
                    end
         | 
| 102 104 | 
             
                  end
         | 
| 103 105 |  | 
| 104 106 | 
             
                  node.unlink
         | 
| 105 107 | 
             
                  return
         | 
| 106 108 | 
             
                end
         | 
| 107 109 |  | 
| 108 | 
            -
                 | 
| 110 | 
            +
                attr_allowlist = @attributes[name] || @attributes[:all]
         | 
| 109 111 |  | 
| 110 | 
            -
                if  | 
| 111 | 
            -
                  # Delete all attributes from elements with no  | 
| 112 | 
            +
                if attr_allowlist.nil?
         | 
| 113 | 
            +
                  # Delete all attributes from elements with no allowlisted attributes.
         | 
| 112 114 | 
             
                  node.attribute_nodes.each {|attr| attr.unlink }
         | 
| 113 115 | 
             
                else
         | 
| 114 | 
            -
                  allow_data_attributes =  | 
| 116 | 
            +
                  allow_data_attributes = attr_allowlist.include?(:data)
         | 
| 115 117 |  | 
| 116 118 | 
             
                  # Delete any attribute that isn't allowed on this element.
         | 
| 117 119 | 
             
                  node.attribute_nodes.each do |attr|
         | 
| 118 120 | 
             
                    attr_name = attr.name.downcase
         | 
| 119 121 |  | 
| 120 | 
            -
                    unless  | 
| 121 | 
            -
                      # The attribute isn't  | 
| 122 | 
            +
                    unless attr_allowlist.include?(attr_name)
         | 
| 123 | 
            +
                      # The attribute isn't allowed.
         | 
| 122 124 |  | 
| 123 125 | 
             
                      if allow_data_attributes && attr_name.start_with?('data-')
         | 
| 124 126 | 
             
                        # Arbitrary data attributes are allowed. If this is a data
         | 
| @@ -132,7 +134,7 @@ class Sanitize; module Transformers; class CleanElement | |
| 132 134 | 
             
                      next
         | 
| 133 135 | 
             
                    end
         | 
| 134 136 |  | 
| 135 | 
            -
                    # The attribute is  | 
| 137 | 
            +
                    # The attribute is allowed.
         | 
| 136 138 |  | 
| 137 139 | 
             
                    # Remove any attributes that use unacceptable protocols.
         | 
| 138 140 | 
             
                    if @protocols.include?(name) && @protocols[name].include?(attr_name)
         | 
| @@ -160,12 +162,17 @@ class Sanitize; module Transformers; class CleanElement | |
| 160 162 | 
             
                    # libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
         | 
| 161 163 | 
             
                    # attempt to preserve server-side includes. This can result in XSS since
         | 
| 162 164 | 
             
                    # an unescaped double quote can allow an attacker to inject a
         | 
| 163 | 
            -
                    # non- | 
| 165 | 
            +
                    # non-allowlisted attribute.
         | 
| 164 166 | 
             
                    #
         | 
| 165 167 | 
             
                    # Sanitize works around this by implementing its own escaping for
         | 
| 166 168 | 
             
                    # affected attributes, some of which can exist on any element and some
         | 
| 167 169 | 
             
                    # of which can only exist on `<a>` elements.
         | 
| 168 170 | 
             
                    #
         | 
| 171 | 
            +
                    # This fix is technically no longer necessary with Nokogumbo >= 2.0
         | 
| 172 | 
            +
                    # since it no longer uses libxml2's serializer, but it's retained to
         | 
| 173 | 
            +
                    # avoid breaking use cases where people might be sanitizing individual
         | 
| 174 | 
            +
                    # Nokogiri nodes and then serializing them manually without Nokogumbo.
         | 
| 175 | 
            +
                    #
         | 
| 169 176 | 
             
                    # The relevant libxml2 code is here:
         | 
| 170 177 | 
             
                    # <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
         | 
| 171 178 | 
             
                    if UNSAFE_LIBXML_ATTRS_GLOBAL.include?(attr_name) ||
         | 
| @@ -180,6 +187,40 @@ class Sanitize; module Transformers; class CleanElement | |
| 180 187 | 
             
                if @add_attributes.include?(name)
         | 
| 181 188 | 
             
                  @add_attributes[name].each {|key, val| node[key] = val }
         | 
| 182 189 | 
             
                end
         | 
| 190 | 
            +
             | 
| 191 | 
            +
                # Element-specific special cases.
         | 
| 192 | 
            +
                case name
         | 
| 193 | 
            +
             | 
| 194 | 
            +
                # If this is an allowlisted iframe that has children, remove all its
         | 
| 195 | 
            +
                # children. The HTML standard says iframes shouldn't have content, but when
         | 
| 196 | 
            +
                # they do, this content is parsed as text and is serialized verbatim without
         | 
| 197 | 
            +
                # being escaped, which is unsafe because legacy browsers may still render it
         | 
| 198 | 
            +
                # and execute `<script>` content. So the safe and correct thing to do is to
         | 
| 199 | 
            +
                # always remove iframe content.
         | 
| 200 | 
            +
                when 'iframe'
         | 
| 201 | 
            +
                  if !node.children.empty?
         | 
| 202 | 
            +
                    node.children.each do |child|
         | 
| 203 | 
            +
                      child.unlink
         | 
| 204 | 
            +
                    end
         | 
| 205 | 
            +
                  end
         | 
| 206 | 
            +
             | 
| 207 | 
            +
                # Prevent the use of `<meta>` elements that set a charset other than UTF-8,
         | 
| 208 | 
            +
                # since Sanitize's output is always UTF-8.
         | 
| 209 | 
            +
                when 'meta'
         | 
| 210 | 
            +
                  if node.has_attribute?('charset') &&
         | 
| 211 | 
            +
                      node['charset'].downcase != 'utf-8'
         | 
| 212 | 
            +
             | 
| 213 | 
            +
                    node['charset'] = 'utf-8'
         | 
| 214 | 
            +
                  end
         | 
| 215 | 
            +
             | 
| 216 | 
            +
                  if node.has_attribute?('http-equiv') &&
         | 
| 217 | 
            +
                      node.has_attribute?('content') &&
         | 
| 218 | 
            +
                      node['http-equiv'].downcase == 'content-type' &&
         | 
| 219 | 
            +
                      node['content'].downcase =~ /;\s*charset\s*=\s*(?!utf-8)/
         | 
| 220 | 
            +
             | 
| 221 | 
            +
                    node['content'] = node['content'].gsub(/;\s*charset\s*=.+\z/, ';charset=utf-8')
         | 
| 222 | 
            +
                  end
         | 
| 223 | 
            +
                end
         | 
| 183 224 | 
             
              end
         | 
| 184 225 |  | 
| 185 226 | 
             
            end; end; end
         | 
    
        data/lib/sanitize/version.rb
    CHANGED
    
    
    
        data/test/common.rb
    CHANGED
    
    | @@ -1,34 +1,3 @@ | |
| 1 1 | 
             
            # encoding: utf-8
         | 
| 2 | 
            -
            gem 'minitest'
         | 
| 3 2 | 
             
            require 'minitest/autorun'
         | 
| 4 | 
            -
             | 
| 5 3 | 
             
            require_relative '../lib/sanitize'
         | 
| 6 | 
            -
             | 
| 7 | 
            -
            # Helper to stub an instance method. Shamelessly stolen from
         | 
| 8 | 
            -
            # https://github.com/codeodor/minitest-stub_any_instance/
         | 
| 9 | 
            -
            class Object
         | 
| 10 | 
            -
              def self.stub_instance(name, value, &block)
         | 
| 11 | 
            -
                old_method = "__stubbed_method_#{name}__"
         | 
| 12 | 
            -
             | 
| 13 | 
            -
                class_eval do
         | 
| 14 | 
            -
                  alias_method old_method, name
         | 
| 15 | 
            -
             | 
| 16 | 
            -
                  define_method(name) do |*args|
         | 
| 17 | 
            -
                    if value.respond_to?(:call) then
         | 
| 18 | 
            -
                      value.call(*args)
         | 
| 19 | 
            -
                    else
         | 
| 20 | 
            -
                      value
         | 
| 21 | 
            -
                    end
         | 
| 22 | 
            -
                  end
         | 
| 23 | 
            -
                end
         | 
| 24 | 
            -
             | 
| 25 | 
            -
                yield
         | 
| 26 | 
            -
             | 
| 27 | 
            -
              ensure
         | 
| 28 | 
            -
                class_eval do
         | 
| 29 | 
            -
                  undef_method name
         | 
| 30 | 
            -
                  alias_method name, old_method
         | 
| 31 | 
            -
                  undef_method old_method
         | 
| 32 | 
            -
                end
         | 
| 33 | 
            -
              end
         | 
| 34 | 
            -
            end
         | 
    
        data/test/test_clean_comment.rb
    CHANGED
    
    | @@ -20,7 +20,7 @@ describe 'Sanitize::Transformers::CleanComment' do | |
| 20 20 |  | 
| 21 21 | 
             
                  # Special case: the comment markup is inside a <script>, which makes it
         | 
| 22 22 | 
             
                  # text content and not an actual HTML comment.
         | 
| 23 | 
            -
                  @s.fragment("<script><!-- comment --></script>").must_equal ' | 
| 23 | 
            +
                  @s.fragment("<script><!-- comment --></script>").must_equal ''
         | 
| 24 24 |  | 
| 25 25 | 
             
                  Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => false, :elements => ['script'])
         | 
| 26 26 | 
             
                    .must_equal '<script><!-- comment --></script>'
         | 
| @@ -40,10 +40,6 @@ describe 'Sanitize::Transformers::CleanComment' do | |
| 40 40 | 
             
                  @s.fragment("foo <!-- <!-- <!-- --> --> -->bar").must_equal 'foo <!-- <!-- <!-- --> --> -->bar'
         | 
| 41 41 | 
             
                  @s.fragment("foo <div <!-- comment -->>bar</div>").must_equal 'foo <div>>bar</div>'
         | 
| 42 42 |  | 
| 43 | 
            -
                  # Special case: the comment markup is inside a <script>, which makes it
         | 
| 44 | 
            -
                  # text content and not an actual HTML comment.
         | 
| 45 | 
            -
                  @s.fragment("<script><!-- comment --></script>").must_equal '<!-- comment -->'
         | 
| 46 | 
            -
             | 
| 47 43 | 
             
                  Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => true, :elements => ['script'])
         | 
| 48 44 | 
             
                    .must_equal '<script><!-- comment --></script>'
         | 
| 49 45 | 
             
                end
         |