RubyGems - html-pipeline - Versions diffs - 2.14.3 → 3.0.0.pre1 - Mend

html-pipeline 2.14.3 → 3.0.0.pre1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (54) hide show

checksums.yaml +4 -4
data/.github/FUNDING.yml +11 -3
data/.github/dependabot.yml +20 -0
data/.github/workflows/automerge.yml +34 -0
data/.github/workflows/lint.yml +23 -0
data/.github/workflows/tag_and_release.yml +70 -0
data/.github/workflows/test.yml +33 -0
data/.rubocop.yml +17 -0
data/CHANGELOG.md +28 -2
data/Gemfile +29 -15
data/{LICENSE → LICENSE.txt} +2 -2
data/README.md +209 -218
data/Rakefile +14 -7
data/UPGRADING.md +35 -0
data/html-pipeline.gemspec +31 -21
data/lib/html-pipeline.rb +3 -0
data/lib/html_pipeline/convert_filter/markdown_filter.rb +26 -0
data/lib/html_pipeline/convert_filter.rb +17 -0
data/lib/html_pipeline/filter.rb +89 -0
data/lib/{html/pipeline → html_pipeline/node_filter}/absolute_source_filter.rb +23 -21
data/lib/{html/pipeline → html_pipeline/node_filter}/emoji_filter.rb +58 -54
data/lib/html_pipeline/node_filter/https_filter.rb +22 -0
data/lib/html_pipeline/node_filter/image_max_width_filter.rb +40 -0
data/lib/{html/pipeline/@mention_filter.rb → html_pipeline/node_filter/mention_filter.rb} +55 -69
data/lib/html_pipeline/node_filter/table_of_contents_filter.rb +68 -0
data/lib/html_pipeline/node_filter/team_mention_filter.rb +105 -0
data/lib/html_pipeline/node_filter.rb +31 -0
data/lib/html_pipeline/sanitization_filter.rb +65 -0
data/lib/{html/pipeline → html_pipeline/text_filter}/image_filter.rb +3 -3
data/lib/{html/pipeline → html_pipeline/text_filter}/plain_text_input_filter.rb +3 -5
data/lib/html_pipeline/text_filter.rb +21 -0
data/lib/html_pipeline/version.rb +5 -0
data/lib/html_pipeline.rb +252 -0
metadata +52 -54
data/.travis.yml +0 -43
data/Appraisals +0 -19
data/CONTRIBUTING.md +0 -60
data/bin/html-pipeline +0 -78
data/lib/html/pipeline/@team_mention_filter.rb +0 -99
data/lib/html/pipeline/autolink_filter.rb +0 -34
data/lib/html/pipeline/body_content.rb +0 -44
data/lib/html/pipeline/camo_filter.rb +0 -105
data/lib/html/pipeline/email_reply_filter.rb +0 -69
data/lib/html/pipeline/filter.rb +0 -165
data/lib/html/pipeline/https_filter.rb +0 -29
data/lib/html/pipeline/image_max_width_filter.rb +0 -37
data/lib/html/pipeline/markdown_filter.rb +0 -56
data/lib/html/pipeline/sanitization_filter.rb +0 -144
data/lib/html/pipeline/syntax_highlight_filter.rb +0 -50
data/lib/html/pipeline/text_filter.rb +0 -16
data/lib/html/pipeline/textile_filter.rb +0 -25
data/lib/html/pipeline/toc_filter.rb +0 -69
data/lib/html/pipeline/version.rb +0 -7
data/lib/html/pipeline.rb +0 -210

data/lib/html/pipeline/sanitization_filter.rb DELETED Viewed

@@ -1,144 +0,0 @@
-# frozen_string_literal: true
-HTML::Pipeline.require_dependency('sanitize', 'SanitizationFilter')
-module HTML
-  class Pipeline
-    # HTML filter with sanization routines and allowlists. This module defines
-    # what HTML is allowed in user provided content and fixes up issues with
-    # unbalanced tags and whatnot.
-    #
-    # See the Sanitize docs for more information on the underlying library:
-    #
-    # https://github.com/rgrove/sanitize/#readme
-    #
-    # Context options:
-    #   :allowlist      - The sanitizer allowlist configuration to use. This
-    #                     can be one of the options constants defined in this
-    #                     class or a custom sanitize options hash.
-    #   :anchor_schemes - The URL schemes to allow in <a href> attributes. The
-    #                     default set is provided in the ANCHOR_SCHEMES
-    #                     constant in this class. If passed, this overrides any
-    #                     schemes specified in the allowlist configuration.
-    #
-    # This filter does not write additional information to the context.
-    class SanitizationFilter < Filter
-      LISTS     = Set.new(%w[ul ol].freeze)
-      LIST_ITEM = 'li'.freeze
-      # List of table child elements. These must be contained by a <table> element
-      # or they are not allowed through. Otherwise they can be used to break out
-      # of places we're using tables to contain formatted user content (like pull
-      # request review comments).
-      TABLE_ITEMS = Set.new(%w[tr td th].freeze)
-      TABLE = 'table'.freeze
-      TABLE_SECTIONS = Set.new(%w[thead tbody tfoot].freeze)
-      # These schemes are the only ones allowed in <a href> attributes by default.
-      ANCHOR_SCHEMES = ['http', 'https', 'mailto', 'xmpp', :relative, 'github-windows', 'github-mac', 'irc', 'ircs'].freeze
-      # The main sanitization allowlist. Only these elements and attributes are
-      # allowed through by default.
-      ALLOWLIST = {
-        elements: %w[
-          h1 h2 h3 h4 h5 h6 h7 h8 br b i strong em a pre code img tt
-          div ins del sup sub p ol ul table thead tbody tfoot blockquote
-          dl dt dd kbd q samp var hr ruby rt rp li tr td th s strike summary
-          details caption figure figcaption
-          abbr bdo cite dfn mark small span time wbr
-        ].freeze,
-        remove_contents: ['script'].freeze,
-        attributes: {
-          'a'          => ['href'].freeze,
-          'img'        => %w[src longdesc].freeze,
-          'div'        => %w[itemscope itemtype].freeze,
-          'blockquote' => ['cite'].freeze,
-          'del'        => ['cite'].freeze,
-          'ins'        => ['cite'].freeze,
-          'q'          => ['cite'].freeze,
-          all: %w[abbr accept accept-charset
-                  accesskey action align alt
-                  aria-describedby aria-hidden aria-label aria-labelledby
-                  axis border cellpadding cellspacing char
-                  charoff charset checked
-                  clear cols colspan color
-                  compact coords datetime dir
-                  disabled enctype for frame
-                  headers height hreflang
-                  hspace ismap label lang
-                  maxlength media method
-                  multiple name nohref noshade
-                  nowrap open progress prompt readonly rel rev
-                  role rows rowspan rules scope
-                  selected shape size span
-                  start summary tabindex target
-                  title type usemap valign value
-                  vspace width itemprop].freeze
-        }.freeze,
-        protocols: {
-          'a'          => { 'href' => ANCHOR_SCHEMES }.freeze,
-          'blockquote' => { 'cite' => ['http', 'https', :relative].freeze },
-          'del'        => { 'cite' => ['http', 'https', :relative].freeze },
-          'ins'        => { 'cite' => ['http', 'https', :relative].freeze },
-          'q'          => { 'cite' => ['http', 'https', :relative].freeze },
-          'img'        => {
-            'src'      => ['http', 'https', :relative].freeze,
-            'longdesc' => ['http', 'https', :relative].freeze
-          }.freeze
-        },
-        transformers: [
-          # Top-level <li> elements are removed because they can break out of
-          # containing markup.
-          lambda { |env|
-            name = env[:node_name]
-            node = env[:node]
-            if name == LIST_ITEM && node.ancestors.none? { |n| LISTS.include?(n.name) }
-              node.replace(node.children)
-            end
-          },
-          # Table child elements that are not contained by a <table> are removed.
-          lambda { |env|
-            name = env[:node_name]
-            node = env[:node]
-            if (TABLE_SECTIONS.include?(name) || TABLE_ITEMS.include?(name)) && node.ancestors.none? { |n| n.name == TABLE }
-              node.replace(node.children)
-            end
-          }
-        ].freeze
-      }.freeze
-      # A more limited sanitization allowlist. This includes all attributes,
-      # protocols, and transformers from ALLOWLIST but with a more locked down
-      # set of allowed elements.
-      LIMITED = ALLOWLIST.merge(
-        elements: %w[b i strong em a pre code img ins del sup sub mark abbr p ol ul li]
-      )
-      # Strip all HTML tags from the document.
-      FULL = { elements: [] }.freeze
-      # Sanitize markup using the Sanitize library.
-      def call
-        Sanitize.clean_node!(doc, allowlist)
-      end
-      def whitelist
-        warn "[DEPRECATION] 'whitelist' is deprecated. Please use 'allowlist' instead."
-        allowlist
-      end
-      # The allowlist to use when sanitizing. This can be passed in the context
-      # hash to the filter but defaults to ALLOWLIST constant value above.
-      def allowlist
-        allowlist = context[:allowlist] || context[:whitelist] || ALLOWLIST
-        anchor_schemes = context[:anchor_schemes]
-        return allowlist unless anchor_schemes
-        allowlist = allowlist.dup
-        allowlist[:protocols] = (allowlist[:protocols] || {}).dup
-        allowlist[:protocols]['a'] = (allowlist[:protocols]['a'] || {}).merge('href' => anchor_schemes)
-        allowlist
-      end
-    end
-  end
-end

data/lib/html/pipeline/syntax_highlight_filter.rb DELETED Viewed

@@ -1,50 +0,0 @@
-# frozen_string_literal: true
-HTML::Pipeline.require_dependency('rouge', 'SyntaxHighlightFilter')
-module HTML
-  class Pipeline
-    # HTML Filter that syntax highlights text inside code blocks.
-    #
-    # Context options:
-    #
-    #   :highlight => String represents the language to pick lexer. Defaults to empty string.
-    #   :scope => String represents the class attribute adds to pre element after.
-    #             Defaults to "highlight highlight-css" if highlights a css code block.
-    #
-    # This filter does not write any additional information to the context hash.
-    class SyntaxHighlightFilter < Filter
-      def initialize(*args)
-        super(*args)
-        @formatter = Rouge::Formatters::HTML.new
-      end
-      def call
-        doc.search('pre').each do |node|
-          default = context[:highlight] && context[:highlight].to_s
-          next unless lang = node['lang'] || default
-          next unless lexer = lexer_for(lang)
-          text = node.inner_text
-          html = highlight_with_timeout_handling(text, lexer)
-          next if html.nil?
-          node.inner_html = html
-          scope = context.fetch(:scope) { 'highlight' }
-          node['class'] = "#{scope} #{scope}-#{lang}"
-        end
-        doc
-      end
-      def highlight_with_timeout_handling(text, lexer)
-        Rouge.highlight(text, lexer, @formatter)
-      rescue Timeout::Error => _
-        nil
-      end
-      def lexer_for(lang)
-        Rouge::Lexer.find(lang)
-      end
-    end
-  end
-end

data/lib/html/pipeline/text_filter.rb DELETED Viewed

@@ -1,16 +0,0 @@
-# frozen_string_literal: true
-module HTML
-  class Pipeline
-    class TextFilter < Filter
-      attr_reader :text
-      def initialize(text, context = nil, result = nil)
-        raise TypeError, 'text cannot be HTML' if text.is_a?(DocumentFragment)
-        # Ensure that this is always a string
-        @text = text.respond_to?(:to_str) ? text.to_str : text.to_s
-        super nil, context, result
-      end
-    end
-  end
-end

data/lib/html/pipeline/textile_filter.rb DELETED Viewed

@@ -1,25 +0,0 @@
-# frozen_string_literal: true
-HTML::Pipeline.require_dependency('redcloth', 'RedCloth')
-module HTML
-  class Pipeline
-    # HTML Filter that converts Textile text into HTML and converts into a
-    # DocumentFragment. This is different from most filters in that it can take a
-    # non-HTML as input. It must be used as the first filter in a pipeline.
-    #
-    # Context options:
-    #   :autolink => false    Disable autolinking URLs
-    #
-    # This filter does not write any additional information to the context hash.
-    #
-    # NOTE This filter is provided for really old comments only. It probably
-    # shouldn't be used for anything new.
-    class TextileFilter < TextFilter
-      # Convert Textile to HTML and convert into a DocumentFragment.
-      def call
-        RedCloth.new(@text).to_html
-      end
-    end
-  end
-end

data/lib/html/pipeline/toc_filter.rb DELETED Viewed

@@ -1,69 +0,0 @@
-# frozen_string_literal: true
-HTML::Pipeline.require_dependency('escape_utils', 'TableOfContentsFilter')
-module HTML
-  class Pipeline
-    # HTML filter that adds an 'id' attribute to all headers
-    # in a document, so they can be accessed from a table of contents.
-    #
-    # Generates the Table of Contents, with links to each header.
-    #
-    # Examples
-    #
-    #  TocPipeline =
-    #    HTML::Pipeline.new [
-    #      HTML::Pipeline::TableOfContentsFilter
-    #    ]
-    #  # => #<HTML::Pipeline:0x007fc13c4528d8...>
-    #  orig = %(<h1>Ice cube</h1><p>is not for the pop chart</p>)
-    #  # => "<h1>Ice cube</h1><p>is not for the pop chart</p>"
-    #  result = {}
-    #  # => {}
-    #  TocPipeline.call(orig, {}, result)
-    #  # => {:toc=> ...}
-    #  result[:toc]
-    #  # => "<ul class=\"section-nav\">\n<li><a href=\"#ice-cube\">...</li><ul>"
-    #  result[:output].to_s
-    #  # => "<h1>\n<a id=\"ice-cube\" class=\"anchor\" href=\"#ice-cube\">..."
-    class TableOfContentsFilter < Filter
-      PUNCTUATION_REGEXP = RUBY_VERSION > '1.9' ? /[^\p{Word}\- ]/u : /[^\w\- ]/
-      # The icon that will be placed next to an anchored rendered markdown header
-      def anchor_icon
-        context[:anchor_icon] || '<span aria-hidden="true" class="octicon octicon-link"></span>'
-      end
-      def call
-        result[:toc] = String.new('')
-        headers = Hash.new(0)
-        doc.css('h1, h2, h3, h4, h5, h6').each do |node|
-          text = node.text
-          id = ascii_downcase(text)
-          id.gsub!(PUNCTUATION_REGEXP, '') # remove punctuation
-          id.tr!(' ', '-') # replace spaces with dash
-          uniq = headers[id] > 0 ? "-#{headers[id]}" : ''
-          headers[id] += 1
-          if header_content = node.children.first
-            result[:toc] << %(<li><a href="##{id}#{uniq}">#{CGI.escape_html(text)}</a></li>\n)
-            header_content.add_previous_sibling(%(<a id="#{id}#{uniq}" class="anchor" href="##{id}#{uniq}" aria-hidden="true">#{anchor_icon}</a>))
-          end
-        end
-        result[:toc] = %(<ul class="section-nav">\n#{result[:toc]}</ul>) unless result[:toc].empty?
-        doc
-      end
-      if RUBY_VERSION >= '2.4'
-        def ascii_downcase(str)
-          str.downcase(:ascii)
-        end
-      else
-        def ascii_downcase(str)
-          str.downcase
-        end
-      end
-    end
-  end
-end

data/lib/html/pipeline/version.rb DELETED Viewed

@@ -1,7 +0,0 @@
-# frozen_string_literal: true
-module HTML
-  class Pipeline
-    VERSION = '2.14.3'
-  end
-end

data/lib/html/pipeline.rb DELETED Viewed

@@ -1,210 +0,0 @@
-# frozen_string_literal: true
-require 'nokogiri'
-require 'active_support/xml_mini/nokogiri' # convert Documents to hashes
-module HTML
-  # GitHub HTML processing filters and utilities. This module includes a small
-  # framework for defining DOM based content filters and applying them to user
-  # provided content.
-  #
-  # See HTML::Pipeline::Filter for information on building filters.
-  #
-  # Construct a Pipeline for running multiple HTML filters.  A pipeline is created once
-  # with one to many filters, and it then can be `call`ed many times over the course
-  # of its lifetime with input.
-  #
-  # filters         - Array of Filter objects. Each must respond to call(doc,
-  #                   context) and return the modified DocumentFragment or a
-  #                   String containing HTML markup. Filters are performed in the
-  #                   order provided.
-  # default_context - The default context hash. Values specified here will be merged
-  #                   into values from the each individual pipeline run.  Can NOT be
-  #                   nil.  Default: empty Hash.
-  # result_class    - The default Class of the result object for individual
-  #                   calls.  Default: Hash.  Protip:  Pass in a Struct to get
-  #                   some semblance of type safety.
-  class Pipeline
-    autoload :VERSION,               'html/pipeline/version'
-    autoload :Filter,                'html/pipeline/filter'
-    autoload :AbsoluteSourceFilter,  'html/pipeline/absolute_source_filter'
-    autoload :BodyContent,           'html/pipeline/body_content'
-    autoload :AutolinkFilter,        'html/pipeline/autolink_filter'
-    autoload :CamoFilter,            'html/pipeline/camo_filter'
-    autoload :EmailReplyFilter,      'html/pipeline/email_reply_filter'
-    autoload :EmojiFilter,           'html/pipeline/emoji_filter'
-    autoload :HttpsFilter,           'html/pipeline/https_filter'
-    autoload :ImageFilter,           'html/pipeline/image_filter'
-    autoload :ImageMaxWidthFilter,   'html/pipeline/image_max_width_filter'
-    autoload :MarkdownFilter,        'html/pipeline/markdown_filter'
-    autoload :MentionFilter,         'html/pipeline/@mention_filter'
-    autoload :TeamMentionFilter,     'html/pipeline/@team_mention_filter'
-    autoload :PlainTextInputFilter,  'html/pipeline/plain_text_input_filter'
-    autoload :SanitizationFilter,    'html/pipeline/sanitization_filter'
-    autoload :SyntaxHighlightFilter, 'html/pipeline/syntax_highlight_filter'
-    autoload :TextileFilter,         'html/pipeline/textile_filter'
-    autoload :TableOfContentsFilter, 'html/pipeline/toc_filter'
-    autoload :TextFilter,            'html/pipeline/text_filter'
-    class MissingDependencyError < RuntimeError; end
-    def self.require_dependency(name, requirer)
-      require name
-    rescue LoadError => e
-      raise MissingDependencyError,
-            "Missing dependency '#{name}' for #{requirer}. See README.md for details.\n#{e.class.name}: #{e}"
-    end
-    # Our DOM implementation.
-    DocumentFragment = Nokogiri::HTML::DocumentFragment
-    # Parse a String into a DocumentFragment object. When a DocumentFragment is
-    # provided, return it verbatim.
-    def self.parse(document_or_html)
-      document_or_html ||= ''
-      if document_or_html.is_a?(String)
-        DocumentFragment.parse(document_or_html)
-      else
-        document_or_html
-      end
-    end
-    # Public: Returns an Array of Filter objects for this Pipeline.
-    attr_reader :filters
-    # Public: Instrumentation service for the pipeline.
-    # Set an ActiveSupport::Notifications compatible object to enable.
-    attr_accessor :instrumentation_service
-    # Public: String name for this Pipeline. Defaults to Class name.
-    attr_writer :instrumentation_name
-    def instrumentation_name
-      return @instrumentation_name if defined?(@instrumentation_name)
-      @instrumentation_name = self.class.name
-    end
-    class << self
-      # Public: Default instrumentation service for new pipeline objects.
-      attr_accessor :default_instrumentation_service
-    end
-    def initialize(filters, default_context = {}, result_class = nil)
-      raise ArgumentError, 'default_context cannot be nil' if default_context.nil?
-      @filters = filters.flatten.freeze
-      @default_context = default_context.freeze
-      @result_class = result_class || Hash
-      @instrumentation_service = self.class.default_instrumentation_service
-    end
-    # Apply all filters in the pipeline to the given HTML.
-    #
-    # html    - A String containing HTML or a DocumentFragment object.
-    # context - The context hash passed to each filter. See the Filter docs
-    #           for more info on possible values. This object MUST NOT be modified
-    #           in place by filters.  Use the Result for passing state back.
-    # result  - The result Hash passed to each filter for modification.  This
-    #           is where Filters store extracted information from the content.
-    #
-    # Returns the result Hash after being filtered by this Pipeline.  Contains an
-    # :output key with the DocumentFragment or String HTML markup based on the
-    # output of the last filter in the pipeline.
-    def call(html, context = {}, result = nil)
-      context = @default_context.merge(context)
-      context = context.freeze
-      result ||= @result_class.new
-      payload = default_payload filters: @filters.map(&:name),
-                                context: context, result: result
-      instrument 'call_pipeline.html_pipeline', payload do
-        result[:output] =
-          @filters.inject(html) do |doc, filter|
-            perform_filter(filter, doc, context, result)
-          end
-      end
-      result
-    end
-    # Internal: Applies a specific filter to the supplied doc.
-    #
-    # The filter is instrumented.
-    #
-    # Returns the result of the filter.
-    def perform_filter(filter, doc, context, result)
-      payload = default_payload filter: filter.name,
-                                context: context, result: result
-      instrument 'call_filter.html_pipeline', payload do
-        filter.call(doc, context, result)
-      end
-    end
-    # Like call but guarantee the value returned is a DocumentFragment.
-    # Pipelines may return a DocumentFragment or a String. Callers that need a
-    # DocumentFragment should use this method.
-    def to_document(input, context = {}, result = nil)
-      result = call(input, context, result)
-      HTML::Pipeline.parse(result[:output])
-    end
-    # Like call but guarantee the value returned is a string of HTML markup.
-    def to_html(input, context = {}, result = nil)
-      result = call(input, context, result = nil)
-      output = result[:output]
-      if output.respond_to?(:to_html)
-        output.to_html
-      else
-        output.to_s
-      end
-    end
-    # Public: setup instrumentation for this pipeline.
-    #
-    # Returns nothing.
-    def setup_instrumentation(name = nil, service = nil)
-      self.instrumentation_name = name
-      self.instrumentation_service =
-        service || self.class.default_instrumentation_service
-    end
-    # Internal: if the `instrumentation_service` object is set, instruments the
-    # block, otherwise the block is ran without instrumentation.
-    #
-    # Returns the result of the provided block.
-    def instrument(event, payload = nil)
-      payload ||= default_payload
-      return yield(payload) unless instrumentation_service
-      instrumentation_service.instrument event, payload do |payload|
-        yield payload
-      end
-    end
-    # Internal: Default payload for instrumentation.
-    #
-    # Accepts a Hash of additional payload data to be merged.
-    #
-    # Returns a Hash.
-    def default_payload(payload = {})
-      { pipeline: instrumentation_name }.merge(payload)
-    end
-  end
-end
-# XXX nokogiri monkey patches for 1.8
-unless ''.respond_to?(:force_encoding)
-  class Nokogiri::XML::Node
-    # Work around an issue with utf-8 encoded data being erroneously converted to
-    # ... some other shit when replacing text nodes. See 'utf-8 output 2' in
-    # user_content_test.rb for details.
-    def replace_with_encoding_fix(replacement)
-      if replacement.respond_to?(:to_str)
-        replacement = document.fragment("<div>#{replacement}</div>").children.first.children
-      end
-      replace_without_encoding_fix(replacement)
-    end
-    alias replace_without_encoding_fix replace
-    alias replace replace_with_encoding_fix
-    def swap(replacement)
-      replace(replacement)
-      self
-    end
-  end
-end