RubyGems - html-pipeline-plus - Versions diffs - 2.10.1 - Mend

html-pipeline-plus 2.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

checksums.yaml +7 -0
data/.gitignore +20 -0
data/.travis.yml +34 -0
data/Appraisals +13 -0
data/CHANGELOG.md +221 -0
data/CONTRIBUTING.md +60 -0
data/Gemfile +23 -0
data/LICENSE +22 -0
data/README.md +370 -0
data/Rakefile +15 -0
data/bin/html-pipeline-plus +78 -0
data/html-pipeline-plus.gemspec +28 -0
data/lib/html/pipeline-plus/@mention_filter.rb +138 -0
data/lib/html/pipeline-plus/absolute_source_filter.rb +45 -0
data/lib/html/pipeline-plus/autolink_filter.rb +27 -0
data/lib/html/pipeline-plus/body_content.rb +42 -0
data/lib/html/pipeline-plus/camo_filter.rb +93 -0
data/lib/html/pipeline-plus/email_reply_filter.rb +66 -0
data/lib/html/pipeline-plus/emoji_filter.rb +125 -0
data/lib/html/pipeline-plus/filter.rb +163 -0
data/lib/html/pipeline-plus/https_filter.rb +27 -0
data/lib/html/pipeline-plus/image_filter.rb +17 -0
data/lib/html/pipeline-plus/image_max_width_filter.rb +35 -0
data/lib/html/pipeline-plus/markdown_filter.rb +37 -0
data/lib/html/pipeline-plus/plain_text_input_filter.rb +13 -0
data/lib/html/pipeline-plus/sanitization_filter.rb +137 -0
data/lib/html/pipeline-plus/syntax_highlight_filter.rb +44 -0
data/lib/html/pipeline-plus/text_filter.rb +14 -0
data/lib/html/pipeline-plus/textile_filter.rb +23 -0
data/lib/html/pipeline-plus/toc_filter.rb +67 -0
data/lib/html/pipeline-plus/version.rb +5 -0
data/lib/html/pipeline-plus.rb +207 -0
data/test.txt +13 -0
metadata +115 -0

data/lib/html/pipeline-plus/autolink_filter.rb ADDED Viewed

@@ -0,0 +1,27 @@
+HTML::Pipeline.require_dependency('rinku', 'AutolinkFilter')
+module HTML
+  class Pipeline
+    # HTML Filter for auto_linking urls in HTML.
+    #
+    # Context options:
+    #   :autolink  - boolean whether to autolink urls
+    #   :link_attr - HTML attributes for the link that will be generated
+    #   :skip_tags - HTML tags inside which autolinking will be skipped.
+    #                See Rinku.skip_tags
+    #   :flags     - additional Rinku flags. See https://github.com/vmg/rinku
+    #
+    # This filter does not write additional information to the context.
+    class AutolinkFilter < Filter
+      def call
+        return html if context[:autolink] == false
+        skip_tags = context[:skip_tags]
+        flags = 0
+        flags |= context[:flags] if context[:flags]
+        Rinku.auto_link(html, :urls, context[:link_attr], skip_tags, flags)
+      end
+    end
+  end
+end

data/lib/html/pipeline-plus/body_content.rb ADDED Viewed

@@ -0,0 +1,42 @@
+module HTML
+  class Pipeline
+    # Public: Runs a String of content through an HTML processing pipeline,
+    # providing easy access to a generated DocumentFragment.
+    class BodyContent
+      attr_reader :result
+      # Public: Initialize a BodyContent.
+      #
+      # body     - A String body.
+      # context  - A Hash of context options for the filters.
+      # pipeline - A HTML::Pipeline object with one or more Filters.
+      def initialize(body, context, pipeline)
+        @body = body
+        @context = context
+        @pipeline = pipeline
+      end
+      # Public: Gets the memoized result of the body content as it passed through
+      # the Pipeline.
+      #
+      # Returns a Hash, or something similar as defined by @pipeline.result_class.
+      def result
+        @result ||= @pipeline.call @body, @context
+      end
+      # Public: Gets the updated body from the Pipeline result.
+      #
+      # Returns a String or DocumentFragment.
+      def output
+        @output ||= result[:output]
+      end
+      # Public: Parses the output into a DocumentFragment.
+      #
+      # Returns a DocumentFragment.
+      def document
+        @document ||= HTML::Pipeline.parse output
+      end
+    end
+  end
+end

data/lib/html/pipeline-plus/camo_filter.rb ADDED Viewed

@@ -0,0 +1,93 @@
+require 'openssl'
+require 'uri'
+module HTML
+  class Pipeline
+    # HTML Filter for replacing http image URLs with camo versions. See:
+    #
+    # https://github.com/atmos/camo
+    #
+    # All images provided in user content should be run through this
+    # filter so that http image sources do not cause mixed-content warnings
+    # in browser clients.
+    #
+    # Context options:
+    #   :asset_proxy (required) - Base URL for constructed asset proxy URLs.
+    #   :asset_proxy_secret_key (required) - The shared secret used to encode URLs.
+    #   :asset_proxy_whitelist - Array of host Strings or Regexps to skip
+    #                            src rewriting.
+    #
+    # This filter does not write additional information to the context.
+    class CamoFilter < Filter
+      # Hijacks images in the markup provided, replacing them with URLs that
+      # go through the github asset proxy.
+      def call
+        return doc unless asset_proxy_enabled?
+        doc.search('img').each do |element|
+          original_src = element['src']
+          next unless original_src
+          begin
+            uri = URI.parse(original_src)
+          rescue Exception
+            next
+          end
+          next if uri.host.nil?
+          next if asset_host_whitelisted?(uri.host)
+          element['src'] = asset_proxy_url(original_src)
+          element['data-canonical-src'] = original_src
+        end
+        doc
+      end
+      # Implementation of validate hook.
+      # Errors should raise exceptions or use an existing validator.
+      def validate
+        needs :asset_proxy, :asset_proxy_secret_key
+      end
+      # The camouflaged URL for a given image URL.
+      def asset_proxy_url(url)
+        "#{asset_proxy_host}/#{asset_url_hash(url)}/#{hexencode(url)}"
+      end
+      # Private: calculate the HMAC digest for a image source URL.
+      def asset_url_hash(url)
+        OpenSSL::HMAC.hexdigest('sha1', asset_proxy_secret_key, url)
+      end
+      # Private: Return true if asset proxy filter should be enabled
+      def asset_proxy_enabled?
+        !context[:disable_asset_proxy]
+      end
+      # Private: the host to use for generated asset proxied URLs.
+      def asset_proxy_host
+        context[:asset_proxy]
+      end
+      def asset_proxy_secret_key
+        context[:asset_proxy_secret_key]
+      end
+      def asset_proxy_whitelist
+        context[:asset_proxy_whitelist] || []
+      end
+      def asset_host_whitelisted?(host)
+        asset_proxy_whitelist.any? do |test|
+          test.is_a?(String) ? host == test : test.match(host)
+        end
+      end
+      # Private: helper to hexencode a string. Each byte ends up encoded into
+      # two characters, zero padded value in the range [0-9a-f].
+      def hexencode(str)
+        str.to_enum(:each_byte).map { |byte| format('%02x', byte) }.join
+      end
+    end
+  end
+end

data/lib/html/pipeline-plus/email_reply_filter.rb ADDED Viewed

@@ -0,0 +1,66 @@
+HTML::Pipeline.require_dependency('escape_utils', 'EmailReplyFilter')
+HTML::Pipeline.require_dependency('email_reply_parser', 'EmailReplyFilter')
+module HTML
+  class Pipeline
+    # HTML Filter that converts email reply text into an HTML DocumentFragment.
+    # It must be used as the first filter in a pipeline.
+    #
+    # Context options:
+    #   None
+    #
+    # This filter does not write any additional information to the context hash.
+    class EmailReplyFilter < TextFilter
+      include EscapeUtils
+      EMAIL_HIDDEN_HEADER    = %(<span class="email-hidden-toggle"><a href="#">&hellip;</a></span><div class="email-hidden-reply" style="display:none">).freeze
+      EMAIL_QUOTED_HEADER    = %(<div class="email-quoted-reply">).freeze
+      EMAIL_SIGNATURE_HEADER = %(<div class="email-signature-reply">).freeze
+      EMAIL_FRAGMENT_HEADER  = %(<div class="email-fragment">).freeze
+      EMAIL_HEADER_END       = '</div>'.freeze
+      EMAIL_REGEX            = /[^@\s.][^@\s]*@\[?[a-z0-9.-]+\]?/
+      HIDDEN_EMAIL_PATTERN   = '***@***.***'.freeze
+      # Scans an email body to determine which bits are quoted and which should
+      # be hidden. EmailReplyParser is used to split the comment into an Array
+      # of quoted or unquoted Blocks. Now, we loop through them and attempt to
+      # add <div> tags around them so we can hide the hidden blocks, and style
+      # the quoted blocks differently. Since multiple blocks may be hidden, be
+      # sure to keep the "email-hidden-reply" <div>s around "email-quoted-reply"
+      # <div> tags. Call this on each comment of a visible thread in the order
+      # that they are displayed. Note: all comments are processed so we can
+      # maintain a Set of SHAs of paragraphs. Only plaintext comments skip the
+      # markdown step.
+      #
+      # Returns the email comment HTML as a String
+      def call
+        found_hidden = nil
+        paragraphs = EmailReplyParser.read(text.dup).fragments.map do |fragment|
+          pieces = [escape_html(fragment.to_s.strip).gsub(/^\s*(>|&gt;)/, '')]
+          if fragment.quoted?
+            if context[:hide_quoted_email_addresses]
+              pieces.map! do |piece|
+                piece.gsub(EMAIL_REGEX, HIDDEN_EMAIL_PATTERN)
+              end
+            end
+            pieces.unshift EMAIL_QUOTED_HEADER
+            pieces << EMAIL_HEADER_END
+          elsif fragment.signature?
+            pieces.unshift EMAIL_SIGNATURE_HEADER
+            pieces << EMAIL_HEADER_END
+          else
+            pieces.unshift EMAIL_FRAGMENT_HEADER
+            pieces << EMAIL_HEADER_END
+          end
+          if fragment.hidden? && !found_hidden
+            found_hidden = true
+            pieces.unshift EMAIL_HIDDEN_HEADER
+          end
+          pieces.join
+        end
+        paragraphs << EMAIL_HEADER_END if found_hidden
+        paragraphs.join("\n")
+      end
+    end
+  end
+end

data/lib/html/pipeline-plus/emoji_filter.rb ADDED Viewed

@@ -0,0 +1,125 @@
+require 'cgi'
+HTML::Pipeline.require_dependency('gemoji', 'EmojiFilter')
+module HTML
+  class Pipeline
+    # HTML filter that replaces :emoji: with images.
+    #
+    # Context:
+    #   :asset_root (required) - base url to link to emoji sprite
+    #   :asset_path (optional) - url path to link to emoji sprite. :file_name can be used as a placeholder for the sprite file name. If no asset_path is set "emoji/:file_name" is used.
+    #   :ignored_ancestor_tags (optional) - Tags to stop the emojification. Node has matched ancestor HTML tags will not be emojified. Default to pre, code, and tt tags. Extra tags please pass in the form of array, e.g., %w(blockquote summary).
+    #   :img_attrs (optional) - Attributes for generated img tag. E.g. Pass { "draggble" => true, "height" => nil } to set draggable attribute to "true" and clear height attribute of generated img tag.
+    class EmojiFilter < Filter
+      DEFAULT_IGNORED_ANCESTOR_TAGS = %w[pre code tt].freeze
+      def call
+        doc.search('.//text()').each do |node|
+          content = node.text
+          next unless content.include?(':')
+          next if has_ancestor?(node, ignored_ancestor_tags)
+          html = emoji_image_filter(content)
+          next if html == content
+          node.replace(html)
+        end
+        doc
+      end
+      # Implementation of validate hook.
+      # Errors should raise exceptions or use an existing validator.
+      def validate
+        needs :asset_root
+      end
+      # Replace :emoji: with corresponding images.
+      #
+      # text - String text to replace :emoji: in.
+      #
+      # Returns a String with :emoji: replaced with images.
+      def emoji_image_filter(text)
+        text.gsub(emoji_pattern) do |_match|
+          emoji_image_tag(Regexp.last_match(1))
+        end
+      end
+      # The base url to link emoji sprites
+      #
+      # Raises ArgumentError if context option has not been provided.
+      # Returns the context's asset_root.
+      def asset_root
+        context[:asset_root]
+      end
+      # The url path to link emoji sprites
+      #
+      # :file_name can be used in the asset_path as a placeholder for the sprite file name. If no asset_path is set in the context "emoji/:file_name" is used.
+      # Returns the context's asset_path or the default path if no context asset_path is given.
+      def asset_path(name)
+        if context[:asset_path]
+          context[:asset_path].gsub(':file_name', emoji_filename(name))
+        else
+          File.join('emoji', emoji_filename(name))
+        end
+      end
+      private
+      # Build an emoji image tag
+      def emoji_image_tag(name)
+        require 'active_support/core_ext/hash/indifferent_access'
+        html_attrs =
+          default_img_attrs(name)
+          .merge!((context[:img_attrs] || {}).with_indifferent_access)
+          .map { |attr, value| !value.nil? && %(#{attr}="#{value.respond_to?(:call) && value.call(name) || value}") }
+          .reject(&:blank?).join(' '.freeze)
+        "<img #{html_attrs} />"
+      end
+      # Default attributes for img tag
+      def default_img_attrs(name)
+        {
+          'class' => 'emoji'.freeze,
+          'title' => ":#{name}:",
+          'alt' => ":#{name}:",
+          'src' => emoji_url(name).to_s,
+          'align' => 'absmiddle'.freeze,
+          'width' => '20'.freeze,
+          'height' => '20'.freeze
+        }
+      end
+      def emoji_url(name)
+        File.join(asset_root, asset_path(name))
+      end
+      # Build a regexp that matches all valid :emoji: names.
+      def self.emoji_pattern
+        @emoji_pattern ||= /:(#{emoji_names.map { |name| Regexp.escape(name) }.join('|')}):/
+      end
+      def emoji_pattern
+        self.class.emoji_pattern
+      end
+      def self.emoji_names
+        Emoji.all.map(&:aliases).flatten.sort
+      end
+      def emoji_filename(name)
+        Emoji.find_by_alias(name).image_filename
+      end
+      # Return ancestor tags to stop the emojification.
+      #
+      # @return [Array<String>] Ancestor tags.
+      def ignored_ancestor_tags
+        if context[:ignored_ancestor_tags]
+          DEFAULT_IGNORED_ANCESTOR_TAGS | context[:ignored_ancestor_tags]
+        else
+          DEFAULT_IGNORED_ANCESTOR_TAGS
+        end
+      end
+    end
+  end
+end

data/lib/html/pipeline-plus/filter.rb ADDED Viewed

@@ -0,0 +1,163 @@
+module HTML
+  class Pipeline
+    # Base class for user content HTML filters. Each filter takes an
+    # HTML string or Nokogiri::HTML::DocumentFragment, performs
+    # modifications and/or writes information to the result hash. Filters must
+    # return a DocumentFragment (typically the same instance provided to the call
+    # method) or a String with HTML markup.
+    #
+    # Example filter that replaces all images with trollface:
+    #
+    #   class FuuuFilter < HTML::Pipeline::Filter
+    #     def call
+    #       doc.search('img').each do |img|
+    #         img['src'] = "http://paradoxdgn.com/junk/avatars/trollface.jpg"
+    #       end
+    #     end
+    #   end
+    #
+    # The context Hash passes options to filters and should not be changed in
+    # place.  A Result Hash allows filters to make extracted information
+    # available to the caller and is mutable.
+    #
+    # Common context options:
+    #   :base_url   - The site's base URL
+    #   :repository - A Repository providing context for the HTML being processed
+    #
+    # Each filter may define additional options and output values. See the class
+    # docs for more info.
+    class Filter
+      class InvalidDocumentException < StandardError; end
+      def initialize(doc, context = nil, result = nil)
+        if doc.is_a?(String)
+          @html = doc.to_str
+          @doc = nil
+        else
+          @doc = doc
+          @html = nil
+        end
+        @context = context || {}
+        @result = result || {}
+        validate
+      end
+      # Public: Returns a simple Hash used to pass extra information into filters
+      # and also to allow filters to make extracted information available to the
+      # caller.
+      attr_reader :context
+      # Public: Returns a Hash used to allow filters to pass back information
+      # to callers of the various Pipelines.  This can be used for
+      # #mentioned_users, for example.
+      attr_reader :result
+      # The Nokogiri::HTML::DocumentFragment to be manipulated. If the filter was
+      # provided a String, parse into a DocumentFragment the first time this
+      # method is called.
+      def doc
+        @doc ||= parse_html(html)
+      end
+      # The String representation of the document. If a DocumentFragment was
+      # provided to the Filter, it is serialized into a String when this method is
+      # called.
+      def html
+        raise InvalidDocumentException if @html.nil? && @doc.nil?
+        @html || doc.to_html
+      end
+      # The main filter entry point. The doc attribute is guaranteed to be a
+      # Nokogiri::HTML::DocumentFragment when invoked. Subclasses should modify
+      # this document in place or extract information and add it to the context
+      # hash.
+      def call
+        raise NotImplementedError
+      end
+      # Make sure the context has everything we need. Noop: Subclasses can override.
+      def validate; end
+      # The Repository object provided in the context hash, or nil when no
+      # :repository was specified.
+      #
+      # It's assumed that the repository context has already been checked
+      # for permissions
+      def repository
+        context[:repository]
+      end
+      # The User object provided in the context hash, or nil when no user
+      # was specified
+      def current_user
+        context[:current_user]
+      end
+      # The site's base URL provided in the context hash, or '/' when no
+      # base URL was specified.
+      def base_url
+        context[:base_url] || '/'
+      end
+      # Ensure the passed argument is a DocumentFragment. When a string is
+      # provided, it is parsed and returned; otherwise, the DocumentFragment is
+      # returned unmodified.
+      def parse_html(html)
+        HTML::Pipeline.parse(html)
+      end
+      # Helper method for filter subclasses used to determine if any of a node's
+      # ancestors have one of the tag names specified.
+      #
+      # node - The Node object to check.
+      # tags - An array of tag name strings to check. These should be downcase.
+      #
+      # Returns true when the node has a matching ancestor.
+      def has_ancestor?(node, tags)
+        while node = node.parent
+          break true if tags.include?(node.name.downcase)
+        end
+      end
+      # Perform a filter on doc with the given context.
+      #
+      # Returns a HTML::Pipeline::DocumentFragment or a String containing HTML
+      # markup.
+      def self.call(doc, context = nil, result = nil)
+        new(doc, context, result).call
+      end
+      # Like call but guarantees that a DocumentFragment is returned, even when
+      # the last filter returns a String.
+      def self.to_document(input, context = nil)
+        html = call(input, context)
+        HTML::Pipeline.parse(html)
+      end
+      # Like call but guarantees that a string of HTML markup is returned.
+      def self.to_html(input, context = nil)
+        output = call(input, context)
+        if output.respond_to?(:to_html)
+          output.to_html
+        else
+          output.to_s
+        end
+      end
+      # Validator for required context. This will check that anything passed in
+      # contexts exists in @contexts
+      #
+      # If any errors are found an ArgumentError will be raised with a
+      # message listing all the missing contexts and the filters that
+      # require them.
+      def needs(*keys)
+        missing = keys.reject { |key| context.include? key }
+        if missing.any?
+          raise ArgumentError,
+                "Missing context keys for #{self.class.name}: #{missing.map(&:inspect).join ', '}"
+        end
+      end
+    end
+  end
+end

data/lib/html/pipeline-plus/https_filter.rb ADDED Viewed

@@ -0,0 +1,27 @@
+module HTML
+  class Pipeline
+    # HTML Filter for replacing http references to :http_url with https versions.
+    # Subdomain references are not rewritten.
+    #
+    # Context options:
+    #   :http_url - The HTTP url to force HTTPS. Falls back to :base_url
+    class HttpsFilter < Filter
+      def call
+        doc.css(%(a[href^="#{http_url}"])).each do |element|
+          element['href'] = element['href'].sub(/^http:/, 'https:')
+        end
+        doc
+      end
+      # HTTP url to replace. Falls back to :base_url
+      def http_url
+        context[:http_url] || context[:base_url]
+      end
+      # Raise error if :http_url undefined
+      def validate
+        needs :http_url unless http_url
+      end
+    end
+  end
+end

data/lib/html/pipeline-plus/image_filter.rb ADDED Viewed

@@ -0,0 +1,17 @@
+module HTML
+  class Pipeline
+    # HTML Filter that converts image's url into <img> tag.
+    # For example, it will convert
+    #   http://example.com/test.jpg
+    # into
+    #   <img src="http://example.com/test.jpg" alt=""/>.
+    class ImageFilter < TextFilter
+      def call
+        @text.gsub(/(https|http)?:\/\/.+\.(jpg|jpeg|bmp|gif|png)(\?\S+)?/i) do |match|
+          %(<img src="#{match}" alt=""/>)
+        end
+      end
+    end
+  end
+end

data/lib/html/pipeline-plus/image_max_width_filter.rb ADDED Viewed

@@ -0,0 +1,35 @@
+module HTML
+  class Pipeline
+    # This filter rewrites image tags with a max-width inline style and also wraps
+    # the image in an <a> tag that causes the full size image to be opened in a
+    # new tab.
+    #
+    # The max-width inline styles are especially useful in HTML email which
+    # don't use a global stylesheets.
+    class ImageMaxWidthFilter < Filter
+      def call
+        doc.search('img').each do |element|
+          # Skip if there's already a style attribute. Not sure how this
+          # would happen but we can reconsider it in the future.
+          next if element['style']
+          # Bail out if src doesn't look like a valid http url. trying to avoid weird
+          # js injection via javascript: urls.
+          next if element['src'].to_s.strip =~ /\Ajavascript/i
+          element['style'] = 'max-width:100%;'
+          link_image element unless has_ancestor?(element, %w[a])
+        end
+        doc
+      end
+      def link_image(element)
+        link = doc.document.create_element('a', href: element['src'], target: '_blank')
+        link.add_child(element.dup)
+        element.replace(link)
+      end
+    end
+  end
+end

data/lib/html/pipeline-plus/markdown_filter.rb ADDED Viewed

@@ -0,0 +1,37 @@
+HTML::Pipeline.require_dependency('commonmarker', 'MarkdownFilter')
+module HTML
+  class Pipeline
+    # HTML Filter that converts Markdown text into HTML and converts into a
+    # DocumentFragment. This is different from most filters in that it can take a
+    # non-HTML as input. It must be used as the first filter in a pipeline.
+    #
+    # Context options:
+    #   :gfm      => false    Disable GFM line-end processing
+    #   :commonmarker_extensions => [ :table, :strikethrough,
+    #      :tagfilter, :autolink ] Common marker extensions to include
+    #
+    # This filter does not write any additional information to the context hash.
+    class MarkdownFilter < TextFilter
+      def initialize(text, context = nil, result = nil)
+        super text, context, result
+        @text = @text.delete "\r"
+      end
+      # Convert Markdown to HTML using the best available implementation
+      # and convert into a DocumentFragment.
+      def call
+        options = [:GITHUB_PRE_LANG]
+        options << :HARDBREAKS if context[:gfm] != false
+        options << :UNSAFE if context[:unsafe]
+        extensions = context.fetch(
+          :commonmarker_extensions,
+          %i[table strikethrough tagfilter autolink]
+        )
+        html = CommonMarker.render_html(@text, options, extensions)
+        html.rstrip!
+        html
+      end
+    end
+  end
+end

data/lib/html/pipeline-plus/plain_text_input_filter.rb ADDED Viewed

@@ -0,0 +1,13 @@
+HTML::Pipeline.require_dependency('escape_utils', 'PlainTextInputFilter')
+module HTML
+  class Pipeline
+    # Simple filter for plain text input. HTML escapes the text input and wraps it
+    # in a div.
+    class PlainTextInputFilter < TextFilter
+      def call
+        "<div>#{EscapeUtils.escape_html(@text, false)}</div>"
+      end
+    end
+  end
+end