RubyGems - html-pipeline - Versions diffs - 0.0.4 - Mend

html-pipeline 0.0.4

Files changed (36) hide show

data/.gitignore +19 -0
data/.travis.yml +13 -0
data/Gemfile +9 -0
data/LICENSE +22 -0
data/README.md +128 -0
data/Rakefile +11 -0
data/html-pipeline.gemspec +25 -0
data/lib/html/pipeline.rb +130 -0
data/lib/html/pipeline/@mention_filter.rb +118 -0
data/lib/html/pipeline/autolink_filter.rb +22 -0
data/lib/html/pipeline/body_content.rb +42 -0
data/lib/html/pipeline/camo_filter.rb +64 -0
data/lib/html/pipeline/email_reply_filter.rb +56 -0
data/lib/html/pipeline/emoji_filter.rb +48 -0
data/lib/html/pipeline/filter.rb +158 -0
data/lib/html/pipeline/https_filter.rb +13 -0
data/lib/html/pipeline/image_max_width_filter.rb +37 -0
data/lib/html/pipeline/markdown_filter.rb +29 -0
data/lib/html/pipeline/plain_text_input_filter.rb +11 -0
data/lib/html/pipeline/sanitization_filter.rb +107 -0
data/lib/html/pipeline/syntax_highlight_filter.rb +29 -0
data/lib/html/pipeline/text_filter.rb +14 -0
data/lib/html/pipeline/textile_filter.rb +21 -0
data/lib/html/pipeline/toc_filter.rb +28 -0
data/lib/html/pipeline/version.rb +5 -0
data/test/html/pipeline/autolink_filter_test.rb +22 -0
data/test/html/pipeline/camo_filter_test.rb +39 -0
data/test/html/pipeline/emoji_filter_test.rb +16 -0
data/test/html/pipeline/image_max_width_filter_test.rb +50 -0
data/test/html/pipeline/markdown_filter_test.rb +101 -0
data/test/html/pipeline/mention_filter_test.rb +158 -0
data/test/html/pipeline/plain_text_input_filter_test.rb +22 -0
data/test/html/pipeline/sanitization_filter_test.rb +47 -0
data/test/html/pipeline/toc_filter_test.rb +47 -0
data/test/test_helper.rb +38 -0
metadata +221 -0

data/lib/html/pipeline/autolink_filter.rb ADDED Viewed

@@ -0,0 +1,22 @@
+require 'rinku'
+module HTML
+  class Pipeline
+    # HTML Filter for auto_linking urls in HTML.
+    #
+    # Context options:
+    #   :autolink - boolean whether to autolink urls
+    #   :flags    - additional Rinku flags. See https://github.com/vmg/rinku
+    #
+    # This filter does not write additional information to the context.
+    class AutolinkFilter < Filter
+      def call
+        return html if context[:autolink] == false
+        flags = 0
+        flags |= context[:flags] if context[:flags]
+        Rinku.auto_link(html, :urls, nil, %w[a script kbd pre code], flags)
+      end
+    end
+  end
+end

data/lib/html/pipeline/body_content.rb ADDED Viewed

@@ -0,0 +1,42 @@
+module HTML
+  class Pipeline
+    # Public: Runs a String of content through an HTML processing pipeline,
+    # providing easy access to a generated DocumentFragment.
+    class BodyContent
+      attr_reader :result
+      # Public: Initialize a BodyContent.
+      #
+      # body     - A String body.
+      # context  - A Hash of context options for the filters.
+      # pipeline - A HTML::Pipeline object with one or more Filters.
+      def initialize(body, context, pipeline)
+        @body = body
+        @context = context
+        @pipeline = pipeline
+      end
+      # Public: Gets the memoized result of the body content as it passed through
+      # the Pipeline.
+      #
+      # Returns a Hash, or something similar as defined by @pipeline.result_class.
+      def result
+        @result ||= @pipeline.call @body, @context
+      end
+      # Public: Gets the updated body from the Pipeline result.
+      #
+      # Returns a String or DocumentFragment.
+      def output
+        @output ||= result[:output]
+      end
+      # Public: Parses the output into a DocumentFragment.
+      #
+      # Returns a DocumentFragment.
+      def document
+        @document ||= HTML::Pipeline.parse output
+      end
+    end
+  end
+end

data/lib/html/pipeline/camo_filter.rb ADDED Viewed

@@ -0,0 +1,64 @@
+require 'openssl'
+module HTML
+  class Pipeline
+    # HTML Filter for replacing http image URLs with camo versions. See:
+    #
+    # https://github.com/atmos/camo
+    #
+    # All images provided in user content should be run through this
+    # filter so that http image sources do not cause mixed-content warnings
+    # in browser clients.
+    #
+    # Context options:
+    #   :asset_proxy - Base URL for constructed asset proxy URLs.
+    #   :asset_proxy_secret_key - The shared secret used to encode URLs.
+    #
+    # This filter does not write additional information to the context.
+    class CamoFilter < Filter
+      # Hijacks images in the markup provided, replacing them with URLs that
+      # go through the github asset proxy.
+      def call
+        doc.search("img").each do |element|
+          next if element['src'].nil?
+          src = element['src'].strip
+          src = src.sub(%r!^http://github.com!, 'https://github.com')
+          next if context[:disable_asset_proxy]
+          if src =~ /^http:/ || src =~ /^https:\/\/img.skitch.com\//
+            element['src'] = asset_proxy_url(src)
+          else
+            element['src'] = src
+          end
+        end
+        doc
+      end
+      # The camouflaged URL for a given image URL.
+      def asset_proxy_url(url)
+        "#{asset_proxy_host}/#{asset_url_hash(url)}/#{hexencode(url)}"
+      end
+      # Private: calculate the HMAC digest for a image source URL.
+      def asset_url_hash(url)
+        digest = OpenSSL::Digest::Digest.new('sha1')
+        OpenSSL::HMAC.hexdigest(digest, asset_proxy_secret_key, url)
+      end
+      # Private: the hostname to use for generated asset proxied URLs.
+      def asset_proxy_host
+        context[:asset_proxy] or raise "Missing context :asset_proxy"
+      end
+      def asset_proxy_secret_key
+        context[:asset_proxy_secret_key] or raise "Missing context :asset_proxy_secret_key"
+      end
+      # Private: helper to hexencode a string. Each byte ends up encoded into
+      # two characters, zero padded value in the range [0-9a-f].
+      def hexencode(str)
+        str.to_enum(:each_byte).map { |byte| "%02x" % byte }.join
+      end
+    end
+  end
+end

data/lib/html/pipeline/email_reply_filter.rb ADDED Viewed

@@ -0,0 +1,56 @@
+module HTML
+  class Pipeline
+    # HTML Filter that converts email reply text into an HTML DocumentFragment.
+    # It must be used as the first filter in a pipeline.
+    #
+    # Context options:
+    #   None
+    #
+    # This filter does not write any additional information to the context hash.
+    class EmailReplyFilter < TextFilter
+      include EscapeUtils
+      EMAIL_HIDDEN_HEADER    = %(<span class="email-hidden-toggle"><a href="#">&hellip;</a></span><div class="email-hidden-reply" style="display:none">).freeze
+      EMAIL_QUOTED_HEADER    = %(<div class="email-quoted-reply">).freeze
+      EMAIL_SIGNATURE_HEADER = %(<div class="email-signature-reply">).freeze
+      EMAIL_FRAGMENT_HEADER  = %(<div class="email-fragment">).freeze
+      EMAIL_HEADER_END       = "</div>".freeze
+      # Scans an email body to determine which bits are quoted and which should
+      # be hidden. EmailReplyParser is used to split the comment into an Array
+      # of quoted or unquoted Blocks. Now, we loop through them and attempt to
+      # add <div> tags around them so we can hide the hidden blocks, and style
+      # the quoted blocks differently. Since multiple blocks may be hidden, be
+      # sure to keep the "email-hidden-reply" <div>s around "email-quoted-reply"
+      # <div> tags. Call this on each comment of a visible thread in the order
+      # that they are displayed. Note: all comments are processed so we can
+      # maintain a Set of SHAs of paragraphs. Only plaintext comments skip the
+      # markdown step.
+      #
+      # Returns the email comment HTML as a String
+      def call
+        found_hidden = nil
+        paragraphs = EmailReplyParser.read(text.dup).fragments.map do |fragment|
+          pieces = [escape_html(fragment.to_s.strip).gsub(/^\s*(>|&gt;)/, '')]
+          if fragment.quoted?
+            pieces.unshift EMAIL_QUOTED_HEADER
+            pieces << EMAIL_HEADER_END
+          elsif fragment.signature?
+            pieces.unshift EMAIL_SIGNATURE_HEADER
+            pieces << EMAIL_HEADER_END
+          else
+            pieces.unshift EMAIL_FRAGMENT_HEADER
+            pieces << EMAIL_HEADER_END
+          end
+          if fragment.hidden? && !found_hidden
+            found_hidden = true
+            pieces.unshift EMAIL_HIDDEN_HEADER
+          end
+          pieces.join
+        end
+        paragraphs << EMAIL_HEADER_END if found_hidden
+        paragraphs.join("\n")
+      end
+    end
+  end
+end

data/lib/html/pipeline/emoji_filter.rb ADDED Viewed

@@ -0,0 +1,48 @@
+require 'emoji'
+module HTML
+  class Pipeline
+    # HTML filter that replaces :emoji: with images.
+    #
+    # Context:
+    #   :asset_root - base url to link to emoji sprite
+    class EmojiFilter < Filter
+      # Build a regexp that matches all valid :emoji: names.
+      EmojiPattern = /:(#{Emoji.names.map { |name| Regexp.escape(name) }.join('|')}):/
+      def call
+        doc.search('text()').each do |node|
+          content = node.to_html
+          next if !content.include?(':')
+          next if has_ancestor?(node, %w(pre code))
+          html = emoji_image_filter(content)
+          next if html == content
+          node.replace(html)
+        end
+        doc
+      end
+      # Replace :emoji: with corresponding images.
+      #
+      # text - String text to replace :emoji: in.
+      #
+      # Returns a String with :emoji: replaced with images.
+      def emoji_image_filter(text)
+        return text unless text.include?(':')
+        text.gsub EmojiPattern do |match|
+          name = $1
+          "<img class='emoji' title=':#{name}:' alt=':#{name}:' src='#{File.join(asset_root, "emoji", "#{name}.png")}' height='20' width='20' align='absmiddle' />"
+        end
+      end
+      # The base url to link emoji sprites
+      #
+      # Raises ArgumentError if context option has not been provided.
+      # Returns the context's asset_root.
+      def asset_root
+        context[:asset_root] or raise ArgumentError, "Missing context :asset_root"
+      end
+    end
+  end
+end

data/lib/html/pipeline/filter.rb ADDED Viewed

@@ -0,0 +1,158 @@
+module HTML
+  class Pipeline
+    # Base class for user content HTML filters. Each filter takes an
+    # HTML string or Nokogiri::HTML::DocumentFragment, performs
+    # modifications and/or writes information to the result hash. Filters must
+    # return a DocumentFragment (typically the same instance provided to the call
+    # method) or a String with HTML markup.
+    #
+    # Example filter that replaces all images with trollface:
+    #
+    #   class FuuuFilter < HTML::Pipeline::Filter
+    #     def call
+    #       doc.search('img').each do |img|
+    #         img['src'] = "http://paradoxdgn.com/junk/avatars/trollface.jpg"
+    #       end
+    #     end
+    #   end
+    #
+    # The context Hash passes options to filters and should not be changed in
+    # place.  A Result Hash allows filters to make extracted information
+    # available to the caller and is mutable.
+    #
+    # Common context options:
+    #   :base_url   - The site's base URL
+    #   :repository - A Repository providing context for the HTML being processed
+    #
+    # Each filter may define additional options and output values. See the class
+    # docs for more info.
+    class Filter
+      class InvalidDocumentException < StandardError; end
+      def initialize(doc, context = nil, result = nil)
+        if doc.kind_of?(String)
+          @html = doc.to_str
+          @doc = nil
+        else
+          @doc = doc
+          @html = nil
+        end
+        @context = context || {}
+        @result = result || {}
+      end
+      # Public: Returns a simple Hash used to pass extra information into filters
+      # and also to allow filters to make extracted information available to the
+      # caller.
+      attr_reader :context
+      # Public: Returns a Hash used to allow filters to pass back information
+      # to callers of the various Pipelines.  This can be used for
+      # #mentioned_users, for example.
+      attr_reader :result
+      # The Nokogiri::HTML::DocumentFragment to be manipulated. If the filter was
+      # provided a String, parse into a DocumentFragment the first time this
+      # method is called.
+      def doc
+        @doc ||= parse_html(html)
+      end
+      # The String representation of the document. If a DocumentFragment was
+      # provided to the Filter, it is serialized into a String when this method is
+      # called.
+      def html
+        raise InvalidDocumentException if @html.nil? && @doc.nil?
+        @html || doc.to_html
+      end
+      # The main filter entry point. The doc attribute is guaranteed to be a
+      # Nokogiri::HTML::DocumentFragment when invoked. Subclasses should modify
+      # this document in place or extract information and add it to the context
+      # hash.
+      def call
+        raise NotImplementedError
+      end
+      # The Repository object provided in the context hash, or nil when no
+      # :repository was specified.
+      #
+      # It's assumed that the repository context has already been checked
+      # for permissions
+      def repository
+        context[:repository]
+      end
+      # The User object provided in the context hash, or nil when no user
+      # was specified
+      def current_user
+        context[:current_user]
+      end
+      # Return whether the filter can access a given repo while
+      # applying a filter
+      #
+      # A repo can only be accessed if its pullable by the user who
+      # submitted the content of this filter, or if it's the same as
+      # the repository context in which the filter runs
+      def can_access_repo?(repo)
+        return false if repo.nil?
+        return true if repo == repository
+        repo.pullable_by?(current_user)
+      end
+      # The site's base URL provided in the context hash, or '/' when no
+      # base URL was specified.
+      def base_url
+        context[:base_url] || '/'
+      end
+      # Ensure the passed argument is a DocumentFragment. When a string is
+      # provided, it is parsed and returned; otherwise, the DocumentFragment is
+      # returned unmodified.
+      def parse_html(html)
+        HTML::Pipeline.parse(html)
+      end
+      # Helper method for filter subclasses used to determine if any of a node's
+      # ancestors have one of the tag names specified.
+      #
+      # node - The Node object to check.
+      # tags - An array of tag name strings to check. These should be downcase.
+      #
+      # Returns true when the node has a matching ancestor.
+      def has_ancestor?(node, tags)
+        while node = node.parent
+          if tags.include?(node.name.downcase)
+            break true
+          end
+        end
+      end
+      # Perform a filter on doc with the given context.
+      #
+      # Returns a HTML::Pipeline::DocumentFragment or a String containing HTML
+      # markup.
+      def self.call(doc, context = nil, result = nil)
+        new(doc, context, result).call
+      end
+      # Like call but guarantees that a DocumentFragment is returned, even when
+      # the last filter returns a String.
+      def self.to_document(input, context = nil)
+        html = call(input, context)
+        HTML::Pipeline::parse(html)
+      end
+      # Like call but guarantees that a string of HTML markup is returned.
+      def self.to_html(input, context = nil)
+        output = call(input, context)
+        if output.respond_to?(:to_html)
+          output.to_html
+        else
+          output.to_s
+        end
+      end
+    end
+  end
+end

data/lib/html/pipeline/https_filter.rb ADDED Viewed

@@ -0,0 +1,13 @@
+module HTML
+  class Pipeline
+    # HTML Filter for replacing http github urls with https versions.
+    class HttpsFilter < Filter
+      def call
+        doc.css('a[href^="http://github.com"]').each do |element|
+          element['href'] = element['href'].sub(/^http:/,'https:')
+        end
+        doc
+      end
+    end
+  end
+end

data/lib/html/pipeline/image_max_width_filter.rb ADDED Viewed

@@ -0,0 +1,37 @@
+module HTML
+  class Pipeline
+    # This filter rewrites image tags with a max-width inline style and also wraps
+    # the image in an <a> tag that causes the full size image to be opened in a
+    # new tab.
+    #
+    # The max-width inline styles are especially useful in HTML email which
+    # don't use a global stylesheets.
+    class ImageMaxWidthFilter < Filter
+      def call
+        doc.search('img').each do |element|
+          # Skip if theres already a style attribute. Not sure how this
+          # would happen but we can reconsider it in the future.
+          next if element['style']
+          # Bail out if src doesn't look like a valid http url. tryna avoid weird
+          # js injection via javascript: urls.
+          next if element['src'].to_s.strip =~ /\Ajavascript/i
+          element['style'] = "max-width:100%;"
+          if !has_ancestor?(element, %w(a))
+            link_image element
+          end
+        end
+        doc
+      end
+      def link_image(element)
+        link = doc.document.create_element('a', :href => element['src'], :target => '_blank')
+        link.add_child(element.dup)
+        element.replace(link)
+      end
+    end
+  end
+end