RubyGems - html-pipeline-no-charlock - Versions diffs - 0.0.6 - Mend

html-pipeline-no-charlock 0.0.6

Files changed (38) hide show

data/.gitignore +19 -0
data/.travis.yml +13 -0
data/CHANGELOG.md +16 -0
data/Gemfile +9 -0
data/LICENSE +22 -0
data/README.md +221 -0
data/Rakefile +13 -0
data/html-pipeline-no-charlock.gemspec +25 -0
data/html-pipeline.gemspec +26 -0
data/lib/html/pipeline.rb +130 -0
data/lib/html/pipeline/@mention_filter.rb +118 -0
data/lib/html/pipeline/autolink_filter.rb +22 -0
data/lib/html/pipeline/body_content.rb +42 -0
data/lib/html/pipeline/camo_filter.rb +70 -0
data/lib/html/pipeline/email_reply_filter.rb +56 -0
data/lib/html/pipeline/emoji_filter.rb +54 -0
data/lib/html/pipeline/filter.rb +178 -0
data/lib/html/pipeline/https_filter.rb +13 -0
data/lib/html/pipeline/image_max_width_filter.rb +37 -0
data/lib/html/pipeline/markdown_filter.rb +29 -0
data/lib/html/pipeline/plain_text_input_filter.rb +11 -0
data/lib/html/pipeline/sanitization_filter.rb +105 -0
data/lib/html/pipeline/syntax_highlight_filter.rb +29 -0
data/lib/html/pipeline/text_filter.rb +14 -0
data/lib/html/pipeline/textile_filter.rb +21 -0
data/lib/html/pipeline/toc_filter.rb +28 -0
data/lib/html/pipeline/version.rb +5 -0
data/test/html/pipeline/autolink_filter_test.rb +22 -0
data/test/html/pipeline/camo_filter_test.rb +47 -0
data/test/html/pipeline/emoji_filter_test.rb +18 -0
data/test/html/pipeline/image_max_width_filter_test.rb +50 -0
data/test/html/pipeline/markdown_filter_test.rb +101 -0
data/test/html/pipeline/mention_filter_test.rb +158 -0
data/test/html/pipeline/plain_text_input_filter_test.rb +22 -0
data/test/html/pipeline/sanitization_filter_test.rb +47 -0
data/test/html/pipeline/toc_filter_test.rb +47 -0
data/test/test_helper.rb +38 -0
metadata +214 -0

data/lib/html/pipeline/https_filter.rb ADDED

@@ -0,0 +1,13 @@
+module HTML
+  class Pipeline
+    # HTML Filter for replacing http github urls with https versions.
+    class HttpsFilter < Filter
+      def call
+        doc.css('a[href^="http://github.com"]').each do |element|
+          element['href'] = element['href'].sub(/^http:/,'https:')
+        end
+        doc
+      end
+    end
+  end
+end

data/lib/html/pipeline/image_max_width_filter.rb ADDED

@@ -0,0 +1,37 @@
+module HTML
+  class Pipeline
+    # This filter rewrites image tags with a max-width inline style and also wraps
+    # the image in an <a> tag that causes the full size image to be opened in a
+    # new tab.
+    #
+    # The max-width inline styles are especially useful in HTML email which
+    # don't use a global stylesheets.
+    class ImageMaxWidthFilter < Filter
+      def call
+        doc.search('img').each do |element|
+          # Skip if there's already a style attribute. Not sure how this
+          # would happen but we can reconsider it in the future.
+          next if element['style']
+          # Bail out if src doesn't look like a valid http url. trying to avoid weird
+          # js injection via javascript: urls.
+          next if element['src'].to_s.strip =~ /\Ajavascript/i
+          element['style'] = "max-width:100%;"
+          if !has_ancestor?(element, %w(a))
+            link_image element
+          end
+        end
+        doc
+      end
+      def link_image(element)
+        link = doc.document.create_element('a', :href => element['src'], :target => '_blank')
+        link.add_child(element.dup)
+        element.replace(link)
+      end
+    end
+  end
+end

data/lib/html/pipeline/markdown_filter.rb ADDED

@@ -0,0 +1,29 @@
+require 'github/markdown'
+module HTML
+  class Pipeline
+    # HTML Filter that converts Markdown text into HTML and converts into a
+    # DocumentFragment. This is different from most filters in that it can take a
+    # non-HTML as input. It must be used as the first filter in a pipeline.
+    #
+    # Context options:
+    #   :gfm      => false    Disable GFM line-end processing
+    #
+    # This filter does not write any additional information to the context hash.
+    class MarkdownFilter < TextFilter
+      def initialize(text, context = nil, result = nil)
+        super text, context, result
+        @text = @text.gsub "\r", ''
+      end
+      # Convert Markdown to HTML using the best available implementation
+      # and convert into a DocumentFragment.
+      def call
+        mode = (context[:gfm] != false) ? :gfm : :markdown
+        html = GitHub::Markdown.to_html(@text, mode)
+        html.rstrip!
+        html
+      end
+    end
+  end
+end

data/lib/html/pipeline/plain_text_input_filter.rb ADDED

@@ -0,0 +1,11 @@
+module HTML
+  class Pipeline
+    # Simple filter for plain text input. HTML escapes the text input and wraps it
+    # in a div.
+    class PlainTextInputFilter < TextFilter
+      def call
+        "<div>#{EscapeUtils.escape_html(@text, false)}</div>"
+      end
+    end
+  end
+end

data/lib/html/pipeline/sanitization_filter.rb ADDED

@@ -0,0 +1,105 @@
+require 'sanitize'
+module HTML
+  class Pipeline
+    # HTML filter with sanitization routines and whitelists. This module defines
+    # what HTML is allowed in user provided content and fixes up issues with
+    # unbalanced tags and whatnot.
+    #
+    # See the Sanitize docs for more information on the underlying library:
+    #
+    # https://github.com/rgrove/sanitize/#readme
+    #
+    # Context options:
+    #   :whitelist - The sanitizer whitelist configuration to use. This can be one
+    #                of the options constants defined in this class or a custom
+    #                sanitize options hash.
+    #
+    # This filter does not write additional information to the context.
+    class SanitizationFilter < Filter
+      # Element names that may legitimately contain an <li>.
+      LISTS     = Set.new(%w(ul ol).freeze)
+      LIST_ITEM = 'li'.freeze
+      # List of table child elements. These must be contained by a <table> element
+      # or they are not allowed through. Otherwise they can be used to break out
+      # of places we're using tables to contain formatted user content (like pull
+      # request review comments).
+      TABLE_ITEMS = Set.new(%w(tr td th).freeze)
+      TABLE       = 'table'.freeze
+      # The main sanitization whitelist. Only these elements and attributes are
+      # allowed through by default.
+      WHITELIST = {
+        :elements => %w(
+          h1 h2 h3 h4 h5 h6 h7 h8 br b i strong em a pre code img tt
+          div ins del sup sub p ol ul table blockquote dl dt dd
+          kbd q samp var hr ruby rt rp li tr td th
+        ),
+        # Per-element attribute lists, plus the :all list that applies to every
+        # allowed element.
+        :attributes => {
+          'a' => ['href'],
+          'img' => ['src'],
+          'div' => ['itemscope', 'itemtype'],
+          :all  => ['abbr', 'accept', 'accept-charset',
+                    'accesskey', 'action', 'align', 'alt', 'axis',
+                    'border', 'cellpadding', 'cellspacing', 'char',
+                    'charoff', 'charset', 'checked', 'cite',
+                    'clear', 'cols', 'colspan', 'color',
+                    'compact', 'coords', 'datetime', 'dir',
+                    'disabled', 'enctype', 'for', 'frame',
+                    'headers', 'height', 'hreflang',
+                    'hspace', 'ismap', 'label', 'lang',
+                    'longdesc', 'maxlength', 'media', 'method',
+                    'multiple', 'name', 'nohref', 'noshade',
+                    'nowrap', 'prompt', 'readonly', 'rel', 'rev',
+                    'rows', 'rowspan', 'rules', 'scope',
+                    'selected', 'shape', 'size', 'span',
+                    'start', 'summary', 'tabindex', 'target',
+                    'title', 'type', 'usemap', 'valign', 'value',
+                    'vspace', 'width', 'itemprop']
+        },
+        # URL schemes accepted in link hrefs and image srcs; :relative admits
+        # scheme-less URLs.
+        :protocols => {
+          'a'   => {'href' => ['http', 'https', 'mailto', :relative, 'github-windows', 'github-mac']},
+          'img' => {'src'  => ['http', 'https', :relative]}
+        },
+        :transformers => [
+          # Top-level <li> elements are removed because they can break out of
+          # containing markup.
+          lambda { |env|
+            name, node = env[:node_name], env[:node]
+            # An <li> with no <ul>/<ol> ancestor is replaced by its children.
+            if name == LIST_ITEM && !node.ancestors.any?{ |n| LISTS.include?(n.name) }
+              node.replace(node.children)
+            end
+          },
+          # Table child elements that are not contained by a <table> are removed.
+          lambda { |env|
+            name, node = env[:node_name], env[:node]
+            # A <tr>/<td>/<th> with no <table> ancestor is replaced by its children.
+            if TABLE_ITEMS.include?(name) && !node.ancestors.any? { |n| n.name == TABLE }
+              node.replace(node.children)
+            end
+          }
+        ]
+      }
+      # A more limited sanitization whitelist. This includes all attributes,
+      # protocols, and transformers from WHITELIST but with a more locked down
+      # set of allowed elements.
+      LIMITED = WHITELIST.merge(
+        :elements => %w(b i strong em a pre code img ins del sup sub p ol ul li))
+      # Strip all HTML tags from the document.
+      FULL = { :elements => [] }
+      # Sanitize markup using the Sanitize library. The document is cleaned
+      # with the configuration returned by #whitelist, and the result of
+      # Sanitize.clean_node! is returned.
+      def call
+        Sanitize.clean_node!(doc, whitelist)
+      end
+      # The whitelist to use when sanitizing. This can be passed in the context
+      # hash to the filter but defaults to WHITELIST constant value above.
+      def whitelist
+        context[:whitelist] || WHITELIST
+      end
+    end
+  end
+end

data/lib/html/pipeline/syntax_highlight_filter.rb ADDED

@@ -0,0 +1,29 @@
+require 'linguist'
+module HTML
+  class Pipeline
+    # HTML Filter that syntax highlights code blocks wrapped
+    # in <pre lang="...">.
+    class SyntaxHighlightFilter < Filter
+      def call
+        doc.search('pre').each do |node|
+          next unless lang = node['lang']
+          next unless lexer = Pygments::Lexer[lang]
+          text = node.inner_text
+          html = highlight_with_timeout_handling(lexer, text)
+          next if html.nil?
+          node.replace(html)
+        end
+        doc
+      end
+      def highlight_with_timeout_handling(lexer, text)
+        lexer.highlight(text)
+      rescue Timeout::Error => boom
+        nil
+      end
+    end
+  end
+end

data/lib/html/pipeline/text_filter.rb ADDED

@@ -0,0 +1,14 @@
+module HTML
+  class Pipeline
+    class TextFilter < Filter
+      attr_reader :text
+      def initialize(text, context = nil, result = nil)
+        raise TypeError, "text cannot be HTML" if text.is_a?(DocumentFragment)
+        # Ensure that this is always a string
+        @text = text.respond_to?(:to_str) ? text.to_str : text.to_s
+        super nil, context, result
+      end
+    end
+  end
+end

data/lib/html/pipeline/textile_filter.rb ADDED

@@ -0,0 +1,21 @@
+module HTML
+  class Pipeline
+    # HTML Filter that converts Textile text into HTML and converts into a
+    # DocumentFragment. This is different from most filters in that it can take a
+    # non-HTML as input. It must be used as the first filter in a pipeline.
+    #
+    # Context options:
+    #   :autolink => false    Disable autolinking URLs
+    #
+    # This filter does not write any additional information to the context hash.
+    #
+    # NOTE This filter is provided for really old comments only. It probably
+    # shouldn't be used for anything new.
+    class TextileFilter < TextFilter
+      # Convert Textile to HTML and convert into a DocumentFragment.
+      def call
+        RedCloth.new(@text).to_html
+      end
+    end
+  end
+end

data/lib/html/pipeline/toc_filter.rb ADDED

@@ -0,0 +1,28 @@
+module HTML
+  class Pipeline
+    # HTML filter that adds a 'name' attribute to all headers
+    # in a document, so they can be accessed from a table of contents
+    #
+    # TODO: besides adding the name attribute, we should get around to
+    # eventually generating the Table of Contents itself, with links
+    # to each header
+    class TableOfContentsFilter < Filter
+      def call
+        headers = Hash.new(0)
+        doc.css('h1, h2, h3, h4, h5, h6').each do |node|
+          name = node.text.downcase
+          name.gsub!(/[^\w\- ]/, '') # remove punctuation
+          name.gsub!(' ', '-') # replace spaces with dash
+          name = EscapeUtils.escape_uri(name) # escape extended UTF-8 chars
+          uniq = (headers[name] > 0) ? "-#{headers[name]}" : ''
+          headers[name] += 1
+          if header_content = node.children.first
+            header_content.add_previous_sibling(%Q{<a name="#{name}#{uniq}" class="anchor" href="##{name}#{uniq}"><span class="mini-icon mini-icon-link"></span></a>})
+          end
+        end
+        doc
+      end
+    end
+  end
+end

data/lib/html/pipeline/version.rb ADDED

@@ -0,0 +1,5 @@
+module HTML
+  class Pipeline
+    # Version string of this release.
+    VERSION = "0.0.6"
+  end
+end

data/test/html/pipeline/autolink_filter_test.rb ADDED

@@ -0,0 +1,22 @@
+require "test_helper"
+AutolinkFilter = HTML::Pipeline::AutolinkFilter
+class HTML::Pipeline::AutolinkFilterTest < Test::Unit::TestCase
+  def test_uses_rinku_for_autolinking
+    # just try to parse a complicated piece of HTML
+    # that Rails auto_link cannot handle
+    assert_equal '<p>"<a href="http://www.github.com">http://www.github.com</a>"</p>',
+      AutolinkFilter.to_html('<p>"http://www.github.com"</p>')
+  end
+  def test_autolink_option
+    assert_equal '<p>"http://www.github.com"</p>',
+      AutolinkFilter.to_html('<p>"http://www.github.com"</p>', :autolink => false)
+  end
+  def test_autolink_flags
+    assert_equal '<p>"<a href="http://github">http://github</a>"</p>',
+      AutolinkFilter.to_html('<p>"http://github"</p>', :flags => Rinku::AUTOLINK_SHORT_DOMAINS)
+  end
+end

data/test/html/pipeline/camo_filter_test.rb ADDED

@@ -0,0 +1,47 @@
+require "test_helper"
+class HTML::Pipeline::CamoFilterTest < Test::Unit::TestCase
+  CamoFilter = HTML::Pipeline::CamoFilter
+  def setup
+    @asset_proxy_url        = 'https//assets.example.org'
+    @asset_proxy_secret_key = 'ssssh-secret'
+    @options = {
+      :asset_proxy            => @asset_proxy_url,
+      :asset_proxy_secret_key => @asset_proxy_secret_key
+    }
+  end
+  def test_camouflaging_http_image_urls
+    orig = %(<p><img src="http://twitter.com/img.png"></p>)
+    assert_includes 'img src="' + @asset_proxy_url,
+      CamoFilter.call(orig, @options).to_s
+  end
+  def test_rewrites_dotcom_image_urls
+    orig = %(<p><img src="http://github.com/img.png"></p>)
+    assert_equal "<p><img src=\"https://github.com/img.png\"></p>",
+      CamoFilter.call(orig, @options).to_s
+  end
+  def test_not_camouflaging_https_image_urls
+    orig = %(<p><img src="https://foo.com/img.png"></p>)
+    assert_doesnt_include 'img src="' + @asset_proxy_url,
+      CamoFilter.call(orig, @options).to_s
+  end
+  def test_handling_images_with_no_src_attribute
+    orig = %(<p><img></p>)
+    assert_nothing_raised do
+      CamoFilter.call(orig, @options).to_s
+    end
+  end
+  def test_required_context_validation
+    exception = assert_raise(ArgumentError) {
+      CamoFilter.call("", {})
+    }
+    assert_match /:asset_proxy[^_]/, exception.message
+    assert_match /:asset_proxy_secret_key/, exception.message
+  end
+end

data/test/html/pipeline/emoji_filter_test.rb ADDED

@@ -0,0 +1,18 @@
+require 'test_helper'
+class HTML::Pipeline::EmojiFilterTest < Test::Unit::TestCase
+  EmojiFilter = HTML::Pipeline::EmojiFilter
+  def test_emojify
+    filter = EmojiFilter.new("<p>:shipit:</p>", {:asset_root => 'https://foo.com'})
+    doc = filter.call
+    assert_match "https://foo.com/emoji/shipit.png", doc.search('img').attr('src').value
+  end
+  def test_required_context_validation
+    exception = assert_raise(ArgumentError) {
+      EmojiFilter.call("", {})
+    }
+    assert_match /:asset_root/, exception.message
+  end
+end

data/test/html/pipeline/image_max_width_filter_test.rb ADDED

@@ -0,0 +1,50 @@
+require "test_helper"
+class HTML::Pipeline::ImageMaxWidthFilterTest < Test::Unit::TestCase
+  def filter(html)
+    HTML::Pipeline::ImageMaxWidthFilter.call(html)
+  end
+  def test_rewrites_image_style_tags
+    body = "<p>Screenshot: <img src='screenshot.png'></p>"
+    doc  = Nokogiri::HTML::DocumentFragment.parse(body)
+    res = filter(doc)
+    assert_equal_html %q(<p>Screenshot: <a target="_blank" href="screenshot.png"><img src="screenshot.png" style="max-width:100%;"></a></p>),
+      res.to_html
+  end
+  def test_leaves_existing_image_style_tags_alone
+    body = "<p><img src='screenshot.png' style='width:100px;'></p>"
+    doc  = Nokogiri::HTML::DocumentFragment.parse(body)
+    res = filter(doc)
+    assert_equal_html '<p><img src="screenshot.png" style="width:100px;"></p>',
+      res.to_html
+  end
+  def test_links_to_image
+    body = "<p>Screenshot: <img src='screenshot.png'></p>"
+    doc  = Nokogiri::HTML::DocumentFragment.parse(body)
+    res = filter(doc)
+    assert_equal_html '<p>Screenshot: <a target="_blank" href="screenshot.png"><img src="screenshot.png" style="max-width:100%;"></a></p>',
+      res.to_html
+  end
+  def test_doesnt_link_to_image_when_already_linked
+    body = "<p>Screenshot: <a href='blah.png'><img src='screenshot.png'></a></p>"
+    doc  = Nokogiri::HTML::DocumentFragment.parse(body)
+    res = filter(doc)
+    assert_equal_html %q(<p>Screenshot: <a href="blah.png"><img src="screenshot.png" style="max-width:100%;"></a></p>),
+      res.to_html
+  end
+  def test_doesnt_screw_up_inlined_images
+    body = "<p>Screenshot <img src='screenshot.png'>, yes, this is a <b>screenshot</b> indeed.</p>"
+    doc  = Nokogiri::HTML::DocumentFragment.parse(body)
+    assert_equal_html %q(<p>Screenshot <a target="_blank" href="screenshot.png"><img src="screenshot.png" style="max-width:100%;"></a>, yes, this is a <b>screenshot</b> indeed.</p>), filter(doc).to_html
+  end
+end