RubyGems - html-pipeline-no-charlock - Versions diffs - 0.0.6 - Mend

html-pipeline-no-charlock 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

data/.gitignore +19 -0
data/.travis.yml +13 -0
data/CHANGELOG.md +16 -0
data/Gemfile +9 -0
data/LICENSE +22 -0
data/README.md +221 -0
data/Rakefile +13 -0
data/html-pipeline-no-charlock.gemspec +25 -0
data/html-pipeline.gemspec +26 -0
data/lib/html/pipeline.rb +130 -0
data/lib/html/pipeline/@mention_filter.rb +118 -0
data/lib/html/pipeline/autolink_filter.rb +22 -0
data/lib/html/pipeline/body_content.rb +42 -0
data/lib/html/pipeline/camo_filter.rb +70 -0
data/lib/html/pipeline/email_reply_filter.rb +56 -0
data/lib/html/pipeline/emoji_filter.rb +54 -0
data/lib/html/pipeline/filter.rb +178 -0
data/lib/html/pipeline/https_filter.rb +13 -0
data/lib/html/pipeline/image_max_width_filter.rb +37 -0
data/lib/html/pipeline/markdown_filter.rb +29 -0
data/lib/html/pipeline/plain_text_input_filter.rb +11 -0
data/lib/html/pipeline/sanitization_filter.rb +105 -0
data/lib/html/pipeline/syntax_highlight_filter.rb +29 -0
data/lib/html/pipeline/text_filter.rb +14 -0
data/lib/html/pipeline/textile_filter.rb +21 -0
data/lib/html/pipeline/toc_filter.rb +28 -0
data/lib/html/pipeline/version.rb +5 -0
data/test/html/pipeline/autolink_filter_test.rb +22 -0
data/test/html/pipeline/camo_filter_test.rb +47 -0
data/test/html/pipeline/emoji_filter_test.rb +18 -0
data/test/html/pipeline/image_max_width_filter_test.rb +50 -0
data/test/html/pipeline/markdown_filter_test.rb +101 -0
data/test/html/pipeline/mention_filter_test.rb +158 -0
data/test/html/pipeline/plain_text_input_filter_test.rb +22 -0
data/test/html/pipeline/sanitization_filter_test.rb +47 -0
data/test/html/pipeline/toc_filter_test.rb +47 -0
data/test/test_helper.rb +38 -0
metadata +214 -0

data/lib/html/pipeline/https_filter.rb ADDED

@@ -0,0 +1,13 @@
+module HTML
+  class Pipeline
+    # HTML Filter for replacing http github urls with https versions.
+    class HttpsFilter < Filter
+      def call
+        doc.css('a[href^="http://github.com"]').each do |element|
+          element['href'] = element['href'].sub(/^http:/,'https:')
+        end
+        doc
+      end
+    end
+  end
+end

data/lib/html/pipeline/image_max_width_filter.rb ADDED

@@ -0,0 +1,37 @@
+module HTML
+  class Pipeline
+    # This filter rewrites image tags with a max-width inline style and also wraps
+    # the image in an <a> tag that causes the full size image to be opened in a
+    # new tab.
+    #
+    # The max-width inline styles are especially useful in HTML email which
+    # don't use a global stylesheets.
+    class ImageMaxWidthFilter < Filter
+      def call
+        doc.search('img').each do |element|
+          # Skip if there's already a style attribute. Not sure how this
+          # would happen but we can reconsider it in the future.
+          next if element['style']
+          # Bail out if src doesn't look like a valid http url. trying to avoid weird
+          # js injection via javascript: urls.
+          next if element['src'].to_s.strip =~ /\Ajavascript/i
+          element['style'] = "max-width:100%;"
+          if !has_ancestor?(element, %w(a))
+            link_image element
+          end
+        end
+        doc
+      end
+      def link_image(element)
+        link = doc.document.create_element('a', :href => element['src'], :target => '_blank')
+        link.add_child(element.dup)
+        element.replace(link)
+      end
+    end
+  end
+end

data/lib/html/pipeline/markdown_filter.rb ADDED

@@ -0,0 +1,29 @@
+require 'github/markdown'
+module HTML
+  class Pipeline
+    # HTML Filter that converts Markdown text into HTML and converts into a
+    # DocumentFragment. This is different from most filters in that it can take a
+    # non-HTML as input. It must be used as the first filter in a pipeline.
+    #
+    # Context options:
+    #   :gfm      => false    Disable GFM line-end processing
+    #
+    # This filter does not write any additional information to the context hash.
+    class MarkdownFilter < TextFilter
+      def initialize(text, context = nil, result = nil)
+        super text, context, result
+        @text = @text.gsub "\r", ''
+      end
+      # Convert Markdown to HTML using the best available implementation
+      # and convert into a DocumentFragment.
+      def call
+        mode = (context[:gfm] != false) ? :gfm : :markdown
+        html = GitHub::Markdown.to_html(@text, mode)
+        html.rstrip!
+        html
+      end
+    end
+  end
+end

data/lib/html/pipeline/plain_text_input_filter.rb ADDED

@@ -0,0 +1,11 @@
+module HTML
+  class Pipeline
+    # Simple filter for plain text input. HTML escapes the text input and wraps it
+    # in a div.
+    class PlainTextInputFilter < TextFilter
+      def call
+        "<div>#{EscapeUtils.escape_html(@text, false)}</div>"
+      end
+    end
+  end
+end

data/lib/html/pipeline/sanitization_filter.rb ADDED

@@ -0,0 +1,105 @@
+require 'sanitize'
+module HTML
+  class Pipeline
+    # HTML filter with sanization routines and whitelists. This module defines
+    # what HTML is allowed in user provided content and fixes up issues with
+    # unbalanced tags and whatnot.
+    #
+    # See the Sanitize docs for more information on the underlying library:
+    #
+    # https://github.com/rgrove/sanitize/#readme
+    #
+    # Context options:
+    #   :whitelist - The sanitizer whitelist configuration to use. This can be one
+    #                of the options constants defined in this class or a custom
+    #                sanitize options hash.
+    #
+    # This filter does not write additional information to the context.
+    class SanitizationFilter < Filter
+      LISTS     = Set.new(%w(ul ol).freeze)
+      LIST_ITEM = 'li'.freeze
+      # List of table child elements. These must be contained by a <table> element
+      # or they are not allowed through. Otherwise they can be used to break out
+      # of places we're using tables to contain formatted user content (like pull
+      # request review comments).
+      TABLE_ITEMS = Set.new(%w(tr td th).freeze)
+      TABLE       = 'table'.freeze
+      # The main sanitization whitelist. Only these elements and attributes are
+      # allowed through by default.
+      WHITELIST = {
+        :elements => %w(
+          h1 h2 h3 h4 h5 h6 h7 h8 br b i strong em a pre code img tt
+          div ins del sup sub p ol ul table blockquote dl dt dd
+          kbd q samp var hr ruby rt rp li tr td th
+        ),
+        :attributes => {
+          'a' => ['href'],
+          'img' => ['src'],
+          'div' => ['itemscope', 'itemtype'],
+          :all  => ['abbr', 'accept', 'accept-charset',
+                    'accesskey', 'action', 'align', 'alt', 'axis',
+                    'border', 'cellpadding', 'cellspacing', 'char',
+                    'charoff', 'charset', 'checked', 'cite',
+                    'clear', 'cols', 'colspan', 'color',
+                    'compact', 'coords', 'datetime', 'dir',
+                    'disabled', 'enctype', 'for', 'frame',
+                    'headers', 'height', 'hreflang',
+                    'hspace', 'ismap', 'label', 'lang',
+                    'longdesc', 'maxlength', 'media', 'method',
+                    'multiple', 'name', 'nohref', 'noshade',
+                    'nowrap', 'prompt', 'readonly', 'rel', 'rev',
+                    'rows', 'rowspan', 'rules', 'scope',
+                    'selected', 'shape', 'size', 'span',
+                    'start', 'summary', 'tabindex', 'target',
+                    'title', 'type', 'usemap', 'valign', 'value',
+                    'vspace', 'width', 'itemprop']
+        },
+        :protocols => {
+          'a'   => {'href' => ['http', 'https', 'mailto', :relative, 'github-windows', 'github-mac']},
+          'img' => {'src'  => ['http', 'https', :relative]}
+        },
+        :transformers => [
+          # Top-level <li> elements are removed because they can break out of
+          # containing markup.
+          lambda { |env|
+            name, node = env[:node_name], env[:node]
+            if name == LIST_ITEM && !node.ancestors.any?{ |n| LISTS.include?(n.name) }
+              node.replace(node.children)
+            end
+          },
+          # Table child elements that are not contained by a <table> are removed.
+          lambda { |env|
+            name, node = env[:node_name], env[:node]
+            if TABLE_ITEMS.include?(name) && !node.ancestors.any? { |n| n.name == TABLE }
+              node.replace(node.children)
+            end
+          }
+        ]
+      }
+      # A more limited sanitization whitelist. This includes all attributes,
+      # protocols, and transformers from WHITELIST but with a more locked down
+      # set of allowed elements.
+      LIMITED = WHITELIST.merge(
+        :elements => %w(b i strong em a pre code img ins del sup sub p ol ul li))
+      # Strip all HTML tags from the document.
+      FULL = { :elements => [] }
+      # Sanitize markup using the Sanitize library.
+      def call
+        Sanitize.clean_node!(doc, whitelist)
+      end
+      # The whitelist to use when sanitizing. This can be passed in the context
+      # hash to the filter but defaults to WHITELIST constant value above.
+      def whitelist
+        context[:whitelist] || WHITELIST
+      end
+    end
+  end
+end

data/lib/html/pipeline/syntax_highlight_filter.rb ADDED

@@ -0,0 +1,29 @@
+require 'linguist'
+module HTML
+  class Pipeline
+    # HTML Filter that syntax highlights code blocks wrapped
+    # in <pre lang="...">.
+    class SyntaxHighlightFilter < Filter
+      def call
+        doc.search('pre').each do |node|
+          next unless lang = node['lang']
+          next unless lexer = Pygments::Lexer[lang]
+          text = node.inner_text
+          html = highlight_with_timeout_handling(lexer, text)
+          next if html.nil?
+          node.replace(html)
+        end
+        doc
+      end
+      def highlight_with_timeout_handling(lexer, text)
+        lexer.highlight(text)
+      rescue Timeout::Error => boom
+        nil
+      end
+    end
+  end
+end

data/lib/html/pipeline/text_filter.rb ADDED

@@ -0,0 +1,14 @@
+module HTML
+  class Pipeline
+    class TextFilter < Filter
+      attr_reader :text
+      def initialize(text, context = nil, result = nil)
+        raise TypeError, "text cannot be HTML" if text.is_a?(DocumentFragment)
+        # Ensure that this is always a string
+        @text = text.respond_to?(:to_str) ? text.to_str : text.to_s
+        super nil, context, result
+      end
+    end
+  end
+end

data/lib/html/pipeline/textile_filter.rb ADDED

@@ -0,0 +1,21 @@
+module HTML
+  class Pipeline
+    # HTML Filter that converts Textile text into HTML and converts into a
+    # DocumentFragment. This is different from most filters in that it can take a
+    # non-HTML as input. It must be used as the first filter in a pipeline.
+    #
+    # Context options:
+    #   :autolink => false    Disable autolinking URLs
+    #
+    # This filter does not write any additional information to the context hash.
+    #
+    # NOTE This filter is provided for really old comments only. It probably
+    # shouldn't be used for anything new.
+    class TextileFilter < TextFilter
+      # Convert Textile to HTML and convert into a DocumentFragment.
+      def call
+        RedCloth.new(@text).to_html
+      end
+    end
+  end
+end

data/lib/html/pipeline/toc_filter.rb ADDED

@@ -0,0 +1,28 @@
+module HTML
+  class Pipeline
+    # HTML filter that adds a 'name' attribute to all headers
+    # in a document, so they can be accessed from a table of contents
+    #
+    # TODO: besides adding the name attribute, we should get around to
+    # eventually generating the Table of Contents itself, with links
+    # to each header
+    class TableOfContentsFilter < Filter
+      def call
+        headers = Hash.new(0)
+        doc.css('h1, h2, h3, h4, h5, h6').each do |node|
+          name = node.text.downcase
+          name.gsub!(/[^\w\- ]/, '') # remove punctuation
+          name.gsub!(' ', '-') # replace spaces with dash
+          name = EscapeUtils.escape_uri(name) # escape extended UTF-8 chars
+          uniq = (headers[name] > 0) ? "-#{headers[name]}" : ''
+          headers[name] += 1
+          if header_content = node.children.first
+            header_content.add_previous_sibling(%Q{<a name="#{name}#{uniq}" class="anchor" href="##{name}#{uniq}"><span class="mini-icon mini-icon-link"></span></a>})
+          end
+        end
+        doc
+      end
+    end
+  end
+end

data/lib/html/pipeline/version.rb ADDED

@@ -0,0 +1,5 @@
+module HTML
+  class Pipeline
+    VERSION = "0.0.6" # version string for this release of the library
+  end
+end

data/test/html/pipeline/autolink_filter_test.rb ADDED

@@ -0,0 +1,22 @@
+require "test_helper"
+AutolinkFilter = HTML::Pipeline::AutolinkFilter
+class HTML::Pipeline::AutolinkFilterTest < Test::Unit::TestCase
+  def test_uses_rinku_for_autolinking
+    # just try to parse a complicated piece of HTML
+    # that Rails auto_link cannot handle
+    assert_equal '<p>"<a href="http://www.github.com">http://www.github.com</a>"</p>',
+      AutolinkFilter.to_html('<p>"http://www.github.com"</p>')
+  end
+  def test_autolink_option
+    assert_equal '<p>"http://www.github.com"</p>',
+      AutolinkFilter.to_html('<p>"http://www.github.com"</p>', :autolink => false)
+  end
+  def test_autolink_flags
+    assert_equal '<p>"<a href="http://github">http://github</a>"</p>',
+      AutolinkFilter.to_html('<p>"http://github"</p>', :flags => Rinku::AUTOLINK_SHORT_DOMAINS)
+  end
+end

data/test/html/pipeline/camo_filter_test.rb ADDED

@@ -0,0 +1,47 @@
+require "test_helper"
+class HTML::Pipeline::CamoFilterTest < Test::Unit::TestCase
+  CamoFilter = HTML::Pipeline::CamoFilter
+  def setup
+    @asset_proxy_url        = 'https//assets.example.org'
+    @asset_proxy_secret_key = 'ssssh-secret'
+    @options = {
+      :asset_proxy            => @asset_proxy_url,
+      :asset_proxy_secret_key => @asset_proxy_secret_key
+    }
+  end
+  def test_camouflaging_http_image_urls
+    orig = %(<p><img src="http://twitter.com/img.png"></p>)
+    assert_includes 'img src="' + @asset_proxy_url,
+      CamoFilter.call(orig, @options).to_s
+  end
+  def test_rewrites_dotcom_image_urls
+    orig = %(<p><img src="http://github.com/img.png"></p>)
+    assert_equal "<p><img src=\"https://github.com/img.png\"></p>",
+      CamoFilter.call(orig, @options).to_s
+  end
+  def test_not_camouflaging_https_image_urls
+    orig = %(<p><img src="https://foo.com/img.png"></p>)
+    assert_doesnt_include 'img src="' + @asset_proxy_url,
+      CamoFilter.call(orig, @options).to_s
+  end
+  def test_handling_images_with_no_src_attribute
+    orig = %(<p><img></p>)
+    assert_nothing_raised do
+      CamoFilter.call(orig, @options).to_s
+    end
+  end
+  def test_required_context_validation
+    exception = assert_raise(ArgumentError) {
+      CamoFilter.call("", {})
+    }
+    assert_match /:asset_proxy[^_]/, exception.message
+    assert_match /:asset_proxy_secret_key/, exception.message
+  end
+end

data/test/html/pipeline/emoji_filter_test.rb ADDED

@@ -0,0 +1,18 @@
+require 'test_helper'
+class HTML::Pipeline::EmojiFilterTest < Test::Unit::TestCase
+  EmojiFilter = HTML::Pipeline::EmojiFilter
+  def test_emojify
+    filter = EmojiFilter.new("<p>:shipit:</p>", {:asset_root => 'https://foo.com'})
+    doc = filter.call
+    assert_match "https://foo.com/emoji/shipit.png", doc.search('img').attr('src').value
+  end
+  def test_required_context_validation
+    exception = assert_raise(ArgumentError) {
+      EmojiFilter.call("", {})
+    }
+    assert_match /:asset_root/, exception.message
+  end
+end

data/test/html/pipeline/image_max_width_filter_test.rb ADDED

@@ -0,0 +1,50 @@
+require "test_helper"
+class HTML::Pipeline::ImageMaxWidthFilterTest < Test::Unit::TestCase
+  def filter(html)
+    HTML::Pipeline::ImageMaxWidthFilter.call(html)
+  end
+  def test_rewrites_image_style_tags
+    body = "<p>Screenshot: <img src='screenshot.png'></p>"
+    doc  = Nokogiri::HTML::DocumentFragment.parse(body)
+    res = filter(doc)
+    assert_equal_html %q(<p>Screenshot: <a target="_blank" href="screenshot.png"><img src="screenshot.png" style="max-width:100%;"></a></p>),
+      res.to_html
+  end
+  def test_leaves_existing_image_style_tags_alone
+    body = "<p><img src='screenshot.png' style='width:100px;'></p>"
+    doc  = Nokogiri::HTML::DocumentFragment.parse(body)
+    res = filter(doc)
+    assert_equal_html '<p><img src="screenshot.png" style="width:100px;"></p>',
+      res.to_html
+  end
+  def test_links_to_image
+    body = "<p>Screenshot: <img src='screenshot.png'></p>"
+    doc  = Nokogiri::HTML::DocumentFragment.parse(body)
+    res = filter(doc)
+    assert_equal_html '<p>Screenshot: <a target="_blank" href="screenshot.png"><img src="screenshot.png" style="max-width:100%;"></a></p>',
+      res.to_html
+  end
+  def test_doesnt_link_to_image_when_already_linked
+    body = "<p>Screenshot: <a href='blah.png'><img src='screenshot.png'></a></p>"
+    doc  = Nokogiri::HTML::DocumentFragment.parse(body)
+    res = filter(doc)
+    assert_equal_html %q(<p>Screenshot: <a href="blah.png"><img src="screenshot.png" style="max-width:100%;"></a></p>),
+      res.to_html
+  end
+  def test_doesnt_screw_up_inlined_images
+    body = "<p>Screenshot <img src='screenshot.png'>, yes, this is a <b>screenshot</b> indeed.</p>"
+    doc  = Nokogiri::HTML::DocumentFragment.parse(body)
+    assert_equal_html %q(<p>Screenshot <a target="_blank" href="screenshot.png"><img src="screenshot.png" style="max-width:100%;"></a>, yes, this is a <b>screenshot</b> indeed.</p>), filter(doc).to_html
+  end
+end