RubyGems - html-pipeline - Versions diffs - 0.0.4 - Mend

html-pipeline 0.0.4

Files changed (36) hide show

data/.gitignore +19 -0
data/.travis.yml +13 -0
data/Gemfile +9 -0
data/LICENSE +22 -0
data/README.md +128 -0
data/Rakefile +11 -0
data/html-pipeline.gemspec +25 -0
data/lib/html/pipeline.rb +130 -0
data/lib/html/pipeline/@mention_filter.rb +118 -0
data/lib/html/pipeline/autolink_filter.rb +22 -0
data/lib/html/pipeline/body_content.rb +42 -0
data/lib/html/pipeline/camo_filter.rb +64 -0
data/lib/html/pipeline/email_reply_filter.rb +56 -0
data/lib/html/pipeline/emoji_filter.rb +48 -0
data/lib/html/pipeline/filter.rb +158 -0
data/lib/html/pipeline/https_filter.rb +13 -0
data/lib/html/pipeline/image_max_width_filter.rb +37 -0
data/lib/html/pipeline/markdown_filter.rb +29 -0
data/lib/html/pipeline/plain_text_input_filter.rb +11 -0
data/lib/html/pipeline/sanitization_filter.rb +107 -0
data/lib/html/pipeline/syntax_highlight_filter.rb +29 -0
data/lib/html/pipeline/text_filter.rb +14 -0
data/lib/html/pipeline/textile_filter.rb +21 -0
data/lib/html/pipeline/toc_filter.rb +28 -0
data/lib/html/pipeline/version.rb +5 -0
data/test/html/pipeline/autolink_filter_test.rb +22 -0
data/test/html/pipeline/camo_filter_test.rb +39 -0
data/test/html/pipeline/emoji_filter_test.rb +16 -0
data/test/html/pipeline/image_max_width_filter_test.rb +50 -0
data/test/html/pipeline/markdown_filter_test.rb +101 -0
data/test/html/pipeline/mention_filter_test.rb +158 -0
data/test/html/pipeline/plain_text_input_filter_test.rb +22 -0
data/test/html/pipeline/sanitization_filter_test.rb +47 -0
data/test/html/pipeline/toc_filter_test.rb +47 -0
data/test/test_helper.rb +38 -0
metadata +221 -0

data/lib/html/pipeline/markdown_filter.rb ADDED Viewed

@@ -0,0 +1,29 @@
+require 'github/markdown'
+module HTML
+  class Pipeline
+    # HTML Filter that converts Markdown text into HTML and converts into a
+    # DocumentFragment. This is different from most filters in that it can take a
+    # non-HTML as input. It must be used as the first filter in a pipeline.
+    #
+    # Context options:
+    #   :gfm      => false    Disable GFM line-end processing
+    #
+    # This filter does not write any additional information to the context hash.
+    class MarkdownFilter < TextFilter
+      def initialize(text, context = nil, result = nil)
+        super text, context, result
+        @text.gsub! "\r", ''
+      end
+      # Convert Markdown to HTML using the best available implementation
+      # and convert into a DocumentFragment.
+      def call
+        mode = (context[:gfm] != false) ? :gfm : :markdown
+        html = GitHub::Markdown.to_html(@text, mode)
+        html.rstrip!
+        html
+      end
+    end
+  end
+end

data/lib/html/pipeline/plain_text_input_filter.rb ADDED Viewed

@@ -0,0 +1,11 @@
+module HTML
+  class Pipeline
+    # Simple filter for plain text input. HTML escapes the text input and wraps it
+    # in a div.
+    class PlainTextInputFilter < TextFilter
+      def call
+        "<div>#{EscapeUtils.escape_html(@text, false)}</div>"
+      end
+    end
+  end
+end

data/lib/html/pipeline/sanitization_filter.rb ADDED Viewed

@@ -0,0 +1,107 @@
+require 'sanitize'
+module HTML
+  class Pipeline
+    # HTML filter with sanitization routines and whitelists. This module defines
+    # what HTML is allowed in user provided content and fixes up issues with
+    # unbalanced tags and whatnot.
+    #
+    # See the Sanitize docs for more information on the underlying library:
+    #
+    # https://github.com/rgrove/sanitize/#readme
+    #
+    # Context options:
+    #   :whitelist - The sanitizer whitelist configuration to use. This can be one
+    #                of the options constants defined in this class or a custom
+    #                sanitize options hash.
+    #
+    # This filter does not write additional information to the context.
+    class SanitizationFilter < Filter
+      # List container element names and the list item element name, used by
+      # the first transformer in WHITELIST below.
+      LISTS     = Set.new(%w(ul ol).freeze)
+      LIST_ITEM = 'li'.freeze
+      # List of table child elements. These must be contained by a <table> element
+      # or they are not allowed through. Otherwise they can be used to break out
+      # of places we're using tables to contain formatted user content (like pull
+      # request review comments).
+      TABLE_ITEMS = Set.new(%w(tr td th).freeze)
+      TABLE       = 'table'.freeze
+      # The main sanitization whitelist. Only these elements and attributes are
+      # allowed through by default.
+      #
+      # NOTE(review): h7 and h8 are not standard HTML heading elements (HTML
+      # defines h1-h6) - confirm they are listed intentionally.
+      WHITELIST = {
+        :elements => %w(
+          h1 h2 h3 h4 h5 h6 h7 h8 br b i strong em a pre code img tt
+          div ins del sup sub p ol ul table blockquote dl dt dd
+          kbd q samp var hr ruby rt rp
+        ),
+        :attributes => {
+          'a' => ['href'],
+          'img' => ['src'],
+          'div' => ['itemscope', 'itemtype'],
+          :all  => ['abbr', 'accept', 'accept-charset',
+                    'accesskey', 'action', 'align', 'alt', 'axis',
+                    'border', 'cellpadding', 'cellspacing', 'char',
+                    'charoff', 'charset', 'checked', 'cite',
+                    'clear', 'cols', 'colspan', 'color',
+                    'compact', 'coords', 'datetime', 'dir',
+                    'disabled', 'enctype', 'for', 'frame',
+                    'headers', 'height', 'hreflang',
+                    'hspace', 'ismap', 'label', 'lang',
+                    'longdesc', 'maxlength', 'media', 'method',
+                    'multiple', 'name', 'nohref', 'noshade',
+                    'nowrap', 'prompt', 'readonly', 'rel', 'rev',
+                    'rows', 'rowspan', 'rules', 'scope',
+                    'selected', 'shape', 'size', 'span',
+                    'start', 'summary', 'tabindex', 'target',
+                    'title', 'type', 'usemap', 'valign', 'value',
+                    'vspace', 'width', 'itemprop']
+        },
+        :protocols => {
+          'a'   => {'href' => ['http', 'https', 'mailto', :relative, 'github-windows', 'github-mac']},
+          'img' => {'src'  => ['http', 'https', :relative]}
+        },
+        :transformers => [
+          # whitelist only <li> elements that are descended from a <ul> or <ol>.
+          # top-level <li> elements are removed because they can break out of
+          # containing markup.
+          # The lambda yields nil (no whitelisting) when the condition is false.
+          lambda { |env|
+            name, node = env[:node_name], env[:node]
+            if name == LIST_ITEM && node.ancestors.any?{ |n| LISTS.include?(n.name) }
+              {:node_whitelist => [node]}
+            end
+          },
+          # Whitelist only table child elements that are descended from a <table>.
+          # Table child elements that are not contained by a <table> are removed.
+          lambda { |env|
+            name, node = env[:node_name], env[:node]
+            if TABLE_ITEMS.include?(name) && node.ancestors.any? { |n| n.name == TABLE }
+              { :node_whitelist => [node] }
+            end
+          }
+        ]
+      }
+      # A more limited sanitization whitelist. This includes all attributes,
+      # protocols, and transformers from WHITELIST but with a more locked down
+      # set of allowed elements.
+      #
+      # NOTE(review): unlike WHITELIST, this element list names 'li' directly,
+      # so <li> presumably passes here regardless of the list-ancestor
+      # transformer - confirm that is intended.
+      LIMITED = WHITELIST.merge(
+        :elements => %w(b i strong em a pre code img ins del sup sub p ol ul li))
+      # Strip all HTML tags from the document.
+      FULL = { :elements => [] }
+      # Sanitize markup using the Sanitize library.
+      #
+      # Passes doc and the active whitelist to Sanitize.clean_node! and returns
+      # whatever that call returns.
+      def call
+        Sanitize.clean_node!(doc, whitelist)
+      end
+      # The whitelist to use when sanitizing. This can be passed in the context
+      # hash to the filter but defaults to WHITELIST constant value above.
+      #
+      # Returns context[:whitelist] when it is truthy, otherwise WHITELIST.
+      def whitelist
+        context[:whitelist] || WHITELIST
+      end
+    end
+  end
+end

data/lib/html/pipeline/syntax_highlight_filter.rb ADDED Viewed

@@ -0,0 +1,29 @@
+require 'linguist'
+module HTML
+  class Pipeline
+    # HTML Filter that syntax highlights code blocks wrapped
+    # in <pre lang="...">.
+    class SyntaxHighlightFilter < Filter
+      def call
+        doc.search('pre').each do |node|
+          next unless lang = node['lang']
+          next unless lexer = Pygments::Lexer[lang]
+          text = node.inner_text
+          html = highlight_with_timeout_handling(lexer, text)
+          next if html.nil?
+          node.replace(html)
+        end
+        doc
+      end
+      def highlight_with_timeout_handling(lexer, text)
+        lexer.highlight(text)
+      rescue Timeout::Error => boom
+        nil
+      end
+    end
+  end
+end

data/lib/html/pipeline/text_filter.rb ADDED Viewed

@@ -0,0 +1,14 @@
+module HTML
+  class Pipeline
+    class TextFilter < Filter
+      attr_reader :text
+      def initialize(text, context = nil, result = nil)
+        raise TypeError, "text cannot be HTML" if text.is_a?(DocumentFragment)
+        # Ensure that this is always a string
+        @text = text.respond_to?(:to_str) ? text.to_str : text.to_s
+        super nil, context, result
+      end
+    end
+  end
+end

data/lib/html/pipeline/textile_filter.rb ADDED Viewed

@@ -0,0 +1,21 @@
+module HTML
+  class Pipeline
+    # HTML Filter that converts Textile text into HTML and converts into a
+    # DocumentFragment. This is different from most filters in that it can take a
+    # non-HTML as input. It must be used as the first filter in a pipeline.
+    #
+    # Context options:
+    #   :autolink => false    Disable autolinking URLs
+    #
+    # This filter does not write any additional information to the context hash.
+    #
+    # NOTE This filter is provided for really old comments only. It probably
+    # shouldn't be used for anything new.
+    class TextileFilter < TextFilter
+      # Convert Textile to HTML and convert into a DocumentFragment.
+      def call
+        RedCloth.new(@text).to_html
+      end
+    end
+  end
+end

data/lib/html/pipeline/toc_filter.rb ADDED Viewed

@@ -0,0 +1,28 @@
+module HTML
+  class Pipeline
+    # HTML filter that adds a 'name' attribute to all headers
+    # in a document, so they can be accessed from a table of contents
+    #
+    # TODO: besides adding the name attribute, we should get around to
+    # eventually generating the Table of Contents itself, with links
+    # to each header
+    class TableOfContentsFilter < Filter
+      def call
+        headers = Hash.new(0)
+        doc.css('h1, h2, h3, h4, h5, h6').each do |node|
+          name = node.text.downcase
+          name.gsub!(/[^\w\- ]/, '') # remove punctuation
+          name.gsub!(' ', '-') # replace spaces with dash
+          name = EscapeUtils.escape_uri(name) # escape extended UTF-8 chars
+          uniq = (headers[name] > 0) ? "-#{headers[name]}" : ''
+          headers[name] += 1
+          if header_content = node.children.first
+            header_content.add_previous_sibling(%Q{<a name="#{name}#{uniq}" class="anchor" href="##{name}#{uniq}"><span class="mini-icon mini-icon-link"></span></a>})
+          end
+        end
+        doc
+      end
+    end
+  end
+end

data/lib/html/pipeline/version.rb ADDED Viewed

@@ -0,0 +1,5 @@
+module HTML
+  class Pipeline
+    VERSION = "0.0.4"
+  end
+end

data/test/html/pipeline/autolink_filter_test.rb ADDED Viewed

@@ -0,0 +1,22 @@
+require "test_helper"
+AutolinkFilter = HTML::Pipeline::AutolinkFilter
+class HTML::Pipeline::AutolinkFilterTest < Test::Unit::TestCase
+  def test_uses_rinku_for_autolinking
+    # just try to parse a complicated piece of HTML
+    # that Rails auto_link cannot handle
+    assert_equal '<p>"<a href="http://www.github.com">http://www.github.com</a>"</p>',
+      AutolinkFilter.to_html('<p>"http://www.github.com"</p>')
+  end
+  def test_autolink_option
+    assert_equal '<p>"http://www.github.com"</p>',
+      AutolinkFilter.to_html('<p>"http://www.github.com"</p>', :autolink => false)
+  end
+  def test_autolink_flags
+    assert_equal '<p>"<a href="http://github">http://github</a>"</p>',
+      AutolinkFilter.to_html('<p>"http://github"</p>', :flags => Rinku::AUTOLINK_SHORT_DOMAINS)
+  end
+end

data/test/html/pipeline/camo_filter_test.rb ADDED Viewed

@@ -0,0 +1,39 @@
+require "test_helper"
+class HTML::Pipeline::CamoFilterTest < Test::Unit::TestCase
+  CamoFilter = HTML::Pipeline::CamoFilter
+  def setup
+    @asset_proxy_url        = 'https//assets.example.org'
+    @asset_proxy_secret_key = 'ssssh-secret'
+    @options = {
+      :asset_proxy            => @asset_proxy_url,
+      :asset_proxy_secret_key => @asset_proxy_secret_key
+    }
+  end
+  def test_camouflaging_http_image_urls
+    orig = %(<p><img src="http://twitter.com/img.png"></p>)
+    assert_includes 'img src="' + @asset_proxy_url,
+      CamoFilter.call(orig, @options).to_s
+  end
+  def test_rewrites_dotcom_image_urls
+    orig = %(<p><img src="http://github.com/img.png"></p>)
+    assert_equal "<p><img src=\"https://github.com/img.png\"></p>",
+      CamoFilter.call(orig, @options).to_s
+  end
+  def test_not_camouflaging_https_image_urls
+    orig = %(<p><img src="https://foo.com/img.png"></p>)
+    assert_doesnt_include 'img src="' + @asset_proxy_url,
+      CamoFilter.call(orig, @options).to_s
+  end
+  def test_handling_images_with_no_src_attribute
+    orig = %(<p><img></p>)
+    assert_nothing_raised do
+      CamoFilter.call(orig, @options).to_s
+    end
+  end
+end

data/test/html/pipeline/emoji_filter_test.rb ADDED Viewed

@@ -0,0 +1,16 @@
+require 'test_helper'
+class HTML::Pipeline::EmojiFilterTest < Test::Unit::TestCase
+  def test_emojify
+    filter = HTML::Pipeline::EmojiFilter.new("<p>:shipit:</p>", {:asset_root => 'https://foo.com'})
+    doc = filter.call
+    assert_match "https://foo.com/emoji/shipit.png", doc.search('img').attr('src').value
+  end
+  def test_missing_context
+    filter = HTML::Pipeline::EmojiFilter.new("<p>:shipit:</p>", {})
+    assert_raises ArgumentError do
+      filter.call
+    end
+  end
+end

data/test/html/pipeline/image_max_width_filter_test.rb ADDED Viewed

@@ -0,0 +1,50 @@
+require "test_helper"
+class HTML::Pipeline::ImageMaxWidthFilterTest < Test::Unit::TestCase
+  def filter(html)
+    HTML::Pipeline::ImageMaxWidthFilter.call(html)
+  end
+  def test_rewrites_image_style_tags
+    body = "<p>Screenshot: <img src='screenshot.png'></p>"
+    doc  = Nokogiri::HTML::DocumentFragment.parse(body)
+    res = filter(doc)
+    assert_equal_html %q(<p>Screenshot: <a target="_blank" href="screenshot.png"><img src="screenshot.png" style="max-width:100%;"></a></p>),
+      res.to_html
+  end
+  def test_leaves_existing_image_style_tags_alone
+    body = "<p><img src='screenshot.png' style='width:100px;'></p>"
+    doc  = Nokogiri::HTML::DocumentFragment.parse(body)
+    res = filter(doc)
+    assert_equal_html '<p><img src="screenshot.png" style="width:100px;"></p>',
+      res.to_html
+  end
+  def test_links_to_image
+    body = "<p>Screenshot: <img src='screenshot.png'></p>"
+    doc  = Nokogiri::HTML::DocumentFragment.parse(body)
+    res = filter(doc)
+    assert_equal_html '<p>Screenshot: <a target="_blank" href="screenshot.png"><img src="screenshot.png" style="max-width:100%;"></a></p>',
+      res.to_html
+  end
+  def test_doesnt_link_to_image_when_already_linked
+    body = "<p>Screenshot: <a href='blah.png'><img src='screenshot.png'></a></p>"
+    doc  = Nokogiri::HTML::DocumentFragment.parse(body)
+    res = filter(doc)
+    assert_equal_html %q(<p>Screenshot: <a href="blah.png"><img src="screenshot.png" style="max-width:100%;"></a></p>),
+      res.to_html
+  end
+  def test_doesnt_screw_up_inlined_images
+    body = "<p>Screenshot <img src='screenshot.png'>, yes, this is a <b>screenshot</b> indeed.</p>"
+    doc  = Nokogiri::HTML::DocumentFragment.parse(body)
+    assert_equal_html %q(<p>Screenshot <a target="_blank" href="screenshot.png"><img src="screenshot.png" style="max-width:100%;"></a>, yes, this is a <b>screenshot</b> indeed.</p>), filter(doc).to_html
+  end
+end

data/test/html/pipeline/markdown_filter_test.rb ADDED Viewed

@@ -0,0 +1,101 @@
+require "test_helper"
+MarkdownFilter = HTML::Pipeline::MarkdownFilter
+class HTML::Pipeline::MarkdownFilterTest < Test::Unit::TestCase
+  def setup
+    @haiku =
+      "Pointing at the moon\n" +
+      "Reminded of simple things\n" +
+      "Moments matter most"
+    @links =
+      "See http://example.org/ for more info"
+    @code =
+      "```\n" +
+      "def hello()" +
+      "  'world'" +
+      "end" +
+      "```"
+  end
+  def test_fails_when_given_a_documentfragment
+    body = "<p>heyo</p>"
+    doc  = HTML::Pipeline.parse(body)
+    assert_raise(TypeError) { MarkdownFilter.call(doc, {}) }
+  end
+  def test_gfm_enabled_by_default
+    doc = MarkdownFilter.to_document(@haiku, {})
+    assert doc.kind_of?(HTML::Pipeline::DocumentFragment)
+    assert_equal 2, doc.search('br').size
+  end
+  def test_disabling_gfm
+    doc = MarkdownFilter.to_document(@haiku, :gfm => false)
+    assert doc.kind_of?(HTML::Pipeline::DocumentFragment)
+    assert_equal 0, doc.search('br').size
+  end
+  def test_fenced_code_blocks
+    doc = MarkdownFilter.to_document(@code)
+    assert doc.kind_of?(HTML::Pipeline::DocumentFragment)
+    assert_equal 1, doc.search('pre').size
+  end
+  def test_fenced_code_blocks_with_language
+    doc = MarkdownFilter.to_document(@code.sub("```", "``` ruby"))
+    assert doc.kind_of?(HTML::Pipeline::DocumentFragment)
+    assert_equal 1, doc.search('pre').size
+    assert_equal 'ruby', doc.search('pre').first['lang']
+  end
+end
+class GFMTest < Test::Unit::TestCase
+  def gfm(text)
+    MarkdownFilter.call(text, :gfm => true)
+  end
+  def test_not_touch_single_underscores_inside_words
+    assert_equal "<p>foo_bar</p>",
+                 gfm("foo_bar")
+  end
+  def test_not_touch_underscores_in_code_blocks
+    assert_equal "<pre><code>foo_bar_baz\n</code></pre>",
+                 gfm("    foo_bar_baz")
+  end
+  def test_not_touch_underscores_in_pre_blocks
+    assert_equal "<pre>\nfoo_bar_baz\n</pre>",
+                 gfm("<pre>\nfoo_bar_baz\n</pre>")
+  end
+  def test_not_touch_two_or_more_underscores_inside_words
+    assert_equal "<p>foo_bar_baz</p>",
+                 gfm("foo_bar_baz")
+  end
+  def test_turn_newlines_into_br_tags_in_simple_cases
+    assert_equal "<p>foo<br>\nbar</p>",
+                 gfm("foo\nbar")
+  end
+  def test_convert_newlines_in_all_groups
+    assert_equal "<p>apple<br>\npear<br>\norange</p>\n\n" +
+                 "<p>ruby<br>\npython<br>\nerlang</p>",
+                 gfm("apple\npear\norange\n\nruby\npython\nerlang")
+  end
+  def test_convert_newlines_in_even_long_groups
+    assert_equal "<p>apple<br>\npear<br>\norange<br>\nbanana</p>\n\n" +
+                 "<p>ruby<br>\npython<br>\nerlang</p>",
+                 gfm("apple\npear\norange\nbanana\n\nruby\npython\nerlang")
+  end
+  def test_not_convert_newlines_in_lists
+    assert_equal "<h1>foo</h1>\n\n<h1>bar</h1>",
+                 gfm("# foo\n# bar")
+    assert_equal "<ul>\n<li>foo</li>\n<li>bar</li>\n</ul>",
+                 gfm("* foo\n* bar")
+  end
+end