RubyGems - html-pipeline-linuxfr - Versions diffs - 0.0.14 - Mend

html-pipeline-linuxfr 0.0.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

checksums.yaml +7 -0
data/.gitignore +19 -0
data/.travis.yml +13 -0
data/CHANGELOG.md +51 -0
data/Gemfile +9 -0
data/LICENSE +22 -0
data/README.md +294 -0
data/Rakefile +11 -0
data/bin/html-pipeline +80 -0
data/html-pipeline-linuxfr.gemspec +24 -0
data/lib/html/pipeline.rb +167 -0
data/lib/html/pipeline/custom_links_filter.rb +47 -0
data/lib/html/pipeline/filter.rb +166 -0
data/lib/html/pipeline/linuxfr.rb +25 -0
data/lib/html/pipeline/markdown_filter.rb +76 -0
data/lib/html/pipeline/relative_links_filter.rb +18 -0
data/lib/html/pipeline/sanitization_filter.rb +108 -0
data/lib/html/pipeline/syntax_highlight_filter.rb +31 -0
data/lib/html/pipeline/text_filter.rb +14 -0
data/lib/html/pipeline/toc_filter.rb +61 -0
data/lib/html/pipeline/version.rb +5 -0
data/test/helpers/mocked_instrumentation_service.rb +17 -0
data/test/html/pipeline/absolute_source_filter_test.rb +56 -0
data/test/html/pipeline/camo_filter_test.rb +47 -0
data/test/html/pipeline/image_max_width_filter_test.rb +50 -0
data/test/html/pipeline/markdown_filter_test.rb +101 -0
data/test/html/pipeline/mention_filter_test.rb +156 -0
data/test/html/pipeline/plain_text_input_filter_test.rb +22 -0
data/test/html/pipeline/sanitization_filter_test.rb +52 -0
data/test/html/pipeline/toc_filter_test.rb +47 -0
data/test/html/pipeline_test.rb +74 -0
data/test/test_helper.rb +38 -0
metadata +175 -0

data/lib/html/pipeline/relative_links_filter.rb ADDED Viewed

@@ -0,0 +1,18 @@
+module HTML
+  class Pipeline
+    # HTML Filter for replacing http and https urls with protocol relative versions.
+    class RelativeLinksFilter < Filter
+      def call
+        h = context[:host]
+        doc.css("a[href^=\"http://#{h}\"],a[href^=\"https://#{h}\"]").each do |element|
+          element['href'] = element['href'].sub(/^https?:/, '')
+        end
+        doc
+      end
+    end
+  end
+end

data/lib/html/pipeline/sanitization_filter.rb ADDED Viewed

@@ -0,0 +1,108 @@
+require 'sanitize'
+module HTML
+  class Pipeline
+    # HTML filter with sanitization routines and whitelists. This module defines
+    # what HTML is allowed in user provided content and fixes up issues with
+    # unbalanced tags and whatnot.
+    #
+    # See the Sanitize docs for more information on the underlying library:
+    #
+    # https://github.com/rgrove/sanitize/#readme
+    #
+    # Context options:
+    #   :whitelist - The sanitizer whitelist configuration to use. This can be one
+    #                of the options constants defined in this class or a custom
+    #                sanitize options hash.
+    #
+    # This filter does not write additional information to the context.
+    class SanitizationFilter < Filter
+      # List container elements. An <li> is only kept when one of its ancestors
+      # is one of these (see the first transformer below).
+      LISTS     = Set.new(%w(ul ol).freeze)
+      LIST_ITEM = 'li'.freeze
+      # List of table child elements. These must be contained by a <table> element
+      # or they are not allowed through. Otherwise they can be used to break out
+      # of places we're using tables to contain formatted user content (like pull
+      # request review comments).
+      TABLE_ITEMS = Set.new(%w(tr td th).freeze)
+      TABLE       = 'table'.freeze
+      # The main sanitization whitelist. Only these elements and attributes are
+      # allowed through by default.
+      WHITELIST = {
+        # NOTE(review): h7 and h8 are listed here although HTML only defines
+        # h1-h6 - confirm whether they are intentional.
+        :elements => %w(
+          h1 h2 h3 h4 h5 h6 h7 h8 br b i strong em a pre code img tt
+          div ins del sup sub p ol ul table blockquote dl dt dd
+          kbd q samp var hr ruby rt rp li tr td th
+        ),
+        :remove_contents => ['script'],
+        # Per-element attribute lists, plus the :all list applied to every
+        # allowed element.
+        :attributes => {
+          'a' => ['href'],
+          'img' => ['src'],
+          'div' => ['itemscope', 'itemtype'],
+          :all  => ['abbr', 'accept', 'accept-charset',
+                    'accesskey', 'action', 'align', 'alt', 'axis',
+                    'border', 'cellpadding', 'cellspacing', 'char',
+                    'charoff', 'charset', 'checked', 'cite',
+                    'clear', 'cols', 'colspan', 'color',
+                    'compact', 'coords', 'datetime', 'dir',
+                    'disabled', 'enctype', 'for', 'frame',
+                    'headers', 'height', 'hreflang',
+                    'hspace', 'ismap', 'label', 'lang',
+                    'longdesc', 'maxlength', 'media', 'method',
+                    'multiple', 'name', 'nohref', 'noshade',
+                    'nowrap', 'prompt', 'readonly', 'rel', 'rev',
+                    'rows', 'rowspan', 'rules', 'scope',
+                    'selected', 'shape', 'size', 'span',
+                    'start', 'summary', 'tabindex', 'target',
+                    'title', 'type', 'usemap', 'valign', 'value',
+                    'vspace', 'width', 'itemprop']
+        },
+        # URL schemes accepted in link and image targets.
+        :protocols => {
+          'a'   => {'href' => ['http', 'https', 'mailto', :relative, 'github-windows', 'github-mac']},
+          'img' => {'src'  => ['http', 'https', :relative]}
+        },
+        :transformers => [
+          # Top-level <li> elements are removed because they can break out of
+          # containing markup. The element is replaced by its own children, so
+          # its content is kept.
+          lambda { |env|
+            name, node = env[:node_name], env[:node]
+            if name == LIST_ITEM && !node.ancestors.any?{ |n| LISTS.include?(n.name) }
+              node.replace(node.children)
+            end
+          },
+          # Table child elements that are not contained by a <table> are removed.
+          # As above, the element is replaced by its children.
+          lambda { |env|
+            name, node = env[:node_name], env[:node]
+            if TABLE_ITEMS.include?(name) && !node.ancestors.any? { |n| n.name == TABLE }
+              node.replace(node.children)
+            end
+          }
+        ]
+      }
+      # A more limited sanitization whitelist. This includes all attributes,
+      # protocols, and transformers from WHITELIST but with a more locked down
+      # set of allowed elements.
+      LIMITED = WHITELIST.merge(
+        :elements => %w(b i strong em a pre code img ins del sup sub p ol ul li))
+      # Strip all HTML tags from the document.
+      FULL = { :elements => [] }
+      # Sanitize markup using the Sanitize library. Passes the document and the
+      # active whitelist to Sanitize.clean_node! and returns its result.
+      def call
+        Sanitize.clean_node!(doc, whitelist)
+      end
+      # The whitelist to use when sanitizing. This can be passed in the context
+      # hash to the filter but defaults to WHITELIST constant value above.
+      def whitelist
+        context[:whitelist] || WHITELIST
+      end
+    end
+  end
+end

data/lib/html/pipeline/syntax_highlight_filter.rb ADDED Viewed

@@ -0,0 +1,31 @@
+require 'pygments'
+module HTML
+  class Pipeline
+    # HTML Filter that syntax highlights <code> elements whose class
+    # attribute names a language for which Pygments has a lexer.
+    class SyntaxHighlightFilter < Filter
+      # Highlights every eligible <code> element in place.
+      #
+      # Elements without a class attribute, with a class Pygments does not
+      # recognise, with no text, or whose highlighting timed out are left
+      # untouched.
+      #
+      # Returns the document.
+      def call
+        doc.search('code').each do |node|
+          next unless lang = node['class']
+          next unless lexer = Pygments::Lexer[lang]
+          text = node.inner_text
+          # An empty element has nothing to highlight (and no child node).
+          next if text.empty?
+          html = highlight_with_timeout_handling(lexer, text)
+          next if html.nil?
+          # Replace the element's whole content. Replacing only the first child
+          # would raise on an empty element and would leave the remaining
+          # children duplicated after the highlighted markup.
+          node.inner_html = html
+        end
+        doc
+      end
+      # Runs the lexer over text, asking for markup without a wrapper element.
+      #
+      # Returns the highlighted HTML, or nil when highlighting timed out.
+      def highlight_with_timeout_handling(lexer, text)
+        lexer.highlight(text, options: { nowrap: true })
+      rescue Timeout::Error
+        nil
+      end
+    end
+  end
+end

data/lib/html/pipeline/text_filter.rb ADDED Viewed

@@ -0,0 +1,14 @@
+module HTML
+  class Pipeline
+    class TextFilter < Filter
+      attr_reader :text
+      def initialize(text, context = nil, result = nil)
+        raise TypeError, "text cannot be HTML" if text.is_a?(DocumentFragment)
+        # Ensure that this is always a string
+        @text = text.respond_to?(:to_str) ? text.to_str : text.to_s
+        super nil, context, result
+      end
+    end
+  end
+end

data/lib/html/pipeline/toc_filter.rb ADDED Viewed

@@ -0,0 +1,61 @@
+module HTML
+  class Pipeline
+    # HTML filter that adds a 'name' attribute to all headers
+    # in a document, so they can be accessed from a table of contents
+    #
+    # Context options:
+    #   :toc_minimal_length (required) - Only add the table of contents to text with this number of characters
+    #   :toc_header (required) - Introduce the table of contents with this header
+    #
+    class TableOfContentsFilter < Filter
+      def call
+        headers = Hash.new 0
+        was = 2
+        toc = ""
+        doc.css('h1, h2, h3, h4, h5, h6').each do |node|
+          level = node.name.scan(/\d/).first.to_i
+          name = node.text.downcase
+          name.gsub!(/[^\w\- ]/, '') # remove punctuation
+          name.gsub!(' ', '-') # replace spaces with dash
+          name = EscapeUtils.escape_uri(name) # escape extended UTF-8 chars
+          uniq = (headers[name] > 0) ? "-#{headers[name]}" : ''
+          headers[name] += 1
+          node['id'] = "#{name}#{uniq}"
+          while was > level
+            toc << "</ul>\n</li>\n"
+            was -= 1
+          end
+          while was < level
+            toc << "<li>\n<ul>"
+            was += 1
+          end
+          toc << "<li><a href=\"##{name}#{uniq}\">#{node.inner_html}</a></li>"
+        end
+        length = 0
+        doc.traverse {|node| length += node.text.length if node.text? }
+        return doc unless length >= context[:toc_minimal_length]
+        while was > 1
+          toc << "</ul>\n</li>\n"
+          was -= 1
+        end
+        unless headers.empty?
+          first_child = doc.child
+          first_child.add_previous_sibling context[:toc_header]
+          first_child.add_previous_sibling "<ul class=\"toc\">#{toc}</ul>"
+        end
+        doc
+      end
+      def validate
+        needs :toc_minimal_length, :toc_header
+      end
+    end
+  end
+end

data/lib/html/pipeline/version.rb ADDED Viewed

@@ -0,0 +1,5 @@
+module HTML
+  class Pipeline
+    # Version string of this release of the gem.
+    VERSION = "0.0.14"
+  end
+end

data/test/helpers/mocked_instrumentation_service.rb ADDED Viewed

@@ -0,0 +1,17 @@
+class MockedInstrumentationService
+  attr_reader :events
+  def initialize(event = nil, events = [])
+    @events = events
+    subscribe event
+  end
+  def instrument(event, payload = nil)
+    payload ||= {}
+    res = yield payload
+    events << [event, payload, res] if @subscribe == event
+    res
+  end
+  def subscribe(event)
+    @subscribe = event
+    @events
+  end
+end

data/test/html/pipeline/absolute_source_filter_test.rb ADDED Viewed

@@ -0,0 +1,56 @@
+require "test_helper"
+class HTML::Pipeline::AbsoluteSourceFilterTest < Test::Unit::TestCase
+  AbsoluteSourceFilter = HTML::Pipeline::AbsoluteSourceFilter
+  def setup
+    @image_base_url = 'http://assets.example.com'
+    @image_subpage_url = 'http://blog.example.com/a/post'
+    @options = {
+      :image_base_url    => @image_base_url,
+      :image_subpage_url => @image_subpage_url
+    }
+  end
+  def test_rewrites_root_relative_urls
+    orig = %(<p><img src="/img.png"></p>)
+    puts AbsoluteSourceFilter.call(orig, @options).to_s
+    assert_equal "<p><img src=\"#{@image_base_url}/img.png\"></p>",
+      AbsoluteSourceFilter.call(orig, @options).to_s
+  end
+  def test_rewrites_root_relative_urls
+    orig = %(<p><img src="post/img.png"></p>)
+    assert_equal "<p><img src=\"#{@image_subpage_url}/img.png\"></p>",
+      AbsoluteSourceFilter.call(orig, @options).to_s
+  end
+  def test_does_not_rewrite_absolute_urls
+    orig = %(<p><img src="http://other.example.com/img.png"></p>)
+    result = AbsoluteSourceFilter.call(orig, @options).to_s
+    assert_no_match /@image_base_url/, result
+    assert_no_match /@image_subpage_url/, result
+  end
+  def test_fails_when_context_is_missing
+    assert_raise RuntimeError do
+      AbsoluteSourceFilter.call("<img src=\"img.png\">", {})
+    end
+    assert_raise RuntimeError do
+      AbsoluteSourceFilter.call("<img src=\"/img.png\">", {})
+    end
+  end
+  def test_tells_you_where_context_is_required
+    exception = assert_raise(RuntimeError) {
+      AbsoluteSourceFilter.call("<img src=\"img.png\">", {})
+    }
+    assert_match 'HTML::Pipeline::AbsoluteSourceFilter', exception.message
+    exception = assert_raise(RuntimeError) {
+      AbsoluteSourceFilter.call("<img src=\"/img.png\">", {})
+    }
+    assert_match 'HTML::Pipeline::AbsoluteSourceFilter', exception.message
+  end
+end

data/test/html/pipeline/camo_filter_test.rb ADDED Viewed

@@ -0,0 +1,47 @@
+require "test_helper"
+class HTML::Pipeline::CamoFilterTest < Test::Unit::TestCase
+  CamoFilter = HTML::Pipeline::CamoFilter
+  def setup
+    @asset_proxy_url        = 'https//assets.example.org'
+    @asset_proxy_secret_key = 'ssssh-secret'
+    @options = {
+      :asset_proxy            => @asset_proxy_url,
+      :asset_proxy_secret_key => @asset_proxy_secret_key
+    }
+  end
+  def test_camouflaging_http_image_urls
+    orig = %(<p><img src="http://twitter.com/img.png"></p>)
+    assert_includes 'img src="' + @asset_proxy_url,
+      CamoFilter.call(orig, @options).to_s
+  end
+  def test_rewrites_dotcom_image_urls
+    orig = %(<p><img src="http://github.com/img.png"></p>)
+    assert_equal "<p><img src=\"https://github.com/img.png\"></p>",
+      CamoFilter.call(orig, @options).to_s
+  end
+  def test_not_camouflaging_https_image_urls
+    orig = %(<p><img src="https://foo.com/img.png"></p>)
+    assert_doesnt_include 'img src="' + @asset_proxy_url,
+      CamoFilter.call(orig, @options).to_s
+  end
+  def test_handling_images_with_no_src_attribute
+    orig = %(<p><img></p>)
+    assert_nothing_raised do
+      CamoFilter.call(orig, @options).to_s
+    end
+  end
+  def test_required_context_validation
+    exception = assert_raise(ArgumentError) {
+      CamoFilter.call("", {})
+    }
+    assert_match /:asset_proxy[^_]/, exception.message
+    assert_match /:asset_proxy_secret_key/, exception.message
+  end
+end

data/test/html/pipeline/image_max_width_filter_test.rb ADDED Viewed

@@ -0,0 +1,50 @@
+require "test_helper"
+class HTML::Pipeline::ImageMaxWidthFilterTest < Test::Unit::TestCase
+  def filter(html)
+    HTML::Pipeline::ImageMaxWidthFilter.call(html)
+  end
+  def test_rewrites_image_style_tags
+    body = "<p>Screenshot: <img src='screenshot.png'></p>"
+    doc  = Nokogiri::HTML::DocumentFragment.parse(body)
+    res = filter(doc)
+    assert_equal_html %q(<p>Screenshot: <a target="_blank" href="screenshot.png"><img src="screenshot.png" style="max-width:100%;"></a></p>),
+      res.to_html
+  end
+  def test_leaves_existing_image_style_tags_alone
+    body = "<p><img src='screenshot.png' style='width:100px;'></p>"
+    doc  = Nokogiri::HTML::DocumentFragment.parse(body)
+    res = filter(doc)
+    assert_equal_html '<p><img src="screenshot.png" style="width:100px;"></p>',
+      res.to_html
+  end
+  def test_links_to_image
+    body = "<p>Screenshot: <img src='screenshot.png'></p>"
+    doc  = Nokogiri::HTML::DocumentFragment.parse(body)
+    res = filter(doc)
+    assert_equal_html '<p>Screenshot: <a target="_blank" href="screenshot.png"><img src="screenshot.png" style="max-width:100%;"></a></p>',
+      res.to_html
+  end
+  def test_doesnt_link_to_image_when_already_linked
+    body = "<p>Screenshot: <a href='blah.png'><img src='screenshot.png'></a></p>"
+    doc  = Nokogiri::HTML::DocumentFragment.parse(body)
+    res = filter(doc)
+    assert_equal_html %q(<p>Screenshot: <a href="blah.png"><img src="screenshot.png" style="max-width:100%;"></a></p>),
+      res.to_html
+  end
+  def test_doesnt_screw_up_inlined_images
+    body = "<p>Screenshot <img src='screenshot.png'>, yes, this is a <b>screenshot</b> indeed.</p>"
+    doc  = Nokogiri::HTML::DocumentFragment.parse(body)
+    assert_equal_html %q(<p>Screenshot <a target="_blank" href="screenshot.png"><img src="screenshot.png" style="max-width:100%;"></a>, yes, this is a <b>screenshot</b> indeed.</p>), filter(doc).to_html
+  end
+end

data/test/html/pipeline/markdown_filter_test.rb ADDED Viewed

@@ -0,0 +1,101 @@
+require "test_helper"
+MarkdownFilter = HTML::Pipeline::MarkdownFilter
+class HTML::Pipeline::MarkdownFilterTest < Test::Unit::TestCase
+  def setup
+    @haiku =
+      "Pointing at the moon\n" +
+      "Reminded of simple things\n" +
+      "Moments matter most"
+    @links =
+      "See http://example.org/ for more info"
+    @code =
+      "```\n" +
+      "def hello()" +
+      "  'world'" +
+      "end" +
+      "```"
+  end
+  def test_fails_when_given_a_documentfragment
+    body = "<p>heyo</p>"
+    doc  = HTML::Pipeline.parse(body)
+    assert_raise(TypeError) { MarkdownFilter.call(doc, {}) }
+  end
+  def test_gfm_enabled_by_default
+    doc = MarkdownFilter.to_document(@haiku, {})
+    assert doc.kind_of?(HTML::Pipeline::DocumentFragment)
+    assert_equal 2, doc.search('br').size
+  end
+  def test_disabling_gfm
+    doc = MarkdownFilter.to_document(@haiku, :gfm => false)
+    assert doc.kind_of?(HTML::Pipeline::DocumentFragment)
+    assert_equal 0, doc.search('br').size
+  end
+  def test_fenced_code_blocks
+    doc = MarkdownFilter.to_document(@code)
+    assert doc.kind_of?(HTML::Pipeline::DocumentFragment)
+    assert_equal 1, doc.search('pre').size
+  end
+  def test_fenced_code_blocks_with_language
+    doc = MarkdownFilter.to_document(@code.sub("```", "``` ruby"))
+    assert doc.kind_of?(HTML::Pipeline::DocumentFragment)
+    assert_equal 1, doc.search('pre').size
+    assert_equal 'ruby', doc.search('pre').first['lang']
+  end
+end
+class GFMTest < Test::Unit::TestCase
+  def gfm(text)
+    MarkdownFilter.call(text, :gfm => true)
+  end
+  def test_not_touch_single_underscores_inside_words
+    assert_equal "<p>foo_bar</p>",
+                 gfm("foo_bar")
+  end
+  def test_not_touch_underscores_in_code_blocks
+    assert_equal "<pre><code>foo_bar_baz\n</code></pre>",
+                 gfm("    foo_bar_baz")
+  end
+  def test_not_touch_underscores_in_pre_blocks
+    assert_equal "<pre>\nfoo_bar_baz\n</pre>",
+                 gfm("<pre>\nfoo_bar_baz\n</pre>")
+  end
+  def test_not_touch_two_or_more_underscores_inside_words
+    assert_equal "<p>foo_bar_baz</p>",
+                 gfm("foo_bar_baz")
+  end
+  def test_turn_newlines_into_br_tags_in_simple_cases
+    assert_equal "<p>foo<br>\nbar</p>",
+                 gfm("foo\nbar")
+  end
+  def test_convert_newlines_in_all_groups
+    assert_equal "<p>apple<br>\npear<br>\norange</p>\n\n" +
+                 "<p>ruby<br>\npython<br>\nerlang</p>",
+                 gfm("apple\npear\norange\n\nruby\npython\nerlang")
+  end
+  def test_convert_newlines_in_even_long_groups
+    assert_equal "<p>apple<br>\npear<br>\norange<br>\nbanana</p>\n\n" +
+                 "<p>ruby<br>\npython<br>\nerlang</p>",
+                 gfm("apple\npear\norange\nbanana\n\nruby\npython\nerlang")
+  end
+  def test_not_convert_newlines_in_lists
+    assert_equal "<h1>foo</h1>\n\n<h1>bar</h1>",
+                 gfm("# foo\n# bar")
+    assert_equal "<ul>\n<li>foo</li>\n<li>bar</li>\n</ul>",
+                 gfm("* foo\n* bar")
+  end
+end