html-pipeline 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (36) hide show
  1. data/.gitignore +19 -0
  2. data/.travis.yml +13 -0
  3. data/Gemfile +9 -0
  4. data/LICENSE +22 -0
  5. data/README.md +128 -0
  6. data/Rakefile +11 -0
  7. data/html-pipeline.gemspec +25 -0
  8. data/lib/html/pipeline.rb +130 -0
  9. data/lib/html/pipeline/@mention_filter.rb +118 -0
  10. data/lib/html/pipeline/autolink_filter.rb +22 -0
  11. data/lib/html/pipeline/body_content.rb +42 -0
  12. data/lib/html/pipeline/camo_filter.rb +64 -0
  13. data/lib/html/pipeline/email_reply_filter.rb +56 -0
  14. data/lib/html/pipeline/emoji_filter.rb +48 -0
  15. data/lib/html/pipeline/filter.rb +158 -0
  16. data/lib/html/pipeline/https_filter.rb +13 -0
  17. data/lib/html/pipeline/image_max_width_filter.rb +37 -0
  18. data/lib/html/pipeline/markdown_filter.rb +29 -0
  19. data/lib/html/pipeline/plain_text_input_filter.rb +11 -0
  20. data/lib/html/pipeline/sanitization_filter.rb +107 -0
  21. data/lib/html/pipeline/syntax_highlight_filter.rb +29 -0
  22. data/lib/html/pipeline/text_filter.rb +14 -0
  23. data/lib/html/pipeline/textile_filter.rb +21 -0
  24. data/lib/html/pipeline/toc_filter.rb +28 -0
  25. data/lib/html/pipeline/version.rb +5 -0
  26. data/test/html/pipeline/autolink_filter_test.rb +22 -0
  27. data/test/html/pipeline/camo_filter_test.rb +39 -0
  28. data/test/html/pipeline/emoji_filter_test.rb +16 -0
  29. data/test/html/pipeline/image_max_width_filter_test.rb +50 -0
  30. data/test/html/pipeline/markdown_filter_test.rb +101 -0
  31. data/test/html/pipeline/mention_filter_test.rb +158 -0
  32. data/test/html/pipeline/plain_text_input_filter_test.rb +22 -0
  33. data/test/html/pipeline/sanitization_filter_test.rb +47 -0
  34. data/test/html/pipeline/toc_filter_test.rb +47 -0
  35. data/test/test_helper.rb +38 -0
  36. metadata +221 -0
@@ -0,0 +1,29 @@
1
+ require 'github/markdown'
2
+
3
module HTML
  class Pipeline
    # Text filter that renders Markdown source to HTML with GitHub::Markdown.
    # This is different from most filters in that it takes non-HTML text as
    # input. It must be used as the first filter in a pipeline.
    #
    # Context options:
    #   :gfm => false    Disable GFM line-end processing
    #
    # This filter does not write any additional information to the context hash.
    class MarkdownFilter < TextFilter
      # text    - Markdown source. Carriage returns are removed from the copy
      #           held by the filter.
      # context - context options (see above), forwarded to the superclass.
      # result  - forwarded to the superclass unchanged.
      def initialize(text, context = nil, result = nil)
        super text, context, result
        # Assign a new string rather than calling gsub!: when the caller passes
        # a String, @text is that very object, so an in-place edit would alter
        # the caller's data and would raise on a frozen string.
        @text = @text.gsub("\r", '')
      end

      # Render @text with GitHub::Markdown, using :gfm mode unless the context
      # explicitly sets :gfm to false.
      #
      # Returns the rendered HTML String with trailing whitespace removed.
      def call
        mode = (context[:gfm] != false) ? :gfm : :markdown
        html = GitHub::Markdown.to_html(@text, mode)
        html.rstrip!
        html
      end
    end
  end
end
@@ -0,0 +1,11 @@
1
module HTML
  class Pipeline
    # Text filter for plain text input. The text is HTML escaped and the
    # escaped result is wrapped in a single <div>.
    class PlainTextInputFilter < TextFilter
      # Returns the escaped text wrapped in a <div>, as a String.
      def call
        escaped = EscapeUtils.escape_html(@text, false)
        "<div>#{escaped}</div>"
      end
    end
  end
end
@@ -0,0 +1,107 @@
1
+ require 'sanitize'
2
+
3
module HTML
  class Pipeline
    # HTML filter with sanitization routines and whitelists. This class defines
    # which HTML is allowed in user provided content and hands the document to
    # the Sanitize library, which also fixes up unbalanced tags and the like.
    #
    # See the Sanitize docs for more information on the underlying library:
    #
    # https://github.com/rgrove/sanitize/#readme
    #
    # Context options:
    #   :whitelist - The sanitizer whitelist configuration to use. This can be
    #                one of the options constants defined in this class or a
    #                custom sanitize options hash.
    #
    # This filter does not write additional information to the context.
    class SanitizationFilter < Filter
      # Elements that may legitimately contain an <li>.
      LISTS     = Set.new(['ul', 'ol'].freeze)
      LIST_ITEM = 'li'.freeze

      # Table child elements. They are only let through when contained by a
      # <table>; otherwise they could be used to break out of places where
      # tables hold formatted user content (like pull request review comments).
      TABLE_ITEMS = Set.new(['tr', 'td', 'th'].freeze)
      TABLE       = 'table'.freeze

      # The main sanitization whitelist. Only these elements and attributes are
      # allowed through by default.
      WHITELIST = {
        :elements => %w(
          h1 h2 h3 h4 h5 h6 h7 h8
          br b i strong em a pre code img tt
          div ins del sup sub p ol ul table blockquote
          dl dt dd kbd q samp var hr ruby rt rp
        ),
        :attributes => {
          'a'   => ['href'],
          'img' => ['src'],
          'div' => ['itemscope', 'itemtype'],
          :all  => %w(
            abbr accept accept-charset accesskey action align alt axis
            border cellpadding cellspacing char charoff charset checked cite
            clear cols colspan color compact coords datetime dir
            disabled enctype for frame headers height hreflang
            hspace ismap label lang longdesc maxlength media method
            multiple name nohref noshade nowrap prompt readonly rel rev
            rows rowspan rules scope selected shape size span
            start summary tabindex target title type usemap valign value
            vspace width itemprop
          )
        },
        :protocols => {
          'a'   => { 'href' => ['http', 'https', 'mailto', :relative, 'github-windows', 'github-mac'] },
          'img' => { 'src'  => ['http', 'https', :relative] }
        },
        :transformers => [
          # Whitelist only <li> elements that descend from a <ul> or <ol>.
          # Top-level <li> elements are removed because they can break out of
          # containing markup.
          lambda do |env|
            node = env[:node]
            next unless env[:node_name] == LIST_ITEM
            next unless node.ancestors.any? { |ancestor| LISTS.include?(ancestor.name) }
            { :node_whitelist => [node] }
          end,

          # Whitelist only table child elements that descend from a <table>.
          # Table child elements outside of a <table> are removed.
          lambda do |env|
            node = env[:node]
            next unless TABLE_ITEMS.include?(env[:node_name])
            next unless node.ancestors.any? { |ancestor| ancestor.name == TABLE }
            { :node_whitelist => [node] }
          end
        ]
      }

      # A more limited sanitization whitelist. It reuses every attribute,
      # protocol and transformer from WHITELIST and only narrows the set of
      # allowed elements.
      LIMITED = WHITELIST.merge(
        :elements => %w(b i strong em a pre code img ins del sup sub p ol ul li)
      )

      # Strip all HTML tags from the document.
      FULL = { :elements => [] }

      # Sanitize the document in place using the Sanitize library and the
      # whitelist selected by #whitelist.
      def call
        Sanitize.clean_node!(doc, whitelist)
      end

      # The whitelist to use when sanitizing: the :whitelist context value when
      # one is given, the WHITELIST constant otherwise.
      def whitelist
        custom = context[:whitelist]
        custom ? custom : WHITELIST
      end
    end
  end
end
@@ -0,0 +1,29 @@
1
+ require 'linguist'
2
+
3
module HTML
  class Pipeline
    # HTML filter that syntax highlights code blocks wrapped
    # in <pre lang="...">.
    class SyntaxHighlightFilter < Filter
      # Replace every <pre> that names a known language with its highlighted
      # markup. A <pre> without a lang attribute, with a language that has no
      # lexer, or whose highlighting timed out is left untouched.
      #
      # Returns the document.
      def call
        doc.search('pre').each do |pre|
          language = pre['lang']
          next unless language

          lexer = Pygments::Lexer[language]
          next unless lexer

          highlighted = highlight_with_timeout_handling(lexer, pre.inner_text)
          pre.replace(highlighted) unless highlighted.nil?
        end
        doc
      end

      # Highlight text with the given lexer.
      #
      # Returns the lexer's output, or nil when highlighting raised
      # Timeout::Error.
      def highlight_with_timeout_handling(lexer, text)
        begin
          lexer.highlight(text)
        rescue Timeout::Error
          nil
        end
      end
    end
  end
end
@@ -0,0 +1,14 @@
1
module HTML
  class Pipeline
    # Filter whose input is plain text rather than HTML. The text is kept in
    # @text and nil is handed to the superclass in place of a document.
    class TextFilter < Filter
      attr_reader :text

      # text    - the input text; anything that is not a DocumentFragment.
      # context - forwarded to the superclass unchanged.
      # result  - forwarded to the superclass unchanged.
      #
      # Raises TypeError when text is a DocumentFragment.
      def initialize(text, context = nil, result = nil)
        if text.is_a?(DocumentFragment)
          raise TypeError, "text cannot be HTML"
        end

        # Always store a string: prefer the implicit conversion (to_str) and
        # fall back to to_s for everything else.
        @text =
          if text.respond_to?(:to_str)
            text.to_str
          else
            text.to_s
          end

        super(nil, context, result)
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
module HTML
  class Pipeline
    # Text filter that renders Textile source to HTML with RedCloth. This is
    # different from most filters in that it takes non-HTML text as input. It
    # must be used as the first filter in a pipeline.
    #
    # This class itself reads no context options and does not write any
    # additional information to the context hash.
    #
    # NOTE This filter is provided for really old comments only. It probably
    # shouldn't be used for anything new.
    class TextileFilter < TextFilter
      # Convert the Textile in @text to HTML.
      #
      # Returns whatever RedCloth#to_html produces for the text.
      def call
        textile = RedCloth.new(@text)
        textile.to_html
      end
    end
  end
end
@@ -0,0 +1,28 @@
1
module HTML
  class Pipeline
    # HTML filter that puts a named anchor link at the start of every header
    # (h1 through h6) in a document, so headers can be reached from a table
    # of contents.
    #
    # TODO: besides adding the anchors, we should get around to eventually
    # generating the Table of Contents itself, with links to each header
    class TableOfContentsFilter < Filter
      # Insert an anchor before the first child of each header. The anchor
      # name is derived from the header text; repeated names get a numeric
      # suffix ("-1", "-2", ...) so that each anchor stays unique. Headers
      # without children are skipped (their name still counts as seen).
      #
      # Returns the document.
      def call
        seen = Hash.new(0)

        doc.css('h1, h2, h3, h4, h5, h6').each do |header|
          slug = header.text.downcase
          slug = slug.gsub(/[^\w\- ]/, '')    # remove punctuation
          slug = slug.gsub(' ', '-')          # replace spaces with dash
          slug = EscapeUtils.escape_uri(slug) # escape extended UTF-8 chars

          # The first occurrence has no suffix, later ones are numbered.
          suffix = seen[slug] > 0 ? "-#{seen[slug]}" : ''
          seen[slug] += 1

          first_child = header.children.first
          next unless first_child

          first_child.add_previous_sibling(%Q{<a name="#{slug}#{suffix}" class="anchor" href="##{slug}#{suffix}"><span class="mini-icon mini-icon-link"></span></a>})
        end

        doc
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
module HTML
  class Pipeline
    # Version string of this library.
    VERSION = '0.0.4'
  end
end
@@ -0,0 +1,22 @@
1
+ require "test_helper"
2
+
3
AutolinkFilter = HTML::Pipeline::AutolinkFilter

# Tests for HTML::Pipeline::AutolinkFilter.
class HTML::Pipeline::AutolinkFilterTest < Test::Unit::TestCase
  # Just try to parse a complicated piece of HTML that Rails auto_link
  # cannot handle.
  def test_uses_rinku_for_autolinking
    input    = '<p>"http://www.github.com"</p>'
    expected = '<p>"<a href="http://www.github.com">http://www.github.com</a>"</p>'

    assert_equal expected, AutolinkFilter.to_html(input)
  end

  # With :autolink => false the input must come back unchanged.
  def test_autolink_option
    input = '<p>"http://www.github.com"</p>'

    assert_equal '<p>"http://www.github.com"</p>',
      AutolinkFilter.to_html(input, :autolink => false)
  end

  # The :flags option is exercised with Rinku's short-domain flag.
  def test_autolink_flags
    input    = '<p>"http://github"</p>'
    expected = '<p>"<a href="http://github">http://github</a>"</p>'
    options  = { :flags => Rinku::AUTOLINK_SHORT_DOMAINS }

    assert_equal expected, AutolinkFilter.to_html(input, options)
  end
end
@@ -0,0 +1,39 @@
1
+ require "test_helper"
2
+
3
# Tests for HTML::Pipeline::CamoFilter.
class HTML::Pipeline::CamoFilterTest < Test::Unit::TestCase
  CamoFilter = HTML::Pipeline::CamoFilter

  # Build the context shared by all tests: an asset proxy URL and its secret.
  def setup
    # The fixture must be a well-formed URL, so the scheme is followed
    # by "://".
    @asset_proxy_url        = 'https://assets.example.org'
    @asset_proxy_secret_key = 'ssssh-secret'
    @options = {
      :asset_proxy            => @asset_proxy_url,
      :asset_proxy_secret_key => @asset_proxy_secret_key
    }
  end

  # Plain http image sources must be rewritten to go through the proxy.
  def test_camouflaging_http_image_urls
    orig = %(<p><img src="http://twitter.com/img.png"></p>)
    assert_includes 'img src="' + @asset_proxy_url,
      CamoFilter.call(orig, @options).to_s
  end

  # http://github.com sources are switched to https and not proxied.
  def test_rewrites_dotcom_image_urls
    orig = %(<p><img src="http://github.com/img.png"></p>)
    assert_equal "<p><img src=\"https://github.com/img.png\"></p>",
      CamoFilter.call(orig, @options).to_s
  end

  # Sources that are already https must not be sent through the proxy.
  def test_not_camouflaging_https_image_urls
    orig = %(<p><img src="https://foo.com/img.png"></p>)
    assert_doesnt_include 'img src="' + @asset_proxy_url,
      CamoFilter.call(orig, @options).to_s
  end

  # An <img> without a src attribute must not make the filter raise.
  def test_handling_images_with_no_src_attribute
    orig = %(<p><img></p>)
    assert_nothing_raised do
      CamoFilter.call(orig, @options).to_s
    end
  end
end
@@ -0,0 +1,16 @@
1
+ require 'test_helper'
2
+
3
# Tests for HTML::Pipeline::EmojiFilter.
class HTML::Pipeline::EmojiFilterTest < Test::Unit::TestCase
  # An emoji code becomes an <img> whose src lives under :asset_root.
  def test_emojify
    context = { :asset_root => 'https://foo.com' }
    doc     = HTML::Pipeline::EmojiFilter.new("<p>:shipit:</p>", context).call
    src     = doc.search('img').attr('src').value

    assert_match "https://foo.com/emoji/shipit.png", src
  end

  # Calling the filter with an empty context raises ArgumentError.
  def test_missing_context
    filter = HTML::Pipeline::EmojiFilter.new("<p>:shipit:</p>", {})

    assert_raises(ArgumentError) { filter.call }
  end
end
@@ -0,0 +1,50 @@
1
+ require "test_helper"
2
+
3
# Tests for HTML::Pipeline::ImageMaxWidthFilter.
class HTML::Pipeline::ImageMaxWidthFilterTest < Test::Unit::TestCase
  # Run the filter under test on the given document.
  def filter(html)
    HTML::Pipeline::ImageMaxWidthFilter.call(html)
  end

  # An unstyled image gets a max-width style and a link to itself.
  def test_rewrites_image_style_tags
    doc      = Nokogiri::HTML::DocumentFragment.parse("<p>Screenshot: <img src='screenshot.png'></p>")
    expected = %q(<p>Screenshot: <a target="_blank" href="screenshot.png"><img src="screenshot.png" style="max-width:100%;"></a></p>)

    assert_equal_html expected, filter(doc).to_html
  end

  # An image that already carries a style attribute is left as it is.
  def test_leaves_existing_image_style_tags_alone
    doc      = Nokogiri::HTML::DocumentFragment.parse("<p><img src='screenshot.png' style='width:100px;'></p>")
    expected = '<p><img src="screenshot.png" style="width:100px;"></p>'

    assert_equal_html expected, filter(doc).to_html
  end

  # The image is wrapped in a link that points at the image itself.
  def test_links_to_image
    doc      = Nokogiri::HTML::DocumentFragment.parse("<p>Screenshot: <img src='screenshot.png'></p>")
    expected = '<p>Screenshot: <a target="_blank" href="screenshot.png"><img src="screenshot.png" style="max-width:100%;"></a></p>'

    assert_equal_html expected, filter(doc).to_html
  end

  # An image that already sits inside a link keeps that link.
  def test_doesnt_link_to_image_when_already_linked
    doc      = Nokogiri::HTML::DocumentFragment.parse("<p>Screenshot: <a href='blah.png'><img src='screenshot.png'></a></p>")
    expected = %q(<p>Screenshot: <a href="blah.png"><img src="screenshot.png" style="max-width:100%;"></a></p>)

    assert_equal_html expected, filter(doc).to_html
  end

  # Text and markup surrounding an inline image must stay in place.
  def test_doesnt_screw_up_inlined_images
    doc      = Nokogiri::HTML::DocumentFragment.parse("<p>Screenshot <img src='screenshot.png'>, yes, this is a <b>screenshot</b> indeed.</p>")
    expected = %q(<p>Screenshot <a target="_blank" href="screenshot.png"><img src="screenshot.png" style="max-width:100%;"></a>, yes, this is a <b>screenshot</b> indeed.</p>)

    assert_equal_html expected, filter(doc).to_html
  end
end
@@ -0,0 +1,101 @@
1
+ require "test_helper"
2
+
3
MarkdownFilter = HTML::Pipeline::MarkdownFilter

# Tests for HTML::Pipeline::MarkdownFilter.
class HTML::Pipeline::MarkdownFilterTest < Test::Unit::TestCase
  # Build the Markdown fixtures shared by the tests.
  def setup
    @haiku =
      "Pointing at the moon\n" +
      "Reminded of simple things\n" +
      "Moments matter most"
    @links =
      "See http://example.org/ for more info"
    # Every line of the fenced block ends with a newline so that the closing
    # fence sits on a line of its own; without the newlines the block would
    # never be terminated.
    @code =
      "```\n" +
      "def hello()\n" +
      "  'world'\n" +
      "end\n" +
      "```"
  end

  # A DocumentFragment is not acceptable input for a text filter.
  def test_fails_when_given_a_documentfragment
    body = "<p>heyo</p>"
    doc  = HTML::Pipeline.parse(body)
    assert_raise(TypeError) { MarkdownFilter.call(doc, {}) }
  end

  # With an empty context the two line ends of the haiku become <br> tags.
  def test_gfm_enabled_by_default
    doc = MarkdownFilter.to_document(@haiku, {})
    assert doc.kind_of?(HTML::Pipeline::DocumentFragment)
    assert_equal 2, doc.search('br').size
  end

  # With :gfm => false no <br> tags are produced.
  def test_disabling_gfm
    doc = MarkdownFilter.to_document(@haiku, :gfm => false)
    assert doc.kind_of?(HTML::Pipeline::DocumentFragment)
    assert_equal 0, doc.search('br').size
  end

  # A fenced block renders as exactly one <pre>.
  def test_fenced_code_blocks
    doc = MarkdownFilter.to_document(@code)
    assert doc.kind_of?(HTML::Pipeline::DocumentFragment)
    assert_equal 1, doc.search('pre').size
  end

  # A language named on the opening fence ends up in the <pre> lang attribute.
  def test_fenced_code_blocks_with_language
    doc = MarkdownFilter.to_document(@code.sub("```", "``` ruby"))
    assert doc.kind_of?(HTML::Pipeline::DocumentFragment)
    assert_equal 1, doc.search('pre').size
    assert_equal 'ruby', doc.search('pre').first['lang']
  end
end
52
+
53
# Tests for the GFM rendering mode of the Markdown filter.
class GFMTest < Test::Unit::TestCase
  # Render text with GFM explicitly enabled.
  def gfm(text)
    MarkdownFilter.call(text, :gfm => true)
  end

  def test_not_touch_single_underscores_inside_words
    assert_equal "<p>foo_bar</p>",
      gfm("foo_bar")
  end

  def test_not_touch_underscores_in_code_blocks
    # An indented code block needs four leading spaces; with fewer the line
    # is an ordinary paragraph and no <pre><code> would be produced.
    assert_equal "<pre><code>foo_bar_baz\n</code></pre>",
      gfm("    foo_bar_baz")
  end

  def test_not_touch_underscores_in_pre_blocks
    assert_equal "<pre>\nfoo_bar_baz\n</pre>",
      gfm("<pre>\nfoo_bar_baz\n</pre>")
  end

  def test_not_touch_two_or_more_underscores_inside_words
    assert_equal "<p>foo_bar_baz</p>",
      gfm("foo_bar_baz")
  end

  def test_turn_newlines_into_br_tags_in_simple_cases
    assert_equal "<p>foo<br>\nbar</p>",
      gfm("foo\nbar")
  end

  def test_convert_newlines_in_all_groups
    assert_equal "<p>apple<br>\npear<br>\norange</p>\n\n" +
                 "<p>ruby<br>\npython<br>\nerlang</p>",
      gfm("apple\npear\norange\n\nruby\npython\nerlang")
  end

  def test_convert_newlines_in_even_long_groups
    assert_equal "<p>apple<br>\npear<br>\norange<br>\nbanana</p>\n\n" +
                 "<p>ruby<br>\npython<br>\nerlang</p>",
      gfm("apple\npear\norange\nbanana\n\nruby\npython\nerlang")
  end

  # Line ends between headers and between list items must not become <br>.
  def test_not_convert_newlines_in_lists
    assert_equal "<h1>foo</h1>\n\n<h1>bar</h1>",
      gfm("# foo\n# bar")
    assert_equal "<ul>\n<li>foo</li>\n<li>bar</li>\n</ul>",
      gfm("* foo\n* bar")
  end
end