html-pipeline 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. data/.gitignore +19 -0
  2. data/.travis.yml +13 -0
  3. data/Gemfile +9 -0
  4. data/LICENSE +22 -0
  5. data/README.md +128 -0
  6. data/Rakefile +11 -0
  7. data/html-pipeline.gemspec +25 -0
  8. data/lib/html/pipeline.rb +130 -0
  9. data/lib/html/pipeline/@mention_filter.rb +118 -0
  10. data/lib/html/pipeline/autolink_filter.rb +22 -0
  11. data/lib/html/pipeline/body_content.rb +42 -0
  12. data/lib/html/pipeline/camo_filter.rb +64 -0
  13. data/lib/html/pipeline/email_reply_filter.rb +56 -0
  14. data/lib/html/pipeline/emoji_filter.rb +48 -0
  15. data/lib/html/pipeline/filter.rb +158 -0
  16. data/lib/html/pipeline/https_filter.rb +13 -0
  17. data/lib/html/pipeline/image_max_width_filter.rb +37 -0
  18. data/lib/html/pipeline/markdown_filter.rb +29 -0
  19. data/lib/html/pipeline/plain_text_input_filter.rb +11 -0
  20. data/lib/html/pipeline/sanitization_filter.rb +107 -0
  21. data/lib/html/pipeline/syntax_highlight_filter.rb +29 -0
  22. data/lib/html/pipeline/text_filter.rb +14 -0
  23. data/lib/html/pipeline/textile_filter.rb +21 -0
  24. data/lib/html/pipeline/toc_filter.rb +28 -0
  25. data/lib/html/pipeline/version.rb +5 -0
  26. data/test/html/pipeline/autolink_filter_test.rb +22 -0
  27. data/test/html/pipeline/camo_filter_test.rb +39 -0
  28. data/test/html/pipeline/emoji_filter_test.rb +16 -0
  29. data/test/html/pipeline/image_max_width_filter_test.rb +50 -0
  30. data/test/html/pipeline/markdown_filter_test.rb +101 -0
  31. data/test/html/pipeline/mention_filter_test.rb +158 -0
  32. data/test/html/pipeline/plain_text_input_filter_test.rb +22 -0
  33. data/test/html/pipeline/sanitization_filter_test.rb +47 -0
  34. data/test/html/pipeline/toc_filter_test.rb +47 -0
  35. data/test/test_helper.rb +38 -0
  36. metadata +221 -0
@@ -0,0 +1,29 @@
1
+ require 'github/markdown'
2
+
3
+ module HTML
4
+ class Pipeline
5
+ # HTML Filter that converts Markdown text into HTML and converts into a
6
+ # DocumentFragment. This is different from most filters in that it can take a
7
+ # non-HTML as input. It must be used as the first filter in a pipeline.
8
+ #
9
+ # Context options:
10
+ # :gfm => false Disable GFM line-end processing
11
+ #
12
+ # This filter does not write any additional information to the context hash.
13
+ class MarkdownFilter < TextFilter
14
+ def initialize(text, context = nil, result = nil)
15
+ super text, context, result
16
+ @text.gsub! "\r", ''
17
+ end
18
+
19
+ # Convert Markdown to HTML using the best available implementation
20
+ # and convert into a DocumentFragment.
21
+ def call
22
+ mode = (context[:gfm] != false) ? :gfm : :markdown
23
+ html = GitHub::Markdown.to_html(@text, mode)
24
+ html.rstrip!
25
+ html
26
+ end
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,11 @@
1
+ module HTML
2
+ class Pipeline
3
+ # Simple filter for plain text input. HTML escapes the text input and wraps it
4
+ # in a div.
5
+ class PlainTextInputFilter < TextFilter
6
+ def call
7
+ "<div>#{EscapeUtils.escape_html(@text, false)}</div>"
8
+ end
9
+ end
10
+ end
11
+ end
@@ -0,0 +1,107 @@
1
+ require 'sanitize'
2
+
3
+ module HTML
4
+ class Pipeline
5
+ # HTML filter with sanitization routines and whitelists. This module defines
6
+ # what HTML is allowed in user provided content and fixes up issues with
7
+ # unbalanced tags and whatnot.
8
+ #
9
+ # See the Sanitize docs for more information on the underlying library:
10
+ #
11
+ # https://github.com/rgrove/sanitize/#readme
12
+ #
13
+ # Context options:
14
+ # :whitelist - The sanitizer whitelist configuration to use. This can be one
15
+ # of the options constants defined in this class or a custom
16
+ # sanitize options hash.
17
+ #
18
+ # This filter does not write additional information to the context.
19
+ class SanitizationFilter < Filter
20
+ LISTS = Set.new(%w(ul ol).freeze)
21
+ LIST_ITEM = 'li'.freeze
22
+
23
+ # List of table child elements. These must be contained by a <table> element
24
+ # or they are not allowed through. Otherwise they can be used to break out
25
+ # of places we're using tables to contain formatted user content (like pull
26
+ # request review comments).
27
+ TABLE_ITEMS = Set.new(%w(tr td th).freeze)
28
+ TABLE = 'table'.freeze
29
+
30
+ # The main sanitization whitelist. Only these elements and attributes are
31
+ # allowed through by default.
32
+ WHITELIST = {
33
+ :elements => %w(
34
+ h1 h2 h3 h4 h5 h6 h7 h8 br b i strong em a pre code img tt
35
+ div ins del sup sub p ol ul table blockquote dl dt dd
36
+ kbd q samp var hr ruby rt rp
37
+ ),
38
+ :attributes => {
39
+ 'a' => ['href'],
40
+ 'img' => ['src'],
41
+ 'div' => ['itemscope', 'itemtype'],
42
+ :all => ['abbr', 'accept', 'accept-charset',
43
+ 'accesskey', 'action', 'align', 'alt', 'axis',
44
+ 'border', 'cellpadding', 'cellspacing', 'char',
45
+ 'charoff', 'charset', 'checked', 'cite',
46
+ 'clear', 'cols', 'colspan', 'color',
47
+ 'compact', 'coords', 'datetime', 'dir',
48
+ 'disabled', 'enctype', 'for', 'frame',
49
+ 'headers', 'height', 'hreflang',
50
+ 'hspace', 'ismap', 'label', 'lang',
51
+ 'longdesc', 'maxlength', 'media', 'method',
52
+ 'multiple', 'name', 'nohref', 'noshade',
53
+ 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
54
+ 'rows', 'rowspan', 'rules', 'scope',
55
+ 'selected', 'shape', 'size', 'span',
56
+ 'start', 'summary', 'tabindex', 'target',
57
+ 'title', 'type', 'usemap', 'valign', 'value',
58
+ 'vspace', 'width', 'itemprop']
59
+ },
60
+ :protocols => {
61
+ 'a' => {'href' => ['http', 'https', 'mailto', :relative, 'github-windows', 'github-mac']},
62
+ 'img' => {'src' => ['http', 'https', :relative]}
63
+ },
64
+ :transformers => [
65
+ # whitelist only <li> elements that are descended from a <ul> or <ol>.
66
+ # top-level <li> elements are removed because they can break out of
67
+ # containing markup.
68
+ lambda { |env|
69
+ name, node = env[:node_name], env[:node]
70
+ if name == LIST_ITEM && node.ancestors.any?{ |n| LISTS.include?(n.name) }
71
+ {:node_whitelist => [node]}
72
+ end
73
+ },
74
+
75
+ # Whitelist only table child elements that are descended from a <table>.
76
+ # Table child elements that are not contained by a <table> are removed.
77
+ lambda { |env|
78
+ name, node = env[:node_name], env[:node]
79
+ if TABLE_ITEMS.include?(name) && node.ancestors.any? { |n| n.name == TABLE }
80
+ { :node_whitelist => [node] }
81
+ end
82
+ }
83
+ ]
84
+ }
85
+
86
+ # A more limited sanitization whitelist. This includes all attributes,
87
+ # protocols, and transformers from WHITELIST but with a more locked down
88
+ # set of allowed elements.
89
+ LIMITED = WHITELIST.merge(
90
+ :elements => %w(b i strong em a pre code img ins del sup sub p ol ul li))
91
+
92
+ # Strip all HTML tags from the document.
93
+ FULL = { :elements => [] }
94
+
95
+ # Sanitize markup using the Sanitize library.
96
+ def call
97
+ Sanitize.clean_node!(doc, whitelist)
98
+ end
99
+
100
+ # The whitelist to use when sanitizing. This can be passed in the context
101
+ # hash to the filter but defaults to WHITELIST constant value above.
102
+ def whitelist
103
+ context[:whitelist] || WHITELIST
104
+ end
105
+ end
106
+ end
107
+ end
@@ -0,0 +1,29 @@
1
+ require 'linguist'
2
+
3
+ module HTML
4
+ class Pipeline
5
+ # HTML Filter that syntax highlights code blocks wrapped
6
+ # in <pre lang="...">.
7
+ class SyntaxHighlightFilter < Filter
8
+ def call
9
+ doc.search('pre').each do |node|
10
+ next unless lang = node['lang']
11
+ next unless lexer = Pygments::Lexer[lang]
12
+ text = node.inner_text
13
+
14
+ html = highlight_with_timeout_handling(lexer, text)
15
+ next if html.nil?
16
+
17
+ node.replace(html)
18
+ end
19
+ doc
20
+ end
21
+
22
+ def highlight_with_timeout_handling(lexer, text)
23
+ lexer.highlight(text)
24
+ rescue Timeout::Error => boom
25
+ nil
26
+ end
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,14 @@
1
+ module HTML
2
+ class Pipeline
3
+ class TextFilter < Filter
4
+ attr_reader :text
5
+
6
+ def initialize(text, context = nil, result = nil)
7
+ raise TypeError, "text cannot be HTML" if text.is_a?(DocumentFragment)
8
+ # Ensure that this is always a string
9
+ @text = text.respond_to?(:to_str) ? text.to_str : text.to_s
10
+ super nil, context, result
11
+ end
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,21 @@
1
+ module HTML
2
+ class Pipeline
3
+ # HTML Filter that converts Textile text into HTML and converts into a
4
+ # DocumentFragment. This is different from most filters in that it can take a
5
+ # non-HTML as input. It must be used as the first filter in a pipeline.
6
+ #
7
+ # Context options:
8
+ # :autolink => false Disable autolinking URLs (NOTE: #call below does not read this option)
9
+ #
10
+ # This filter does not write any additional information to the context hash.
11
+ #
12
+ # NOTE This filter is provided for really old comments only. It probably
13
+ # shouldn't be used for anything new.
14
+ class TextileFilter < TextFilter
15
+ # Convert Textile to HTML and convert into a DocumentFragment.
16
+ def call
17
+ RedCloth.new(@text).to_html
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,28 @@
1
+ module HTML
2
+ class Pipeline
3
+ # HTML filter that prepends a named anchor (<a name="...">) to every header
4
+ # in a document, so they can be accessed from a table of contents
5
+ #
6
+ # TODO: besides adding the name attribute, we should get around to
7
+ # eventually generating the Table of Contents itself, with links
8
+ # to each header
9
+ class TableOfContentsFilter < Filter
10
+ def call
11
+ headers = Hash.new(0)
12
+ doc.css('h1, h2, h3, h4, h5, h6').each do |node|
13
+ name = node.text.downcase
14
+ name.gsub!(/[^\w\- ]/, '') # remove punctuation
15
+ name.gsub!(' ', '-') # replace spaces with dash
16
+ name = EscapeUtils.escape_uri(name) # escape extended UTF-8 chars
17
+
18
+ uniq = (headers[name] > 0) ? "-#{headers[name]}" : ''
19
+ headers[name] += 1
20
+ if header_content = node.children.first
21
+ header_content.add_previous_sibling(%Q{<a name="#{name}#{uniq}" class="anchor" href="##{name}#{uniq}"><span class="mini-icon mini-icon-link"></span></a>})
22
+ end
23
+ end
24
+ doc
25
+ end
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,5 @@
1
+ module HTML
2
+ class Pipeline
3
+ VERSION = "0.0.4"
4
+ end
5
+ end
@@ -0,0 +1,22 @@
1
+ require "test_helper"
2
+
3
+ AutolinkFilter = HTML::Pipeline::AutolinkFilter
4
+
5
+ class HTML::Pipeline::AutolinkFilterTest < Test::Unit::TestCase
6
+ def test_uses_rinku_for_autolinking
7
+ # just try to parse a complicated piece of HTML
8
+ # that Rails auto_link cannot handle
9
+ assert_equal '<p>"<a href="http://www.github.com">http://www.github.com</a>"</p>',
10
+ AutolinkFilter.to_html('<p>"http://www.github.com"</p>')
11
+ end
12
+
13
+ def test_autolink_option
14
+ assert_equal '<p>"http://www.github.com"</p>',
15
+ AutolinkFilter.to_html('<p>"http://www.github.com"</p>', :autolink => false)
16
+ end
17
+
18
+ def test_autolink_flags
19
+ assert_equal '<p>"<a href="http://github">http://github</a>"</p>',
20
+ AutolinkFilter.to_html('<p>"http://github"</p>', :flags => Rinku::AUTOLINK_SHORT_DOMAINS)
21
+ end
22
+ end
@@ -0,0 +1,39 @@
1
+ require "test_helper"
2
+
3
+ class HTML::Pipeline::CamoFilterTest < Test::Unit::TestCase
4
+ CamoFilter = HTML::Pipeline::CamoFilter
5
+
6
+ def setup
7
+ @asset_proxy_url = 'https//assets.example.org'
8
+ @asset_proxy_secret_key = 'ssssh-secret'
9
+ @options = {
10
+ :asset_proxy => @asset_proxy_url,
11
+ :asset_proxy_secret_key => @asset_proxy_secret_key
12
+ }
13
+ end
14
+
15
+ def test_camouflaging_http_image_urls
16
+ orig = %(<p><img src="http://twitter.com/img.png"></p>)
17
+ assert_includes 'img src="' + @asset_proxy_url,
18
+ CamoFilter.call(orig, @options).to_s
19
+ end
20
+
21
+ def test_rewrites_dotcom_image_urls
22
+ orig = %(<p><img src="http://github.com/img.png"></p>)
23
+ assert_equal "<p><img src=\"https://github.com/img.png\"></p>",
24
+ CamoFilter.call(orig, @options).to_s
25
+ end
26
+
27
+ def test_not_camouflaging_https_image_urls
28
+ orig = %(<p><img src="https://foo.com/img.png"></p>)
29
+ assert_doesnt_include 'img src="' + @asset_proxy_url,
30
+ CamoFilter.call(orig, @options).to_s
31
+ end
32
+
33
+ def test_handling_images_with_no_src_attribute
34
+ orig = %(<p><img></p>)
35
+ assert_nothing_raised do
36
+ CamoFilter.call(orig, @options).to_s
37
+ end
38
+ end
39
+ end
@@ -0,0 +1,16 @@
1
+ require 'test_helper'
2
+
3
+ class HTML::Pipeline::EmojiFilterTest < Test::Unit::TestCase
4
+ def test_emojify
5
+ filter = HTML::Pipeline::EmojiFilter.new("<p>:shipit:</p>", {:asset_root => 'https://foo.com'})
6
+ doc = filter.call
7
+ assert_match "https://foo.com/emoji/shipit.png", doc.search('img').attr('src').value
8
+ end
9
+
10
+ def test_missing_context
11
+ filter = HTML::Pipeline::EmojiFilter.new("<p>:shipit:</p>", {})
12
+ assert_raises ArgumentError do
13
+ filter.call
14
+ end
15
+ end
16
+ end
@@ -0,0 +1,50 @@
1
+ require "test_helper"
2
+
3
+ class HTML::Pipeline::ImageMaxWidthFilterTest < Test::Unit::TestCase
4
+ def filter(html)
5
+ HTML::Pipeline::ImageMaxWidthFilter.call(html)
6
+ end
7
+
8
+ def test_rewrites_image_style_tags
9
+ body = "<p>Screenshot: <img src='screenshot.png'></p>"
10
+ doc = Nokogiri::HTML::DocumentFragment.parse(body)
11
+
12
+ res = filter(doc)
13
+ assert_equal_html %q(<p>Screenshot: <a target="_blank" href="screenshot.png"><img src="screenshot.png" style="max-width:100%;"></a></p>),
14
+ res.to_html
15
+ end
16
+
17
+ def test_leaves_existing_image_style_tags_alone
18
+ body = "<p><img src='screenshot.png' style='width:100px;'></p>"
19
+ doc = Nokogiri::HTML::DocumentFragment.parse(body)
20
+
21
+ res = filter(doc)
22
+ assert_equal_html '<p><img src="screenshot.png" style="width:100px;"></p>',
23
+ res.to_html
24
+ end
25
+
26
+ def test_links_to_image
27
+ body = "<p>Screenshot: <img src='screenshot.png'></p>"
28
+ doc = Nokogiri::HTML::DocumentFragment.parse(body)
29
+
30
+ res = filter(doc)
31
+ assert_equal_html '<p>Screenshot: <a target="_blank" href="screenshot.png"><img src="screenshot.png" style="max-width:100%;"></a></p>',
32
+ res.to_html
33
+ end
34
+
35
+ def test_doesnt_link_to_image_when_already_linked
36
+ body = "<p>Screenshot: <a href='blah.png'><img src='screenshot.png'></a></p>"
37
+ doc = Nokogiri::HTML::DocumentFragment.parse(body)
38
+
39
+ res = filter(doc)
40
+ assert_equal_html %q(<p>Screenshot: <a href="blah.png"><img src="screenshot.png" style="max-width:100%;"></a></p>),
41
+ res.to_html
42
+ end
43
+
44
+ def test_doesnt_screw_up_inlined_images
45
+ body = "<p>Screenshot <img src='screenshot.png'>, yes, this is a <b>screenshot</b> indeed.</p>"
46
+ doc = Nokogiri::HTML::DocumentFragment.parse(body)
47
+
48
+ assert_equal_html %q(<p>Screenshot <a target="_blank" href="screenshot.png"><img src="screenshot.png" style="max-width:100%;"></a>, yes, this is a <b>screenshot</b> indeed.</p>), filter(doc).to_html
49
+ end
50
+ end
@@ -0,0 +1,101 @@
1
+ require "test_helper"
2
+
3
+ MarkdownFilter = HTML::Pipeline::MarkdownFilter
4
+
5
+ class HTML::Pipeline::MarkdownFilterTest < Test::Unit::TestCase
6
+ def setup
7
+ @haiku =
8
+ "Pointing at the moon\n" +
9
+ "Reminded of simple things\n" +
10
+ "Moments matter most"
11
+ @links =
12
+ "See http://example.org/ for more info"
13
+ @code =
14
+ "```\n" +
15
+ "def hello()" +
16
+ " 'world'" +
17
+ "end" +
18
+ "```"
19
+ end
20
+
21
+ def test_fails_when_given_a_documentfragment
22
+ body = "<p>heyo</p>"
23
+ doc = HTML::Pipeline.parse(body)
24
+ assert_raise(TypeError) { MarkdownFilter.call(doc, {}) }
25
+ end
26
+
27
+ def test_gfm_enabled_by_default
28
+ doc = MarkdownFilter.to_document(@haiku, {})
29
+ assert doc.kind_of?(HTML::Pipeline::DocumentFragment)
30
+ assert_equal 2, doc.search('br').size
31
+ end
32
+
33
+ def test_disabling_gfm
34
+ doc = MarkdownFilter.to_document(@haiku, :gfm => false)
35
+ assert doc.kind_of?(HTML::Pipeline::DocumentFragment)
36
+ assert_equal 0, doc.search('br').size
37
+ end
38
+
39
+ def test_fenced_code_blocks
40
+ doc = MarkdownFilter.to_document(@code)
41
+ assert doc.kind_of?(HTML::Pipeline::DocumentFragment)
42
+ assert_equal 1, doc.search('pre').size
43
+ end
44
+
45
+ def test_fenced_code_blocks_with_language
46
+ doc = MarkdownFilter.to_document(@code.sub("```", "``` ruby"))
47
+ assert doc.kind_of?(HTML::Pipeline::DocumentFragment)
48
+ assert_equal 1, doc.search('pre').size
49
+ assert_equal 'ruby', doc.search('pre').first['lang']
50
+ end
51
+ end
52
+
53
+ class GFMTest < Test::Unit::TestCase
54
+ def gfm(text)
55
+ MarkdownFilter.call(text, :gfm => true)
56
+ end
57
+
58
+ def test_not_touch_single_underscores_inside_words
59
+ assert_equal "<p>foo_bar</p>",
60
+ gfm("foo_bar")
61
+ end
62
+
63
+ def test_not_touch_underscores_in_code_blocks
64
+ assert_equal "<pre><code>foo_bar_baz\n</code></pre>",
65
+ gfm(" foo_bar_baz")
66
+ end
67
+
68
+ def test_not_touch_underscores_in_pre_blocks
69
+ assert_equal "<pre>\nfoo_bar_baz\n</pre>",
70
+ gfm("<pre>\nfoo_bar_baz\n</pre>")
71
+ end
72
+
73
+ def test_not_touch_two_or_more_underscores_inside_words
74
+ assert_equal "<p>foo_bar_baz</p>",
75
+ gfm("foo_bar_baz")
76
+ end
77
+
78
+ def test_turn_newlines_into_br_tags_in_simple_cases
79
+ assert_equal "<p>foo<br>\nbar</p>",
80
+ gfm("foo\nbar")
81
+ end
82
+
83
+ def test_convert_newlines_in_all_groups
84
+ assert_equal "<p>apple<br>\npear<br>\norange</p>\n\n" +
85
+ "<p>ruby<br>\npython<br>\nerlang</p>",
86
+ gfm("apple\npear\norange\n\nruby\npython\nerlang")
87
+ end
88
+
89
+ def test_convert_newlines_in_even_long_groups
90
+ assert_equal "<p>apple<br>\npear<br>\norange<br>\nbanana</p>\n\n" +
91
+ "<p>ruby<br>\npython<br>\nerlang</p>",
92
+ gfm("apple\npear\norange\nbanana\n\nruby\npython\nerlang")
93
+ end
94
+
95
+ def test_not_convert_newlines_in_lists
96
+ assert_equal "<h1>foo</h1>\n\n<h1>bar</h1>",
97
+ gfm("# foo\n# bar")
98
+ assert_equal "<ul>\n<li>foo</li>\n<li>bar</li>\n</ul>",
99
+ gfm("* foo\n* bar")
100
+ end
101
+ end