html-pipeline-no-charlock 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38) hide show
  1. data/.gitignore +19 -0
  2. data/.travis.yml +13 -0
  3. data/CHANGELOG.md +16 -0
  4. data/Gemfile +9 -0
  5. data/LICENSE +22 -0
  6. data/README.md +221 -0
  7. data/Rakefile +13 -0
  8. data/html-pipeline-no-charlock.gemspec +25 -0
  9. data/html-pipeline.gemspec +26 -0
  10. data/lib/html/pipeline.rb +130 -0
  11. data/lib/html/pipeline/@mention_filter.rb +118 -0
  12. data/lib/html/pipeline/autolink_filter.rb +22 -0
  13. data/lib/html/pipeline/body_content.rb +42 -0
  14. data/lib/html/pipeline/camo_filter.rb +70 -0
  15. data/lib/html/pipeline/email_reply_filter.rb +56 -0
  16. data/lib/html/pipeline/emoji_filter.rb +54 -0
  17. data/lib/html/pipeline/filter.rb +178 -0
  18. data/lib/html/pipeline/https_filter.rb +13 -0
  19. data/lib/html/pipeline/image_max_width_filter.rb +37 -0
  20. data/lib/html/pipeline/markdown_filter.rb +29 -0
  21. data/lib/html/pipeline/plain_text_input_filter.rb +11 -0
  22. data/lib/html/pipeline/sanitization_filter.rb +105 -0
  23. data/lib/html/pipeline/syntax_highlight_filter.rb +29 -0
  24. data/lib/html/pipeline/text_filter.rb +14 -0
  25. data/lib/html/pipeline/textile_filter.rb +21 -0
  26. data/lib/html/pipeline/toc_filter.rb +28 -0
  27. data/lib/html/pipeline/version.rb +5 -0
  28. data/test/html/pipeline/autolink_filter_test.rb +22 -0
  29. data/test/html/pipeline/camo_filter_test.rb +47 -0
  30. data/test/html/pipeline/emoji_filter_test.rb +18 -0
  31. data/test/html/pipeline/image_max_width_filter_test.rb +50 -0
  32. data/test/html/pipeline/markdown_filter_test.rb +101 -0
  33. data/test/html/pipeline/mention_filter_test.rb +158 -0
  34. data/test/html/pipeline/plain_text_input_filter_test.rb +22 -0
  35. data/test/html/pipeline/sanitization_filter_test.rb +47 -0
  36. data/test/html/pipeline/toc_filter_test.rb +47 -0
  37. data/test/test_helper.rb +38 -0
  38. metadata +214 -0
@@ -0,0 +1,13 @@
module HTML
  class Pipeline
    # HTML filter that upgrades links pointing at http://github.com to their
    # https equivalents.
    class HttpsFilter < Filter
      # Rewrites the href of every matching anchor in place and returns the
      # document.
      def call
        insecure_links = doc.css('a[href^="http://github.com"]')
        insecure_links.each do |anchor|
          href = anchor['href']
          anchor['href'] = href.sub(/^http:/,'https:')
        end
        doc
      end
    end
  end
end
@@ -0,0 +1,37 @@
module HTML
  class Pipeline
    # Adds a max-width inline style to image tags and wraps each image in an
    # <a> tag so that the full size image opens in a new tab.
    #
    # Inline max-width styles are especially useful in HTML email, which
    # doesn't use a global stylesheet.
    class ImageMaxWidthFilter < Filter
      # Styles (and, when not already inside a link, wraps) every <img> in the
      # document, then returns the document.
      def call
        doc.search('img').each do |img|
          # Images that already carry a style attribute are left untouched.
          next if img['style']

          # Skip sources that start with "javascript" so we never build a
          # link that could inject script.
          next if img['src'].to_s.strip =~ /\Ajavascript/i

          img['style'] = "max-width:100%;"

          link_image(img) unless has_ancestor?(img, %w(a))
        end

        doc
      end

      # Replaces the image with an <a target="_blank"> element that points at
      # the image's src and contains a copy of the image.
      def link_image(element)
        wrapper = doc.document.create_element('a', :href => element['src'], :target => '_blank')
        wrapper.add_child(element.dup)
        element.replace(wrapper)
      end
    end
  end
end
@@ -0,0 +1,29 @@
require 'github/markdown'

module HTML
  class Pipeline
    # Text filter that renders Markdown into HTML. Unlike most filters it takes
    # non-HTML input, so it must be the first filter in a pipeline.
    #
    # Context options:
    #   :gfm => false    Disable GFM line-end processing
    #
    # This filter does not write any additional information to the context hash.
    class MarkdownFilter < TextFilter
      # Stores the text with every carriage return removed.
      def initialize(text, context = nil, result = nil)
        super(text, context, result)
        @text = @text.delete("\r")
      end

      # Renders the text with GitHub::Markdown and returns the resulting HTML
      # string with trailing whitespace stripped. GFM mode is used unless the
      # context explicitly sets :gfm to false.
      def call
        mode = context[:gfm] == false ? :markdown : :gfm
        rendered = GitHub::Markdown.to_html(@text, mode)
        rendered.rstrip!
        rendered
      end
    end
  end
end
@@ -0,0 +1,11 @@
module HTML
  class Pipeline
    # Filter for plain text input: HTML-escapes the text and wraps the result
    # in a <div>.
    class PlainTextInputFilter < TextFilter
      # Returns the escaped text wrapped in a div, as a String.
      def call
        escaped = EscapeUtils.escape_html(@text, false)
        "<div>#{escaped}</div>"
      end
    end
  end
end
@@ -0,0 +1,105 @@
require 'sanitize'

module HTML
  class Pipeline
    # HTML filter with sanitization routines and whitelists. It defines which
    # HTML is allowed in user provided content and fixes up issues such as
    # unbalanced tags.
    #
    # See the Sanitize docs for more information on the underlying library:
    #
    # https://github.com/rgrove/sanitize/#readme
    #
    # Context options:
    #   :whitelist - The sanitizer whitelist configuration to use. This can be
    #                one of the options constants defined in this class or a
    #                custom sanitize options hash.
    #
    # This filter does not write additional information to the context.
    class SanitizationFilter < Filter
      LISTS     = Set.new(%w(ul ol).freeze)
      LIST_ITEM = 'li'.freeze

      # Table child elements. They must be contained by a <table> element or
      # they are not allowed through; otherwise they could be used to break out
      # of places where tables contain formatted user content (like pull
      # request review comments).
      TABLE_ITEMS = Set.new(%w(tr td th).freeze)
      TABLE       = 'table'.freeze

      # The main sanitization whitelist. Only these elements and attributes are
      # allowed through by default.
      WHITELIST = {
        :elements => %w(
          h1 h2 h3 h4 h5 h6 h7 h8 br b i strong em a pre code img tt
          div ins del sup sub p ol ul table blockquote dl dt dd
          kbd q samp var hr ruby rt rp li tr td th
        ),
        :attributes => {
          'a'   => ['href'],
          'img' => ['src'],
          'div' => ['itemscope', 'itemtype'],
          :all  => %w(
            abbr accept accept-charset
            accesskey action align alt axis
            border cellpadding cellspacing char
            charoff charset checked cite
            clear cols colspan color
            compact coords datetime dir
            disabled enctype for frame
            headers height hreflang
            hspace ismap label lang
            longdesc maxlength media method
            multiple name nohref noshade
            nowrap prompt readonly rel rev
            rows rowspan rules scope
            selected shape size span
            start summary tabindex target
            title type usemap valign value
            vspace width itemprop
          )
        },
        :protocols => {
          'a'   => {'href' => ['http', 'https', 'mailto', :relative, 'github-windows', 'github-mac']},
          'img' => {'src'  => ['http', 'https', :relative]}
        },
        :transformers => [
          # <li> elements without a <ul>/<ol> ancestor are unwrapped (replaced
          # by their children) because they can break out of containing markup.
          lambda do |env|
            node = env[:node]
            node.replace(node.children) if env[:node_name] == LIST_ITEM &&
              node.ancestors.none? { |ancestor| LISTS.include?(ancestor.name) }
          end,

          # Table child elements without a <table> ancestor are unwrapped the
          # same way.
          lambda do |env|
            node = env[:node]
            node.replace(node.children) if TABLE_ITEMS.include?(env[:node_name]) &&
              node.ancestors.none? { |ancestor| ancestor.name == TABLE }
          end
        ]
      }

      # A more limited sanitization whitelist. It keeps the attributes,
      # protocols, and transformers from WHITELIST but allows a more locked
      # down set of elements.
      LIMITED = WHITELIST.merge(
        :elements => %w(b i strong em a pre code img ins del sup sub p ol ul li))

      # Strip all HTML tags from the document.
      FULL = { :elements => [] }

      # Sanitize markup using the Sanitize library.
      def call
        Sanitize.clean_node!(doc, whitelist)
      end

      # The whitelist to use when sanitizing: the :whitelist context value when
      # present, otherwise the WHITELIST constant above.
      def whitelist
        context[:whitelist] || WHITELIST
      end
    end
  end
end
@@ -0,0 +1,29 @@
require 'linguist'

module HTML
  class Pipeline
    # HTML filter that syntax highlights code blocks wrapped
    # in <pre lang="...">.
    class SyntaxHighlightFilter < Filter
      # Replaces each <pre> that has a lang attribute with a known lexer by its
      # highlighted markup. Blocks without a lang, without a matching lexer, or
      # whose highlighting timed out are left as they are. Returns the document.
      def call
        doc.search('pre').each do |pre|
          lang = pre['lang']
          next unless lang

          lexer = Pygments::Lexer[lang]
          next unless lexer

          source = pre.inner_text
          highlighted = highlight_with_timeout_handling(lexer, source)
          next if highlighted.nil?

          pre.replace(highlighted)
        end
        doc
      end

      # Highlights text with the given lexer; returns nil instead of raising
      # when the highlighter times out.
      def highlight_with_timeout_handling(lexer, text)
        lexer.highlight(text)
      rescue Timeout::Error
        nil
      end
    end
  end
end
@@ -0,0 +1,14 @@
module HTML
  class Pipeline
    # Base class for filters whose input is text rather than HTML.
    class TextFilter < Filter
      attr_reader :text

      # text    - the input; anything other than a DocumentFragment is accepted
      #           and coerced to a String.
      # context - passed through to Filter#initialize.
      # result  - passed through to Filter#initialize.
      #
      # Raises TypeError when text is a DocumentFragment.
      def initialize(text, context = nil, result = nil)
        if text.is_a?(DocumentFragment)
          raise TypeError, "text cannot be HTML"
        end

        # Always keep a real String: prefer implicit conversion (to_str) and
        # fall back to to_s.
        @text =
          if text.respond_to?(:to_str)
            text.to_str
          else
            text.to_s
          end

        # The superclass is handed nil in place of a document.
        super(nil, context, result)
      end
    end
  end
end
@@ -0,0 +1,21 @@
module HTML
  class Pipeline
    # Text filter that renders Textile into HTML. Unlike most filters it takes
    # non-HTML input, so it must be the first filter in a pipeline.
    #
    # Context options:
    #   :autolink => false    Disable autolinking URLs
    #
    # NOTE(review): the :autolink option is documented here but is not read
    # anywhere in this class - confirm whether it is still supported.
    #
    # This filter does not write any additional information to the context hash.
    #
    # NOTE This filter is provided for really old comments only. It probably
    # shouldn't be used for anything new.
    class TextileFilter < TextFilter
      # Renders the text with RedCloth and returns the resulting HTML.
      def call
        textile = RedCloth.new(@text)
        textile.to_html
      end
    end
  end
end
@@ -0,0 +1,28 @@
module HTML
  class Pipeline
    # HTML filter that inserts a named anchor at the start of every header
    # (h1-h6) in a document, so the headers can be linked to from a table of
    # contents.
    #
    # TODO: besides adding the anchors, we should get around to eventually
    # generating the Table of Contents itself, with links to each header.
    class TableOfContentsFilter < Filter
      # Characters removed from header text when building an anchor name:
      # anything that is not a word character, a dash or a space.
      #
      # On Ruby 1.9+ a plain \w only matches ASCII, which would strip every
      # non-ASCII letter and leave headers written in other scripts with empty
      # (and therefore colliding) anchor names. \p{Word} keeps Unicode word
      # characters; they are percent-encoded later by escape_uri. Ruby 1.8
      # keeps the original pattern.
      PUNCTUATION_REGEXP = RUBY_VERSION > "1.9" ? /[^\p{Word}\- ]/u : /[^\w\- ]/

      # Adds an anchor to each header that has at least one child node and
      # returns the document. Repeated header names get a numeric suffix
      # ("-1", "-2", ...) so anchor names stay unique.
      def call
        headers = Hash.new(0)
        doc.css('h1, h2, h3, h4, h5, h6').each do |node|
          name = node.text.downcase
          name.gsub!(PUNCTUATION_REGEXP, '') # remove punctuation
          name.gsub!(' ', '-') # replace spaces with dash
          name = EscapeUtils.escape_uri(name) # escape extended UTF-8 chars

          # The first occurrence of a name gets no suffix; later ones get the
          # number of earlier occurrences.
          uniq = (headers[name] > 0) ? "-#{headers[name]}" : ''
          headers[name] += 1
          if header_content = node.children.first
            header_content.add_previous_sibling(%Q{<a name="#{name}#{uniq}" class="anchor" href="##{name}#{uniq}"><span class="mini-icon mini-icon-link"></span></a>})
          end
        end
        doc
      end
    end
  end
end
@@ -0,0 +1,5 @@
module HTML
  class Pipeline
    # Version of the gem. Frozen so the shared constant cannot be mutated in
    # place by accident.
    VERSION = "0.0.6".freeze
  end
end
@@ -0,0 +1,22 @@
require "test_helper"

AutolinkFilter = HTML::Pipeline::AutolinkFilter

class HTML::Pipeline::AutolinkFilterTest < Test::Unit::TestCase
  # Just try to parse a complicated piece of HTML that Rails auto_link
  # cannot handle.
  def test_uses_rinku_for_autolinking
    input    = '<p>"http://www.github.com"</p>'
    expected = '<p>"<a href="http://www.github.com">http://www.github.com</a>"</p>'

    assert_equal expected, AutolinkFilter.to_html(input)
  end

  # With :autolink => false the markup passes through unchanged.
  def test_autolink_option
    input = '<p>"http://www.github.com"</p>'

    assert_equal '<p>"http://www.github.com"</p>',
      AutolinkFilter.to_html(input, :autolink => false)
  end

  # Rinku flags given through :flags are honoured (here: short domains).
  def test_autolink_flags
    input    = '<p>"http://github"</p>'
    expected = '<p>"<a href="http://github">http://github</a>"</p>'

    assert_equal expected,
      AutolinkFilter.to_html(input, :flags => Rinku::AUTOLINK_SHORT_DOMAINS)
  end
end
@@ -0,0 +1,47 @@
require "test_helper"

class HTML::Pipeline::CamoFilterTest < Test::Unit::TestCase
  CamoFilter = HTML::Pipeline::CamoFilter

  # Builds the context shared by every test: the asset proxy base URL and the
  # secret key.
  def setup
    # Was 'https//assets.example.org' (missing colon), which is not a URL.
    @asset_proxy_url        = 'https://assets.example.org'
    @asset_proxy_secret_key = 'ssssh-secret'
    @options = {
      :asset_proxy            => @asset_proxy_url,
      :asset_proxy_secret_key => @asset_proxy_secret_key
    }
  end

  # NOTE(review): assert_includes / assert_doesnt_include are called as
  # (needle, haystack) - presumably custom helpers from test_helper; confirm.
  def test_camouflaging_http_image_urls
    orig = %(<p><img src="http://twitter.com/img.png"></p>)
    assert_includes 'img src="' + @asset_proxy_url,
      CamoFilter.call(orig, @options).to_s
  end

  def test_rewrites_dotcom_image_urls
    orig = %(<p><img src="http://github.com/img.png"></p>)
    assert_equal "<p><img src=\"https://github.com/img.png\"></p>",
      CamoFilter.call(orig, @options).to_s
  end

  def test_not_camouflaging_https_image_urls
    orig = %(<p><img src="https://foo.com/img.png"></p>)
    assert_doesnt_include 'img src="' + @asset_proxy_url,
      CamoFilter.call(orig, @options).to_s
  end

  def test_handling_images_with_no_src_attribute
    orig = %(<p><img></p>)
    assert_nothing_raised do
      CamoFilter.call(orig, @options).to_s
    end
  end

  # Both required context keys must be named in the error message. The [^_]
  # keeps :asset_proxy from being satisfied by :asset_proxy_secret_key.
  def test_required_context_validation
    exception = assert_raise(ArgumentError) {
      CamoFilter.call("", {})
    }
    # Parentheses avoid Ruby's "ambiguous first argument" warning for a
    # regexp literal passed to a paren-less call.
    assert_match(/:asset_proxy[^_]/, exception.message)
    assert_match(/:asset_proxy_secret_key/, exception.message)
  end
end
@@ -0,0 +1,18 @@
require 'test_helper'

class HTML::Pipeline::EmojiFilterTest < Test::Unit::TestCase
  EmojiFilter = HTML::Pipeline::EmojiFilter

  # An emoji shortcode becomes an <img> whose src is built from :asset_root.
  def test_emojify
    context = {:asset_root => 'https://foo.com'}
    result  = EmojiFilter.new("<p>:shipit:</p>", context).call
    src     = result.search('img').attr('src').value

    assert_match "https://foo.com/emoji/shipit.png", src
  end

  # Calling the filter without :asset_root raises and names the missing key.
  def test_required_context_validation
    exception = assert_raise(ArgumentError) do
      EmojiFilter.call("", {})
    end
    assert_match(/:asset_root/, exception.message)
  end
end
@@ -0,0 +1,50 @@
require "test_helper"

class HTML::Pipeline::ImageMaxWidthFilterTest < Test::Unit::TestCase
  # Runs the filter under test over the given HTML.
  def filter(html)
    HTML::Pipeline::ImageMaxWidthFilter.call(html)
  end

  # Parses body into a fragment, filters it and returns the resulting HTML.
  def filtered_html(body)
    fragment = Nokogiri::HTML::DocumentFragment.parse(body)
    filter(fragment).to_html
  end

  def test_rewrites_image_style_tags
    actual = filtered_html("<p>Screenshot: <img src='screenshot.png'></p>")

    assert_equal_html %q(<p>Screenshot: <a target="_blank" href="screenshot.png"><img src="screenshot.png" style="max-width:100%;"></a></p>),
      actual
  end

  def test_leaves_existing_image_style_tags_alone
    actual = filtered_html("<p><img src='screenshot.png' style='width:100px;'></p>")

    assert_equal_html '<p><img src="screenshot.png" style="width:100px;"></p>',
      actual
  end

  def test_links_to_image
    actual = filtered_html("<p>Screenshot: <img src='screenshot.png'></p>")

    assert_equal_html '<p>Screenshot: <a target="_blank" href="screenshot.png"><img src="screenshot.png" style="max-width:100%;"></a></p>',
      actual
  end

  def test_doesnt_link_to_image_when_already_linked
    actual = filtered_html("<p>Screenshot: <a href='blah.png'><img src='screenshot.png'></a></p>")

    assert_equal_html %q(<p>Screenshot: <a href="blah.png"><img src="screenshot.png" style="max-width:100%;"></a></p>),
      actual
  end

  def test_doesnt_screw_up_inlined_images
    actual = filtered_html("<p>Screenshot <img src='screenshot.png'>, yes, this is a <b>screenshot</b> indeed.</p>")

    assert_equal_html %q(<p>Screenshot <a target="_blank" href="screenshot.png"><img src="screenshot.png" style="max-width:100%;"></a>, yes, this is a <b>screenshot</b> indeed.</p>), actual
  end
end