html-pipeline-no-charlock 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. data/.gitignore +19 -0
  2. data/.travis.yml +13 -0
  3. data/CHANGELOG.md +16 -0
  4. data/Gemfile +9 -0
  5. data/LICENSE +22 -0
  6. data/README.md +221 -0
  7. data/Rakefile +13 -0
  8. data/html-pipeline-no-charlock.gemspec +25 -0
  9. data/html-pipeline.gemspec +26 -0
  10. data/lib/html/pipeline.rb +130 -0
  11. data/lib/html/pipeline/@mention_filter.rb +118 -0
  12. data/lib/html/pipeline/autolink_filter.rb +22 -0
  13. data/lib/html/pipeline/body_content.rb +42 -0
  14. data/lib/html/pipeline/camo_filter.rb +70 -0
  15. data/lib/html/pipeline/email_reply_filter.rb +56 -0
  16. data/lib/html/pipeline/emoji_filter.rb +54 -0
  17. data/lib/html/pipeline/filter.rb +178 -0
  18. data/lib/html/pipeline/https_filter.rb +13 -0
  19. data/lib/html/pipeline/image_max_width_filter.rb +37 -0
  20. data/lib/html/pipeline/markdown_filter.rb +29 -0
  21. data/lib/html/pipeline/plain_text_input_filter.rb +11 -0
  22. data/lib/html/pipeline/sanitization_filter.rb +105 -0
  23. data/lib/html/pipeline/syntax_highlight_filter.rb +29 -0
  24. data/lib/html/pipeline/text_filter.rb +14 -0
  25. data/lib/html/pipeline/textile_filter.rb +21 -0
  26. data/lib/html/pipeline/toc_filter.rb +28 -0
  27. data/lib/html/pipeline/version.rb +5 -0
  28. data/test/html/pipeline/autolink_filter_test.rb +22 -0
  29. data/test/html/pipeline/camo_filter_test.rb +47 -0
  30. data/test/html/pipeline/emoji_filter_test.rb +18 -0
  31. data/test/html/pipeline/image_max_width_filter_test.rb +50 -0
  32. data/test/html/pipeline/markdown_filter_test.rb +101 -0
  33. data/test/html/pipeline/mention_filter_test.rb +158 -0
  34. data/test/html/pipeline/plain_text_input_filter_test.rb +22 -0
  35. data/test/html/pipeline/sanitization_filter_test.rb +47 -0
  36. data/test/html/pipeline/toc_filter_test.rb +47 -0
  37. data/test/test_helper.rb +38 -0
  38. metadata +214 -0
@@ -0,0 +1,13 @@
module HTML
  class Pipeline
    # HTML filter that switches links to http://github.com over to the https
    # scheme. Only <a> elements whose href starts with "http://github.com" are
    # touched; everything else in the document is left alone.
    class HttpsFilter < Filter
      # Rewrites the matching anchors in place and returns the document.
      def call
        insecure_links = doc.css('a[href^="http://github.com"]')
        insecure_links.each do |anchor|
          secure_href = anchor['href'].sub(/^http:/, 'https:')
          anchor['href'] = secure_href
        end
        doc
      end
    end
  end
end
@@ -0,0 +1,37 @@
module HTML
  class Pipeline
    # Adds a max-width inline style to image tags and wraps each image in an
    # <a target="_blank"> pointing at the image source, unless the image is
    # already inside a link.
    #
    # The max-width inline styles are especially useful in HTML email, which
    # doesn't use a global stylesheet.
    class ImageMaxWidthFilter < Filter
      # Processes every <img> in the document and returns the document.
      def call
        doc.search('img').each do |image|
          # Two reasons to leave an image untouched:
          #  * it already has a style attribute (not sure how this would
          #    happen, but it can be reconsidered in the future);
          #  * its src starts with "javascript", to avoid weird js injection
          #    via javascript: urls.
          skip = image['style'] || image['src'].to_s.strip =~ /\Ajavascript/i
          next if skip

          image['style'] = "max-width:100%;"

          # Only add a link when has_ancestor? reports no enclosing <a>.
          link_image(image) unless has_ancestor?(image, %w(a))
        end

        doc
      end

      # Replaces +element+ with a new <a> whose href is the element's src and
      # whose target is "_blank"; the anchor contains a copy of the element.
      def link_image(element)
        wrapper = doc.document.create_element('a', :href => element['src'], :target => '_blank')
        wrapper.add_child(element.dup)
        element.replace(wrapper)
      end
    end
  end
end
@@ -0,0 +1,29 @@
1
+ require 'github/markdown'
2
+
module HTML
  class Pipeline
    # Text filter that renders Markdown source to HTML with GitHub::Markdown.
    # Unlike most filters it takes non-HTML text as input, so it must be used
    # as the first filter in a pipeline.
    #
    # Context options:
    #   :gfm => false    Disable GFM line-end processing
    #
    # This filter does not write any additional information to the context hash.
    class MarkdownFilter < TextFilter
      # Stores the text via TextFilter and then drops every carriage return
      # from it.
      def initialize(text, context = nil, result = nil)
        super(text, context, result)
        @text = @text.gsub("\r", '')
      end

      # Renders @text in :gfm mode unless the context sets :gfm to false, in
      # which case plain :markdown mode is used. Returns the HTML string with
      # trailing whitespace removed.
      def call
        mode = context[:gfm] == false ? :markdown : :gfm
        rendered = GitHub::Markdown.to_html(@text, mode)
        rendered.rstrip!
        rendered
      end
    end
  end
end
@@ -0,0 +1,11 @@
module HTML
  class Pipeline
    # Simple filter for plain text input: the text is HTML-escaped and the
    # result is wrapped in a <div>.
    class PlainTextInputFilter < TextFilter
      # Returns the escaped text surrounded by <div>...</div> as a String.
      def call
        escaped = EscapeUtils.escape_html(@text, false)
        "<div>#{escaped}</div>"
      end
    end
  end
end
@@ -0,0 +1,105 @@
1
+ require 'sanitize'
2
+
module HTML
  class Pipeline
    # HTML filter with sanitization routines and whitelists. This class defines
    # what HTML is allowed in user provided content and fixes up issues with
    # unbalanced tags and whatnot.
    #
    # See the Sanitize docs for more information on the underlying library:
    #
    # https://github.com/rgrove/sanitize/#readme
    #
    # Context options:
    #   :whitelist - The sanitizer whitelist configuration to use. This can be
    #                one of the options constants defined in this class or a
    #                custom sanitize options hash.
    #
    # This filter does not write additional information to the context.
    class SanitizationFilter < Filter
      # Elements that may legitimately contain an <li>.
      LISTS     = Set.new(%w(ul ol).freeze)
      LIST_ITEM = 'li'.freeze

      # List of table child elements. These must be contained by a <table>
      # element or they are not allowed through. Otherwise they can be used to
      # break out of places we're using tables to contain formatted user
      # content (like pull request review comments).
      TABLE_ITEMS = Set.new(%w(tr td th).freeze)
      TABLE       = 'table'.freeze

      # The main sanitization whitelist. Only these elements and attributes
      # are allowed through by default.
      WHITELIST = {
        :elements => %w(
          h1 h2 h3 h4 h5 h6 h7 h8 br b i strong em a pre code img tt
          div ins del sup sub p ol ul table blockquote dl dt dd
          kbd q samp var hr ruby rt rp li tr td th
        ),
        :attributes => {
          'a'   => %w(href),
          'img' => %w(src),
          'div' => %w(itemscope itemtype),
          :all  => %w(
            abbr accept accept-charset
            accesskey action align alt axis
            border cellpadding cellspacing char
            charoff charset checked cite
            clear cols colspan color
            compact coords datetime dir
            disabled enctype for frame
            headers height hreflang
            hspace ismap label lang
            longdesc maxlength media method
            multiple name nohref noshade
            nowrap prompt readonly rel rev
            rows rowspan rules scope
            selected shape size span
            start summary tabindex target
            title type usemap valign value
            vspace width itemprop
          )
        },
        :protocols => {
          'a'   => { 'href' => ['http', 'https', 'mailto', :relative, 'github-windows', 'github-mac'] },
          'img' => { 'src'  => ['http', 'https', :relative] }
        },
        :transformers => [
          # Top-level <li> elements are removed because they can break out of
          # containing markup. The element is replaced by its own children.
          lambda do |env|
            node = env[:node]
            orphaned = env[:node_name] == LIST_ITEM &&
              node.ancestors.none? { |ancestor| LISTS.include?(ancestor.name) }
            node.replace(node.children) if orphaned
          end,

          # Table child elements that are not contained by a <table> are
          # removed in the same way.
          lambda do |env|
            node = env[:node]
            orphaned = TABLE_ITEMS.include?(env[:node_name]) &&
              node.ancestors.none? { |ancestor| ancestor.name == TABLE }
            node.replace(node.children) if orphaned
          end
        ]
      }

      # A more limited sanitization whitelist. This includes all attributes,
      # protocols, and transformers from WHITELIST but with a more locked down
      # set of allowed elements.
      LIMITED = WHITELIST.merge(
        :elements => %w(b i strong em a pre code img ins del sup sub p ol ul li)
      )

      # Strip all HTML tags from the document.
      FULL = { :elements => [] }

      # Sanitize markup using the Sanitize library, applying #whitelist to
      # the document.
      def call
        Sanitize.clean_node!(doc, whitelist)
      end

      # The whitelist to use when sanitizing: the :whitelist entry of the
      # context hash when present, otherwise the WHITELIST constant above.
      def whitelist
        context[:whitelist] || WHITELIST
      end
    end
  end
end
@@ -0,0 +1,29 @@
1
+ require 'linguist'
2
+
module HTML
  class Pipeline
    # HTML filter that syntax highlights code blocks wrapped
    # in <pre lang="...">.
    class SyntaxHighlightFilter < Filter
      # Replaces every <pre> that has a lang attribute with a known Pygments
      # lexer by its highlighted markup, then returns the document. Blocks
      # without a lang, without a matching lexer, or whose highlighting timed
      # out are left as they are.
      def call
        doc.search('pre').each do |pre|
          language = pre['lang']
          next unless language

          lexer = Pygments::Lexer[language]
          next unless lexer

          highlighted = highlight_with_timeout_handling(lexer, pre.inner_text)
          pre.replace(highlighted) unless highlighted.nil?
        end
        doc
      end

      # Highlights +text+ with +lexer+; returns nil instead of raising when
      # the lexer raises Timeout::Error.
      def highlight_with_timeout_handling(lexer, text)
        begin
          lexer.highlight(text)
        rescue Timeout::Error
          nil
        end
      end
    end
  end
end
+ end
@@ -0,0 +1,14 @@
module HTML
  class Pipeline
    # Base class for filters whose input is plain text rather than HTML. The
    # text is kept in @text (exposed through #text) and nil is handed to
    # Filter#initialize in place of a document.
    class TextFilter < Filter
      attr_reader :text

      # text    - the input; anything but a DocumentFragment. It is converted
      #           with to_str when available, otherwise with to_s.
      # context - passed through to Filter#initialize unchanged.
      # result  - passed through to Filter#initialize unchanged.
      #
      # Raises TypeError when text is a DocumentFragment.
      def initialize(text, context = nil, result = nil)
        if text.is_a?(DocumentFragment)
          raise TypeError, "text cannot be HTML"
        end

        # Ensure that this is always a string
        @text =
          if text.respond_to?(:to_str)
            text.to_str
          else
            text.to_s
          end

        super(nil, context, result)
      end
    end
  end
end
@@ -0,0 +1,21 @@
module HTML
  class Pipeline
    # Text filter that renders Textile source to HTML with RedCloth. Unlike
    # most filters it takes non-HTML text as input, so it must be used as the
    # first filter in a pipeline.
    #
    # Context options:
    #   :autolink => false    Disable autolinking URLs
    #   NOTE(review): nothing in this class reads :autolink; confirm whether
    #   the option is honoured elsewhere or is stale documentation.
    #
    # This filter does not write any additional information to the context hash.
    #
    # NOTE This filter is provided for really old comments only. It probably
    # shouldn't be used for anything new.
    class TextileFilter < TextFilter
      # Convert the Textile in @text to HTML and return RedCloth's output.
      def call
        textile = RedCloth.new(@text)
        textile.to_html
      end
    end
  end
end
@@ -0,0 +1,28 @@
module HTML
  class Pipeline
    # HTML filter that puts a named anchor at the start of every h1-h6 header
    # in a document, so the headers can be linked to from a table of contents.
    #
    # TODO: besides adding the anchors, we should get around to eventually
    # generating the Table of Contents itself, with links to each header
    class TableOfContentsFilter < Filter
      # Inserts the anchors and returns the document.
      #
      # The anchor name is the header text, lowercased, stripped of every
      # character other than word characters, dashes and spaces, with spaces
      # turned into dashes, and finally URI-escaped. A repeated name gets
      # "-1", "-2", ... appended. Headers without child nodes get no anchor
      # but still count towards the repetition counter.
      def call
        seen = Hash.new(0)

        doc.css('h1, h2, h3, h4, h5, h6').each do |heading|
          slug = heading.text.downcase.
            gsub(/[^\w\- ]/, ''). # remove punctuation
            gsub(' ', '-')        # replace spaces with dash
          slug = EscapeUtils.escape_uri(slug) # escape extended UTF-8 chars

          occurrences = seen[slug]
          seen[slug] += 1
          suffix = occurrences > 0 ? "-#{occurrences}" : ''
          anchor = "#{slug}#{suffix}"

          first_child = heading.children.first
          next unless first_child

          first_child.add_previous_sibling(%Q{<a name="#{anchor}" class="anchor" href="##{anchor}"><span class="mini-icon mini-icon-link"></span></a>})
        end

        doc
      end
    end
  end
end
@@ -0,0 +1,5 @@
module HTML
  class Pipeline
    # Version string of this gem release.
    VERSION = '0.0.6'
  end
end
@@ -0,0 +1,22 @@
1
+ require "test_helper"
2
+
# Short alias so the tests below can refer to the filter without its namespace.
AutolinkFilter = HTML::Pipeline::AutolinkFilter

# Exercises AutolinkFilter.to_html with and without context options.
class HTML::Pipeline::AutolinkFilterTest < Test::Unit::TestCase
  def test_uses_rinku_for_autolinking
    # just try to parse a complicated piece of HTML
    # that Rails auto_link cannot handle
    input    = '<p>"http://www.github.com"</p>'
    expected = '<p>"<a href="http://www.github.com">http://www.github.com</a>"</p>'

    assert_equal expected, AutolinkFilter.to_html(input)
  end

  # With :autolink => false the markup must come back unchanged.
  def test_autolink_option
    input = '<p>"http://www.github.com"</p>'

    assert_equal '<p>"http://www.github.com"</p>',
      AutolinkFilter.to_html(input, :autolink => false)
  end

  # The :flags option is given Rinku::AUTOLINK_SHORT_DOMAINS and a dot-less
  # host is expected to be linked.
  def test_autolink_flags
    input    = '<p>"http://github"</p>'
    expected = '<p>"<a href="http://github">http://github</a>"</p>'

    assert_equal expected,
      AutolinkFilter.to_html(input, :flags => Rinku::AUTOLINK_SHORT_DOMAINS)
  end
end
@@ -0,0 +1,47 @@
1
+ require "test_helper"
2
+
# Exercises CamoFilter with an asset proxy URL and secret key in the context.
class HTML::Pipeline::CamoFilterTest < Test::Unit::TestCase
  CamoFilter = HTML::Pipeline::CamoFilter

  # Builds the context shared by the tests: the proxy base URL and the secret
  # key, under the :asset_proxy and :asset_proxy_secret_key options.
  def setup
    @asset_proxy_url        = 'https://assets.example.org'
    @asset_proxy_secret_key = 'ssssh-secret'
    @options = {
      :asset_proxy            => @asset_proxy_url,
      :asset_proxy_secret_key => @asset_proxy_secret_key
    }
  end

  # An http image on another host must end up with a src that starts with the
  # proxy URL.
  def test_camouflaging_http_image_urls
    orig = %(<p><img src="http://twitter.com/img.png"></p>)
    assert_includes 'img src="' + @asset_proxy_url,
      CamoFilter.call(orig, @options).to_s
  end

  # http://github.com images are expected to be switched to https rather than
  # proxied.
  def test_rewrites_dotcom_image_urls
    orig = %(<p><img src="http://github.com/img.png"></p>)
    assert_equal "<p><img src=\"https://github.com/img.png\"></p>",
      CamoFilter.call(orig, @options).to_s
  end

  # Images that are already served over https must not be proxied.
  def test_not_camouflaging_https_image_urls
    orig = %(<p><img src="https://foo.com/img.png"></p>)
    assert_doesnt_include 'img src="' + @asset_proxy_url,
      CamoFilter.call(orig, @options).to_s
  end

  # An <img> without a src attribute must not make the filter raise.
  def test_handling_images_with_no_src_attribute
    orig = %(<p><img></p>)
    assert_nothing_raised do
      CamoFilter.call(orig, @options).to_s
    end
  end

  # Calling the filter with an empty context must raise an ArgumentError
  # whose message names both required options.
  def test_required_context_validation
    exception = assert_raise(ArgumentError) {
      CamoFilter.call("", {})
    }
    # Parentheses keep Ruby from warning about an ambiguous first argument
    # when a regexp literal follows the method name.
    assert_match(/:asset_proxy[^_]/, exception.message)
    assert_match(/:asset_proxy_secret_key/, exception.message)
  end
end
@@ -0,0 +1,18 @@
1
+ require 'test_helper'
2
+
# Exercises EmojiFilter: emoji replacement and required-context validation.
class HTML::Pipeline::EmojiFilterTest < Test::Unit::TestCase
  EmojiFilter = HTML::Pipeline::EmojiFilter

  # ":shipit:" must become an <img> whose src points below the asset root.
  def test_emojify
    context = {:asset_root => 'https://foo.com'}
    doc = EmojiFilter.new("<p>:shipit:</p>", context).call
    src = doc.search('img').attr('src').value

    assert_match("https://foo.com/emoji/shipit.png", src)
  end

  # Calling the filter with an empty context must raise an ArgumentError
  # whose message names the :asset_root option.
  def test_required_context_validation
    exception = assert_raise(ArgumentError) do
      EmojiFilter.call("", {})
    end

    assert_match(/:asset_root/, exception.message)
  end
end
@@ -0,0 +1,50 @@
1
+ require "test_helper"
2
+
# Exercises ImageMaxWidthFilter on parsed document fragments.
class HTML::Pipeline::ImageMaxWidthFilterTest < Test::Unit::TestCase
  # Runs the filter under test on +html+ and returns its result.
  def filter(html)
    HTML::Pipeline::ImageMaxWidthFilter.call(html)
  end

  # Parses +body+ into a Nokogiri HTML document fragment.
  def fragment(body)
    Nokogiri::HTML::DocumentFragment.parse(body)
  end

  # A bare image gets the max-width style and a wrapping link.
  def test_rewrites_image_style_tags
    doc = fragment("<p>Screenshot: <img src='screenshot.png'></p>")

    assert_equal_html %q(<p>Screenshot: <a target="_blank" href="screenshot.png"><img src="screenshot.png" style="max-width:100%;"></a></p>),
      filter(doc).to_html
  end

  # An image that already has a style attribute is returned unchanged.
  def test_leaves_existing_image_style_tags_alone
    doc = fragment("<p><img src='screenshot.png' style='width:100px;'></p>")

    assert_equal_html '<p><img src="screenshot.png" style="width:100px;"></p>',
      filter(doc).to_html
  end

  # The wrapping link points at the image source and opens in a new target.
  def test_links_to_image
    doc = fragment("<p>Screenshot: <img src='screenshot.png'></p>")

    assert_equal_html '<p>Screenshot: <a target="_blank" href="screenshot.png"><img src="screenshot.png" style="max-width:100%;"></a></p>',
      filter(doc).to_html
  end

  # An image inside an existing link is styled but keeps that link.
  def test_doesnt_link_to_image_when_already_linked
    doc = fragment("<p>Screenshot: <a href='blah.png'><img src='screenshot.png'></a></p>")

    assert_equal_html %q(<p>Screenshot: <a href="blah.png"><img src="screenshot.png" style="max-width:100%;"></a></p>),
      filter(doc).to_html
  end

  # Text and elements around an inline image stay where they were.
  def test_doesnt_screw_up_inlined_images
    doc = fragment("<p>Screenshot <img src='screenshot.png'>, yes, this is a <b>screenshot</b> indeed.</p>")

    assert_equal_html %q(<p>Screenshot <a target="_blank" href="screenshot.png"><img src="screenshot.png" style="max-width:100%;"></a>, yes, this is a <b>screenshot</b> indeed.</p>), filter(doc).to_html
  end
end