html-pipeline-no-charlock 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +19 -0
- data/.travis.yml +13 -0
- data/CHANGELOG.md +16 -0
- data/Gemfile +9 -0
- data/LICENSE +22 -0
- data/README.md +221 -0
- data/Rakefile +13 -0
- data/html-pipeline-no-charlock.gemspec +25 -0
- data/html-pipeline.gemspec +26 -0
- data/lib/html/pipeline.rb +130 -0
- data/lib/html/pipeline/@mention_filter.rb +118 -0
- data/lib/html/pipeline/autolink_filter.rb +22 -0
- data/lib/html/pipeline/body_content.rb +42 -0
- data/lib/html/pipeline/camo_filter.rb +70 -0
- data/lib/html/pipeline/email_reply_filter.rb +56 -0
- data/lib/html/pipeline/emoji_filter.rb +54 -0
- data/lib/html/pipeline/filter.rb +178 -0
- data/lib/html/pipeline/https_filter.rb +13 -0
- data/lib/html/pipeline/image_max_width_filter.rb +37 -0
- data/lib/html/pipeline/markdown_filter.rb +29 -0
- data/lib/html/pipeline/plain_text_input_filter.rb +11 -0
- data/lib/html/pipeline/sanitization_filter.rb +105 -0
- data/lib/html/pipeline/syntax_highlight_filter.rb +29 -0
- data/lib/html/pipeline/text_filter.rb +14 -0
- data/lib/html/pipeline/textile_filter.rb +21 -0
- data/lib/html/pipeline/toc_filter.rb +28 -0
- data/lib/html/pipeline/version.rb +5 -0
- data/test/html/pipeline/autolink_filter_test.rb +22 -0
- data/test/html/pipeline/camo_filter_test.rb +47 -0
- data/test/html/pipeline/emoji_filter_test.rb +18 -0
- data/test/html/pipeline/image_max_width_filter_test.rb +50 -0
- data/test/html/pipeline/markdown_filter_test.rb +101 -0
- data/test/html/pipeline/mention_filter_test.rb +158 -0
- data/test/html/pipeline/plain_text_input_filter_test.rb +22 -0
- data/test/html/pipeline/sanitization_filter_test.rb +47 -0
- data/test/html/pipeline/toc_filter_test.rb +47 -0
- data/test/test_helper.rb +38 -0
- metadata +214 -0
@@ -0,0 +1,13 @@
|
|
1
|
+
module HTML
  class Pipeline
    # Filter that upgrades links pointing at http://github.com to https.
    #
    # Only <a> elements whose href starts with "http://github.com" are touched;
    # all other nodes pass through unchanged.
    class HttpsFilter < Filter
      # Rewrites the scheme of every matching href in place and returns doc.
      def call
        insecure_links = doc.css('a[href^="http://github.com"]')
        insecure_links.each do |anchor|
          anchor['href'] = anchor['href'].sub(/^http:/, 'https:')
        end

        doc
      end
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module HTML
  class Pipeline
    # Gives <img> tags an inline max-width style and, unless the image already
    # has an <a> ancestor, wraps it in a link that opens the image source in a
    # new tab.
    #
    # Inline max-width styles are especially useful in HTML email, which does
    # not use global stylesheets.
    class ImageMaxWidthFilter < Filter
      # Processes every <img> in the document and returns doc.
      def call
        doc.search('img').each do |image|
          # Images that already carry a style attribute are left untouched.
          next if image['style']

          # Skip sources that start with "javascript" so a javascript: URL is
          # never styled or turned into a clickable link.
          next if image['src'].to_s.strip =~ /\Ajavascript/i

          image['style'] = "max-width:100%;"
          link_image(image) unless has_ancestor?(image, %w(a))
        end

        doc
      end

      # Replaces element with an <a target="_blank"> whose href is the image
      # src and whose only child is a copy of the image.
      def link_image(element)
        anchor = doc.document.create_element('a', :href => element['src'], :target => '_blank')
        anchor.add_child(element.dup)
        element.replace(anchor)
      end
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'github/markdown'
|
2
|
+
|
3
|
+
module HTML
  class Pipeline
    # Text filter that renders Markdown source to an HTML string using
    # GitHub::Markdown. Unlike most filters it takes non-HTML input, so it must
    # be used as the first filter in a pipeline.
    #
    # Context options:
    #   :gfm => false    Disable GFM line-end processing
    #
    # This filter does not write any additional information to the context hash.
    class MarkdownFilter < TextFilter
      # Stores the input text with every carriage return removed.
      def initialize(text, context = nil, result = nil)
        super(text, context, result)
        @text = @text.gsub("\r", '')
      end

      # Renders @text in :gfm mode unless context[:gfm] is exactly false, in
      # which case :markdown mode is used. Trailing whitespace is stripped
      # from the rendered HTML in place before it is returned.
      def call
        mode = context[:gfm] == false ? :markdown : :gfm
        GitHub::Markdown.to_html(@text, mode).tap { |html| html.rstrip! }
      end
    end
  end
end
|
@@ -0,0 +1,105 @@
|
|
1
|
+
require 'sanitize'
|
2
|
+
|
3
|
+
module HTML
  class Pipeline
    # HTML filter with sanitization routines and whitelists. This class defines
    # which HTML is allowed in user provided content and cleans up problems
    # such as unbalanced tags.
    #
    # See the Sanitize docs for more information on the underlying library:
    #
    # https://github.com/rgrove/sanitize/#readme
    #
    # Context options:
    #   :whitelist - The sanitizer whitelist configuration to use. This can be
    #                one of the options constants defined in this class or a
    #                custom sanitize options hash.
    #
    # This filter does not write additional information to the context.
    class SanitizationFilter < Filter
      # Elements that may legitimately contain an <li>.
      LISTS     = Set.new(%w(ul ol).freeze)
      LIST_ITEM = 'li'.freeze

      # Table child elements. They are only allowed inside a <table>; otherwise
      # they could be used to break out of places where tables contain
      # formatted user content (like pull request review comments).
      TABLE_ITEMS = Set.new(%w(tr td th).freeze)
      TABLE       = 'table'.freeze

      # The main sanitization whitelist. Only these elements and attributes are
      # allowed through by default.
      WHITELIST = {
        :elements => %w(
          h1 h2 h3 h4 h5 h6 h7 h8 br b i strong em a pre code img tt
          div ins del sup sub p ol ul table blockquote dl dt dd
          kbd q samp var hr ruby rt rp li tr td th
        ),
        :attributes => {
          'a'   => ['href'],
          'img' => ['src'],
          'div' => ['itemscope', 'itemtype'],
          :all  => %w(
            abbr accept accept-charset
            accesskey action align alt axis
            border cellpadding cellspacing char
            charoff charset checked cite
            clear cols colspan color
            compact coords datetime dir
            disabled enctype for frame
            headers height hreflang
            hspace ismap label lang
            longdesc maxlength media method
            multiple name nohref noshade
            nowrap prompt readonly rel rev
            rows rowspan rules scope
            selected shape size span
            start summary tabindex target
            title type usemap valign value
            vspace width itemprop
          )
        },
        :protocols => {
          'a'   => {'href' => ['http', 'https', 'mailto', :relative, 'github-windows', 'github-mac']},
          'img' => {'src'  => ['http', 'https', :relative]}
        },
        :transformers => [
          # An <li> with no <ul>/<ol> ancestor is replaced by its children,
          # because a stray list item can break out of containing markup.
          lambda { |env|
            name, node = env[:node_name], env[:node]
            node.replace(node.children) if name == LIST_ITEM && node.ancestors.none? { |n| LISTS.include?(n.name) }
          },

          # Table child elements with no <table> ancestor are replaced by
          # their children.
          lambda { |env|
            name, node = env[:node_name], env[:node]
            node.replace(node.children) if TABLE_ITEMS.include?(name) && node.ancestors.none? { |n| n.name == TABLE }
          }
        ]
      }

      # A more limited sanitization whitelist. It keeps the attributes,
      # protocols, and transformers of WHITELIST but allows a smaller set of
      # elements.
      LIMITED = WHITELIST.merge(
        :elements => %w(b i strong em a pre code img ins del sup sub p ol ul li)
      )

      # Strip all HTML tags from the document.
      FULL = { :elements => [] }

      # Sanitize markup using the Sanitize library, with the configuration
      # returned by #whitelist.
      def call
        Sanitize.clean_node!(doc, whitelist)
      end

      # The whitelist to use when sanitizing: context[:whitelist] when it is
      # set, otherwise the WHITELIST constant above.
      def whitelist
        custom = context[:whitelist]
        custom || WHITELIST
      end
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'linguist'
|
2
|
+
|
3
|
+
module HTML
  class Pipeline
    # HTML Filter that syntax highlights code blocks wrapped
    # in <pre lang="...">.
    #
    # A <pre> is left untouched when it has no lang attribute, when no
    # Pygments lexer is found for that lang, or when highlighting times out.
    class SyntaxHighlightFilter < Filter
      # Replaces each highlightable <pre> with the highlighted markup and
      # returns doc.
      def call
        doc.search('pre').each do |node|
          lang = node['lang']
          next unless lang

          lexer = Pygments::Lexer[lang]
          next unless lexer

          html = highlight_with_timeout_handling(lexer, node.inner_text)
          next if html.nil?

          node.replace(html)
        end
        doc
      end

      # Highlights text with the given lexer.
      #
      # Returns the lexer's output, or nil when highlighting raises
      # Timeout::Error so the caller can keep the original <pre>.
      def highlight_with_timeout_handling(lexer, text)
        lexer.highlight(text)
      rescue Timeout::Error
        nil
      end
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module HTML
  class Pipeline
    # Base class for filters that take plain text, not HTML, as input.
    class TextFilter < Filter
      attr_reader :text

      # text    - The input text. Converted with to_str when the object
      #           responds to it, otherwise with to_s.
      # context - Passed through to the superclass initializer.
      # result  - Passed through to the superclass initializer.
      #
      # The superclass initializer receives nil as its first argument.
      #
      # Raises TypeError when text is a DocumentFragment.
      def initialize(text, context = nil, result = nil)
        if text.is_a?(DocumentFragment)
          raise TypeError, "text cannot be HTML"
        end

        # Ensure that @text is always a String.
        @text =
          if text.respond_to?(:to_str)
            text.to_str
          else
            text.to_s
          end

        super(nil, context, result)
      end
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module HTML
  class Pipeline
    # Text filter that renders Textile source to HTML through RedCloth. Unlike
    # most filters it takes non-HTML input, so it must be used as the first
    # filter in a pipeline.
    #
    # Context options:
    #   :autolink => false    Disable autolinking URLs
    #   NOTE(review): this class never reads the context itself; confirm where
    #   (or whether) :autolink is honoured.
    #
    # This filter does not write any additional information to the context hash.
    #
    # NOTE This filter is provided for really old comments only. It probably
    # shouldn't be used for anything new.
    class TextileFilter < TextFilter
      # Returns the HTML that RedCloth produces for @text.
      def call
        textile = RedCloth.new(@text)
        textile.to_html
      end
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module HTML
  class Pipeline
    # HTML filter that inserts a named anchor at the start of every h1-h6
    # header in a document, so headers can be linked to from a table of
    # contents.
    #
    # TODO: besides adding the anchors, we should get around to eventually
    # generating the Table of Contents itself, with links to each header.
    class TableOfContentsFilter < Filter
      # Adds an anchor to each header that has at least one child node and
      # returns doc. Repeated header names get "-1", "-2", ... appended.
      def call
        seen = Hash.new(0)

        doc.css('h1, h2, h3, h4, h5, h6').each do |heading|
          # Lowercase, drop everything but word chars, dashes and spaces,
          # then turn spaces into dashes.
          slug = heading.text.downcase.gsub(/[^\w\- ]/, '').gsub(' ', '-')
          slug = EscapeUtils.escape_uri(slug)

          # The first occurrence of a name gets no suffix; later ones get the
          # number of earlier occurrences.
          suffix = seen[slug] > 0 ? "-#{seen[slug]}" : ''
          seen[slug] += 1

          first_child = heading.children.first
          next unless first_child

          first_child.add_previous_sibling(%Q{<a name="#{slug}#{suffix}" class="anchor" href="##{slug}#{suffix}"><span class="mini-icon mini-icon-link"></span></a>})
        end

        doc
      end
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require "test_helper"
|
2
|
+
|
3
|
+
AutolinkFilter = HTML::Pipeline::AutolinkFilter

# Tests for HTML::Pipeline::AutolinkFilter.
class HTML::Pipeline::AutolinkFilterTest < Test::Unit::TestCase
  # Parses a complicated piece of HTML that Rails auto_link cannot handle.
  def test_uses_rinku_for_autolinking
    linked = AutolinkFilter.to_html('<p>"http://www.github.com"</p>')

    assert_equal '<p>"<a href="http://www.github.com">http://www.github.com</a>"</p>', linked
  end

  # With :autolink => false the input comes back unchanged.
  def test_autolink_option
    untouched = AutolinkFilter.to_html('<p>"http://www.github.com"</p>', :autolink => false)

    assert_equal '<p>"http://www.github.com"</p>', untouched
  end

  # Rinku flags given through the :flags context option are honoured.
  def test_autolink_flags
    linked = AutolinkFilter.to_html('<p>"http://github"</p>', :flags => Rinku::AUTOLINK_SHORT_DOMAINS)

    assert_equal '<p>"<a href="http://github">http://github</a>"</p>', linked
  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
require "test_helper"
|
2
|
+
|
3
|
+
# Tests for HTML::Pipeline::CamoFilter.
class HTML::Pipeline::CamoFilterTest < Test::Unit::TestCase
  CamoFilter = HTML::Pipeline::CamoFilter

  # Builds the context options every test passes to the filter.
  def setup
    # A well-formed URL: the scheme separator is "://".
    @asset_proxy_url = 'https://assets.example.org'
    @asset_proxy_secret_key = 'ssssh-secret'
    @options = {
      :asset_proxy            => @asset_proxy_url,
      :asset_proxy_secret_key => @asset_proxy_secret_key
    }
  end

  def test_camouflaging_http_image_urls
    orig = %(<p><img src="http://twitter.com/img.png"></p>)
    assert_includes 'img src="' + @asset_proxy_url,
      CamoFilter.call(orig, @options).to_s
  end

  def test_rewrites_dotcom_image_urls
    orig = %(<p><img src="http://github.com/img.png"></p>)
    assert_equal "<p><img src=\"https://github.com/img.png\"></p>",
      CamoFilter.call(orig, @options).to_s
  end

  def test_not_camouflaging_https_image_urls
    orig = %(<p><img src="https://foo.com/img.png"></p>)
    assert_doesnt_include 'img src="' + @asset_proxy_url,
      CamoFilter.call(orig, @options).to_s
  end

  def test_handling_images_with_no_src_attribute
    orig = %(<p><img></p>)
    assert_nothing_raised do
      CamoFilter.call(orig, @options).to_s
    end
  end

  # Calling the filter with an empty context must raise an ArgumentError that
  # names both required options.
  def test_required_context_validation
    exception = assert_raise(ArgumentError) {
      CamoFilter.call("", {})
    }
    # Parenthesised so the regexp literal is not parsed ambiguously.
    assert_match(/:asset_proxy[^_]/, exception.message)
    assert_match(/:asset_proxy_secret_key/, exception.message)
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
# Tests for HTML::Pipeline::EmojiFilter.
class HTML::Pipeline::EmojiFilterTest < Test::Unit::TestCase
  EmojiFilter = HTML::Pipeline::EmojiFilter

  # A :shipit: token becomes an <img> whose src lives under :asset_root.
  def test_emojify
    context = {:asset_root => 'https://foo.com'}
    doc = EmojiFilter.new("<p>:shipit:</p>", context).call
    src = doc.search('img').attr('src').value

    assert_match "https://foo.com/emoji/shipit.png", src
  end

  # Calling the filter with an empty context must raise an ArgumentError that
  # names the missing :asset_root option.
  def test_required_context_validation
    exception = assert_raise(ArgumentError) do
      EmojiFilter.call("", {})
    end

    assert_match(/:asset_root/, exception.message)
  end
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
require "test_helper"
|
2
|
+
|
3
|
+
# Tests for HTML::Pipeline::ImageMaxWidthFilter.
class HTML::Pipeline::ImageMaxWidthFilterTest < Test::Unit::TestCase
  # Runs the filter under test on html and returns its result.
  def filter(html)
    HTML::Pipeline::ImageMaxWidthFilter.call(html)
  end

  def test_rewrites_image_style_tags
    doc = Nokogiri::HTML::DocumentFragment.parse("<p>Screenshot: <img src='screenshot.png'></p>")
    expected = %q(<p>Screenshot: <a target="_blank" href="screenshot.png"><img src="screenshot.png" style="max-width:100%;"></a></p>)

    assert_equal_html expected, filter(doc).to_html
  end

  def test_leaves_existing_image_style_tags_alone
    doc = Nokogiri::HTML::DocumentFragment.parse("<p><img src='screenshot.png' style='width:100px;'></p>")
    expected = '<p><img src="screenshot.png" style="width:100px;"></p>'

    assert_equal_html expected, filter(doc).to_html
  end

  def test_links_to_image
    doc = Nokogiri::HTML::DocumentFragment.parse("<p>Screenshot: <img src='screenshot.png'></p>")
    expected = '<p>Screenshot: <a target="_blank" href="screenshot.png"><img src="screenshot.png" style="max-width:100%;"></a></p>'

    assert_equal_html expected, filter(doc).to_html
  end

  def test_doesnt_link_to_image_when_already_linked
    doc = Nokogiri::HTML::DocumentFragment.parse("<p>Screenshot: <a href='blah.png'><img src='screenshot.png'></a></p>")
    expected = %q(<p>Screenshot: <a href="blah.png"><img src="screenshot.png" style="max-width:100%;"></a></p>)

    assert_equal_html expected, filter(doc).to_html
  end

  def test_doesnt_screw_up_inlined_images
    doc = Nokogiri::HTML::DocumentFragment.parse("<p>Screenshot <img src='screenshot.png'>, yes, this is a <b>screenshot</b> indeed.</p>")
    expected = %q(<p>Screenshot <a target="_blank" href="screenshot.png"><img src="screenshot.png" style="max-width:100%;"></a>, yes, this is a <b>screenshot</b> indeed.</p>)

    assert_equal_html expected, filter(doc).to_html
  end
end
|