geothird-html-pipeline 0.0.12
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +19 -0
- data/.travis.yml +13 -0
- data/CHANGELOG.md +43 -0
- data/Gemfile +9 -0
- data/LICENSE +22 -0
- data/README.md +274 -0
- data/Rakefile +11 -0
- data/bin/html-pipeline +80 -0
- data/geothird-html-pipeline.gemspec +27 -0
- data/lib/html/pipeline.rb +198 -0
- data/lib/html/pipeline/@mention_filter.rb +121 -0
- data/lib/html/pipeline/absolute_source_filter.rb +48 -0
- data/lib/html/pipeline/autolink_filter.rb +22 -0
- data/lib/html/pipeline/body_content.rb +42 -0
- data/lib/html/pipeline/camo_filter.rb +70 -0
- data/lib/html/pipeline/email_reply_filter.rb +56 -0
- data/lib/html/pipeline/emoji_filter.rb +54 -0
- data/lib/html/pipeline/filter.rb +178 -0
- data/lib/html/pipeline/https_filter.rb +13 -0
- data/lib/html/pipeline/image_max_width_filter.rb +37 -0
- data/lib/html/pipeline/markdown_filter.rb +29 -0
- data/lib/html/pipeline/plain_text_input_filter.rb +11 -0
- data/lib/html/pipeline/sanitization_filter.rb +105 -0
- data/lib/html/pipeline/syntax_highlight_filter.rb +33 -0
- data/lib/html/pipeline/text_filter.rb +14 -0
- data/lib/html/pipeline/textile_filter.rb +21 -0
- data/lib/html/pipeline/toc_filter.rb +28 -0
- data/lib/html/pipeline/version.rb +5 -0
- data/test/helpers/mocked_instrumentation_service.rb +17 -0
- data/test/html/pipeline/absolute_source_filter_test.rb +56 -0
- data/test/html/pipeline/autolink_filter_test.rb +22 -0
- data/test/html/pipeline/camo_filter_test.rb +47 -0
- data/test/html/pipeline/emoji_filter_test.rb +18 -0
- data/test/html/pipeline/image_max_width_filter_test.rb +50 -0
- data/test/html/pipeline/markdown_filter_test.rb +101 -0
- data/test/html/pipeline/mention_filter_test.rb +156 -0
- data/test/html/pipeline/plain_text_input_filter_test.rb +22 -0
- data/test/html/pipeline/sanitization_filter_test.rb +47 -0
- data/test/html/pipeline/toc_filter_test.rb +47 -0
- data/test/html/pipeline_test.rb +74 -0
- data/test/test_helper.rb +38 -0
- metadata +213 -0
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'openssl'
|
2
|
+
|
3
|
+
module HTML
  class Pipeline
    # HTML Filter for replacing http image URLs with camo versions. See:
    #
    # https://github.com/atmos/camo
    #
    # All images provided in user content should be run through this
    # filter so that http image sources do not cause mixed-content warnings
    # in browser clients.
    #
    # Context options:
    #   :asset_proxy (required) - Base URL for constructed asset proxy URLs.
    #   :asset_proxy_secret_key (required) - The shared secret used to encode URLs.
    #   :disable_asset_proxy (optional) - When truthy, image sources are left
    #                                     completely untouched.
    #
    # This filter does not write additional information to the context.
    class CamoFilter < Filter
      # Matches an http URL whose host is exactly github.com (optionally with
      # a numeric port). The lookahead keeps look-alike hosts such as
      # "github.com.example.org" from being treated as GitHub.
      GITHUB_HTTP_PREFIX = %r{\Ahttp://github\.com(?=(?::\d+)?(?:[/?#]|\z))}

      # Sources that must go through the asset proxy: anything served over
      # plain http, plus img.skitch.com even when requested over https.
      PROXIED_SOURCE = %r{\Ahttp:|\Ahttps://img\.skitch\.com/}

      # Hijacks images in the markup provided, replacing them with URLs that
      # go through the github asset proxy.
      #
      # Returns the (possibly modified) document.
      def call
        # Nothing is rewritten at all when the proxy is disabled.
        return doc if context[:disable_asset_proxy]

        doc.search("img").each do |element|
          raw_src = element['src']
          next if raw_src.nil?

          # github.com is reachable over https, so upgrade instead of proxying.
          src = raw_src.strip.sub(GITHUB_HTTP_PREFIX, 'https://github.com')
          element['src'] = src =~ PROXIED_SOURCE ? asset_proxy_url(src) : src
        end
        doc
      end

      # Implementation of validate hook.
      # Errors should raise exceptions or use an existing validator.
      def validate
        needs :asset_proxy, :asset_proxy_secret_key
      end

      # The camouflaged URL for a given image URL.
      #
      # url - String image source URL.
      #
      # Returns a String of the form "<proxy>/<hmac>/<hex-encoded url>".
      def asset_proxy_url(url)
        "#{asset_proxy_host}/#{asset_url_hash(url)}/#{hexencode(url)}"
      end

      # Private: calculate the HMAC digest for a image source URL.
      #
      # url - String image source URL.
      #
      # Returns the hex encoded HMAC-SHA1 of the URL, keyed with the shared
      # secret.
      def asset_url_hash(url)
        # OpenSSL::Digest::Digest was only a deprecated alias of
        # OpenSSL::Digest and is absent from newer openssl releases.
        digest = OpenSSL::Digest.new('sha1')
        OpenSSL::HMAC.hexdigest(digest, asset_proxy_secret_key, url)
      end

      # Private: the hostname to use for generated asset proxied URLs.
      def asset_proxy_host
        context[:asset_proxy]
      end

      # Private: the shared secret used to sign proxied URLs.
      def asset_proxy_secret_key
        context[:asset_proxy_secret_key]
      end

      # Private: helper to hexencode a string. Each byte ends up encoded into
      # two characters, zero padded value in the range [0-9a-f].
      def hexencode(str)
        str.each_byte.map { |byte| format('%02x', byte) }.join
      end
    end
  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
module HTML
  class Pipeline
    # HTML Filter that converts email reply text into an HTML DocumentFragment.
    # It must be used as the first filter in a pipeline.
    #
    # Context options:
    #   None
    #
    # This filter does not write any additional information to the context hash.
    class EmailReplyFilter < TextFilter
      include EscapeUtils

      EMAIL_HIDDEN_HEADER    = %(<span class="email-hidden-toggle"><a href="#">…</a></span><div class="email-hidden-reply" style="display:none">).freeze
      EMAIL_QUOTED_HEADER    = %(<div class="email-quoted-reply">).freeze
      EMAIL_SIGNATURE_HEADER = %(<div class="email-signature-reply">).freeze
      EMAIL_FRAGMENT_HEADER  = %(<div class="email-fragment">).freeze
      EMAIL_HEADER_END       = "</div>".freeze

      # Leading quote marker of a line. The fragment text is HTML-escaped
      # before this is applied, so the marker normally shows up as "&gt;";
      # the raw ">" alternative is kept for input that was not escaped.
      QUOTE_MARKER = /^\s*(>|&gt;)/

      # Scans an email body to determine which bits are quoted and which should
      # be hidden. EmailReplyParser is used to split the comment into an Array
      # of quoted or unquoted Blocks. Each block is wrapped in a <div> so
      # hidden blocks can be hidden and quoted blocks styled differently.
      # Since multiple blocks may be hidden, a single "email-hidden-reply"
      # <div> is opened before the first hidden block and closed after the
      # last block, so it encloses every following "email-quoted-reply" <div>.
      #
      # Returns the email comment HTML as a String
      def call
        found_hidden = false
        paragraphs = EmailReplyParser.read(text.dup).fragments.map do |fragment|
          body = escape_html(fragment.to_s.strip).gsub(QUOTE_MARKER, '')
          wrapped = "#{fragment_header(fragment)}#{body}#{EMAIL_HEADER_END}"

          # Only the first hidden fragment opens the hidden wrapper.
          if fragment.hidden? && !found_hidden
            found_hidden = true
            wrapped = "#{EMAIL_HIDDEN_HEADER}#{wrapped}"
          end
          wrapped
        end

        # Close the hidden wrapper opened above, if any.
        paragraphs << EMAIL_HEADER_END if found_hidden
        paragraphs.join("\n")
      end

      private

      # Picks the opening <div> matching the kind of fragment.
      def fragment_header(fragment)
        if fragment.quoted?
          EMAIL_QUOTED_HEADER
        elsif fragment.signature?
          EMAIL_SIGNATURE_HEADER
        else
          EMAIL_FRAGMENT_HEADER
        end
      end
    end
  end
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'emoji'
|
2
|
+
|
3
|
+
module HTML
  class Pipeline
    # HTML filter that swaps :emoji: shortcuts in text nodes for <img> tags.
    #
    # Context:
    #   :asset_root (required) - base url to link to emoji sprite
    class EmojiFilter < Filter
      # Regexp matching every known :emoji: name; the name is capture group 1.
      EmojiPattern = /:(#{Emoji.names.map { |emoji_name| Regexp.escape(emoji_name) }.join('|')}):/

      # Walks every text node outside of <pre>/<code> and replaces the emoji
      # shortcuts found in it.
      #
      # Returns the document.
      def call
        doc.search('text()').each do |node|
          original = node.to_html
          next unless original.include?(':')
          next if has_ancestor?(node, %w(pre code))

          replaced = emoji_image_filter(original)
          node.replace(replaced) unless replaced == original
        end
        doc
      end

      # Implementation of validate hook.
      # Errors should raise exceptions or use an existing validator.
      def validate
        needs :asset_root
      end

      # Replace :emoji: with corresponding images.
      #
      # text - String text to replace :emoji: in.
      #
      # Returns a String with :emoji: replaced with images.
      def emoji_image_filter(text)
        return text unless text.include?(':')

        text.gsub(EmojiPattern) do
          name = Regexp.last_match(1)
          image_src = File.join(asset_root, "emoji", "#{name}.png")
          "<img class='emoji' title=':#{name}:' alt=':#{name}:' src='#{image_src}' height='20' width='20' align='absmiddle' />"
        end
      end

      # The base url to link emoji sprites.
      #
      # Returns the context's :asset_root value. Its presence is enforced by
      # #validate, not by this accessor.
      def asset_root
        context[:asset_root]
      end
    end
  end
end
|
@@ -0,0 +1,178 @@
|
|
1
|
+
module HTML
  class Pipeline
    # Base class for user content HTML filters. Each filter takes an
    # HTML string or Nokogiri::HTML::DocumentFragment, performs
    # modifications and/or writes information to the result hash. Filters must
    # return a DocumentFragment (typically the same instance provided to the call
    # method) or a String with HTML markup.
    #
    # Example filter that replaces all images with trollface:
    #
    #   class FuuuFilter < HTML::Pipeline::Filter
    #     def call
    #       doc.search('img').each do |img|
    #         img['src'] = "http://paradoxdgn.com/junk/avatars/trollface.jpg"
    #       end
    #     end
    #   end
    #
    # The context Hash passes options to filters and should not be changed in
    # place. A Result Hash allows filters to make extracted information
    # available to the caller and is mutable.
    #
    # Common context options:
    #   :base_url   - The site's base URL
    #   :repository - A Repository providing context for the HTML being processed
    #
    # Each filter may define additional options and output values. See the class
    # docs for more info.
    class Filter
      # Raised by #html when the filter was given neither markup nor a document.
      class InvalidDocumentException < StandardError; end

      # doc     - String of HTML markup or an already parsed document.
      # context - Hash of options for the filter (default: {}).
      # result  - Hash the filter may write extracted information to
      #           (default: {}).
      #
      # Runs #validate, so subclasses raise here when required context keys
      # are missing.
      def initialize(doc, context = nil, result = nil)
        if doc.kind_of?(String)
          @html = doc.to_str
          @doc = nil
        else
          @doc = doc
          @html = nil
        end
        @context = context || {}
        @result = result || {}
        validate
      end

      # Public: Returns a simple Hash used to pass extra information (options)
      # into filters. It should not be changed in place.
      attr_reader :context

      # Public: Returns a Hash used to allow filters to pass back information
      # to callers of the various Pipelines. This can be used for
      # #mentioned_users, for example.
      attr_reader :result

      # The Nokogiri::HTML::DocumentFragment to be manipulated. If the filter was
      # provided a String, parse into a DocumentFragment the first time this
      # method is called.
      def doc
        @doc ||= parse_html(html)
      end

      # The String representation of the document. If a DocumentFragment was
      # provided to the Filter, it is serialized into a String when this method is
      # called.
      #
      # Raises InvalidDocumentException when the filter holds neither a String
      # nor a document.
      def html
        raise InvalidDocumentException if @html.nil? && @doc.nil?
        @html || doc.to_html
      end

      # The main filter entry point. Subclasses should modify the document
      # returned by #doc in place or extract information and add it to the
      # result hash.
      #
      # Raises NotImplementedError unless overridden.
      def call
        raise NotImplementedError
      end

      # Make sure the context has everything we need. Noop: Subclasses can override.
      def validate
      end

      # The Repository object provided in the context hash, or nil when no
      # :repository was specified.
      #
      # It's assumed that the repository context has already been checked
      # for permissions
      def repository
        context[:repository]
      end

      # The User object provided in the context hash, or nil when no user
      # was specified
      def current_user
        context[:current_user]
      end

      # Return whether the filter can access a given repo while
      # applying a filter
      #
      # A repo can only be accessed if it's pullable by the user who
      # submitted the content of this filter, or if it's the same as
      # the repository context in which the filter runs
      def can_access_repo?(repo)
        return false if repo.nil?
        return true if repo == repository
        repo.pullable_by?(current_user)
      end

      # The site's base URL provided in the context hash, or '/' when no
      # base URL was specified.
      def base_url
        context[:base_url] || '/'
      end

      # Parse the given markup by delegating to HTML::Pipeline.parse.
      def parse_html(html)
        HTML::Pipeline.parse(html)
      end

      # Helper method for filter subclasses used to determine if any of a node's
      # ancestors have one of the tag names specified.
      #
      # node - The Node object to check.
      # tags - An array of tag name strings to check. These should be downcase.
      #
      # Returns true when the node has a matching ancestor, false otherwise.
      def has_ancestor?(node, tags)
        # Walk up until the parent chain runs out (parent returns nil).
        while (node = node.parent)
          return true if tags.include?(node.name.downcase)
        end
        false
      end

      # Perform a filter on doc with the given context.
      #
      # Returns a HTML::Pipeline::DocumentFragment or a String containing HTML
      # markup.
      def self.call(doc, context = nil, result = nil)
        new(doc, context, result).call
      end

      # Like call but guarantees that a DocumentFragment is returned, even when
      # the filter returns a String.
      #
      # result - Optional Hash the filter may write extracted information to.
      def self.to_document(input, context = nil, result = nil)
        html = call(input, context, result)
        HTML::Pipeline::parse(html)
      end

      # Like call but guarantees that a string of HTML markup is returned.
      #
      # result - Optional Hash the filter may write extracted information to.
      def self.to_html(input, context = nil, result = nil)
        output = call(input, context, result)
        if output.respond_to?(:to_html)
          output.to_html
        else
          output.to_s
        end
      end

      # Validator for required context. Checks that every key passed in is
      # present in the context hash.
      #
      # If any keys are missing an ArgumentError is raised with a message
      # naming the filter class and listing all the missing keys.
      def needs(*keys)
        missing = keys.reject { |key| context.include? key }

        if missing.any?
          raise ArgumentError,
            "Missing context keys for #{self.class.name}: #{missing.map(&:inspect).join ', '}"
        end
      end
    end
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
module HTML
  class Pipeline
    # HTML Filter for replacing http github urls with https versions.
    class HttpsFilter < Filter
      # An http URL whose host is exactly github.com (optionally followed by a
      # numeric port). The CSS prefix selector used to pre-select links also
      # matches hosts that merely start with "github.com", such as
      # "github.com.example.org"; this pattern rejects those.
      GITHUB_HTTP_URL = %r{\Ahttp://github\.com(?=(?::\d+)?(?:[/?#]|\z))}

      # Rewrites the href of every link pointing at http://github.com so that
      # it uses https instead.
      #
      # Returns the document.
      def call
        doc.css('a[href^="http://github.com"]').each do |element|
          href = element['href']
          next unless href =~ GITHUB_HTTP_URL
          element['href'] = href.sub(/\Ahttp:/, 'https:')
        end
        doc
      end
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module HTML
  class Pipeline
    # This filter rewrites image tags with a max-width inline style and also wraps
    # the image in an <a> tag that causes the full size image to be opened in a
    # new tab.
    #
    # The max-width inline styles are especially useful in HTML email, which
    # doesn't use global stylesheets.
    class ImageMaxWidthFilter < Filter
      # Adds the inline style to every <img> and links images that are not
      # already inside an <a>.
      #
      # Returns the document.
      def call
        doc.search('img').each do |element|
          # Skip if there's already a style attribute. Not sure how this
          # would happen but we can reconsider it in the future.
          next if element['style']

          # Bail out on javascript: sources, since the src is copied into the
          # href of the generated link.
          next if javascript_url?(element['src'])

          element['style'] = "max-width:100%;"

          link_image(element) unless has_ancestor?(element, %w(a))
        end

        doc
      end

      # Wraps a copy of the image in a link to its source that opens in a new
      # tab, and puts the link in place of the image.
      def link_image(element)
        link = doc.document.create_element('a', :href => element['src'], :target => '_blank')
        link.add_child(element.dup)
        element.replace(link)
      end

      private

      # Whether src starts with "javascript" once whitespace and control
      # characters are removed. Browsers drop tabs and newlines while parsing
      # URLs, so "java\nscript:..." must be treated like "javascript:...".
      def javascript_url?(src)
        normalized = src.to_s.gsub(/[[:space:][:cntrl:]]/, '')
        normalized =~ /\Ajavascript/i ? true : false
      end
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'github/markdown'
|
2
|
+
|
3
|
+
module HTML
  class Pipeline
    # HTML Filter that renders Markdown text to HTML. Unlike most filters it
    # takes non-HTML input, so it must be used as the first filter in a
    # pipeline.
    #
    # Context options:
    #   :gfm => false    Disable GFM line-end processing
    #
    # This filter does not write any additional information to the context hash.
    class MarkdownFilter < TextFilter
      # Same arguments as the parent filter; carriage returns are removed
      # from the stored text.
      def initialize(text, context = nil, result = nil)
        super(text, context, result)
        @text = @text.delete("\r")
      end

      # Convert the Markdown text to HTML with GitHub::Markdown, using :gfm
      # mode unless the context explicitly sets :gfm to false.
      #
      # Returns the rendered HTML String without trailing whitespace.
      def call
        mode = context[:gfm] == false ? :markdown : :gfm
        GitHub::Markdown.to_html(@text, mode).rstrip
      end
    end
  end
end
|