geothird-html-pipeline 0.0.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +19 -0
- data/.travis.yml +13 -0
- data/CHANGELOG.md +43 -0
- data/Gemfile +9 -0
- data/LICENSE +22 -0
- data/README.md +274 -0
- data/Rakefile +11 -0
- data/bin/html-pipeline +80 -0
- data/geothird-html-pipeline.gemspec +27 -0
- data/lib/html/pipeline.rb +198 -0
- data/lib/html/pipeline/@mention_filter.rb +121 -0
- data/lib/html/pipeline/absolute_source_filter.rb +48 -0
- data/lib/html/pipeline/autolink_filter.rb +22 -0
- data/lib/html/pipeline/body_content.rb +42 -0
- data/lib/html/pipeline/camo_filter.rb +70 -0
- data/lib/html/pipeline/email_reply_filter.rb +56 -0
- data/lib/html/pipeline/emoji_filter.rb +54 -0
- data/lib/html/pipeline/filter.rb +178 -0
- data/lib/html/pipeline/https_filter.rb +13 -0
- data/lib/html/pipeline/image_max_width_filter.rb +37 -0
- data/lib/html/pipeline/markdown_filter.rb +29 -0
- data/lib/html/pipeline/plain_text_input_filter.rb +11 -0
- data/lib/html/pipeline/sanitization_filter.rb +105 -0
- data/lib/html/pipeline/syntax_highlight_filter.rb +33 -0
- data/lib/html/pipeline/text_filter.rb +14 -0
- data/lib/html/pipeline/textile_filter.rb +21 -0
- data/lib/html/pipeline/toc_filter.rb +28 -0
- data/lib/html/pipeline/version.rb +5 -0
- data/test/helpers/mocked_instrumentation_service.rb +17 -0
- data/test/html/pipeline/absolute_source_filter_test.rb +56 -0
- data/test/html/pipeline/autolink_filter_test.rb +22 -0
- data/test/html/pipeline/camo_filter_test.rb +47 -0
- data/test/html/pipeline/emoji_filter_test.rb +18 -0
- data/test/html/pipeline/image_max_width_filter_test.rb +50 -0
- data/test/html/pipeline/markdown_filter_test.rb +101 -0
- data/test/html/pipeline/mention_filter_test.rb +156 -0
- data/test/html/pipeline/plain_text_input_filter_test.rb +22 -0
- data/test/html/pipeline/sanitization_filter_test.rb +47 -0
- data/test/html/pipeline/toc_filter_test.rb +47 -0
- data/test/html/pipeline_test.rb +74 -0
- data/test/test_helper.rb +38 -0
- metadata +213 -0
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'openssl'
|
2
|
+
|
3
|
+
module HTML
|
4
|
+
class Pipeline
|
5
|
+
# HTML Filter for replacing http image URLs with camo versions. See:
|
6
|
+
#
|
7
|
+
# https://github.com/atmos/camo
|
8
|
+
#
|
9
|
+
# All images provided in user content should be run through this
|
10
|
+
# filter so that http image sources do not cause mixed-content warnings
|
11
|
+
# in browser clients.
|
12
|
+
#
|
13
|
+
# Context options:
|
14
|
+
# :asset_proxy (required) - Base URL for constructed asset proxy URLs.
|
15
|
+
# :asset_proxy_secret_key (required) - The shared secret used to encode URLs.
|
16
|
+
#
|
17
|
+
# This filter does not write additional information to the context.
|
18
|
+
class CamoFilter < Filter
|
19
|
+
# Hijacks images in the markup provided, replacing their sources with URLs
# that go through the asset proxy.
#
# - http://github.com image URLs are upgraded to https and left unproxied.
# - Remaining http: sources and https://img.skitch.com/ sources are
#   rewritten with asset_proxy_url.
# - Every other source is written back with surrounding whitespace stripped.
# - When context[:disable_asset_proxy] is set, no image is modified.
#
# Returns the doc.
def call
  # The flag does not depend on the element, so check it once up front
  # instead of once per image.
  return doc if context[:disable_asset_proxy]

  doc.search("img").each do |element|
    next if element['src'].nil?

    src = element['src'].strip
    # \A anchors at the start of the string (^ would also match after an
    # embedded newline). The escaped dot and the lookahead keep look-alike
    # hosts such as github.community from being treated as github.com.
    src = src.sub(%r{\Ahttp://github\.com(?=[/?#]|\z)}, 'https://github.com')

    element['src'] =
      if src =~ /\Ahttp:/ || src =~ %r{\Ahttps://img\.skitch\.com/}
        asset_proxy_url(src)
      else
        src
      end
  end
  doc
end
|
36
|
+
|
37
|
+
# Validate hook: this filter cannot run without the proxy base URL and the
# shared secret, so both context keys are required.
def validate
  needs(:asset_proxy, :asset_proxy_secret_key)
end
|
42
|
+
|
43
|
+
# The camouflaged URL for a given image URL.
#
# url - String image URL to route through the asset proxy.
#
# Returns a String of the form "<proxy base>/<digest>/<hex-encoded url>".
def asset_proxy_url(url)
  base = asset_proxy_host
  signature = asset_url_hash(url)
  encoded = hexencode(url)
  "#{base}/#{signature}/#{encoded}"
end
|
47
|
+
|
48
|
+
# Private: calculate the HMAC-SHA1 digest for an image source URL, keyed
# with the asset proxy secret.
#
# url - String image URL to sign.
#
# Returns the digest as a hex String.
def asset_url_hash(url)
  # OpenSSL::Digest::Digest was deprecated and has been removed from the
  # openssl library; OpenSSL::Digest.new is the supported constructor.
  digest = OpenSSL::Digest.new('sha1')
  OpenSSL::HMAC.hexdigest(digest, asset_proxy_secret_key, url)
end
|
53
|
+
|
54
|
+
# Private: the base of generated asset proxy URLs, read from the
# :asset_proxy context option.
def asset_proxy_host
  context[:asset_proxy]
end
|
58
|
+
|
59
|
+
# Private: the shared secret used to sign proxied URLs, read from the
# :asset_proxy_secret_key context option.
def asset_proxy_secret_key
  context[:asset_proxy_secret_key]
end
|
62
|
+
|
63
|
+
# Private: hex-encode a string byte by byte.
#
# str - String to encode.
#
# Returns a String in which every input byte is written as two zero-padded
# lowercase hexadecimal characters.
def hexencode(str)
  hex_pairs = str.each_byte.map { |byte| format("%02x", byte) }
  hex_pairs.join
end
|
68
|
+
end
|
69
|
+
end
|
70
|
+
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
module HTML
|
2
|
+
class Pipeline
|
3
|
+
# HTML Filter that converts email reply text into an HTML DocumentFragment.
|
4
|
+
# It must be used as the first filter in a pipeline.
|
5
|
+
#
|
6
|
+
# Context options:
|
7
|
+
# None
|
8
|
+
#
|
9
|
+
# This filter does not write any additional information to the context hash.
|
10
|
+
class EmailReplyFilter < TextFilter
|
11
|
+
include EscapeUtils
|
12
|
+
|
13
|
+
# HTML snippets wrapped around the fragments of a parsed email reply. Each
# *_HEADER opens an element that EMAIL_HEADER_END closes (EMAIL_HIDDEN_HEADER
# leaves its <div> open for that purpose).
#
# The ellipsis is written as the &hellip; entity so this file stays pure
# ASCII; it has no encoding magic comment.
EMAIL_HIDDEN_HEADER    = %(<span class="email-hidden-toggle"><a href="#">&hellip;</a></span><div class="email-hidden-reply" style="display:none">).freeze
EMAIL_QUOTED_HEADER    = %(<div class="email-quoted-reply">).freeze
EMAIL_SIGNATURE_HEADER = %(<div class="email-signature-reply">).freeze
EMAIL_FRAGMENT_HEADER  = %(<div class="email-fragment">).freeze
EMAIL_HEADER_END       = "</div>".freeze
|
18
|
+
|
19
|
+
# Scans an email body to determine which bits are quoted and which should
# be hidden. EmailReplyParser splits the text into fragments; each fragment
# is HTML-escaped, has its leading quote markers removed, and is wrapped in
# a <div> chosen by its kind (quoted, signature or plain fragment) so the
# kinds can be styled differently.
#
# The "email-hidden-reply" wrapper is opened in front of the first hidden
# fragment only and closed once after the last fragment, so it encloses
# every fragment from the first hidden one onwards.
#
# Returns the email comment HTML as a String.
def call
  found_hidden = false
  paragraphs = EmailReplyParser.read(text.dup).fragments.map do |fragment|
    # The text is escaped before the quote markers are stripped, so a
    # leading ">" appears as "&gt;" here; match both spellings.
    body = escape_html(fragment.to_s.strip).gsub(/^\s*(>|&gt;)/, '')

    header =
      if fragment.quoted?
        EMAIL_QUOTED_HEADER
      elsif fragment.signature?
        EMAIL_SIGNATURE_HEADER
      else
        EMAIL_FRAGMENT_HEADER
      end
    pieces = [header, body, EMAIL_HEADER_END]

    if fragment.hidden? && !found_hidden
      found_hidden = true
      pieces.unshift EMAIL_HIDDEN_HEADER
    end
    pieces.join
  end
  # Close the hidden wrapper opened above.
  paragraphs << EMAIL_HEADER_END if found_hidden
  paragraphs.join("\n")
end
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'emoji'
|
2
|
+
|
3
|
+
module HTML
|
4
|
+
class Pipeline
|
5
|
+
# HTML filter that replaces :emoji: with images.
|
6
|
+
#
|
7
|
+
# Context:
|
8
|
+
# :asset_root (required) - base url to link to emoji sprite
|
9
|
+
class EmojiFilter < Filter
|
10
|
+
# Regexp matching any known :emoji: name between colons; the name itself is
# captured in group 1. Names are escaped so they are matched literally.
emoji_alternatives = Emoji.names.map { |emoji_name| Regexp.escape(emoji_name) }.join('|')
EmojiPattern = /:(#{emoji_alternatives}):/
|
12
|
+
|
13
|
+
# Replaces :emoji: codes with images in every text node of the document,
# skipping text without a colon and text inside <pre> or <code>.
#
# Returns the doc.
def call
  doc.search('text()').each do |text_node|
    original = text_node.to_html
    next unless original.include?(':')
    next if has_ancestor?(text_node, %w(pre code))

    replaced = emoji_image_filter(original)
    # Only touch the node when a replacement actually happened.
    text_node.replace(replaced) unless replaced == original
  end
  doc
end
|
24
|
+
|
25
|
+
# Validate hook: the :asset_root context key is required to build emoji
# image URLs.
def validate
  needs(:asset_root)
end
|
30
|
+
|
31
|
+
# Replace :emoji: with corresponding images.
#
# text - String text to replace :emoji: in.
#
# Returns a String with every code matched by EmojiPattern replaced by an
# <img> tag whose src is "<asset_root>/emoji/<name>.png". Text without a
# colon is returned as is.
def emoji_image_filter(text)
  if text.include?(':')
    text.gsub(EmojiPattern) do
      name = Regexp.last_match(1)
      image_src = File.join(asset_root, "emoji", "#{name}.png")
      "<img class='emoji' title=':#{name}:' alt=':#{name}:' src='#{image_src}' height='20' width='20' align='absmiddle' />"
    end
  else
    text
  end
end
|
44
|
+
|
45
|
+
# The base url used to link emoji images.
#
# Returns the context's :asset_root value. This accessor does not raise;
# the presence of the key is enforced by #validate.
def asset_root
  context[:asset_root]
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
@@ -0,0 +1,178 @@
|
|
1
|
+
module HTML
|
2
|
+
class Pipeline
|
3
|
+
# Base class for user content HTML filters. Each filter takes an
|
4
|
+
# HTML string or Nokogiri::HTML::DocumentFragment, performs
|
5
|
+
# modifications and/or writes information to the result hash. Filters must
|
6
|
+
# return a DocumentFragment (typically the same instance provided to the call
|
7
|
+
# method) or a String with HTML markup.
|
8
|
+
#
|
9
|
+
# Example filter that replaces all images with trollface:
|
10
|
+
#
|
11
|
+
# class FuuuFilter < HTML::Pipeline::Filter
|
12
|
+
# def call
|
13
|
+
# doc.search('img').each do |img|
|
14
|
+
# img['src'] = "http://paradoxdgn.com/junk/avatars/trollface.jpg"
|
15
|
+
# end
|
16
|
+
# end
|
17
|
+
# end
|
18
|
+
#
|
19
|
+
# The context Hash passes options to filters and should not be changed in
|
20
|
+
# place. A Result Hash allows filters to make extracted information
|
21
|
+
# available to the caller and is mutable.
|
22
|
+
#
|
23
|
+
# Common context options:
|
24
|
+
# :base_url - The site's base URL
|
25
|
+
# :repository - A Repository providing context for the HTML being processed
|
26
|
+
#
|
27
|
+
# Each filter may define additional options and output values. See the class
|
28
|
+
# docs for more info.
|
29
|
+
class Filter
|
30
|
+
# Raised by #html when the filter holds neither an HTML String nor a document.
class InvalidDocumentException < StandardError
end
|
31
|
+
|
32
|
+
# Builds a filter around the given input.
#
# doc     - Either a String of HTML or an already parsed document. A String
#           is stored as-is and only parsed when #doc is first called.
# context - Hash of options for the filter (default: empty Hash).
# result  - Hash the filter may write extracted information to
#           (default: empty Hash).
#
# Runs #validate before returning, so subclasses can reject a context that
# lacks required keys.
def initialize(doc, context = nil, result = nil)
  given_string = doc.kind_of?(String)
  @html    = given_string ? doc.to_str : nil
  @doc     = given_string ? nil : doc
  @context = context || {}
  @result  = result || {}
  validate
end
|
44
|
+
|
45
|
+
# Public: Returns the Hash of options that was passed into the filter.
attr_reader :context

# Public: Returns the Hash filters use to pass extracted information back
# to callers of the various Pipelines. This can be used for
# #mentioned_users, for example.
attr_reader :result
|
54
|
+
|
55
|
+
# The document to be manipulated. If the filter was given a String, it is
# parsed with #parse_html the first time this method is called and the
# parsed document is reused afterwards.
def doc
  return @doc if @doc

  @doc = parse_html(html)
end
|
61
|
+
|
62
|
+
# The String representation of the document. When the filter was given a
# String it is returned directly; otherwise the document is serialized with
# to_html on every call.
#
# Raises InvalidDocumentException when the filter has neither a String nor
# a document.
def html
  nothing_to_render = @html.nil? && @doc.nil?
  raise InvalidDocumentException if nothing_to_render

  @html ? @html : doc.to_html
end
|
69
|
+
|
70
|
+
# The main filter entry point; subclasses must override it. Implementations
# modify the document (see #doc) or extract information and add it to the
# result hash, then return the document or a String of HTML markup.
#
# Raises NotImplementedError in this base class.
def call
  raise NotImplementedError
end
|
77
|
+
|
78
|
+
# Hook run by #initialize to make sure the context has everything the
# filter needs. A no-op here; subclasses can override it.
def validate
end
|
81
|
+
|
82
|
+
# The Repository object provided in the context hash, or nil when no
# :repository was specified.
#
# The repository context is assumed to have been checked for permissions
# already.
def repository
  context[:repository]
end
|
90
|
+
|
91
|
+
# The User object provided in the context hash under :current_user, or nil
# when no user was specified.
def current_user
  context[:current_user]
end
|
96
|
+
|
97
|
+
# Return whether the filter can access a given repo while it is applied.
#
# repo - The repository to check; may be nil.
#
# A repo is accessible when it is the same as the repository context the
# filter runs in, or when it is pullable by the current user. A nil repo is
# never accessible.
def can_access_repo?(repo)
  if repo.nil?
    false
  elsif repo == repository
    true
  else
    repo.pullable_by?(current_user)
  end
end
|
108
|
+
|
109
|
+
# The site's base URL provided in the context hash under :base_url, or '/'
# when no base URL was specified.
def base_url
  configured = context[:base_url]
  configured || '/'
end
|
114
|
+
|
115
|
+
# Turn the passed argument into a document by delegating to
# HTML::Pipeline.parse.
#
# html - The markup (or document) to hand to HTML::Pipeline.parse.
#
# Returns whatever HTML::Pipeline.parse returns.
def parse_html(html)
  HTML::Pipeline.parse(html)
end
|
121
|
+
|
122
|
+
# Helper method for filter subclasses used to determine if any of a node's
# ancestors have one of the tag names specified.
#
# node - The Node object to check.
# tags - An array of tag name strings to check. These should be downcase,
#        because ancestor names are downcased before the comparison.
#
# Returns true when the node has a matching ancestor, nil otherwise.
def has_ancestor?(node, tags)
  ancestor = node.parent
  while ancestor
    return true if tags.include?(ancestor.name.downcase)

    ancestor = ancestor.parent
  end
  nil
end
|
136
|
+
|
137
|
+
# Perform a filter on doc with the given context: builds a filter instance
# and invokes its #call.
#
# doc     - String of HTML or a document.
# context - Hash of options for the filter (optional).
# result  - Hash the filter may write to (optional).
#
# Returns whatever the instance's #call returns (a document or a String
# containing HTML markup).
def self.call(doc, context = nil, result = nil)
  filter = new(doc, context, result)
  filter.call
end
|
144
|
+
|
145
|
+
# Like call, but the output is always passed through HTML::Pipeline.parse,
# so a document is returned even when the filter returns a String.
def self.to_document(input, context = nil)
  HTML::Pipeline.parse(call(input, context))
end
|
151
|
+
|
152
|
+
# Like call but guarantees that a string of HTML markup is returned: output
# that responds to to_html is serialized with it, anything else is
# converted with to_s.
def self.to_html(input, context = nil)
  output = call(input, context)
  output.respond_to?(:to_html) ? output.to_html : output.to_s
end
|
161
|
+
|
162
|
+
# Validator for required context. Checks that every key passed in is
# present in the context hash.
#
# keys - One or more context keys the filter requires.
#
# Returns nil when nothing is missing.
# Raises ArgumentError with a message naming the filter class and listing
# all the missing keys.
def needs(*keys)
  missing = keys.reject { |key| context.include?(key) }
  return unless missing.any?

  listing = missing.map(&:inspect).join(', ')
  raise ArgumentError, "Missing context keys for #{self.class.name}: #{listing}"
end
|
176
|
+
end
|
177
|
+
end
|
178
|
+
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
module HTML
|
2
|
+
class Pipeline
|
3
|
+
# HTML Filter for replacing http github urls with https versions.
|
4
|
+
class HttpsFilter < Filter
|
5
|
+
# Rewrites the href of every link whose href starts with
# "http://github.com" so that it uses the https scheme.
#
# Returns the doc.
def call
  github_links = doc.css('a[href^="http://github.com"]')
  github_links.each do |anchor|
    anchor['href'] = anchor['href'].sub(/^http:/, 'https:')
  end
  doc
end
|
11
|
+
end
|
12
|
+
end
|
13
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module HTML
|
2
|
+
class Pipeline
|
3
|
+
# This filter rewrites image tags with a max-width inline style and also wraps
|
4
|
+
# the image in an <a> tag that causes the full size image to be opened in a
|
5
|
+
# new tab.
|
6
|
+
#
|
7
|
+
# The max-width inline styles are especially useful in HTML email which
|
8
|
+
# doesn't use a global stylesheet.
|
9
|
+
class ImageMaxWidthFilter < Filter
|
10
|
+
# Adds a max-width inline style to every image and wraps it in a link
# unless one of its ancestors is already an <a>.
#
# Images that already carry a style attribute are left alone, as are images
# whose src starts with "javascript" (to avoid js injection via javascript:
# urls).
#
# Returns the doc.
def call
  doc.search('img').each do |image|
    next if image['style']
    next if image['src'].to_s.strip =~ /\Ajavascript/i

    image['style'] = "max-width:100%;"
    link_image(image) unless has_ancestor?(image, %w(a))
  end

  doc
end
|
29
|
+
|
30
|
+
# Wraps an image in a link to its own src that opens in a new tab
# (target="_blank").
#
# element - The image node to wrap. A copy of it is placed inside the new
#           <a>, and the original node is replaced by that link.
#
# Returns the result of element.replace.
def link_image(element)
  attributes = { :href => element['src'], :target => '_blank' }
  link = doc.document.create_element('a', attributes)
  link.add_child(element.dup)
  element.replace(link)
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'github/markdown'
|
2
|
+
|
3
|
+
module HTML
|
4
|
+
class Pipeline
|
5
|
+
# HTML Filter that converts Markdown text into HTML and converts into a
|
6
|
+
# DocumentFragment. This is different from most filters in that it can take a
|
7
|
+
# non-HTML string as input. It must be used as the first filter in a pipeline.
|
8
|
+
#
|
9
|
+
# Context options:
|
10
|
+
# :gfm => false Disable GFM line-end processing
|
11
|
+
#
|
12
|
+
# This filter does not write any additional information to the context hash.
|
13
|
+
class MarkdownFilter < TextFilter
|
14
|
+
# Builds the filter and removes every carriage return ("\r") from the text
# before it is rendered.
#
# text    - String of Markdown.
# context - Hash of options (optional).
# result  - Hash the filter may write to (optional).
def initialize(text, context = nil, result = nil)
  super(text, context, result)
  @text = @text.gsub("\r", '')
end
|
18
|
+
|
19
|
+
# Convert the Markdown text to HTML with GitHub::Markdown. GFM mode is used
# unless the :gfm context option is exactly false.
#
# Returns the rendered HTML String with trailing whitespace removed.
def call
  mode = context[:gfm] == false ? :markdown : :gfm
  rendered = GitHub::Markdown.to_html(@text, mode)
  rendered.rstrip!
  rendered
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|