html-pipeline 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +19 -0
- data/.travis.yml +13 -0
- data/Gemfile +9 -0
- data/LICENSE +22 -0
- data/README.md +128 -0
- data/Rakefile +11 -0
- data/html-pipeline.gemspec +25 -0
- data/lib/html/pipeline.rb +130 -0
- data/lib/html/pipeline/@mention_filter.rb +118 -0
- data/lib/html/pipeline/autolink_filter.rb +22 -0
- data/lib/html/pipeline/body_content.rb +42 -0
- data/lib/html/pipeline/camo_filter.rb +64 -0
- data/lib/html/pipeline/email_reply_filter.rb +56 -0
- data/lib/html/pipeline/emoji_filter.rb +48 -0
- data/lib/html/pipeline/filter.rb +158 -0
- data/lib/html/pipeline/https_filter.rb +13 -0
- data/lib/html/pipeline/image_max_width_filter.rb +37 -0
- data/lib/html/pipeline/markdown_filter.rb +29 -0
- data/lib/html/pipeline/plain_text_input_filter.rb +11 -0
- data/lib/html/pipeline/sanitization_filter.rb +107 -0
- data/lib/html/pipeline/syntax_highlight_filter.rb +29 -0
- data/lib/html/pipeline/text_filter.rb +14 -0
- data/lib/html/pipeline/textile_filter.rb +21 -0
- data/lib/html/pipeline/toc_filter.rb +28 -0
- data/lib/html/pipeline/version.rb +5 -0
- data/test/html/pipeline/autolink_filter_test.rb +22 -0
- data/test/html/pipeline/camo_filter_test.rb +39 -0
- data/test/html/pipeline/emoji_filter_test.rb +16 -0
- data/test/html/pipeline/image_max_width_filter_test.rb +50 -0
- data/test/html/pipeline/markdown_filter_test.rb +101 -0
- data/test/html/pipeline/mention_filter_test.rb +158 -0
- data/test/html/pipeline/plain_text_input_filter_test.rb +22 -0
- data/test/html/pipeline/sanitization_filter_test.rb +47 -0
- data/test/html/pipeline/toc_filter_test.rb +47 -0
- data/test/test_helper.rb +38 -0
- metadata +221 -0
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'rinku'
|
2
|
+
|
3
|
+
module HTML
  class Pipeline
    # HTML Filter that turns bare URLs found in the markup into links by
    # delegating to Rinku.
    #
    # Context options:
    #   :autolink - set to false to leave the markup untouched. Any other
    #               value (including nil / not set) enables autolinking.
    #   :flags    - additional Rinku flags, OR-ed onto a base of 0.
    #               See https://github.com/vmg/rinku
    #
    # This filter does not write additional information to the context.
    class AutolinkFilter < Filter
      # Returns the HTML String produced by Rinku, or the unmodified HTML when
      # autolinking has been explicitly disabled.
      def call
        return html if context[:autolink] == false

        extra_flags = context[:flags]
        flags = extra_flags ? (0 | extra_flags) : 0

        # Only URLs are linked; the tag list is passed as Rinku's fourth
        # argument (presumably its skip list - confirm against the Rinku docs).
        Rinku.auto_link(html, :urls, nil, %w[a script kbd pre code], flags)
      end
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
module HTML
  class Pipeline
    # Public: Runs a String of content through an HTML processing pipeline,
    # providing easy access to a generated DocumentFragment.
    #
    # Processing is lazy: the pipeline is not invoked until #result (or
    # #output / #document, which build on it) is called for the first time.
    class BodyContent
      # Public: Initialize a BodyContent.
      #
      # body     - A String body.
      # context  - A Hash of context options for the filters.
      # pipeline - A HTML::Pipeline object with one or more Filters. It only
      #            needs to respond to #call(body, context).
      def initialize(body, context, pipeline)
        @body     = body
        @context  = context
        @pipeline = pipeline
      end

      # Public: Gets the memoized result of the body content as it passed through
      # the Pipeline. The pipeline is called at most once as long as it returns
      # a truthy value.
      #
      # Returns a Hash, or something similar as defined by @pipeline.result_class.
      def result
        @result ||= @pipeline.call(@body, @context)
      end

      # Public: Gets the updated body from the Pipeline result.
      #
      # Returns a String or DocumentFragment (whatever is stored under the
      # :output key of the result).
      def output
        @output ||= result[:output]
      end

      # Public: Parses the output into a DocumentFragment.
      #
      # Returns a DocumentFragment.
      def document
        @document ||= HTML::Pipeline.parse(output)
      end
    end
  end
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'openssl'
|
2
|
+
|
3
|
+
module HTML
  class Pipeline
    # HTML Filter for replacing http image URLs with camo versions. See:
    #
    # https://github.com/atmos/camo
    #
    # All images provided in user content should be run through this
    # filter so that http image sources do not cause mixed-content warnings
    # in browser clients.
    #
    # Context options:
    #   :asset_proxy            - Base URL for constructed asset proxy URLs.
    #                             Needed as soon as one image is proxied.
    #   :asset_proxy_secret_key - The shared secret used to encode URLs.
    #                             Needed as soon as one image is proxied.
    #   :disable_asset_proxy    - When truthy, no image is modified at all.
    #
    # This filter does not write additional information to the context.
    class CamoFilter < Filter
      # Hijacks images in the markup provided, replacing them with URLs that
      # go through the github asset proxy.
      #
      # Returns the (possibly modified) DocumentFragment.
      # Raises RuntimeError when an image has to be proxied but :asset_proxy
      # or :asset_proxy_secret_key is missing from the context.
      def call
        # The switch does not depend on the image, so test it once up front
        # instead of on every iteration.
        return doc if context[:disable_asset_proxy]

        doc.search("img").each do |element|
          next if element['src'].nil?

          src = element['src'].strip
          # github.com itself is reachable over https; no proxying needed.
          src = src.sub(%r!^http://github.com!, 'https://github.com')

          if src =~ /^http:/ || src =~ /^https:\/\/img.skitch.com\//
            element['src'] = asset_proxy_url(src)
          else
            element['src'] = src
          end
        end
        doc
      end

      # The camouflaged URL for a given image URL, built as
      # "<proxy host>/<HMAC of url>/<hex encoded url>".
      #
      # url - The String source URL of the image.
      #
      # Returns a String.
      def asset_proxy_url(url)
        "#{asset_proxy_host}/#{asset_url_hash(url)}/#{hexencode(url)}"
      end

      # Private: calculate the HMAC digest for a image source URL.
      #
      # url - The String source URL of the image.
      #
      # Returns the hex encoded HMAC-SHA1 String.
      def asset_url_hash(url)
        # OpenSSL::Digest::Digest was a deprecated alias that is no longer
        # defined by current openssl releases; OpenSSL::Digest is the class.
        digest = OpenSSL::Digest.new('sha1')
        OpenSSL::HMAC.hexdigest(digest, asset_proxy_secret_key, url)
      end

      # Private: the hostname to use for generated asset proxied URLs.
      #
      # Raises RuntimeError if the context option has not been provided.
      def asset_proxy_host
        context[:asset_proxy] || raise("Missing context :asset_proxy")
      end

      # Private: the shared secret used to sign proxied URLs.
      #
      # Raises RuntimeError if the context option has not been provided.
      def asset_proxy_secret_key
        context[:asset_proxy_secret_key] || raise("Missing context :asset_proxy_secret_key")
      end

      # Private: helper to hexencode a string. Each byte ends up encoded into
      # two characters, zero padded value in the range [0-9a-f].
      #
      # str - The String to encode.
      #
      # Returns a String twice as long (in bytes) as the input.
      def hexencode(str)
        str.unpack('H*').first
      end
    end
  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
module HTML
  class Pipeline
    # HTML Filter that converts email reply text into an HTML DocumentFragment.
    # It must be used as the first filter in a pipeline.
    #
    # Context options:
    #   None
    #
    # This filter does not write any additional information to the context hash.
    class EmailReplyFilter < TextFilter
      include EscapeUtils

      EMAIL_HIDDEN_HEADER    = %(<span class="email-hidden-toggle"><a href="#">…</a></span><div class="email-hidden-reply" style="display:none">).freeze
      EMAIL_QUOTED_HEADER    = %(<div class="email-quoted-reply">).freeze
      EMAIL_SIGNATURE_HEADER = %(<div class="email-signature-reply">).freeze
      EMAIL_FRAGMENT_HEADER  = %(<div class="email-fragment">).freeze
      EMAIL_HEADER_END       = "</div>".freeze

      # Scans an email body to determine which bits are quoted and which should
      # be hidden. EmailReplyParser is used to split the comment into an Array
      # of quoted or unquoted Blocks. Now, we loop through them and attempt to
      # add <div> tags around them so we can hide the hidden blocks, and style
      # the quoted blocks differently. Since multiple blocks may be hidden, be
      # sure to keep the "email-hidden-reply" <div>s around "email-quoted-reply"
      # <div> tags. Call this on each comment of a visible thread in the order
      # that they are displayed. Note: all comments are processed so we can
      # maintain a Set of SHAs of paragraphs. Only plaintext comments skip the
      # markdown step.
      #
      # Returns the email comment HTML as a String
      def call
        found_hidden = false
        paragraphs = EmailReplyParser.read(text.dup).fragments.map do |fragment|
          # The text is HTML-escaped first, so a leading quote marker shows up
          # as "&gt;" by the time it is stripped; the bare ">" alternative is
          # kept for escapers that leave it alone.
          body = escape_html(fragment.to_s.strip).gsub(/^\s*(>|&gt;)/, '')

          header =
            if fragment.quoted?
              EMAIL_QUOTED_HEADER
            elsif fragment.signature?
              EMAIL_SIGNATURE_HEADER
            else
              EMAIL_FRAGMENT_HEADER
            end
          pieces = [header, body, EMAIL_HEADER_END]

          # Open the hidden wrapper once, in front of the first hidden
          # fragment; it is closed after the last paragraph below.
          if fragment.hidden? && !found_hidden
            found_hidden = true
            pieces.unshift EMAIL_HIDDEN_HEADER
          end
          pieces.join
        end
        paragraphs << EMAIL_HEADER_END if found_hidden
        paragraphs.join("\n")
      end
    end
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'emoji'
|
2
|
+
|
3
|
+
module HTML
  class Pipeline
    # HTML filter that swaps :emoji: shortcodes found in text nodes for <img>
    # tags.
    #
    # Context:
    #   :asset_root - base url to link to emoji sprite
    class EmojiFilter < Filter
      # Regexp matching every valid :emoji: name, built once at load time from
      # Emoji.names. The name (without colons) is capture group 1.
      emoji_alternatives = Emoji.names.map { |emoji| Regexp.escape(emoji) }.join('|')
      EmojiPattern = /:(#{emoji_alternatives}):/

      # Walks all text nodes, skipping those without a colon and those nested
      # inside <pre> or <code>, and replaces nodes whose content changed.
      #
      # Returns the DocumentFragment.
      def call
        doc.search('text()').each do |node|
          content = node.to_html
          next unless content.include?(':')
          next if has_ancestor?(node, %w(pre code))

          replaced = emoji_image_filter(content)
          node.replace(replaced) unless replaced == content
        end
        doc
      end

      # Replace :emoji: with corresponding images.
      #
      # text - String text to replace :emoji: in.
      #
      # Returns a String with :emoji: replaced with images.
      def emoji_image_filter(text)
        return text unless text.include?(':')

        text.gsub(EmojiPattern) do
          name = Regexp.last_match(1)
          src  = File.join(asset_root, "emoji", "#{name}.png")
          "<img class='emoji' title=':#{name}:' alt=':#{name}:' src='#{src}' height='20' width='20' align='absmiddle' />"
        end
      end

      # The base url to link emoji sprites
      #
      # Raises ArgumentError if context option has not been provided.
      # Returns the context's asset_root.
      def asset_root
        context[:asset_root] || raise(ArgumentError, "Missing context :asset_root")
      end
    end
  end
end
|
@@ -0,0 +1,158 @@
|
|
1
|
+
module HTML
  class Pipeline
    # Base class for user content HTML filters. Each filter takes an
    # HTML string or Nokogiri::HTML::DocumentFragment, performs
    # modifications and/or writes information to the result hash. Filters must
    # return a DocumentFragment (typically the same instance provided to the call
    # method) or a String with HTML markup.
    #
    # Example filter that replaces all images with trollface:
    #
    #   class FuuuFilter < HTML::Pipeline::Filter
    #     def call
    #       doc.search('img').each do |img|
    #         img['src'] = "http://paradoxdgn.com/junk/avatars/trollface.jpg"
    #       end
    #     end
    #   end
    #
    # The context Hash passes options to filters and should not be changed in
    # place. A Result Hash allows filters to make extracted information
    # available to the caller and is mutable.
    #
    # Common context options:
    #   :base_url   - The site's base URL
    #   :repository - A Repository providing context for the HTML being processed
    #
    # Each filter may define additional options and output values. See the class
    # docs for more info.
    class Filter
      class InvalidDocumentException < StandardError; end

      # doc     - A String of HTML or an already parsed document object.
      # context - Optional Hash of options for the filter (default: {}).
      # result  - Optional Hash the filter may write to (default: {}).
      def initialize(doc, context = nil, result = nil)
        if doc.kind_of?(String)
          @html = doc.to_str
          @doc = nil
        else
          @doc = doc
          @html = nil
        end
        @context = context || {}
        @result = result || {}
      end

      # Public: Returns the Hash of options passed into the filter. It should
      # not be changed in place.
      attr_reader :context

      # Public: Returns a Hash used to allow filters to pass back information
      # to callers of the various Pipelines. This can be used for
      # #mentioned_users, for example.
      attr_reader :result

      # The Nokogiri::HTML::DocumentFragment to be manipulated. If the filter was
      # provided a String, parse into a DocumentFragment the first time this
      # method is called.
      def doc
        @doc ||= parse_html(html)
      end

      # The String representation of the document. If a DocumentFragment was
      # provided to the Filter, it is serialized into a String when this method is
      # called.
      #
      # Raises InvalidDocumentException when the filter was built with nil.
      def html
        raise InvalidDocumentException if @html.nil? && @doc.nil?
        @html || doc.to_html
      end

      # The main filter entry point. The doc attribute is guaranteed to be a
      # Nokogiri::HTML::DocumentFragment when invoked. Subclasses should modify
      # this document in place or extract information and add it to the context
      # hash.
      #
      # Raises NotImplementedError unless overridden by the subclass.
      def call
        raise NotImplementedError
      end

      # The Repository object provided in the context hash, or nil when no
      # :repository was specified.
      #
      # It's assumed that the repository context has already been checked
      # for permissions
      def repository
        context[:repository]
      end

      # The User object provided in the context hash, or nil when no user
      # was specified
      def current_user
        context[:current_user]
      end

      # Return whether the filter can access a given repo while
      # applying a filter
      #
      # A repo can only be accessed if its pullable by the user who
      # submitted the content of this filter, or if it's the same as
      # the repository context in which the filter runs
      def can_access_repo?(repo)
        return false if repo.nil?
        return true if repo == repository
        repo.pullable_by?(current_user)
      end

      # The site's base URL provided in the context hash, or '/' when no
      # base URL was specified.
      def base_url
        context[:base_url] || '/'
      end

      # Ensure the passed argument is a DocumentFragment. When a string is
      # provided, it is parsed and returned; otherwise, the DocumentFragment is
      # returned unmodified.
      def parse_html(html)
        HTML::Pipeline.parse(html)
      end

      # Helper method for filter subclasses used to determine if any of a node's
      # ancestors have one of the tag names specified.
      #
      # node - The Node object to check.
      # tags - An array of tag name strings to check. These should be downcase.
      #
      # Returns true when the node has a matching ancestor, false otherwise.
      def has_ancestor?(node, tags)
        # Walk up the tree until the root (whose parent is nil) is passed.
        while node = node.parent
          return true if tags.include?(node.name.downcase)
        end
        false
      end

      # Perform a filter on doc with the given context.
      #
      # Returns a HTML::Pipeline::DocumentFragment or a String containing HTML
      # markup.
      def self.call(doc, context = nil, result = nil)
        new(doc, context, result).call
      end

      # Like call but guarantees that a DocumentFragment is returned, even when
      # the last filter returns a String.
      def self.to_document(input, context = nil)
        html = call(input, context)
        HTML::Pipeline::parse(html)
      end

      # Like call but guarantees that a string of HTML markup is returned.
      def self.to_html(input, context = nil)
        output = call(input, context)
        if output.respond_to?(:to_html)
          output.to_html
        else
          output.to_s
        end
      end
    end
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
module HTML
  class Pipeline
    # HTML Filter that upgrades links pointing at http://github.com to their
    # https counterparts.
    class HttpsFilter < Filter
      # Rewrites the href of every <a> whose href starts with
      # "http://github.com", swapping the leading "http:" for "https:".
      #
      # Returns the DocumentFragment.
      def call
        insecure_links = doc.css('a[href^="http://github.com"]')
        insecure_links.each do |link|
          secure_href = link['href'].sub(/^http:/, 'https:')
          link['href'] = secure_href
        end
        doc
      end
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module HTML
  class Pipeline
    # This filter gives image tags a max-width inline style and wraps each
    # image in an <a> tag so that the full size image opens in a new tab.
    #
    # The max-width inline styles are especially useful in HTML email, which
    # does not use a global stylesheet.
    class ImageMaxWidthFilter < Filter
      # Returns the DocumentFragment.
      def call
        doc.search('img').each do |image|
          # Images that already carry a style attribute are left alone, as are
          # javascript: sources, which must not end up in a generated link href.
          next if image['style'] || image['src'].to_s.strip =~ /\Ajavascript/i

          image['style'] = "max-width:100%;"

          # An image that already sits inside a link keeps that link.
          link_image(image) unless has_ancestor?(image, %w(a))
        end

        doc
      end

      # Replaces the image with an <a target="_blank"> pointing at the image's
      # src and containing a copy of the image.
      #
      # element - The img node to wrap.
      def link_image(element)
        anchor = doc.document.create_element('a', :href => element['src'], :target => '_blank')
        anchor.add_child(element.dup)
        element.replace(anchor)
      end
    end
  end
end
|