html-pipeline 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +19 -0
- data/.travis.yml +13 -0
- data/Gemfile +9 -0
- data/LICENSE +22 -0
- data/README.md +128 -0
- data/Rakefile +11 -0
- data/html-pipeline.gemspec +25 -0
- data/lib/html/pipeline.rb +130 -0
- data/lib/html/pipeline/@mention_filter.rb +118 -0
- data/lib/html/pipeline/autolink_filter.rb +22 -0
- data/lib/html/pipeline/body_content.rb +42 -0
- data/lib/html/pipeline/camo_filter.rb +64 -0
- data/lib/html/pipeline/email_reply_filter.rb +56 -0
- data/lib/html/pipeline/emoji_filter.rb +48 -0
- data/lib/html/pipeline/filter.rb +158 -0
- data/lib/html/pipeline/https_filter.rb +13 -0
- data/lib/html/pipeline/image_max_width_filter.rb +37 -0
- data/lib/html/pipeline/markdown_filter.rb +29 -0
- data/lib/html/pipeline/plain_text_input_filter.rb +11 -0
- data/lib/html/pipeline/sanitization_filter.rb +107 -0
- data/lib/html/pipeline/syntax_highlight_filter.rb +29 -0
- data/lib/html/pipeline/text_filter.rb +14 -0
- data/lib/html/pipeline/textile_filter.rb +21 -0
- data/lib/html/pipeline/toc_filter.rb +28 -0
- data/lib/html/pipeline/version.rb +5 -0
- data/test/html/pipeline/autolink_filter_test.rb +22 -0
- data/test/html/pipeline/camo_filter_test.rb +39 -0
- data/test/html/pipeline/emoji_filter_test.rb +16 -0
- data/test/html/pipeline/image_max_width_filter_test.rb +50 -0
- data/test/html/pipeline/markdown_filter_test.rb +101 -0
- data/test/html/pipeline/mention_filter_test.rb +158 -0
- data/test/html/pipeline/plain_text_input_filter_test.rb +22 -0
- data/test/html/pipeline/sanitization_filter_test.rb +47 -0
- data/test/html/pipeline/toc_filter_test.rb +47 -0
- data/test/test_helper.rb +38 -0
- metadata +221 -0
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'rinku'
|
2
|
+
|
3
|
+
module HTML
  class Pipeline
    # HTML Filter that hands the markup to Rinku so that URLs appearing in it
    # are turned into links.
    #
    # Context options:
    #   :autolink  - set to false to skip autolinking entirely; any other
    #                value (including a missing key) leaves it enabled
    #   :flags     - additional Rinku flags. See https://github.com/vmg/rinku
    #
    # This filter does not write additional information to the context.
    class AutolinkFilter < Filter
      # Returns the markup String produced by Rinku, or the untouched markup
      # when autolinking has been explicitly disabled.
      def call
        return html if context[:autolink] == false

        # Start from "no flags" and OR in whatever the caller supplied.
        extra_flags = context[:flags]
        flags = extra_flags ? (0 | extra_flags) : 0

        # The tag list is passed through to Rinku (its skip-tags argument per
        # the Rinku README).
        Rinku.auto_link(html, :urls, nil, %w[a script kbd pre code], flags)
      end
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
module HTML
  class Pipeline
    # Public: Runs a String of content through an HTML processing pipeline,
    # providing easy access to a generated DocumentFragment.
    #
    # All work is lazy: the pipeline is only invoked the first time #result,
    # #output or #document is asked for, and each value is memoized.
    class BodyContent
      # Public: Initialize a BodyContent.
      #
      # body     - A String body.
      # context  - A Hash of context options for the filters.
      # pipeline - An object responding to #call(body, context), such as an
      #            HTML::Pipeline with one or more Filters.
      def initialize(body, context, pipeline)
        @body     = body
        @context  = context
        @pipeline = pipeline
      end

      # Public: Gets the memoized result of the body content as it passed
      # through the Pipeline.
      #
      # Returns whatever @pipeline.call returns; it must respond to #[] because
      # #output reads result[:output].
      def result
        @result ||= @pipeline.call(@body, @context)
      end

      # Public: Gets the updated body from the Pipeline result.
      #
      # Returns the value stored under :output in the result.
      def output
        @output ||= result[:output]
      end

      # Public: Parses the output via HTML::Pipeline.parse.
      #
      # Returns the memoized return value of HTML::Pipeline.parse.
      def document
        @document ||= HTML::Pipeline.parse(output)
      end
    end
  end
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'openssl'
|
2
|
+
|
3
|
+
module HTML
  class Pipeline
    # HTML Filter for replacing http image URLs with camo versions. See:
    #
    # https://github.com/atmos/camo
    #
    # All images provided in user content should be run through this
    # filter so that http image sources do not cause mixed-content warnings
    # in browser clients.
    #
    # Context options:
    #   :asset_proxy            - Base URL for constructed asset proxy URLs.
    #   :asset_proxy_secret_key - The shared secret used to encode URLs.
    #   :disable_asset_proxy    - When truthy, image sources are left
    #                             completely untouched.
    #
    # This filter does not write additional information to the context.
    class CamoFilter < Filter
      # Hijacks images in the markup provided, replacing them with URLs that
      # go through the asset proxy.
      #
      # Returns the (modified in place) document.
      def call
        doc.search("img").each do |element|
          next if element['src'].nil?

          src = element['src'].strip
          src = src.sub(%r!^http://github.com!, 'https://github.com')
          # Checked only after the rewrite above, but before anything is
          # written back, so a disabled proxy leaves the element as it was.
          next if context[:disable_asset_proxy]

          if src =~ /^http:/ || src =~ %r{^https://img\.skitch\.com/}
            element['src'] = asset_proxy_url(src)
          else
            element['src'] = src
          end
        end
        doc
      end

      # The camouflaged URL for a given image URL.
      #
      # url - String image source.
      #
      # Returns "<asset_proxy>/<hmac digest>/<hex encoded url>".
      def asset_proxy_url(url)
        "#{asset_proxy_host}/#{asset_url_hash(url)}/#{hexencode(url)}"
      end

      # Private: calculate the HMAC-SHA1 hex digest for an image source URL,
      # keyed with the shared secret.
      def asset_url_hash(url)
        # OpenSSL::Digest::Digest was only a deprecated alias and is gone from
        # current openssl releases; OpenSSL::Digest.new works everywhere.
        digest = OpenSSL::Digest.new('sha1')
        OpenSSL::HMAC.hexdigest(digest, asset_proxy_secret_key, url)
      end

      # Private: the hostname to use for generated asset proxied URLs.
      #
      # Raises RuntimeError when the :asset_proxy context option is missing.
      def asset_proxy_host
        context[:asset_proxy] or raise "Missing context :asset_proxy"
      end

      # Private: the shared secret used to sign proxied URLs.
      #
      # Raises RuntimeError when :asset_proxy_secret_key is missing.
      def asset_proxy_secret_key
        context[:asset_proxy_secret_key] or raise "Missing context :asset_proxy_secret_key"
      end

      # Private: helper to hexencode a string. Each byte ends up encoded into
      # two characters, zero padded value in the range [0-9a-f].
      def hexencode(str)
        str.to_enum(:each_byte).map { |byte| "%02x" % byte }.join
      end
    end
  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
module HTML
  class Pipeline
    # HTML Filter that converts email reply text into an HTML DocumentFragment.
    # It must be used as the first filter in a pipeline.
    #
    # Context options:
    #   None
    #
    # This filter does not write any additional information to the context hash.
    class EmailReplyFilter < TextFilter
      include EscapeUtils

      EMAIL_HIDDEN_HEADER    = %(<span class="email-hidden-toggle"><a href="#">…</a></span><div class="email-hidden-reply" style="display:none">).freeze
      EMAIL_QUOTED_HEADER    = %(<div class="email-quoted-reply">).freeze
      EMAIL_SIGNATURE_HEADER = %(<div class="email-signature-reply">).freeze
      EMAIL_FRAGMENT_HEADER  = %(<div class="email-fragment">).freeze
      EMAIL_HEADER_END       = "</div>".freeze

      # Scans an email body to determine which bits are quoted and which should
      # be hidden. EmailReplyParser is used to split the text into fragments.
      # Each fragment is HTML-escaped and wrapped in a <div> whose class marks
      # it as quoted, signature or plain, so the blocks can be styled
      # differently. The first hidden fragment additionally opens an
      # "email-hidden-reply" <div> that stays open around every following
      # fragment and is closed once at the very end.
      #
      # Returns the email comment HTML as a String.
      def call
        found_hidden = nil
        paragraphs = EmailReplyParser.read(text.dup).fragments.map do |fragment|
          # The text is escaped before the quote markers are stripped, so a
          # leading ">" shows up as "&gt;" here; both spellings are removed.
          body = escape_html(fragment.to_s.strip).gsub(/^\s*(>|&gt;)/, '')

          header =
            if fragment.quoted?
              EMAIL_QUOTED_HEADER
            elsif fragment.signature?
              EMAIL_SIGNATURE_HEADER
            else
              EMAIL_FRAGMENT_HEADER
            end
          pieces = [header, body, EMAIL_HEADER_END]

          # Only the first hidden fragment opens the hidden wrapper.
          if fragment.hidden? && !found_hidden
            found_hidden = true
            pieces.unshift EMAIL_HIDDEN_HEADER
          end
          pieces.join
        end

        # Close the hidden wrapper opened above, if any.
        paragraphs << EMAIL_HEADER_END if found_hidden
        paragraphs.join("\n")
      end
    end
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'emoji'
|
2
|
+
|
3
|
+
module HTML
  class Pipeline
    # HTML filter that replaces :emoji: with images.
    #
    # Context:
    #   :asset_root - base url to link to emoji sprite
    class EmojiFilter < Filter
      # Regexp matching every valid :emoji: name; the name itself (without
      # the surrounding colons) is captured as group 1.
      EmojiPattern = /:(#{Emoji.names.map { |name| Regexp.escape(name) }.join('|')}):/

      # Walks every text node outside of <pre>/<code> and swaps recognised
      # :emoji: names for <img> tags.
      #
      # Returns the (modified in place) document.
      def call
        doc.search('text()').each do |node|
          content = node.to_html
          # Cheap pre-check: no colon means no emoji can be present.
          next unless content.include?(':')
          next if has_ancestor?(node, %w(pre code))

          replacement = emoji_image_filter(content)
          node.replace(replacement) unless replacement == content
        end
        doc
      end

      # Replace :emoji: with corresponding images.
      #
      # text - String text to replace :emoji: in.
      #
      # Returns a String with :emoji: replaced with images.
      def emoji_image_filter(text)
        return text unless text.include?(':')

        text.gsub(EmojiPattern) do
          name = Regexp.last_match(1)
          "<img class='emoji' title=':#{name}:' alt=':#{name}:' src='#{File.join(asset_root, "emoji", "#{name}.png")}' height='20' width='20' align='absmiddle' />"
        end
      end

      # The base url to link emoji sprites
      #
      # Raises ArgumentError if context option has not been provided.
      # Returns the context's asset_root.
      def asset_root
        context[:asset_root] or raise ArgumentError, "Missing context :asset_root"
      end
    end
  end
end
|
@@ -0,0 +1,158 @@
|
|
1
|
+
module HTML
  class Pipeline
    # Base class for user content HTML filters. Each filter takes an
    # HTML string or Nokogiri::HTML::DocumentFragment, performs
    # modifications and/or writes information to the result hash. Filters must
    # return a DocumentFragment (typically the same instance provided to the call
    # method) or a String with HTML markup.
    #
    # Example filter that replaces all images with trollface:
    #
    #   class FuuuFilter < HTML::Pipeline::Filter
    #     def call
    #       doc.search('img').each do |img|
    #         img['src'] = "http://paradoxdgn.com/junk/avatars/trollface.jpg"
    #       end
    #     end
    #   end
    #
    # The context Hash passes options to filters and should not be changed in
    # place. A Result Hash allows filters to make extracted information
    # available to the caller and is mutable.
    #
    # Common context options:
    #   :base_url   - The site's base URL
    #   :repository - A Repository providing context for the HTML being processed
    #
    # Each filter may define additional options and output values. See the class
    # docs for more info.
    class Filter
      class InvalidDocumentException < StandardError; end

      # doc     - A String of HTML markup or an already parsed document.
      # context - Optional Hash of options; defaults to an empty Hash.
      # result  - Optional Hash filters may write to; defaults to an empty Hash.
      def initialize(doc, context = nil, result = nil)
        if doc.kind_of?(String)
          @html = doc.to_str
          @doc = nil
        else
          @doc = doc
          @html = nil
        end
        @context = context || {}
        @result = result || {}
      end

      # Public: Returns the Hash used to pass options into filters. It should
      # not be changed in place.
      attr_reader :context

      # Public: Returns a Hash used to allow filters to pass back information
      # to callers of the various Pipelines. This can be used for
      # #mentioned_users, for example.
      attr_reader :result

      # The Nokogiri::HTML::DocumentFragment to be manipulated. If the filter was
      # provided a String, parse into a DocumentFragment the first time this
      # method is called.
      def doc
        @doc ||= parse_html(html)
      end

      # The String representation of the document. If a DocumentFragment was
      # provided to the Filter, it is serialized into a String when this method is
      # called.
      #
      # Raises InvalidDocumentException when the filter was given nil.
      def html
        raise InvalidDocumentException if @html.nil? && @doc.nil?
        @html || doc.to_html
      end

      # The main filter entry point. Subclasses must override this; they should
      # modify the document in place or extract information and add it to the
      # result hash.
      #
      # Raises NotImplementedError in this base class.
      def call
        raise NotImplementedError
      end

      # The Repository object provided in the context hash, or nil when no
      # :repository was specified.
      #
      # It's assumed that the repository context has already been checked
      # for permissions
      def repository
        context[:repository]
      end

      # The User object provided in the context hash, or nil when no user
      # was specified
      def current_user
        context[:current_user]
      end

      # Return whether the filter can access a given repo while
      # applying a filter
      #
      # A repo can only be accessed if its pullable by the user who
      # submitted the content of this filter, or if it's the same as
      # the repository context in which the filter runs
      def can_access_repo?(repo)
        return false if repo.nil?
        return true if repo == repository
        repo.pullable_by?(current_user)
      end

      # The site's base URL provided in the context hash, or '/' when no
      # base URL was specified.
      def base_url
        context[:base_url] || '/'
      end

      # Hands the argument to HTML::Pipeline.parse and returns its result.
      def parse_html(html)
        HTML::Pipeline.parse(html)
      end

      # Helper method for filter subclasses used to determine if any of a node's
      # ancestors have one of the tag names specified.
      #
      # node - The Node object to check.
      # tags - An array of tag name strings to check. These should be downcase.
      #
      # Returns true when the node has a matching ancestor, false otherwise.
      def has_ancestor?(node, tags)
        # Walk up the tree until the parent chain runs out.
        while (node = node.parent)
          return true if tags.include?(node.name.downcase)
        end
        false
      end

      # Perform a filter on doc with the given context.
      #
      # Returns a HTML::Pipeline::DocumentFragment or a String containing HTML
      # markup.
      def self.call(doc, context = nil, result = nil)
        new(doc, context, result).call
      end

      # Like call but passes the filter's return value through
      # HTML::Pipeline.parse, so a document is returned even when the filter
      # returns a String.
      #
      # result - Optional Hash the filter may write extracted information to.
      def self.to_document(input, context = nil, result = nil)
        html = call(input, context, result)
        HTML::Pipeline::parse(html)
      end

      # Like call but guarantees that a string of HTML markup is returned.
      #
      # result - Optional Hash the filter may write extracted information to.
      def self.to_html(input, context = nil, result = nil)
        output = call(input, context, result)
        if output.respond_to?(:to_html)
          output.to_html
        else
          output.to_s
        end
      end
    end
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
module HTML
  class Pipeline
    # HTML Filter for replacing http github urls with https versions.
    class HttpsFilter < Filter
      # Rewrites the scheme of every <a> whose href starts with
      # "http://github.com" from http to https.
      #
      # Returns the (modified in place) document.
      def call
        insecure_links = doc.css('a[href^="http://github.com"]')
        insecure_links.each do |link|
          link['href'] = link['href'].sub(/^http:/,'https:')
        end
        doc
      end
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module HTML
  class Pipeline
    # This filter rewrites image tags with a max-width inline style and also wraps
    # the image in an <a> tag that causes the full size image to be opened in a
    # new tab.
    #
    # The max-width inline styles are especially useful in HTML email, which
    # doesn't use global stylesheets.
    class ImageMaxWidthFilter < Filter
      # Returns the (modified in place) document.
      def call
        doc.search('img').each do |element|
          # Images that already carry a style attribute are left alone.
          next if element['style']

          # Skip images whose src starts with "javascript", to avoid wrapping
          # a javascript: url in a link.
          next if element['src'].to_s.strip =~ /\Ajavascript/i

          element['style'] = "max-width:100%;"

          # Images already inside a link keep their existing link.
          link_image(element) unless has_ancestor?(element, %w(a))
        end

        doc
      end

      # Replaces the image with an <a target="_blank"> that points at the
      # image's src and contains a copy of the image.
      def link_image(element)
        anchor = doc.document.create_element('a', :href => element['src'], :target => '_blank')
        anchor.add_child(element.dup)
        element.replace(anchor)
      end
    end
  end
end
|