html-pipeline 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (36) hide show
  1. data/.gitignore +19 -0
  2. data/.travis.yml +13 -0
  3. data/Gemfile +9 -0
  4. data/LICENSE +22 -0
  5. data/README.md +128 -0
  6. data/Rakefile +11 -0
  7. data/html-pipeline.gemspec +25 -0
  8. data/lib/html/pipeline.rb +130 -0
  9. data/lib/html/pipeline/@mention_filter.rb +118 -0
  10. data/lib/html/pipeline/autolink_filter.rb +22 -0
  11. data/lib/html/pipeline/body_content.rb +42 -0
  12. data/lib/html/pipeline/camo_filter.rb +64 -0
  13. data/lib/html/pipeline/email_reply_filter.rb +56 -0
  14. data/lib/html/pipeline/emoji_filter.rb +48 -0
  15. data/lib/html/pipeline/filter.rb +158 -0
  16. data/lib/html/pipeline/https_filter.rb +13 -0
  17. data/lib/html/pipeline/image_max_width_filter.rb +37 -0
  18. data/lib/html/pipeline/markdown_filter.rb +29 -0
  19. data/lib/html/pipeline/plain_text_input_filter.rb +11 -0
  20. data/lib/html/pipeline/sanitization_filter.rb +107 -0
  21. data/lib/html/pipeline/syntax_highlight_filter.rb +29 -0
  22. data/lib/html/pipeline/text_filter.rb +14 -0
  23. data/lib/html/pipeline/textile_filter.rb +21 -0
  24. data/lib/html/pipeline/toc_filter.rb +28 -0
  25. data/lib/html/pipeline/version.rb +5 -0
  26. data/test/html/pipeline/autolink_filter_test.rb +22 -0
  27. data/test/html/pipeline/camo_filter_test.rb +39 -0
  28. data/test/html/pipeline/emoji_filter_test.rb +16 -0
  29. data/test/html/pipeline/image_max_width_filter_test.rb +50 -0
  30. data/test/html/pipeline/markdown_filter_test.rb +101 -0
  31. data/test/html/pipeline/mention_filter_test.rb +158 -0
  32. data/test/html/pipeline/plain_text_input_filter_test.rb +22 -0
  33. data/test/html/pipeline/sanitization_filter_test.rb +47 -0
  34. data/test/html/pipeline/toc_filter_test.rb +47 -0
  35. data/test/test_helper.rb +38 -0
  36. metadata +221 -0
@@ -0,0 +1,22 @@
1
+ require 'rinku'
2
+
3
module HTML
  class Pipeline
    # Filter that turns bare URLs found in the HTML into links by handing the
    # markup to Rinku.
    #
    # Context options:
    #   :autolink - set to false to skip autolinking; any other value
    #               (including nil / not set) leaves autolinking enabled
    #   :flags    - additional Rinku flags, OR-ed onto a base of 0.
    #               See https://github.com/vmg/rinku
    #
    # This filter does not write additional information to the context.
    class AutolinkFilter < Filter
      # Returns the HTML String, autolinked unless :autolink is false.
      def call
        # Only an explicit false disables the filter.
        return html if context[:autolink] == false

        extra_flags = context[:flags]
        rinku_flags = extra_flags ? (0 | extra_flags) : 0

        # Tag list handed to Rinku (per Rinku's docs, tags whose content is
        # not autolinked).
        skipped_tags = %w[a script kbd pre code]
        Rinku.auto_link(html, :urls, nil, skipped_tags, rinku_flags)
      end
    end
  end
end
@@ -0,0 +1,42 @@
1
module HTML
  class Pipeline
    # Public: Runs a String of content through an HTML processing pipeline,
    # providing easy access to a generated DocumentFragment.
    #
    # The pipeline is invoked lazily, the first time #result (or anything built
    # on it) is requested, and every derived value is memoized.
    class BodyContent
      # Public: Initialize a BodyContent.
      #
      # body     - A String body.
      # context  - A Hash of context options for the filters.
      # pipeline - A HTML::Pipeline object with one or more Filters. Only
      #            #call(body, context) is invoked on it here.
      def initialize(body, context, pipeline)
        @body = body
        @context = context
        @pipeline = pipeline
      end

      # Public: Gets the memoized result of the body content as it passed through
      # the Pipeline.
      #
      # Defined as an explicit method only: a separate attr_reader for :result
      # would be overridden by this definition and merely trigger a
      # "method redefined" warning.
      #
      # Returns a Hash, or something similar as defined by @pipeline.result_class.
      def result
        @result ||= @pipeline.call(@body, @context)
      end

      # Public: Gets the updated body from the Pipeline result.
      #
      # Returns a String or DocumentFragment.
      def output
        @output ||= result[:output]
      end

      # Public: Parses the output into a DocumentFragment.
      #
      # Returns a DocumentFragment.
      def document
        @document ||= HTML::Pipeline.parse(output)
      end
    end
  end
end
@@ -0,0 +1,64 @@
1
+ require 'openssl'
2
+
3
module HTML
  class Pipeline
    # HTML Filter for replacing http image URLs with camo versions. See:
    #
    # https://github.com/atmos/camo
    #
    # All images provided in user content should be run through this
    # filter so that http image sources do not cause mixed-content warnings
    # in browser clients.
    #
    # Context options:
    #   :asset_proxy            - Base URL for constructed asset proxy URLs.
    #   :asset_proxy_secret_key - The shared secret used to encode URLs.
    #   :disable_asset_proxy    - When truthy, image sources are left exactly
    #                             as they were found.
    #
    # This filter does not write additional information to the context.
    class CamoFilter < Filter
      # Hijacks images in the markup provided, replacing them with URLs that
      # go through the github asset proxy.
      #
      # Every <img> with a src gets that src stripped of surrounding
      # whitespace and http://github.com upgraded to https. Sources that are
      # still plain http (or served from img.skitch.com over https) are then
      # rewritten to asset proxy URLs.
      #
      # Returns the DocumentFragment.
      def call
        # The option is loop-invariant and, when set, no element is modified,
        # so bail out before walking the images.
        return doc if context[:disable_asset_proxy]

        doc.search("img").each do |element|
          next if element['src'].nil?

          src = element['src'].strip
          src = src.sub(%r!^http://github.com!, 'https://github.com')

          needs_proxy = src =~ /^http:/ || src =~ /^https:\/\/img.skitch.com\//
          element['src'] = needs_proxy ? asset_proxy_url(src) : src
        end
        doc
      end

      # The camouflaged URL for a given image URL.
      #
      # url - The String image URL to proxy.
      #
      # Returns a String of the form "<asset_proxy>/<hmac>/<hex encoded url>".
      def asset_proxy_url(url)
        "#{asset_proxy_host}/#{asset_url_hash(url)}/#{hexencode(url)}"
      end

      # Private: calculate the HMAC digest for a image source URL.
      #
      # Uses OpenSSL::Digest directly; the OpenSSL::Digest::Digest alias is
      # deprecated and no longer defined by newer openssl releases.
      #
      # Returns the hex encoded HMAC-SHA1 String.
      def asset_url_hash(url)
        digest = OpenSSL::Digest.new('sha1')
        OpenSSL::HMAC.hexdigest(digest, asset_proxy_secret_key, url)
      end

      # Private: the hostname to use for generated asset proxied URLs.
      #
      # Raises RuntimeError when context[:asset_proxy] is not set.
      def asset_proxy_host
        context[:asset_proxy] or raise "Missing context :asset_proxy"
      end

      # Private: the shared secret used to sign proxied URLs.
      #
      # Raises RuntimeError when context[:asset_proxy_secret_key] is not set.
      def asset_proxy_secret_key
        context[:asset_proxy_secret_key] or raise "Missing context :asset_proxy_secret_key"
      end

      # Private: helper to hexencode a string. Each byte ends up encoded into
      # two characters, zero padded value in the range [0-9a-f].
      def hexencode(str)
        str.each_byte.map { |byte| "%02x" % byte }.join
      end
    end
  end
end
@@ -0,0 +1,56 @@
1
module HTML
  class Pipeline
    # HTML Filter that converts email reply text into an HTML DocumentFragment.
    # It must be used as the first filter in a pipeline.
    #
    # Context options:
    #   None
    #
    # This filter does not write any additional information to the context hash.
    class EmailReplyFilter < TextFilter
      include EscapeUtils

      EMAIL_HIDDEN_HEADER    = %(<span class="email-hidden-toggle"><a href="#">&hellip;</a></span><div class="email-hidden-reply" style="display:none">).freeze
      EMAIL_QUOTED_HEADER    = %(<div class="email-quoted-reply">).freeze
      EMAIL_SIGNATURE_HEADER = %(<div class="email-signature-reply">).freeze
      EMAIL_FRAGMENT_HEADER  = %(<div class="email-fragment">).freeze
      EMAIL_HEADER_END       = "</div>".freeze

      # Splits the email body into fragments with EmailReplyParser and wraps
      # each one in a <div> describing its kind (quoted, signature or plain
      # fragment) so the kinds can be styled differently.
      #
      # The first hidden fragment additionally opens a single
      # "email-hidden-reply" <div>; that wrapper stays open across all later
      # fragments and is closed once, after the last one.
      #
      # Leading ">" (or its escaped form "&gt;") quote markers are removed from
      # the start of every line of the escaped fragment text.
      #
      # Returns the email comment HTML as a String
      def call
        hidden_opened = false

        blocks = EmailReplyParser.read(text.dup).fragments.map do |fragment|
          body = escape_html(fragment.to_s.strip).gsub(/^\s*(>|&gt;)/, '')

          opening =
            if fragment.quoted?
              EMAIL_QUOTED_HEADER
            elsif fragment.signature?
              EMAIL_SIGNATURE_HEADER
            else
              EMAIL_FRAGMENT_HEADER
            end
          markup = "#{opening}#{body}#{EMAIL_HEADER_END}"

          # Only the first hidden fragment opens the hidden wrapper.
          if fragment.hidden? && !hidden_opened
            hidden_opened = true
            markup = "#{EMAIL_HIDDEN_HEADER}#{markup}"
          end

          markup
        end

        # Close the hidden wrapper, if one was opened.
        blocks << EMAIL_HEADER_END if hidden_opened
        blocks.join("\n")
      end
    end
  end
end
@@ -0,0 +1,48 @@
1
+ require 'emoji'
2
+
3
module HTML
  class Pipeline
    # HTML filter that replaces :emoji: with images.
    #
    # Context:
    #   :asset_root - base url to link to emoji sprite (required)
    class EmojiFilter < Filter
      # Regexp matching every valid :emoji: name; the bare name is capture
      # group 1.
      EmojiPattern = /:(#{Emoji.names.map { |name| Regexp.escape(name) }.join('|')}):/

      # Walks every text node and swaps :emoji: occurrences for <img> tags.
      # Text nested inside <pre> or <code> is left alone.
      #
      # Returns the DocumentFragment.
      def call
        doc.search('text()').each do |text_node|
          original = text_node.to_html
          next unless original.include?(':')
          next if has_ancestor?(text_node, %w(pre code))

          replaced = emoji_image_filter(original)
          text_node.replace(replaced) unless replaced == original
        end
        doc
      end

      # Replace :emoji: with corresponding images.
      #
      # text - String text to replace :emoji: in.
      #
      # Returns a String with :emoji: replaced with images.
      def emoji_image_filter(text)
        return text unless text.include?(':')

        text.gsub(EmojiPattern) do
          name = Regexp.last_match(1)
          src = File.join(asset_root, "emoji", "#{name}.png")
          "<img class='emoji' title=':#{name}:' alt=':#{name}:' src='#{src}' height='20' width='20' align='absmiddle' />"
        end
      end

      # The base url to link emoji sprites
      #
      # Raises ArgumentError if context option has not been provided.
      # Returns the context's asset_root.
      def asset_root
        context[:asset_root] || raise(ArgumentError, "Missing context :asset_root")
      end
    end
  end
end
@@ -0,0 +1,158 @@
1
module HTML
  class Pipeline
    # Base class for user content HTML filters. Each filter takes an
    # HTML string or Nokogiri::HTML::DocumentFragment, performs
    # modifications and/or writes information to the result hash. Filters must
    # return a DocumentFragment (typically the same instance provided to the call
    # method) or a String with HTML markup.
    #
    # Example filter that replaces all images with trollface:
    #
    #   class FuuuFilter < HTML::Pipeline::Filter
    #     def call
    #       doc.search('img').each do |img|
    #         img['src'] = "http://paradoxdgn.com/junk/avatars/trollface.jpg"
    #       end
    #       doc
    #     end
    #   end
    #
    # The context Hash passes options to filters and should not be changed in
    # place. A Result Hash allows filters to make extracted information
    # available to the caller and is mutable.
    #
    # Common context options:
    #   :base_url   - The site's base URL
    #   :repository - A Repository providing context for the HTML being processed
    #
    # Each filter may define additional options and output values. See the class
    # docs for more info.
    class Filter
      # Raised by #html when the filter was given neither markup nor a document.
      class InvalidDocumentException < StandardError; end

      # doc     - A String of HTML markup, or an already parsed document
      #           (anything that is not a String is stored as the document).
      # context - A Hash of options for the filter (default: {}).
      # result  - A Hash the filter may write extracted information to
      #           (default: {}).
      def initialize(doc, context = nil, result = nil)
        if doc.kind_of?(String)
          @html = doc.to_str
          @doc = nil
        else
          @doc = doc
          @html = nil
        end
        @context = context || {}
        @result = result || {}
      end

      # Public: Returns a simple Hash used to pass extra information into
      # filters. It should be treated as read-only.
      attr_reader :context

      # Public: Returns a Hash used to allow filters to pass back information
      # to callers of the various Pipelines. This can be used for
      # #mentioned_users, for example.
      attr_reader :result

      # The Nokogiri::HTML::DocumentFragment to be manipulated. If the filter was
      # provided a String, parse into a DocumentFragment the first time this
      # method is called.
      def doc
        @doc ||= parse_html(html)
      end

      # The String representation of the document. If a DocumentFragment was
      # provided to the Filter, it is serialized into a String when this method is
      # called.
      #
      # Raises InvalidDocumentException when the filter holds neither a String
      # nor a document.
      def html
        raise InvalidDocumentException if @html.nil? && @doc.nil?
        @html || doc.to_html
      end

      # The main filter entry point. The doc attribute is guaranteed to be a
      # Nokogiri::HTML::DocumentFragment when invoked. Subclasses should modify
      # this document in place or extract information and add it to the result
      # hash.
      #
      # Raises NotImplementedError unless overridden by a subclass.
      def call
        raise NotImplementedError
      end

      # The Repository object provided in the context hash, or nil when no
      # :repository was specified.
      #
      # It's assumed that the repository context has already been checked
      # for permissions
      def repository
        context[:repository]
      end

      # The User object provided in the context hash, or nil when no user
      # was specified
      def current_user
        context[:current_user]
      end

      # Return whether the filter can access a given repo while
      # applying a filter
      #
      # A repo can only be accessed if its pullable by the user who
      # submitted the content of this filter, or if it's the same as
      # the repository context in which the filter runs
      def can_access_repo?(repo)
        return false if repo.nil?
        return true if repo == repository
        repo.pullable_by?(current_user)
      end

      # The site's base URL provided in the context hash, or '/' when no
      # base URL was specified.
      def base_url
        context[:base_url] || '/'
      end

      # Ensure the passed argument is a DocumentFragment. When a string is
      # provided, it is parsed and returned; otherwise, the DocumentFragment is
      # returned unmodified. The work is delegated to HTML::Pipeline.parse.
      def parse_html(html)
        HTML::Pipeline.parse(html)
      end

      # Helper method for filter subclasses used to determine if any of a node's
      # ancestors have one of the tag names specified.
      #
      # node - The Node object to check.
      # tags - An array of tag name strings to check. These should be downcase.
      #
      # Returns true when the node has a matching ancestor, false otherwise.
      def has_ancestor?(node, tags)
        # Walk up the tree until the parent chain ends (parent is nil).
        while (node = node.parent)
          return true if tags.include?(node.name.downcase)
        end
        false
      end

      # Perform a filter on doc with the given context.
      #
      # Returns a HTML::Pipeline::DocumentFragment or a String containing HTML
      # markup.
      def self.call(doc, context = nil, result = nil)
        new(doc, context, result).call
      end

      # Like call but guarantees that a DocumentFragment is returned, even when
      # the last filter returns a String.
      def self.to_document(input, context = nil)
        html = call(input, context)
        HTML::Pipeline.parse(html)
      end

      # Like call but guarantees that a string of HTML markup is returned.
      def self.to_html(input, context = nil)
        output = call(input, context)
        if output.respond_to?(:to_html)
          output.to_html
        else
          output.to_s
        end
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
module HTML
  class Pipeline
    # HTML Filter that upgrades links pointing at http://github.com to their
    # https equivalents.
    class HttpsFilter < Filter
      # Rewrites the href of every <a> whose href starts with
      # "http://github.com".
      #
      # Returns the DocumentFragment.
      def call
        insecure_links = doc.css('a[href^="http://github.com"]')
        insecure_links.each do |anchor|
          anchor['href'] = anchor['href'].sub(/^http:/,'https:')
        end
        doc
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
module HTML
  class Pipeline
    # This filter rewrites image tags with a max-width inline style and also
    # wraps the image in an <a> tag that opens the full size image in a new
    # tab.
    #
    # The max-width inline styles are especially useful in HTML emails, which
    # don't use a global stylesheet.
    class ImageMaxWidthFilter < Filter
      # Returns the DocumentFragment.
      def call
        doc.search('img').each do |image|
          # Leave images that already carry a style attribute untouched.
          next if image['style']

          # Skip sources starting with "javascript" so no javascript: URL
          # ends up as the href of the generated link.
          next if image['src'].to_s.strip =~ /\Ajavascript/i

          image['style'] = "max-width:100%;"

          # Images already inside a link keep their existing link.
          link_image(image) unless has_ancestor?(image, %w(a))
        end

        doc
      end

      # Replaces the image with an <a target="_blank"> pointing at the image's
      # src and containing a copy of the image.
      #
      # element - The <img> node to wrap.
      def link_image(element)
        wrapper = doc.document.create_element('a', :href => element['src'], :target => '_blank')
        wrapper.add_child(element.dup)
        element.replace(wrapper)
      end
    end
  end
end