html-pipeline 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. data/.gitignore +19 -0
  2. data/.travis.yml +13 -0
  3. data/Gemfile +9 -0
  4. data/LICENSE +22 -0
  5. data/README.md +128 -0
  6. data/Rakefile +11 -0
  7. data/html-pipeline.gemspec +25 -0
  8. data/lib/html/pipeline.rb +130 -0
  9. data/lib/html/pipeline/@mention_filter.rb +118 -0
  10. data/lib/html/pipeline/autolink_filter.rb +22 -0
  11. data/lib/html/pipeline/body_content.rb +42 -0
  12. data/lib/html/pipeline/camo_filter.rb +64 -0
  13. data/lib/html/pipeline/email_reply_filter.rb +56 -0
  14. data/lib/html/pipeline/emoji_filter.rb +48 -0
  15. data/lib/html/pipeline/filter.rb +158 -0
  16. data/lib/html/pipeline/https_filter.rb +13 -0
  17. data/lib/html/pipeline/image_max_width_filter.rb +37 -0
  18. data/lib/html/pipeline/markdown_filter.rb +29 -0
  19. data/lib/html/pipeline/plain_text_input_filter.rb +11 -0
  20. data/lib/html/pipeline/sanitization_filter.rb +107 -0
  21. data/lib/html/pipeline/syntax_highlight_filter.rb +29 -0
  22. data/lib/html/pipeline/text_filter.rb +14 -0
  23. data/lib/html/pipeline/textile_filter.rb +21 -0
  24. data/lib/html/pipeline/toc_filter.rb +28 -0
  25. data/lib/html/pipeline/version.rb +5 -0
  26. data/test/html/pipeline/autolink_filter_test.rb +22 -0
  27. data/test/html/pipeline/camo_filter_test.rb +39 -0
  28. data/test/html/pipeline/emoji_filter_test.rb +16 -0
  29. data/test/html/pipeline/image_max_width_filter_test.rb +50 -0
  30. data/test/html/pipeline/markdown_filter_test.rb +101 -0
  31. data/test/html/pipeline/mention_filter_test.rb +158 -0
  32. data/test/html/pipeline/plain_text_input_filter_test.rb +22 -0
  33. data/test/html/pipeline/sanitization_filter_test.rb +47 -0
  34. data/test/html/pipeline/toc_filter_test.rb +47 -0
  35. data/test/test_helper.rb +38 -0
  36. metadata +221 -0
@@ -0,0 +1,22 @@
1
+ require 'rinku'
2
+
3
+ module HTML
4
+ class Pipeline
5
+ # HTML Filter for auto_linking urls in HTML.
6
+ #
7
+ # Context options:
8
+ # :autolink - boolean whether to autolink urls
9
+ # :flags - additional Rinku flags. See https://github.com/vmg/rinku
10
+ #
11
+ # This filter does not write additional information to the context.
12
+ class AutolinkFilter < Filter
13
# Auto-links URLs found in the markup using Rinku.
#
# Linking is skipped only when context[:autolink] is exactly `false`; a
# missing or nil option still links. Any value in context[:flags] is OR-ed
# into the (initially zero) Rinku flags. URLs inside <a>, <script>, <kbd>,
# <pre> and <code> elements are passed to Rinku as tags to skip.
#
# Returns the untouched html when disabled, otherwise whatever
# Rinku.auto_link returns.
def call
  if context[:autolink] == false
    html
  else
    extra_flags = context[:flags]
    rinku_flags = extra_flags ? (0 | extra_flags) : 0
    Rinku.auto_link(html, :urls, nil, %w[a script kbd pre code], rinku_flags)
  end
end
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,42 @@
1
+ module HTML
2
+ class Pipeline
3
+ # Public: Runs a String of content through an HTML processing pipeline,
4
+ # providing easy access to a generated DocumentFragment.
5
class BodyContent
  # Public: Initialize a BodyContent.
  #
  # body     - A String body.
  # context  - A Hash of context options for the filters.
  # pipeline - A HTML::Pipeline object with one or more Filters. Only
  #            `#call(body, context)` is invoked on it here.
  def initialize(body, context, pipeline)
    @body = body
    @context = context
    @pipeline = pipeline
  end

  # Public: Gets the memoized result of the body content as it passed through
  # the Pipeline. The pipeline runs on first access only.
  #
  # This explicit method replaces the former `attr_reader :result`, which was
  # immediately overridden by this definition.
  #
  # Returns whatever @pipeline.call returns (indexed with :output by #output).
  def result
    @result ||= @pipeline.call(@body, @context)
  end

  # Public: Gets the updated body from the Pipeline result (memoized).
  #
  # Returns the value stored under :output in the result.
  def output
    @output ||= result[:output]
  end

  # Public: Parses the output via HTML::Pipeline.parse (memoized).
  #
  # Returns whatever HTML::Pipeline.parse returns for the output.
  def document
    @document ||= HTML::Pipeline.parse(output)
  end
end
41
+ end
42
+ end
@@ -0,0 +1,64 @@
1
+ require 'openssl'
2
+
3
+ module HTML
4
+ class Pipeline
5
+ # HTML Filter for replacing http image URLs with camo versions. See:
6
+ #
7
+ # https://github.com/atmos/camo
8
+ #
9
+ # All images provided in user content should be run through this
10
+ # filter so that http image sources do not cause mixed-content warnings
11
+ # in browser clients.
12
+ #
13
+ # Context options:
14
+ # :asset_proxy - Base URL for constructed asset proxy URLs.
15
+ # :asset_proxy_secret_key - The shared secret used to encode URLs.
16
+ #
17
+ # This filter does not write additional information to the context.
18
+ class CamoFilter < Filter
19
+ # Hijacks images in the markup provided, replacing them with URLs that
20
+ # go through the github asset proxy.
21
# Hijacks images in the markup provided, replacing their sources with URLs
# that go through the asset proxy.
#
# For every <img> with a src attribute the value is stripped of surrounding
# whitespace and a plain-http github.com URL is upgraded to https. Sources
# that are still http, or that point at https://img.skitch.com/, are
# replaced with asset_proxy_url(src); any other source is written back in
# its normalized form.
#
# When context[:disable_asset_proxy] is set, no attribute is modified.
#
# Returns the doc.
def call
  # The option cannot change while iterating, so decide once up front.
  return doc if context[:disable_asset_proxy]

  doc.search("img").each do |element|
    raw_src = element['src']
    next if raw_src.nil?

    # \A anchors to the start of the whole value (^ would also match after
    # an embedded newline); the lookahead keeps hosts that merely start
    # with "github.com" (e.g. github.com.example.org) from being rewritten.
    src = raw_src.strip.sub(%r{\Ahttp://github\.com(?=[/:?#]|\z)}, 'https://github.com')

    if src =~ %r{\A(?:http:|https://img\.skitch\.com/)}
      element['src'] = asset_proxy_url(src)
    else
      element['src'] = src
    end
  end
  doc
end
36
+
37
+ # The camouflaged URL for a given image URL.
38
# Builds the proxied URL for an image URL.
#
# url - The String image URL to proxy.
#
# Returns a String of the form "<proxy host>/<HMAC digest>/<hex-encoded url>".
def asset_proxy_url(url)
  host = asset_proxy_host
  signature = asset_url_hash(url)
  encoded_url = hexencode(url)
  "#{host}/#{signature}/#{encoded_url}"
end
41
+
42
+ # Private: calculate the HMAC digest for a image source URL.
43
# Private: calculate the HMAC-SHA1 digest for an image source URL, keyed
# with the :asset_proxy_secret_key context option.
#
# url - The String image URL to sign.
#
# Returns the digest as a hex String.
def asset_url_hash(url)
  # OpenSSL::Digest.new is used instead of the OpenSSL::Digest::Digest
  # compatibility alias, which is deprecated.
  digest = OpenSSL::Digest.new('sha1')
  OpenSSL::HMAC.hexdigest(digest, asset_proxy_secret_key, url)
end
47
+
48
+ # Private: the hostname to use for generated asset proxied URLs.
49
# Private: the base of generated asset proxy URLs.
#
# Returns the :asset_proxy context option.
# Raises RuntimeError when the option is missing (nil or false).
def asset_proxy_host
  host = context[:asset_proxy]
  raise "Missing context :asset_proxy" unless host
  host
end
52
+
53
# Private: the shared secret used to sign proxied URLs.
#
# Returns the :asset_proxy_secret_key context option.
# Raises RuntimeError when the option is missing (nil or false).
def asset_proxy_secret_key
  secret = context[:asset_proxy_secret_key]
  raise "Missing context :asset_proxy_secret_key" unless secret
  secret
end
56
+
57
+ # Private: helper to hexencode a string. Each byte ends up encoded into
58
+ # two characters, zero padded value in the range [0-9a-f].
59
# Private: hex-encodes a string byte by byte.
#
# str - The String to encode.
#
# Returns a String in which every byte of str appears as two zero-padded
# lowercase hexadecimal characters.
def hexencode(str)
  hex_pairs = str.each_byte.map { |octet| format("%02x", octet) }
  hex_pairs.join
end
62
+ end
63
+ end
64
+ end
@@ -0,0 +1,56 @@
1
+ module HTML
2
+ class Pipeline
3
+ # HTML Filter that converts email reply text into an HTML DocumentFragment.
4
+ # It must be used as the first filter in a pipeline.
5
+ #
6
+ # Context options:
7
+ # None
8
+ #
9
+ # This filter does not write any additional information to the context hash.
10
+ class EmailReplyFilter < TextFilter
11
+ include EscapeUtils
12
+
13
+ EMAIL_HIDDEN_HEADER = %(<span class="email-hidden-toggle"><a href="#">&hellip;</a></span><div class="email-hidden-reply" style="display:none">).freeze
14
+ EMAIL_QUOTED_HEADER = %(<div class="email-quoted-reply">).freeze
15
+ EMAIL_SIGNATURE_HEADER = %(<div class="email-signature-reply">).freeze
16
+ EMAIL_FRAGMENT_HEADER = %(<div class="email-fragment">).freeze
17
+ EMAIL_HEADER_END = "</div>".freeze
18
+
19
+ # Scans an email body to determine which bits are quoted and which should
20
+ # be hidden. EmailReplyParser is used to split the comment into an Array
21
+ # of quoted or unquoted Blocks. Now, we loop through them and attempt to
22
+ # add <div> tags around them so we can hide the hidden blocks, and style
23
+ # the quoted blocks differently. Since multiple blocks may be hidden, be
24
+ # sure to keep the "email-hidden-reply" <div>s around "email-quoted-reply"
25
+ # <div> tags. Call this on each comment of a visible thread in the order
26
+ # that they are displayed. Note: all comments are processed so we can
27
+ # maintain a Set of SHAs of paragraphs. Only plaintext comments skip the
28
+ # markdown step.
29
+ #
30
+ # Returns the email comment HTML as a String
31
# Wraps every fragment reported by EmailReplyParser in a <div>.
#
# Each fragment's text is stripped, HTML-escaped and has one leading quote
# marker (">" or its escaped form) removed from the start of every line.
# Quoted fragments get the quoted-reply wrapper, signatures the signature
# wrapper and everything else the plain fragment wrapper. The first hidden
# fragment additionally opens the hidden-reply container, which is closed
# once after the last fragment.
#
# Returns the email comment HTML as a String, fragments joined by newlines.
def call
  hidden_opened = false

  blocks = EmailReplyParser.read(text.dup).fragments.map do |fragment|
    body = escape_html(fragment.to_s.strip).gsub(/^\s*(>|&gt;)/, '')

    opener =
      if fragment.quoted?
        EMAIL_QUOTED_HEADER
      elsif fragment.signature?
        EMAIL_SIGNATURE_HEADER
      else
        EMAIL_FRAGMENT_HEADER
      end
    wrapped = "#{opener}#{body}#{EMAIL_HEADER_END}"

    # Only the first hidden fragment opens the hidden container.
    if fragment.hidden? && !hidden_opened
      hidden_opened = true
      wrapped = "#{EMAIL_HIDDEN_HEADER}#{wrapped}"
    end

    wrapped
  end

  blocks << EMAIL_HEADER_END if hidden_opened
  blocks.join("\n")
end
54
+ end
55
+ end
56
+ end
@@ -0,0 +1,48 @@
1
+ require 'emoji'
2
+
3
+ module HTML
4
+ class Pipeline
5
+ # HTML filter that replaces :emoji: with images.
6
+ #
7
+ # Context:
8
+ # :asset_root - base url to link to emoji sprite
9
+ class EmojiFilter < Filter
10
+ # Build a regexp that matches all valid :emoji: names.
11
+ EmojiPattern = /:(#{Emoji.names.map { |name| Regexp.escape(name) }.join('|')}):/
12
+
13
# Replaces :emoji: names in the document's text nodes with images.
#
# Text nodes without a colon, and text nodes that have a <pre> or <code>
# ancestor, are left alone. A node is replaced only when the filtered
# markup differs from the original.
#
# Returns the doc.
def call
  doc.search('text()').each do |text_node|
    original = text_node.to_html
    next unless original.include?(':')
    next if has_ancestor?(text_node, %w(pre code))

    replaced = emoji_image_filter(original)
    text_node.replace(replaced) unless replaced == original
  end
  doc
end
24
+
25
+ # Replace :emoji: with corresponding images.
26
+ #
27
+ # text - String text to replace :emoji: in.
28
+ #
29
+ # Returns a String with :emoji: replaced with images.
30
# Replace :emoji: with corresponding images.
#
# text - String text to replace :emoji: in.
#
# The image source is built by joining asset_root, "emoji" and "<name>.png".
# asset_root is only consulted when a name actually matches.
#
# Returns a String with every EmojiPattern match replaced by an <img> tag,
# or text itself when it contains no colon.
def emoji_image_filter(text)
  return text unless text.include?(':')

  text.gsub(EmojiPattern) do
    name = Regexp.last_match(1)
    src = File.join(asset_root, "emoji", "#{name}.png")
    "<img class='emoji' title=':#{name}:' alt=':#{name}:' src='#{src}' height='20' width='20' align='absmiddle' />"
  end
end
38
+
39
+ # The base url to link emoji sprites
40
+ #
41
+ # Raises ArgumentError if context option has not been provided.
42
+ # Returns the context's asset_root.
43
# The base url emoji image sources are built from.
#
# Raises ArgumentError if the :asset_root context option is missing.
# Returns the context's :asset_root value.
def asset_root
  root = context[:asset_root]
  raise ArgumentError, "Missing context :asset_root" unless root
  root
end
46
+ end
47
+ end
48
+ end
@@ -0,0 +1,158 @@
1
+ module HTML
2
+ class Pipeline
3
+ # Base class for user content HTML filters. Each filter takes an
4
+ # HTML string or Nokogiri::HTML::DocumentFragment, performs
5
+ # modifications and/or writes information to the result hash. Filters must
6
+ # return a DocumentFragment (typically the same instance provided to the call
7
+ # method) or a String with HTML markup.
8
+ #
9
+ # Example filter that replaces all images with trollface:
10
+ #
11
+ # class FuuuFilter < HTML::Pipeline::Filter
12
+ # def call
13
+ # doc.search('img').each do |img|
14
+ # img['src'] = "http://paradoxdgn.com/junk/avatars/trollface.jpg"
15
+ # end
16
+ # end
+ # doc
17
+ # end
18
+ #
19
+ # The context Hash passes options to filters and should not be changed in
20
+ # place. A Result Hash allows filters to make extracted information
21
+ # available to the caller and is mutable.
22
+ #
23
+ # Common context options:
24
+ # :base_url - The site's base URL
25
+ # :repository - A Repository providing context for the HTML being processed
26
+ #
27
+ # Each filter may define additional options and output values. See the class
28
+ # docs for more info.
29
class Filter
  # Raised by #html when the filter holds neither markup nor a document.
  class InvalidDocumentException < StandardError; end

  # Public: Build a filter around a piece of content.
  #
  # doc     - A String of HTML markup, or an already parsed document. Any
  #           non-String value is stored as the document as-is.
  # context - Optional Hash of options for the filter (default: {}).
  # result  - Optional Hash filters write extracted information to
  #           (default: {}).
  def initialize(doc, context = nil, result = nil)
    if doc.kind_of?(String)
      @html = doc.to_str
      @doc = nil
    else
      @doc = doc
      @html = nil
    end
    @context = context || {}
    @result = result || {}
  end

  # Public: Returns the Hash of options passed into the filter. Information
  # extracted by a filter belongs in #result, not here.
  attr_reader :context

  # Public: Returns a Hash used to allow filters to pass back information
  # to callers of the various Pipelines. This can be used for
  # #mentioned_users, for example.
  attr_reader :result

  # The document to be manipulated. If the filter was provided a String, it
  # is parsed (via HTML::Pipeline.parse) the first time this method is
  # called and the parsed document is memoized.
  def doc
    @doc ||= parse_html(html)
  end

  # The String representation of the document. If a document was provided
  # to the Filter, it is serialized with #to_html when this method is
  # called. When the filter was built from a String, that original String is
  # returned, regardless of any later changes made through #doc.
  #
  # Raises InvalidDocumentException when neither markup nor a document is
  # available.
  def html
    raise InvalidDocumentException if @html.nil? && @doc.nil?
    @html || doc.to_html
  end

  # The main filter entry point. Subclasses should modify the document in
  # place or extract information and add it to the result hash.
  #
  # Raises NotImplementedError unless overridden.
  def call
    raise NotImplementedError
  end

  # The Repository object provided in the context hash, or nil when no
  # :repository was specified.
  #
  # It's assumed that the repository context has already been checked
  # for permissions.
  def repository
    context[:repository]
  end

  # The User object provided in the context hash, or nil when no
  # :current_user was specified.
  def current_user
    context[:current_user]
  end

  # Return whether the filter can access a given repo while applying a
  # filter.
  #
  # A repo can only be accessed if it's the same as the repository context
  # in which the filter runs, or if it is pullable by the current user.
  #
  # repo - The repository to check; nil is never accessible.
  #
  # Returns false for nil, true for the context repository, otherwise the
  # value of repo.pullable_by?(current_user).
  def can_access_repo?(repo)
    return false if repo.nil?
    return true if repo == repository
    repo.pullable_by?(current_user)
  end

  # The site's base URL provided in the context hash, or '/' when no
  # base URL was specified.
  def base_url
    context[:base_url] || '/'
  end

  # Ensure the passed argument is a document by delegating to
  # HTML::Pipeline.parse.
  def parse_html(html)
    HTML::Pipeline.parse(html)
  end

  # Helper method for filter subclasses used to determine if any of a node's
  # ancestors have one of the tag names specified.
  #
  # node - The Node object to check. The node itself is not tested, only
  #        the chain reached through #parent.
  # tags - An array of tag name strings to check. These should be downcase.
  #
  # Returns true when the node has a matching ancestor, false otherwise.
  def has_ancestor?(node, tags)
    while (node = node.parent)
      return true if tags.include?(node.name.downcase)
    end
    false
  end

  # Perform a filter on doc with the given context.
  #
  # Returns whatever the filter's #call returns: a document or a String
  # containing HTML markup.
  def self.call(doc, context = nil, result = nil)
    new(doc, context, result).call
  end

  # Like call but guarantees that a parsed document is returned, even when
  # the filter returns a String.
  #
  # result - Optional Hash forwarded to the filter, mirroring .call.
  def self.to_document(input, context = nil, result = nil)
    html = call(input, context, result)
    HTML::Pipeline.parse(html)
  end

  # Like call but guarantees that a string of HTML markup is returned.
  #
  # result - Optional Hash forwarded to the filter, mirroring .call.
  def self.to_html(input, context = nil, result = nil)
    output = call(input, context, result)
    if output.respond_to?(:to_html)
      output.to_html
    else
      output.to_s
    end
  end
end
157
+ end
158
+ end
@@ -0,0 +1,13 @@
1
+ module HTML
2
+ class Pipeline
3
+ # HTML Filter for replacing http github urls with https versions.
4
+ class HttpsFilter < Filter
5
# Rewrites http://github.com links to their https equivalents.
#
# The CSS prefix selector is only a coarse pre-filter: it also selects
# hosts that merely begin with "github.com" (e.g. github.com.example.org).
# Each candidate is therefore re-checked so that only github.com itself,
# followed by a path, port, query, fragment or nothing at all, is upgraded.
#
# Returns the doc.
def call
  doc.css('a[href^="http://github.com"]').each do |element|
    href = element['href']
    next unless href =~ %r{\Ahttp://github\.com(?=[/:?#]|\z)}

    element['href'] = href.sub(/\Ahttp:/, 'https:')
  end
  doc
end
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,37 @@
1
+ module HTML
2
+ class Pipeline
3
+ # This filter rewrites image tags with a max-width inline style and also wraps
4
+ # the image in an <a> tag that causes the full size image to be opened in a
5
+ # new tab.
6
+ #
7
+ # The max-width inline styles are especially useful in HTML emails, which
8
+ # don't use global stylesheets.
9
+ class ImageMaxWidthFilter < Filter
10
# Adds a max-width inline style to every image and links unlinked images to
# their source.
#
# Images that already carry a style attribute are skipped entirely, as are
# images whose trimmed src starts with "javascript" (case-insensitive), to
# avoid script injection through javascript: URLs. Images that have no <a>
# ancestor are wrapped via link_image.
#
# Returns the doc.
def call
  doc.search('img').each do |img|
    # An existing style attribute is left untouched.
    next if img['style']

    # Never style or link an image with a javascript: source.
    next if img['src'].to_s.strip =~ /\Ajavascript/i

    img['style'] = "max-width:100%;"
    link_image(img) unless has_ancestor?(img, %w(a))
  end

  doc
end
29
+
30
# Wraps an image in a link to its own source that opens in a new tab.
#
# element - The image node to wrap.
#
# A new <a> element (href = the image's src, target = "_blank") is created
# on the owning document, a copy of the image is added as its child, and
# the original image is replaced by the link.
#
# Returns the value of element.replace.
def link_image(element)
  attributes = { :href => element['src'], :target => '_blank' }
  anchor = doc.document.create_element('a', attributes)
  anchor.add_child(element.dup)
  element.replace(anchor)
end
35
+ end
36
+ end
37
+ end