geothird-html-pipeline 0.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/.travis.yml +13 -0
  4. data/CHANGELOG.md +43 -0
  5. data/Gemfile +9 -0
  6. data/LICENSE +22 -0
  7. data/README.md +274 -0
  8. data/Rakefile +11 -0
  9. data/bin/html-pipeline +80 -0
  10. data/geothird-html-pipeline.gemspec +27 -0
  11. data/lib/html/pipeline.rb +198 -0
  12. data/lib/html/pipeline/@mention_filter.rb +121 -0
  13. data/lib/html/pipeline/absolute_source_filter.rb +48 -0
  14. data/lib/html/pipeline/autolink_filter.rb +22 -0
  15. data/lib/html/pipeline/body_content.rb +42 -0
  16. data/lib/html/pipeline/camo_filter.rb +70 -0
  17. data/lib/html/pipeline/email_reply_filter.rb +56 -0
  18. data/lib/html/pipeline/emoji_filter.rb +54 -0
  19. data/lib/html/pipeline/filter.rb +178 -0
  20. data/lib/html/pipeline/https_filter.rb +13 -0
  21. data/lib/html/pipeline/image_max_width_filter.rb +37 -0
  22. data/lib/html/pipeline/markdown_filter.rb +29 -0
  23. data/lib/html/pipeline/plain_text_input_filter.rb +11 -0
  24. data/lib/html/pipeline/sanitization_filter.rb +105 -0
  25. data/lib/html/pipeline/syntax_highlight_filter.rb +33 -0
  26. data/lib/html/pipeline/text_filter.rb +14 -0
  27. data/lib/html/pipeline/textile_filter.rb +21 -0
  28. data/lib/html/pipeline/toc_filter.rb +28 -0
  29. data/lib/html/pipeline/version.rb +5 -0
  30. data/test/helpers/mocked_instrumentation_service.rb +17 -0
  31. data/test/html/pipeline/absolute_source_filter_test.rb +56 -0
  32. data/test/html/pipeline/autolink_filter_test.rb +22 -0
  33. data/test/html/pipeline/camo_filter_test.rb +47 -0
  34. data/test/html/pipeline/emoji_filter_test.rb +18 -0
  35. data/test/html/pipeline/image_max_width_filter_test.rb +50 -0
  36. data/test/html/pipeline/markdown_filter_test.rb +101 -0
  37. data/test/html/pipeline/mention_filter_test.rb +156 -0
  38. data/test/html/pipeline/plain_text_input_filter_test.rb +22 -0
  39. data/test/html/pipeline/sanitization_filter_test.rb +47 -0
  40. data/test/html/pipeline/toc_filter_test.rb +47 -0
  41. data/test/html/pipeline_test.rb +74 -0
  42. data/test/test_helper.rb +38 -0
  43. metadata +213 -0
@@ -0,0 +1,70 @@
1
+ require 'openssl'
2
+
3
module HTML
  class Pipeline
    # HTML Filter for replacing http image URLs with camo versions. See:
    #
    # https://github.com/atmos/camo
    #
    # All images provided in user content should be run through this
    # filter so that http image sources do not cause mixed-content warnings
    # in browser clients.
    #
    # Context options:
    #   :asset_proxy (required) - Base URL for constructed asset proxy URLs.
    #   :asset_proxy_secret_key (required) - The shared secret used to encode URLs.
    #   :disable_asset_proxy - When truthy, image sources are left untouched.
    #
    # This filter does not write additional information to the context.
    class CamoFilter < Filter
      # Rewrites the src of every <img> in the document so that insecure
      # sources are fetched through the asset proxy.
      #
      # Returns the (modified in place) document.
      def call
        # Nothing is rewritten when the proxy is disabled, so skip the scan.
        return doc if context[:disable_asset_proxy]

        doc.search("img").each do |element|
          next if element['src'].nil?
          src = element['src'].strip
          # github.com itself is reachable over https; upgrade instead of proxying.
          src = src.sub(%r{\Ahttp://github\.com}, 'https://github.com')
          element['src'] = proxy_required?(src) ? asset_proxy_url(src) : src
        end
        doc
      end

      # Implementation of validate hook.
      # Errors should raise exceptions or use an existing validator.
      def validate
        needs :asset_proxy, :asset_proxy_secret_key
      end

      # The camouflaged URL for a given image URL.
      #
      # url - String image source URL.
      #
      # Returns a String of the form "<proxy>/<hmac>/<hex encoded url>".
      def asset_proxy_url(url)
        "#{asset_proxy_host}/#{asset_url_hash(url)}/#{hexencode(url)}"
      end

      # Calculate the HMAC-SHA1 hex digest for an image source URL, keyed
      # with the shared secret from the context.
      def asset_url_hash(url)
        # OpenSSL::Digest::Digest is a deprecated alias that newer openssl
        # releases no longer define; OpenSSL::Digest is the supported class.
        digest = OpenSSL::Digest.new('sha1')
        OpenSSL::HMAC.hexdigest(digest, asset_proxy_secret_key, url)
      end

      # The base URL to use for generated asset proxied URLs.
      def asset_proxy_host
        context[:asset_proxy]
      end

      # The shared secret used to sign proxied URLs.
      def asset_proxy_secret_key
        context[:asset_proxy_secret_key]
      end

      # Helper to hexencode a string. Each byte ends up encoded into
      # two characters, zero padded value in the range [0-9a-f].
      def hexencode(str)
        str.unpack('H*').first
      end

      private

      # Whether the given source must go through the proxy: any plain http
      # URL, plus https URLs on img.skitch.com. URL schemes and hosts are
      # case-insensitive, and \A anchors to the start of the whole string
      # rather than the start of any line.
      def proxy_required?(src)
        !!(src =~ /\Ahttp:/i || src =~ %r{\Ahttps://img\.skitch\.com/}i)
      end
    end
  end
end
@@ -0,0 +1,56 @@
1
module HTML
  class Pipeline
    # HTML Filter that converts email reply text into an HTML DocumentFragment.
    # It must be used as the first filter in a pipeline.
    #
    # Context options:
    #   None
    #
    # This filter does not write any additional information to the context hash.
    class EmailReplyFilter < TextFilter
      include EscapeUtils

      EMAIL_HIDDEN_HEADER    = %(<span class="email-hidden-toggle"><a href="#">&hellip;</a></span><div class="email-hidden-reply" style="display:none">).freeze
      EMAIL_QUOTED_HEADER    = %(<div class="email-quoted-reply">).freeze
      EMAIL_SIGNATURE_HEADER = %(<div class="email-signature-reply">).freeze
      EMAIL_FRAGMENT_HEADER  = %(<div class="email-fragment">).freeze
      EMAIL_HEADER_END       = "</div>".freeze

      # Splits the email body into fragments with EmailReplyParser and wraps
      # each one in a <div> whose class reflects its kind (quoted, signature
      # or plain fragment). The first hidden fragment additionally opens an
      # "email-hidden-reply" wrapper, which stays open around every following
      # fragment and is closed once, after the last one.
      #
      # Returns the email comment HTML as a String.
      def call
        hidden_opened = nil

        blocks = EmailReplyParser.read(text.dup).fragments.map do |fragment|
          # Escape first, then drop the leading quote marker of each line.
          body = escape_html(fragment.to_s.strip).gsub(/^\s*(>|&gt;)/, '')

          opener =
            if fragment.quoted?
              EMAIL_QUOTED_HEADER
            elsif fragment.signature?
              EMAIL_SIGNATURE_HEADER
            else
              EMAIL_FRAGMENT_HEADER
            end

          # Only the first hidden fragment opens the hidden wrapper.
          prefix = ''
          if fragment.hidden? && !hidden_opened
            hidden_opened = true
            prefix = EMAIL_HIDDEN_HEADER
          end

          "#{prefix}#{opener}#{body}#{EMAIL_HEADER_END}"
        end

        blocks << EMAIL_HEADER_END if hidden_opened
        blocks.join("\n")
      end
    end
  end
end
@@ -0,0 +1,54 @@
1
+ require 'emoji'
2
+
3
module HTML
  class Pipeline
    # HTML filter that swaps :emoji: shortcodes found in text nodes for
    # <img> tags.
    #
    # Context:
    #   :asset_root (required) - base url to link to emoji sprite
    class EmojiFilter < Filter
      # Regexp matching any valid :emoji: name; the bare name is capture 1.
      EmojiPattern = /:(#{Emoji.names.map { |name| Regexp.escape(name) }.join('|')}):/

      # Walks every text node, skipping those inside <pre> or <code>, and
      # replaces the node when at least one emoji was substituted.
      #
      # Returns the document.
      def call
        doc.search('text()').each do |text_node|
          markup = text_node.to_html
          next unless markup.include?(':')
          next if has_ancestor?(text_node, %w(pre code))

          replaced = emoji_image_filter(markup)
          text_node.replace(replaced) unless replaced == markup
        end
        doc
      end

      # Implementation of validate hook.
      # Errors should raise exceptions or use an existing validator.
      def validate
        needs :asset_root
      end

      # Replace :emoji: with corresponding images.
      #
      # text - String text to replace :emoji: in.
      #
      # Returns a String with :emoji: replaced with images.
      def emoji_image_filter(text)
        return text unless text.include?(':')

        text.gsub(EmojiPattern) do
          name = Regexp.last_match(1)
          src = File.join(asset_root, "emoji", "#{name}.png")
          "<img class='emoji' title=':#{name}:' alt=':#{name}:' src='#{src}' height='20' width='20' align='absmiddle' />"
        end
      end

      # The base url to link emoji sprites, read from the context's
      # :asset_root (whose presence is checked by #validate).
      def asset_root
        context[:asset_root]
      end
    end
  end
end
@@ -0,0 +1,178 @@
1
module HTML
  class Pipeline
    # Base class for user content HTML filters. Each filter takes an
    # HTML string or Nokogiri::HTML::DocumentFragment, performs
    # modifications and/or writes information to the result hash. Filters must
    # return a DocumentFragment (typically the same instance provided to the call
    # method) or a String with HTML markup.
    #
    # Example filter that replaces all images with trollface:
    #
    #   class FuuuFilter < HTML::Pipeline::Filter
    #     def call
    #       doc.search('img').each do |img|
    #         img['src'] = "http://paradoxdgn.com/junk/avatars/trollface.jpg"
    #       end
    #     end
    #   end
    #
    # The context Hash passes options to filters and should not be changed in
    # place. A Result Hash allows filters to make extracted information
    # available to the caller and is mutable.
    #
    # Common context options:
    #   :base_url   - The site's base URL
    #   :repository - A Repository providing context for the HTML being processed
    #
    # Each filter may define additional options and output values. See the class
    # docs for more info.
    class Filter
      # Raised by #html when the filter was given neither a String nor a
      # document.
      class InvalidDocumentException < StandardError; end

      # doc     - String of HTML markup, or an already parsed document.
      # context - Hash of options for the filter (default: empty Hash).
      # result  - Hash the filter may write extracted information to
      #           (default: empty Hash).
      #
      # Runs #validate, so subclasses with required context keys raise
      # ArgumentError at construction time.
      def initialize(doc, context = nil, result = nil)
        if doc.kind_of?(String)
          @html = doc.to_str
          @doc = nil
        else
          @doc = doc
          @html = nil
        end
        @context = context || {}
        @result = result || {}
        validate
      end

      # Public: Returns the Hash of options passed into the filter. It should
      # not be changed in place; use #result to hand information back.
      attr_reader :context

      # Public: Returns a Hash used to allow filters to pass back information
      # to callers of the various Pipelines. This can be used for
      # #mentioned_users, for example.
      attr_reader :result

      # The Nokogiri::HTML::DocumentFragment to be manipulated. If the filter was
      # provided a String, parse into a DocumentFragment the first time this
      # method is called.
      def doc
        @doc ||= parse_html(html)
      end

      # The String representation of the document. If a DocumentFragment was
      # provided to the Filter, it is serialized into a String when this method is
      # called.
      #
      # Raises InvalidDocumentException when the filter holds neither a String
      # nor a document.
      def html
        raise InvalidDocumentException if @html.nil? && @doc.nil?
        @html || doc.to_html
      end

      # The main filter entry point. The doc attribute is guaranteed to be a
      # Nokogiri::HTML::DocumentFragment when invoked. Subclasses should modify
      # this document in place or extract information and add it to the result
      # hash.
      def call
        raise NotImplementedError
      end

      # Make sure the context has everything we need. Noop: Subclasses can override.
      def validate
      end

      # The Repository object provided in the context hash, or nil when no
      # :repository was specified.
      #
      # It's assumed that the repository context has already been checked
      # for permissions
      def repository
        context[:repository]
      end

      # The User object provided in the context hash, or nil when no user
      # was specified
      def current_user
        context[:current_user]
      end

      # Return whether the filter can access a given repo while
      # applying a filter
      #
      # A repo can only be accessed if it's pullable by the user who
      # submitted the content of this filter, or if it's the same as
      # the repository context in which the filter runs
      def can_access_repo?(repo)
        return false if repo.nil?
        return true if repo == repository
        repo.pullable_by?(current_user)
      end

      # The site's base URL provided in the context hash, or '/' when no
      # base URL was specified.
      def base_url
        context[:base_url] || '/'
      end

      # Ensure the passed argument is a DocumentFragment. Delegates to
      # HTML::Pipeline.parse.
      def parse_html(html)
        HTML::Pipeline.parse(html)
      end

      # Helper method for filter subclasses used to determine if any of a node's
      # ancestors have one of the tag names specified.
      #
      # node - The Node object to check.
      # tags - An array of tag name strings to check. These should be downcase.
      #
      # Returns true when the node has a matching ancestor, false otherwise.
      def has_ancestor?(node, tags)
        # Walk up the parent chain until it runs out (parent returns nil).
        while (node = node.parent)
          return true if tags.include?(node.name.downcase)
        end
        false
      end

      # Perform a filter on doc with the given context.
      #
      # Returns a HTML::Pipeline::DocumentFragment or a String containing HTML
      # markup.
      def self.call(doc, context = nil, result = nil)
        new(doc, context, result).call
      end

      # Like call but guarantees that a DocumentFragment is returned, even when
      # the last filter returns a String.
      def self.to_document(input, context = nil)
        html = call(input, context)
        HTML::Pipeline::parse(html)
      end

      # Like call but guarantees that a string of HTML markup is returned.
      def self.to_html(input, context = nil)
        output = call(input, context)
        if output.respond_to?(:to_html)
          output.to_html
        else
          output.to_s
        end
      end

      # Validator for required context. Checks that every given key is
      # present in the context hash.
      #
      # keys - One or more context keys the filter requires.
      #
      # Raises ArgumentError with a message naming this filter class and
      # listing all the missing keys.
      def needs(*keys)
        missing = keys.reject { |key| context.include? key }

        if missing.any?
          raise ArgumentError,
            "Missing context keys for #{self.class.name}: #{missing.map(&:inspect).join ', '}"
        end
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
module HTML
  class Pipeline
    # HTML Filter that upgrades links pointing at http://github.com to
    # their https equivalent.
    class HttpsFilter < Filter
      # Rewrites the href of every matching <a> in place.
      #
      # Returns the document.
      def call
        insecure_links = doc.css('a[href^="http://github.com"]')
        insecure_links.each do |link|
          link['href'] = link['href'].sub(/^http:/,'https:')
        end
        doc
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
module HTML
  class Pipeline
    # This filter gives image tags a max-width inline style and wraps each
    # image that is not already inside a link in an <a> tag that opens the
    # full size image in a new tab.
    #
    # The max-width inline styles are especially useful in HTML email, which
    # doesn't use a global stylesheet.
    class ImageMaxWidthFilter < Filter
      # Returns the document with its <img> elements restyled and linked.
      def call
        doc.search('img').each do |img|
          # Leave images that already carry a style attribute alone, and
          # skip javascript: sources to avoid js injection via the link.
          next if img['style'] || img['src'].to_s.strip =~ /\Ajavascript/i

          img['style'] = "max-width:100%;"
          link_image(img) unless has_ancestor?(img, %w(a))
        end

        doc
      end

      # Replaces the image with an <a target="_blank"> pointing at the image
      # source and containing a copy of the image.
      #
      # element - The <img> node to wrap.
      def link_image(element)
        anchor = doc.document.create_element('a', :href => element['src'], :target => '_blank')
        anchor.add_child(element.dup)
        element.replace(anchor)
      end
    end
  end
end
@@ -0,0 +1,29 @@
1
+ require 'github/markdown'
2
+
3
module HTML
  class Pipeline
    # HTML Filter that converts Markdown text into HTML. Unlike most filters
    # it takes non-HTML input, so it must be used as the first filter in a
    # pipeline.
    #
    # Context options:
    #   :gfm => false    Disable GFM line-end processing
    #
    # This filter does not write any additional information to the context hash.
    class MarkdownFilter < TextFilter
      # Same arguments as the parent constructor; carriage returns are
      # removed from the text before it is rendered.
      def initialize(text, context = nil, result = nil)
        super(text, context, result)
        @text = @text.gsub("\r", '')
      end

      # Renders the text with GitHub::Markdown, in :gfm mode unless the
      # context sets :gfm to false.
      #
      # Returns the HTML String with trailing whitespace removed.
      def call
        mode = context[:gfm] == false ? :markdown : :gfm
        GitHub::Markdown.to_html(@text, mode).rstrip
      end
    end
  end
end
@@ -0,0 +1,11 @@
1
module HTML
  class Pipeline
    # Simple filter for plain text input. HTML escapes the text input and wraps it
    # in a div.
    class PlainTextInputFilter < TextFilter
      # Returns a String: the escaped text surrounded by <div>…</div>.
      def call
        escaped = EscapeUtils.escape_html(@text, false)
        "<div>#{escaped}</div>"
      end
    end
  end
end