html-pipeline-no-charlock 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. data/.gitignore +19 -0
  2. data/.travis.yml +13 -0
  3. data/CHANGELOG.md +16 -0
  4. data/Gemfile +9 -0
  5. data/LICENSE +22 -0
  6. data/README.md +221 -0
  7. data/Rakefile +13 -0
  8. data/html-pipeline-no-charlock.gemspec +25 -0
  9. data/html-pipeline.gemspec +26 -0
  10. data/lib/html/pipeline.rb +130 -0
  11. data/lib/html/pipeline/@mention_filter.rb +118 -0
  12. data/lib/html/pipeline/autolink_filter.rb +22 -0
  13. data/lib/html/pipeline/body_content.rb +42 -0
  14. data/lib/html/pipeline/camo_filter.rb +70 -0
  15. data/lib/html/pipeline/email_reply_filter.rb +56 -0
  16. data/lib/html/pipeline/emoji_filter.rb +54 -0
  17. data/lib/html/pipeline/filter.rb +178 -0
  18. data/lib/html/pipeline/https_filter.rb +13 -0
  19. data/lib/html/pipeline/image_max_width_filter.rb +37 -0
  20. data/lib/html/pipeline/markdown_filter.rb +29 -0
  21. data/lib/html/pipeline/plain_text_input_filter.rb +11 -0
  22. data/lib/html/pipeline/sanitization_filter.rb +105 -0
  23. data/lib/html/pipeline/syntax_highlight_filter.rb +29 -0
  24. data/lib/html/pipeline/text_filter.rb +14 -0
  25. data/lib/html/pipeline/textile_filter.rb +21 -0
  26. data/lib/html/pipeline/toc_filter.rb +28 -0
  27. data/lib/html/pipeline/version.rb +5 -0
  28. data/test/html/pipeline/autolink_filter_test.rb +22 -0
  29. data/test/html/pipeline/camo_filter_test.rb +47 -0
  30. data/test/html/pipeline/emoji_filter_test.rb +18 -0
  31. data/test/html/pipeline/image_max_width_filter_test.rb +50 -0
  32. data/test/html/pipeline/markdown_filter_test.rb +101 -0
  33. data/test/html/pipeline/mention_filter_test.rb +158 -0
  34. data/test/html/pipeline/plain_text_input_filter_test.rb +22 -0
  35. data/test/html/pipeline/sanitization_filter_test.rb +47 -0
  36. data/test/html/pipeline/toc_filter_test.rb +47 -0
  37. data/test/test_helper.rb +38 -0
  38. metadata +214 -0
@@ -0,0 +1,118 @@
1
+ require 'set'
2
+
3
+ module HTML
4
+ class Pipeline
5
# HTML filter that replaces @user mentions with links. Text nodes nested
# inside <pre>, <code>, and <a> elements are left untouched.
#
# Context options:
#   :base_url - Used to construct links to user profile pages for each
#               mention.
#   :info_url - Used to link to "more info" when someone mentions one of the
#               reserved names in MentionLogins (e.g. @mention, @mentioned).
#
class MentionFilter < Filter
  # Public: Find user @mentions in text. See
  # MentionFilter#mention_link_filter.
  #
  #   MentionFilter.mentioned_logins_in(text) do |match, login, is_mentioned|
  #     "<a href=...>#{login}</a>"
  #   end
  #
  # text - String text to search.
  #
  # Yields the String match (which includes the non-word character in front
  # of the "@", when there is one), the String login name, and a Boolean
  # that is true when the downcased login is listed in MentionLogins. The
  # block's return value replaces the match in the result.
  #
  # Returns a copy of text with every match replaced by the block's return.
  def self.mentioned_logins_in(text)
    text.gsub(MentionPattern) do |matched|
      username = Regexp.last_match(1)
      yield matched, username, MentionLogins.include?(username.downcase)
    end
  end

  # Pattern used to extract @mentions from text.
  MentionPattern = /
    (?:^|\W)                   # beginning of string or non-word char
    @((?>[a-z0-9][a-z0-9-]*))  # @username
    (?!\/)                     # without a trailing slash
    (?=
      \.+[ \t\W]|              # dots followed by space or non-word character
      \.+$|                    # dots at end of line
      [^0-9a-zA-Z_.]|          # non-word character except dot
      $                        # end of line
    )
  /ix

  # List of username logins that, when mentioned, link to the "more info"
  # page about @mentions instead of a user profile.
  MentionLogins = %w(
    mention
    mentions
    mentioned
    mentioning
  )

  # Don't look for mentions in text nodes that are children of these elements
  IGNORE_PARENTS = %w(pre code a).to_set

  # Walks every text node of the document and swaps in linked markup for
  # the nodes whose content changed after mention replacement.
  #
  # Returns the document.
  def call
    doc.search('text()').each do |text_node|
      original = text_node.to_html
      next unless original.include?('@')
      next if has_ancestor?(text_node, IGNORE_PARENTS)

      linked = mention_link_filter(original, base_url, info_url)
      text_node.replace(linked) unless linked == original
    end
    doc
  end

  # The URL to provide when someone @mentions a "mention" name, such as
  # @mention or @mentioned, that will give them more info on mentions.
  #
  # Returns the context's :info_url, or nil when it is unset.
  def info_url
    context[:info_url] || nil
  end

  # Replace user @mentions in text with links to the mentioned user's
  # profile page.
  #
  # text     - String text to replace @mention usernames in.
  # base_url - The base URL used to construct user profile URLs.
  #            NOTE: this argument is not read by the method body; profile
  #            links are built by #link_to_mentioned_user from the filter's
  #            own base_url.
  # info_url - The "more info" URL used to link to more info on @mentions.
  #            If nil we don't link @mention or @mentioned.
  #
  # Returns a string with @mentions replaced with links. All links have a
  # 'user-mention' class name attached for styling.
  def mention_link_filter(text, base_url='/', info_url=nil)
    self.class.mentioned_logins_in(text) do |match, login, is_mentioned|
      replacement =
        is_mentioned ? link_to_mention_info(login, info_url) : link_to_mentioned_user(login)

      # Only the "@login" part is swapped so the character that preceded
      # the mention in the match is preserved.
      next match unless replacement
      match.sub("@#{login}", replacement)
    end
  end

  # Builds the markup for a reserved mention name.
  #
  # text     - The String login that was mentioned.
  # info_url - The "more info" URL, or nil.
  #
  # Returns the plain "@text" String when info_url is nil, otherwise an
  # anchor pointing at info_url.
  def link_to_mention_info(text, info_url=nil)
    return "@#{text}" if info_url.nil?

    "<a href='#{info_url}' class='user-mention'>@#{text}</a>"
  end

  # Builds the profile link markup for a mentioned user.
  #
  # login - The String login that was mentioned.
  #
  # Returns a String anchor whose href is base_url joined with the login.
  def link_to_mentioned_user(login)
    profile_url = File.join(base_url, login)
    "<a href='#{profile_url}' class='user-mention'>@#{login}</a>"
  end
end
117
+ end
118
+ end
@@ -0,0 +1,22 @@
1
+ require 'rinku'
2
+
3
+ module HTML
4
+ class Pipeline
5
# HTML Filter for auto_linking urls in HTML.
#
# Context options:
#   :autolink - Boolean; pass false to skip autolinking. Any other value,
#               including an absent key, leaves autolinking enabled.
#   :flags    - additional Rinku flags. See https://github.com/vmg/rinku
#
# This filter does not write additional information to the context.
class AutolinkFilter < Filter
  # Hands the markup to Rinku in :urls mode, passing a, script, kbd, pre
  # and code as the tag names Rinku is told to skip.
  #
  # Returns the html unchanged when :autolink is false, otherwise the
  # result of Rinku.auto_link.
  def call
    return html if context[:autolink] == false

    skipped_tags = %w[a script kbd pre code]
    extra_flags  = context[:flags]
    flags        = extra_flags ? (0 | extra_flags) : 0

    Rinku.auto_link(html, :urls, nil, skipped_tags, flags)
  end
end
21
+ end
22
+ end
@@ -0,0 +1,42 @@
1
+ module HTML
2
+ class Pipeline
3
# Public: Runs a String of content through an HTML processing pipeline,
# providing easy access to a generated DocumentFragment.
#
# Nothing is processed until #result, #output or #document is first called;
# each of those values is memoized afterwards.
class BodyContent
  # Public: Initialize a BodyContent.
  #
  # body     - A String body.
  # context  - A Hash of context options for the filters.
  # pipeline - A HTML::Pipeline object with one or more Filters. Any object
  #            responding to #call(body, context) is used the same way.
  def initialize(body, context, pipeline)
    @body = body
    @context = context
    @pipeline = pipeline
  end

  # Public: Gets the memoized result of the body content as it passed through
  # the Pipeline.
  #
  # This explicit method is the only reader for @result; a separate
  # attr_reader would merely be overridden by it.
  #
  # Returns a Hash, or something similar as defined by @pipeline.result_class.
  def result
    @result ||= @pipeline.call(@body, @context)
  end

  # Public: Gets the updated body from the Pipeline result.
  #
  # Returns a String or DocumentFragment.
  def output
    @output ||= result[:output]
  end

  # Public: Parses the output into a DocumentFragment.
  #
  # Returns a DocumentFragment.
  def document
    @document ||= HTML::Pipeline.parse(output)
  end
end
41
+ end
42
+ end
@@ -0,0 +1,70 @@
1
+ require 'openssl'
2
+
3
+ module HTML
4
+ class Pipeline
5
# HTML Filter for replacing http image URLs with camo versions. See:
#
# https://github.com/atmos/camo
#
# All images provided in user content should be run through this
# filter so that http image sources do not cause mixed-content warnings
# in browser clients.
#
# Context options:
#   :asset_proxy (required)            - Base URL for constructed asset proxy URLs.
#   :asset_proxy_secret_key (required) - The shared secret used to encode URLs.
#   :disable_asset_proxy               - When truthy, image sources are left
#                                        exactly as they were.
#
# This filter does not write additional information to the context.
class CamoFilter < Filter
  # Hijacks images in the markup provided, replacing them with URLs that
  # go through the github asset proxy.
  #
  # Images without a src are skipped. An "http://github.com" prefix is
  # upgraded to https; remaining http: sources and img.skitch.com sources
  # are rewritten to asset proxy URLs.
  #
  # Returns the document.
  def call
    doc.search("img").each do |element|
      next if element['src'].nil?
      src = element['src'].strip
      # Dots are escaped so only the literal github.com host is rewritten.
      src = src.sub(%r!^http://github\.com!, 'https://github.com')
      next if context[:disable_asset_proxy]

      if src =~ /^http:/ || src =~ /^https:\/\/img\.skitch\.com\//
        element['src'] = asset_proxy_url(src)
      else
        element['src'] = src
      end
    end
    doc
  end

  # Implementation of validate hook.
  # Errors should raise exceptions or use an existing validator.
  def validate
    needs :asset_proxy, :asset_proxy_secret_key
  end

  # The camouflaged URL for a given image URL.
  #
  # url - The String image URL to proxy.
  #
  # Returns a String of the form "<asset proxy>/<hmac>/<hex encoded url>".
  def asset_proxy_url(url)
    "#{asset_proxy_host}/#{asset_url_hash(url)}/#{hexencode(url)}"
  end

  # Private: calculate the HMAC digest for a image source URL.
  #
  # url - The String image URL to sign.
  #
  # Returns the hex encoded HMAC-SHA1 of url keyed with the shared secret.
  def asset_url_hash(url)
    # OpenSSL::Digest::Digest is a deprecated alias that newer openssl
    # releases no longer provide; OpenSSL::Digest is the supported class.
    digest = OpenSSL::Digest.new('sha1')
    OpenSSL::HMAC.hexdigest(digest, asset_proxy_secret_key, url)
  end

  # Private: the hostname to use for generated asset proxied URLs.
  def asset_proxy_host
    context[:asset_proxy]
  end

  # Private: the shared secret used to sign proxied URLs.
  def asset_proxy_secret_key
    context[:asset_proxy_secret_key]
  end

  # Private: helper to hexencode a string. Each byte ends up encoded into
  # two characters, zero padded value in the range [0-9a-f].
  def hexencode(str)
    str.to_enum(:each_byte).map { |byte| "%02x" % byte }.join
  end
end
69
+ end
70
+ end
@@ -0,0 +1,56 @@
1
+ module HTML
2
+ class Pipeline
3
# HTML Filter that converts email reply text into a String of HTML markup.
# It must be used as the first filter in a pipeline.
#
# Context options:
#   None
#
# This filter does not write any additional information to the context hash.
class EmailReplyFilter < TextFilter
  include EscapeUtils

  EMAIL_HIDDEN_HEADER    = %(<span class="email-hidden-toggle"><a href="#">&hellip;</a></span><div class="email-hidden-reply" style="display:none">).freeze
  EMAIL_QUOTED_HEADER    = %(<div class="email-quoted-reply">).freeze
  EMAIL_SIGNATURE_HEADER = %(<div class="email-signature-reply">).freeze
  EMAIL_FRAGMENT_HEADER  = %(<div class="email-fragment">).freeze
  EMAIL_HEADER_END       = "</div>".freeze

  # Scans an email body to determine which bits are quoted and which should
  # be hidden. EmailReplyParser splits the text into fragments; each one is
  # HTML-escaped, stripped of its leading ">" quote markers and wrapped in a
  # <div> chosen by fragment type (quoted, signature, or plain fragment).
  #
  # The "email-hidden-reply" wrapper is opened in front of the first hidden
  # fragment only and closed once after the last fragment, so it encloses
  # every fragment from the first hidden one onwards.
  #
  # Returns the email comment HTML as a String
  def call
    hidden_open = false

    blocks = EmailReplyParser.read(text.dup).fragments.map do |fragment|
      body = escape_html(fragment.to_s.strip).gsub(/^\s*(>|&gt;)/, '')

      opener =
        if fragment.quoted?
          EMAIL_QUOTED_HEADER
        elsif fragment.signature?
          EMAIL_SIGNATURE_HEADER
        else
          EMAIL_FRAGMENT_HEADER
        end
      wrapped = "#{opener}#{body}#{EMAIL_HEADER_END}"

      if fragment.hidden? && !hidden_open
        hidden_open = true
        wrapped = "#{EMAIL_HIDDEN_HEADER}#{wrapped}"
      end

      wrapped
    end

    blocks << EMAIL_HEADER_END if hidden_open
    blocks.join("\n")
  end
end
55
+ end
56
+ end
@@ -0,0 +1,54 @@
1
+ require 'emoji'
2
+
3
+ module HTML
4
+ class Pipeline
5
# HTML filter that replaces :emoji: with images. Text nodes nested inside
# <pre> and <code> elements are left untouched.
#
# Context:
#   :asset_root (required) - base url to link to emoji sprite
class EmojiFilter < Filter
  # Build a regexp that matches all valid :emoji: names.
  EmojiPattern = /:(#{Emoji.names.map { |name| Regexp.escape(name) }.join('|')}):/

  # Walks every text node of the document and swaps in image markup for
  # the nodes whose content changed after emoji replacement.
  #
  # Returns the document.
  def call
    doc.search('text()').each do |text_node|
      original = text_node.to_html
      next unless original.include?(':')
      next if has_ancestor?(text_node, %w(pre code))

      with_images = emoji_image_filter(original)
      text_node.replace(with_images) unless with_images == original
    end
    doc
  end

  # Implementation of validate hook.
  # Errors should raise exceptions or use an existing validator.
  def validate
    needs :asset_root
  end

  # Replace :emoji: with corresponding images.
  #
  # text - String text to replace :emoji: in.
  #
  # Returns a String with :emoji: replaced with images. Each image points at
  # "<asset_root>/emoji/<name>.png".
  def emoji_image_filter(text)
    return text unless text.include?(':')

    text.gsub(EmojiPattern) do
      name = Regexp.last_match(1)
      image_src = File.join(asset_root, "emoji", "#{name}.png")
      "<img class='emoji' title=':#{name}:' alt=':#{name}:' src='#{image_src}' height='20' width='20' align='absmiddle' />"
    end
  end

  # The base url to link emoji sprites
  #
  # The presence of the option is checked by #validate, not here.
  #
  # Returns the context's asset_root.
  def asset_root
    context[:asset_root]
  end
end
53
+ end
54
+ end
@@ -0,0 +1,178 @@
1
+ module HTML
2
+ class Pipeline
3
# Base class for user content HTML filters. Each filter takes an
# HTML string or Nokogiri::HTML::DocumentFragment, performs
# modifications and/or writes information to the result hash. Filters must
# return a DocumentFragment (typically the same instance provided to the call
# method) or a String with HTML markup.
#
# Example filter that replaces all images with trollface:
#
#   class FuuuFilter < HTML::Pipeline::Filter
#     def call
#       doc.search('img').each do |img|
#         img['src'] = "http://paradoxdgn.com/junk/avatars/trollface.jpg"
#       end
#       doc
#     end
#   end
#
# The context Hash passes options to filters and should not be changed in
# place. A Result Hash allows filters to make extracted information
# available to the caller and is mutable.
#
# Common context options:
#   :base_url   - The site's base URL
#   :repository - A Repository providing context for the HTML being processed
#
# Each filter may define additional options and output values. See the class
# docs for more info.
class Filter
  # Raised by #html when the filter was given neither markup nor a document.
  class InvalidDocumentException < StandardError; end

  # Public: Initialize a filter and run its #validate hook.
  #
  # doc     - A String of HTML markup or a DocumentFragment.
  # context - A Hash of context options (default: empty Hash).
  # result  - A Hash that collects filter output (default: empty Hash).
  def initialize(doc, context = nil, result = nil)
    if doc.kind_of?(String)
      @html = doc.to_str
      @doc = nil
    else
      @doc = doc
      @html = nil
    end
    @context = context || {}
    @result = result || {}
    validate
  end

  # Public: Returns a simple Hash used to pass extra information into filters
  # and also to allow filters to make extracted information available to the
  # caller.
  attr_reader :context

  # Public: Returns a Hash used to allow filters to pass back information
  # to callers of the various Pipelines. This can be used for
  # #mentioned_users, for example.
  attr_reader :result

  # The Nokogiri::HTML::DocumentFragment to be manipulated. If the filter was
  # provided a String, parse into a DocumentFragment the first time this
  # method is called.
  def doc
    @doc ||= parse_html(html)
  end

  # The String representation of the document. If a DocumentFragment was
  # provided to the Filter, it is serialized into a String when this method is
  # called.
  #
  # Raises InvalidDocumentException when the filter holds neither a String
  # nor a document.
  def html
    raise InvalidDocumentException if @html.nil? && @doc.nil?
    @html || doc.to_html
  end

  # The main filter entry point. The doc attribute is guaranteed to be a
  # Nokogiri::HTML::DocumentFragment when invoked. Subclasses should modify
  # this document in place or extract information and add it to the context
  # hash.
  #
  # Raises NotImplementedError unless overridden.
  def call
    raise NotImplementedError
  end

  # Make sure the context has everything we need. Noop: Subclasses can override.
  def validate
  end

  # The Repository object provided in the context hash, or nil when no
  # :repository was specified.
  #
  # It's assumed that the repository context has already been checked
  # for permissions
  def repository
    context[:repository]
  end

  # The User object provided in the context hash, or nil when no user
  # was specified
  def current_user
    context[:current_user]
  end

  # Return whether the filter can access a given repo while
  # applying a filter
  #
  # A repo can only be accessed if its pullable by the user who
  # submitted the content of this filter, or if it's the same as
  # the repository context in which the filter runs
  def can_access_repo?(repo)
    return false if repo.nil?
    return true if repo == repository
    repo.pullable_by?(current_user)
  end

  # The site's base URL provided in the context hash, or '/' when no
  # base URL was specified.
  def base_url
    context[:base_url] || '/'
  end

  # Ensure the passed argument is a DocumentFragment. When a string is
  # provided, it is parsed and returned; otherwise, the DocumentFragment is
  # returned unmodified.
  def parse_html(html)
    HTML::Pipeline.parse(html)
  end

  # Helper method for filter subclasses used to determine if any of a node's
  # ancestors have one of the tag names specified.
  #
  # node - The Node object to check.
  # tags - An array of tag name strings to check. These should be downcase.
  #
  # Returns true when the node has a matching ancestor, false otherwise.
  def has_ancestor?(node, tags)
    # Walk up the parent chain until a match is found or the chain ends.
    while (node = node.parent)
      return true if tags.include?(node.name.downcase)
    end
    false
  end

  # Perform a filter on doc with the given context.
  #
  # Returns a HTML::Pipeline::DocumentFragment or a String containing HTML
  # markup.
  def self.call(doc, context = nil, result = nil)
    new(doc, context, result).call
  end

  # Like call but guarantees that a DocumentFragment is returned, even when
  # the last filter returns a String.
  def self.to_document(input, context = nil)
    html = call(input, context)
    HTML::Pipeline.parse(html)
  end

  # Like call but guarantees that a string of HTML markup is returned.
  def self.to_html(input, context = nil)
    output = call(input, context)
    if output.respond_to?(:to_html)
      output.to_html
    else
      output.to_s
    end
  end

  # Validator for required context. Checks that every key passed in is
  # present in the context hash.
  #
  # keys - One or more context keys the filter requires.
  #
  # Raises ArgumentError naming the filter class and listing all the
  # missing keys when any are absent.
  def needs(*keys)
    missing = keys.reject { |key| context.include? key }

    if missing.any?
      raise ArgumentError,
        "Missing context keys for #{self.class.name}: #{missing.map(&:inspect).join ', '}"
    end
  end
end
177
+ end
178
+ end