geothird-html-pipeline 0.0.12

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/.travis.yml +13 -0
  4. data/CHANGELOG.md +43 -0
  5. data/Gemfile +9 -0
  6. data/LICENSE +22 -0
  7. data/README.md +274 -0
  8. data/Rakefile +11 -0
  9. data/bin/html-pipeline +80 -0
  10. data/geothird-html-pipeline.gemspec +27 -0
  11. data/lib/html/pipeline.rb +198 -0
  12. data/lib/html/pipeline/@mention_filter.rb +121 -0
  13. data/lib/html/pipeline/absolute_source_filter.rb +48 -0
  14. data/lib/html/pipeline/autolink_filter.rb +22 -0
  15. data/lib/html/pipeline/body_content.rb +42 -0
  16. data/lib/html/pipeline/camo_filter.rb +70 -0
  17. data/lib/html/pipeline/email_reply_filter.rb +56 -0
  18. data/lib/html/pipeline/emoji_filter.rb +54 -0
  19. data/lib/html/pipeline/filter.rb +178 -0
  20. data/lib/html/pipeline/https_filter.rb +13 -0
  21. data/lib/html/pipeline/image_max_width_filter.rb +37 -0
  22. data/lib/html/pipeline/markdown_filter.rb +29 -0
  23. data/lib/html/pipeline/plain_text_input_filter.rb +11 -0
  24. data/lib/html/pipeline/sanitization_filter.rb +105 -0
  25. data/lib/html/pipeline/syntax_highlight_filter.rb +33 -0
  26. data/lib/html/pipeline/text_filter.rb +14 -0
  27. data/lib/html/pipeline/textile_filter.rb +21 -0
  28. data/lib/html/pipeline/toc_filter.rb +28 -0
  29. data/lib/html/pipeline/version.rb +5 -0
  30. data/test/helpers/mocked_instrumentation_service.rb +17 -0
  31. data/test/html/pipeline/absolute_source_filter_test.rb +56 -0
  32. data/test/html/pipeline/autolink_filter_test.rb +22 -0
  33. data/test/html/pipeline/camo_filter_test.rb +47 -0
  34. data/test/html/pipeline/emoji_filter_test.rb +18 -0
  35. data/test/html/pipeline/image_max_width_filter_test.rb +50 -0
  36. data/test/html/pipeline/markdown_filter_test.rb +101 -0
  37. data/test/html/pipeline/mention_filter_test.rb +156 -0
  38. data/test/html/pipeline/plain_text_input_filter_test.rb +22 -0
  39. data/test/html/pipeline/sanitization_filter_test.rb +47 -0
  40. data/test/html/pipeline/toc_filter_test.rb +47 -0
  41. data/test/html/pipeline_test.rb +74 -0
  42. data/test/test_helper.rb +38 -0
  43. metadata +213 -0
@@ -0,0 +1,27 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path("../lib/html/pipeline/version", __FILE__)
3
+
4
# Gem specification: package metadata, the files to ship, and the runtime and
# development dependencies of geothird-html-pipeline.
Gem::Specification.new do |spec|
  spec.name        = "geothird-html-pipeline"
  spec.version     = HTML::Pipeline::VERSION
  spec.license     = "MIT"
  spec.authors     = ["Ryan Tomayko", "Jerry Cheung"]
  spec.email       = ["ryan@github.com", "jerry@github.com"]
  spec.description = %q{GitHub HTML processing filters and utilities}
  spec.summary     = %q{Helpers for processing content through a chain of filters}
  spec.homepage    = "https://github.com/jch/html-pipeline"

  # Package every file tracked by git; entries whose path starts with "test"
  # are additionally registered as test files.
  spec.files         = `git ls-files`.split($/)
  spec.test_files    = spec.files.grep(%r{^test})
  spec.require_paths = ["lib"]

  # Runtime dependencies, declared in the same order as before.
  [
    ["gemoji",          "~> 1.0"],
    ["nokogiri",        "~> 1.4"],
    ["github-markdown", "~> 0.5"],
    ["sanitize",        "~> 2.0"],
    ["rinku",           "~> 1.7"],
    ["escape_utils",    "~> 0.3"],
    ["activesupport",   ">= 2"]
  ].each do |gem_name, requirement|
    spec.add_dependency(gem_name, requirement)
  end

  # Development-only dependency.
  spec.add_development_dependency("geothird-linguist", "~> 2.6.8")
end
@@ -0,0 +1,198 @@
1
+ require "nokogiri"
2
+ require "active_support/xml_mini/nokogiri" # convert Documents to hashes
3
+ require "escape_utils"
4
+
5
+ module HTML
6
+ # GitHub HTML processing filters and utilities. This module includes a small
7
+ # framework for defining DOM based content filters and applying them to user
8
+ # provided content.
9
+ #
10
+ # See HTML::Pipeline::Filter for information on building filters.
11
+ #
12
+ # Construct a Pipeline for running multiple HTML filters. A pipeline is created once
13
+ # with one to many filters, and it then can be `call`ed many times over the course
14
+ # of its lifetime with input.
15
+ #
16
+ # filters - Array of Filter objects. Each must respond to call(doc,
17
+ # context) and return the modified DocumentFragment or a
18
+ # String containing HTML markup. Filters are performed in the
19
+ # order provided.
20
+ # default_context - The default context hash. Values specified here will be merged
21
+ # into values from each individual pipeline run. Can NOT be
22
+ # nil. Default: empty Hash.
23
+ # result_class - The default Class of the result object for individual
24
+ # calls. Default: Hash. Protip: Pass in a Struct to get
25
+ # some semblance of type safety.
26
+ class Pipeline
27
+ autoload :VERSION, 'html/pipeline/version'
28
+ autoload :Filter, 'html/pipeline/filter'
29
+ autoload :AbsoluteSourceFilter, 'html/pipeline/absolute_source_filter'
30
+ autoload :BodyContent, 'html/pipeline/body_content'
31
+ autoload :AutolinkFilter, 'html/pipeline/autolink_filter'
32
+ autoload :CamoFilter, 'html/pipeline/camo_filter'
33
+ autoload :EmailReplyFilter, 'html/pipeline/email_reply_filter'
34
+ autoload :EmojiFilter, 'html/pipeline/emoji_filter'
35
+ autoload :HttpsFilter, 'html/pipeline/https_filter'
36
+ autoload :ImageMaxWidthFilter, 'html/pipeline/image_max_width_filter'
37
+ autoload :MarkdownFilter, 'html/pipeline/markdown_filter'
38
+ autoload :MentionFilter, 'html/pipeline/@mention_filter'
39
+ autoload :PlainTextInputFilter, 'html/pipeline/plain_text_input_filter'
40
+ autoload :SanitizationFilter, 'html/pipeline/sanitization_filter'
41
+ autoload :SyntaxHighlightFilter, 'html/pipeline/syntax_highlight_filter'
42
+ autoload :TextileFilter, 'html/pipeline/textile_filter'
43
+ autoload :TableOfContentsFilter, 'html/pipeline/toc_filter'
44
+ autoload :TextFilter, 'html/pipeline/text_filter'
45
+
46
+ # Our DOM implementation.
47
+ DocumentFragment = Nokogiri::HTML::DocumentFragment
48
+
49
# Coerce the input into a DocumentFragment.
#
# document_or_html - A String of HTML, an already parsed document object, or
#                    nil/false (treated as the empty String).
#
# Returns DocumentFragment.parse(string) for String input; any other object
# is handed back untouched.
def self.parse(document_or_html)
  source = document_or_html || ''
  return source unless source.is_a?(String)

  DocumentFragment.parse(source)
end
59
+
60
+ # Public: Returns an Array of Filter objects for this Pipeline.
61
+ attr_reader :filters
62
+
63
+ # Public: Instrumentation service for the pipeline.
64
+ # Set an ActiveSupport::Notifications compatible object to enable.
65
+ attr_accessor :instrumentation_service
66
+
67
+ # Public: String name for this Pipeline. Defaults to Class name.
68
+ attr_writer :instrumentation_name
69
# Reader for the instrumentation name.
#
# Returns the explicitly assigned name when one is set, otherwise the name of
# the receiver's class.
def instrumentation_name
  return @instrumentation_name if @instrumentation_name

  self.class.name
end
72
+
73
+ class << self
74
+ # Public: Default instrumentation service for new pipeline objects.
75
+ attr_accessor :default_instrumentation_service
76
+ end
77
+
78
# Build a pipeline.
#
# filters         - Array (possibly nested) of filters; it is flattened and
#                   the flattened copy is frozen.
# default_context - Hash of default context values. It is frozen in place.
#                   Must not be nil.
# result_class    - Class instantiated for per-call results. Hash is used
#                   when nil/false is given.
#
# Raises ArgumentError when default_context is nil.
def initialize(filters, default_context = {}, result_class = nil)
  if default_context.nil?
    raise ArgumentError, "default_context cannot be nil"
  end

  @filters         = filters.flatten.freeze
  @default_context = default_context.freeze
  @result_class    = result_class ? result_class : Hash
  # New pipelines start out with the class-wide default service.
  @instrumentation_service = self.class.default_instrumentation_service
end
85
+
86
# Run every filter, in order, over the given input.
#
# html    - The input handed to the first filter (a String of HTML or a
#           DocumentFragment).
# context - Hash of per-call context values. It is merged over the default
#           context and the merged Hash is frozen before filters see it, so
#           filters cannot modify it in place.
# result  - Optional result object that filters may write to. When omitted a
#           new instance of the configured result class is created.
#
# Each filter receives the previous filter's return value. The whole run is
# wrapped in a "call_pipeline.html_pipeline" instrumentation event.
#
# Returns the result object; its :output entry holds the value returned by
# the last filter (or the untouched input when there are no filters).
def call(html, context = {}, result = nil)
  merged_context = @default_context.merge(context).freeze
  result ||= @result_class.new

  filter_names = @filters.map { |filter| filter.name }
  payload = default_payload(:filters => filter_names,
                            :context => merged_context,
                            :result  => result)

  instrument("call_pipeline.html_pipeline", payload) do
    current = html
    @filters.each do |filter|
      current = perform_filter(filter, current, merged_context, result)
    end
    result[:output] = current
  end

  result
end
112
+
113
# Internal: Run a single filter inside a "call_filter.html_pipeline"
# instrumentation event.
#
# filter  - The filter to invoke; must respond to name and
#           call(doc, context, result).
# doc     - The input for this filter.
# context - The context Hash for the current run.
# result  - The result object for the current run.
#
# Returns whatever the filter returns.
def perform_filter(filter, doc, context, result)
  filter_payload = default_payload(:filter  => filter.name,
                                   :context => context,
                                   :result  => result)

  instrument("call_filter.html_pipeline", filter_payload) do
    filter.call(doc, context, result)
  end
end
125
+
126
# Same as call, but hands back the parsed output instead of the result
# object. The pipeline output may be a DocumentFragment or a String; it is
# passed through HTML::Pipeline.parse so callers always get what parse
# returns.
#
# input   - Input for the pipeline.
# context - Hash of per-call context values.
# result  - Optional result object forwarded to call.
#
# Returns HTML::Pipeline.parse applied to the :output of the run.
def to_document(input, context = {}, result = nil)
  filtered = call(input, context, result)
  HTML::Pipeline.parse(filtered[:output])
end
133
+
134
# Like call but guarantee the value returned is a string of HTML markup.
#
# input   - Input for the pipeline.
# context - Hash of per-call context values.
# result  - Optional result object. It is forwarded to call unchanged, so
#           anything filters record on it is visible to the caller afterwards.
#
# Returns output.to_html when the pipeline output responds to to_html,
# otherwise output.to_s.
def to_html(input, context = {}, result = nil)
  # Forward the caller's result object as-is. The previous code passed
  # `result = nil` here, which discarded it before the pipeline ran.
  result = call(input, context, result)
  output = result[:output]

  if output.respond_to?(:to_html)
    output.to_html
  else
    output.to_s
  end
end
144
+
145
# Public: Configure instrumentation for this pipeline.
#
# name    - Instrumentation name to assign. Passing nil clears any custom
#           name.
# service - Instrumentation service to use. When nil/false, the class-level
#           default service is used instead.
#
# Callers should not rely on the return value (it is the value of the final
# assignment, i.e. the chosen service).
def setup_instrumentation(name = nil, service = nil)
  self.instrumentation_name = name

  chosen_service = service || self.class.default_instrumentation_service
  self.instrumentation_service = chosen_service
end
153
+
154
# Internal: Run the block, instrumented when an instrumentation service is
# set and plainly otherwise.
#
# event   - String event name passed to the service.
# payload - Hash payload for the event. Falls back to default_payload when
#           nil/false.
#
# Yields the payload (the one supplied by the service when instrumented).
#
# Returns the result of the provided block when no service is set; otherwise
# whatever the service's instrument method returns.
def instrument(event, payload = nil)
  payload ||= default_payload

  if instrumentation_service
    instrumentation_service.instrument(event, payload) do |event_payload|
      yield event_payload
    end
  else
    yield payload
  end
end
165
+
166
# Internal: Build an instrumentation payload.
#
# payload - Hash of extra entries. They are merged over the base entry, so a
#           :pipeline key supplied here replaces the default one. The
#           argument itself is not modified.
#
# Returns a new Hash containing :pipeline => instrumentation_name plus the
# extra entries.
def default_payload(payload = {})
  base = { :pipeline => instrumentation_name }
  base.update(payload)
end
174
+ end
175
+ end
176
+
177
# XXX Nokogiri monkey patches for Ruby 1.8. They are only installed when
# String does not respond to force_encoding.
unless ''.respond_to?(:force_encoding)
  class Nokogiri::XML::Node
    # Work around an issue with utf-8 encoded data being erroneously
    # converted to a different encoding when replacing text nodes. See
    # 'utf-8 output 2' in user_content_test.rb for details.
    #
    # String-like replacements are first parsed inside a wrapping <div> of
    # this node's document; the children of that <div> are then used as the
    # actual replacement.
    def replace_with_encoding_fix(replacement)
      if replacement.respond_to?(:to_str)
        wrapper = document.fragment("<div>#{replacement}</div>")
        replacement = wrapper.children.first.children
      end
      replace_without_encoding_fix(replacement)
    end

    # Keep the stock implementation reachable, then route replace through
    # the fixed version.
    alias_method :replace_without_encoding_fix, :replace
    alias_method :replace, :replace_with_encoding_fix

    # Replace this node and return self.
    def swap(replacement)
      replace(replacement)
      self
    end
  end
end
@@ -0,0 +1,121 @@
1
+ require 'set'
2
+
3
+ module HTML
4
+ class Pipeline
5
+ # HTML filter that replaces @user mentions with links. Mentions within <pre>,
6
+ # <code>, and <a> elements are ignored. Mentions that reference users that do
7
+ # not exist are ignored.
8
+ #
9
+ # Context options:
10
+ # :base_url - Used to construct links to user profile pages for each
11
+ # mention.
12
+ # :info_url - Used to link to "more info" when someone mentions @mention
13
+ # or @mentioned.
14
+ #
15
+ class MentionFilter < Filter
16
# Public: Find user @mentions in text. See
# MentionFilter#mention_link_filter.
#
#   MentionFilter.mentioned_logins_in(text) do |match, login, is_mentioned|
#     "<a href=...>#{login}</a>"
#   end
#
# text - String text to search.
#
# Yields the full String match, the captured String login name, and a Boolean
# that is true when the downcased login is listed in MentionLogins. The
# block's return value replaces the match in the text.
#
# Returns a new String with every match replaced by the block's return value.
def self.mentioned_logins_in(text)
  text.gsub(MentionPattern) do |match|
    login = Regexp.last_match(1)
    is_mentioned = MentionLogins.include?(login.downcase)
    yield match, login, is_mentioned
  end
end
36
+
37
# Pattern used to extract @mentions from text. The login is capture group 1.
MentionPattern = /
  (?:^|\W)                   # beginning of string or non-word char
  @((?>[a-z0-9][a-z0-9-]*))  # @username
  (?!\/)                     # without a trailing slash
  (?=
    \.+[ \t\W]|              # dots followed by space or non-word character
    \.+$|                    # dots at end of line
    [^0-9a-zA-Z_.]|          # non-word character except dot
    $                        # end of line
  )
/ix

# List of username logins that, when mentioned, link to the blog post
# about @mentions instead of triggering a real mention. Frozen so the shared
# constant cannot be modified at runtime.
MentionLogins = %w(
  mention
  mentions
  mentioned
  mentioning
).freeze

# Don't look for mentions in text nodes that are children of these elements.
# Frozen for the same reason as MentionLogins.
IGNORE_PARENTS = %w(pre code a).to_set.freeze
61
+
62
# Replace @mentions in every eligible text node of the document.
#
# Ensures result[:mentioned_usernames] exists (an empty Array by default).
# Text nodes without an '@', and text nodes that has_ancestor? reports as
# sitting under one of IGNORE_PARENTS, are skipped. A node is only replaced
# when mention_link_filter actually changed its markup.
#
# Returns the document.
def call
  result[:mentioned_usernames] ||= []

  doc.search('text()').each do |node|
    content = node.to_html
    next unless content.include?('@')
    next if has_ancestor?(node, IGNORE_PARENTS)

    linked = mention_link_filter(content, base_url, info_url)
    node.replace(linked) unless linked == content
  end

  doc
end
75
+
76
# The "more info" URL used when someone @mentions one of the mention
# keywords (such as @mention or @mentioned).
#
# Returns context[:info_url], or nil when that entry is missing or false.
def info_url
  configured = context[:info_url]
  configured ? configured : nil
end
81
+
82
# Replace user @mentions in text with links.
#
# text     - String text to replace @mention usernames in.
# base_url - The base URL used to construct user profile URLs.
#            NOTE(review): this parameter is not read in this method body;
#            link_to_mentioned_user is called with the login only. Confirm
#            whether that is intended.
# info_url - The "more info" URL used for mention keywords. Passed on to
#            link_to_mention_info.
#
# Mention keywords (is_mentioned) are linked via link_to_mention_info; every
# other login is linked via link_to_mentioned_user. When the helper returns
# nil/false the match is left as it was.
#
# Returns a String with the @mentions replaced by the generated links.
def mention_link_filter(text, base_url='/', info_url=nil)
  self.class.mentioned_logins_in(text) do |match, login, is_mentioned|
    link = is_mentioned ? link_to_mention_info(login, info_url) : link_to_mentioned_user(login)
    next match unless link

    match.sub("@#{login}", link)
  end
end
104
+
105
# Build the markup for a mention keyword such as @mention.
#
# text     - The login text without the leading "@".
# info_url - The "more info" URL to link to.
#
# Returns plain "@text" when info_url is nil, otherwise an anchor with the
# 'user-mention' class pointing at info_url.
def link_to_mention_info(text, info_url=nil)
  mention = "@#{text}"
  return mention if info_url.nil?

  "<a href='#{info_url}' class='user-mention'>#{mention}</a>"
end
111
+
112
# Build the profile link for a mentioned user and record the mention.
#
# login - The String login that was mentioned.
#
# Adds login to result[:mentioned_usernames] (set union, so a login is only
# listed once). The profile URL is base_url joined with the login.
#
# Returns an anchor String with the 'user-mention' class.
def link_to_mentioned_user(login)
  result[:mentioned_usernames] |= [login]

  profile_url = File.join(base_url, login)
  "<a href='#{profile_url}' class='user-mention'>@#{login}</a>"
end
119
+ end
120
+ end
121
+ end
@@ -0,0 +1,48 @@
1
+ require 'uri'
2
+
3
+ module HTML
4
+ class Pipeline
5
+
6
+ class AbsoluteSourceFilter < Filter
7
# HTML Filter for replacing relative and root relative image URLs with
# fully qualified URLs
#
# This is useful if an image is root relative but should really be going
# through a cdn, or if the content for the page assumes the host is known
# i.e. scraped webpages and some RSS feeds.
#
# Context options:
#   :image_base_url    - Base URL for image host for root relative src.
#   :image_subpage_url - For relative src.
#
# This filter does not write additional information to the context.
# This filter would need to be run before CamoFilter.
#
# Images whose src is missing, empty or whitespace-only are left untouched,
# as are sources that already start with an http:// or https:// scheme.
#
# Returns the document.
def call
  doc.search("img").each do |element|
    # Strip before the emptiness check so a whitespace-only src is skipped
    # instead of being rewritten to the bare base URL.
    src = element['src'].to_s.strip
    next if src.empty?

    # Only a real http(s) scheme marks the source as absolute; a relative
    # path that merely begins with "http" (e.g. "httpd/logo.png") must still
    # be resolved against the page URL.
    next if src =~ %r{\Ahttps?://}i

    # The base is looked up lazily so a missing context entry only raises
    # when it is actually needed.
    base = src.start_with?('/') ? image_base_url : image_subpage_url
    element["src"] = URI.join(base, src).to_s
  end
  doc
end
35
+
36
# Private: The base URL used for root relative image sources.
#
# Returns context[:image_base_url].
# Raises RuntimeError when the context entry is missing (nil or false).
def image_base_url
  base = context[:image_base_url]
  raise "Missing context :image_base_url for #{self.class.name}" unless base

  base
end
40
+
41
# Private: The page URL that relative image sources are resolved against.
#
# Returns context[:image_subpage_url].
# Raises RuntimeError when the context entry is missing (nil or false).
def image_subpage_url
  subpage = context[:image_subpage_url]
  raise "Missing context :image_subpage_url for #{self.class.name}" unless subpage

  subpage
end
45
+
46
+ end
47
+ end
48
+ end
@@ -0,0 +1,22 @@
1
+ require 'rinku'
2
+
3
+ module HTML
4
+ class Pipeline
5
+ # HTML Filter for auto_linking urls in HTML.
6
+ #
7
+ # Context options:
8
+ # :autolink - boolean whether to autolink urls
9
+ # :flags - additional Rinku flags. See https://github.com/vmg/rinku
10
+ #
11
+ # This filter does not write additional information to the context.
12
+ class AutolinkFilter < Filter
13
# Auto-link URLs in the HTML via Rinku.
#
# Returns the html untouched when context[:autolink] is exactly false.
# Otherwise returns the result of Rinku.auto_link in :urls mode, skipping the
# a, script, kbd, pre and code tags, with context[:flags] (when present)
# OR-ed into the flag bits.
def call
  return html if context[:autolink] == false

  extra_flags = context[:flags]
  flags = extra_flags ? (0 | extra_flags) : 0

  Rinku.auto_link(html, :urls, nil, %w[a script kbd pre code], flags)
end
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,42 @@
1
+ module HTML
2
+ class Pipeline
3
# Public: Runs a String of content through an HTML processing pipeline,
# providing easy access to a generated DocumentFragment.
#
# The pipeline is not run at construction time; it runs the first time
# result (or anything built on it) is requested, and the values are memoized.
class BodyContent
  # Public: Initialize a BodyContent.
  #
  # body     - A String body.
  # context  - A Hash of context options for the filters.
  # pipeline - A HTML::Pipeline object with one or more Filters.
  def initialize(body, context, pipeline)
    @body = body
    @context = context
    @pipeline = pipeline
  end

  # Public: Gets the memoized result of the body content as it passed through
  # the Pipeline.
  #
  # This is the only definition of the reader: a separate
  # `attr_reader :result` would be overridden by this method anyway and only
  # triggers a "method redefined" warning.
  #
  # Returns whatever @pipeline.call returns (a Hash or similar).
  def result
    @result ||= @pipeline.call @body, @context
  end

  # Public: Gets the updated body from the Pipeline result.
  #
  # Returns the :output entry of the result (a String or DocumentFragment).
  def output
    @output ||= result[:output]
  end

  # Public: Parses the output into a DocumentFragment.
  #
  # Returns the memoized value of HTML::Pipeline.parse for the output.
  def document
    @document ||= HTML::Pipeline.parse output
  end
end
41
+ end
42
+ end