geothird-html-pipeline 0.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/.travis.yml +13 -0
  4. data/CHANGELOG.md +43 -0
  5. data/Gemfile +9 -0
  6. data/LICENSE +22 -0
  7. data/README.md +274 -0
  8. data/Rakefile +11 -0
  9. data/bin/html-pipeline +80 -0
  10. data/geothird-html-pipeline.gemspec +27 -0
  11. data/lib/html/pipeline.rb +198 -0
  12. data/lib/html/pipeline/@mention_filter.rb +121 -0
  13. data/lib/html/pipeline/absolute_source_filter.rb +48 -0
  14. data/lib/html/pipeline/autolink_filter.rb +22 -0
  15. data/lib/html/pipeline/body_content.rb +42 -0
  16. data/lib/html/pipeline/camo_filter.rb +70 -0
  17. data/lib/html/pipeline/email_reply_filter.rb +56 -0
  18. data/lib/html/pipeline/emoji_filter.rb +54 -0
  19. data/lib/html/pipeline/filter.rb +178 -0
  20. data/lib/html/pipeline/https_filter.rb +13 -0
  21. data/lib/html/pipeline/image_max_width_filter.rb +37 -0
  22. data/lib/html/pipeline/markdown_filter.rb +29 -0
  23. data/lib/html/pipeline/plain_text_input_filter.rb +11 -0
  24. data/lib/html/pipeline/sanitization_filter.rb +105 -0
  25. data/lib/html/pipeline/syntax_highlight_filter.rb +33 -0
  26. data/lib/html/pipeline/text_filter.rb +14 -0
  27. data/lib/html/pipeline/textile_filter.rb +21 -0
  28. data/lib/html/pipeline/toc_filter.rb +28 -0
  29. data/lib/html/pipeline/version.rb +5 -0
  30. data/test/helpers/mocked_instrumentation_service.rb +17 -0
  31. data/test/html/pipeline/absolute_source_filter_test.rb +56 -0
  32. data/test/html/pipeline/autolink_filter_test.rb +22 -0
  33. data/test/html/pipeline/camo_filter_test.rb +47 -0
  34. data/test/html/pipeline/emoji_filter_test.rb +18 -0
  35. data/test/html/pipeline/image_max_width_filter_test.rb +50 -0
  36. data/test/html/pipeline/markdown_filter_test.rb +101 -0
  37. data/test/html/pipeline/mention_filter_test.rb +156 -0
  38. data/test/html/pipeline/plain_text_input_filter_test.rb +22 -0
  39. data/test/html/pipeline/sanitization_filter_test.rb +47 -0
  40. data/test/html/pipeline/toc_filter_test.rb +47 -0
  41. data/test/html/pipeline_test.rb +74 -0
  42. data/test/test_helper.rb +38 -0
  43. metadata +213 -0
@@ -0,0 +1,27 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path("../lib/html/pipeline/version", __FILE__)
3
+
4
# Gem specification for geothird-html-pipeline: package metadata, the file
# list (taken from git), runtime dependencies and development dependencies.
Gem::Specification.new do |spec|
  spec.name        = "geothird-html-pipeline"
  spec.version     = HTML::Pipeline::VERSION
  spec.license     = "MIT"
  spec.authors     = ["Ryan Tomayko", "Jerry Cheung"]
  spec.email       = ["ryan@github.com", "jerry@github.com"]
  spec.description = "GitHub HTML processing filters and utilities"
  spec.summary     = "Helpers for processing content through a chain of filters"
  spec.homepage    = "https://github.com/jch/html-pipeline"

  # Every file tracked by git ships with the gem; anything under test/ is
  # additionally registered as a test file.
  tracked_files      = `git ls-files`.split($/)
  spec.files         = tracked_files
  spec.test_files    = spec.files.grep(%r{^test})
  spec.require_paths = ["lib"]

  # Runtime dependencies, declared in the same order as before.
  [
    ["gemoji",          "~> 1.0"],
    ["nokogiri",        "~> 1.4"],
    ["github-markdown", "~> 0.5"],
    ["sanitize",        "~> 2.0"],
    ["rinku",           "~> 1.7"],
    ["escape_utils",    "~> 0.3"],
    ["activesupport",   ">= 2"]
  ].each do |name, requirement|
    spec.add_dependency name, requirement
  end

  spec.add_development_dependency "geothird-linguist", "~> 2.6.8"
end
@@ -0,0 +1,198 @@
1
+ require "nokogiri"
2
+ require "active_support/xml_mini/nokogiri" # convert Documents to hashes
3
+ require "escape_utils"
4
+
5
+ module HTML
6
+ # GitHub HTML processing filters and utilities. This module includes a small
7
+ # framework for defining DOM based content filters and applying them to user
8
+ # provided content.
9
+ #
10
+ # See HTML::Pipeline::Filter for information on building filters.
11
+ #
12
+ # Construct a Pipeline for running multiple HTML filters. A pipeline is created once
13
+ # with one to many filters, and it then can be `call`ed many times over the course
14
+ # of its lifetime with input.
15
+ #
16
+ # filters - Array of Filter objects. Each must respond to call(doc,
17
+ # context) and return the modified DocumentFragment or a
18
+ # String containing HTML markup. Filters are performed in the
19
+ # order provided.
20
+ # default_context - The default context hash. Values specified here will be merged
21
+ # into values from each individual pipeline run. Can NOT be
22
+ # nil. Default: empty Hash.
23
+ # result_class - The default Class of the result object for individual
24
+ # calls. Default: Hash. Protip: Pass in a Struct to get
25
+ # some semblance of type safety.
26
+ class Pipeline
27
+ autoload :VERSION, 'html/pipeline/version'
28
+ autoload :Filter, 'html/pipeline/filter'
29
+ autoload :AbsoluteSourceFilter, 'html/pipeline/absolute_source_filter'
30
+ autoload :BodyContent, 'html/pipeline/body_content'
31
+ autoload :AutolinkFilter, 'html/pipeline/autolink_filter'
32
+ autoload :CamoFilter, 'html/pipeline/camo_filter'
33
+ autoload :EmailReplyFilter, 'html/pipeline/email_reply_filter'
34
+ autoload :EmojiFilter, 'html/pipeline/emoji_filter'
35
+ autoload :HttpsFilter, 'html/pipeline/https_filter'
36
+ autoload :ImageMaxWidthFilter, 'html/pipeline/image_max_width_filter'
37
+ autoload :MarkdownFilter, 'html/pipeline/markdown_filter'
38
+ autoload :MentionFilter, 'html/pipeline/@mention_filter'
39
+ autoload :PlainTextInputFilter, 'html/pipeline/plain_text_input_filter'
40
+ autoload :SanitizationFilter, 'html/pipeline/sanitization_filter'
41
+ autoload :SyntaxHighlightFilter, 'html/pipeline/syntax_highlight_filter'
42
+ autoload :TextileFilter, 'html/pipeline/textile_filter'
43
+ autoload :TableOfContentsFilter, 'html/pipeline/toc_filter'
44
+ autoload :TextFilter, 'html/pipeline/text_filter'
45
+
46
+ # Our DOM implementation.
47
+ DocumentFragment = Nokogiri::HTML::DocumentFragment
48
+
49
# Coerce the argument into a DocumentFragment.
#
# document_or_html - A String of HTML markup, an already parsed document
#                    object, or nil/false (treated as an empty String).
#
# Returns the DocumentFragment parsed from String input; any non-String
# object is handed back untouched.
def self.parse(document_or_html)
  source = document_or_html || ''
  return source unless source.is_a?(String)
  DocumentFragment.parse(source)
end
59
+
60
+ # Public: Returns an Array of Filter objects for this Pipeline.
61
+ attr_reader :filters
62
+
63
+ # Public: Instrumentation service for the pipeline.
64
+ # Set an ActiveSupport::Notifications compatible object to enable.
65
+ attr_accessor :instrumentation_service
66
+
67
+ # Public: String name for this Pipeline. Defaults to Class name.
68
+ attr_writer :instrumentation_name
69
# Returns the explicitly assigned instrumentation name, falling back to the
# name of the receiver's class when none has been set.
def instrumentation_name
  return @instrumentation_name if @instrumentation_name
  self.class.name
end
72
+
73
+ class << self
74
+ # Public: Default instrumentation service for new pipeline objects.
75
+ attr_accessor :default_instrumentation_service
76
+ end
77
+
78
# Build a pipeline.
#
# filters         - Array (possibly nested) of filter objects; it is
#                   flattened and frozen.
# default_context - Hash merged into the context of every run. Must not be
#                   nil.
# result_class    - Class instantiated for the result of each run when the
#                   caller supplies none. Defaults to Hash.
#
# Raises ArgumentError when default_context is nil.
def initialize(filters, default_context = {}, result_class = nil)
  raise ArgumentError, "default_context cannot be nil" if default_context.nil?
  @filters = filters.flatten.freeze
  # Freeze a private copy: freezing the argument itself would make the
  # caller's own Hash immutable as a side effect of building a pipeline.
  @default_context = default_context.dup.freeze
  @result_class = result_class || Hash
  @instrumentation_service = self.class.default_instrumentation_service
end
85
+
86
# Run the input through every filter of this pipeline, in order.
#
# html    - A String containing HTML or a DocumentFragment object.
# context - Hash merged over the pipeline's default context and frozen
#           before it is handed to the filters, so filters cannot modify
#           it; state is passed back through the result instead.
# result  - Object the filters write extracted information to. A fresh
#           instance of the pipeline's result class is created when nil.
#
# Returns the result object. Its :output entry holds whatever the last
# filter returned (or the untouched input when there are no filters).
def call(html, context = {}, result = nil)
  context = @default_context.merge(context).freeze
  result ||= @result_class.new
  payload = default_payload(:filters => @filters.map(&:name),
                            :context => context,
                            :result  => result)

  instrument("call_pipeline.html_pipeline", payload) do
    current = html
    @filters.each do |filter|
      current = perform_filter(filter, current, context, result)
    end
    result[:output] = current
  end

  result
end
112
+
113
# Internal: Run one filter against the document inside a
# "call_filter.html_pipeline" instrumentation event.
#
# filter  - Object responding to name and call(doc, context, result).
# doc     - The current document or HTML String.
# context - The context Hash for this run.
# result  - The result object for this run.
#
# Returns whatever the filter returns.
def perform_filter(filter, doc, context, result)
  event_payload = default_payload(:filter  => filter.name,
                                  :context => context,
                                  :result  => result)
  instrument("call_filter.html_pipeline", event_payload) do
    filter.call(doc, context, result)
  end
end
125
+
126
# Same as #call, except the return value is the pipeline output passed
# through HTML::Pipeline.parse. Use this when a DocumentFragment is needed,
# since a pipeline's output may be either a DocumentFragment or a String.
#
# input   - A String containing HTML or a DocumentFragment object.
# context - Context Hash for this run.
# result  - Optional result object forwarded to #call.
def to_document(input, context = {}, result = nil)
  HTML::Pipeline.parse(call(input, context, result)[:output])
end
133
+
134
# Like call but guarantee the value returned is a string of HTML markup.
#
# input   - A String containing HTML or a DocumentFragment object.
# context - Context Hash for this run.
# result  - Optional result object forwarded to #call, so callers can read
#           what the filters recorded on it after rendering.
#
# Returns a String: output.to_html when the pipeline output responds to
# to_html, otherwise output.to_s.
def to_html(input, context = {}, result = nil)
  # Forward the caller's result object. The previous `result = nil` in the
  # argument list reset it, so a result passed to to_html was never filled.
  result = call(input, context, result)
  output = result[:output]
  output.respond_to?(:to_html) ? output.to_html : output.to_s
end
144
+
145
# Public: Configure instrumentation for this pipeline.
#
# name    - Name to report for this pipeline; nil clears any custom name.
# service - Instrumentation service to use. When nil, the class-level
#           default_instrumentation_service is used.
#
# The return value is not meant to be used.
def setup_instrumentation(name = nil, service = nil)
  self.instrumentation_name = name
  service ||= self.class.default_instrumentation_service
  self.instrumentation_service = service
end
153
+
154
# Internal: if the `instrumentation_service` object is set, instruments the
# block, otherwise the block is run without instrumentation.
#
# event   - String event name.
# payload - Hash of event data. Defaults to default_payload when nil.
#
# Yields the payload (as provided by the service when one is set).
#
# Returns the result of the provided block.
def instrument(event, payload = nil)
  payload ||= default_payload
  return yield(payload) unless instrumentation_service
  # The block parameter has its own name so that it no longer shadows the
  # method's `payload` local.
  instrumentation_service.instrument(event, payload) do |service_payload|
    yield service_payload
  end
end
165
+
166
# Internal: Build the base instrumentation payload.
#
# payload - Hash of extra entries; they take precedence over the default
#           :pipeline entry when keys collide.
#
# Returns a new Hash containing :pipeline (the instrumentation name) plus
# the extra entries.
def default_payload(payload = {})
  base = {:pipeline => instrumentation_name}
  base.update(payload)
end
174
+ end
175
+ end
176
+
177
# XXX Nokogiri monkey patches, applied only on Ruby 1.8 (detected by String
# lacking #force_encoding). On newer Rubies this whole section is skipped.
unless ''.respond_to?(:force_encoding)
  class Nokogiri::XML::Node
    # Works around utf-8 encoded data being erroneously converted to another
    # encoding when text nodes are replaced. See 'utf-8 output 2' in
    # user_content_test.rb for details.
    #
    # String-like replacements are first parsed inside a wrapper <div> and
    # the wrapper's child nodes are used as the replacement instead.
    def replace_with_encoding_fix(replacement)
      if replacement.respond_to?(:to_str)
        wrapper = document.fragment("<div>#{replacement}</div>").children.first
        replacement = wrapper.children
      end
      replace_without_encoding_fix(replacement)
    end

    # Keep the original #replace reachable, then route #replace through the
    # encoding fix. The order of these two aliases matters.
    alias_method :replace_without_encoding_fix, :replace
    alias_method :replace, :replace_with_encoding_fix

    # Replaces this node like #replace but returns the receiver.
    def swap(replacement)
      replace(replacement)
      self
    end
  end
end
@@ -0,0 +1,121 @@
1
+ require 'set'
2
+
3
+ module HTML
4
+ class Pipeline
5
+ # HTML filter that replaces @user mentions with links. Mentions within <pre>,
6
+ # <code>, and <a> elements are ignored. Mentions that reference users that do
7
+ # not exist are ignored.
8
+ #
9
+ # Context options:
10
+ # :base_url - Used to construct links to user profile pages for each
11
+ # mention.
12
+ # :info_url - Used to link to "more info" when someone mentions @mention
13
+ # or @mentioned.
14
+ #
15
+ class MentionFilter < Filter
16
# Public: Find user @mentions in text. See
# MentionFilter#mention_link_filter.
#
#   MentionFilter.mentioned_logins_in(text) do |match, login, is_mentioned|
#     "<a href=...>#{login}</a>"
#   end
#
# text - String text to search.
#
# Yields the String match (which includes the character preceding the "@",
# if any), the String login name, and a Boolean telling whether the login
# is one of MentionLogins. The block's return value replaces the match in
# the text.
#
# Returns a String with every match replaced by the block's return value.
def self.mentioned_logins_in(text)
  text.gsub(MentionPattern) do |match|
    login = Regexp.last_match(1)
    yield match, login, MentionLogins.include?(login.downcase)
  end
end

# Pattern used to extract @mentions from text.
MentionPattern = /
  (?:^|\W)                    # beginning of string or non-word char
  @((?>[a-z0-9][a-z0-9-]*))   # @username
  (?!\/)                      # without a trailing slash
  (?=
    \.+[ \t\W]|               # dots followed by space or non-word character
    \.+$|                     # dots at end of line
    [^0-9a-zA-Z_.]|           # non-word character except dot
    $                         # end of line
  )
/ix

# List of username logins that, when mentioned, link to the blog post
# about @mentions instead of triggering a real mention. Frozen so the
# shared list cannot be modified at runtime.
MentionLogins = %w(
  mention
  mentions
  mentioned
  mentioning
).freeze

# Don't look for mentions in text nodes that are children of these elements.
# Frozen for the same reason as MentionLogins.
IGNORE_PARENTS = %w(pre code a).to_set.freeze
61
+
62
# Replace @mentions in the document's text nodes with links.
#
# Ensures result[:mentioned_usernames] exists, then visits every text node.
# Nodes without an "@", nodes inside one of IGNORE_PARENTS, and nodes whose
# content is unchanged by mention_link_filter are left alone.
#
# Returns the document.
def call
  result[:mentioned_usernames] ||= []

  doc.search('text()').each do |text_node|
    original = text_node.to_html
    next unless original.include?('@')
    next if has_ancestor?(text_node, IGNORE_PARENTS)

    linked = mention_link_filter(original, base_url, info_url)
    text_node.replace(linked) unless linked == original
  end

  doc
end
75
+
76
# The "more info" URL used when someone @mentions one of the special
# mention names (such as @mention or @mentioned).
#
# Returns context[:info_url], or nil when it is unset or false.
def info_url
  url = context[:info_url]
  url ? url : nil
end
81
+
82
# Replace user @mentions in text with links.
#
# text     - String text to replace @mention usernames in.
# base_url - The base URL used to construct user profile URLs.
# info_url - The "more info" URL passed to link_to_mention_info for the
#            special mention names.
#
# Special mention names are handed to link_to_mention_info; every other
# login is handed to link_to_mentioned_user. When no link is produced the
# matched text is kept as is.
#
# Returns a String with @mentions replaced by the generated links.
def mention_link_filter(text, base_url='/', info_url=nil)
  self.class.mentioned_logins_in(text) do |match, login, is_mentioned|
    replacement =
      is_mentioned ? link_to_mention_info(login, info_url) : link_to_mentioned_user(login)

    next match unless replacement
    match.sub("@#{login}", replacement)
  end
end
104
+
105
# Build the markup for one of the special mention names.
#
# text     - The mention name without the leading "@".
# info_url - The "more info" URL, or nil.
#
# Returns plain "@text" when info_url is nil, otherwise an anchor with the
# 'user-mention' class pointing at info_url.
def link_to_mention_info(text, info_url=nil)
  label = "@#{text}"
  return label if info_url.nil?
  "<a href='#{info_url}' class='user-mention'>#{label}</a>"
end
111
+
112
# Record the login in result[:mentioned_usernames] (without duplicates) and
# build the link to that user's profile page.
#
# login - The mentioned username without the leading "@".
#
# Returns an anchor String with the 'user-mention' class whose href is
# base_url joined with the login.
def link_to_mentioned_user(login)
  result[:mentioned_usernames] |= [login]
  profile_url = File.join(base_url, login)
  "<a href='#{profile_url}' class='user-mention'>@#{login}</a>"
end
119
+ end
120
+ end
121
+ end
@@ -0,0 +1,48 @@
1
+ require 'uri'
2
+
3
+ module HTML
4
+ class Pipeline
5
+
6
+ class AbsoluteSourceFilter < Filter
7
+ # HTML Filter for replacing relative and root relative image URLs with
8
+ # fully qualified URLs
9
+ #
10
+ # This is useful if an image is root relative but should really be going
11
+ # through a cdn, or if the content for the page assumes the host is known
12
+ # i.e. scraped webpages and some RSS feeds.
13
+ #
14
+ # Context options:
15
+ # :image_base_url - Base URL for image host for root relative src.
16
+ # :image_subpage_url - For relative src.
17
+ #
18
+ # This filter does not write additional information to the context.
19
+ # This filter would need to be run before CamoFilter.
20
# Rewrite the src of every <img> that is not already an absolute http(s)
# URL into a fully qualified URL.
#
# Root relative sources (starting with "/") are resolved against
# image_base_url, all other relative sources against image_subpage_url.
# Images with a missing or empty src are skipped.
#
# Returns the document.
def call
  doc.search("img").each do |element|
    src = element['src']
    next if src.nil? || src.empty?

    src = src.strip
    # Only a real http/https scheme marks an absolute URL. A bare 'http'
    # prefix check also matched relative paths such as "httpdocs/logo.png"
    # and left them unresolved.
    next if src =~ /\Ahttps?:/i

    base = src.start_with?('/') ? image_base_url : image_subpage_url
    element["src"] = URI.join(base, src).to_s
  end
  doc
end
35
+
36
# Private: The base URL that root relative image sources are resolved
# against.
#
# Returns context[:image_base_url].
# Raises RuntimeError when the context has no :image_base_url.
def image_base_url
  base = context[:image_base_url]
  raise "Missing context :image_base_url for #{self.class.name}" unless base
  base
end
40
+
41
# Private: The page URL that relative (non root relative) image sources are
# resolved against.
#
# Returns context[:image_subpage_url].
# Raises RuntimeError when the context has no :image_subpage_url.
def image_subpage_url
  subpage = context[:image_subpage_url]
  raise "Missing context :image_subpage_url for #{self.class.name}" unless subpage
  subpage
end
45
+
46
+ end
47
+ end
48
+ end
@@ -0,0 +1,22 @@
1
+ require 'rinku'
2
+
3
+ module HTML
4
+ class Pipeline
5
+ # HTML Filter for auto_linking urls in HTML.
6
+ #
7
+ # Context options:
8
+ # :autolink - boolean whether to autolink urls
9
+ # :flags - additional Rinku flags. See https://github.com/vmg/rinku
10
+ #
11
+ # This filter does not write additional information to the context.
12
+ class AutolinkFilter < Filter
13
# Auto-link URLs in the HTML with Rinku, leaving the contents of <a>,
# <script>, <kbd>, <pre> and <code> elements alone.
#
# Returns the html unchanged when context[:autolink] is exactly false,
# otherwise the value returned by Rinku.auto_link.
def call
  return html if context[:autolink] == false

  extra_flags = context[:flags]
  # OR-ing with 0 keeps the flags value as is while still requiring it to
  # be something Integer#| accepts.
  flags = extra_flags ? (0 | extra_flags) : 0

  Rinku.auto_link(html, :urls, nil, %w[a script kbd pre code], flags)
end
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,42 @@
1
+ module HTML
2
+ class Pipeline
3
# Public: Runs a String of content through an HTML processing pipeline,
# providing easy access to a generated DocumentFragment.
#
# The pipeline is run lazily, on the first call to #result, #output or
# #document, and the values are memoized afterwards.
class BodyContent
  # Public: Initialize a BodyContent.
  #
  # body     - A String body.
  # context  - A Hash of context options for the filters.
  # pipeline - A HTML::Pipeline object with one or more Filters.
  def initialize(body, context, pipeline)
    @body = body
    @context = context
    @pipeline = pipeline
  end

  # Public: Gets the memoized result of the body content as it passed through
  # the Pipeline.
  #
  # This is the only reader for the result; the former `attr_reader :result`
  # was immediately overridden by this method and has been removed.
  #
  # Returns whatever @pipeline.call returns for the body and context.
  def result
    @result ||= @pipeline.call(@body, @context)
  end

  # Public: Gets the updated body from the Pipeline result.
  #
  # Returns the :output entry of the result.
  def output
    @output ||= result[:output]
  end

  # Public: Parses the output into a DocumentFragment.
  #
  # Returns the value of HTML::Pipeline.parse for the output.
  def document
    @document ||= HTML::Pipeline.parse(output)
  end
end
41
+ end
42
+ end