motion-html-pipeline 0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +379 -0
  3. data/lib/motion-html-pipeline.rb +14 -0
  4. data/lib/motion-html-pipeline/document_fragment.rb +27 -0
  5. data/lib/motion-html-pipeline/pipeline.rb +153 -0
  6. data/lib/motion-html-pipeline/pipeline/absolute_source_filter.rb +45 -0
  7. data/lib/motion-html-pipeline/pipeline/body_content.rb +42 -0
  8. data/lib/motion-html-pipeline/pipeline/disabled/@mention_filter.rb +140 -0
  9. data/lib/motion-html-pipeline/pipeline/disabled/autolink_filter.rb +27 -0
  10. data/lib/motion-html-pipeline/pipeline/disabled/camo_filter.rb +93 -0
  11. data/lib/motion-html-pipeline/pipeline/disabled/email_reply_filter.rb +66 -0
  12. data/lib/motion-html-pipeline/pipeline/disabled/emoji_filter.rb +125 -0
  13. data/lib/motion-html-pipeline/pipeline/disabled/markdown_filter.rb +37 -0
  14. data/lib/motion-html-pipeline/pipeline/disabled/plain_text_input_filter.rb +13 -0
  15. data/lib/motion-html-pipeline/pipeline/disabled/sanitization_filter.rb +137 -0
  16. data/lib/motion-html-pipeline/pipeline/disabled/syntax_highlight_filter.rb +44 -0
  17. data/lib/motion-html-pipeline/pipeline/disabled/toc_filter.rb +67 -0
  18. data/lib/motion-html-pipeline/pipeline/filter.rb +163 -0
  19. data/lib/motion-html-pipeline/pipeline/https_filter.rb +27 -0
  20. data/lib/motion-html-pipeline/pipeline/image_filter.rb +17 -0
  21. data/lib/motion-html-pipeline/pipeline/image_max_width_filter.rb +37 -0
  22. data/lib/motion-html-pipeline/pipeline/text_filter.rb +14 -0
  23. data/lib/motion-html-pipeline/pipeline/version.rb +5 -0
  24. data/spec/motion-html-pipeline/_helpers/mock_instumentation_service.rb +19 -0
  25. data/spec/motion-html-pipeline/pipeline/absolute_source_filter_spec.rb +47 -0
  26. data/spec/motion-html-pipeline/pipeline/disabled/auto_link_filter_spec.rb +33 -0
  27. data/spec/motion-html-pipeline/pipeline/disabled/camo_filter_spec.rb +75 -0
  28. data/spec/motion-html-pipeline/pipeline/disabled/email_reply_filter_spec.rb +64 -0
  29. data/spec/motion-html-pipeline/pipeline/disabled/emoji_filter_spec.rb +92 -0
  30. data/spec/motion-html-pipeline/pipeline/disabled/markdown_filter_spec.rb +112 -0
  31. data/spec/motion-html-pipeline/pipeline/disabled/plain_text_input_filter_spec.rb +20 -0
  32. data/spec/motion-html-pipeline/pipeline/disabled/sanitization_filter_spec.rb +164 -0
  33. data/spec/motion-html-pipeline/pipeline/disabled/syntax_highlighting_filter_spec.rb +59 -0
  34. data/spec/motion-html-pipeline/pipeline/disabled/toc_filter_spec.rb +137 -0
  35. data/spec/motion-html-pipeline/pipeline/https_filter_spec.rb +52 -0
  36. data/spec/motion-html-pipeline/pipeline/image_filter_spec.rb +37 -0
  37. data/spec/motion-html-pipeline/pipeline/image_max_width_filter_spec.rb +57 -0
  38. data/spec/motion-html-pipeline/pipeline_spec.rb +80 -0
  39. data/spec/spec_helper.rb +48 -0
  40. metadata +147 -0
@@ -0,0 +1,37 @@
1
+ # MotionHTMLPipeline::Pipeline.require_dependency('commonmarker', 'MarkdownFilter')
2
+ #
3
+ # module MotionHTMLPipeline
4
+ # class Pipeline
5
+ # # HTML Filter that converts Markdown text into HTML and converts it into a
6
+ # # DocumentFragment. This is different from most filters in that it can take
7
+ # # non-HTML text as input. It must be used as the first filter in a pipeline.
8
+ # #
9
+ # # Context options:
10
+ # # :gfm => false Disable GFM line-end processing
11
+ # # :commonmarker_extensions => [ :table, :strikethrough,
12
+ # # :tagfilter, :autolink ] CommonMarker extensions to include
13
+ # #
14
+ # # This filter does not write any additional information to the context hash.
15
+ # class MarkdownFilter < TextFilter
16
+ # def initialize(text, context = nil, result = nil)
17
+ # super text, context, result
18
+ # @text = @text.delete "\r"
19
+ # end
20
+ #
21
+ # # Convert Markdown to HTML using the best available implementation
22
+ # # and convert into a DocumentFragment.
23
+ # def call
24
+ # options = [:GITHUB_PRE_LANG]
25
+ # options << :HARDBREAKS if context[:gfm] != false
26
+ # options << :UNSAFE if context[:unsafe]
27
+ # extensions = context.fetch(
28
+ # :commonmarker_extensions,
29
+ # ['table', 'strikethrough', 'tagfilter', 'autolink']
30
+ # )
31
+ # html = CommonMarker.render_html(@text, options, extensions)
32
+ # html.rstrip!
33
+ # html
34
+ # end
35
+ # end
36
+ # end
37
+ # end
@@ -0,0 +1,13 @@
1
+ # MotionHTMLPipeline::Pipeline.require_dependency('escape_utils', 'PlainTextInputFilter')
2
+ #
3
+ # module MotionHTMLPipeline
4
+ # class Pipeline
5
+ # # Simple filter for plain text input. HTML escapes the text input and wraps it
6
+ # # in a div.
7
+ # class PlainTextInputFilter < TextFilter
8
+ # def call
9
+ # "<div>#{EscapeUtils.escape_html(@text, false)}</div>"
10
+ # end
11
+ # end
12
+ # end
13
+ # end
@@ -0,0 +1,137 @@
1
+ # MotionHTMLPipeline::Pipeline.require_dependency('sanitize', 'SanitizationFilter')
2
+ #
3
+ # module MotionHTMLPipeline
4
+ # class Pipeline
5
+ # # HTML filter with sanitization routines and whitelists. This module defines
6
+ # # what HTML is allowed in user provided content and fixes up issues with
7
+ # # unbalanced tags and whatnot.
8
+ # #
9
+ # # See the Sanitize docs for more information on the underlying library:
10
+ # #
11
+ # # https://github.com/rgrove/sanitize/#readme
12
+ # #
13
+ # # Context options:
14
+ # # :whitelist - The sanitizer whitelist configuration to use. This
15
+ # # can be one of the options constants defined in this
16
+ # # class or a custom sanitize options hash.
17
+ # # :anchor_schemes - The URL schemes to allow in <a href> attributes. The
18
+ # # default set is provided in the ANCHOR_SCHEMES
19
+ # # constant in this class. If passed, this overrides any
20
+ # # schemes specified in the whitelist configuration.
21
+ # #
22
+ # # This filter does not write additional information to the context.
23
+ # class SanitizationFilter < Filter
24
+ # LISTS = Set.new(%w[ul ol].freeze)
25
+ # LIST_ITEM = 'li'.freeze
26
+ #
27
+ # # List of table child elements. These must be contained by a <table> element
28
+ # # or they are not allowed through. Otherwise they can be used to break out
29
+ # # of places we're using tables to contain formatted user content (like pull
30
+ # # request review comments).
31
+ # TABLE_ITEMS = Set.new(%w[tr td th].freeze)
32
+ # TABLE = 'table'.freeze
33
+ # TABLE_SECTIONS = Set.new(%w[thead tbody tfoot].freeze)
34
+ #
35
+ # # These schemes are the only ones allowed in <a href> attributes by default.
36
+ # ANCHOR_SCHEMES = ['http', 'https', 'mailto', 'xmpp', :relative, 'github-windows', 'github-mac', 'irc', 'ircs'].freeze
37
+ #
38
+ # # The main sanitization whitelist. Only these elements and attributes are
39
+ # # allowed through by default.
40
+ # WHITELIST = {
41
+ # elements: %w[
42
+ # h1 h2 h3 h4 h5 h6 h7 h8 br b i strong em a pre code img tt
43
+ # div ins del sup sub p ol ul table thead tbody tfoot blockquote
44
+ # dl dt dd kbd q samp var hr ruby rt rp li tr td th s strike summary
45
+ # details caption figure figcaption
46
+ # abbr bdo cite dfn mark small span time wbr
47
+ # ].freeze,
48
+ # remove_contents: ['script'].freeze,
49
+ # attributes: {
50
+ # 'a' => ['href'].freeze,
51
+ # 'img' => %w[src longdesc].freeze,
52
+ # 'div' => %w[itemscope itemtype].freeze,
53
+ # 'blockquote' => ['cite'].freeze,
54
+ # 'del' => ['cite'].freeze,
55
+ # 'ins' => ['cite'].freeze,
56
+ # 'q' => ['cite'].freeze,
57
+ # all: %w[abbr accept accept-charset
58
+ # accesskey action align alt
59
+ # aria-describedby aria-hidden aria-label aria-labelledby
60
+ # axis border cellpadding cellspacing char
61
+ # charoff charset checked
62
+ # clear cols colspan color
63
+ # compact coords datetime dir
64
+ # disabled enctype for frame
65
+ # headers height hreflang
66
+ # hspace ismap label lang
67
+ # maxlength media method
68
+ # multiple name nohref noshade
69
+ # nowrap open prompt readonly rel rev
70
+ # rows rowspan rules scope
71
+ # selected shape size span
72
+ # start summary tabindex target
73
+ # title type usemap valign value
74
+ # vspace width itemprop].freeze
75
+ # }.freeze,
76
+ # protocols: {
77
+ # 'a' => { 'href' => ANCHOR_SCHEMES }.freeze,
78
+ # 'blockquote' => { 'cite' => ['http', 'https', :relative].freeze },
79
+ # 'del' => { 'cite' => ['http', 'https', :relative].freeze },
80
+ # 'ins' => { 'cite' => ['http', 'https', :relative].freeze },
81
+ # 'q' => { 'cite' => ['http', 'https', :relative].freeze },
82
+ # 'img' => {
83
+ # 'src' => ['http', 'https', :relative].freeze,
84
+ # 'longdesc' => ['http', 'https', :relative].freeze
85
+ # }.freeze
86
+ # },
87
+ # transformers: [
88
+ # # Top-level <li> elements are removed because they can break out of
89
+ # # containing markup.
90
+ # lambda { |env|
91
+ # name = env[:node_name]
92
+ # node = env[:node]
93
+ # if name == LIST_ITEM && node.ancestors.none? { |n| LISTS.include?(n.name) }
94
+ # node.replace(node.children)
95
+ # end
96
+ # },
97
+ #
98
+ # # Table child elements that are not contained by a <table> are removed.
99
+ # lambda { |env|
100
+ # name = env[:node_name]
101
+ # node = env[:node]
102
+ # if (TABLE_SECTIONS.include?(name) || TABLE_ITEMS.include?(name)) && node.ancestors.none? { |n| n.name == TABLE }
103
+ # node.replace(node.children)
104
+ # end
105
+ # }
106
+ # ].freeze
107
+ # }.freeze
108
+ #
109
+ # # A more limited sanitization whitelist. This includes all attributes,
110
+ # # protocols, and transformers from WHITELIST but with a more locked down
111
+ # # set of allowed elements.
112
+ # LIMITED = WHITELIST.merge(
113
+ # elements: %w[b i strong em a pre code img ins del sup sub mark abbr p ol ul li]
114
+ # )
115
+ #
116
+ # # Strip all HTML tags from the document.
117
+ # FULL = { elements: [] }.freeze
118
+ #
119
+ # # Sanitize markup using the Sanitize library.
120
+ # def call
121
+ # Sanitize.clean_node!(doc, whitelist)
122
+ # end
123
+ #
124
+ # # The whitelist to use when sanitizing. This can be passed in the context
125
+ # # hash to the filter but defaults to WHITELIST constant value above.
126
+ # def whitelist
127
+ # whitelist = context[:whitelist] || WHITELIST
128
+ # anchor_schemes = context[:anchor_schemes]
129
+ # return whitelist unless anchor_schemes
130
+ # whitelist = whitelist.dup
131
+ # whitelist[:protocols] = (whitelist[:protocols] || {}).dup
132
+ # whitelist[:protocols]['a'] = (whitelist[:protocols]['a'] || {}).merge('href' => anchor_schemes)
133
+ # whitelist
134
+ # end
135
+ # end
136
+ # end
137
+ # end
@@ -0,0 +1,44 @@
1
+ # MotionHTMLPipeline::Pipeline.require_dependency('rouge', 'SyntaxHighlightFilter')
2
+ #
3
+ # module MotionHTMLPipeline
4
+ # class Pipeline
5
+ # # HTML Filter that syntax highlights code blocks wrapped
6
+ # # in <pre lang="...">.
7
+ # class SyntaxHighlightFilter < Filter
8
+ # def initialize(*args)
9
+ # super(*args)
10
+ # @formatter = Rouge::Formatters::HTML.new
11
+ # end
12
+ #
13
+ # def call
14
+ # doc.search('pre').each do |node|
15
+ # default = context[:highlight] && context[:highlight].to_s
16
+ # next unless lang = node['lang'] || default
17
+ # next unless lexer = lexer_for(lang)
18
+ # text = node.inner_text
19
+ #
20
+ # html = highlight_with_timeout_handling(text, lang)
21
+ # next if html.nil?
22
+ #
23
+ # node.inner_html = html
24
+ # klass = node['class']
25
+ # scope = context[:scope] || "highlight-#{lang}"
26
+ # klass = [klass, scope].compact.join ' '
27
+ #
28
+ # node['class'] = klass
29
+ # end
30
+ # doc
31
+ # end
32
+ #
33
+ # def highlight_with_timeout_handling(text, lang)
34
+ # Rouge.highlight(text, lang, @formatter)
35
+ # rescue Timeout::Error => _
36
+ # nil
37
+ # end
38
+ #
39
+ # def lexer_for(lang)
40
+ # Rouge::Lexer.find(lang)
41
+ # end
42
+ # end
43
+ # end
44
+ # end
@@ -0,0 +1,67 @@
1
+ # MotionHTMLPipeline::Pipeline.require_dependency('escape_utils', 'TableOfContentsFilter')
2
+ #
3
+ # module MotionHTMLPipeline
4
+ # class Pipeline
5
+ # # HTML filter that adds an 'id' attribute to all headers
6
+ # # in a document, so they can be accessed from a table of contents.
7
+ # #
8
+ # # Generates the Table of Contents, with links to each header.
9
+ # #
10
+ # # Examples
11
+ # #
12
+ # # TocPipeline =
13
+ # # MotionHTMLPipeline::Pipeline.new [
14
+ # # MotionHTMLPipeline::Pipeline::TableOfContentsFilter
15
+ # # ]
16
+ # # # => #<MotionHTMLPipeline::Pipeline:0x007fc13c4528d8...>
17
+ # # orig = %(<h1>Ice cube</h1><p>is not for the pop chart</p>)
18
+ # # # => "<h1>Ice cube</h1><p>is not for the pop chart</p>"
19
+ # # result = {}
20
+ # # # => {}
21
+ # # TocPipeline.call(orig, {}, result)
22
+ # # # => {:toc=> ...}
23
+ # # result[:toc]
24
+ # # # => "<ul class=\"section-nav\">\n<li><a href=\"#ice-cube\">...</li>\n</ul>"
25
+ # # result[:output].to_s
26
+ # # # => "<h1>\n<a id=\"ice-cube\" class=\"anchor\" href=\"#ice-cube\">..."
27
+ # class TableOfContentsFilter < Filter
28
+ # PUNCTUATION_REGEXP = RUBY_VERSION > '1.9' ? /[^\p{Word}\- ]/u : /[^\w\- ]/
29
+ #
30
+ # # The icon that will be placed next to an anchored rendered markdown header
31
+ # def anchor_icon
32
+ # context[:anchor_icon] || '<span aria-hidden="true" class="octicon octicon-link"></span>'
33
+ # end
34
+ #
35
+ # def call
36
+ # result[:toc] = ''
37
+ #
38
+ # headers = Hash.new(0)
39
+ # doc.css('h1, h2, h3, h4, h5, h6').each do |node|
40
+ # text = node.text
41
+ # id = ascii_downcase(text)
42
+ # id.gsub!(PUNCTUATION_REGEXP, '') # remove punctuation
43
+ # id.tr!(' ', '-') # replace spaces with dash
44
+ #
45
+ # uniq = headers[id] > 0 ? "-#{headers[id]}" : ''
46
+ # headers[id] += 1
47
+ # if header_content = node.children.first
48
+ # result[:toc] << %(<li><a href="##{id}#{uniq}">#{EscapeUtils.escape_html(text)}</a></li>\n)
49
+ # header_content.add_previous_sibling(%(<a id="#{id}#{uniq}" class="anchor" href="##{id}#{uniq}" aria-hidden="true">#{anchor_icon}</a>))
50
+ # end
51
+ # end
52
+ # result[:toc] = %(<ul class="section-nav">\n#{result[:toc]}</ul>) unless result[:toc].empty?
53
+ # doc
54
+ # end
55
+ #
56
+ # if RUBY_VERSION >= '2.4'
57
+ # def ascii_downcase(str)
58
+ # str.downcase(:ascii)
59
+ # end
60
+ # else
61
+ # def ascii_downcase(str)
62
+ # str.downcase
63
+ # end
64
+ # end
65
+ # end
66
+ # end
67
+ # end
@@ -0,0 +1,163 @@
1
+ module MotionHTMLPipeline
2
+ class Pipeline
3
+ # Base class for user content HTML filters. Each filter takes an
4
+ # HTML string or MotionHTMLPipeline::DocumentFragment, performs
5
+ # modifications and/or writes information to the result hash. Filters must
6
+ # return a DocumentFragment (typically the same instance provided to the call
7
+ # method) or a String with HTML markup.
8
+ #
9
+ # Example filter that replaces all images with trollface:
10
+ #
11
+ # class FuuuFilter < MotionHTMLPipeline::Pipeline::Filter
12
+ # def call
13
+ # doc.search('img').each do |img|
14
+ # img['src'] = "http://paradoxdgn.com/junk/avatars/trollface.jpg"
15
+ # end
16
+ # end
17
+ # end
18
+ #
19
+ # The context Hash passes options to filters and should not be changed in
20
+ # place. A Result Hash allows filters to make extracted information
21
+ # available to the caller and is mutable.
22
+ #
23
+ # Common context options:
24
+ # :base_url - The site's base URL
25
+ # :repository - A Repository providing context for the HTML being processed
26
+ #
27
+ # Each filter may define additional options and output values. See the class
28
+ # docs for more info.
29
+ class Filter
30
+ class InvalidDocumentException < StandardError; end
31
+
32
+ def initialize(doc, context = nil, result = nil)
33
+ if doc.is_a?(String)
34
+ @html = doc.to_str
35
+ @doc = nil
36
+ else
37
+ @doc = doc
38
+ @html = nil
39
+ end
40
+ @context = context || {}
41
+ @result = result || {}
42
+ validate
43
+ end
44
+
45
+ # Public: Returns a simple Hash used to pass extra information into filters
46
+ # and also to allow filters to make extracted information available to the
47
+ # caller.
48
+ attr_reader :context
49
+
50
+ # Public: Returns a Hash used to allow filters to pass back information
51
+ # to callers of the various Pipelines. This can be used for
52
+ # #mentioned_users, for example.
53
+ attr_reader :result
54
+
55
+ # The MotionHTMLPipeline::DocumentFragment to be manipulated. If the filter was
56
+ # provided a String, parse into a DocumentFragment the first time this
57
+ # method is called.
58
+ def doc
59
+ @doc ||= parse_html(html)
60
+ end
61
+
62
+ # The String representation of the document. If a DocumentFragment was
63
+ # provided to the Filter, it is serialized into a String when this method is
64
+ # called.
65
+ def html
66
+ raise InvalidDocumentException if @html.nil? && @doc.nil?
67
+ @html || doc.to_html
68
+ end
69
+
70
+ # The main filter entry point. The doc attribute is guaranteed to be a
71
+ # MotionHTMLPipeline::DocumentFragment when invoked. Subclasses should modify
72
+ # this document in place or extract information and add it to the result
73
+ # hash.
74
+ def call
75
+ raise NotImplementedError
76
+ end
77
+
78
+ # Make sure the context has everything we need. Noop: Subclasses can override.
79
+ def validate; end
80
+
81
+ # The Repository object provided in the context hash, or nil when no
82
+ # :repository was specified.
83
+ #
84
+ # It's assumed that the repository context has already been checked
85
+ # for permissions
86
+ def repository
87
+ context[:repository]
88
+ end
89
+
90
+ # The User object provided in the context hash, or nil when no user
91
+ # was specified
92
+ def current_user
93
+ context[:current_user]
94
+ end
95
+
96
+ # The site's base URL provided in the context hash, or '/' when no
97
+ # base URL was specified.
98
+ def base_url
99
+ context[:base_url] || '/'
100
+ end
101
+
102
+ # Ensure the passed argument is a DocumentFragment. When a string is
103
+ # provided, it is parsed and returned; otherwise, the DocumentFragment is
104
+ # returned unmodified.
105
+ def parse_html(html)
106
+ MotionHTMLPipeline::Pipeline.parse(html)
107
+ end
108
+
109
+ # Helper method for filter subclasses used to determine if any of a node's
110
+ # ancestors have one of the tag names specified.
111
+ #
112
+ # node - The Node object to check.
113
+ # tags - An array of tag name strings to check. These should be downcase.
114
+ #
115
+ # Returns true when the node has a matching ancestor, nil otherwise.
116
+ def has_ancestor?(node, tags)
117
+ while node = node.parentNode
118
+ break true if tags.include?(node.name.downcase)
119
+ end
120
+ end
121
+
122
+ # Perform a filter on doc with the given context.
123
+ #
124
+ # Returns a MotionHTMLPipeline::DocumentFragment or a String containing HTML
125
+ # markup.
126
+ def self.call(doc, context = nil, result = nil)
127
+ new(doc, context, result).call
128
+ end
129
+
130
+ # Like call but guarantees that a DocumentFragment is returned, even when
131
+ # the last filter returns a String.
132
+ def self.to_document(input, context = nil)
133
+ html = call(input, context)
134
+ MotionHTMLPipeline::Pipeline.parse(html)
135
+ end
136
+
137
+ # Like call but guarantees that a string of HTML markup is returned.
138
+ def self.to_html(input, context = nil)
139
+ output = call(input, context)
140
+ if output.respond_to?(:to_html)
141
+ output.to_html
142
+ else
143
+ output.to_s
144
+ end
145
+ end
146
+
147
+ # Validator for required context. This checks that every key passed in
148
+ # is present in the context hash.
149
+ #
150
+ # If any keys are missing, an ArgumentError is raised with a
151
+ # message listing all the missing context keys and the filter class that
152
+ # requires them.
153
+ def needs(*keys)
154
+ missing = keys.reject { |key| context.include? key }
155
+
156
+ if missing.any?
157
+ raise ArgumentError,
158
+ "Missing context keys for #{self.class.name}: #{missing.map(&:inspect).join ', '}"
159
+ end
160
+ end
161
+ end
162
+ end
163
+ end