motion-html-pipeline 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +379 -0
  3. data/lib/motion-html-pipeline.rb +14 -0
  4. data/lib/motion-html-pipeline/document_fragment.rb +27 -0
  5. data/lib/motion-html-pipeline/pipeline.rb +153 -0
  6. data/lib/motion-html-pipeline/pipeline/absolute_source_filter.rb +45 -0
  7. data/lib/motion-html-pipeline/pipeline/body_content.rb +42 -0
  8. data/lib/motion-html-pipeline/pipeline/disabled/@mention_filter.rb +140 -0
  9. data/lib/motion-html-pipeline/pipeline/disabled/autolink_filter.rb +27 -0
  10. data/lib/motion-html-pipeline/pipeline/disabled/camo_filter.rb +93 -0
  11. data/lib/motion-html-pipeline/pipeline/disabled/email_reply_filter.rb +66 -0
  12. data/lib/motion-html-pipeline/pipeline/disabled/emoji_filter.rb +125 -0
  13. data/lib/motion-html-pipeline/pipeline/disabled/markdown_filter.rb +37 -0
  14. data/lib/motion-html-pipeline/pipeline/disabled/plain_text_input_filter.rb +13 -0
  15. data/lib/motion-html-pipeline/pipeline/disabled/sanitization_filter.rb +137 -0
  16. data/lib/motion-html-pipeline/pipeline/disabled/syntax_highlight_filter.rb +44 -0
  17. data/lib/motion-html-pipeline/pipeline/disabled/toc_filter.rb +67 -0
  18. data/lib/motion-html-pipeline/pipeline/filter.rb +163 -0
  19. data/lib/motion-html-pipeline/pipeline/https_filter.rb +27 -0
  20. data/lib/motion-html-pipeline/pipeline/image_filter.rb +17 -0
  21. data/lib/motion-html-pipeline/pipeline/image_max_width_filter.rb +37 -0
  22. data/lib/motion-html-pipeline/pipeline/text_filter.rb +14 -0
  23. data/lib/motion-html-pipeline/pipeline/version.rb +5 -0
  24. data/spec/motion-html-pipeline/_helpers/mock_instumentation_service.rb +19 -0
  25. data/spec/motion-html-pipeline/pipeline/absolute_source_filter_spec.rb +47 -0
  26. data/spec/motion-html-pipeline/pipeline/disabled/auto_link_filter_spec.rb +33 -0
  27. data/spec/motion-html-pipeline/pipeline/disabled/camo_filter_spec.rb +75 -0
  28. data/spec/motion-html-pipeline/pipeline/disabled/email_reply_filter_spec.rb +64 -0
  29. data/spec/motion-html-pipeline/pipeline/disabled/emoji_filter_spec.rb +92 -0
  30. data/spec/motion-html-pipeline/pipeline/disabled/markdown_filter_spec.rb +112 -0
  31. data/spec/motion-html-pipeline/pipeline/disabled/plain_text_input_filter_spec.rb +20 -0
  32. data/spec/motion-html-pipeline/pipeline/disabled/sanitization_filter_spec.rb +164 -0
  33. data/spec/motion-html-pipeline/pipeline/disabled/syntax_highlighting_filter_spec.rb +59 -0
  34. data/spec/motion-html-pipeline/pipeline/disabled/toc_filter_spec.rb +137 -0
  35. data/spec/motion-html-pipeline/pipeline/https_filter_spec.rb +52 -0
  36. data/spec/motion-html-pipeline/pipeline/image_filter_spec.rb +37 -0
  37. data/spec/motion-html-pipeline/pipeline/image_max_width_filter_spec.rb +57 -0
  38. data/spec/motion-html-pipeline/pipeline_spec.rb +80 -0
  39. data/spec/spec_helper.rb +48 -0
  40. metadata +147 -0
@@ -0,0 +1,37 @@
1
+ # MotionHTMLPipeline::Pipeline.require_dependency('commonmarker', 'MarkdownFilter')
2
+ #
3
+ # module MotionHTMLPipeline
4
+ # class Pipeline
5
+ # # HTML Filter that converts Markdown text into HTML and converts into a
6
+ # # DocumentFragment. This is different from most filters in that it can take
7
+ # # non-HTML as input. It must be used as the first filter in a pipeline.
8
+ # #
9
+ # # Context options:
10
+ # # :gfm => false Disable GFM line-end processing
11
+ # # :commonmarker_extensions => [ :table, :strikethrough,
12
+ # # :tagfilter, :autolink ] Common marker extensions to include
13
+ # #
14
+ # # This filter does not write any additional information to the context hash.
15
+ # class MarkdownFilter < TextFilter
16
+ # def initialize(text, context = nil, result = nil)
17
+ # super text, context, result
18
+ # @text = @text.delete "\r"
19
+ # end
20
+ #
21
+ # # Convert Markdown to HTML using the best available implementation
22
+ # # and convert into a DocumentFragment.
23
+ # def call
24
+ # options = [:GITHUB_PRE_LANG]
25
+ # options << :HARDBREAKS if context[:gfm] != false
26
+ # options << :UNSAFE if context[:unsafe]
27
+ # extensions = context.fetch(
28
+ # :commonmarker_extensions,
29
+ # ['table', 'strikethrough', 'tagfilter', 'autolink']
30
+ # )
31
+ # html = CommonMarker.render_html(@text, options, extensions)
32
+ # html.rstrip!
33
+ # html
34
+ # end
35
+ # end
36
+ # end
37
+ # end
@@ -0,0 +1,13 @@
1
+ # MotionHTMLPipeline::Pipeline.require_dependency('escape_utils', 'PlainTextInputFilter')
2
+ #
3
+ # module MotionHTMLPipeline
4
+ # class Pipeline
5
+ # # Simple filter for plain text input. HTML escapes the text input and wraps it
6
+ # # in a div.
7
+ # class PlainTextInputFilter < TextFilter
8
+ # def call
9
+ # "<div>#{EscapeUtils.escape_html(@text, false)}</div>"
10
+ # end
11
+ # end
12
+ # end
13
+ # end
@@ -0,0 +1,137 @@
1
+ # MotionHTMLPipeline::Pipeline.require_dependency('sanitize', 'SanitizationFilter')
2
+ #
3
+ # module MotionHTMLPipeline
4
+ # class Pipeline
5
+ # # HTML filter with sanitization routines and whitelists. This module defines
6
+ # # what HTML is allowed in user provided content and fixes up issues with
7
+ # # unbalanced tags and whatnot.
8
+ # #
9
+ # # See the Sanitize docs for more information on the underlying library:
10
+ # #
11
+ # # https://github.com/rgrove/sanitize/#readme
12
+ # #
13
+ # # Context options:
14
+ # # :whitelist - The sanitizer whitelist configuration to use. This
15
+ # # can be one of the options constants defined in this
16
+ # # class or a custom sanitize options hash.
17
+ # # :anchor_schemes - The URL schemes to allow in <a href> attributes. The
18
+ # # default set is provided in the ANCHOR_SCHEMES
19
+ # # constant in this class. If passed, this overrides any
20
+ # # schemes specified in the whitelist configuration.
21
+ # #
22
+ # # This filter does not write additional information to the context.
23
+ # class SanitizationFilter < Filter
24
+ # LISTS = Set.new(%w[ul ol].freeze)
25
+ # LIST_ITEM = 'li'.freeze
26
+ #
27
+ # # List of table child elements. These must be contained by a <table> element
28
+ # # or they are not allowed through. Otherwise they can be used to break out
29
+ # # of places we're using tables to contain formatted user content (like pull
30
+ # # request review comments).
31
+ # TABLE_ITEMS = Set.new(%w[tr td th].freeze)
32
+ # TABLE = 'table'.freeze
33
+ # TABLE_SECTIONS = Set.new(%w[thead tbody tfoot].freeze)
34
+ #
35
+ # # These schemes are the only ones allowed in <a href> attributes by default.
36
+ # ANCHOR_SCHEMES = ['http', 'https', 'mailto', 'xmpp', :relative, 'github-windows', 'github-mac', 'irc', 'ircs'].freeze
37
+ #
38
+ # # The main sanitization whitelist. Only these elements and attributes are
39
+ # # allowed through by default.
40
+ # WHITELIST = {
41
+ # elements: %w[
42
+ # h1 h2 h3 h4 h5 h6 h7 h8 br b i strong em a pre code img tt
43
+ # div ins del sup sub p ol ul table thead tbody tfoot blockquote
44
+ # dl dt dd kbd q samp var hr ruby rt rp li tr td th s strike summary
45
+ # details caption figure figcaption
46
+ # abbr bdo cite dfn mark small span time wbr
47
+ # ].freeze,
48
+ # remove_contents: ['script'].freeze,
49
+ # attributes: {
50
+ # 'a' => ['href'].freeze,
51
+ # 'img' => %w[src longdesc].freeze,
52
+ # 'div' => %w[itemscope itemtype].freeze,
53
+ # 'blockquote' => ['cite'].freeze,
54
+ # 'del' => ['cite'].freeze,
55
+ # 'ins' => ['cite'].freeze,
56
+ # 'q' => ['cite'].freeze,
57
+ # all: %w[abbr accept accept-charset
58
+ # accesskey action align alt
59
+ # aria-describedby aria-hidden aria-label aria-labelledby
60
+ # axis border cellpadding cellspacing char
61
+ # charoff charset checked
62
+ # clear cols colspan color
63
+ # compact coords datetime dir
64
+ # disabled enctype for frame
65
+ # headers height hreflang
66
+ # hspace ismap label lang
67
+ # maxlength media method
68
+ # multiple name nohref noshade
69
+ # nowrap open prompt readonly rel rev
70
+ # rows rowspan rules scope
71
+ # selected shape size span
72
+ # start summary tabindex target
73
+ # title type usemap valign value
74
+ # vspace width itemprop].freeze
75
+ # }.freeze,
76
+ # protocols: {
77
+ # 'a' => { 'href' => ANCHOR_SCHEMES }.freeze,
78
+ # 'blockquote' => { 'cite' => ['http', 'https', :relative].freeze },
79
+ # 'del' => { 'cite' => ['http', 'https', :relative].freeze },
80
+ # 'ins' => { 'cite' => ['http', 'https', :relative].freeze },
81
+ # 'q' => { 'cite' => ['http', 'https', :relative].freeze },
82
+ # 'img' => {
83
+ # 'src' => ['http', 'https', :relative].freeze,
84
+ # 'longdesc' => ['http', 'https', :relative].freeze
85
+ # }.freeze
86
+ # },
87
+ # transformers: [
88
+ # # Top-level <li> elements are removed because they can break out of
89
+ # # containing markup.
90
+ # lambda { |env|
91
+ # name = env[:node_name]
92
+ # node = env[:node]
93
+ # if name == LIST_ITEM && node.ancestors.none? { |n| LISTS.include?(n.name) }
94
+ # node.replace(node.children)
95
+ # end
96
+ # },
97
+ #
98
+ # # Table child elements that are not contained by a <table> are removed.
99
+ # lambda { |env|
100
+ # name = env[:node_name]
101
+ # node = env[:node]
102
+ # if (TABLE_SECTIONS.include?(name) || TABLE_ITEMS.include?(name)) && node.ancestors.none? { |n| n.name == TABLE }
103
+ # node.replace(node.children)
104
+ # end
105
+ # }
106
+ # ].freeze
107
+ # }.freeze
108
+ #
109
+ # # A more limited sanitization whitelist. This includes all attributes,
110
+ # # protocols, and transformers from WHITELIST but with a more locked down
111
+ # # set of allowed elements.
112
+ # LIMITED = WHITELIST.merge(
113
+ # elements: %w[b i strong em a pre code img ins del sup sub mark abbr p ol ul li]
114
+ # )
115
+ #
116
+ # # Strip all HTML tags from the document.
117
+ # FULL = { elements: [] }.freeze
118
+ #
119
+ # # Sanitize markup using the Sanitize library.
120
+ # def call
121
+ # Sanitize.clean_node!(doc, whitelist)
122
+ # end
123
+ #
124
+ # # The whitelist to use when sanitizing. This can be passed in the context
125
+ # # hash to the filter but defaults to WHITELIST constant value above.
126
+ # def whitelist
127
+ # whitelist = context[:whitelist] || WHITELIST
128
+ # anchor_schemes = context[:anchor_schemes]
129
+ # return whitelist unless anchor_schemes
130
+ # whitelist = whitelist.dup
131
+ # whitelist[:protocols] = (whitelist[:protocols] || {}).dup
132
+ # whitelist[:protocols]['a'] = (whitelist[:protocols]['a'] || {}).merge('href' => anchor_schemes)
133
+ # whitelist
134
+ # end
135
+ # end
136
+ # end
137
+ # end
@@ -0,0 +1,44 @@
1
+ # MotionHTMLPipeline::Pipeline.require_dependency('rouge', 'SyntaxHighlightFilter')
2
+ #
3
+ # module MotionHTMLPipeline
4
+ # class Pipeline
5
+ # # HTML Filter that syntax highlights code blocks wrapped
6
+ # # in <pre lang="...">.
7
+ # class SyntaxHighlightFilter < Filter
8
+ # def initialize(*args)
9
+ # super(*args)
10
+ # @formatter = Rouge::Formatters::HTML.new
11
+ # end
12
+ #
13
+ # def call
14
+ # doc.search('pre').each do |node|
15
+ # default = context[:highlight] && context[:highlight].to_s
16
+ # next unless lang = node['lang'] || default
17
+ # next unless lexer = lexer_for(lang)
18
+ # text = node.inner_text
19
+ #
20
+ # html = highlight_with_timeout_handling(text, lang)
21
+ # next if html.nil?
22
+ #
23
+ # node.inner_html = html
24
+ # klass = node['class']
25
+ # scope = context[:scope] || "highlight-#{lang}"
26
+ # klass = [klass, scope].compact.join ' '
27
+ #
28
+ # node['class'] = klass
29
+ # end
30
+ # doc
31
+ # end
32
+ #
33
+ # def highlight_with_timeout_handling(text, lang)
34
+ # Rouge.highlight(text, lang, @formatter)
35
+ # rescue Timeout::Error => _
36
+ # nil
37
+ # end
38
+ #
39
+ # def lexer_for(lang)
40
+ # Rouge::Lexer.find(lang)
41
+ # end
42
+ # end
43
+ # end
44
+ # end
@@ -0,0 +1,67 @@
1
+ # MotionHTMLPipeline::Pipeline.require_dependency('escape_utils', 'TableOfContentsFilter')
2
+ #
3
+ # module MotionHTMLPipeline
4
+ # class Pipeline
5
+ # # HTML filter that adds an 'id' attribute to all headers
6
+ # # in a document, so they can be accessed from a table of contents.
7
+ # #
8
+ # # Generates the Table of Contents, with links to each header.
9
+ # #
10
+ # # Examples
11
+ # #
12
+ # # TocPipeline =
13
+ # # MotionHTMLPipeline::Pipeline.new [
14
+ # # MotionHTMLPipeline::Pipeline::TableOfContentsFilter
15
+ # # ]
16
+ # # # => #<MotionHTMLPipeline::Pipeline:0x007fc13c4528d8...>
17
+ # # orig = %(<h1>Ice cube</h1><p>is not for the pop chart</p>)
18
+ # # # => "<h1>Ice cube</h1><p>is not for the pop chart</p>"
19
+ # # result = {}
20
+ # # # => {}
21
+ # # TocPipeline.call(orig, {}, result)
22
+ # # # => {:toc=> ...}
23
+ # # result[:toc]
24
+ # # # => "<ul class=\"section-nav\">\n<li><a href=\"#ice-cube\">...</li>\n</ul>"
25
+ # # result[:output].to_s
26
+ # # # => "<h1>\n<a id=\"ice-cube\" class=\"anchor\" href=\"#ice-cube\">..."
27
+ # class TableOfContentsFilter < Filter
28
+ # PUNCTUATION_REGEXP = RUBY_VERSION > '1.9' ? /[^\p{Word}\- ]/u : /[^\w\- ]/
29
+ #
30
+ # # The icon that will be placed next to an anchored rendered markdown header
31
+ # def anchor_icon
32
+ # context[:anchor_icon] || '<span aria-hidden="true" class="octicon octicon-link"></span>'
33
+ # end
34
+ #
35
+ # def call
36
+ # result[:toc] = ''
37
+ #
38
+ # headers = Hash.new(0)
39
+ # doc.css('h1, h2, h3, h4, h5, h6').each do |node|
40
+ # text = node.text
41
+ # id = ascii_downcase(text)
42
+ # id.gsub!(PUNCTUATION_REGEXP, '') # remove punctuation
43
+ # id.tr!(' ', '-') # replace spaces with dash
44
+ #
45
+ # uniq = headers[id] > 0 ? "-#{headers[id]}" : ''
46
+ # headers[id] += 1
47
+ # if header_content = node.children.first
48
+ # result[:toc] << %(<li><a href="##{id}#{uniq}">#{EscapeUtils.escape_html(text)}</a></li>\n)
49
+ # header_content.add_previous_sibling(%(<a id="#{id}#{uniq}" class="anchor" href="##{id}#{uniq}" aria-hidden="true">#{anchor_icon}</a>))
50
+ # end
51
+ # end
52
+ # result[:toc] = %(<ul class="section-nav">\n#{result[:toc]}</ul>) unless result[:toc].empty?
53
+ # doc
54
+ # end
55
+ #
56
+ # if RUBY_VERSION >= '2.4'
57
+ # def ascii_downcase(str)
58
+ # str.downcase(:ascii)
59
+ # end
60
+ # else
61
+ # def ascii_downcase(str)
62
+ # str.downcase
63
+ # end
64
+ # end
65
+ # end
66
+ # end
67
+ # end
@@ -0,0 +1,163 @@
1
+ module MotionHTMLPipeline
2
+ class Pipeline
3
+ # Base class for user content HTML filters. Each filter takes an
4
+ # HTML string or MotionHTMLPipeline::DocumentFragment, performs
5
+ # modifications and/or writes information to the result hash. Filters must
6
+ # return a DocumentFragment (typically the same instance provided to the call
7
+ # method) or a String with HTML markup.
8
+ #
9
+ # Example filter that replaces all images with trollface:
10
+ #
11
+ # class FuuuFilter < MotionHTMLPipeline::Pipeline::Filter
12
+ # def call
13
+ # doc.search('img').each do |img|
14
+ # img['src'] = "http://paradoxdgn.com/junk/avatars/trollface.jpg"
15
+ # end
16
+ # end
17
+ # end
18
+ #
19
+ # The context Hash passes options to filters and should not be changed in
20
+ # place. A Result Hash allows filters to make extracted information
21
+ # available to the caller and is mutable.
22
+ #
23
+ # Common context options:
24
+ # :base_url - The site's base URL
25
+ # :repository - A Repository providing context for the HTML being processed
26
+ #
27
+ # Each filter may define additional options and output values. See the class
28
+ # docs for more info.
29
+ class Filter
30
+ class InvalidDocumentException < StandardError; end
31
+
32
+ def initialize(doc, context = nil, result = nil)
33
+ if doc.is_a?(String)
34
+ @html = doc.to_str
35
+ @doc = nil
36
+ else
37
+ @doc = doc
38
+ @html = nil
39
+ end
40
+ @context = context || {}
41
+ @result = result || {}
42
+ validate
43
+ end
44
+
45
+ # Public: Returns a simple Hash used to pass extra information into filters
46
+ # and also to allow filters to make extracted information available to the
47
+ # caller.
48
+ attr_reader :context
49
+
50
+ # Public: Returns a Hash used to allow filters to pass back information
51
+ # to callers of the various Pipelines. This can be used for
52
+ # #mentioned_users, for example.
53
+ attr_reader :result
54
+
55
+ # The MotionHTMLPipeline::DocumentFragment to be manipulated. If the filter was
56
+ # provided a String, parse into a DocumentFragment the first time this
57
+ # method is called.
58
+ def doc
59
+ @doc ||= parse_html(html)
60
+ end
61
+
62
+ # The String representation of the document. If a DocumentFragment was
63
+ # provided to the Filter, it is serialized into a String when this method is
64
+ # called.
65
+ def html
66
+ raise InvalidDocumentException if @html.nil? && @doc.nil?
67
+ @html || doc.to_html
68
+ end
69
+
70
+ # The main filter entry point. The doc attribute is guaranteed to be a
71
+ # MotionHTMLPipeline::DocumentFragment when invoked. Subclasses should modify
72
+ # this document in place or extract information and add it to the result
73
+ # hash.
74
+ def call
75
+ raise NotImplementedError
76
+ end
77
+
78
+ # Make sure the context has everything we need. Noop: Subclasses can override.
79
+ def validate; end
80
+
81
+ # The Repository object provided in the context hash, or nil when no
82
+ # :repository was specified.
83
+ #
84
+ # It's assumed that the repository context has already been checked
85
+ # for permissions
86
+ def repository
87
+ context[:repository]
88
+ end
89
+
90
+ # The User object provided in the context hash, or nil when no user
91
+ # was specified
92
+ def current_user
93
+ context[:current_user]
94
+ end
95
+
96
+ # The site's base URL provided in the context hash, or '/' when no
97
+ # base URL was specified.
98
+ def base_url
99
+ context[:base_url] || '/'
100
+ end
101
+
102
+ # Ensure the passed argument is a DocumentFragment. When a string is
103
+ # provided, it is parsed and returned; otherwise, the DocumentFragment is
104
+ # returned unmodified.
105
+ def parse_html(html)
106
+ MotionHTMLPipeline::Pipeline.parse(html)
107
+ end
108
+
109
+ # Helper method for filter subclasses used to determine if any of a node's
110
+ # ancestors have one of the tag names specified.
111
+ #
112
+ # node - The Node object to check.
113
+ # tags - An array of tag name strings to check. These should be downcase.
114
+ #
115
+ # Returns true when the node has a matching ancestor.
116
+ def has_ancestor?(node, tags)
117
+ while node = node.parentNode
118
+ break true if tags.include?(node.name.downcase)
119
+ end
120
+ end
121
+
122
+ # Perform a filter on doc with the given context.
123
+ #
124
+ # Returns a MotionHTMLPipeline::DocumentFragment or a String containing HTML
125
+ # markup.
126
+ def self.call(doc, context = nil, result = nil)
127
+ new(doc, context, result).call
128
+ end
129
+
130
+ # Like call but guarantees that a DocumentFragment is returned, even when
131
+ # the last filter returns a String.
132
+ def self.to_document(input, context = nil)
133
+ html = call(input, context)
134
+ MotionHTMLPipeline::Pipeline.parse(html)
135
+ end
136
+
137
+ # Like call but guarantees that a string of HTML markup is returned.
138
+ def self.to_html(input, context = nil)
139
+ output = call(input, context)
140
+ if output.respond_to?(:to_html)
141
+ output.to_html
142
+ else
143
+ output.to_s
144
+ end
145
+ end
146
+
147
+ # Validator for required context. This will check that every key passed in
148
+ # keys exists in the context hash.
149
+ #
150
+ # If any errors are found an ArgumentError will be raised with a
151
+ # message listing all the missing contexts and the filters that
152
+ # require them.
153
+ def needs(*keys)
154
+ missing = keys.reject { |key| context.include? key }
155
+
156
+ if missing.any?
157
+ raise ArgumentError,
158
+ "Missing context keys for #{self.class.name}: #{missing.map(&:inspect).join ', '}"
159
+ end
160
+ end
161
+ end
162
+ end
163
+ end