html-pipeline 2.14.3 → 3.0.0.pre1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (54) hide show
  1. checksums.yaml +4 -4
  2. data/.github/FUNDING.yml +11 -3
  3. data/.github/dependabot.yml +20 -0
  4. data/.github/workflows/automerge.yml +34 -0
  5. data/.github/workflows/lint.yml +23 -0
  6. data/.github/workflows/tag_and_release.yml +70 -0
  7. data/.github/workflows/test.yml +33 -0
  8. data/.rubocop.yml +17 -0
  9. data/CHANGELOG.md +28 -2
  10. data/Gemfile +29 -15
  11. data/{LICENSE → LICENSE.txt} +2 -2
  12. data/README.md +209 -218
  13. data/Rakefile +14 -7
  14. data/UPGRADING.md +35 -0
  15. data/html-pipeline.gemspec +31 -21
  16. data/lib/html-pipeline.rb +3 -0
  17. data/lib/html_pipeline/convert_filter/markdown_filter.rb +26 -0
  18. data/lib/html_pipeline/convert_filter.rb +17 -0
  19. data/lib/html_pipeline/filter.rb +89 -0
  20. data/lib/{html/pipeline → html_pipeline/node_filter}/absolute_source_filter.rb +23 -21
  21. data/lib/{html/pipeline → html_pipeline/node_filter}/emoji_filter.rb +58 -54
  22. data/lib/html_pipeline/node_filter/https_filter.rb +22 -0
  23. data/lib/html_pipeline/node_filter/image_max_width_filter.rb +40 -0
  24. data/lib/{html/pipeline/@mention_filter.rb → html_pipeline/node_filter/mention_filter.rb} +55 -69
  25. data/lib/html_pipeline/node_filter/table_of_contents_filter.rb +68 -0
  26. data/lib/html_pipeline/node_filter/team_mention_filter.rb +105 -0
  27. data/lib/html_pipeline/node_filter.rb +31 -0
  28. data/lib/html_pipeline/sanitization_filter.rb +65 -0
  29. data/lib/{html/pipeline → html_pipeline/text_filter}/image_filter.rb +3 -3
  30. data/lib/{html/pipeline → html_pipeline/text_filter}/plain_text_input_filter.rb +3 -5
  31. data/lib/html_pipeline/text_filter.rb +21 -0
  32. data/lib/html_pipeline/version.rb +5 -0
  33. data/lib/html_pipeline.rb +252 -0
  34. metadata +52 -54
  35. data/.travis.yml +0 -43
  36. data/Appraisals +0 -19
  37. data/CONTRIBUTING.md +0 -60
  38. data/bin/html-pipeline +0 -78
  39. data/lib/html/pipeline/@team_mention_filter.rb +0 -99
  40. data/lib/html/pipeline/autolink_filter.rb +0 -34
  41. data/lib/html/pipeline/body_content.rb +0 -44
  42. data/lib/html/pipeline/camo_filter.rb +0 -105
  43. data/lib/html/pipeline/email_reply_filter.rb +0 -69
  44. data/lib/html/pipeline/filter.rb +0 -165
  45. data/lib/html/pipeline/https_filter.rb +0 -29
  46. data/lib/html/pipeline/image_max_width_filter.rb +0 -37
  47. data/lib/html/pipeline/markdown_filter.rb +0 -56
  48. data/lib/html/pipeline/sanitization_filter.rb +0 -144
  49. data/lib/html/pipeline/syntax_highlight_filter.rb +0 -50
  50. data/lib/html/pipeline/text_filter.rb +0 -16
  51. data/lib/html/pipeline/textile_filter.rb +0 -25
  52. data/lib/html/pipeline/toc_filter.rb +0 -69
  53. data/lib/html/pipeline/version.rb +0 -7
  54. data/lib/html/pipeline.rb +0 -210
@@ -1,9 +1,9 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'set'
3
+ require "set"
4
4
 
5
- module HTML
6
- class Pipeline
5
+ class HTMLPipeline
6
+ class NodeFilter
7
7
  # HTML filter that replaces @user mentions with links. Mentions within <pre>,
8
8
  # <code>, and <a> elements are ignored. Mentions that reference users that do
9
9
  # not exist are ignored.
@@ -16,71 +16,69 @@ module HTML
16
16
  # :username_pattern - Used to provide a custom regular expression to
17
17
  # identify usernames
18
18
  #
19
- class MentionFilter < Filter
20
- # Public: Find user @mentions in text. See
21
- # MentionFilter#mention_link_filter.
22
- #
23
- # MentionFilter.mentioned_logins_in(text) do |match, login, is_mentioned|
24
- # "<a href=...>#{login}</a>"
25
- # end
26
- #
27
- # text - String text to search.
28
- #
29
- # Yields the String match, the String login name, and a Boolean determining
30
- # if the match = "@mention[ed]". The yield's return replaces the match in
31
- # the original text.
32
- #
33
- # Returns a String replaced with the return of the block.
34
- def self.mentioned_logins_in(text, username_pattern = UsernamePattern)
35
- text.gsub MentionPatterns[username_pattern] do |match|
36
- login = Regexp.last_match(1)
37
- yield match, login, MentionLogins.include?(login.downcase)
19
+ class MentionFilter < NodeFilter
20
+ class << self
21
+ # Public: Find user @mentions in text. See
22
+ # MentionFilter#mention_link_filter.
23
+ #
24
+ # MentionFilter.mentioned_logins_in(text) do |match, login|
25
+ # "<a href=...>#{login}</a>"
26
+ # end
27
+ #
28
+ # text - String text to search.
29
+ #
30
+ # Yields the String match and the String login name. The yield's
31
+ # return replaces the match in
32
+ # the original text.
33
+ #
34
+ # Returns a String replaced with the return of the block.
35
+ def mentioned_logins_in(text, username_pattern = USERNAME_PATTERN)
36
+ text.gsub(MENTION_PATTERNS[username_pattern]) do |match|
37
+ login = Regexp.last_match(1)
38
+ yield match, login
39
+ end
38
40
  end
39
- end
40
-
41
+ end
41
42
  # Hash that contains all of the mention patterns used by the pipeline
42
- MentionPatterns = Hash.new do |hash, key|
43
- hash[key] = /
43
+ MENTION_PATTERNS = Hash.new do |hash, key|
44
+ hash[key] = %r{
44
45
  (?:^|\W) # beginning of string or non-word char
45
46
  @((?>#{key})) # @username
46
- (?!\/) # without a trailing slash
47
+ (?!/) # without a trailing slash
47
48
  (?=
48
49
  \.+[ \t\W]| # dots followed by space or non-word character
49
50
  \.+$| # dots at end of line
50
51
  [^0-9a-zA-Z_.]| # non-word character except dot
51
52
  $ # end of line
52
53
  )
53
- /ix
54
+ }ix
54
55
  end
55
56
 
56
57
  # Default pattern used to extract usernames from text. The value can be
57
58
  # overridden by providing the username_pattern variable in the context.
58
- UsernamePattern = /[a-z0-9][a-z0-9-]*/
59
-
60
- # List of username logins that, when mentioned, link to the blog post
61
- # about @mentions instead of triggering a real mention.
62
- MentionLogins = %w[
63
- mention
64
- mentions
65
- mentioned
66
- mentioning
67
- ].freeze
59
+ USERNAME_PATTERN = /[a-z0-9][a-z0-9-]*/
68
60
 
69
61
  # Don't look for mentions in text nodes that are children of these elements
70
- IGNORE_PARENTS = %w(pre code a style script).to_set
62
+ IGNORE_PARENTS = ["pre", "code", "a", "style", "script"]
63
+
64
+ SELECTOR = Selma::Selector.new(match_text_within: "*", ignore_text_within: IGNORE_PARENTS)
71
65
 
72
- def call
66
+ def after_initialize
73
67
  result[:mentioned_usernames] ||= []
68
+ end
74
69
 
75
- doc.search('.//text()').each do |node|
76
- content = node.to_html
77
- next unless content.include?('@')
78
- next if has_ancestor?(node, IGNORE_PARENTS)
79
- html = mention_link_filter(content, base_url, info_url, username_pattern)
80
- next if html == content
81
- node.replace(html)
82
- end
83
- doc
70
+ def selector
71
+ SELECTOR
72
+ end
73
+
74
+ def handle_text_chunk(text)
75
+ content = text.to_s
76
+ return unless content.include?("@")
77
+
78
+ html = mention_link_filter(content, base_url: base_url, username_pattern: username_pattern)
79
+ return if html == content
80
+
81
+ text.replace(html, as: :html)
84
82
  end
85
83
 
86
84
  # The URL to provide when someone @mentions a "mention" name, such
@@ -90,7 +88,7 @@ module HTML
90
88
  end
91
89
 
92
90
  def username_pattern
93
- context[:username_pattern] || UsernamePattern
91
+ context[:username_pattern] || USERNAME_PATTERN
94
92
  end
95
93
 
96
94
  # Replace user @mentions in text with links to the mentioned user's
@@ -105,35 +103,23 @@ module HTML
105
103
  #
106
104
  # Returns a string with @mentions replaced with links. All links have a
107
105
  # 'user-mention' class name attached for styling.
108
- def mention_link_filter(text, _base_url = '/', info_url = nil, username_pattern = UsernamePattern)
109
- self.class.mentioned_logins_in(text, username_pattern) do |match, login, is_mentioned|
110
- link =
111
- if is_mentioned
112
- link_to_mention_info(login, info_url)
113
- else
114
- link_to_mentioned_user(login)
115
- end
106
+ def mention_link_filter(text, base_url: "/", username_pattern: USERNAME_PATTERN)
107
+ self.class.mentioned_logins_in(text, username_pattern) do |match, login|
108
+ link = link_to_mentioned_user(base_url, login)
116
109
 
117
110
  link ? match.sub("@#{login}", link) : match
118
111
  end
119
112
  end
120
113
 
121
- def link_to_mention_info(text, info_url = nil)
122
- return "@#{text}" if info_url.nil?
123
- "<a href='#{info_url}' class='user-mention'>" \
124
- "@#{text}" \
125
- '</a>'
126
- end
127
-
128
- def link_to_mentioned_user(login)
114
+ def link_to_mentioned_user(base_url, login)
129
115
  result[:mentioned_usernames] |= [login]
130
116
 
131
117
  url = base_url.dup
132
- url << '/' unless url =~ /[\/~]\z/
118
+ url << "/" unless %r{[/~]\z}.match?(url)
133
119
 
134
- "<a href='#{url << login}' class='user-mention'>" \
120
+ "<a href=\"#{url << login}\" class=\"user-mention\">" \
135
121
  "@#{login}" \
136
- '</a>'
122
+ "</a>"
137
123
  end
138
124
  end
139
125
  end
@@ -0,0 +1,68 @@
1
+ # frozen_string_literal: true
2
+
3
+ class HTMLPipeline
4
+ class NodeFilter
5
+ # Generates a Table of Contents: an array of hashes containing:
6
+ # * `href`: the relative link to the header
7
+ # * `text`: the text of the header
8
+
9
+ # Examples
10
+ #
11
+ # TocPipeline =
12
+ # HTMLPipeline.new [
13
+ # HTMLPipeline::TableOfContentsFilter
14
+ # ]
15
+ # # => #<HTMLPipeline:0x007fc13c4528d8...>
16
+ # orig = %(<h1>Ice cube</h1><p>is not for the pop chart</p>)
17
+ # # => "<h1>Ice cube</h1><p>is not for the pop chart</p>"
18
+ # result = {}
19
+ # # => {}
20
+ # TocPipeline.call(orig, {}, result)
21
+ # # => {:toc=> ...}
22
+ # result[:toc]
23
+ # # => "{:href=>"#ice-cube", :text=>"Ice cube"}"
24
+ # result[:output].to_s
25
+ # # => "<h1>\n<a id=\"ice-cube\" class=\"anchor\" href=\"#ice-cube\">..."
26
+ class TableOfContentsFilter < NodeFilter
27
+ SELECTOR = Selma::Selector.new(match_element: "h1 a[href], h2 a[href], h3 a[href], h4 a[href], h5 a[href], h6 a[href]",
28
+ match_text_within: "h1, h2, h3, h4, h5, h6")
29
+
30
+ def selector
31
+ SELECTOR
32
+ end
33
+
34
+ # The icon that will be placed next to an anchored rendered markdown header
35
+ def anchor_html
36
+ @context[:anchor_html] || %(<span aria-hidden="true" class="anchor"></span>)
37
+ end
38
+
39
+ # The class that will be attached on the anchored rendered markdown header
40
+ def classes
41
+ context[:classes] || "anchor"
42
+ end
43
+
44
+ def after_initialize
45
+ result[:toc] = []
46
+ end
47
+
48
+ def handle_element(element)
49
+ header_href = element["href"]
50
+
51
+ return unless header_href.start_with?("#")
52
+
53
+ header_id = header_href[1..-1]
54
+
55
+ element["id"] = header_id
56
+ element["class"] = classes
57
+
58
+ element.set_inner_content(anchor_html, as: :html)
59
+
60
+ result[:toc] << { href: header_href }
61
+ end
62
+
63
+ def handle_text_chunk(text)
64
+ result[:toc].last[:text] = text.to_s
65
+ end
66
+ end
67
+ end
68
+ end
@@ -0,0 +1,105 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "set"
4
+
5
+ class HTMLPipeline
6
+ class NodeFilter
7
+ # HTML filter that replaces @org/team mentions with links. Mentions within
8
+ # <pre>, <code>, <a>, <style>, and <script> elements are ignored.
9
+ #
10
+ # Context options:
11
+ # :base_url - Used to construct links to team profile pages for each
12
+ # mention.
13
+ # :team_pattern - Used to provide a custom regular expression to
14
+ # identify team names
15
+ #
16
+ class TeamMentionFilter < NodeFilter
17
+ class << self
18
+ # Public: Find @org/team mentions in text. See
19
+ # TeamMentionFilter#team_mention_link_filter.
20
+ #
21
+ # TeamMentionFilter.mentioned_teams_in(text) do |match, org, team|
22
+ # "<a href=...>#{team}</a>"
23
+ # end
24
+ #
25
+ # text - String text to search.
26
+ #
27
+ # Yields the String match, org name, and team name. The yield's
28
+ # return replaces the match in the original text.
29
+ #
30
+ # Returns a String replaced with the return of the block.
31
+ def mentioned_teams_in(text, team_pattern = TEAM_PATTERN)
32
+ text.gsub(team_pattern) do |match|
33
+ org = Regexp.last_match(1)
34
+ team = Regexp.last_match(2)
35
+ yield match, org, team
36
+ end
37
+ end
38
+ end
39
+
40
+ # Default pattern used to extract team names from text. The value can be
41
+ # overridden by providing the team_pattern variable in the context. To
42
+ # properly link the mention, should be in the format of /@(1)\/(2)/.
43
+ TEAM_PATTERN = %r{
44
+ (?<=^|\W) # beginning of string or non-word char
45
+ @([a-z0-9][a-z0-9-]*) # @organization
46
+ (?:/|&\#47;?) # dividing slash
47
+ ([a-z0-9][a-z0-9\-_]*) # team
48
+ \b
49
+ }ix
50
+
51
+ # Don't look for mentions in text nodes that are children of these elements
52
+ IGNORE_PARENTS = ["pre", "code", "a", "style", "script"]
53
+
54
+ SELECTOR = Selma::Selector.new(match_text_within: "*", ignore_text_within: IGNORE_PARENTS)
55
+
56
+ def after_initialize
57
+ result[:mentioned_teams] = []
58
+ end
59
+
60
+ def selector
61
+ SELECTOR
62
+ end
63
+
64
+ def handle_text_chunk(text)
65
+ content = text.to_s
66
+ return unless content.include?("@")
67
+
68
+ text.replace(mention_link_filter(content, base_url: base_url, team_pattern: team_pattern), as: :html)
69
+ end
70
+
71
+ def team_pattern
72
+ context[:team_pattern] || TEAM_PATTERN
73
+ end
74
+
75
+ # Replace @org/team mentions in text with links to the mentioned team's
76
+ # page.
77
+ #
78
+ # text - String text to replace @mention team names in.
79
+ # base_url - The base URL used to construct team page URLs.
80
+ # team_pattern - Regular expression used to identify teams in text
81
+ #
82
+ # Returns a string with @team mentions replaced with links. All links have a
83
+ # 'team-mention' class name attached for styling.
84
+ def mention_link_filter(text, base_url: "/", team_pattern: TEAM_PATTERN)
85
+ self.class.mentioned_teams_in(text, team_pattern) do |match, org, team|
86
+ link = link_to_mentioned_team(base_url, org, team)
87
+ seperator = %r{/|&\#47;?}
88
+
89
+ link ? match.sub(/@#{org}#{seperator}#{team}/, link) : match
90
+ end
91
+ end
92
+
93
+ def link_to_mentioned_team(base_url, org, team)
94
+ result[:mentioned_teams] |= [team]
95
+
96
+ url = base_url.dup
97
+ url << "/" unless %r{[/~]\z}.match?(url)
98
+
99
+ "<a href=\"#{url << org}/#{team}\" class=\"team-mention\">" \
100
+ "@#{org}/#{team}" \
101
+ "</a>"
102
+ end
103
+ end
104
+ end
105
+ end
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "selma"
4
+
5
+ class HTMLPipeline
6
+ class NodeFilter < Filter
7
+ def initialize(context: {}, result: {})
8
+ super(context: context, result: {})
9
+ send(:after_initialize) if respond_to?(:after_initialize)
10
+ end
11
+
12
+ # The String representation of the document.
13
+ def html
14
+ raise InvalidDocumentException if @html.nil? && @doc.nil?
15
+
16
+ @html || doc.to_html
17
+ end
18
+
19
+ def reset!
20
+ result = {} # rubocop:disable Lint/UselessAssignment
21
+ send(:after_initialize) if respond_to?(:after_initialize)
22
+ end
23
+
24
+ class << self
25
+ def call(html, context: {}, result: {})
26
+ node_filter = new(context: context, result: result)
27
+ Selma::Rewriter.new(sanitizer: nil, handlers: [node_filter]).rewrite(html)
28
+ end
29
+ end
30
+ end
31
+ end
@@ -0,0 +1,65 @@
1
+ # frozen_string_literal: true
2
+
3
+ class HTMLPipeline
4
+ # A special filter with sanitization routines and allowlists. This module defines
5
+ # what HTML is allowed in user provided content and fixes up issues with
6
+ # unbalanced tags and whatnot.
7
+ #
8
+ # See the Selma docs for more information on the underlying library:
9
+ #
10
+ # https://github.com/gjtorikian/selma/#readme
11
+ #
12
+ # This filter does not write additional information to the context.
13
+ class SanitizationFilter
14
+ VALID_PROTOCOLS = Selma::Sanitizer::Config::VALID_PROTOCOLS.dup
15
+
16
+ # The main sanitization allowlist. Only these elements and attributes are
17
+ # allowed through by default.
18
+ DEFAULT_CONFIG = Selma::Sanitizer::Config.freeze_config({
19
+ elements: ["h1", "h2", "h3", "h4", "h5", "h6", "br", "b", "i", "strong", "em", "a", "pre", "code",
20
+ "img", "tt", "div", "ins", "del", "sup", "sub", "p", "picture", "ol", "ul", "table", "thead", "tbody", "tfoot",
21
+ "blockquote", "dl", "dt", "dd", "kbd", "q", "samp", "var", "hr", "ruby", "rt", "rp", "li", "tr", "td", "th",
22
+ "s", "strike", "summary", "details", "caption", "figure", "figcaption", "abbr", "bdo", "cite",
23
+ "dfn", "mark", "small", "source", "span", "time", "wbr",],
24
+
25
+ attributes: {
26
+ "a" => ["href"],
27
+ "img" => ["src", "longdesc", "loading", "alt"],
28
+ "div" => ["itemscope", "itemtype"],
29
+ "blockquote" => ["cite"],
30
+ "del" => ["cite"],
31
+ "ins" => ["cite"],
32
+ "q" => ["cite"],
33
+ "source" => ["srcset"],
34
+ all: ["abbr", "accept", "accept-charset", "accesskey", "action", "align", "alt", "aria-describedby",
35
+ "aria-hidden", "aria-label", "aria-labelledby", "axis", "border", "char",
36
+ "charoff", "charset", "checked", "clear", "cols", "colspan", "compact", "coords", "datetime", "dir",
37
+ "disabled", "enctype", "for", "frame", "headers", "height", "hreflang", "hspace", "id", "ismap", "label", "lang",
38
+ "maxlength", "media", "method", "multiple", "name", "nohref", "noshade", "nowrap", "open", "progress",
39
+ "prompt", "readonly", "rel", "rev", "role", "rows", "rowspan", "rules", "scope", "selected", "shape",
40
+ "size", "span", "start", "summary", "tabindex", "title", "type", "usemap", "valign", "value", "width", "itemprop",],
41
+ },
42
+ protocols: {
43
+ "a" => { "href" => Selma::Sanitizer::Config::VALID_PROTOCOLS }.freeze,
44
+ "blockquote" => { "cite" => ["http", "https", :relative].freeze },
45
+ "del" => { "cite" => ["http", "https", :relative].freeze },
46
+ "ins" => { "cite" => ["http", "https", :relative].freeze },
47
+ "q" => { "cite" => ["http", "https", :relative].freeze },
48
+ "img" => {
49
+ "src" => ["http", "https", :relative].freeze,
50
+ "longdesc" => ["http", "https", :relative].freeze,
51
+ },
52
+ },
53
+ })
54
+
55
+ class << self
56
+ def call(html, config)
57
+ raise ArgumentError, "html must be a String, not #{html.class}" unless html.is_a?(String)
58
+ raise ArgumentError, "config must be a Hash, not #{config.class}" unless config.is_a?(Hash)
59
+
60
+ sanitization_config = Selma::Sanitizer.new(config)
61
+ Selma::Rewriter.new(sanitizer: sanitization_config).rewrite(html)
62
+ end
63
+ end
64
+ end
65
+ end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- module HTML
4
- class Pipeline
3
+ class HTMLPipeline
4
+ class TextFilter
5
5
  # HTML Filter that converts image's url into <img> tag.
6
6
  # For example, it will convert
7
7
  # http://example.com/test.jpg
@@ -10,7 +10,7 @@ module HTML
10
10
 
11
11
  class ImageFilter < TextFilter
12
12
  def call
13
- @text.gsub(/(https|http)?:\/\/.+\.(jpg|jpeg|bmp|gif|png)(\?\S+)?/i) do |match|
13
+ @text.gsub(%r{(https|http)?://.+\.(jpg|jpeg|bmp|gif|png)(\?\S+)?}i) do |match|
14
14
  %(<img src="#{match}" alt=""/>)
15
15
  end
16
16
  end
@@ -1,14 +1,12 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- HTML::Pipeline.require_dependency('escape_utils', 'PlainTextInputFilter')
4
-
5
- module HTML
6
- class Pipeline
3
+ class HTMLPipeline
4
+ class TextFilter
7
5
  # Simple filter for plain text input. HTML escapes the text input and wraps it
8
6
  # in a div.
9
7
  class PlainTextInputFilter < TextFilter
10
8
  def call
11
- "<div>#{CGI.escape_html(@text)}</div>"
9
+ "<div>#{CGI.escapeHTML(@text)}</div>"
12
10
  end
13
11
  end
14
12
  end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ class HTMLPipeline
4
+ class TextFilter < Filter
5
+ attr_reader :text
6
+
7
+ def initialize(text, context: {}, result: {})
8
+ raise TypeError, "text must be a String" unless text.is_a?(String)
9
+
10
+ # Ensure that this is always a string
11
+ @text = text.respond_to?(:to_str) ? text.to_str : text.to_s
12
+ super(context: context, result: result)
13
+ end
14
+
15
+ class << self
16
+ def call(input, context: {}, result: {})
17
+ new(input, context: context, result: result).call
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ class HTMLPipeline
4
+ VERSION = "3.0.0.pre1"
5
+ end