RubyGems - html-pipeline - Versions diffs - 2.14.3 → 3.0.3 - Mend

html-pipeline 2.14.3 → 3.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59) hide show

checksums.yaml +4 -4
data/.github/FUNDING.yml +11 -3
data/.github/dependabot.yml +27 -0
data/.github/workflows/automerge.yml +13 -0
data/.github/workflows/ci.yml +22 -0
data/.github/workflows/lint.yml +23 -0
data/.github/workflows/publish.yml +19 -0
data/.rubocop.yml +17 -0
data/.ruby-version +1 -0
data/.vscode/settings.json +8 -0
data/CHANGELOG.md +128 -2
data/Gemfile +31 -15
data/{LICENSE → LICENSE.txt} +2 -2
data/README.md +241 -224
data/Rakefile +14 -7
data/UPGRADING.md +34 -0
data/html-pipeline.gemspec +31 -21
data/lib/html-pipeline.rb +3 -0
data/lib/html_pipeline/convert_filter/markdown_filter.rb +26 -0
data/lib/html_pipeline/convert_filter.rb +17 -0
data/lib/html_pipeline/filter.rb +89 -0
data/lib/html_pipeline/node_filter/absolute_source_filter.rb +54 -0
data/lib/html_pipeline/node_filter/asset_proxy_filter.rb +86 -0
data/lib/{html/pipeline → html_pipeline/node_filter}/emoji_filter.rb +58 -54
data/lib/html_pipeline/node_filter/https_filter.rb +22 -0
data/lib/html_pipeline/node_filter/image_max_width_filter.rb +40 -0
data/lib/{html/pipeline/@mention_filter.rb → html_pipeline/node_filter/mention_filter.rb} +54 -68
data/lib/html_pipeline/node_filter/syntax_highlight_filter.rb +62 -0
data/lib/html_pipeline/node_filter/table_of_contents_filter.rb +70 -0
data/lib/html_pipeline/node_filter/team_mention_filter.rb +105 -0
data/lib/html_pipeline/node_filter.rb +31 -0
data/lib/html_pipeline/sanitization_filter.rb +190 -0
data/lib/{html/pipeline → html_pipeline/text_filter}/image_filter.rb +3 -3
data/lib/{html/pipeline → html_pipeline/text_filter}/plain_text_input_filter.rb +3 -5
data/lib/html_pipeline/text_filter.rb +21 -0
data/lib/html_pipeline/version.rb +5 -0
data/lib/html_pipeline.rb +281 -0
metadata +58 -54
data/.travis.yml +0 -43
data/Appraisals +0 -19
data/CONTRIBUTING.md +0 -60
data/bin/html-pipeline +0 -78
data/lib/html/pipeline/@team_mention_filter.rb +0 -99
data/lib/html/pipeline/absolute_source_filter.rb +0 -52
data/lib/html/pipeline/autolink_filter.rb +0 -34
data/lib/html/pipeline/body_content.rb +0 -44
data/lib/html/pipeline/camo_filter.rb +0 -105
data/lib/html/pipeline/email_reply_filter.rb +0 -69
data/lib/html/pipeline/filter.rb +0 -165
data/lib/html/pipeline/https_filter.rb +0 -29
data/lib/html/pipeline/image_max_width_filter.rb +0 -37
data/lib/html/pipeline/markdown_filter.rb +0 -56
data/lib/html/pipeline/sanitization_filter.rb +0 -144
data/lib/html/pipeline/syntax_highlight_filter.rb +0 -50
data/lib/html/pipeline/text_filter.rb +0 -16
data/lib/html/pipeline/textile_filter.rb +0 -25
data/lib/html/pipeline/toc_filter.rb +0 -69
data/lib/html/pipeline/version.rb +0 -7
data/lib/html/pipeline.rb +0 -210

data/lib/{html/pipeline/@mention_filter.rb → html_pipeline/node_filter/mention_filter.rb} RENAMED Viewed

@@ -1,9 +1,9 @@
 # frozen_string_literal: true
-require 'set'
+require "set"
-module HTML
-  class Pipeline
+class HTMLPipeline
+  class NodeFilter
     # HTML filter that replaces @user mentions with links. Mentions within <pre>,
     # <code>, and <a> elements are ignored. Mentions that reference users that do
     # not exist are ignored.
@@ -16,71 +16,69 @@ module HTML
     #   :username_pattern - Used to provide a custom regular expression to
     #                       identify usernames
     #
-    class MentionFilter < Filter
-      # Public: Find user @mentions in text.  See
-      # MentionFilter#mention_link_filter.
-      #
-      #   MentionFilter.mentioned_logins_in(text) do |match, login, is_mentioned|
-      #     "<a href=...>#{login}</a>"
-      #   end
-      #
-      # text - String text to search.
-      #
-      # Yields the String match, the String login name, and a Boolean determining
-      # if the match = "@mention[ed]".  The yield's return replaces the match in
-      # the original text.
-      #
-      # Returns a String replaced with the return of the block.
-      def self.mentioned_logins_in(text, username_pattern = UsernamePattern)
-        text.gsub MentionPatterns[username_pattern] do |match|
-          login = Regexp.last_match(1)
-          yield match, login, MentionLogins.include?(login.downcase)
+    class MentionFilter < NodeFilter
+      class << self
+        # Public: Find user @mentions in text.  See
+        # MentionFilter#mention_link_filter.
+        #
+        #   MentionFilter.mentioned_logins_in(text) do |match, login|
+        #     "<a href=...>#{login}</a>"
+        #   end
+        #
+        # text - String text to search.
+        #
+        # Yields the String match and the String login name; the Boolean
+        # "is_mentioned" flag yielded by 2.x is gone.  The yield's return
+        # replaces the match in the original text.
+        #
+        # Returns a String replaced with the return of the block.
+        def mentioned_logins_in(text, username_pattern = USERNAME_PATTERN)
+          text.gsub(MENTION_PATTERNS[username_pattern]) do |match|
+            login = Regexp.last_match(1)
+            yield match, login
+          end
         end
       end
       # Hash that contains all of the mention patterns used by the pipeline
-      MentionPatterns = Hash.new do |hash, key|
-        hash[key] = /
+      MENTION_PATTERNS = Hash.new do |hash, key|
+        hash[key] = %r{
           (?:^|\W)                    # beginning of string or non-word char
           @((?>#{key}))  # @username
-          (?!\/)                      # without a trailing slash
+          (?!/)                      # without a trailing slash
           (?=
             \.+[ \t\W]|               # dots followed by space or non-word character
             \.+$|                     # dots at end of line
             [^0-9a-zA-Z_.]|           # non-word character except dot
             $                         # end of line
           )
-        /ix
+        }ix
       end
       # Default pattern used to extract usernames from text. The value can be
       # overridden by providing the username_pattern variable in the context.
-      UsernamePattern = /[a-z0-9][a-z0-9-]*/
-      # List of username logins that, when mentioned, link to the blog post
-      # about @mentions instead of triggering a real mention.
-      MentionLogins = %w[
-        mention
-        mentions
-        mentioned
-        mentioning
-      ].freeze
+      USERNAME_PATTERN = /[a-z0-9][a-z0-9-]*/
       # Don't look for mentions in text nodes that are children of these elements
-      IGNORE_PARENTS = %w(pre code a style script).to_set
+      IGNORE_PARENTS = ["pre", "code", "a", "style", "script"]
-      def call
+      SELECTOR = Selma::Selector.new(match_text_within: "*", ignore_text_within: IGNORE_PARENTS)
+      def after_initialize
         result[:mentioned_usernames] ||= []
+      end
-        doc.search('.//text()').each do |node|
-          content = node.to_html
-          next unless content.include?('@')
-          next if has_ancestor?(node, IGNORE_PARENTS)
-          html = mention_link_filter(content, base_url, info_url, username_pattern)
-          next if html == content
-          node.replace(html)
-        end
-        doc
+      def selector
+        SELECTOR
+      end
+      def handle_text_chunk(text)
+        content = text.to_s
+        return unless content.include?("@")
+        html = mention_link_filter(content, base_url: base_url, username_pattern: username_pattern)
+        return if html == content
+        text.replace(html, as: :html)
       end
       # The URL to provide when someone @mentions a "mention" name, such
@@ -90,7 +88,7 @@ module HTML
       end
       def username_pattern
-        context[:username_pattern] || UsernamePattern
+        context[:username_pattern] || USERNAME_PATTERN
       end
       # Replace user @mentions in text with links to the mentioned user's
@@ -105,35 +103,23 @@ module HTML
       #
       # Returns a string with @mentions replaced with links. All links have a
       # 'user-mention' class name attached for styling.
-      def mention_link_filter(text, _base_url = '/', info_url = nil, username_pattern = UsernamePattern)
-        self.class.mentioned_logins_in(text, username_pattern) do |match, login, is_mentioned|
-          link =
-            if is_mentioned
-              link_to_mention_info(login, info_url)
-            else
-              link_to_mentioned_user(login)
-            end
+      def mention_link_filter(text, base_url: "/", username_pattern: USERNAME_PATTERN)
+        self.class.mentioned_logins_in(text, username_pattern) do |match, login|
+          link = link_to_mentioned_user(base_url, login)
           link ? match.sub("@#{login}", link) : match
         end
       end
-      def link_to_mention_info(text, info_url = nil)
-        return "@#{text}" if info_url.nil?
-        "<a href='#{info_url}' class='user-mention'>" \
-          "@#{text}" \
-          '</a>'
-      end
-      def link_to_mentioned_user(login)
+      def link_to_mentioned_user(base_url, login)
         result[:mentioned_usernames] |= [login]
         url = base_url.dup
-        url << '/' unless url =~ /[\/~]\z/
+        url << "/" unless %r{[/~]\z}.match?(url)
-        "<a href='#{url << login}' class='user-mention'>" \
+        "<a href=\"#{url << login}\" class=\"user-mention\">" \
           "@#{login}" \
-          '</a>'
+          "</a>"
       end
     end
   end

data/lib/html_pipeline/node_filter/syntax_highlight_filter.rb ADDED Viewed

@@ -0,0 +1,62 @@
+# frozen_string_literal: true
+HTMLPipeline.require_dependency("rouge", "SyntaxHighlightFilter")
+class HTMLPipeline
+  class NodeFilter
+    # HTML Filter that syntax highlights text inside code blocks.
+    #
+    # Context options:
+    #
+    #   :highlight => String naming the fallback language used to pick a lexer. Defaults to nil.
+    #   :scope => String used as the class prefix added to the pre element. Defaults to
+    #             "highlight", which yields "highlight highlight-css" for a css code block.
+    #
+    # This filter does not write any additional information to the context hash.
+    class SyntaxHighlightFilter < NodeFilter
+      def initialize(context: {}, result: {})
+        super(context: context, result: result)
+        # TODO: test the optionality of this
+        @formatter = context[:formatter] || Rouge::Formatters::HTML.new
+      end
+      SELECTOR = Selma::Selector.new(match_element: "pre", match_text_within: "pre")
+      def selector
+        SELECTOR
+      end
+      def handle_element(element)
+        default = context[:highlight]&.to_s
+        @lang = element["lang"] || default
+        scope = context.fetch(:scope, "highlight")
+        element["class"] = "#{scope} #{scope}-#{@lang}" if include_lang?
+      end
+      def handle_text_chunk(text)
+        return if @lang.nil?
+        return if (lexer = lexer_for(@lang)).nil?
+        content = text.to_s
+        text.replace(highlight_with_timeout_handling(content, lexer), as: :html)
+      end
+      def highlight_with_timeout_handling(text, lexer)
+        Rouge.highlight(text, lexer, @formatter)
+      rescue Timeout::Error => _e
+        text
+      end
+      def lexer_for(lang)
+        Rouge::Lexer.find(lang)
+      end
+      def include_lang?
+        !@lang.nil? && !@lang.empty?
+      end
+    end
+  end
+end

data/lib/html_pipeline/node_filter/table_of_contents_filter.rb ADDED Viewed

@@ -0,0 +1,70 @@
+# frozen_string_literal: true
+class HTMLPipeline
+  class NodeFilter
+    # Generates a Table of Contents: an array of hashes containing:
+    # * `href`: the relative link to the header
+    # * `text`: the text of the header
+    # Examples
+    #
+    #  TocPipeline =
+    #    HTMLPipeline.new [
+    #      HTMLPipeline::NodeFilter::TableOfContentsFilter
+    #    ]
+    #  # => #<HTMLPipeline:0x007fc13c4528d8...>
+    #  orig = %(<h1>Ice cube</h1><p>is not for the pop chart</p>)
+    #  # => "<h1>Ice cube</h1><p>is not for the pop chart</p>"
+    #  result = {}
+    #  # => {}
+    #  TocPipeline.call(orig, {}, result)
+    #  # => {:toc=> ...}
+    #  result[:toc]
+    #  # => [{:href=>"#ice-cube", :text=>"Ice cube"}]
+    #  result[:output].to_s
+    #  # => "<h1>\n<a id=\"ice-cube\" class=\"anchor\" href=\"#ice-cube\">..."
+    class TableOfContentsFilter < NodeFilter
+      SELECTOR = Selma::Selector.new(
+        match_element: "h1 a[href], h2 a[href], h3 a[href], h4 a[href], h5 a[href], h6 a[href]",
+        match_text_within: "h1, h2, h3, h4, h5, h6",
+      )
+      def selector
+        SELECTOR
+      end
+      # The icon that will be placed next to an anchored rendered markdown header
+      def anchor_html
+        @context[:anchor_html] || %(<span aria-hidden="true" class="anchor"></span>)
+      end
+      # The class that will be attached on the anchored rendered markdown header
+      def classes
+        context[:classes] || "anchor"
+      end
+      def after_initialize
+        result[:toc] = []
+      end
+      def handle_element(element)
+        header_href = element["href"]
+        return unless header_href.start_with?("#")
+        header_id = header_href[1..-1]
+        element["id"] = header_id
+        element["class"] = classes
+        element.set_inner_content(anchor_html, as: :html)
+        result[:toc] << { href: header_href }
+      end
+      def handle_text_chunk(text)
+        result[:toc].last[:text] = text.to_s
+      end
+    end
+  end
+end

data/lib/html_pipeline/node_filter/team_mention_filter.rb ADDED Viewed

@@ -0,0 +1,105 @@
+# frozen_string_literal: true
+require "set"
+class HTMLPipeline
+  class NodeFilter
+    # HTML filter that replaces @org/team mentions with links. Mentions within
+    # <pre>, <code>, <a>, <style>, and <script> elements are ignored.
+    #
+    # Context options:
+    #   :base_url - Used to construct links to team profile pages for each
+    #               mention.
+    #   :team_pattern - Used to provide a custom regular expression to
+    #                       identify team names
+    #
+    class TeamMentionFilter < NodeFilter
+      class << self
+        # Public: Find @org/team mentions in text.  See
+        # TeamMentionFilter#team_mention_link_filter.
+        #
+        #   TeamMentionFilter.mentioned_teams_in(text) do |match, org, team|
+        #     "<a href=...>#{team}</a>"
+        #   end
+        #
+        # text - String text to search.
+        #
+        # Yields the String match, org name, and team name.  The yield's
+        # return replaces the match in the original text.
+        #
+        # Returns a String replaced with the return of the block.
+        def mentioned_teams_in(text, team_pattern = TEAM_PATTERN)
+          text.gsub(team_pattern) do |match|
+            org = Regexp.last_match(1)
+            team = Regexp.last_match(2)
+            yield match, org, team
+          end
+        end
+      end
+      # Default pattern used to extract team names from text. The value can be
+      # overridden by providing the team_pattern variable in the context. To
+      # properly link the mention, should be in the format of /@(1)\/(2)/.
+      TEAM_PATTERN = %r{
+        (?<=^|\W)                  # beginning of string or non-word char
+        @([a-z0-9][a-z0-9-]*)      # @organization
+          (?:/|&\#47;?)             # dividing slash
+          ([a-z0-9][a-z0-9\-_]*)   # team
+          \b
+      }ix
+      # Don't look for mentions in text nodes that are children of these elements
+      IGNORE_PARENTS = ["pre", "code", "a", "style", "script"]
+      SELECTOR = Selma::Selector.new(match_text_within: "*", ignore_text_within: IGNORE_PARENTS)
+      def after_initialize
+        result[:mentioned_teams] = []
+      end
+      def selector
+        SELECTOR
+      end
+      def handle_text_chunk(text)
+        content = text.to_s
+        return unless content.include?("@")
+        text.replace(mention_link_filter(content, base_url: base_url, team_pattern: team_pattern), as: :html)
+      end
+      def team_pattern
+        context[:team_pattern] || TEAM_PATTERN
+      end
+      # Replace @org/team mentions in text with links to the mentioned team's
+      # page.
+      #
+      # text      - String text to replace @mention team names in.
+      # base_url  - The base URL used to construct team page URLs.
+      # team_pattern  - Regular expression used to identify teams in text
+      #
+      # Returns a string with @team mentions replaced with links. All links have a
+      # 'team-mention' class name attached for styling.
+      def mention_link_filter(text, base_url: "/", team_pattern: TEAM_PATTERN)
+        self.class.mentioned_teams_in(text, team_pattern) do |match, org, team|
+          link = link_to_mentioned_team(base_url, org, team)
+          seperator = %r{/|&\#47;?}
+          link ? match.sub(/@#{org}#{seperator}#{team}/, link) : match
+        end
+      end
+      def link_to_mentioned_team(base_url, org, team)
+        result[:mentioned_teams] |= [team]
+        url = base_url.dup
+        url << "/" unless %r{[/~]\z}.match?(url)
+        "<a href=\"#{url << org}/#{team}\" class=\"team-mention\">" \
+          "@#{org}/#{team}" \
+          "</a>"
+      end
+    end
+  end
+end

data/lib/html_pipeline/node_filter.rb ADDED Viewed

@@ -0,0 +1,31 @@
+# frozen_string_literal: true
+require "selma"
+class HTMLPipeline
+  class NodeFilter < Filter
+    def initialize(context: {}, result: {})
+      super(context: context, result: {})
+      send(:after_initialize) if respond_to?(:after_initialize)
+    end
+    # The String representation of the document.
+    def html
+      raise InvalidDocumentException if @html.nil? && @doc.nil?
+      @html || doc.to_html
+    end
+    def reset!
+      result = {} # rubocop:disable Lint/UselessAssignment
+      send(:after_initialize) if respond_to?(:after_initialize)
+    end
+    class << self
+      def call(html, context: {}, result: {})
+        node_filter = new(context: context, result: result)
+        Selma::Rewriter.new(sanitizer: nil, handlers: [node_filter]).rewrite(html)
+      end
+    end
+  end
+end

data/lib/html_pipeline/sanitization_filter.rb ADDED Viewed

@@ -0,0 +1,190 @@
+# frozen_string_literal: true
+require "selma"
+class HTMLPipeline
+  # A special filter with sanitization routines and allowlists. This class defines
+  # what HTML is allowed in user provided content and fixes up issues with
+  # unbalanced tags and whatnot.
+  #
+  # See the Selma docs for more information on the underlying library:
+  #
+  # https://github.com/gjtorikian/selma/#readme
+  #
+  # This filter does not write additional information to the context.
+  class SanitizationFilter
+    VALID_PROTOCOLS = Selma::Sanitizer::Config::VALID_PROTOCOLS.dup
+    # The main sanitization allowlist. Only these elements and attributes are
+    # allowed through by default.
+    DEFAULT_CONFIG = Selma::Sanitizer::Config.freeze_config({
+      elements: [
+        "h1",
+        "h2",
+        "h3",
+        "h4",
+        "h5",
+        "h6",
+        "br",
+        "b",
+        "i",
+        "strong",
+        "em",
+        "a",
+        "pre",
+        "code",
+        "img",
+        "tt",
+        "div",
+        "ins",
+        "del",
+        "sup",
+        "sub",
+        "p",
+        "picture",
+        "ol",
+        "ul",
+        "table",
+        "thead",
+        "tbody",
+        "tfoot",
+        "blockquote",
+        "dl",
+        "dt",
+        "dd",
+        "kbd",
+        "q",
+        "samp",
+        "var",
+        "hr",
+        "ruby",
+        "rt",
+        "rp",
+        "li",
+        "tr",
+        "td",
+        "th",
+        "s",
+        "strike",
+        "summary",
+        "details",
+        "caption",
+        "figure",
+        "figcaption",
+        "abbr",
+        "bdo",
+        "cite",
+        "dfn",
+        "mark",
+        "small",
+        "source",
+        "span",
+        "time",
+        "wbr",
+      ],
+      attributes: {
+        "a" => ["href"],
+        "img" => ["src", "longdesc", "loading", "alt"],
+        "div" => ["itemscope", "itemtype"],
+        "blockquote" => ["cite"],
+        "del" => ["cite"],
+        "ins" => ["cite"],
+        "q" => ["cite"],
+        "source" => ["srcset"],
+        all: [
+          "abbr",
+          "accept",
+          "accept-charset",
+          "accesskey",
+          "action",
+          "align",
+          "alt",
+          "aria-describedby",
+          "aria-hidden",
+          "aria-label",
+          "aria-labelledby",
+          "axis",
+          "border",
+          "char",
+          "charoff",
+          "charset",
+          "checked",
+          "clear",
+          "cols",
+          "colspan",
+          "compact",
+          "coords",
+          "datetime",
+          "dir",
+          "disabled",
+          "enctype",
+          "for",
+          "frame",
+          "headers",
+          "height",
+          "hreflang",
+          "hspace",
+          "id",
+          "ismap",
+          "label",
+          "lang",
+          "maxlength",
+          "media",
+          "method",
+          "multiple",
+          "name",
+          "nohref",
+          "noshade",
+          "nowrap",
+          "open",
+          "progress",
+          "prompt",
+          "readonly",
+          "rel",
+          "rev",
+          "role",
+          "rows",
+          "rowspan",
+          "rules",
+          "scope",
+          "selected",
+          "shape",
+          "size",
+          "span",
+          "start",
+          "summary",
+          "tabindex",
+          "title",
+          "type",
+          "usemap",
+          "valign",
+          "value",
+          "width",
+          "itemprop",
+        ],
+      },
+      protocols: {
+        "a" => { "href" => Selma::Sanitizer::Config::VALID_PROTOCOLS }.freeze,
+        "blockquote" => { "cite" => ["http", "https", :relative].freeze },
+        "del" => { "cite" => ["http", "https", :relative].freeze },
+        "ins" => { "cite" => ["http", "https", :relative].freeze },
+        "q" => { "cite" => ["http", "https", :relative].freeze },
+        "img" => {
+          "src" => ["http", "https", :relative].freeze,
+          "longdesc" => ["http", "https", :relative].freeze,
+        },
+      },
+    })
+    class << self
+      def call(html, config)
+        raise ArgumentError, "html must be a String, not #{html.class}" unless html.is_a?(String)
+        raise ArgumentError, "config must be a Hash, not #{config.class}" unless config.is_a?(Hash)
+        sanitization_config = Selma::Sanitizer.new(config)
+        Selma::Rewriter.new(sanitizer: sanitization_config).rewrite(html)
+      end
+    end
+  end
+end

data/lib/{html/pipeline → html_pipeline/text_filter}/image_filter.rb RENAMED Viewed

@@ -1,7 +1,7 @@
 # frozen_string_literal: true
-module HTML
-  class Pipeline
+class HTMLPipeline
+  class TextFilter
     # HTML filter that converts an image's URL into an <img> tag.
     # For example, it will convert
     #   http://example.com/test.jpg
@@ -10,7 +10,7 @@ module HTML
     class ImageFilter < TextFilter
       def call
-        @text.gsub(/(https|http)?:\/\/.+\.(jpg|jpeg|bmp|gif|png)(\?\S+)?/i) do |match|
+        @text.gsub(%r{(https|http)?://.+\.(jpg|jpeg|bmp|gif|png)(\?\S+)?}i) do |match|
           %(<img src="#{match}" alt=""/>)
         end
       end

data/lib/{html/pipeline → html_pipeline/text_filter}/plain_text_input_filter.rb RENAMED Viewed

@@ -1,14 +1,12 @@
 # frozen_string_literal: true
-HTML::Pipeline.require_dependency('escape_utils', 'PlainTextInputFilter')
-module HTML
-  class Pipeline
+class HTMLPipeline
+  class TextFilter
     # Simple filter for plain text input. HTML escapes the text input and wraps it
     # in a div.
     class PlainTextInputFilter < TextFilter
       def call
-        "<div>#{CGI.escape_html(@text)}</div>"
+        "<div>#{CGI.escapeHTML(@text)}</div>"
       end
     end
   end