RubyGems - autolinker - Versions diffs - 0.1 - Mend

autolinker 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

checksums.yaml +7 -0
data/README.md +31 -0
data/Rakefile +45 -0
data/autolinker.gemspec +30 -0
data/lib/autolinker.rb +9 -0
data/lib/autolinker/html/node.rb +536 -0
data/lib/autolinker/html/sanitizer.rb +112 -0
data/lib/autolinker/html/tokenizer.rb +104 -0
data/lib/autolinker/text_helper.rb +292 -0
data/test/helper.rb +17 -0
data/test/test_autolinker.rb +270 -0
metadata +55 -0

data/lib/autolinker/html/sanitizer.rb ADDED

@@ -0,0 +1,112 @@
+require 'set'
+require 'cgi'
+module Autolinker
+  module HTML
+    class Sanitizer
+      attr_accessor :protocol_separator, :uri_attributes, :allowed_attributes, :allowed_tags, :allowed_protocols, :bad_tags,
+                    :allowed_css_properties, :allowed_css_keywords, :shorthand_css_properties
+      # Populates every configurable collection with its default value. Each one is
+      # exposed through the attr_accessor above and can be replaced per instance.
+      def initialize
+        # A regular expression of the valid characters used to separate protocols like
+        # the ':' in 'http://foo.com'
+        # NOTE(review): `&#x70` is the character reference for 'p', not ':' (which is
+        # already covered by `&#x0*3a`) -- looks unintended; confirm before relying on it.
+        @protocol_separator = /:|(&#0*58)|(&#x70)|(&#x0*3a)|(%|&#37;)3A/i
+        # Specifies a Set of HTML attributes that can have URIs.
+        @uri_attributes = Set.new(%w(href src cite action longdesc xlink:href lowsrc))
+        # Specifies a Set of 'bad' tags that the #sanitize helper will remove completely, as opposed
+        # to just escaping harmless tags like &lt;font&gt;
+        @bad_tags = Set.new(%w(script))
+        # Specifies the default Set of tags that the #sanitize helper will allow unscathed.
+        @allowed_tags = Set.new(%w(strong em b i p code pre tt samp kbd var sub
+      sup dfn cite big small address hr br div span h1 h2 h3 h4 h5 h6 ul ol li dl dt dd abbr
+      acronym a img blockquote del ins))
+        # Specifies the default Set of html attributes that the #sanitize helper will leave
+        # in the allowed tag.
+        @allowed_attributes = Set.new(%w(href src width height alt cite datetime title class name xml:lang abbr))
+        # Specifies the default Set of protocols that a URI attribute may use; any other
+        # protocol makes #contains_bad_protocols? reject the attribute.
+        @allowed_protocols = Set.new(%w(ed2k ftp http https irc mailto news gopher nntp telnet webcal xmpp callto
+      feed svn urn aim rsync tag ssh sftp rtsp afs))
+        # Specifies the default Set of acceptable css properties.
+        @allowed_css_properties = Set.new(%w(azimuth background-color border-bottom-color border-collapse
+      border-color border-left-color border-right-color border-top-color clear color cursor direction display
+      elevation float font font-family font-size font-style font-variant font-weight height letter-spacing line-height
+      overflow pause pause-after pause-before pitch pitch-range richness speak speak-header speak-numeral speak-punctuation
+      speech-rate stress text-align text-decoration text-indent unicode-bidi vertical-align voice-family volume white-space
+      width))
+        # Specifies the default Set of acceptable css keywords that #sanitize and #sanitize_css will accept.
+        @allowed_css_keywords = Set.new(%w(auto aqua black block blue bold both bottom brown center
+      collapse dashed dotted fuchsia gray green !important italic left lime maroon medium none navy normal
+      nowrap olive pointer purple red right solid silver teal top transparent underline white yellow))
+        # Specifies the default Set of allowed shorthand css properties for the #sanitize and #sanitize_css helpers.
+        @shorthand_css_properties = Set.new(%w(background border margin padding))
+      end
+      def sanitize(text, options = {})
+        return text unless sanitizeable?(text)
+        tokenize(text, options).join
+      end
+      def sanitizeable?(text)
+        !(text.nil? || text.empty? || !text.index("<"))
+      end
+      protected
+      def tokenize(text, options)
+        options[:parent] = []
+        options[:attributes] ||= allowed_attributes
+        options[:tags] ||= allowed_tags
+        tokenizer = HTML::Tokenizer.new(text)
+        result = []
+        while token = tokenizer.next
+          node = Node.parse(nil, 0, 0, token, false)
+          process_node node, result, options
+        end
+        result
+      end
+      def process_node(node, result, options)
+        result << case node
+                  when HTML::Tag
+                    if node.closing == :close
+                      options[:parent].shift
+                    else
+                      options[:parent].unshift node.name
+                    end
+                    process_attributes_for node, options
+                    options[:tags].include?(node.name) ? node : nil
+                  else
+                    bad_tags.include?(options[:parent].first) ? nil : node.to_s.gsub(/</, "&lt;")
+                  end
+      end
+      def process_attributes_for(node, options)
+        return unless node.attributes
+        node.attributes.keys.each do |attr_name|
+          value = node.attributes[attr_name].to_s
+          if !options[:attributes].include?(attr_name) || contains_bad_protocols?(attr_name, value)
+            node.attributes.delete(attr_name)
+          else
+            node.attributes[attr_name] = attr_name == 'style' ? sanitize_css(value) : CGI::escapeHTML(CGI::unescapeHTML(value))
+          end
+        end
+      end
+      def contains_bad_protocols?(attr_name, value)
+        uri_attributes.include?(attr_name) &&
+          (value =~ /(^[^\/:]*):|(&#0*58)|(&#x70)|(&#x0*3a)|(%|&#37;)3A/i && !allowed_protocols.include?(value.split(protocol_separator).first.downcase.strip))
+      end
+    end
+  end
+end

data/lib/autolinker/html/tokenizer.rb ADDED

@@ -0,0 +1,104 @@
+module Autolinker
+  module HTML
+    # A simple HTML tokenizer. It simply breaks a stream of text into tokens, where each
+    # token is a string. Each string represents either "text", or an HTML element.
+    #
+    # This currently assumes valid XHTML, which means no free < or > characters.
+    #
+    # Usage:
+    #
+    #   tokenizer = HTML::Tokenizer.new(text)
+    #   while token = tokenizer.next
+    #     p token
+    #   end
+    class Tokenizer #:nodoc:
+      # The current (byte) position in the text
+      attr_reader :position
+      # The current line number
+      attr_reader :line
+      # Create a new Tokenizer for the given text.
+      def initialize(text)
+        # NOTE(review): StringScanner comes from the 'strscan' library, which this file
+        # does not require itself -- confirm it is loaded elsewhere.
+        @scanner = StringScanner.new(text)
+        @position = 0
+        @line = 0 # overwritten with @current_line by every call to #next
+        @current_line = 1 # advanced by #update_current_line for each newline in a returned token
+      end
+      # Return the next token in the sequence, or +nil+ if there are no more tokens in
+      # the stream.
+      def next
+        return nil if @scanner.eos?
+        @position = @scanner.pos
+        @line = @current_line
+        if @scanner.check(/<\S/)
+          update_current_line(scan_tag)
+        else
+          update_current_line(scan_text)
+        end
+      end
+      private
+      # Treat the text at the current position as a tag, and scan it. Supports
+      # comments, doctype tags, and regular tags, and ignores less-than and
+      # greater-than characters within quoted strings.
+      def scan_tag
+        tag = @scanner.getch
+        if @scanner.scan(/!--/) # comment
+          tag << @scanner.matched
+          tag << (@scanner.scan_until(/--\s*>/) || @scanner.scan_until(/\Z/))
+        elsif @scanner.scan(/!\[CDATA\[/)
+          tag << @scanner.matched
+          tag << (@scanner.scan_until(/\]\]>/) || @scanner.scan_until(/\Z/))
+        elsif @scanner.scan(/!/) # doctype
+          tag << @scanner.matched
+          tag << consume_quoted_regions
+        else
+          tag << consume_quoted_regions
+        end
+        tag
+      end
+      # Scan all text up to the next < character and return it.
+      def scan_text
+        "#{@scanner.getch}#{@scanner.scan(/[^<]*/)}"
+      end
+      # Counts the number of newlines in the text and updates the current line
+      # accordingly.
+      def update_current_line(text)
+        text.scan(/\r?\n/) { @current_line += 1 }
+      end
+      # Skips over quoted strings, so that less-than and greater-than characters
+      # within the strings are ignored.
+      def consume_quoted_regions
+        text = ""
+        loop do
+          # Advance to the next quote or angle bracket; stop when none is left.
+          match = @scanner.scan_until(/['"<>]/) or break
+          delim = @scanner.matched
+          if delim == "<"
+            # A new "<" starts the next token: drop it from this one and push it back.
+            match = match.chop
+            @scanner.pos -= 1
+          end
+          text << match
+          # Either bracket ends the tag (">" was kept in the text, "<" was pushed back).
+          break if delim == "<" || delim == ">"
+          # consume the quoted region
+          # (runs up to the matching quote, stepping over backslash-escaped characters)
+          while match = @scanner.scan_until(/[\\#{delim}]/)
+            text << match
+            break if @scanner.matched == delim
+            break if @scanner.eos?
+            text << @scanner.getch # skip the escaped character
+          end
+        end
+        text
+      end
+    end
+  end
+end

data/lib/autolinker/text_helper.rb ADDED

@@ -0,0 +1,292 @@
+module Autolinker
+  class TextHelper
+    # Turns all URLs and e-mail addresses into clickable links. The <tt>:link</tt> option
+    # will limit what should be linked. You can add HTML attributes to the links using
+    # <tt>:html</tt>. Possible values for <tt>:link</tt> are <tt>:all</tt> (default),
+    # <tt>:email_addresses</tt>, and <tt>:urls</tt>. If a block is given, each URL and
+    # e-mail address is yielded and the result is used as the link text. By default the
+    # text given is sanitized; you can override this behaviour by setting the
+    # <tt>:sanitize</tt> option to false, or pass extra options to the sanitizer
+    # via the <tt>:sanitize_options</tt> option hash.
+    #
+    # ==== Examples
+    #   auto_link("Go to http://www.rubyonrails.org and say hello to david@loudthinking.com")
+    #   # => "Go to <a href=\"http://www.rubyonrails.org\">http://www.rubyonrails.org</a> and
+    #   #     say hello to <a href=\"mailto:david@loudthinking.com\">david@loudthinking.com</a>"
+    #
+    #   auto_link("Visit http://www.loudthinking.com/ or e-mail david@loudthinking.com", :link => :urls)
+    #   # => "Visit <a href=\"http://www.loudthinking.com/\">http://www.loudthinking.com/</a>
+    #   #     or e-mail david@loudthinking.com"
+    #
+    #   auto_link("Visit http://www.loudthinking.com/ or e-mail david@loudthinking.com", :link => :email_addresses)
+    #   # => "Visit http://www.loudthinking.com/ or e-mail <a href=\"mailto:david@loudthinking.com\">david@loudthinking.com</a>"
+    #
+    #   post_body = "Welcome to my new blog at http://www.myblog.com/.  Please e-mail me at me@email.com."
+    #   auto_link(post_body, :html => { :target => '_blank' }) do |text|
+    #     truncate(text, :length => 15)
+    #   end
+    #   # => "Welcome to my new blog at <a href=\"http://www.myblog.com/\" target=\"_blank\">http://www.m...</a>.
+    #         Please e-mail me at <a href=\"mailto:me@email.com\">me@email.com</a>."
+    #
+    #
+    # You can still use <tt>auto_link</tt> with the old API that accepts the
+    # +link+ as its optional second parameter and the +html_options+ hash
+    # as its optional third parameter:
+    #   post_body = "Welcome to my new blog at http://www.myblog.com/. Please e-mail me at me@email.com."
+    #   auto_link(post_body, :urls)
+    #   # => "Welcome to my new blog at <a href=\"http://www.myblog.com/\">http://www.myblog.com</a>.
+    #         Please e-mail me at me@email.com."
+    #
+    #   auto_link(post_body, :all, :target => "_blank")
+    #   # => "Welcome to my new blog at <a href=\"http://www.myblog.com/\" target=\"_blank\">http://www.myblog.com</a>.
+    #         Please e-mail me at <a href=\"mailto:me@email.com\">me@email.com</a>."
+    def auto_link(text, *args, &block) #link = :all, html = {}, &block)
+      return '' if text.nil? || text.empty?
+      options = args.size == 2 ? {} : extract_options!(args) # this is necessary because the old auto_link API has a Hash as its last parameter
+      unless args.empty?
+        options[:link] = args[0] || :all
+        options[:html] = args[1] || {}
+      end
+      options = { :link => :all, :html => {} }.merge(options)
+      sanitize_options = options[:sanitize_options] || {}
+      sanitize = (options[:sanitize] != false)
+      text = conditional_sanitize(text, sanitize, sanitize_options).to_str
+      case options[:link].to_sym
+      when :all then
+        auto_link_email_addresses(auto_link_urls(text, options[:html], options, &block), options[:html], &block)
+      when :email_addresses then
+        auto_link_email_addresses(text, options[:html], &block)
+      when :urls then
+        auto_link_urls(text, options[:html], options, &block)
+      end
+    end
+    private
+    AUTO_LINK_RE = %r{
+        (?: ((?:ed2k|ftp|http|https|irc|mailto|news|gopher|nntp|telnet|webcal|xmpp|callto|feed|svn|urn|aim|rsync|tag|ssh|sftp|rtsp|afs|file):)// | www\. )
+        [^\s<\u00A0]+
+      }x
+    # regexps for determining context, used high-volume
+    AUTO_LINK_CRE = [/<[^>]+$/, /^[^>]*>/, /<a\b.*?>/i, /<\/a>/i]
+    AUTO_EMAIL_LOCAL_RE = /[\w.!#\$%&'*\/=?^`{|}~+-]/
+    AUTO_EMAIL_RE = /[\w.!#\$%+-]\.?(?:#{AUTO_EMAIL_LOCAL_RE}+\.)*#{AUTO_EMAIL_LOCAL_RE}*@[\w-]+(?:\.[\w-]+)+/
+    BRACKETS = { ']' => '[', ')' => '(', '}' => '{' }
+    WORD_PATTERN = RUBY_VERSION < '1.9' ? '\w' : '\p{Word}'
+    # Turns all urls into clickable links.  If a block is given, each url
+    # is yielded and the result is used as the link text.
+    def auto_link_urls(text, link_attributes = {}, options = {})
+      text.gsub(AUTO_LINK_RE) do
+        # scheme is nil when the match came from the "www." alternative. The match
+        # globals must be read here, before any later regexp call overwrites them.
+        scheme, href = $1, $&
+        punctuation = []
+        if auto_linked?($`, $')
+          # do not change string; URL is already linked
+          href
+        else
+          # don't include trailing punctuation character as part of the URL
+          while href.sub!(/[^#{WORD_PATTERN}\/-]$/, '')
+            # $& is now the character that sub! just stripped from the end of href
+            punctuation.push $&
+            # Put a closing bracket back (and stop stripping) when the URL still holds
+            # more of the matching opening bracket than of the closing one.
+            if opening = BRACKETS[punctuation.last] and href.scan(opening).size > href.scan(punctuation.last).size
+              href << punctuation.pop
+              break
+            end
+          end
+          # The link text is taken before the default scheme is prepended.
+          link_text = block_given? ? yield(href) : href
+          href = 'http://' + href unless scheme
+          unless options[:sanitize] == false
+            link_text = sanitize(link_text)
+            href = sanitize(href)
+          end
+          #"<a href='#{link_attributes.merge('href' => href)}'>#{link_text}</a>"
+          # The escape flag is true only when :sanitize was passed as a truthy value;
+          # the stripped punctuation is re-appended after the link in its original order.
+          content_tag(:a, link_text, link_attributes.merge('href' => href), !!options[:sanitize]) + punctuation.reverse.join('')
+        end
+      end
+    end
+    # Turns all email addresses into clickable links.  If a block is given,
+    # each email is yielded and the result is used as the link text.
+    def auto_link_email_addresses(text, html_options = {}, options = {})
+      text.gsub(AUTO_EMAIL_RE) do
+        text = $&
+        if auto_linked?($`, $')
+          text
+        else
+          display_text = (block_given?) ? yield(text) : text
+          unless options[:sanitize] == false
+            text = sanitize(text)
+            display_text = sanitize(display_text) unless text == display_text
+          end
+          mail_to text, display_text, html_options
+        end
+      end
+    end
+    def extract_options!(args)
+      if args.last.is_a?(Hash)
+        args.pop
+      else
+        {}
+      end
+    end
+    # Detects already linked context or position in the middle of a tag
+    def auto_linked?(left, right)
+      (left =~ AUTO_LINK_CRE[0] and right =~ AUTO_LINK_CRE[1]) or
+        (left.rindex(AUTO_LINK_CRE[2]) and $' !~ AUTO_LINK_CRE[3])
+    end
+    def conditional_sanitize(target, condition, sanitize_options = {})
+      condition ? sanitize(target, sanitize_options) : target
+    end
+    def sanitize(html, options = {})
+      Autolinker::HTML::Sanitizer.new.sanitize(html, options)
+    end
+    def content_tag(name, content_or_options_with_block = nil, options = nil, escape = true, &block)
+      if block_given?
+        options = content_or_options_with_block if content_or_options_with_block.is_a?(Hash)
+        content_tag_string(name, capture(&block), options, escape)
+      else
+        content_tag_string(name, content_or_options_with_block, options, escape)
+      end
+    end
+    def content_tag_string(name, content, options, escape = true)
+      tag_options = tag_options(options, escape) if options
+      "<#{name}#{tag_options}>#{PRE_CONTENT_STRINGS[name.to_sym]}#{escape ? ERB::Util.h(content) : content}</#{name}>"
+    end
+    BOOLEAN_ATTRIBUTES = %w(disabled readonly multiple checked autobuffer
+                           autoplay controls loop selected hidden scoped async
+                           defer reversed ismap seemless muted required
+                           autofocus novalidate formnovalidate open pubdate).to_set
+    BOOLEAN_ATTRIBUTES.merge(BOOLEAN_ATTRIBUTES.map { |attribute| attribute.to_sym })
+    PRE_CONTENT_STRINGS = {
+      :textarea => "\n"
+    }
+    def tag_options(options, escape = true)
+      unless options.nil? || options.empty?
+        attrs = []
+        options.each_pair do |key, value|
+          if key.to_s == 'data' && value.is_a?(Hash)
+            value.each do |k, v|
+              unless v.is_a?(String) || v.is_a?(Symbol) || v.is_a?(BigDecimal)
+                v = v.to_json
+              end
+              v = ERB::Util.html_escape(v) if escape
+              attrs << %(data-#{k.to_s.dasherize}="#{v}")
+            end
+          elsif BOOLEAN_ATTRIBUTES.include?(key)
+            attrs << %(#{key}="#{key}") if value
+          elsif !value.nil?
+            final_value = value.is_a?(Array) ? value.join(" ") : value
+            final_value = ERB::Util.html_escape(final_value) if escape
+            attrs << %(#{key}="#{final_value}")
+          end
+        end
+        " #{attrs.sort * ' '}" unless attrs.empty?
+      end
+    end
+    # Creates a mailto link tag to the specified +email_address+, which is
+    # also used as the name of the link unless +name+ is specified. Additional
+    # HTML attributes for the link can be passed in +html_options+.
+    #
+    # +mail_to+ has several methods for hindering email harvesters and customizing
+    # the email itself by passing special keys to +html_options+.
+    #
+    # ==== Options
+    # * <tt>:encode</tt> - This key will accept the strings "javascript" or "hex".
+    #   Passing "javascript" will dynamically create and encode the mailto link then
+    #   eval it into the DOM of the page. This method will not show the link on
+    #   the page if the user has JavaScript disabled. Passing "hex" will hex
+    #   encode the +email_address+ before outputting the mailto link.
+    # * <tt>:replace_at</tt> - When the link +name+ isn't provided, the
+    #   +email_address+ is used for the link label. You can use this option to
+    #   obfuscate the +email_address+ by substituting the @ sign with the string
+    #   given as the value.
+    # * <tt>:replace_dot</tt> - When the link +name+ isn't provided, the
+    #   +email_address+ is used for the link label. You can use this option to
+    #   obfuscate the +email_address+ by substituting the . in the email with the
+    #   string given as the value.
+    # * <tt>:subject</tt> - Preset the subject line of the email.
+    # * <tt>:body</tt> - Preset the body of the email.
+    # * <tt>:cc</tt> - Carbon Copy additional recipients on the email.
+    # * <tt>:bcc</tt> - Blind Carbon Copy additional recipients on the email.
+    #
+    # ==== Examples
+    #   mail_to "me@domain.com"
+    #   # => <a href="mailto:me@domain.com">me@domain.com</a>
+    #
+    #   mail_to "me@domain.com", "My email", :encode => "javascript"
+    #   # => <script type="text/javascript">eval(decodeURIComponent('%64%6f%63...%27%29%3b'))</script>
+    #
+    #   mail_to "me@domain.com", "My email", :encode => "hex"
+    #   # => <a href="mailto:%6d%65@%64%6f%6d%61%69%6e.%63%6f%6d">My email</a>
+    #
+    #   mail_to "me@domain.com", nil, :replace_at => "_at_", :replace_dot => "_dot_", :class => "email"
+    #   # => <a href="mailto:me@domain.com" class="email">me_at_domain_dot_com</a>
+    #
+    #   mail_to "me@domain.com", "My email", :cc => "ccaddress@domain.com",
+    #            :subject => "This is an example email"
+    #   # => <a href="mailto:me@domain.com?cc=ccaddress@domain.com&subject=This%20is%20an%20example%20email">My email</a>
+    def mail_to(email_address, name = nil, html_options = {})
+      email_address = ERB::Util.html_escape(email_address)
+      encode = html_options.delete("encode").to_s
+      extras = %w{ cc bcc body subject }.map { |item|
+        option = html_options.delete(item) || next
+        "#{item}=#{Rack::Utils.escape(option).gsub("+", "%20")}"
+      }.compact
+      extras = extras.empty? ? '' : '?' + ERB::Util.html_escape(extras.join('&'))
+      email_address_obfuscated = email_address.to_str
+      email_address_obfuscated.gsub!(/@/, html_options.delete("replace_at")) if html_options.key?("replace_at")
+      email_address_obfuscated.gsub!(/\./, html_options.delete("replace_dot")) if html_options.key?("replace_dot")
+      case encode
+      when "javascript"
+        string = ''
+        html = content_tag("a", name || email_address_obfuscated, html_options.merge("href" => "mailto:#{email_address}#{extras}"))
+        html = escape_javascript(html.to_str)
+        "document.write('#{html}');".each_byte do |c|
+          string << sprintf("%%%x", c)
+        end
+        "<script type=\"#{Mime::JS}\">eval(decodeURIComponent('#{string}'))</script>"
+      when "hex"
+        email_address_encoded = email_address_obfuscated.unpack('C*').map { |c|
+          sprintf("&#%d;", c)
+        }.join
+        string = 'mailto:'.unpack('C*').map { |c|
+          sprintf("&#%d;", c)
+        }.join + email_address.unpack('C*').map { |c|
+          char = c.chr
+          char =~ /\w/ ? sprintf("%%%x", c) : char
+        }.join
+        content_tag "a", name || email_address_encoded, html_options.merge("href" => "#{string}#{extras}")
+      else
+        content_tag "a", name || email_address_obfuscated, html_options.merge("href" => "mailto:#{email_address}#{extras}")
+      end
+    end
+  end
+end