RubyGems - autolinker - Versions diffs - 0.1 - Mend

autolinker 0.1

Files changed (12) hide show

checksums.yaml +7 -0
data/README.md +31 -0
data/Rakefile +45 -0
data/autolinker.gemspec +30 -0
data/lib/autolinker.rb +9 -0
data/lib/autolinker/html/node.rb +536 -0
data/lib/autolinker/html/sanitizer.rb +112 -0
data/lib/autolinker/html/tokenizer.rb +104 -0
data/lib/autolinker/text_helper.rb +292 -0
data/test/helper.rb +17 -0
data/test/test_autolinker.rb +270 -0
metadata +55 -0

data/lib/autolinker/html/sanitizer.rb ADDED

@@ -0,0 +1,112 @@
+require 'set'
+require 'cgi'
+module Autolinker
+  module HTML
+    class Sanitizer
+      # Configuration consulted while sanitizing. Every accessor listed here is given its
+      # default value in #initialize and can be replaced on an individual instance.
+      attr_accessor :protocol_separator, :uri_attributes, :allowed_attributes, :allowed_tags, :allowed_protocols, :bad_tags,
+                    :allowed_css_properties, :allowed_css_keywords, :shorthand_css_properties
+      def initialize
+        # A regular expression of the valid characters used to separate protocols like
+        # the ':' in 'http://foo.com'
+        # NOTE(review): '&#x70' is the character reference for 'p', not ':' (':' is
+        # '&#58' / '&#x3a', both already listed) -- confirm whether it is intentional.
+        @protocol_separator = /:|(&#0*58)|(&#x70)|(&#x0*3a)|(%|&#37;)3A/i
+        # Specifies a Set of HTML attributes that can have URIs.
+        @uri_attributes = Set.new(%w(href src cite action longdesc xlink:href lowsrc))
+        # Specifies a Set of 'bad' tags that the #sanitize helper will remove completely, as opposed
+        # to just escaping harmless tags like &lt;font&gt;
+        @bad_tags = Set.new(%w(script))
+        # Specifies the default Set of tags that the #sanitize helper will allow unscathed.
+        @allowed_tags = Set.new(%w(strong em b i p code pre tt samp kbd var sub
+      sup dfn cite big small address hr br div span h1 h2 h3 h4 h5 h6 ul ol li dl dt dd abbr
+      acronym a img blockquote del ins))
+        # Specifies the default Set of html attributes that the #sanitize helper will leave
+        # in the allowed tag.
+        @allowed_attributes = Set.new(%w(href src width height alt cite datetime title class name xml:lang abbr))
+        # Specifies the default Set of protocols accepted in URI attribute values (see #contains_bad_protocols?).
+        @allowed_protocols = Set.new(%w(ed2k ftp http https irc mailto news gopher nntp telnet webcal xmpp callto
+      feed svn urn aim rsync tag ssh sftp rtsp afs))
+        # Specifies the default Set of acceptable css properties that #sanitize and #sanitize_css will accept.
+        @allowed_css_properties = Set.new(%w(azimuth background-color border-bottom-color border-collapse
+      border-color border-left-color border-right-color border-top-color clear color cursor direction display
+      elevation float font font-family font-size font-style font-variant font-weight height letter-spacing line-height
+      overflow pause pause-after pause-before pitch pitch-range richness speak speak-header speak-numeral speak-punctuation
+      speech-rate stress text-align text-decoration text-indent unicode-bidi vertical-align voice-family volume white-space
+      width))
+        # Specifies the default Set of acceptable css keywords that #sanitize and #sanitize_css will accept.
+        @allowed_css_keywords = Set.new(%w(auto aqua black block blue bold both bottom brown center
+      collapse dashed dotted fuchsia gray green !important italic left lime maroon medium none navy normal
+      nowrap olive pointer purple red right solid silver teal top transparent underline white yellow))
+        # Specifies the default Set of allowed shorthand css properties for the #sanitize and #sanitize_css helpers.
+        @shorthand_css_properties = Set.new(%w(background border margin padding))
+      end
+      def sanitize(text, options = {})
+        return text unless sanitizeable?(text)
+        tokenize(text, options).join
+      end
+      def sanitizeable?(text)
+        !(text.nil? || text.empty? || !text.index("<"))
+      end
+      protected
+      def tokenize(text, options)
+        # Splits +text+ into tokens, runs every token through #process_node and returns
+        # the Array that #process_node appended to (it may contain nil entries).
+        # NOTE: +options+ is modified in place: :parent is reset to an empty Array, and
+        # :attributes / :tags fall back to this instance's allowed sets when not supplied.
+        options[:parent] = []
+        options[:attributes] ||= allowed_attributes
+        options[:tags] ||= allowed_tags
+        tokenizer = HTML::Tokenizer.new(text)
+        result = []
+        # tokenizer.next returns nil once the input is exhausted, which ends the loop.
+        while token = tokenizer.next
+          node = Node.parse(nil, 0, 0, token, false)
+          process_node node, result, options
+        end
+        result
+      end
+      def process_node(node, result, options)
+        # Appends exactly one entry to +result+ for +node+: the node itself, an escaped
+        # String, or nil when the node is to be dropped.
+        result << case node
+                  when HTML::Tag
+                    # options[:parent] is a stack of open tag names with the innermost tag first:
+                    # a closing tag pops it, any other tag pushes its own name.
+                    if node.closing == :close
+                      options[:parent].shift
+                    else
+                      options[:parent].unshift node.name
+                    end
+                    process_attributes_for node, options
+                    # Tags outside the allowed set are replaced by nil.
+                    options[:tags].include?(node.name) ? node : nil
+                  else
+                    # Non-tag content directly inside a 'bad' tag is dropped; all other content
+                    # is kept with every "<" escaped.
+                    bad_tags.include?(options[:parent].first) ? nil : node.to_s.gsub(/</, "&lt;")
+                  end
+      end
+      def process_attributes_for(node, options)
+        # Filters node.attributes in place: attributes that are not in options[:attributes],
+        # or whose value fails #contains_bad_protocols?, are deleted; the remaining values
+        # are unescaped and then HTML-escaped again.
+        return unless node.attributes
+        # Iterates over a snapshot of the keys because entries are deleted inside the loop.
+        node.attributes.keys.each do |attr_name|
+          value = node.attributes[attr_name].to_s
+          if !options[:attributes].include?(attr_name) || contains_bad_protocols?(attr_name, value)
+            node.attributes.delete(attr_name)
+          else
+            # NOTE(review): #sanitize_css is not defined in the visible part of this class, and
+            # 'style' is not in the default allowed attributes -- confirm where it comes from.
+            node.attributes[attr_name] = attr_name == 'style' ? sanitize_css(value) : CGI::escapeHTML(CGI::unescapeHTML(value))
+          end
+        end
+      end
+      def contains_bad_protocols?(attr_name, value)
+        # Truthy when +attr_name+ is one of uri_attributes, +value+ appears to contain a protocol
+        # separator, and the text before the first separator (downcased and stripped) is not one
+        # of allowed_protocols. Attributes that cannot hold a URI are never flagged.
+        # The inline pattern repeats the alternatives of protocol_separator, except that a plain
+        # ':' only counts when no '/' or ':' occurs before it on that line.
+        uri_attributes.include?(attr_name) &&
+          (value =~ /(^[^\/:]*):|(&#0*58)|(&#x70)|(&#x0*3a)|(%|&#37;)3A/i && !allowed_protocols.include?(value.split(protocol_separator).first.downcase.strip))
+      end
+    end
+  end
+end

data/lib/autolinker/html/tokenizer.rb ADDED

@@ -0,0 +1,104 @@
+module Autolinker
+  module HTML
+    # A simple HTML tokenizer. It simply breaks a stream of text into tokens, where each
+    # token is a string. Each string represents either "text", or an HTML element.
+    #
+    # This currently assumes valid XHTML, which means no free < or > characters.
+    #
+    # Usage:
+    #
+    #   tokenizer = HTML::Tokenizer.new(text)
+    #   while token = tokenizer.next
+    #     p token
+    #   end
+    class Tokenizer #:nodoc:
+      # The current (byte) position in the text
+      attr_reader :position
+      # The current line number
+      attr_reader :line
+      # Create a new Tokenizer for the given text.
+      def initialize(text)
+        @scanner = StringScanner.new(text)
+        @position = 0
+        @line = 0
+        @current_line = 1
+      end
+      # Return the next token in the sequence, or +nil+ if there are no more tokens in
+      # the stream.
+      def next
+        return nil if @scanner.eos?
+        @position = @scanner.pos
+        @line = @current_line
+        if @scanner.check(/<\S/)
+          update_current_line(scan_tag)
+        else
+          update_current_line(scan_text)
+        end
+      end
+      private
+      # Treat the text at the current position as a tag, and scan it. Supports
+      # comments, doctype tags, and regular tags, and ignores less-than and
+      # greater-than characters within quoted strings.
+      def scan_tag
+        # Consume the leading character (the "<" that #next detected).
+        tag = @scanner.getch
+        if @scanner.scan(/!--/) # comment
+          tag << @scanner.matched
+          # Read up to the closing "-->"; if there is none, read up to the end-of-string anchor.
+          tag << (@scanner.scan_until(/--\s*>/) || @scanner.scan_until(/\Z/))
+        elsif @scanner.scan(/!\[CDATA\[/) # CDATA section
+          tag << @scanner.matched
+          # Read up to the closing "]]>"; if there is none, read up to the end-of-string anchor.
+          tag << (@scanner.scan_until(/\]\]>/) || @scanner.scan_until(/\Z/))
+        elsif @scanner.scan(/!/) # doctype
+          tag << @scanner.matched
+          tag << consume_quoted_regions
+        else
+          # regular tag
+          tag << consume_quoted_regions
+        end
+        tag
+      end
+      # Scan all text up to the next < character and return it.
+      def scan_text
+        # One character is always consumed first, so a "<" that #next did not treat as a tag
+        # start (it is not followed by a non-space character) becomes part of the text;
+        # the rest of the token is every following character that is not "<".
+        "#{@scanner.getch}#{@scanner.scan(/[^<]*/)}"
+      end
+      # Counts the number of newlines in the text and updates the current line
+      # accordingly.
+      def update_current_line(text)
+        text.scan(/\r?\n/) { @current_line += 1 }
+      end
+      # Skips over quoted strings, so that less-than and greater-than characters
+      # within the strings are ignored.
+      def consume_quoted_regions
+        # Returns the text consumed from the current scanner position up to and including the
+        # ">" that ends the tag, or up to (excluding) a "<" met outside a quoted string.
+        text = ""
+        loop do
+          # Stop when no quote or angle bracket is left in the input.
+          match = @scanner.scan_until(/['"<>]/) or break
+          delim = @scanner.matched
+          if delim == "<"
+            # A new "<" before this tag was closed: drop it from the match and step the
+            # scanner back so that it is still available to the next token.
+            match = match.chop
+            @scanner.pos -= 1
+          end
+          text << match
+          break if delim == "<" || delim == ">"
+          # consume the quoted region
+          # (scans to the next backslash or to the matching quote character)
+          while match = @scanner.scan_until(/[\\#{delim}]/)
+            text << match
+            break if @scanner.matched == delim
+            break if @scanner.eos?
+            text << @scanner.getch # skip the escaped character
+          end
+        end
+        text
+      end
+    end
+  end
+end

data/lib/autolinker/text_helper.rb ADDED

@@ -0,0 +1,292 @@
+module Autolinker
+  class TextHelper
+    # Turns all URLs and e-mail addresses into clickable links. The <tt>:link</tt> option
+    # will limit what should be linked. You can add HTML attributes to the links using
+    # <tt>:html</tt>. Possible values for <tt>:link</tt> are <tt>:all</tt> (default),
+    # <tt>:email_addresses</tt>, and <tt>:urls</tt>. If a block is given, each URL and
+    # e-mail address is yielded and the result is used as the link text. By default the
+    # text given is sanitized, you can override this behaviour setting the
+    # <tt>:sanitize</tt> option to false, or you can add options to the sanitization of
+    # the text using the <tt>:sanitize_options</tt> option hash.
+    #
+    # ==== Examples
+    #   auto_link("Go to http://www.rubyonrails.org and say hello to david@loudthinking.com")
+    #   # => "Go to <a href=\"http://www.rubyonrails.org\">http://www.rubyonrails.org</a> and
+    #   #     say hello to <a href=\"mailto:david@loudthinking.com\">david@loudthinking.com</a>"
+    #
+    #   auto_link("Visit http://www.loudthinking.com/ or e-mail david@loudthinking.com", :link => :urls)
+    #   # => "Visit <a href=\"http://www.loudthinking.com/\">http://www.loudthinking.com/</a>
+    #   #     or e-mail david@loudthinking.com"
+    #
+    #   auto_link("Visit http://www.loudthinking.com/ or e-mail david@loudthinking.com", :link => :email_addresses)
+    #   # => "Visit http://www.loudthinking.com/ or e-mail <a href=\"mailto:david@loudthinking.com\">david@loudthinking.com</a>"
+    #
+    #   post_body = "Welcome to my new blog at http://www.myblog.com/.  Please e-mail me at me@email.com."
+    #   auto_link(post_body, :html => { :target => '_blank' }) do |text|
+    #     truncate(text, :length => 15)
+    #   end
+    #   # => "Welcome to my new blog at <a href=\"http://www.myblog.com/\" target=\"_blank\">http://www.m...</a>.
+    #         Please e-mail me at <a href=\"mailto:me@email.com\">me@email.com</a>."
+    #
+    #
+    # You can still use <tt>auto_link</tt> with the old API that accepts the
+    # +link+ as its optional second parameter and the +html_options+ hash
+    # as its optional third parameter:
+    #   post_body = "Welcome to my new blog at http://www.myblog.com/. Please e-mail me at me@email.com."
+    #   auto_link(post_body, :urls)
+    #   # => "Welcome to my new blog at <a href=\"http://www.myblog.com/\">http://www.myblog.com</a>.
+    #         Please e-mail me at me@email.com."
+    #
+    #   auto_link(post_body, :all, :target => "_blank")
+    #   # => "Welcome to my new blog at <a href=\"http://www.myblog.com/\" target=\"_blank\">http://www.myblog.com</a>.
+    #         Please e-mail me at <a href=\"mailto:me@email.com\">me@email.com</a>."
+    def auto_link(text, *args, &block) #link = :all, html = {}, &block)
+      return '' if text.nil? || text.empty?
+      options = args.size == 2 ? {} : extract_options!(args) # this is necessary because the old auto_link API has a Hash as its last parameter
+      unless args.empty?
+        options[:link] = args[0] || :all
+        options[:html] = args[1] || {}
+      end
+      options = { :link => :all, :html => {} }.merge(options)
+      sanitize_options = options[:sanitize_options] || {}
+      sanitize = (options[:sanitize] != false)
+      text = conditional_sanitize(text, sanitize, sanitize_options).to_str
+      case options[:link].to_sym
+      when :all then
+        auto_link_email_addresses(auto_link_urls(text, options[:html], options, &block), options[:html], &block)
+      when :email_addresses then
+        auto_link_email_addresses(text, options[:html], &block)
+      when :urls then
+        auto_link_urls(text, options[:html], options, &block)
+      end
+    end
+    private
+    AUTO_LINK_RE = %r{
+        (?: ((?:ed2k|ftp|http|https|irc|mailto|news|gopher|nntp|telnet|webcal|xmpp|callto|feed|svn|urn|aim|rsync|tag|ssh|sftp|rtsp|afs|file):)// | www\. )
+        [^\s<\u00A0]+
+      }x
+    # regexps for determining context, used high-volume
+    AUTO_LINK_CRE = [/<[^>]+$/, /^[^>]*>/, /<a\b.*?>/i, /<\/a>/i]
+    AUTO_EMAIL_LOCAL_RE = /[\w.!#\$%&'*\/=?^`{|}~+-]/
+    AUTO_EMAIL_RE = /[\w.!#\$%+-]\.?(?:#{AUTO_EMAIL_LOCAL_RE}+\.)*#{AUTO_EMAIL_LOCAL_RE}*@[\w-]+(?:\.[\w-]+)+/
+    BRACKETS = { ']' => '[', ')' => '(', '}' => '{' }
+    WORD_PATTERN = RUBY_VERSION < '1.9' ? '\w' : '\p{Word}'
+    # Turns all urls into clickable links.  If a block is given, each url
+    # is yielded and the result is used as the link text.
+    def auto_link_urls(text, link_attributes = {}, options = {})
+      text.gsub(AUTO_LINK_RE) do
+        # $1 is the captured 'scheme:' (nil for bare 'www.' matches), $& the whole match.
+        scheme, href = $1, $&
+        punctuation = []
+        # $` and $' are the text before and after the current match.
+        if auto_linked?($`, $')
+          # do not change string; URL is already linked
+          href
+        else
+          # don't include trailing punctuation character as part of the URL
+          # (each pass strips one trailing character that is not a word character, '/' or '-')
+          while href.sub!(/[^#{WORD_PATTERN}\/-]$/, '')
+            punctuation.push $&
+            # A closing bracket is put back when the URL holds more of its opening bracket
+            # than of the closing one, i.e. the bracket pair belongs to the URL itself.
+            if opening = BRACKETS[punctuation.last] and href.scan(opening).size > href.scan(punctuation.last).size
+              href << punctuation.pop
+              break
+            end
+          end
+          link_text = block_given? ? yield(href) : href
+          # Matches that started with 'www.' have no scheme and get 'http://' prepended.
+          href = 'http://' + href unless scheme
+          unless options[:sanitize] == false
+            link_text = sanitize(link_text)
+            href = sanitize(href)
+          end
+          #"<a href='#{link_attributes.merge('href' => href)}'>#{link_text}</a>"
+          # The escape flag is true only when :sanitize was given as a truthy value; with the
+          # option absent (nil) content_tag is told not to escape. The stripped punctuation is
+          # appended after the link in its original order.
+          content_tag(:a, link_text, link_attributes.merge('href' => href), !!options[:sanitize]) + punctuation.reverse.join('')
+        end
+      end
+    end
+    # Turns all email addresses into clickable links.  If a block is given,
+    # each email is yielded and the result is used as the link text.
+    def auto_link_email_addresses(text, html_options = {}, options = {})
+      text.gsub(AUTO_EMAIL_RE) do
+        text = $&
+        if auto_linked?($`, $')
+          text
+        else
+          display_text = (block_given?) ? yield(text) : text
+          unless options[:sanitize] == false
+            text = sanitize(text)
+            display_text = sanitize(display_text) unless text == display_text
+          end
+          mail_to text, display_text, html_options
+        end
+      end
+    end
+    def extract_options!(args)
+      if args.last.is_a?(Hash)
+        args.pop
+      else
+        {}
+      end
+    end
+    # Detects already linked context or position in the middle of a tag
+    def auto_linked?(left, right)
+      # +left+ / +right+ are the text before / after a candidate match. Truthy when either
+      #  * +left+ ends inside an unclosed "<..." and +right+ starts by closing it with ">"
+      #    (the candidate sits inside a tag), or
+      #  * the last <a ...> opening tag in +left+ is not followed by a </a> before the candidate
+      #    ($' below is the text after that last opening tag, as set by the rindex call).
+      (left =~ AUTO_LINK_CRE[0] and right =~ AUTO_LINK_CRE[1]) or
+        (left.rindex(AUTO_LINK_CRE[2]) and $' !~ AUTO_LINK_CRE[3])
+    end
+    def conditional_sanitize(target, condition, sanitize_options = {})
+      condition ? sanitize(target, sanitize_options) : target
+    end
+    def sanitize(html, options = {})
+      Autolinker::HTML::Sanitizer.new.sanitize(html, options)
+    end
+    def content_tag(name, content_or_options_with_block = nil, options = nil, escape = true, &block)
+      # Builds an HTML element String via #content_tag_string. Without a block the second
+      # argument is the element content; with a block the content comes from the block and
+      # a Hash passed as second argument is used as the options instead.
+      if block_given?
+        options = content_or_options_with_block if content_or_options_with_block.is_a?(Hash)
+        # NOTE(review): #capture is not defined in the visible part of this class -- confirm
+        # that the block form is actually usable here.
+        content_tag_string(name, capture(&block), options, escape)
+      else
+        content_tag_string(name, content_or_options_with_block, options, escape)
+      end
+    end
+    def content_tag_string(name, content, options, escape = true)
+      tag_options = tag_options(options, escape) if options
+      "<#{name}#{tag_options}>#{PRE_CONTENT_STRINGS[name.to_sym]}#{escape ? ERB::Util.h(content) : content}</#{name}>"
+    end
+    BOOLEAN_ATTRIBUTES = %w(disabled readonly multiple checked autobuffer
+                           autoplay controls loop selected hidden scoped async
+                           defer reversed ismap seemless muted required
+                           autofocus novalidate formnovalidate open pubdate).to_set
+    BOOLEAN_ATTRIBUTES.merge(BOOLEAN_ATTRIBUTES.map { |attribute| attribute.to_sym })
+    PRE_CONTENT_STRINGS = {
+      :textarea => "\n"
+    }
+    def tag_options(options, escape = true)
+      # Renders +options+ as an HTML attribute String with a leading space and the attributes
+      # sorted alphabetically. Returns nil when +options+ is nil/empty or nothing is rendered.
+      # Values are HTML-escaped only when +escape+ is true.
+      unless options.nil? || options.empty?
+        attrs = []
+        options.each_pair do |key, value|
+          if key.to_s == 'data' && value.is_a?(Hash)
+            # A Hash under 'data' expands into one data-* attribute per entry.
+            value.each do |k, v|
+              unless v.is_a?(String) || v.is_a?(Symbol) || v.is_a?(BigDecimal)
+                # NOTE(review): #to_json and #dasherize (below) are not core Ruby methods and
+                # nothing in the visible source requires them -- confirm they are loaded.
+                v = v.to_json
+              end
+              v = ERB::Util.html_escape(v) if escape
+              attrs << %(data-#{k.to_s.dasherize}="#{v}")
+            end
+          elsif BOOLEAN_ATTRIBUTES.include?(key)
+            # Boolean attributes are rendered as key="key", and only for truthy values.
+            attrs << %(#{key}="#{key}") if value
+          elsif !value.nil?
+            # Array values are joined with spaces; nil values are skipped entirely.
+            final_value = value.is_a?(Array) ? value.join(" ") : value
+            final_value = ERB::Util.html_escape(final_value) if escape
+            attrs << %(#{key}="#{final_value}")
+          end
+        end
+        " #{attrs.sort * ' '}" unless attrs.empty?
+      end
+    end
+    # Creates a mailto link tag to the specified +email_address+, which is
+    # also used as the name of the link unless +name+ is specified. Additional
+    # HTML attributes for the link can be passed in +html_options+.
+    #
+    # +mail_to+ has several methods for hindering email harvesters and customizing
+    # the email itself by passing special keys to +html_options+.
+    #
+    # ==== Options
+    # * <tt>:encode</tt> - This key will accept the strings "javascript" or "hex".
+    #   Passing "javascript" will dynamically create and encode the mailto link then
+    #   eval it into the DOM of the page. This method will not show the link on
+    #   the page if the user has JavaScript disabled. Passing "hex" will hex
+    #   encode the +email_address+ before outputting the mailto link.
+    # * <tt>:replace_at</tt> - When the link +name+ isn't provided, the
+    #   +email_address+ is used for the link label. You can use this option to
+    #   obfuscate the +email_address+ by substituting the @ sign with the string
+    #   given as the value.
+    # * <tt>:replace_dot</tt> - When the link +name+ isn't provided, the
+    #   +email_address+ is used for the link label. You can use this option to
+    #   obfuscate the +email_address+ by substituting the . in the email with the
+    #   string given as the value.
+    # * <tt>:subject</tt> - Preset the subject line of the email.
+    # * <tt>:body</tt> - Preset the body of the email.
+    # * <tt>:cc</tt> - Carbon Copy additional recipients on the email.
+    # * <tt>:bcc</tt> - Blind Carbon Copy additional recipients on the email.
+    #
+    # ==== Examples
+    #   mail_to "me@domain.com"
+    #   # => <a href="mailto:me@domain.com">me@domain.com</a>
+    #
+    #   mail_to "me@domain.com", "My email", :encode => "javascript"
+    #   # => <script type="text/javascript">eval(decodeURIComponent('%64%6f%63...%27%29%3b'))</script>
+    #
+    #   mail_to "me@domain.com", "My email", :encode => "hex"
+    #   # => <a href="mailto:%6d%65@%64%6f%6d%61%69%6e.%63%6f%6d">My email</a>
+    #
+    #   mail_to "me@domain.com", nil, :replace_at => "_at_", :replace_dot => "_dot_", :class => "email"
+    #   # => <a href="mailto:me@domain.com" class="email">me_at_domain_dot_com</a>
+    #
+    #   mail_to "me@domain.com", "My email", :cc => "ccaddress@domain.com",
+    #            :subject => "This is an example email"
+    #   # => <a href="mailto:me@domain.com?cc=ccaddress@domain.com&subject=This%20is%20an%20example%20email">My email</a>
+    def mail_to(email_address, name = nil, html_options = {})
+      email_address = ERB::Util.html_escape(email_address)
+      encode = html_options.delete("encode").to_s
+      extras = %w{ cc bcc body subject }.map { |item|
+        option = html_options.delete(item) || next
+        "#{item}=#{Rack::Utils.escape(option).gsub("+", "%20")}"
+      }.compact
+      extras = extras.empty? ? '' : '?' + ERB::Util.html_escape(extras.join('&'))
+      email_address_obfuscated = email_address.to_str
+      email_address_obfuscated.gsub!(/@/, html_options.delete("replace_at")) if html_options.key?("replace_at")
+      email_address_obfuscated.gsub!(/\./, html_options.delete("replace_dot")) if html_options.key?("replace_dot")
+      case encode
+      when "javascript"
+        string = ''
+        html = content_tag("a", name || email_address_obfuscated, html_options.merge("href" => "mailto:#{email_address}#{extras}"))
+        html = escape_javascript(html.to_str)
+        "document.write('#{html}');".each_byte do |c|
+          string << sprintf("%%%x", c)
+        end
+        "<script type=\"#{Mime::JS}\">eval(decodeURIComponent('#{string}'))</script>"
+      when "hex"
+        email_address_encoded = email_address_obfuscated.unpack('C*').map { |c|
+          sprintf("&#%d;", c)
+        }.join
+        string = 'mailto:'.unpack('C*').map { |c|
+          sprintf("&#%d;", c)
+        }.join + email_address.unpack('C*').map { |c|
+          char = c.chr
+          char =~ /\w/ ? sprintf("%%%x", c) : char
+        }.join
+        content_tag "a", name || email_address_encoded, html_options.merge("href" => "#{string}#{extras}")
+      else
+        content_tag "a", name || email_address_obfuscated, html_options.merge("href" => "mailto:#{email_address}#{extras}")
+      end
+    end
+  end
+end