RubyGems - twitter-text - Versions diffs - 1.4.17 → 1.5.0 - Mend

twitter-text 1.4.17 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

data/.travis.yml +4 -0
data/README.rdoc +3 -13
data/Rakefile +1 -0
data/lib/twitter-text/autolink.rb +436 -0
data/lib/twitter-text/deprecation.rb +15 -0
data/lib/{extractor.rb → twitter-text/extractor.rb} +125 -41
data/lib/{hithighlighter.rb → twitter-text/hit_highlighter.rb} +5 -7
data/lib/{regex.rb → twitter-text/regex.rb} +33 -23
data/lib/twitter-text/rewriter.rb +59 -0
data/lib/{unicode.rb → twitter-text/unicode.rb} +0 -0
data/lib/{validation.rb → twitter-text/validation.rb} +17 -3
data/lib/twitter-text.rb +13 -7
data/spec/autolinking_spec.rb +192 -16
data/spec/extractor_spec.rb +12 -0
data/spec/rewriter_spec.rb +2 -11
data/spec/spec_helper.rb +1 -1
data/test/conformance_test.rb +128 -129
data/twitter-text.gemspec +1 -1
metadata +14 -12
data/lib/autolink.rb +0 -266
data/lib/rewriter.rb +0 -65

data/lib/{extractor.rb → twitter-text/extractor.rb} RENAMED Viewed

@@ -1,3 +1,5 @@
+# encoding: UTF-8
 class String
   # Helper function to count the character length by first converting to an
   # array.  This is needed because with unicode strings, the return value
@@ -45,16 +47,48 @@ module Twitter
   # A module for including Tweet parsing in a class. This module provides functions for the extraction and processing
   # of usernames, lists, URLs and hashtags.
   module Extractor extend self
+    # Remove overlapping entities.
+    # This returns a new array with no overlapping entities.
+    def remove_overlapping_entities(entities)
+      # sort by start index
+      entities = entities.sort_by{|entity| entity[:indices].first}
+      # remove duplicates
+      prev = nil
+      entities.reject!{|entity| (prev && prev[:indices].last > entity[:indices].first) || (prev = entity) && false}
+      entities
+    end
+    # Extracts all usernames, lists, hashtags and URLs  in the Tweet <tt>text</tt>
+    # along with the indices for where the entity occurred
+    # If the <tt>text</tt> is <tt>nil</tt> or contains no entity an empty array
+    # will be returned.
+    #
+    # If a block is given then it will be called for each entity.
+    def extract_entities_with_indices(text, options = {}, &block)
+      # extract all entities
+      entities = extract_urls_with_indices(text, options) +
+                 extract_hashtags_with_indices(text, :check_url_overlap => false) +
+                 extract_mentions_or_lists_with_indices(text) +
+                 extract_cashtags_with_indices(text)
+      return [] if entities.empty?
+      entities = remove_overlapping_entities(entities)
+      entities.each(&block) if block_given?
+      entities
+    end
     # Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>. If the
     # <tt>text</tt> is <tt>nil</tt> or contains no username mentions an empty array
     # will be returned.
     #
     # If a block is given then it will be called for each username.
-    def extract_mentioned_screen_names(text) # :yields: username
-      screen_names_only = extract_mentioned_screen_names_with_indices(text).map{|mention| mention[:screen_name] }
-      screen_names_only.each{|mention| yield mention } if block_given?
-      screen_names_only
+    def extract_mentioned_screen_names(text, &block) # :yields: username
+      screen_names = extract_mentioned_screen_names_with_indices(text).map{|m| m[:screen_name]}
+      screen_names.each(&block) if block_given?
+      screen_names
     end
     # Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>
@@ -68,23 +102,20 @@ module Twitter
       return [] unless text
       possible_screen_names = []
-      text.to_s.scan(Twitter::Regex[:extract_mentions]) do |before, sn|
-        extract_mentions_match_data = $~
-        after = $'
-        unless after =~ Twitter::Regex[:end_screen_name_match]
-          start_position = extract_mentions_match_data.char_begin(2) - 1
-          end_position = extract_mentions_match_data.char_end(2)
-          possible_screen_names << {
-            :screen_name => sn,
-            :indices => [start_position, end_position]
-          }
-        end
+      extract_mentions_or_lists_with_indices(text) do |screen_name, list_slug, start_position, end_position|
+        next unless list_slug.empty?
+        possible_screen_names << {
+          :screen_name => screen_name,
+          :indices => [start_position, end_position]
+        }
       end
       if block_given?
         possible_screen_names.each do |mention|
           yield mention[:screen_name], mention[:indices].first, mention[:indices].last
         end
       end
       possible_screen_names
     end
@@ -97,17 +128,17 @@ module Twitter
     # index, and the end index in the <tt>text</tt>. The list_slug will be an empty string
     # if this is a username mention.
     def extract_mentions_or_lists_with_indices(text) # :yields: username, list_slug, start, end
-      return [] unless text
+      return [] unless text =~ /[@＠]/
       possible_entries = []
-      text.to_s.scan(Twitter::Regex[:extract_mentions_or_lists]) do |before, sn, list_slug|
-        extract_mentions_match_data = $~
+      text.to_s.scan(Twitter::Regex[:valid_mention_or_list]) do |before, at, screen_name, list_slug|
+        match_data = $~
         after = $'
-        unless after =~ Twitter::Regex[:end_screen_name_match]
-          start_position = extract_mentions_match_data.char_begin(2) - 1
-          end_position = extract_mentions_match_data.char_end(list_slug.nil? ? 2 : 3)
+        unless after =~ Twitter::Regex[:end_mention_match]
+          start_position = match_data.char_begin(3) - 1
+          end_position = match_data.char_end(list_slug.nil? ? 3 : 4)
           possible_entries << {
-            :screen_name => sn,
+            :screen_name => screen_name,
             :list_slug => list_slug || "",
             :indices => [start_position, end_position]
           }
@@ -130,9 +161,9 @@ module Twitter
     def extract_reply_screen_name(text) # :yields: username
       return nil unless text
-      possible_screen_name = text.match(Twitter::Regex[:extract_reply])
+      possible_screen_name = text.match(Twitter::Regex[:valid_reply])
       return unless possible_screen_name.respond_to?(:captures)
-      return if $' =~ Twitter::Regex[:end_screen_name_match]
+      return if $' =~ Twitter::Regex[:end_mention_match]
       screen_name = possible_screen_name.captures.first
       yield screen_name if block_given?
       screen_name
@@ -143,10 +174,10 @@ module Twitter
     # will be returned.
     #
     # If a block is given then it will be called for each URL.
-    def extract_urls(text) # :yields: url
-      urls_only = extract_urls_with_indices(text).map{|url| url[:url] }
-      urls_only.each{|url| yield url } if block_given?
-      urls_only
+    def extract_urls(text, &block) # :yields: url
+      urls = extract_urls_with_indices(text).map{|u| u[:url]}
+      urls.each(&block) if block_given?
+      urls
     end
     # Extracts a list of all URLs included in the Tweet <tt>text</tt> along
@@ -154,10 +185,11 @@ module Twitter
     # URLs an empty array will be returned.
     #
     # If a block is given then it will be called for each URL.
-    def extract_urls_with_indices(text) # :yields: url, start, end
-      return [] unless text
+    def extract_urls_with_indices(text, options = {:extract_url_without_protocol => true}) # :yields: url, start, end
+      return [] unless text && (options[:extract_url_without_protocol] ? text.index(".") : text.index(":"))
       urls = []
       position = 0
       text.to_s.scan(Twitter::Regex[:valid_url]) do |all, before, url, protocol, domain, port, path, query|
         valid_url_match_data = $~
@@ -167,6 +199,7 @@ module Twitter
         # If protocol is missing and domain contains non-ASCII characters,
         # extract ASCII-only domains.
         if !protocol
+          next if !options[:extract_url_without_protocol] || before =~ Twitter::Regex[:invalid_url_without_protocol_preceding_chars]
           last_url = nil
           last_url_invalid_match = nil
           domain.scan(Twitter::Regex[:valid_ascii_domain]) do |ascii_domain|
@@ -201,7 +234,7 @@ module Twitter
           }
         end
       end
-      urls.each{|url| yield url[:url], url[:indices].first, url[:indices].last } if block_given?
+      urls.each{|url| yield url[:url], url[:indices].first, url[:indices].last} if block_given?
       urls
     end
@@ -211,10 +244,10 @@ module Twitter
     # character.
     #
     # If a block is given then it will be called for each hashtag.
-    def extract_hashtags(text) # :yields: hashtag_text
-      hashtags_only = extract_hashtags_with_indices(text).map{|hash| hash[:hashtag] }
-      hashtags_only.each{|hash| yield hash } if block_given?
-      hashtags_only
+    def extract_hashtags(text, &block) # :yields: hashtag_text
+      hashtags = extract_hashtags_with_indices(text).map{|h| h[:hashtag]}
+      hashtags.each(&block) if block_given?
+      hashtags
     end
     # Extracts a list of all hashtags included in the Tweet <tt>text</tt>. If the
@@ -223,13 +256,14 @@ module Twitter
     # character.
     #
     # If a block is given then it will be called for each hashtag.
-    def extract_hashtags_with_indices(text) # :yields: hashtag_text, start, end
-      return [] unless text
+    def extract_hashtags_with_indices(text, options = {:check_url_overlap => true}) # :yields: hashtag_text, start, end
+      return [] unless text =~ /[#＃]/
       tags = []
-      text.scan(Twitter::Regex[:auto_link_hashtags]) do |before, hash, hash_text|
-        start_position = $~.char_begin(2)
-        end_position = $~.char_end(3)
+      text.scan(Twitter::Regex[:valid_hashtag]) do |before, hash, hash_text|
+        match_data = $~
+        start_position = match_data.char_begin(2)
+        end_position = match_data.char_end(3)
         after = $'
         unless after =~ Twitter::Regex[:end_hashtag_match]
           tags << {
@@ -238,7 +272,57 @@ module Twitter
           }
         end
       end
-      tags.each{|tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last } if block_given?
+      if options[:check_url_overlap]
+        # extract URLs
+        urls = extract_urls_with_indices(text)
+        unless urls.empty?
+          tags.concat(urls)
+          # remove duplicates
+          tags = remove_overlapping_entities(tags)
+          # remove URL entities
+          tags.reject!{|entity| !entity[:hashtag] }
+        end
+      end
+      tags.each{|tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last} if block_given?
+      tags
+    end
+    # Extracts a list of all cashtags included in the Tweet <tt>text</tt>. If the
+    # <tt>text</tt> is <tt>nil</tt> or contains no cashtags an empty array
+    # will be returned. The array returned will not include the leading <tt>$</tt>
+    # character.
+    #
+    # If a block is given then it will be called for each cashtag.
+    def extract_cashtags(text, &block) # :yields: cashtag_text
+      cashtags = extract_cashtags_with_indices(text).map{|h| h[:cashtag]}
+      cashtags.each(&block) if block_given?
+      cashtags
+    end
+    # Extracts a list of all cashtags included in the Tweet <tt>text</tt>. If the
+    # <tt>text</tt> is <tt>nil</tt> or contains no cashtags an empty array
+    # will be returned. The array returned will not include the leading <tt>$</tt>
+    # character.
+    #
+    # If a block is given then it will be called for each cashtag.
+    def extract_cashtags_with_indices(text) # :yields: cashtag_text, start, end
+      return [] unless text =~ /\$/
+      tags = []
+      text.scan(Twitter::Regex[:valid_cashtag]) do |cash_text|
+        match_data = $~
+        # cash_text doesn't contain $ symbol, so need to decrement start_position by one
+        start_position = match_data.char_begin(1) - 1
+        end_position = match_data.char_end(1)
+        tags << {
+          :cashtag => cash_text[0],
+          :indices => [start_position, end_position]
+        }
+      end
+      tags.each{|tag| yield tag[:cashtag], tag[:indices].first, tag[:indices].last} if block_given?
       tags
     end
   end

data/lib/{hithighlighter.rb → twitter-text/hit_highlighter.rb} RENAMED Viewed

@@ -23,9 +23,9 @@ module Twitter
       chunks = text.split(/[<>]/)
-      result = ""
+      result = []
       chunk_index, chunk = 0, chunks[0]
-      chunk_chars = chunk.respond_to?("mb_chars") ? chunk.mb_chars : chunk.respond_to?("chars") && chunk.chars.respond_to?("[]") ? chunk.chars : chunk
+      chunk_chars = chunk.to_s.to_char_a
       prev_chunks_len = 0
       chunk_cursor = 0
       start_in_chunk = false
@@ -49,13 +49,13 @@ module Twitter
           chunk_cursor = 0
           chunk_index += 2
           chunk = chunks[chunk_index]
-          chunk_chars = chunk.respond_to?("mb_chars") ? chunk.mb_chars : chunk.respond_to?("chars") && chunk.chars.respond_to?("[]") ? chunk.chars : chunk
+          chunk_chars = chunk.to_s.to_char_a
           start_in_chunk = false
         end
         if !placed && !chunk.nil?
           hit_spot = hit - prev_chunks_len
-          result << chunk_chars[chunk_cursor...hit_spot].to_s + tag
+          result << chunk_chars[chunk_cursor...hit_spot] << tag
           chunk_cursor = hit_spot
           if index % 2 == 0
             start_in_chunk = true
@@ -80,9 +80,7 @@ module Twitter
         end
       end
-      result
-    rescue
-      text
+      result.flatten.join
     end
   end
 end

data/lib/{regex.rb → twitter-text/regex.rb} RENAMED Viewed

@@ -1,4 +1,5 @@
-# encoding: utf-8
+# encoding: UTF-8
 module Twitter
   # A collection of regular expressions for parsing Tweet text. The regular expression
   # list is frozen at load time to ensure immutability. These regular expressions are
@@ -77,6 +78,7 @@ module Twitter
           regex_range(0x0289),
           regex_range(0x028b),
           regex_range(0x02bb),
+          regex_range(0x0300, 0x036f),
           regex_range(0x1e00, 0x1eff)
     ].join('').freeze
@@ -86,13 +88,12 @@ module Twitter
       regex_range(0x0500, 0x0527), # Cyrillic Supplement
       regex_range(0x2de0, 0x2dff), # Cyrillic Extended A
       regex_range(0xa640, 0xa69f), # Cyrillic Extended B
-      regex_range(0x0591, 0x05bd), # Hebrew
-      regex_range(0x05bf),
+      regex_range(0x0591, 0x05bf), # Hebrew
       regex_range(0x05c1, 0x05c2),
       regex_range(0x05c4, 0x05c5),
       regex_range(0x05c7),
       regex_range(0x05d0, 0x05ea),
-      regex_range(0x05f0, 0x05f2),
+      regex_range(0x05f0, 0x05f4),
       regex_range(0xfb12, 0xfb28), # Hebrew Presentation Forms
       regex_range(0xfb2a, 0xfb36),
       regex_range(0xfb38, 0xfb3c),
@@ -141,38 +142,44 @@ module Twitter
       regex_range(0x20000, 0x2A6DF), # Kanji (CJK Extension B)
       regex_range(0x2A700, 0x2B73F), # Kanji (CJK Extension C)
       regex_range(0x2B740, 0x2B81F), # Kanji (CJK Extension D)
-      regex_range(0x2F800, 0x2FA1F), regex_range(0x3005), regex_range(0x303B) # Kanji (CJK supplement)
+      regex_range(0x2F800, 0x2FA1F), regex_range(0x3003), regex_range(0x3005), regex_range(0x303B) # Kanji (CJK supplement)
     ].join('').freeze
+    PUNCTUATION_CHARS = '!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~'
+    SPACE_CHARS = " \t\n\x0B\f\r"
+    CTRL_CHARS = "\x00-\x1F\x7F"
     # A hashtag must contain latin characters, numbers and underscores, but not all numbers.
     HASHTAG_ALPHA = /[a-z_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/io
     HASHTAG_ALPHANUMERIC = /[a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/io
-    HASHTAG_BOUNDARY = /\A|\z|[^&\/a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/o
+    HASHTAG_BOUNDARY = /\A|\z|[^&a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/o
     HASHTAG = /(#{HASHTAG_BOUNDARY})(#|＃)(#{HASHTAG_ALPHANUMERIC}*#{HASHTAG_ALPHA}#{HASHTAG_ALPHANUMERIC}*)/io
-    REGEXEN[:auto_link_hashtags] = /#{HASHTAG}/io
-    # Used in Extractor and Rewriter for final filtering
+    REGEXEN[:valid_hashtag] = /#{HASHTAG}/io
+    # Used in Extractor for final filtering
     REGEXEN[:end_hashtag_match] = /\A(?:[#＃]|:\/\/)/o
+    REGEXEN[:valid_mention_preceding_chars] = /(?:[^a-zA-Z0-9_!#\$%&*@＠]|^|RT:?)/o
     REGEXEN[:at_signs] = /[@＠]/
-    REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_!#\$%&*@＠])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
-    REGEXEN[:extract_mentions_or_lists] = /(^|[^a-zA-Z0-9_!#\$%&*@＠])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?/o
-    REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
-    # Used in Extractor and Rewriter for final filtering
-    REGEXEN[:end_screen_name_match] = /\A(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o
-    REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_!#\$%&*@＠]|^|RT:?)([@＠]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?/o
-    REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\&lt;\|:~\(|\}:o\{|:\-\[|\&gt;o\&lt;|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/
+    REGEXEN[:valid_mention_or_list] = /
+      (#{REGEXEN[:valid_mention_preceding_chars]})  # $1: Preceeding character
+      (#{REGEXEN[:at_signs]})                       # $2: At mark
+      ([a-zA-Z0-9_]{1,20})                          # $3: Screen name
+      (\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?             # $4: List (optional)
+    /ox
+    REGEXEN[:valid_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
+    # Used in Extractor for final filtering
+    REGEXEN[:end_mention_match] = /\A(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o
     # URL related hash regex collection
-    REGEXEN[:valid_preceding_chars] = /(?:[^-\/"'!=A-Z0-9_@＠\$#＃\.#{INVALID_CHARACTERS.join('')}]|^)/io
-    DOMAIN_VALID_CHARS = "[^[:punct:][:space:][:blank:][:cntrl:]#{INVALID_CHARACTERS.join('')}#{UNICODE_SPACES.join('')}]"
+    REGEXEN[:valid_url_preceding_chars] = /(?:[^A-Z0-9@＠$#＃#{INVALID_CHARACTERS.join('')}]|^)/io
+    REGEXEN[:invalid_url_without_protocol_preceding_chars] = /[-_.\/]$/
+    DOMAIN_VALID_CHARS = "[^#{PUNCTUATION_CHARS}#{SPACE_CHARS}#{CTRL_CHARS}#{INVALID_CHARACTERS.join('')}#{UNICODE_SPACES.join('')}]"
     REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
     REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
-    REGEXEN[:valid_gTLD] = /(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|xxx)(?=[^a-z]|$))/i
+    REGEXEN[:valid_gTLD] = /(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|xxx)(?=[^0-9a-z]|$))/i
     REGEXEN[:valid_ccTLD] = %r{
       (?:
         (?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|
@@ -181,10 +188,10 @@ module Twitter
         lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|
         pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf|tg|th|
         tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)
-        (?=[^a-z]|$)
+        (?=[^0-9a-z]|$)
       )
     }ix
-    REGEXEN[:valid_punycode] = /(?:xn--[0-9a-z]+)/
+    REGEXEN[:valid_punycode] = /(?:xn--[0-9a-z]+)/i
     REGEXEN[:valid_domain] = /(?:
       #{REGEXEN[:valid_subdomain]}*#{REGEXEN[:valid_domain_name]}
@@ -226,7 +233,7 @@ module Twitter
     REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/]/i
     REGEXEN[:valid_url] = %r{
       (                                                                                     #   $1 total match
-        (#{REGEXEN[:valid_preceding_chars]})                                                #   $2 Preceeding chracter
+        (#{REGEXEN[:valid_url_preceding_chars]})                                            #   $2 Preceeding chracter
         (                                                                                   #   $3 URL
           (https?:\/\/)?                                                                    #   $4 Protocol (optional)
           (#{REGEXEN[:valid_domain]})                                                       #   $5 Domain(s)
@@ -237,6 +244,9 @@ module Twitter
       )
     }iox;
+    REGEXEN[:cashtag] = /[a-z]{1,6}(?:[._][a-z]{1,2})?/i
+    REGEXEN[:valid_cashtag] = /(?:^|#{REGEXEN[:spaces]})\$(#{REGEXEN[:cashtag]})(?=$|\s|[#{PUNCTUATION_CHARS}])/i
     # These URL validation pattern strings are based on the ABNF from RFC 3986
     REGEXEN[:validate_url_unreserved] = /[a-z0-9\-._~]/i
     REGEXEN[:validate_url_pct_encoded] = /(?:%[0-9a-f]{2})/i

data/lib/twitter-text/rewriter.rb ADDED Viewed

@@ -0,0 +1,59 @@
+module Twitter
+  # A module that provides base methods to rewrite usernames, lists, hashtags and URLs.
+  module Rewriter extend self
+    def rewrite_entities(text, entities)
+      chars = text.to_s.to_char_a
+      # sort by start index
+      entities = entities.sort_by{|entity| entity[:indices].first}
+      result = []
+      last_index = entities.inject(0) do |last_index, entity|
+        result << chars[last_index...entity[:indices].first]
+        result << yield(entity, chars)
+        entity[:indices].last
+      end
+      result << chars[last_index..-1]
+      result.flatten.join
+    end
+    # These methods are deprecated and will be removed in the future.
+    extend Deprecation
+    def rewrite(text, options = {})
+      [:hashtags, :urls, :usernames_or_lists].inject(text) do |key|
+        options[key] ? send(:"rewrite_#{key}", text, &options[key]) : text
+      end
+    end
+    deprecate :rewrite, :rewrite_entities
+    def rewrite_usernames_or_lists(text)
+      entities = Extractor.extract_mentions_or_lists_with_indices(text)
+      rewrite_entities(text, entities) do |entity, chars|
+        at = chars[entity[:indices].first]
+        list_slug = entity[:list_slug]
+        list_slug = nil if list_slug.empty?
+        yield(at, entity[:screen_name], list_slug)
+      end
+    end
+    deprecate :rewrite_usernames_or_lists, :rewrite_entities
+    def rewrite_hashtags(text)
+      entities = Extractor.extract_hashtags_with_indices(text)
+      rewrite_entities(text, entities) do |entity, chars|
+        hash = chars[entity[:indices].first]
+        yield(hash, entity[:hashtag])
+      end
+    end
+    deprecate :rewrite_hashtags, :rewrite_entities
+    def rewrite_urls(text)
+      entities = Extractor.extract_urls_with_indices(text, :extract_url_without_protocol => false)
+      rewrite_entities(text, entities) do |entity, chars|
+        yield(entity[:url])
+      end
+    end
+    deprecate :rewrite_urls, :rewrite_entities
+  end
+end

data/lib/{unicode.rb → twitter-text/unicode.rb} RENAMED Viewed

File without changes

data/lib/{validation.rb → twitter-text/validation.rb} RENAMED Viewed

@@ -2,6 +2,11 @@ module Twitter
   module Validation extend self
     MAX_LENGTH = 140
+    DEFAULT_TCO_URL_LENGTHS = {
+      :short_url_length => 20,
+      :short_url_length_https => 21
+    }.freeze
     # Returns the length of the string as it would be displayed. This is equivalent to the length of the Unicode NFC
     # (See: http://www.unicode.org/reports/tr15). This is needed in order to consistently calculate the length of a
     # string no matter which actual form was transmitted. For example:
@@ -14,8 +19,17 @@ module Twitter
     #
     # The string could also contain U+00E9 already, in which case the canonicalization will not change the value.
     #
-    def tweet_length(text)
-      ActiveSupport::Multibyte::Chars.new(text).normalize(:c).length
+    def tweet_length(text, options = {})
+      options = DEFAULT_TCO_URL_LENGTHS.merge(options)
+      length = ActiveSupport::Multibyte::Chars.new(text).normalize(:c).length
+      Twitter::Extractor.extract_urls_with_indices(text) do |url, start_position, end_position|
+        length += start_position - end_position
+        length += url.downcase =~ /^https:\/\// ? options[:short_url_length_https] : options[:short_url_length]
+      end
+      length
     end
     # Check the <tt>text</tt> for any reason that it may not be valid as a Tweet. This is meant as a pre-validation
@@ -52,7 +66,7 @@ module Twitter
       extracted.size == 1 && extracted.first == username[1..-1]
     end
-    VALID_LIST_RE = /\A#{Twitter::Regex[:auto_link_usernames_or_lists]}\z/o
+    VALID_LIST_RE = /\A#{Twitter::Regex[:valid_mention_or_list]}\z/o
     def valid_list?(username_list)
       match = username_list.match(VALID_LIST_RE)
       # Must have matched and had nothing before or after

data/lib/twitter-text.rb CHANGED Viewed

@@ -10,11 +10,17 @@ end
 require 'active_support'
 require 'active_support/core_ext/string/multibyte.rb'
+require 'active_support/core_ext/hash/keys.rb'
-require File.join(File.dirname(__FILE__), 'regex')
-require File.join(File.dirname(__FILE__), 'rewriter')
-require File.join(File.dirname(__FILE__), 'autolink')
-require File.join(File.dirname(__FILE__), 'extractor')
-require File.join(File.dirname(__FILE__), 'unicode')
-require File.join(File.dirname(__FILE__), 'validation')
-require File.join(File.dirname(__FILE__), 'hithighlighter')
+%w(
+  deprecation
+  regex
+  rewriter
+  autolink
+  extractor
+  unicode
+  validation
+  hit_highlighter
+).each do |name|
+  require "twitter-text/#{name}"
+end