RubyGems - twitter-text - Versions diffs - 1.4.12 → 1.4.13 - Mend

twitter-text 1.4.12 → 1.4.13

This diff shows the changes between two publicly available versions of a package, each released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (5) hide show

data/lib/extractor.rb CHANGED Viewed

@@ -68,8 +68,9 @@ module Twitter
       return [] unless text
       possible_screen_names = []
-      text.to_s.scan(Twitter::Regex[:extract_mentions]) do |before, sn, after|
+      text.to_s.scan(Twitter::Regex[:extract_mentions]) do |before, sn|
         extract_mentions_match_data = $~
+        after = $'
         unless after =~ Twitter::Regex[:end_screen_name_match]
           start_position = extract_mentions_match_data.char_begin(2) - 1
           end_position = extract_mentions_match_data.char_end(2)
@@ -99,8 +100,9 @@ module Twitter
       return [] unless text
       possible_entries = []
-      text.to_s.scan(Twitter::Regex[:extract_mentions_or_lists]) do |before, sn, list_slug, after|
+      text.to_s.scan(Twitter::Regex[:extract_mentions_or_lists]) do |before, sn, list_slug|
         extract_mentions_match_data = $~
+        after = $'
         unless after =~ Twitter::Regex[:end_screen_name_match]
           start_position = extract_mentions_match_data.char_begin(2) - 1
           end_position = extract_mentions_match_data.char_end(list_slug.nil? ? 2 : 3)
@@ -130,6 +132,7 @@ module Twitter
       possible_screen_name = text.match(Twitter::Regex[:extract_reply])
       return unless possible_screen_name.respond_to?(:captures)
+      return if $' =~ Twitter::Regex[:end_screen_name_match]
       screen_name = possible_screen_name.captures.first
       yield screen_name if block_given?
       screen_name
@@ -161,18 +164,32 @@ module Twitter
         start_position = valid_url_match_data.char_begin(3)
         end_position = valid_url_match_data.char_end(3)
-        # If protocol is missing, check against valid_ascii_domain
+        # If protocol is missing and domain contains non-ASCII characters,
+        # extract ASCII-only domains.
         if !protocol
-          next unless domain =~ Twitter::Regex[:valid_ascii_domain]
-          if $~.char_begin(0)
-            start_position += $~.char_begin(0)
-            url.sub!(domain, $~.to_s())
+          last_url = nil
+          last_url_invalid_match = nil
+          domain.scan(Twitter::Regex[:valid_ascii_domain]) do |ascii_domain|
+            last_url = {
+              :url => ascii_domain,
+              :indices => [start_position + $~.char_begin(0),
+                           start_position + $~.char_end(0)]
+            }
+            last_url_invalid_match = ascii_domain =~ Twitter::Regex[:invalid_short_domain]
+            urls << last_url unless last_url_invalid_match
           end
-        end
-        # Regex in Ruby 1.8 doesn't support lookbehind, so we need to manually filter out
-        # the short URLs without protocol and path, i.e., [domain].[ccTLD]
-        unless !protocol && !path && domain =~ Twitter::Regex[:valid_short_domain]
+          # no ASCII-only domain found. Skip the entire URL
+          next unless last_url
+          # last_url only contains domain. Need to add path and query if they exist.
+          if path
+            # last_url was not added. Add it to urls here.
+            urls << last_url if last_url_invalid_match
+            last_url[:url] = url.sub(domain, last_url[:url])
+            last_url[:indices][1] = end_position
+          end
+        else
           urls << {
             :url => url,
             :indices => [start_position, end_position]
@@ -208,10 +225,13 @@ module Twitter
       text.scan(Twitter::Regex[:auto_link_hashtags]) do |before, hash, hash_text|
         start_position = $~.char_begin(2)
         end_position = $~.char_end(3)
-        tags << {
-          :hashtag => hash_text,
-          :indices => [start_position, end_position]
-        }
+        after = $'
+        unless after =~ Twitter::Regex[:end_hashtag_match]
+          tags << {
+            :hashtag => hash_text,
+            :indices => [start_position, end_position]
+          }
+        end
       end
       tags.each{|tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last } if block_given?
       tags

data/lib/regex.rb CHANGED Viewed

@@ -50,11 +50,6 @@ module Twitter
     ].map{|cp| [cp].pack('U') }.freeze
     REGEXEN[:invalid_control_characters] = /[#{INVALID_CHARACTERS.join('')}]/o
-    REGEXEN[:at_signs] = /[@＠]/
-    REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(?=(.|$))/o
-    REGEXEN[:extract_mentions_or_lists] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?(?=(.|$))/o
-    REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
     major, minor, patch = RUBY_VERSION.split('.')
     if major.to_i >= 2 || major.to_i == 1 && minor.to_i >= 9 || (defined?(RUBY_ENGINE) && ["jruby", "rbx"].include?(RUBY_ENGINE))
       REGEXEN[:list_name] = /[a-zA-Z][a-zA-Z0-9_\-\u0080-\u00ff]{0,24}/
@@ -89,8 +84,6 @@ module Twitter
     ].join('').freeze
     REGEXEN[:latin_accents] = /[#{LATIN_ACCENTS}]+/o
-    REGEXEN[:end_screen_name_match] = /^(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o
     CJ_HASHTAG_CHARACTERS = [
       regex_range(0x30A1, 0x30FA), regex_range(0x30FC, 0x30FE), # Katakana (full-width)
       regex_range(0xFF66, 0xFF9F), # Katakana (half-width)
@@ -104,27 +97,35 @@ module Twitter
       regex_range(0x2F800, 0x2FA1F), regex_range(0x3005), regex_range(0x303B) # Kanji (CJK supplement)
     ].join('').freeze
-    HASHTAG_BOUNDARY = /(?:\A|\z|#{REGEXEN[:spaces]}|[「」。、.,!?！？:;"'])/o
     # A hashtag must contain latin characters, numbers and underscores, but not all numbers.
     HASHTAG_ALPHA = /[a-z_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/io
     HASHTAG_ALPHANUMERIC = /[a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/io
+    HASHTAG_BOUNDARY = /\A|\z|[^&\/a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/o
     HASHTAG = /(#{HASHTAG_BOUNDARY})(#|＃)(#{HASHTAG_ALPHANUMERIC}*#{HASHTAG_ALPHA}#{HASHTAG_ALPHANUMERIC}*)/io
     REGEXEN[:auto_link_hashtags] = /#{HASHTAG}/io
+    # Used in Extractor and Rewriter for final filtering
+    REGEXEN[:end_hashtag_match] = /^(?:[#＃]|:\/\/)/o
+    REGEXEN[:at_signs] = /[@＠]/
+    REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
+    REGEXEN[:extract_mentions_or_lists] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?/o
+    REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
+    # Used in Extractor and Rewriter for final filtering
+    REGEXEN[:end_screen_name_match] = /^(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o
     REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_]|^|RT:?)([@＠]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?/o
     REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\&lt;\|:~\(|\}:o\{|:\-\[|\&gt;o\&lt;|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/
     # URL related hash regex collection
-    REGEXEN[:valid_preceding_chars] = /(?:[^-\/"'!=A-Z0-9_@＠\.#{INVALID_CHARACTERS.join('')}]|^)/io
+    REGEXEN[:valid_preceding_chars] = /(?:[^-\/"'!=A-Z0-9_@＠#＃\.#{INVALID_CHARACTERS.join('')}]|^)/io
     DOMAIN_VALID_CHARS = "[^[:punct:][:space:][:blank:][:cntrl:]#{INVALID_CHARACTERS.join('')}#{UNICODE_SPACES.join('')}]"
     REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
     REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
-    REGEXEN[:valid_gTLD] = /(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel)(?=[^[:alpha:]]|$))/i
+    REGEXEN[:valid_gTLD] = /(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel)(?=[^a-z]|$))/i
     REGEXEN[:valid_ccTLD] = %r{
       (?:
         (?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|
@@ -133,7 +134,7 @@ module Twitter
         lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|
         pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf|tg|th|
         tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)
-        (?=[^[:alpha:]]|$)
+        (?=[^a-z]|$)
       )
     }ix
     REGEXEN[:valid_punycode] = /(?:xn--[0-9a-z]+)/
@@ -145,12 +146,12 @@ module Twitter
     # This is used in Extractor
     REGEXEN[:valid_ascii_domain] = /
-      (?:(?:[[:alnum:]\-_]|#{REGEXEN[:latin_accents]})+\.)+
+      (?:(?:[A-Za-z0-9\-_]|#{REGEXEN[:latin_accents]})+\.)+
       (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
     /iox
     # This is used in Extractor to filter out unwanted URLs.
-    REGEXEN[:valid_short_domain] = /^#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_ccTLD]}$/io
+    REGEXEN[:invalid_short_domain] = /^#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_ccTLD]}$/io
     REGEXEN[:valid_port_number] = /[0-9]+/
@@ -171,7 +172,7 @@ module Twitter
       )|(?:@#{REGEXEN[:valid_general_url_path_chars]}+\/)
     )/iox
-    REGEXEN[:valid_url_query_chars] = /[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|]/i
+    REGEXEN[:valid_url_query_chars] = /[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|]/i
     REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/]/i
     REGEXEN[:valid_url] = %r{
       (                                                                                     #   $1 total match

data/lib/rewriter.rb CHANGED Viewed

@@ -42,10 +42,12 @@ module Twitter
     def rewrite_hashtags(text)
       text.to_s.gsub(Twitter::Regex[:auto_link_hashtags]) do
-        before = $1
-        hash = $2
-        hashtag = $3
-        "#{before}#{yield(hash, hashtag)}"
+        before, hash, hashtag, after = $1, $2, $3, $'
+        if after =~ Twitter::Regex[:end_hashtag_match]
+          "#{before}#{hash}#{hashtag}"
+        else
+          "#{before}#{yield(hash, hashtag)}"
+        end
       end
     end

data/twitter-text.gemspec CHANGED Viewed

@@ -2,7 +2,7 @@
 Gem::Specification.new do |s|
   s.name = "twitter-text"
-  s.version = "1.4.12"
+  s.version = "1.4.13"
   s.authors = ["Matt Sanford", "Patrick Ewing", "Ben Cherry", "Britt Selvitelle",
                "Raffi Krikorian", "J.P. Cummins", "Yoshimasa Niwa", "Keita Fujii"]
   s.email = ["matt@twitter.com", "patrick.henry.ewing@gmail.com", "bcherry@gmail.com", "bs@brittspace.com",

metadata CHANGED Viewed

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: twitter-text
 version: !ruby/object:Gem::Version
-  hash: 31
+  hash: 29
   prerelease:
   segments:
   - 1
   - 4
-  - 12
-  version: 1.4.12
+  - 13
+  version: 1.4.13
 platform: ruby
 authors:
 - Matt Sanford
@@ -22,7 +22,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-10-04 00:00:00 -07:00
+date: 2011-11-02 00:00:00 -07:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency