RubyGems - twitter-text - Versions diffs - 1.3.1 → 1.3.2 - Mend

twitter-text 1.3.1 → 1.3.2

Files changed (10) hide show

data/.gitmodules CHANGED Viewed

@@ -1,3 +1,3 @@
 [submodule "test/twitter-text-conformance"]
 	path = test/twitter-text-conformance
-	url = git://github.com/mzsanford/twitter-text-conformance.git
+	url = git://github.com/twitter/twitter-text-conformance.git

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    twitter-text (1.3.1)
+    twitter-text (1.3.2)
       actionpack
 GEM

data/Rakefile CHANGED Viewed

@@ -39,7 +39,7 @@ namespace :test do
     desc "Run conformance test suite"
     task :run do
-      ruby "test/conformance_test.rb"
+      ruby '-rubygems', "test/conformance_test.rb"
     end
   end

data/lib/regex.rb CHANGED Viewed

@@ -46,17 +46,21 @@ module Twitter
     REGEXEN[:end_screen_name_match] = /^(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o
-    # Characters considered valid in a hashtag but not at the beginning, where only a-z and 0-9 are valid.
-    HASHTAG_CHARACTERS = /[a-z0-9_#{LATIN_ACCENTS}]/io
-    REGEXEN[:auto_link_hashtags] = /(^|[^0-9A-Z&\/\?]+)(#|＃)([0-9a-z_]*[a-z_]+#{HASHTAG_CHARACTERS}*)/io
+    # A hashtag may contain Latin characters, numbers and underscores, but must include at least one non-digit character (it cannot be all numbers).
+    HASHTAG_ALPHA = /[a-z_#{LATIN_ACCENTS}]/io
+    HASHTAG_ALPHANUMERIC = /[a-z0-9_#{LATIN_ACCENTS}]/io
+    REGEXEN[:auto_link_hashtags] = /(^|[^0-9A-Z&\/\?]+)(#|＃)(#{HASHTAG_ALPHANUMERIC}*#{HASHTAG_ALPHA}#{HASHTAG_ALPHANUMERIC}*)/io
     REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_]|^|RT:?)([@＠]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?/o
     REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\&lt;\|:~\(|\}:o\{|:\-\[|\&gt;o\&lt;|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/
     # URL related hash regex collection
     REGEXEN[:valid_preceding_chars] = /(?:[^-\/"':!=A-Z0-9_@＠]|^|\:)/i
-    REGEXEN[:valid_domain] = /(?:[^[:punct:]\s][\.-](?=[^[:punct:]\s])|[^[:punct:]\s]){1,}\.[a-z]{2,}(?::[0-9]+)?/i
-    REGEXEN[:valid_general_url_path_chars] = /[a-z0-9!\*';:=\+\,\$\/%#\[\]\-_~]/i
+    REGEXEN[:valid_subdomain] = /([^[:punct:]\s]([_-]|[^[:punct:]\s])*)?[^[:punct:]\s]\./
+    REGEXEN[:valid_domain_name] = /([^[:punct:]\s]([-]|[^[:punct:]\s])*)?[^[:punct:]\s]/
+    REGEXEN[:valid_domain] = /#{REGEXEN[:valid_subdomain]}*#{REGEXEN[:valid_domain_name]}\.[a-z]{2,}(?::[0-9]+)?/i
+    REGEXEN[:valid_general_url_path_chars] = /[a-z0-9!\*';:=\+\,\$\/%#\[\]\-_~|]/i
     # Allow URL paths to contain balanced parens
     #  1. Used in Wikipedia URLs like /Primer_(film)
     #  2. Used in IIS sessions like /S(dfd346)/
@@ -71,7 +75,7 @@ module Twitter
     # Valid end-of-path characters (so /foo. does not gobble the period).
     #   1. Allow =&# for empty URL parameters and other URL-join artifacts
     REGEXEN[:valid_url_path_ending_chars] = /[a-z0-9=_#\/\+\-]|#{REGEXEN[:wikipedia_disambiguation]}/io
-    REGEXEN[:valid_url_query_chars] = /[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_\.,~]/i
+    REGEXEN[:valid_url_query_chars] = /[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|]/i
     REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/]/i
     REGEXEN[:valid_url] = %r{
       (                                                                                     #   $1 total match
@@ -91,6 +95,103 @@ module Twitter
       )
     }iox;
+    # These URL validation pattern strings are based on the ABNF from RFC 3986
+    REGEXEN[:validate_url_unreserved] = /[a-z0-9\-._~]/i
+    REGEXEN[:validate_url_pct_encoded] = /(?:%[0-9a-f]{2})/i
+    REGEXEN[:validate_url_sub_delims] = /[!$&'()*+,;=]/i
+    REGEXEN[:validate_url_pchar] = /(?:
+      #{REGEXEN[:validate_url_unreserved]}|
+      #{REGEXEN[:validate_url_pct_encoded]}|
+      #{REGEXEN[:validate_url_sub_delims]}|
+      :|@
+    )/iox
+    REGEXEN[:validate_url_scheme] = /(?:[a-z][a-z0-9+\-.]*)/i
+    REGEXEN[:validate_url_userinfo] = /(?:
+      #{REGEXEN[:validate_url_unreserved]}|
+      #{REGEXEN[:validate_url_pct_encoded]}|
+      #{REGEXEN[:validate_url_sub_delims]}|
+      :
+    )*/iox
+    REGEXEN[:validate_url_dec_octet] = /(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))/i
+    REGEXEN[:validate_url_ipv4] =
+      /(?:#{REGEXEN[:validate_url_dec_octet]}(?:\.#{REGEXEN[:validate_url_dec_octet]}){3})/iox
+    # Real IPv6 validation is deferred for now; this only accepts a bracketed run of hex digits, colons and dots
+    REGEXEN[:validate_url_ipv6] = /(?:\[[a-f0-9:\.]+\])/i
+    # Also punting on IPvFuture for now
+    REGEXEN[:validate_url_ip] = /(?:
+      #{REGEXEN[:validate_url_ipv4]}|
+      #{REGEXEN[:validate_url_ipv6]}
+    )/iox
+    # This is stricter than the RFC specifies
+    REGEXEN[:validate_url_subdomain_segment] = /(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)/i
+    REGEXEN[:validate_url_domain_segment] = /(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)/i
+    REGEXEN[:validate_url_domain_tld] = /(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)/i
+    REGEXEN[:validate_url_domain] = /(?:(?:#{REGEXEN[:validate_url_subdomain_segment]}\.)*
+                                     (?:#{REGEXEN[:validate_url_domain_segment]}\.)
+                                     #{REGEXEN[:validate_url_domain_tld]})/iox
+    REGEXEN[:validate_url_host] = /(?:
+      #{REGEXEN[:validate_url_ip]}|
+      #{REGEXEN[:validate_url_domain]}
+    )/iox
+    # Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences
+    REGEXEN[:validate_url_unicode_subdomain_segment] =
+      /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9_\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
+    REGEXEN[:validate_url_unicode_domain_segment] =
+      /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
+    REGEXEN[:validate_url_unicode_domain_tld] =
+      /(?:(?:[a-z]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
+    REGEXEN[:validate_url_unicode_domain] = /(?:(?:#{REGEXEN[:validate_url_unicode_subdomain_segment]}\.)*
+                                             (?:#{REGEXEN[:validate_url_unicode_domain_segment]}\.)
+                                             #{REGEXEN[:validate_url_unicode_domain_tld]})/iox
+    REGEXEN[:validate_url_unicode_host] = /(?:
+      #{REGEXEN[:validate_url_ip]}|
+      #{REGEXEN[:validate_url_unicode_domain]}
+    )/iox
+    REGEXEN[:validate_url_port] = /[0-9]{1,5}/
+    REGEXEN[:validate_url_unicode_authority] = %r{
+      (?:(#{REGEXEN[:validate_url_userinfo]})@)?     #  $1 userinfo
+      (#{REGEXEN[:validate_url_unicode_host]})       #  $2 host
+      (?::(#{REGEXEN[:validate_url_port]}))?         #  $3 port
+    }iox
+    REGEXEN[:validate_url_authority] = %r{
+      (?:(#{REGEXEN[:validate_url_userinfo]})@)?     #  $1 userinfo
+      (#{REGEXEN[:validate_url_host]})               #  $2 host
+      (?::(#{REGEXEN[:validate_url_port]}))?         #  $3 port
+    }iox
+    REGEXEN[:validate_url_path] = %r{(/#{REGEXEN[:validate_url_pchar]}*)*}i
+    REGEXEN[:validate_url_query] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
+    REGEXEN[:validate_url_fragment] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
+    # Modified version of RFC 3986 Appendix B
+    REGEXEN[:validate_url_unencoded] = %r{
+      \A                                #  Full URL
+      (?:
+        ([^:/?#]+):                    #  $1 Scheme
+      )
+      (?://
+        ([^/?#]*)                      #  $2 Authority
+      )
+      ([^?#]*)                         #  $3 Path
+      (?:
+        \?([^#]*)                      #  $4 Query
+      )?
+      (?:
+        \#(.*)                         #  $5 Fragment
+      )?\Z
+    }ix
     REGEXEN.each_pair{|k,v| v.freeze }
     # Return the regular expression for a given <tt>key</tt>. If the <tt>key</tt>

data/lib/validation.rb CHANGED Viewed

@@ -46,5 +46,57 @@ module Twitter
       return false
     end
+    def valid_tweet_text?(text)
+      !tweet_invalid?(text)
+    end
+    def valid_username?(username)
+      return false if username.blank?
+      extracted = Twitter::Extractor.extract_mentioned_screen_names(username)
+      # Should extract the username minus the @ sign, hence the [1..-1]
+      extracted.size == 1 && extracted.first == username[1..-1]
+    end
+    VALID_LIST_RE = /\A#{Twitter::Regex[:auto_link_usernames_or_lists]}\z/o
+    def valid_list?(username_list)
+      match = username_list.match(VALID_LIST_RE)
+      # Must have matched with nothing before the @ sign (group 1 empty) and must include a list slug (group 4)
+      !!(match && match[1] == "" && !match[4].blank?)
+    end
+    def valid_hashtag?(hashtag)
+      return false if hashtag.blank?
+      extracted = Twitter::Extractor.extract_hashtags(hashtag)
+      # Should extract the hashtag minus the # sign, hence the [1..-1]
+      extracted.size == 1 && extracted.first == hashtag[1..-1]
+    end
+    def valid_url?(url, unicode_domains=true)
+      return false if url.blank?
+      url_parts = url.match(Twitter::Regex[:validate_url_unencoded])
+      return false unless (url_parts && url_parts.to_s == url)
+      scheme, authority, path, query, fragment = url_parts.captures
+      return false unless (valid_match?(scheme, Twitter::Regex[:validate_url_scheme]) && scheme.match(/\Ahttps?\Z/i) &&
+                           valid_match?(path, Twitter::Regex[:validate_url_path]) &&
+                           valid_match?(query, Twitter::Regex[:validate_url_query], true) &&
+                           valid_match?(fragment, Twitter::Regex[:validate_url_fragment], true))
+      return (unicode_domains && valid_match?(authority, Twitter::Regex[:validate_url_unicode_authority])) ||
+             (!unicode_domains && valid_match?(authority, Twitter::Regex[:validate_url_authority]))
+    end
+    private
+    def valid_match?(string, regex, optional=false)
+      return (string && string.match(regex) && $~.to_s == string) unless optional
+      !(string && (!string.match(regex) || $~.to_s != string))
+    end
   end
 end

data/spec/autolinking_spec.rb CHANGED Viewed

@@ -310,6 +310,17 @@ describe Twitter::Autolink do
         end
       end
+      context "with a hashtag containing an accented latin character" do
+        def original_text
+          # the hashtag is #éhashtag
+          "##{[0x00e9].pack('U')}hashtag"
+        end
+        it "should be linked" do
+          @autolinked_text.should == "<a href=\"http://twitter.com/search?q=%23éhashtag\" title=\"#éhashtag\" class=\"tweet-url hashtag\" rel=\"nofollow\">#éhashtag</a>"
+        end
+      end
     end
     describe "URL autolinking" do

data/spec/test_urls.rb CHANGED Viewed

@@ -21,6 +21,11 @@ module TestUrls
     "http://mrs.domain-dash.biz",
     "http://x.com/has/one/char/domain",
     "http://t.co/nwcLTFF",
+    "http://sub_domain-dash.twitter.com",
+    "http://a.b.cd",
+    "http://a_b.c-d.com",
+    "http://a-b.b.com",
+    "http://twitter-dash.com",
     # "t.co/nwcLTFF"
   ] unless defined?(TestUrls::VALID)
@@ -29,7 +34,16 @@ module TestUrls
     "http://tld-too-short.x",
     "www.foobar.com",
     "WWW.FOOBAR.COM",
-    "http://-doman_dash.com"
+    "http://-doman_dash.com",
+    "http://_leadingunderscore.twitter.com",
+    "http://trailingunderscore_.twitter.com",
+    "http://-leadingdash.twitter.com",
+    "http://trailingdash-.twitter.com",
+    "http://-leadingdash.com",
+    "http://trailingdash-.com",
+    "http://no_underscores.com",
+    "http://test.c_o_m",
+    "http://test.c-o-m"
   ] unless defined?(TestUrls::INVALID)
 end

data/test/conformance_test.rb CHANGED Viewed

@@ -7,6 +7,7 @@ class ConformanceTest < Test::Unit::TestCase
   include Twitter::Extractor
   include Twitter::Autolink
   include Twitter::HitHighlighter
+  include Twitter::Validation
   def setup
     @conformance_dir = ENV['CONFORMANCE_DIR'] || File.join(File.dirname(__FILE__), 'twitter-text-conformance')
@@ -35,6 +36,9 @@ class ConformanceTest < Test::Unit::TestCase
     def test_url_extractor_conformance
       run_conformance_test(File.join(@conformance_dir, 'extract.yml'), :urls) do |description, expected, input|
         assert_equal expected, extract_urls(input), description
+        expected.each do |expected_url|
+          assert_equal true, valid_url?(expected_url), "expected url [#{expected_url}] not valid"
+        end
       end
     end
@@ -109,6 +113,39 @@ class ConformanceTest < Test::Unit::TestCase
   end
   include HitHighlighterConformance
+  module ValidationConformance
+    def test_tweet_validation_conformance
+      run_conformance_test(File.join(@conformance_dir, 'validate.yml'), :tweets) do |description, expected, input|
+        assert_equal expected, valid_tweet_text?(input), description
+      end
+    end
+    def test_users_validation_conformance
+      run_conformance_test(File.join(@conformance_dir, 'validate.yml'), :usernames) do |description, expected, input|
+        assert_equal expected, valid_username?(input), description
+      end
+    end
+    def test_lists_validation_conformance
+      run_conformance_test(File.join(@conformance_dir, 'validate.yml'), :lists) do |description, expected, input|
+        assert_equal expected, valid_list?(input), description
+      end
+    end
+    def test_urls_validation_conformance
+      run_conformance_test(File.join(@conformance_dir, 'validate.yml'), :urls) do |description, expected, input|
+        assert_equal expected, valid_url?(input), description
+      end
+    end
+    def test_hashtags_validation_conformance
+      run_conformance_test(File.join(@conformance_dir, 'validate.yml'), :hashtags) do |description, expected, input|
+        assert_equal expected, valid_hashtag?(input), description
+      end
+    end
+  end
+  include ValidationConformance
   private
   def run_conformance_test(file, test_type, hash_config = false, &block)
@@ -123,4 +160,4 @@ class ConformanceTest < Test::Unit::TestCase
       end
     end
   end
-end
+end

data/twitter-text.gemspec CHANGED Viewed

@@ -1,6 +1,6 @@
 spec = Gem::Specification.new do |s|
   s.name = "twitter-text"
-  s.version = "1.3.1"
+  s.version = "1.3.2"
   s.authors = ["Matt Sanford", "Patrick Ewing", "Ben Cherry", "Britt Selvitelle", "Raffi Krikorian"]
   s.email = ["matt@twitter.com", "patrick.henry.ewing@gmail.com", "bcherry@gmail.com", "bs@brittspace.com", "raffi@twitter.com"]
   s.homepage = "http://twitter.com"

metadata CHANGED Viewed

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: twitter-text
 version: !ruby/object:Gem::Version
-  hash: 25
-  prerelease: false
+  hash: 31
+  prerelease:
   segments:
   - 1
   - 3
-  - 1
-  version: 1.3.1
+  - 2
+  version: 1.3.2
 platform: ruby
 authors:
 - Matt Sanford
@@ -19,11 +19,13 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-01-06 00:00:00 -08:00
+date: 2011-04-20 00:00:00 -07:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
-  version_requirements: &id001 !ruby/object:Gem::Requirement
+  name: nokogiri
+  prerelease: false
+  requirement: &id001 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ">="
@@ -32,12 +34,12 @@ dependencies:
         segments:
         - 0
         version: "0"
-  requirement: *id001
-  name: nokogiri
-  prerelease: false
   type: :development
+  version_requirements: *id001
 - !ruby/object:Gem::Dependency
-  version_requirements: &id002 !ruby/object:Gem::Requirement
+  name: rake
+  prerelease: false
+  requirement: &id002 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ">="
@@ -46,12 +48,12 @@ dependencies:
         segments:
         - 0
         version: "0"
-  requirement: *id002
-  name: rake
-  prerelease: false
   type: :development
+  version_requirements: *id002
 - !ruby/object:Gem::Dependency
-  version_requirements: &id003 !ruby/object:Gem::Requirement
+  name: rspec
+  prerelease: false
+  requirement: &id003 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ">="
@@ -60,12 +62,12 @@ dependencies:
         segments:
         - 0
         version: "0"
-  requirement: *id003
-  name: rspec
-  prerelease: false
   type: :development
+  version_requirements: *id003
 - !ruby/object:Gem::Dependency
-  version_requirements: &id004 !ruby/object:Gem::Requirement
+  name: simplecov
+  prerelease: false
+  requirement: &id004 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ">="
@@ -74,12 +76,12 @@ dependencies:
         segments:
         - 0
         version: "0"
-  requirement: *id004
-  name: simplecov
-  prerelease: false
   type: :development
+  version_requirements: *id004
 - !ruby/object:Gem::Dependency
-  version_requirements: &id005 !ruby/object:Gem::Requirement
+  name: actionpack
+  prerelease: false
+  requirement: &id005 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ">="
@@ -88,10 +90,8 @@ dependencies:
         segments:
         - 0
         version: "0"
-  requirement: *id005
-  name: actionpack
-  prerelease: false
   type: :runtime
+  version_requirements: *id005
 description: A gem that provides text handling for Twitter
 email:
 - matt@twitter.com
@@ -165,7 +165,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubyforge_project:
-rubygems_version: 1.3.7
+rubygems_version: 1.4.1
 signing_key:
 specification_version: 3
 summary: Twitter text handling library