RubyGems - twitter-text - Versions diffs - 1.3.1 → 1.3.2 - Mend

twitter-text 1.3.1 → 1.3.2

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (10)

data/.gitmodules CHANGED Viewed

@@ -1,3 +1,3 @@
 [submodule "test/twitter-text-conformance"]
 	path = test/twitter-text-conformance
-	url = git://github.com/mzsanford/twitter-text-conformance.git
+	url = git://github.com/twitter/twitter-text-conformance.git

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    twitter-text (1.3.1)
+    twitter-text (1.3.2)
       actionpack
 GEM

data/Rakefile CHANGED Viewed

@@ -39,7 +39,7 @@ namespace :test do
     desc "Run conformance test suite"
     task :run do
-      ruby "test/conformance_test.rb"
+      ruby '-rubygems', "test/conformance_test.rb"
     end
   end

data/lib/regex.rb CHANGED Viewed

@@ -46,17 +46,21 @@ module Twitter
     REGEXEN[:end_screen_name_match] = /^(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o
-    # Characters considered valid in a hashtag but not at the beginning, where only a-z and 0-9 are valid.
-    HASHTAG_CHARACTERS = /[a-z0-9_#{LATIN_ACCENTS}]/io
-    REGEXEN[:auto_link_hashtags] = /(^|[^0-9A-Z&\/\?]+)(#|＃)([0-9a-z_]*[a-z_]+#{HASHTAG_CHARACTERS}*)/io
+    # A hashtag must contain latin characters, numbers and underscores, but not all numbers.
+    HASHTAG_ALPHA = /[a-z_#{LATIN_ACCENTS}]/io
+    HASHTAG_ALPHANUMERIC = /[a-z0-9_#{LATIN_ACCENTS}]/io
+    REGEXEN[:auto_link_hashtags] = /(^|[^0-9A-Z&\/\?]+)(#|＃)(#{HASHTAG_ALPHANUMERIC}*#{HASHTAG_ALPHA}#{HASHTAG_ALPHANUMERIC}*)/io
     REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_]|^|RT:?)([@＠]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?/o
     REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\&lt;\|:~\(|\}:o\{|:\-\[|\&gt;o\&lt;|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/
     # URL related hash regex collection
     REGEXEN[:valid_preceding_chars] = /(?:[^-\/"':!=A-Z0-9_@＠]|^|\:)/i
-    REGEXEN[:valid_domain] = /(?:[^[:punct:]\s][\.-](?=[^[:punct:]\s])|[^[:punct:]\s]){1,}\.[a-z]{2,}(?::[0-9]+)?/i
-    REGEXEN[:valid_general_url_path_chars] = /[a-z0-9!\*';:=\+\,\$\/%#\[\]\-_~]/i
+    REGEXEN[:valid_subdomain] = /([^[:punct:]\s]([_-]|[^[:punct:]\s])*)?[^[:punct:]\s]\./
+    REGEXEN[:valid_domain_name] = /([^[:punct:]\s]([-]|[^[:punct:]\s])*)?[^[:punct:]\s]/
+    REGEXEN[:valid_domain] = /#{REGEXEN[:valid_subdomain]}*#{REGEXEN[:valid_domain_name]}\.[a-z]{2,}(?::[0-9]+)?/i
+    REGEXEN[:valid_general_url_path_chars] = /[a-z0-9!\*';:=\+\,\$\/%#\[\]\-_~|]/i
     # Allow URL paths to contain balanced parens
     #  1. Used in Wikipedia URLs like /Primer_(film)
     #  2. Used in IIS sessions like /S(dfd346)/
@@ -71,7 +75,7 @@ module Twitter
     # Valid end-of-path chracters (so /foo. does not gobble the period).
     #   1. Allow =&# for empty URL parameters and other URL-join artifacts
     REGEXEN[:valid_url_path_ending_chars] = /[a-z0-9=_#\/\+\-]|#{REGEXEN[:wikipedia_disambiguation]}/io
-    REGEXEN[:valid_url_query_chars] = /[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_\.,~]/i
+    REGEXEN[:valid_url_query_chars] = /[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|]/i
     REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/]/i
     REGEXEN[:valid_url] = %r{
       (                                                                                     #   $1 total match
@@ -91,6 +95,103 @@ module Twitter
       )
     }iox;
+    # These URL validation pattern strings are based on the ABNF from RFC 3986
+    REGEXEN[:validate_url_unreserved] = /[a-z0-9\-._~]/i
+    REGEXEN[:validate_url_pct_encoded] = /(?:%[0-9a-f]{2})/i
+    REGEXEN[:validate_url_sub_delims] = /[!$&'()*+,;=]/i
+    REGEXEN[:validate_url_pchar] = /(?:
+      #{REGEXEN[:validate_url_unreserved]}|
+      #{REGEXEN[:validate_url_pct_encoded]}|
+      #{REGEXEN[:validate_url_sub_delims]}|
+      :|@
+    )/iox
+    REGEXEN[:validate_url_scheme] = /(?:[a-z][a-z0-9+\-.]*)/i
+    REGEXEN[:validate_url_userinfo] = /(?:
+      #{REGEXEN[:validate_url_unreserved]}|
+      #{REGEXEN[:validate_url_pct_encoded]}|
+      #{REGEXEN[:validate_url_sub_delims]}|
+      :
+    )*/iox
+    REGEXEN[:validate_url_dec_octet] = /(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))/i
+    REGEXEN[:validate_url_ipv4] =
+      /(?:#{REGEXEN[:validate_url_dec_octet]}(?:\.#{REGEXEN[:validate_url_dec_octet]}){3})/iox
+    # Punting on real IPv6 validation for now
+    REGEXEN[:validate_url_ipv6] = /(?:\[[a-f0-9:\.]+\])/i
+    # Also punting on IPvFuture for now
+    REGEXEN[:validate_url_ip] = /(?:
+      #{REGEXEN[:validate_url_ipv4]}|
+      #{REGEXEN[:validate_url_ipv6]}
+    )/iox
+    # This is more strict than the rfc specifies
+    REGEXEN[:validate_url_subdomain_segment] = /(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)/i
+    REGEXEN[:validate_url_domain_segment] = /(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)/i
+    REGEXEN[:validate_url_domain_tld] = /(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)/i
+    REGEXEN[:validate_url_domain] = /(?:(?:#{REGEXEN[:validate_url_subdomain_segment]}\.)*
+                                     (?:#{REGEXEN[:validate_url_domain_segment]}\.)
+                                     #{REGEXEN[:validate_url_domain_tld]})/iox
+    REGEXEN[:validate_url_host] = /(?:
+      #{REGEXEN[:validate_url_ip]}|
+      #{REGEXEN[:validate_url_domain]}
+    )/iox
+    # Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences
+    REGEXEN[:validate_url_unicode_subdomain_segment] =
+      /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9_\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
+    REGEXEN[:validate_url_unicode_domain_segment] =
+      /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
+    REGEXEN[:validate_url_unicode_domain_tld] =
+      /(?:(?:[a-z]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
+    REGEXEN[:validate_url_unicode_domain] = /(?:(?:#{REGEXEN[:validate_url_unicode_subdomain_segment]}\.)*
+                                             (?:#{REGEXEN[:validate_url_unicode_domain_segment]}\.)
+                                             #{REGEXEN[:validate_url_unicode_domain_tld]})/iox
+    REGEXEN[:validate_url_unicode_host] = /(?:
+      #{REGEXEN[:validate_url_ip]}|
+      #{REGEXEN[:validate_url_unicode_domain]}
+    )/iox
+    REGEXEN[:validate_url_port] = /[0-9]{1,5}/
+    REGEXEN[:validate_url_unicode_authority] = %r{
+      (?:(#{REGEXEN[:validate_url_userinfo]})@)?     #  $1 userinfo
+      (#{REGEXEN[:validate_url_unicode_host]})       #  $2 host
+      (?::(#{REGEXEN[:validate_url_port]}))?         #  $3 port
+    }iox
+    REGEXEN[:validate_url_authority] = %r{
+      (?:(#{REGEXEN[:validate_url_userinfo]})@)?     #  $1 userinfo
+      (#{REGEXEN[:validate_url_host]})               #  $2 host
+      (?::(#{REGEXEN[:validate_url_port]}))?         #  $3 port
+    }iox
+    REGEXEN[:validate_url_path] = %r{(/#{REGEXEN[:validate_url_pchar]}*)*}i
+    REGEXEN[:validate_url_query] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
+    REGEXEN[:validate_url_fragment] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
+    # Modified version of RFC 3986 Appendix B
+    REGEXEN[:validate_url_unencoded] = %r{
+      \A                                #  Full URL
+      (?:
+        ([^:/?#]+):                    #  $1 Scheme
+      )
+      (?://
+        ([^/?#]*)                      #  $2 Authority
+      )
+      ([^?#]*)                         #  $3 Path
+      (?:
+        \?([^#]*)                      #  $4 Query
+      )?
+      (?:
+        \#(.*)                         #  $5 Fragment
+      )?\Z
+    }ix
     REGEXEN.each_pair{|k,v| v.freeze }
     # Return the regular expression for a given <tt>key</tt>. If the <tt>key</tt>

data/lib/validation.rb CHANGED Viewed

@@ -46,5 +46,57 @@ module Twitter
       return false
     end
+    def valid_tweet_text?(text)
+      !tweet_invalid?(text)
+    end
+    def valid_username?(username)
+      return false if username.blank?
+      extracted = Twitter::Extractor.extract_mentioned_screen_names(username)
+      # Should extract the username minus the @ sign, hence the [1..-1]
+      extracted.size == 1 && extracted.first == username[1..-1]
+    end
+    VALID_LIST_RE = /\A#{Twitter::Regex[:auto_link_usernames_or_lists]}\z/o
+    def valid_list?(username_list)
+      match = username_list.match(VALID_LIST_RE)
+      # Must have matched and had nothing before or after
+      !!(match && match[1] == "" && !match[4].blank?)
+    end
+    def valid_hashtag?(hashtag)
+      return false if hashtag.blank?
+      extracted = Twitter::Extractor.extract_hashtags(hashtag)
+      # Should extract the hashtag minus the # sign, hence the [1..-1]
+      extracted.size == 1 && extracted.first == hashtag[1..-1]
+    end
+    def valid_url?(url, unicode_domains=true)
+      return false if url.blank?
+      url_parts = url.match(Twitter::Regex[:validate_url_unencoded])
+      return false unless (url_parts && url_parts.to_s == url)
+      scheme, authority, path, query, fragment = url_parts.captures
+      return false unless (valid_match?(scheme, Twitter::Regex[:validate_url_scheme]) && scheme.match(/\Ahttps?\Z/i) &&
+                           valid_match?(path, Twitter::Regex[:validate_url_path]) &&
+                           valid_match?(query, Twitter::Regex[:validate_url_query], true) &&
+                           valid_match?(fragment, Twitter::Regex[:validate_url_fragment], true))
+      return (unicode_domains && valid_match?(authority, Twitter::Regex[:validate_url_unicode_authority])) ||
+             (!unicode_domains && valid_match?(authority, Twitter::Regex[:validate_url_authority]))
+    end
+    private
+    def valid_match?(string, regex, optional=false)
+      return (string && string.match(regex) && $~.to_s == string) unless optional
+      !(string && (!string.match(regex) || $~.to_s != string))
+    end
   end
 end

data/spec/autolinking_spec.rb CHANGED Viewed

@@ -310,6 +310,17 @@ describe Twitter::Autolink do
         end
       end
+      context "with a hashtag containing an accented latin character" do
+        def original_text
+          # the hashtag is #éhashtag
+          "##{[0x00e9].pack('U')}hashtag"
+        end
+        it "should be linked" do
+          @autolinked_text.should == "<a href=\"http://twitter.com/search?q=%23éhashtag\" title=\"#éhashtag\" class=\"tweet-url hashtag\" rel=\"nofollow\">#éhashtag</a>"
+        end
+      end
     end
     describe "URL autolinking" do

data/spec/test_urls.rb CHANGED Viewed

@@ -21,6 +21,11 @@ module TestUrls
     "http://mrs.domain-dash.biz",
     "http://x.com/has/one/char/domain",
     "http://t.co/nwcLTFF",
+    "http://sub_domain-dash.twitter.com",
+    "http://a.b.cd",
+    "http://a_b.c-d.com",
+    "http://a-b.b.com",
+    "http://twitter-dash.com",
     # "t.co/nwcLTFF"
   ] unless defined?(TestUrls::VALID)
@@ -29,7 +34,16 @@ module TestUrls
     "http://tld-too-short.x",
     "www.foobar.com",
     "WWW.FOOBAR.COM",
-    "http://-doman_dash.com"
+    "http://-doman_dash.com",
+    "http://_leadingunderscore.twitter.com",
+    "http://trailingunderscore_.twitter.com",
+    "http://-leadingdash.twitter.com",
+    "http://trailingdash-.twitter.com",
+    "http://-leadingdash.com",
+    "http://trailingdash-.com",
+    "http://no_underscores.com",
+    "http://test.c_o_m",
+    "http://test.c-o-m"
   ] unless defined?(TestUrls::INVALID)
 end

data/test/conformance_test.rb CHANGED Viewed

@@ -7,6 +7,7 @@ class ConformanceTest < Test::Unit::TestCase
   include Twitter::Extractor
   include Twitter::Autolink
   include Twitter::HitHighlighter
+  include Twitter::Validation
   def setup
     @conformance_dir = ENV['CONFORMANCE_DIR'] || File.join(File.dirname(__FILE__), 'twitter-text-conformance')
@@ -35,6 +36,9 @@ class ConformanceTest < Test::Unit::TestCase
     def test_url_extractor_conformance
       run_conformance_test(File.join(@conformance_dir, 'extract.yml'), :urls) do |description, expected, input|
         assert_equal expected, extract_urls(input), description
+        expected.each do |expected_url|
+          assert_equal true, valid_url?(expected_url), "expected url [#{expected_url}] not valid"
+        end
       end
     end
@@ -109,6 +113,39 @@ class ConformanceTest < Test::Unit::TestCase
   end
   include HitHighlighterConformance
+  module ValidationConformance
+    def test_tweet_validation_conformance
+      run_conformance_test(File.join(@conformance_dir, 'validate.yml'), :tweets) do |description, expected, input|
+        assert_equal expected, valid_tweet_text?(input), description
+      end
+    end
+    def test_users_validation_conformance
+      run_conformance_test(File.join(@conformance_dir, 'validate.yml'), :usernames) do |description, expected, input|
+        assert_equal expected, valid_username?(input), description
+      end
+    end
+    def test_lists_validation_conformance
+      run_conformance_test(File.join(@conformance_dir, 'validate.yml'), :lists) do |description, expected, input|
+        assert_equal expected, valid_list?(input), description
+      end
+    end
+    def test_urls_validation_conformance
+      run_conformance_test(File.join(@conformance_dir, 'validate.yml'), :urls) do |description, expected, input|
+        assert_equal expected, valid_url?(input), description
+      end
+    end
+    def test_hashtags_validation_conformance
+      run_conformance_test(File.join(@conformance_dir, 'validate.yml'), :hashtags) do |description, expected, input|
+        assert_equal expected, valid_hashtag?(input), description
+      end
+    end
+  end
+  include ValidationConformance
   private
   def run_conformance_test(file, test_type, hash_config = false, &block)
@@ -123,4 +160,4 @@ class ConformanceTest < Test::Unit::TestCase
       end
     end
   end
-end
+end

data/twitter-text.gemspec CHANGED Viewed

@@ -1,6 +1,6 @@
 spec = Gem::Specification.new do |s|
   s.name = "twitter-text"
-  s.version = "1.3.1"
+  s.version = "1.3.2"
   s.authors = ["Matt Sanford", "Patrick Ewing", "Ben Cherry", "Britt Selvitelle", "Raffi Krikorian"]
   s.email = ["matt@twitter.com", "patrick.henry.ewing@gmail.com", "bcherry@gmail.com", "bs@brittspace.com", "raffi@twitter.com"]
   s.homepage = "http://twitter.com"

metadata CHANGED Viewed

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: twitter-text
 version: !ruby/object:Gem::Version
-  hash: 25
-  prerelease: false
+  hash: 31
+  prerelease:
   segments:
   - 1
   - 3
-  - 1
-  version: 1.3.1
+  - 2
+  version: 1.3.2
 platform: ruby
 authors:
 - Matt Sanford
@@ -19,11 +19,13 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-01-06 00:00:00 -08:00
+date: 2011-04-20 00:00:00 -07:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
-  version_requirements: &id001 !ruby/object:Gem::Requirement
+  name: nokogiri
+  prerelease: false
+  requirement: &id001 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ">="
@@ -32,12 +34,12 @@ dependencies:
         segments:
         - 0
         version: "0"
-  requirement: *id001
-  name: nokogiri
-  prerelease: false
   type: :development
+  version_requirements: *id001
 - !ruby/object:Gem::Dependency
-  version_requirements: &id002 !ruby/object:Gem::Requirement
+  name: rake
+  prerelease: false
+  requirement: &id002 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ">="
@@ -46,12 +48,12 @@ dependencies:
         segments:
         - 0
         version: "0"
-  requirement: *id002
-  name: rake
-  prerelease: false
   type: :development
+  version_requirements: *id002
 - !ruby/object:Gem::Dependency
-  version_requirements: &id003 !ruby/object:Gem::Requirement
+  name: rspec
+  prerelease: false
+  requirement: &id003 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ">="
@@ -60,12 +62,12 @@ dependencies:
         segments:
         - 0
         version: "0"
-  requirement: *id003
-  name: rspec
-  prerelease: false
   type: :development
+  version_requirements: *id003
 - !ruby/object:Gem::Dependency
-  version_requirements: &id004 !ruby/object:Gem::Requirement
+  name: simplecov
+  prerelease: false
+  requirement: &id004 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ">="
@@ -74,12 +76,12 @@ dependencies:
         segments:
         - 0
         version: "0"
-  requirement: *id004
-  name: simplecov
-  prerelease: false
   type: :development
+  version_requirements: *id004
 - !ruby/object:Gem::Dependency
-  version_requirements: &id005 !ruby/object:Gem::Requirement
+  name: actionpack
+  prerelease: false
+  requirement: &id005 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ">="
@@ -88,10 +90,8 @@ dependencies:
         segments:
         - 0
         version: "0"
-  requirement: *id005
-  name: actionpack
-  prerelease: false
   type: :runtime
+  version_requirements: *id005
 description: A gem that provides text handling for Twitter
 email:
 - matt@twitter.com
@@ -165,7 +165,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubyforge_project:
-rubygems_version: 1.3.7
+rubygems_version: 1.4.1
 signing_key:
 specification_version: 3
 summary: Twitter text handling library