RubyGems - twitter-text - Versions diffs - 1.0.1 → 1.0.2 - Mend

twitter-text 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

data/README.rdoc CHANGED Viewed

@@ -56,3 +56,11 @@ between words.
 Special care has been taken to be sure that auto-linking and extraction work
 in Tweets of all languages. This means that languages without spaces between
 words should work equally well.
+=== Conformance
+To run the Conformance suite, you'll need to add that project as a git submodule.  From the root twitter-text-rb directory, run:
+git submodule add git@github.com:mzsanford/twitter-text-conformance.git test/twitter-text-conformance/
+git submodule init
+git submodule update

data/Rakefile CHANGED Viewed

@@ -7,10 +7,9 @@ require 'spec/rake/spectask'
 require 'spec/rake/verify_rcov'
 require 'digest'
 spec = Gem::Specification.new do |s|
   s.name = "twitter-text"
-  s.version = "1.0.1"
+  s.version = "1.0.2"
   s.author = "Matt Sanford"
   s.email = "matt@twitter.com"
   s.homepage = "http://twitter.com"

data/lib/extractor.rb CHANGED Viewed

@@ -1,4 +1,3 @@
 module Twitter
   # A module for including Tweet parsing in a class. This module provides function for the extraction and processing
   # of usernames, lists, URLs and hashtags.
@@ -13,7 +12,9 @@ module Twitter
       return [] unless text
       possible_screen_names = []
-      text.scan(Twitter::Regex[:extract_mentions]) {|before,sn| possible_screen_names << sn }
+      text.scan(Twitter::Regex[:extract_mentions]) do |before, sn, after|
+        possible_screen_names << sn unless after =~ Twitter::Regex[:at_signs]
+      end
       possible_screen_names.each{|sn| yield sn } if block_given?
       possible_screen_names
     end
@@ -39,7 +40,6 @@ module Twitter
     # If a block is given then it will be called for each URL.
     def extract_urls(text) # :yields: url
       return [] unless text
       urls = []
       text.to_s.scan(Twitter::Regex[:valid_url]) do |all, before, url, protocol, domain, path, query|
         urls << (protocol == "www." ? "http://#{url}" : url)
@@ -66,4 +66,4 @@ module Twitter
     end
   end
-end
+end

data/lib/regex.rb CHANGED Viewed

@@ -26,8 +26,9 @@ module Twitter
         ].flatten.freeze
     REGEXEN[:spaces] = Regexp.new(UNICODE_SPACES.collect{ |e| [e].pack 'U*' }.join('|'))
-    REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])[@＠]([a-zA-Z0-9_]{1,20})(?!@)/
-    REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*[@＠]([a-zA-Z0-9_]{1,20})/o
+    REGEXEN[:at_signs] = /[@＠]/
+    REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(?=(.|$))/o
+    REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
     REGEXEN[:list_name] = /^[a-zA-Z\x80-\xff].{0,79}$/
@@ -42,9 +43,9 @@ module Twitter
     REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\&lt;\|:~\(|\}:o\{|:\-\[|\&gt;o\&lt;|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/
     # URL related hash regex collection
-    REGEXEN[:valid_preceeding_chars] = /(?:[^\/"':!=]|^|\:)/
-    REGEXEN[:valid_domain] = /(?:[\.-]|[^[:punct:]])+\.[a-z]{2,}(?::[0-9]+)?/i
-    REGEXEN[:valid_url_path_chars] = /[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_\.,~]/i
+    REGEXEN[:valid_preceding_chars] = /(?:[^\/"':!=]|^|\:)/
+    REGEXEN[:valid_domain] = /(?:[\.-]|[^[:punct:]\s])+\.[a-z]{2,}(?::[0-9]+)?/i
+    REGEXEN[:valid_url_path_chars] = /[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_\.,~@]/i
     # Valid end-of-path chracters (so /foo. does not gobble the period).
     #   1. Allow ) for Wikipedia URLs.
     #   2. Allow =&# for empty URL parameters and other URL-join artifacts
@@ -53,7 +54,7 @@ module Twitter
     REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#]/i
     REGEXEN[:valid_url] = %r{
       (                                                                                     #   $1 total match
-        (#{REGEXEN[:valid_preceeding_chars]})                                               #   $2 Preceeding chracter
+        (#{REGEXEN[:valid_preceding_chars]})                                                #   $2 Preceeding chracter
         (                                                                                   #   $3 URL
           (https?:\/\/|www\.)                                                               #   $4 Protocol or beginning
           (#{REGEXEN[:valid_domain]})                                                       #   $5 Domain(s) and optional post number

data/lib/twitter-text.rb CHANGED Viewed

@@ -1,4 +1,3 @@
 raise("twitter-text requires the $KCODE variable be set to 'UTF8' or 'u'") unless ['u','UTF8'].include?($KCODE)
 require 'rubygems'
@@ -10,4 +9,4 @@ require File.join(File.dirname(__FILE__), 'regex')
 require File.join(File.dirname(__FILE__), 'autolink')
 require File.join(File.dirname(__FILE__), 'extractor')
 require File.join(File.dirname(__FILE__), 'unicode')
-require File.join(File.dirname(__FILE__), 'validation')
+require File.join(File.dirname(__FILE__), 'validation')

data/lib/unicode.rb CHANGED Viewed

@@ -1,4 +1,3 @@
 module Twitter
   # This module lazily defines constants of the form Uxxxx for all Unicode
   # codepoints from U0000 to U10FFFF. The value of each constant is the
@@ -24,4 +23,4 @@ module Twitter
     end
   end
-end
+end

data/lib/validation.rb CHANGED Viewed

@@ -1,4 +1,3 @@
 module Twitter
   module Validation
     MAX_LENGTH = 140
@@ -48,4 +47,4 @@ module Twitter
       return false
     end
   end
-end
+end

data/spec/extractor_spec.rb CHANGED Viewed

@@ -59,21 +59,21 @@ describe Twitter::Extractor do
         @extractor.extract_reply_screen_name("@alice reply text").should == "alice"
       end
-      it "should extract preceeded by a space" do
+      it "should extract preceded by a space" do
         @extractor.extract_reply_screen_name(" @alice reply text").should == "alice"
       end
-      it "should extract preceeded by a full-width space" do
+      it "should extract preceded by a full-width space" do
         @extractor.extract_reply_screen_name("#{[0x3000].pack('U')}@alice reply text").should == "alice"
       end
     end
     context "should not be extracted from" do
-      it "should not be extracted when preceeded by text" do
+      it "should not be extracted when preceded by text" do
         @extractor.extract_reply_screen_name("reply @alice text").should == nil
       end
-      it "should not be extracted when preceeded by puctuation" do
+      it "should not be extracted when preceded by puctuation" do
         %w(. / _ - + # ! @).each do |punct|
           @extractor.extract_reply_screen_name("#{punct}@alice text").should == nil
         end
@@ -99,39 +99,21 @@ describe Twitter::Extractor do
   describe "urls" do
     describe "matching URLS" do
-      @urls = [
-        "http://google.com",
-        "http://foobar.com/#",
-        "http://google.com/#foo",
-        "http://google.com/#search?q=iphone%20-filter%3Alinks",
-        "http://twitter.com/#search?q=iphone%20-filter%3Alinks",
-        "http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html",
-        "http://somehost.com:3000",
-        "http://x.com/~matthew+%-x",
-        "http://en.wikipedia.org/wiki/Primer_(film)",
-        "http://www.ams.org/bookstore-getitem/item=mbk-59",
-        "http://chilp.it/?77e8fd",
-      ]
-      @urls.each do |url|
-        it "should extract the URL #{url}" do
-          @extractor.extract_urls(url).should == [url]
+      TestUrls::VALID.each do |url|
+        it "should extract the URL #{url} and prefix it with a protocol if missing" do
+          @extractor.extract_urls(url).first.should include(url)
         end
         it "should match the URL #{url} when it's embedded in other text" do
           text = "Sweet url: #{url} I found. #awesome"
-          @extractor.extract_urls(text).should == [url]
+          @extractor.extract_urls(text).first.should include(url)
         end
       end
     end
     describe "invalid URLS" do
-     it "does not link urls with invalid_domains" do
-        [ "http://doman-dash_2314352345_dfasd.foo-cow_4352.com",
-          "http://no-tld",
-          "http://tld-too-short.x",
-          "http://doman-dash_2314352345_dfasd.foo-cow_4352.com",
-        ].each {|url| @extractor.extract_urls(url).should == [] }
+      it "does not link urls with invalid domains" do
+        @extractor.extract_urls("http://tld-too-short.x").should == []
       end
     end
   end
@@ -150,7 +132,6 @@ describe Twitter::Extractor do
     end
     context "international hashtags" do
       context "should allow accents" do
         %w(mañana café münchen).each do |hashtag|
           it "should extract ##{hashtag}" do

data/spec/regex_spec.rb CHANGED Viewed

@@ -2,27 +2,7 @@ require File.dirname(__FILE__) + '/spec_helper'
 describe "Twitter::Regex regular expressions" do
   describe "matching URLS" do
-    @urls = [
-      "http://google.com",
-      "http://foobar.com/#",
-      "http://google.com/#foo",
-      "http://google.com/#search?q=iphone%20-filter%3Alinks",
-      "http://twitter.com/#search?q=iphone%20-filter%3Alinks",
-      "http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html",
-      "http://somehost.com:3000",
-      "http://x.com/~matthew+%-x",
-      "http://en.wikipedia.org/wiki/Primer_(film)",
-      "http://www.ams.org/bookstore-getitem/item=mbk-59",
-      "http://chilp.it/?77e8fd",
-      "www.foobar.com",
-      "WWW.FOOBAR.COM",
-      "http://tell.me/why",
-      "http://longtlds.mobi",
-      "http://✪df.ws/ejp",
-      "http://日本.com"
-    ]
-    @urls.each do |url|
+    TestUrls::VALID.each do |url|
       it "should match the URL #{url}" do
         url.should match_autolink_expression
       end
@@ -36,19 +16,8 @@ describe "Twitter::Regex regular expressions" do
   describe "invalid URLS" do
     it "does not link urls with invalid characters" do
-      [ "http://doman-dash_2314352345_dfasd.foo-cow_4352.com",
-        "http://no-tld",
-        "http://tld-too-short.x",
-        "http://x.com/,,,/.../@@@/;;;/:::/---/%%%x",
-        "http://doman_dash_2314352345_dfasd.foo-cow_4352.com",
-      ].each {|url| url.should_not have_autolinked_url(url)}
-    end
-    it "does not link domains beginning with a hypen" do
-      pending
-      "http://-doman_dash_2314352345_dfasd.com".should_not match_autolink_expression
+      TestUrls::INVALID.each {|url| url.should_not have_autolinked_url(url)}
     end
   end
 end

data/spec/spec_helper.rb CHANGED Viewed

@@ -3,6 +3,11 @@ $:.push File.join(File.dirname(__FILE__), '..', 'lib')
 require 'twitter-text'
 require 'hpricot'
+require 'spec/test_urls'
+Spec::Runner.configure do |config|
+  config.include TestUrls
+end
 Spec::Matchers.define :match_autolink_expression do
   match do |string|
@@ -81,6 +86,10 @@ Spec::Matchers.define :have_autolinked_hashtag do |hashtag|
   end
   failure_message_for_should do |text|
-    "Expected hashtag #{hashtag} to be autolinked in '#{text}'"
+    if @link
+      "Expected link text to be #{hashtag}, but it was #{@link.inner_text}"
+    else
+      "Expected hashtag #{hashtag} to be autolinked in '#{text}', but no link was found."
+    end
   end
 end

data/spec/test_urls.rb ADDED Viewed

@@ -0,0 +1,30 @@
+module TestUrls
+  VALID = [
+    "http://google.com",
+    "http://foobar.com/#",
+    "http://google.com/#foo",
+    "http://google.com/#search?q=iphone%20-filter%3Alinks",
+    "http://twitter.com/#search?q=iphone%20-filter%3Alinks",
+    "http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html",
+    "http://somehost.com:3000",
+    "http://x.com/~matthew+%-x",
+    "http://en.wikipedia.org/wiki/Primer_(film)",
+    "http://www.ams.org/bookstore-getitem/item=mbk-59",
+    "http://chilp.it/?77e8fd",
+    "www.foobar.com",
+    "WWW.FOOBAR.COM",
+    "http://tell.me/why",
+    "http://longtlds.info",
+    "http://✪df.ws/ejp",
+    "http://日本.com"
+  ]
+  INVALID = [
+    "http://no-tld",
+    "http://tld-too-short.x",
+    "http://x.com/,,,/.../@@@/;;;/:::/---/%%%x",
+    "http://domain-dash.com",
+    "http://-doman_dash.com"
+  ]
+end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: twitter-text
 version: !ruby/object:Gem::Version
-  version: 1.0.1
+  version: 1.0.2
 platform: ruby
 authors:
 - Matt Sanford
@@ -9,7 +9,7 @@ autorequire: ""
 bindir: bin
 cert_chain: []
-date: 2010-02-10 00:00:00 -08:00
+date: 2010-03-05 00:00:00 -08:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -45,6 +45,7 @@ files:
 - spec/extractor_spec.rb
 - spec/regex_spec.rb
 - spec/spec_helper.rb
+- spec/test_urls.rb
 - spec/unicode_spec.rb
 - spec/validation_spec.rb
 has_rdoc: true