RubyGems - twitter-text - Versions diffs - 1.4.11 → 1.4.12 - Mend

twitter-text 1.4.11 → 1.4.12

This diff shows the differences between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.

Files changed (17)

data/.gemtest ADDED Viewed

File without changes

data/.gitignore CHANGED Viewed

@@ -1,6 +1,40 @@
 *.gem
 *.rbc
+*.sw[a-p]
+*.tmproj
+*.tmproject
+*.un~
+*~
+.DS_Store
+.Spotlight-V100
+.Trashes
+._*
 .bundle
-pkg/*
+.config
+.directory
+.elc
+.emacs.desktop
+.emacs.desktop.lock
+.redcar
+.yardoc
+Desktop.ini
+Gemfile.lock
+Icon?
+InstalledFiles
+Session.vim
+Thumbs.db
+\#*\#
+_yardoc
+auto-save-list
 coverage
 doc
+lib/bundler/man
+pkg
+pkg/*
+rdoc
+spec/reports
+test/tmp
+test/version_tmp
+tmp
+tmtags
+tramp

data/Rakefile CHANGED Viewed

@@ -1,7 +1,9 @@
+#!/usr/bin/env rake
 require 'bundler'
 Bundler::GemHelper.install_tasks
-task :default => ["spec", "test:conformance"]
+task :default => ['spec', 'test:conformance']
+task :test => :spec
 require 'rspec/core/rake_task'
 RSpec::Core::RakeTask.new(:spec)
@@ -48,9 +50,9 @@ namespace :test do
   end
 end
-require 'rake/rdoctask'
+require 'rdoc/task'
 namespace :doc do
-  Rake::RDocTask.new do |rd|
+  RDoc::Task.new do |rd|
     rd.main = "README.rdoc"
     rd.rdoc_dir = 'doc'
     rd.rdoc_files.include("README.rdoc", "lib/**/*.rb")

data/lib/extractor.rb CHANGED Viewed

@@ -158,11 +158,21 @@ module Twitter
       text.to_s.scan(Twitter::Regex[:valid_url]) do |all, before, url, protocol, domain, port, path, query|
         valid_url_match_data = $~
+        start_position = valid_url_match_data.char_begin(3)
+        end_position = valid_url_match_data.char_end(3)
+        # If protocol is missing, check against valid_ascii_domain
+        if !protocol
+          next unless domain =~ Twitter::Regex[:valid_ascii_domain]
+          if $~.char_begin(0)
+            start_position += $~.char_begin(0)
+            url.sub!(domain, $~.to_s())
+          end
+        end
         # Regex in Ruby 1.8 doesn't support lookbehind, so we need to manually filter out
         # the short URLs without protocol and path, i.e., [domain].[ccTLD]
         unless !protocol && !path && domain =~ Twitter::Regex[:valid_short_domain]
-          start_position = valid_url_match_data.char_begin(3)
-          end_position = valid_url_match_data.char_end(3)
           urls << {
             :url => url,
             :indices => [start_position, end_position]

data/lib/regex.rb CHANGED Viewed

@@ -39,9 +39,16 @@ module Twitter
           0x202F,          # White_Space # Zs       NARROW NO-BREAK SPACE
           0x205F,          # White_Space # Zs       MEDIUM MATHEMATICAL SPACE
           0x3000,          # White_Space # Zs       IDEOGRAPHIC SPACE
-        ].flatten.freeze
-    SPACE_CHAR_CLASS_VALUE = Regexp.new(UNICODE_SPACES.collect{ |e| [e].pack 'U*' }.join(''))
-    REGEXEN[:spaces] = Regexp.new(UNICODE_SPACES.collect{ |e| [e].pack 'U*' }.join('|'))
+    ].flatten.map{|c| [c].pack('U*')}.freeze
+    REGEXEN[:spaces] = /[#{UNICODE_SPACES.join('')}]/o
+    # Character not allowed in Tweets
+    INVALID_CHARACTERS = [
+      0xFFFE, 0xFEFF, # BOM
+      0xFFFF,         # Special
+      0x202A, 0x202B, 0x202C, 0x202D, 0x202E # Directional change
+    ].map{|cp| [cp].pack('U') }.freeze
+    REGEXEN[:invalid_control_characters] = /[#{INVALID_CHARACTERS.join('')}]/o
     REGEXEN[:at_signs] = /[@＠]/
     REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(?=(.|$))/o
@@ -97,7 +104,7 @@ module Twitter
       regex_range(0x2F800, 0x2FA1F), regex_range(0x3005), regex_range(0x303B) # Kanji (CJK supplement)
     ].join('').freeze
-    HASHTAG_BOUNDARY = /(?:\A|\z|#{REGEXEN[:spaces]}|[「」。、.,!?！？:;"'])/
+    HASHTAG_BOUNDARY = /(?:\A|\z|#{REGEXEN[:spaces]}|[「」。、.,!?！？:;"'])/o
     # A hashtag must contain latin characters, numbers and underscores, but not all numbers.
     HASHTAG_ALPHA = /[a-z_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/io
@@ -111,11 +118,11 @@ module Twitter
     REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\&lt;\|:~\(|\}:o\{|:\-\[|\&gt;o\&lt;|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/
     # URL related hash regex collection
-    REGEXEN[:valid_preceding_chars] = /(?:[^-\/"'!=A-Z0-9_@＠\.]|^)/i
+    REGEXEN[:valid_preceding_chars] = /(?:[^-\/"'!=A-Z0-9_@＠\.#{INVALID_CHARACTERS.join('')}]|^)/io
-    DOMAIN_VALID_CHARS = "[^[:punct:][:space:][:blank:]#{[0x00A0].pack('U')}]"
-    REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/i
-    REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/i
+    DOMAIN_VALID_CHARS = "[^[:punct:][:space:][:blank:][:cntrl:]#{INVALID_CHARACTERS.join('')}#{UNICODE_SPACES.join('')}]"
+    REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
+    REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
     REGEXEN[:valid_gTLD] = /(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel)(?=[^[:alpha:]]|$))/i
     REGEXEN[:valid_ccTLD] = %r{
@@ -134,26 +141,36 @@ module Twitter
     REGEXEN[:valid_domain] = /(?:
       #{REGEXEN[:valid_subdomain]}*#{REGEXEN[:valid_domain_name]}
       (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
-    )/ix
-    REGEXEN[:valid_short_domain] = /^#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_ccTLD]}$/
+    )/iox
+    # This is used in Extractor
+    REGEXEN[:valid_ascii_domain] = /
+      (?:(?:[[:alnum:]\-_]|#{REGEXEN[:latin_accents]})+\.)+
+      (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
+    /iox
+    # This is used in Extractor to filter out unwanted URLs.
+    REGEXEN[:valid_short_domain] = /^#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_ccTLD]}$/io
     REGEXEN[:valid_port_number] = /[0-9]+/
-    REGEXEN[:valid_general_url_path_chars] = /[a-z0-9!\*';:=\+\,\$\/%#\[\]\-_~&|#{LATIN_ACCENTS}]/i
+    REGEXEN[:valid_general_url_path_chars] = /[a-z0-9!\*';:=\+\,\.\$\/%#\[\]\-_~&|#{LATIN_ACCENTS}]/io
     # Allow URL paths to contain balanced parens
     #  1. Used in Wikipedia URLs like /Primer_(film)
     #  2. Used in IIS sessions like /S(dfd346)/
-    REGEXEN[:wikipedia_disambiguation] = /(?:\(#{REGEXEN[:valid_general_url_path_chars]}+\))/i
-    # Allow @ in a url, but only in the middle. Catch things like http://example.com/@user
-    REGEXEN[:valid_url_path_chars] = /(?:
-      #{REGEXEN[:wikipedia_disambiguation]}|
-      @#{REGEXEN[:valid_general_url_path_chars]}+\/|
-      [\.,]#{REGEXEN[:valid_general_url_path_chars]}?|
-      #{REGEXEN[:valid_general_url_path_chars]}+
-    )/ix
+    REGEXEN[:valid_url_balanced_parens] = /\(#{REGEXEN[:valid_general_url_path_chars]}+\)/io
     # Valid end-of-path chracters (so /foo. does not gobble the period).
     #   1. Allow =&# for empty URL parameters and other URL-join artifacts
-    REGEXEN[:valid_url_path_ending_chars] = /[a-z0-9=_#\/\+\-#{LATIN_ACCENTS}]|#{REGEXEN[:wikipedia_disambiguation]}/io
+    REGEXEN[:valid_url_path_ending_chars] = /[a-z0-9=_#\/\+\-#{LATIN_ACCENTS}]|(?:#{REGEXEN[:valid_url_balanced_parens]})/io
+    # Allow @ in a url, but only in the middle. Catch things like http://example.com/@user/
+    REGEXEN[:valid_url_path] = /(?:
+      (?:
+        #{REGEXEN[:valid_general_url_path_chars]}*
+        (?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)*
+        #{REGEXEN[:valid_url_path_ending_chars]}
+      )|(?:@#{REGEXEN[:valid_general_url_path_chars]}+\/)
+    )/iox
     REGEXEN[:valid_url_query_chars] = /[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|]/i
     REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/]/i
     REGEXEN[:valid_url] = %r{
@@ -163,13 +180,7 @@ module Twitter
           (https?:\/\/)?                                                                    #   $4 Protocol (optional)
           (#{REGEXEN[:valid_domain]})                                                       #   $5 Domain(s)
           (?::(#{REGEXEN[:valid_port_number]}))?                                            #   $6 Port number (optional)
-          (/
-            (?:
-              #{REGEXEN[:valid_url_path_chars]}+#{REGEXEN[:valid_url_path_ending_chars]}|   # 1+ path chars and a valid last char
-              #{REGEXEN[:valid_url_path_chars]}+#{REGEXEN[:valid_url_path_ending_chars]}?|  # Optional last char to handle /@foo/ case
-              #{REGEXEN[:valid_url_path_ending_chars]}                                      # Just a # case
-            )?
-          )?                                                                                #   $7 URL Path and anchor
+          (/#{REGEXEN[:valid_url_path]}*)?                                                  #   $7 URL Path and anchor
           (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? #   $8 Query String
         )
       )

data/lib/validation.rb CHANGED Viewed

@@ -2,13 +2,6 @@ module Twitter
   module Validation extend self
     MAX_LENGTH = 140
-    # Character not allowed in Tweets
-    INVALID_CHARACTERS = [
-      0xFFFE, 0xFEFF, # BOM
-      0xFFFF,         # Special
-      0x202A, 0x202B, 0x202C, 0x202D, 0x202E # Directional change
-    ].map{|cp| [cp].pack('U') }.freeze
     # Returns the length of the string as it would be displayed. This is equivilent to the length of the Unicode NFC
     # (See: http://www.unicode.org/reports/tr15). This is needed in order to consistently calculate the length of a
     # string no matter which actual form was transmitted. For example:
@@ -38,7 +31,7 @@ module Twitter
       return :empty if !text || text.empty?
       begin
         return :too_long if tweet_length(text) > MAX_LENGTH
-        return :invalid_characters if INVALID_CHARACTERS.any?{|invalid_char| text.include?(invalid_char) }
+        return :invalid_characters if Twitter::Regex::INVALID_CHARACTERS.any?{|invalid_char| text.include?(invalid_char) }
       rescue ArgumentError, ActiveSupport::Multibyte::EncodingError => e
         # non-Unicode value.
         return :invalid_characters

data/spec/autolinking_spec.rb CHANGED Viewed

@@ -1,6 +1,5 @@
-#encoding: UTF-8
-# require File.dirname(__FILE__) + '/spec_helper'
-require 'spec_helper'
+# encoding: utf-8
+require File.dirname(__FILE__) + '/spec_helper'
 class TestAutolink
   include Twitter::Autolink

data/spec/extractor_spec.rb CHANGED Viewed

@@ -1,4 +1,4 @@
-#encoding: UTF-8
+# encoding: utf-8
 require File.dirname(__FILE__) + '/spec_helper'
 class TestExtractor

data/spec/hithighlighter_spec.rb CHANGED Viewed

@@ -1,4 +1,4 @@
-#encoding: UTF-8
+# encoding: utf-8
 require File.dirname(__FILE__) + '/spec_helper'
 class TestHitHighlighter

data/spec/rewriter_spec.rb CHANGED Viewed

@@ -1,6 +1,5 @@
-# encoding: UTF-8
-require 'spec_helper'
+# encoding: utf-8
+require File.dirname(__FILE__) + '/spec_helper'
 describe Twitter::Rewriter do
   def original_text; end

data/spec/test_urls.rb CHANGED Viewed

@@ -1,4 +1,5 @@
-#encoding: UTF-8
+# encoding: utf-8
 module TestUrls
   VALID = [
     "http://google.com",
@@ -26,6 +27,7 @@ module TestUrls
     "http://a_b.c-d.com",
     "http://a-b.b.com",
     "http://twitter-dash.com",
+    "http://msdn.microsoft.com/ja-jp/library/system.net.httpwebrequest(v=VS.100).aspx",
     "www.foobar.com",
     "WWW.FOOBAR.COM",
     "www.foobar.co.jp",
@@ -45,7 +47,11 @@ module TestUrls
     "http://trailingdash-.com",
     "http://no_underscores.com",
     "http://test.c_o_m",
-    "http://test.c-o-m"
+    "http://test.c-o-m",
+    "http://twitt#{[0x202A].pack('U')}er.com",
+    "http://twitt#{[0x202B].pack('U')}er.com",
+    "http://twitt#{[0x202C].pack('U')}er.com",
+    "http://twitt#{[0x202D].pack('U')}er.com",
+    "http://twitt#{[0x202E].pack('U')}er.com",
   ] unless defined?(TestUrls::INVALID)
 end

data/spec/twitter_text_spec.rb CHANGED Viewed

@@ -1,3 +1,4 @@
+# encoding: utf-8
 require File.dirname(__FILE__) + '/spec_helper'
 major, minor, patch = RUBY_VERSION.split('.')

data/spec/unicode_spec.rb CHANGED Viewed

@@ -1,4 +1,4 @@
-#encoding: UTF-8
+# encoding: utf-8
 require File.dirname(__FILE__) + '/spec_helper'
 describe Twitter::Unicode do
@@ -28,4 +28,4 @@ describe Twitter::Unicode do
     lambda { Twitter::Unicode::FFFFFF }.should raise_error(NameError)
   end
-end
+end

data/spec/validation_spec.rb CHANGED Viewed

@@ -1,4 +1,4 @@
-#encoding: BINARY
+# encoding: binary
 require File.dirname(__FILE__) + '/spec_helper'
 class TestValidation
@@ -40,4 +40,4 @@ describe Twitter::Validation do
     TestValidation.new.tweet_invalid?(char * 141).should == :too_long
   end
-end
+end

data/twitter-text.gemspec CHANGED Viewed

@@ -1,6 +1,8 @@
-spec = Gem::Specification.new do |s|
+# encoding: utf-8
+Gem::Specification.new do |s|
   s.name = "twitter-text"
-  s.version = "1.4.11"
+  s.version = "1.4.12"
   s.authors = ["Matt Sanford", "Patrick Ewing", "Ben Cherry", "Britt Selvitelle",
                "Raffi Krikorian", "J.P. Cummins", "Yoshimasa Niwa", "Keita Fujii"]
   s.email = ["matt@twitter.com", "patrick.henry.ewing@gmail.com", "bcherry@gmail.com", "bs@brittspace.com",
@@ -14,6 +16,7 @@ spec = Gem::Specification.new do |s|
   s.add_development_dependency "nokogiri"
   s.add_development_dependency "rake"
+  s.add_development_dependency "rdoc"
   s.add_development_dependency "rspec"
   s.add_development_dependency "simplecov"
   s.add_runtime_dependency "activesupport"

metadata CHANGED Viewed

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: twitter-text
 version: !ruby/object:Gem::Version
-  hash: 17
+  hash: 31
   prerelease:
   segments:
   - 1
   - 4
-  - 11
-  version: 1.4.11
+  - 12
+  version: 1.4.12
 platform: ruby
 authors:
 - Matt Sanford
@@ -22,7 +22,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-09-26 00:00:00 -07:00
+date: 2011-10-04 00:00:00 -07:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -54,7 +54,7 @@ dependencies:
   type: :development
   version_requirements: *id002
 - !ruby/object:Gem::Dependency
-  name: rspec
+  name: rdoc
   prerelease: false
   requirement: &id003 !ruby/object:Gem::Requirement
     none: false
@@ -68,7 +68,7 @@ dependencies:
   type: :development
   version_requirements: *id003
 - !ruby/object:Gem::Dependency
-  name: simplecov
+  name: rspec
   prerelease: false
   requirement: &id004 !ruby/object:Gem::Requirement
     none: false
@@ -82,7 +82,7 @@ dependencies:
   type: :development
   version_requirements: *id004
 - !ruby/object:Gem::Dependency
-  name: activesupport
+  name: simplecov
   prerelease: false
   requirement: &id005 !ruby/object:Gem::Requirement
     none: false
@@ -93,8 +93,22 @@ dependencies:
         segments:
         - 0
         version: "0"
-  type: :runtime
+  type: :development
   version_requirements: *id005
+- !ruby/object:Gem::Dependency
+  name: activesupport
+  prerelease: false
+  requirement: &id006 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 0
+        version: "0"
+  type: :runtime
+  version_requirements: *id006
 description: A gem that provides text handling for Twitter
 email:
 - matt@twitter.com
@@ -112,11 +126,11 @@ extensions: []
 extra_rdoc_files: []
 files:
+- .gemtest
 - .gitignore
 - .gitmodules
 - .rspec
 - Gemfile
-- Gemfile.lock
 - LICENSE
 - README.rdoc
 - Rakefile

data/Gemfile.lock DELETED Viewed

@@ -1,40 +0,0 @@
-PATH
-  remote: .
-  specs:
-    twitter-text (1.4.11)
-      activesupport
-GEM
-  remote: http://rubygems.org/
-  specs:
-    activesupport (3.1.0)
-      multi_json (~> 1.0)
-    diff-lcs (1.1.2)
-    multi_json (1.0.4)
-    nokogiri (1.4.4)
-    nokogiri (1.4.4-java)
-      weakling (>= 0.0.3)
-    rake (0.8.7)
-    rspec (2.3.0)
-      rspec-core (~> 2.3.0)
-      rspec-expectations (~> 2.3.0)
-      rspec-mocks (~> 2.3.0)
-    rspec-core (2.3.1)
-    rspec-expectations (2.3.0)
-      diff-lcs (~> 1.1.2)
-    rspec-mocks (2.3.0)
-    simplecov (0.3.7)
-      simplecov-html (>= 0.3.7)
-    simplecov-html (0.3.9)
-    weakling (0.0.4-java)
-PLATFORMS
-  java
-  ruby
-DEPENDENCIES
-  nokogiri
-  rake
-  rspec
-  simplecov
-  twitter-text!