twitter-text 1.4.11 → 1.4.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gemtest ADDED
File without changes
data/.gitignore CHANGED
@@ -1,6 +1,40 @@
1
1
  *.gem
2
2
  *.rbc
3
+ *.sw[a-p]
4
+ *.tmproj
5
+ *.tmproject
6
+ *.un~
7
+ *~
8
+ .DS_Store
9
+ .Spotlight-V100
10
+ .Trashes
11
+ ._*
3
12
  .bundle
4
- pkg/*
13
+ .config
14
+ .directory
15
+ .elc
16
+ .emacs.desktop
17
+ .emacs.desktop.lock
18
+ .redcar
19
+ .yardoc
20
+ Desktop.ini
21
+ Gemfile.lock
22
+ Icon?
23
+ InstalledFiles
24
+ Session.vim
25
+ Thumbs.db
26
+ \#*\#
27
+ _yardoc
28
+ auto-save-list
5
29
  coverage
6
30
  doc
31
+ lib/bundler/man
32
+ pkg
33
+ pkg/*
34
+ rdoc
35
+ spec/reports
36
+ test/tmp
37
+ test/version_tmp
38
+ tmp
39
+ tmtags
40
+ tramp
data/Rakefile CHANGED
@@ -1,7 +1,9 @@
1
+ #!/usr/bin/env rake
1
2
  require 'bundler'
2
3
  Bundler::GemHelper.install_tasks
3
4
 
4
- task :default => ["spec", "test:conformance"]
5
+ task :default => ['spec', 'test:conformance']
6
+ task :test => :spec
5
7
 
6
8
  require 'rspec/core/rake_task'
7
9
  RSpec::Core::RakeTask.new(:spec)
@@ -48,9 +50,9 @@ namespace :test do
48
50
  end
49
51
  end
50
52
 
51
- require 'rake/rdoctask'
53
+ require 'rdoc/task'
52
54
  namespace :doc do
53
- Rake::RDocTask.new do |rd|
55
+ RDoc::Task.new do |rd|
54
56
  rd.main = "README.rdoc"
55
57
  rd.rdoc_dir = 'doc'
56
58
  rd.rdoc_files.include("README.rdoc", "lib/**/*.rb")
data/lib/extractor.rb CHANGED
@@ -158,11 +158,21 @@ module Twitter
158
158
  text.to_s.scan(Twitter::Regex[:valid_url]) do |all, before, url, protocol, domain, port, path, query|
159
159
  valid_url_match_data = $~
160
160
 
161
+ start_position = valid_url_match_data.char_begin(3)
162
+ end_position = valid_url_match_data.char_end(3)
163
+
164
+ # If protocol is missing, check against valid_ascii_domain
165
+ if !protocol
166
+ next unless domain =~ Twitter::Regex[:valid_ascii_domain]
167
+ if $~.char_begin(0)
168
+ start_position += $~.char_begin(0)
169
+ url.sub!(domain, $~.to_s())
170
+ end
171
+ end
172
+
161
173
  # Regex in Ruby 1.8 doesn't support lookbehind, so we need to manually filter out
162
174
  # the short URLs without protocol and path, i.e., [domain].[ccTLD]
163
175
  unless !protocol && !path && domain =~ Twitter::Regex[:valid_short_domain]
164
- start_position = valid_url_match_data.char_begin(3)
165
- end_position = valid_url_match_data.char_end(3)
166
176
  urls << {
167
177
  :url => url,
168
178
  :indices => [start_position, end_position]
data/lib/regex.rb CHANGED
@@ -39,9 +39,16 @@ module Twitter
39
39
  0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
40
40
  0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
41
41
  0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
42
- ].flatten.freeze
43
- SPACE_CHAR_CLASS_VALUE = Regexp.new(UNICODE_SPACES.collect{ |e| [e].pack 'U*' }.join(''))
44
- REGEXEN[:spaces] = Regexp.new(UNICODE_SPACES.collect{ |e| [e].pack 'U*' }.join('|'))
42
+ ].flatten.map{|c| [c].pack('U*')}.freeze
43
+ REGEXEN[:spaces] = /[#{UNICODE_SPACES.join('')}]/o
44
+
45
+ # Character not allowed in Tweets
46
+ INVALID_CHARACTERS = [
47
+ 0xFFFE, 0xFEFF, # BOM
48
+ 0xFFFF, # Special
49
+ 0x202A, 0x202B, 0x202C, 0x202D, 0x202E # Directional change
50
+ ].map{|cp| [cp].pack('U') }.freeze
51
+ REGEXEN[:invalid_control_characters] = /[#{INVALID_CHARACTERS.join('')}]/o
45
52
 
46
53
  REGEXEN[:at_signs] = /[@@]/
47
54
  REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(?=(.|$))/o
@@ -97,7 +104,7 @@ module Twitter
97
104
  regex_range(0x2F800, 0x2FA1F), regex_range(0x3005), regex_range(0x303B) # Kanji (CJK supplement)
98
105
  ].join('').freeze
99
106
 
100
- HASHTAG_BOUNDARY = /(?:\A|\z|#{REGEXEN[:spaces]}|[「」。、.,!?!?:;"'])/
107
+ HASHTAG_BOUNDARY = /(?:\A|\z|#{REGEXEN[:spaces]}|[「」。、.,!?!?:;"'])/o
101
108
 
102
109
  # A hashtag must contain latin characters, numbers and underscores, but not all numbers.
103
110
  HASHTAG_ALPHA = /[a-z_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/io
@@ -111,11 +118,11 @@ module Twitter
111
118
  REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\&lt;\|:~\(|\}:o\{|:\-\[|\&gt;o\&lt;|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/
112
119
 
113
120
  # URL related hash regex collection
114
- REGEXEN[:valid_preceding_chars] = /(?:[^-\/"'!=A-Z0-9_@@\.]|^)/i
121
+ REGEXEN[:valid_preceding_chars] = /(?:[^-\/"'!=A-Z0-9_@@\.#{INVALID_CHARACTERS.join('')}]|^)/io
115
122
 
116
- DOMAIN_VALID_CHARS = "[^[:punct:][:space:][:blank:]#{[0x00A0].pack('U')}]"
117
- REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/i
118
- REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/i
123
+ DOMAIN_VALID_CHARS = "[^[:punct:][:space:][:blank:][:cntrl:]#{INVALID_CHARACTERS.join('')}#{UNICODE_SPACES.join('')}]"
124
+ REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
125
+ REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
119
126
 
120
127
  REGEXEN[:valid_gTLD] = /(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel)(?=[^[:alpha:]]|$))/i
121
128
  REGEXEN[:valid_ccTLD] = %r{
@@ -134,26 +141,36 @@ module Twitter
134
141
  REGEXEN[:valid_domain] = /(?:
135
142
  #{REGEXEN[:valid_subdomain]}*#{REGEXEN[:valid_domain_name]}
136
143
  (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
137
- )/ix
138
- REGEXEN[:valid_short_domain] = /^#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_ccTLD]}$/
144
+ )/iox
145
+
146
+ # This is used in Extractor
147
+ REGEXEN[:valid_ascii_domain] = /
148
+ (?:(?:[[:alnum:]\-_]|#{REGEXEN[:latin_accents]})+\.)+
149
+ (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
150
+ /iox
151
+
152
+ # This is used in Extractor to filter out unwanted URLs.
153
+ REGEXEN[:valid_short_domain] = /^#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_ccTLD]}$/io
139
154
 
140
155
  REGEXEN[:valid_port_number] = /[0-9]+/
141
156
 
142
- REGEXEN[:valid_general_url_path_chars] = /[a-z0-9!\*';:=\+\,\$\/%#\[\]\-_~&|#{LATIN_ACCENTS}]/i
157
+ REGEXEN[:valid_general_url_path_chars] = /[a-z0-9!\*';:=\+\,\.\$\/%#\[\]\-_~&|#{LATIN_ACCENTS}]/io
143
158
  # Allow URL paths to contain balanced parens
144
159
  # 1. Used in Wikipedia URLs like /Primer_(film)
145
160
  # 2. Used in IIS sessions like /S(dfd346)/
146
- REGEXEN[:wikipedia_disambiguation] = /(?:\(#{REGEXEN[:valid_general_url_path_chars]}+\))/i
147
- # Allow @ in a url, but only in the middle. Catch things like http://example.com/@user
148
- REGEXEN[:valid_url_path_chars] = /(?:
149
- #{REGEXEN[:wikipedia_disambiguation]}|
150
- @#{REGEXEN[:valid_general_url_path_chars]}+\/|
151
- [\.,]#{REGEXEN[:valid_general_url_path_chars]}?|
152
- #{REGEXEN[:valid_general_url_path_chars]}+
153
- )/ix
161
+ REGEXEN[:valid_url_balanced_parens] = /\(#{REGEXEN[:valid_general_url_path_chars]}+\)/io
154
162
  # Valid end-of-path characters (so /foo. does not gobble the period).
155
163
  # 1. Allow =&# for empty URL parameters and other URL-join artifacts
156
- REGEXEN[:valid_url_path_ending_chars] = /[a-z0-9=_#\/\+\-#{LATIN_ACCENTS}]|#{REGEXEN[:wikipedia_disambiguation]}/io
164
+ REGEXEN[:valid_url_path_ending_chars] = /[a-z0-9=_#\/\+\-#{LATIN_ACCENTS}]|(?:#{REGEXEN[:valid_url_balanced_parens]})/io
165
+ # Allow @ in a url, but only in the middle. Catch things like http://example.com/@user/
166
+ REGEXEN[:valid_url_path] = /(?:
167
+ (?:
168
+ #{REGEXEN[:valid_general_url_path_chars]}*
169
+ (?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)*
170
+ #{REGEXEN[:valid_url_path_ending_chars]}
171
+ )|(?:@#{REGEXEN[:valid_general_url_path_chars]}+\/)
172
+ )/iox
173
+
157
174
  REGEXEN[:valid_url_query_chars] = /[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|]/i
158
175
  REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/]/i
159
176
  REGEXEN[:valid_url] = %r{
@@ -163,13 +180,7 @@ module Twitter
163
180
  (https?:\/\/)? # $4 Protocol (optional)
164
181
  (#{REGEXEN[:valid_domain]}) # $5 Domain(s)
165
182
  (?::(#{REGEXEN[:valid_port_number]}))? # $6 Port number (optional)
166
- (/
167
- (?:
168
- #{REGEXEN[:valid_url_path_chars]}+#{REGEXEN[:valid_url_path_ending_chars]}| # 1+ path chars and a valid last char
169
- #{REGEXEN[:valid_url_path_chars]}+#{REGEXEN[:valid_url_path_ending_chars]}?| # Optional last char to handle /@foo/ case
170
- #{REGEXEN[:valid_url_path_ending_chars]} # Just a # case
171
- )?
172
- )? # $7 URL Path and anchor
183
+ (/#{REGEXEN[:valid_url_path]}*)? # $7 URL Path and anchor
173
184
  (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # $8 Query String
174
185
  )
175
186
  )
data/lib/validation.rb CHANGED
@@ -2,13 +2,6 @@ module Twitter
2
2
  module Validation extend self
3
3
  MAX_LENGTH = 140
4
4
 
5
- # Character not allowed in Tweets
6
- INVALID_CHARACTERS = [
7
- 0xFFFE, 0xFEFF, # BOM
8
- 0xFFFF, # Special
9
- 0x202A, 0x202B, 0x202C, 0x202D, 0x202E # Directional change
10
- ].map{|cp| [cp].pack('U') }.freeze
11
-
12
5
  # Returns the length of the string as it would be displayed. This is equivalent to the length of the Unicode NFC
13
6
  # (See: http://www.unicode.org/reports/tr15). This is needed in order to consistently calculate the length of a
14
7
  # string no matter which actual form was transmitted. For example:
@@ -38,7 +31,7 @@ module Twitter
38
31
  return :empty if !text || text.empty?
39
32
  begin
40
33
  return :too_long if tweet_length(text) > MAX_LENGTH
41
- return :invalid_characters if INVALID_CHARACTERS.any?{|invalid_char| text.include?(invalid_char) }
34
+ return :invalid_characters if Twitter::Regex::INVALID_CHARACTERS.any?{|invalid_char| text.include?(invalid_char) }
42
35
  rescue ArgumentError, ActiveSupport::Multibyte::EncodingError => e
43
36
  # non-Unicode value.
44
37
  return :invalid_characters
@@ -1,6 +1,5 @@
1
- #encoding: UTF-8
2
- # require File.dirname(__FILE__) + '/spec_helper'
3
- require 'spec_helper'
1
+ # encoding: utf-8
2
+ require File.dirname(__FILE__) + '/spec_helper'
4
3
 
5
4
  class TestAutolink
6
5
  include Twitter::Autolink
@@ -1,4 +1,4 @@
1
- #encoding: UTF-8
1
+ # encoding: utf-8
2
2
  require File.dirname(__FILE__) + '/spec_helper'
3
3
 
4
4
  class TestExtractor
@@ -1,4 +1,4 @@
1
- #encoding: UTF-8
1
+ # encoding: utf-8
2
2
  require File.dirname(__FILE__) + '/spec_helper'
3
3
 
4
4
  class TestHitHighlighter
@@ -1,6 +1,5 @@
1
- # encoding: UTF-8
2
-
3
- require 'spec_helper'
1
+ # encoding: utf-8
2
+ require File.dirname(__FILE__) + '/spec_helper'
4
3
 
5
4
  describe Twitter::Rewriter do
6
5
  def original_text; end
data/spec/test_urls.rb CHANGED
@@ -1,4 +1,5 @@
1
- #encoding: UTF-8
1
+ # encoding: utf-8
2
+
2
3
  module TestUrls
3
4
  VALID = [
4
5
  "http://google.com",
@@ -26,6 +27,7 @@ module TestUrls
26
27
  "http://a_b.c-d.com",
27
28
  "http://a-b.b.com",
28
29
  "http://twitter-dash.com",
30
+ "http://msdn.microsoft.com/ja-jp/library/system.net.httpwebrequest(v=VS.100).aspx",
29
31
  "www.foobar.com",
30
32
  "WWW.FOOBAR.COM",
31
33
  "www.foobar.co.jp",
@@ -45,7 +47,11 @@ module TestUrls
45
47
  "http://trailingdash-.com",
46
48
  "http://no_underscores.com",
47
49
  "http://test.c_o_m",
48
- "http://test.c-o-m"
50
+ "http://test.c-o-m",
51
+ "http://twitt#{[0x202A].pack('U')}er.com",
52
+ "http://twitt#{[0x202B].pack('U')}er.com",
53
+ "http://twitt#{[0x202C].pack('U')}er.com",
54
+ "http://twitt#{[0x202D].pack('U')}er.com",
55
+ "http://twitt#{[0x202E].pack('U')}er.com",
49
56
  ] unless defined?(TestUrls::INVALID)
50
-
51
57
  end
@@ -1,3 +1,4 @@
1
+ # encoding: utf-8
1
2
  require File.dirname(__FILE__) + '/spec_helper'
2
3
 
3
4
  major, minor, patch = RUBY_VERSION.split('.')
data/spec/unicode_spec.rb CHANGED
@@ -1,4 +1,4 @@
1
- #encoding: UTF-8
1
+ # encoding: utf-8
2
2
  require File.dirname(__FILE__) + '/spec_helper'
3
3
 
4
4
  describe Twitter::Unicode do
@@ -28,4 +28,4 @@ describe Twitter::Unicode do
28
28
  lambda { Twitter::Unicode::FFFFFF }.should raise_error(NameError)
29
29
  end
30
30
 
31
- end
31
+ end
@@ -1,4 +1,4 @@
1
- #encoding: BINARY
1
+ # encoding: binary
2
2
  require File.dirname(__FILE__) + '/spec_helper'
3
3
 
4
4
  class TestValidation
@@ -40,4 +40,4 @@ describe Twitter::Validation do
40
40
  TestValidation.new.tweet_invalid?(char * 141).should == :too_long
41
41
  end
42
42
 
43
- end
43
+ end
data/twitter-text.gemspec CHANGED
@@ -1,6 +1,8 @@
1
- spec = Gem::Specification.new do |s|
1
+ # encoding: utf-8
2
+
3
+ Gem::Specification.new do |s|
2
4
  s.name = "twitter-text"
3
- s.version = "1.4.11"
5
+ s.version = "1.4.12"
4
6
  s.authors = ["Matt Sanford", "Patrick Ewing", "Ben Cherry", "Britt Selvitelle",
5
7
  "Raffi Krikorian", "J.P. Cummins", "Yoshimasa Niwa", "Keita Fujii"]
6
8
  s.email = ["matt@twitter.com", "patrick.henry.ewing@gmail.com", "bcherry@gmail.com", "bs@brittspace.com",
@@ -14,6 +16,7 @@ spec = Gem::Specification.new do |s|
14
16
 
15
17
  s.add_development_dependency "nokogiri"
16
18
  s.add_development_dependency "rake"
19
+ s.add_development_dependency "rdoc"
17
20
  s.add_development_dependency "rspec"
18
21
  s.add_development_dependency "simplecov"
19
22
  s.add_runtime_dependency "activesupport"
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: twitter-text
3
3
  version: !ruby/object:Gem::Version
4
- hash: 17
4
+ hash: 31
5
5
  prerelease:
6
6
  segments:
7
7
  - 1
8
8
  - 4
9
- - 11
10
- version: 1.4.11
9
+ - 12
10
+ version: 1.4.12
11
11
  platform: ruby
12
12
  authors:
13
13
  - Matt Sanford
@@ -22,7 +22,7 @@ autorequire:
22
22
  bindir: bin
23
23
  cert_chain: []
24
24
 
25
- date: 2011-09-26 00:00:00 -07:00
25
+ date: 2011-10-04 00:00:00 -07:00
26
26
  default_executable:
27
27
  dependencies:
28
28
  - !ruby/object:Gem::Dependency
@@ -54,7 +54,7 @@ dependencies:
54
54
  type: :development
55
55
  version_requirements: *id002
56
56
  - !ruby/object:Gem::Dependency
57
- name: rspec
57
+ name: rdoc
58
58
  prerelease: false
59
59
  requirement: &id003 !ruby/object:Gem::Requirement
60
60
  none: false
@@ -68,7 +68,7 @@ dependencies:
68
68
  type: :development
69
69
  version_requirements: *id003
70
70
  - !ruby/object:Gem::Dependency
71
- name: simplecov
71
+ name: rspec
72
72
  prerelease: false
73
73
  requirement: &id004 !ruby/object:Gem::Requirement
74
74
  none: false
@@ -82,7 +82,7 @@ dependencies:
82
82
  type: :development
83
83
  version_requirements: *id004
84
84
  - !ruby/object:Gem::Dependency
85
- name: activesupport
85
+ name: simplecov
86
86
  prerelease: false
87
87
  requirement: &id005 !ruby/object:Gem::Requirement
88
88
  none: false
@@ -93,8 +93,22 @@ dependencies:
93
93
  segments:
94
94
  - 0
95
95
  version: "0"
96
- type: :runtime
96
+ type: :development
97
97
  version_requirements: *id005
98
+ - !ruby/object:Gem::Dependency
99
+ name: activesupport
100
+ prerelease: false
101
+ requirement: &id006 !ruby/object:Gem::Requirement
102
+ none: false
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ hash: 3
107
+ segments:
108
+ - 0
109
+ version: "0"
110
+ type: :runtime
111
+ version_requirements: *id006
98
112
  description: A gem that provides text handling for Twitter
99
113
  email:
100
114
  - matt@twitter.com
@@ -112,11 +126,11 @@ extensions: []
112
126
  extra_rdoc_files: []
113
127
 
114
128
  files:
129
+ - .gemtest
115
130
  - .gitignore
116
131
  - .gitmodules
117
132
  - .rspec
118
133
  - Gemfile
119
- - Gemfile.lock
120
134
  - LICENSE
121
135
  - README.rdoc
122
136
  - Rakefile
data/Gemfile.lock DELETED
@@ -1,40 +0,0 @@
1
- PATH
2
- remote: .
3
- specs:
4
- twitter-text (1.4.11)
5
- activesupport
6
-
7
- GEM
8
- remote: http://rubygems.org/
9
- specs:
10
- activesupport (3.1.0)
11
- multi_json (~> 1.0)
12
- diff-lcs (1.1.2)
13
- multi_json (1.0.4)
14
- nokogiri (1.4.4)
15
- nokogiri (1.4.4-java)
16
- weakling (>= 0.0.3)
17
- rake (0.8.7)
18
- rspec (2.3.0)
19
- rspec-core (~> 2.3.0)
20
- rspec-expectations (~> 2.3.0)
21
- rspec-mocks (~> 2.3.0)
22
- rspec-core (2.3.1)
23
- rspec-expectations (2.3.0)
24
- diff-lcs (~> 1.1.2)
25
- rspec-mocks (2.3.0)
26
- simplecov (0.3.7)
27
- simplecov-html (>= 0.3.7)
28
- simplecov-html (0.3.9)
29
- weakling (0.0.4-java)
30
-
31
- PLATFORMS
32
- java
33
- ruby
34
-
35
- DEPENDENCIES
36
- nokogiri
37
- rake
38
- rspec
39
- simplecov
40
- twitter-text!