twitter-text 1.3.1 → 1.3.2

Sign up to get free protection for your applications and to get access to all the features.
data/.gitmodules CHANGED
@@ -1,3 +1,3 @@
1
1
  [submodule "test/twitter-text-conformance"]
2
2
  path = test/twitter-text-conformance
3
- url = git://github.com/mzsanford/twitter-text-conformance.git
3
+ url = git://github.com/twitter/twitter-text-conformance.git
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- twitter-text (1.3.1)
4
+ twitter-text (1.3.2)
5
5
  actionpack
6
6
 
7
7
  GEM
data/Rakefile CHANGED
@@ -39,7 +39,7 @@ namespace :test do
39
39
 
40
40
  desc "Run conformance test suite"
41
41
  task :run do
42
- ruby "test/conformance_test.rb"
42
+ ruby '-rubygems', "test/conformance_test.rb"
43
43
  end
44
44
  end
45
45
 
data/lib/regex.rb CHANGED
@@ -46,17 +46,21 @@ module Twitter
46
46
 
47
47
  REGEXEN[:end_screen_name_match] = /^(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o
48
48
 
49
- # Characters considered valid in a hashtag but not at the beginning, where only a-z and 0-9 are valid.
50
- HASHTAG_CHARACTERS = /[a-z0-9_#{LATIN_ACCENTS}]/io
51
- REGEXEN[:auto_link_hashtags] = /(^|[^0-9A-Z&\/\?]+)(#|＃)([0-9a-z_]*[a-z_]+#{HASHTAG_CHARACTERS}*)/io
49
+ # A hashtag must contain latin characters, numbers and underscores, but not all numbers.
50
+ HASHTAG_ALPHA = /[a-z_#{LATIN_ACCENTS}]/io
51
+ HASHTAG_ALPHANUMERIC = /[a-z0-9_#{LATIN_ACCENTS}]/io
52
+ REGEXEN[:auto_link_hashtags] = /(^|[^0-9A-Z&\/\?]+)(#|＃)(#{HASHTAG_ALPHANUMERIC}*#{HASHTAG_ALPHA}#{HASHTAG_ALPHANUMERIC}*)/io
52
53
  REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_]|^|RT:?)([@＠]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?/o
53
54
  REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\<\|:~\(|\}:o\{|:\-\[|\>o\<|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/
54
55
 
55
56
  # URL related hash regex collection
56
57
  REGEXEN[:valid_preceding_chars] = /(?:[^-\/"':!=A-Z0-9_@＠]|^|\:)/i
57
- REGEXEN[:valid_domain] = /(?:[^[:punct:]\s][\.-](?=[^[:punct:]\s])|[^[:punct:]\s]){1,}\.[a-z]{2,}(?::[0-9]+)?/i
58
58
 
59
- REGEXEN[:valid_general_url_path_chars] = /[a-z0-9!\*';:=\+\,\$\/%#\[\]\-_~]/i
59
+ REGEXEN[:valid_subdomain] = /([^[:punct:]\s]([_-]|[^[:punct:]\s])*)?[^[:punct:]\s]\./
60
+ REGEXEN[:valid_domain_name] = /([^[:punct:]\s]([-]|[^[:punct:]\s])*)?[^[:punct:]\s]/
61
+ REGEXEN[:valid_domain] = /#{REGEXEN[:valid_subdomain]}*#{REGEXEN[:valid_domain_name]}\.[a-z]{2,}(?::[0-9]+)?/i
62
+
63
+ REGEXEN[:valid_general_url_path_chars] = /[a-z0-9!\*';:=\+\,\$\/%#\[\]\-_~|]/i
60
64
  # Allow URL paths to contain balanced parens
61
65
  # 1. Used in Wikipedia URLs like /Primer_(film)
62
66
  # 2. Used in IIS sessions like /S(dfd346)/
@@ -71,7 +75,7 @@ module Twitter
71
75
  # Valid end-of-path characters (so /foo. does not gobble the period).
72
76
  # 1. Allow =&# for empty URL parameters and other URL-join artifacts
73
77
  REGEXEN[:valid_url_path_ending_chars] = /[a-z0-9=_#\/\+\-]|#{REGEXEN[:wikipedia_disambiguation]}/io
74
- REGEXEN[:valid_url_query_chars] = /[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_\.,~]/i
78
+ REGEXEN[:valid_url_query_chars] = /[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|]/i
75
79
  REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/]/i
76
80
  REGEXEN[:valid_url] = %r{
77
81
  ( # $1 total match
@@ -91,6 +95,103 @@ module Twitter
91
95
  )
92
96
  }iox;
93
97
 
98
+ # These URL validation pattern strings are based on the ABNF from RFC 3986
99
+ REGEXEN[:validate_url_unreserved] = /[a-z0-9\-._~]/i
100
+ REGEXEN[:validate_url_pct_encoded] = /(?:%[0-9a-f]{2})/i
101
+ REGEXEN[:validate_url_sub_delims] = /[!$&'()*+,;=]/i
102
+ REGEXEN[:validate_url_pchar] = /(?:
103
+ #{REGEXEN[:validate_url_unreserved]}|
104
+ #{REGEXEN[:validate_url_pct_encoded]}|
105
+ #{REGEXEN[:validate_url_sub_delims]}|
106
+ :|@
107
+ )/iox
108
+
109
+ REGEXEN[:validate_url_scheme] = /(?:[a-z][a-z0-9+\-.]*)/i
110
+ REGEXEN[:validate_url_userinfo] = /(?:
111
+ #{REGEXEN[:validate_url_unreserved]}|
112
+ #{REGEXEN[:validate_url_pct_encoded]}|
113
+ #{REGEXEN[:validate_url_sub_delims]}|
114
+ :
115
+ )*/iox
116
+
117
+ REGEXEN[:validate_url_dec_octet] = /(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))/i
118
+ REGEXEN[:validate_url_ipv4] =
119
+ /(?:#{REGEXEN[:validate_url_dec_octet]}(?:\.#{REGEXEN[:validate_url_dec_octet]}){3})/iox
120
+
121
+ # Punting on real IPv6 validation for now
122
+ REGEXEN[:validate_url_ipv6] = /(?:\[[a-f0-9:\.]+\])/i
123
+
124
+ # Also punting on IPvFuture for now
125
+ REGEXEN[:validate_url_ip] = /(?:
126
+ #{REGEXEN[:validate_url_ipv4]}|
127
+ #{REGEXEN[:validate_url_ipv6]}
128
+ )/iox
129
+
130
+ # This is more strict than the rfc specifies
131
+ REGEXEN[:validate_url_subdomain_segment] = /(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)/i
132
+ REGEXEN[:validate_url_domain_segment] = /(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)/i
133
+ REGEXEN[:validate_url_domain_tld] = /(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)/i
134
+ REGEXEN[:validate_url_domain] = /(?:(?:#{REGEXEN[:validate_url_subdomain_segment]}\.)*
135
+ (?:#{REGEXEN[:validate_url_domain_segment]}\.)
136
+ #{REGEXEN[:validate_url_domain_tld]})/iox
137
+
138
+ REGEXEN[:validate_url_host] = /(?:
139
+ #{REGEXEN[:validate_url_ip]}|
140
+ #{REGEXEN[:validate_url_domain]}
141
+ )/iox
142
+
143
+ # Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences
144
+ REGEXEN[:validate_url_unicode_subdomain_segment] =
145
+ /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9_\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
146
+ REGEXEN[:validate_url_unicode_domain_segment] =
147
+ /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
148
+ REGEXEN[:validate_url_unicode_domain_tld] =
149
+ /(?:(?:[a-z]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
150
+ REGEXEN[:validate_url_unicode_domain] = /(?:(?:#{REGEXEN[:validate_url_unicode_subdomain_segment]}\.)*
151
+ (?:#{REGEXEN[:validate_url_unicode_domain_segment]}\.)
152
+ #{REGEXEN[:validate_url_unicode_domain_tld]})/iox
153
+
154
+ REGEXEN[:validate_url_unicode_host] = /(?:
155
+ #{REGEXEN[:validate_url_ip]}|
156
+ #{REGEXEN[:validate_url_unicode_domain]}
157
+ )/iox
158
+
159
+ REGEXEN[:validate_url_port] = /[0-9]{1,5}/
160
+
161
+ REGEXEN[:validate_url_unicode_authority] = %r{
162
+ (?:(#{REGEXEN[:validate_url_userinfo]})@)? # $1 userinfo
163
+ (#{REGEXEN[:validate_url_unicode_host]}) # $2 host
164
+ (?::(#{REGEXEN[:validate_url_port]}))? # $3 port
165
+ }iox
166
+
167
+ REGEXEN[:validate_url_authority] = %r{
168
+ (?:(#{REGEXEN[:validate_url_userinfo]})@)? # $1 userinfo
169
+ (#{REGEXEN[:validate_url_host]}) # $2 host
170
+ (?::(#{REGEXEN[:validate_url_port]}))? # $3 port
171
+ }iox
172
+
173
+ REGEXEN[:validate_url_path] = %r{(/#{REGEXEN[:validate_url_pchar]}*)*}i
174
+ REGEXEN[:validate_url_query] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
175
+ REGEXEN[:validate_url_fragment] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
176
+
177
+ # Modified version of RFC 3986 Appendix B
178
+ REGEXEN[:validate_url_unencoded] = %r{
179
+ \A # Full URL
180
+ (?:
181
+ ([^:/?#]+): # $1 Scheme
182
+ )
183
+ (?://
184
+ ([^/?#]*) # $2 Authority
185
+ )
186
+ ([^?#]*) # $3 Path
187
+ (?:
188
+ \?([^#]*) # $4 Query
189
+ )?
190
+ (?:
191
+ \#(.*) # $5 Fragment
192
+ )?\Z
193
+ }ix
194
+
94
195
  REGEXEN.each_pair{|k,v| v.freeze }
95
196
 
96
197
  # Return the regular expression for a given <tt>key</tt>. If the <tt>key</tt>
data/lib/validation.rb CHANGED
@@ -46,5 +46,57 @@ module Twitter
46
46
 
47
47
  return false
48
48
  end
49
+
50
+ def valid_tweet_text?(text)
51
+ !tweet_invalid?(text)
52
+ end
53
+
54
+ def valid_username?(username)
55
+ return false if username.blank?
56
+
57
+ extracted = Twitter::Extractor.extract_mentioned_screen_names(username)
58
+ # Should extract the username minus the @ sign, hence the [1..-1]
59
+ extracted.size == 1 && extracted.first == username[1..-1]
60
+ end
61
+
62
+ VALID_LIST_RE = /\A#{Twitter::Regex[:auto_link_usernames_or_lists]}\z/o
63
+ def valid_list?(username_list)
64
+ match = username_list.match(VALID_LIST_RE)
65
+ # Must have matched and had nothing before or after
66
+ !!(match && match[1] == "" && !match[4].blank?)
67
+ end
68
+
69
+ def valid_hashtag?(hashtag)
70
+ return false if hashtag.blank?
71
+
72
+ extracted = Twitter::Extractor.extract_hashtags(hashtag)
73
+ # Should extract the hashtag minus the # sign, hence the [1..-1]
74
+ extracted.size == 1 && extracted.first == hashtag[1..-1]
75
+ end
76
+
77
+ def valid_url?(url, unicode_domains=true)
78
+ return false if url.blank?
79
+
80
+ url_parts = url.match(Twitter::Regex[:validate_url_unencoded])
81
+ return false unless (url_parts && url_parts.to_s == url)
82
+
83
+ scheme, authority, path, query, fragment = url_parts.captures
84
+
85
+ return false unless (valid_match?(scheme, Twitter::Regex[:validate_url_scheme]) && scheme.match(/\Ahttps?\Z/i) &&
86
+ valid_match?(path, Twitter::Regex[:validate_url_path]) &&
87
+ valid_match?(query, Twitter::Regex[:validate_url_query], true) &&
88
+ valid_match?(fragment, Twitter::Regex[:validate_url_fragment], true))
89
+
90
+ return (unicode_domains && valid_match?(authority, Twitter::Regex[:validate_url_unicode_authority])) ||
91
+ (!unicode_domains && valid_match?(authority, Twitter::Regex[:validate_url_authority]))
92
+ end
93
+
94
+ private
95
+
96
+ def valid_match?(string, regex, optional=false)
97
+ return (string && string.match(regex) && $~.to_s == string) unless optional
98
+
99
+ !(string && (!string.match(regex) || $~.to_s != string))
100
+ end
49
101
  end
50
102
  end
@@ -310,6 +310,17 @@ describe Twitter::Autolink do
310
310
  end
311
311
  end
312
312
 
313
+ context "with a hashtag containing an accented latin character" do
314
+ def original_text
315
+ # the hashtag is #éhashtag
316
+ "##{[0x00e9].pack('U')}hashtag"
317
+ end
318
+
319
+ it "should be linked" do
320
+ @autolinked_text.should == "<a href=\"http://twitter.com/search?q=%23éhashtag\" title=\"#éhashtag\" class=\"tweet-url hashtag\" rel=\"nofollow\">#éhashtag</a>"
321
+ end
322
+ end
323
+
313
324
  end
314
325
 
315
326
  describe "URL autolinking" do
data/spec/test_urls.rb CHANGED
@@ -21,6 +21,11 @@ module TestUrls
21
21
  "http://mrs.domain-dash.biz",
22
22
  "http://x.com/has/one/char/domain",
23
23
  "http://t.co/nwcLTFF",
24
+ "http://sub_domain-dash.twitter.com",
25
+ "http://a.b.cd",
26
+ "http://a_b.c-d.com",
27
+ "http://a-b.b.com",
28
+ "http://twitter-dash.com",
24
29
  # "t.co/nwcLTFF"
25
30
  ] unless defined?(TestUrls::VALID)
26
31
 
@@ -29,7 +34,16 @@ module TestUrls
29
34
  "http://tld-too-short.x",
30
35
  "www.foobar.com",
31
36
  "WWW.FOOBAR.COM",
32
- "http://-doman_dash.com"
37
+ "http://-doman_dash.com",
38
+ "http://_leadingunderscore.twitter.com",
39
+ "http://trailingunderscore_.twitter.com",
40
+ "http://-leadingdash.twitter.com",
41
+ "http://trailingdash-.twitter.com",
42
+ "http://-leadingdash.com",
43
+ "http://trailingdash-.com",
44
+ "http://no_underscores.com",
45
+ "http://test.c_o_m",
46
+ "http://test.c-o-m"
33
47
  ] unless defined?(TestUrls::INVALID)
34
48
 
35
49
  end
@@ -7,6 +7,7 @@ class ConformanceTest < Test::Unit::TestCase
7
7
  include Twitter::Extractor
8
8
  include Twitter::Autolink
9
9
  include Twitter::HitHighlighter
10
+ include Twitter::Validation
10
11
 
11
12
  def setup
12
13
  @conformance_dir = ENV['CONFORMANCE_DIR'] || File.join(File.dirname(__FILE__), 'twitter-text-conformance')
@@ -35,6 +36,9 @@ class ConformanceTest < Test::Unit::TestCase
35
36
  def test_url_extractor_conformance
36
37
  run_conformance_test(File.join(@conformance_dir, 'extract.yml'), :urls) do |description, expected, input|
37
38
  assert_equal expected, extract_urls(input), description
39
+ expected.each do |expected_url|
40
+ assert_equal true, valid_url?(expected_url), "expected url [#{expected_url}] not valid"
41
+ end
38
42
  end
39
43
  end
40
44
 
@@ -109,6 +113,39 @@ class ConformanceTest < Test::Unit::TestCase
109
113
  end
110
114
  include HitHighlighterConformance
111
115
 
116
+ module ValidationConformance
117
+ def test_tweet_validation_conformance
118
+ run_conformance_test(File.join(@conformance_dir, 'validate.yml'), :tweets) do |description, expected, input|
119
+ assert_equal expected, valid_tweet_text?(input), description
120
+ end
121
+ end
122
+
123
+ def test_users_validation_conformance
124
+ run_conformance_test(File.join(@conformance_dir, 'validate.yml'), :usernames) do |description, expected, input|
125
+ assert_equal expected, valid_username?(input), description
126
+ end
127
+ end
128
+
129
+ def test_lists_validation_conformance
130
+ run_conformance_test(File.join(@conformance_dir, 'validate.yml'), :lists) do |description, expected, input|
131
+ assert_equal expected, valid_list?(input), description
132
+ end
133
+ end
134
+
135
+ def test_urls_validation_conformance
136
+ run_conformance_test(File.join(@conformance_dir, 'validate.yml'), :urls) do |description, expected, input|
137
+ assert_equal expected, valid_url?(input), description
138
+ end
139
+ end
140
+
141
+ def test_hashtags_validation_conformance
142
+ run_conformance_test(File.join(@conformance_dir, 'validate.yml'), :hashtags) do |description, expected, input|
143
+ assert_equal expected, valid_hashtag?(input), description
144
+ end
145
+ end
146
+ end
147
+ include ValidationConformance
148
+
112
149
  private
113
150
 
114
151
  def run_conformance_test(file, test_type, hash_config = false, &block)
@@ -123,4 +160,4 @@ class ConformanceTest < Test::Unit::TestCase
123
160
  end
124
161
  end
125
162
  end
126
- end
163
+ end
data/twitter-text.gemspec CHANGED
@@ -1,6 +1,6 @@
1
1
  spec = Gem::Specification.new do |s|
2
2
  s.name = "twitter-text"
3
- s.version = "1.3.1"
3
+ s.version = "1.3.2"
4
4
  s.authors = ["Matt Sanford", "Patrick Ewing", "Ben Cherry", "Britt Selvitelle", "Raffi Krikorian"]
5
5
  s.email = ["matt@twitter.com", "patrick.henry.ewing@gmail.com", "bcherry@gmail.com", "bs@brittspace.com", "raffi@twitter.com"]
6
6
  s.homepage = "http://twitter.com"
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: twitter-text
3
3
  version: !ruby/object:Gem::Version
4
- hash: 25
5
- prerelease: false
4
+ hash: 31
5
+ prerelease:
6
6
  segments:
7
7
  - 1
8
8
  - 3
9
- - 1
10
- version: 1.3.1
9
+ - 2
10
+ version: 1.3.2
11
11
  platform: ruby
12
12
  authors:
13
13
  - Matt Sanford
@@ -19,11 +19,13 @@ autorequire:
19
19
  bindir: bin
20
20
  cert_chain: []
21
21
 
22
- date: 2011-01-06 00:00:00 -08:00
22
+ date: 2011-04-20 00:00:00 -07:00
23
23
  default_executable:
24
24
  dependencies:
25
25
  - !ruby/object:Gem::Dependency
26
- version_requirements: &id001 !ruby/object:Gem::Requirement
26
+ name: nokogiri
27
+ prerelease: false
28
+ requirement: &id001 !ruby/object:Gem::Requirement
27
29
  none: false
28
30
  requirements:
29
31
  - - ">="
@@ -32,12 +34,12 @@ dependencies:
32
34
  segments:
33
35
  - 0
34
36
  version: "0"
35
- requirement: *id001
36
- name: nokogiri
37
- prerelease: false
38
37
  type: :development
38
+ version_requirements: *id001
39
39
  - !ruby/object:Gem::Dependency
40
- version_requirements: &id002 !ruby/object:Gem::Requirement
40
+ name: rake
41
+ prerelease: false
42
+ requirement: &id002 !ruby/object:Gem::Requirement
41
43
  none: false
42
44
  requirements:
43
45
  - - ">="
@@ -46,12 +48,12 @@ dependencies:
46
48
  segments:
47
49
  - 0
48
50
  version: "0"
49
- requirement: *id002
50
- name: rake
51
- prerelease: false
52
51
  type: :development
52
+ version_requirements: *id002
53
53
  - !ruby/object:Gem::Dependency
54
- version_requirements: &id003 !ruby/object:Gem::Requirement
54
+ name: rspec
55
+ prerelease: false
56
+ requirement: &id003 !ruby/object:Gem::Requirement
55
57
  none: false
56
58
  requirements:
57
59
  - - ">="
@@ -60,12 +62,12 @@ dependencies:
60
62
  segments:
61
63
  - 0
62
64
  version: "0"
63
- requirement: *id003
64
- name: rspec
65
- prerelease: false
66
65
  type: :development
66
+ version_requirements: *id003
67
67
  - !ruby/object:Gem::Dependency
68
- version_requirements: &id004 !ruby/object:Gem::Requirement
68
+ name: simplecov
69
+ prerelease: false
70
+ requirement: &id004 !ruby/object:Gem::Requirement
69
71
  none: false
70
72
  requirements:
71
73
  - - ">="
@@ -74,12 +76,12 @@ dependencies:
74
76
  segments:
75
77
  - 0
76
78
  version: "0"
77
- requirement: *id004
78
- name: simplecov
79
- prerelease: false
80
79
  type: :development
80
+ version_requirements: *id004
81
81
  - !ruby/object:Gem::Dependency
82
- version_requirements: &id005 !ruby/object:Gem::Requirement
82
+ name: actionpack
83
+ prerelease: false
84
+ requirement: &id005 !ruby/object:Gem::Requirement
83
85
  none: false
84
86
  requirements:
85
87
  - - ">="
@@ -88,10 +90,8 @@ dependencies:
88
90
  segments:
89
91
  - 0
90
92
  version: "0"
91
- requirement: *id005
92
- name: actionpack
93
- prerelease: false
94
93
  type: :runtime
94
+ version_requirements: *id005
95
95
  description: A gem that provides text handling for Twitter
96
96
  email:
97
97
  - matt@twitter.com
@@ -165,7 +165,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
165
165
  requirements: []
166
166
 
167
167
  rubyforge_project:
168
- rubygems_version: 1.3.7
168
+ rubygems_version: 1.4.1
169
169
  signing_key:
170
170
  specification_version: 3
171
171
  summary: Twitter text handling library