twitter-text 1.3.1 → 1.3.2

This diff shows the differences between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
data/.gitmodules CHANGED
@@ -1,3 +1,3 @@
1
1
  [submodule "test/twitter-text-conformance"]
2
2
  path = test/twitter-text-conformance
3
- url = git://github.com/mzsanford/twitter-text-conformance.git
3
+ url = git://github.com/twitter/twitter-text-conformance.git
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- twitter-text (1.3.1)
4
+ twitter-text (1.3.2)
5
5
  actionpack
6
6
 
7
7
  GEM
data/Rakefile CHANGED
@@ -39,7 +39,7 @@ namespace :test do
39
39
 
40
40
  desc "Run conformance test suite"
41
41
  task :run do
42
- ruby "test/conformance_test.rb"
42
+ ruby '-rubygems', "test/conformance_test.rb"
43
43
  end
44
44
  end
45
45
 
data/lib/regex.rb CHANGED
@@ -46,17 +46,21 @@ module Twitter
46
46
 
47
47
  REGEXEN[:end_screen_name_match] = /^(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o
48
48
 
49
- # Characters considered valid in a hashtag but not at the beginning, where only a-z and 0-9 are valid.
50
- HASHTAG_CHARACTERS = /[a-z0-9_#{LATIN_ACCENTS}]/io
51
- REGEXEN[:auto_link_hashtags] = /(^|[^0-9A-Z&\/\?]+)(#|#)([0-9a-z_]*[a-z_]+#{HASHTAG_CHARACTERS}*)/io
49
+ # A hashtag may contain latin characters, numbers and underscores, but must not consist solely of numbers.
50
+ HASHTAG_ALPHA = /[a-z_#{LATIN_ACCENTS}]/io
51
+ HASHTAG_ALPHANUMERIC = /[a-z0-9_#{LATIN_ACCENTS}]/io
52
+ REGEXEN[:auto_link_hashtags] = /(^|[^0-9A-Z&\/\?]+)(#|#)(#{HASHTAG_ALPHANUMERIC}*#{HASHTAG_ALPHA}#{HASHTAG_ALPHANUMERIC}*)/io
52
53
  REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_]|^|RT:?)([@@]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?/o
53
54
  REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\<\|:~\(|\}:o\{|:\-\[|\>o\<|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/
54
55
 
55
56
  # URL related hash regex collection
56
57
  REGEXEN[:valid_preceding_chars] = /(?:[^-\/"':!=A-Z0-9_@@]|^|\:)/i
57
- REGEXEN[:valid_domain] = /(?:[^[:punct:]\s][\.-](?=[^[:punct:]\s])|[^[:punct:]\s]){1,}\.[a-z]{2,}(?::[0-9]+)?/i
58
58
 
59
- REGEXEN[:valid_general_url_path_chars] = /[a-z0-9!\*';:=\+\,\$\/%#\[\]\-_~]/i
59
+ REGEXEN[:valid_subdomain] = /([^[:punct:]\s]([_-]|[^[:punct:]\s])*)?[^[:punct:]\s]\./
60
+ REGEXEN[:valid_domain_name] = /([^[:punct:]\s]([-]|[^[:punct:]\s])*)?[^[:punct:]\s]/
61
+ REGEXEN[:valid_domain] = /#{REGEXEN[:valid_subdomain]}*#{REGEXEN[:valid_domain_name]}\.[a-z]{2,}(?::[0-9]+)?/i
62
+
63
+ REGEXEN[:valid_general_url_path_chars] = /[a-z0-9!\*';:=\+\,\$\/%#\[\]\-_~|]/i
60
64
  # Allow URL paths to contain balanced parens
61
65
  # 1. Used in Wikipedia URLs like /Primer_(film)
62
66
  # 2. Used in IIS sessions like /S(dfd346)/
@@ -71,7 +75,7 @@ module Twitter
71
75
  # Valid end-of-path characters (so /foo. does not gobble the period).
72
76
  # 1. Allow =&# for empty URL parameters and other URL-join artifacts
73
77
  REGEXEN[:valid_url_path_ending_chars] = /[a-z0-9=_#\/\+\-]|#{REGEXEN[:wikipedia_disambiguation]}/io
74
- REGEXEN[:valid_url_query_chars] = /[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_\.,~]/i
78
+ REGEXEN[:valid_url_query_chars] = /[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|]/i
75
79
  REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/]/i
76
80
  REGEXEN[:valid_url] = %r{
77
81
  ( # $1 total match
@@ -91,6 +95,103 @@ module Twitter
91
95
  )
92
96
  }iox;
93
97
 
98
+ # These URL validation pattern strings are based on the ABNF from RFC 3986
99
+ REGEXEN[:validate_url_unreserved] = /[a-z0-9\-._~]/i
100
+ REGEXEN[:validate_url_pct_encoded] = /(?:%[0-9a-f]{2})/i
101
+ REGEXEN[:validate_url_sub_delims] = /[!$&'()*+,;=]/i
102
+ REGEXEN[:validate_url_pchar] = /(?:
103
+ #{REGEXEN[:validate_url_unreserved]}|
104
+ #{REGEXEN[:validate_url_pct_encoded]}|
105
+ #{REGEXEN[:validate_url_sub_delims]}|
106
+ :|@
107
+ )/iox
108
+
109
+ REGEXEN[:validate_url_scheme] = /(?:[a-z][a-z0-9+\-.]*)/i
110
+ REGEXEN[:validate_url_userinfo] = /(?:
111
+ #{REGEXEN[:validate_url_unreserved]}|
112
+ #{REGEXEN[:validate_url_pct_encoded]}|
113
+ #{REGEXEN[:validate_url_sub_delims]}|
114
+ :
115
+ )*/iox
116
+
117
+ REGEXEN[:validate_url_dec_octet] = /(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))/i
118
+ REGEXEN[:validate_url_ipv4] =
119
+ /(?:#{REGEXEN[:validate_url_dec_octet]}(?:\.#{REGEXEN[:validate_url_dec_octet]}){3})/iox
120
+
121
+ # Punting on real IPv6 validation for now
122
+ REGEXEN[:validate_url_ipv6] = /(?:\[[a-f0-9:\.]+\])/i
123
+
124
+ # Also punting on IPvFuture for now
125
+ REGEXEN[:validate_url_ip] = /(?:
126
+ #{REGEXEN[:validate_url_ipv4]}|
127
+ #{REGEXEN[:validate_url_ipv6]}
128
+ )/iox
129
+
130
+ # This is more strict than the rfc specifies
131
+ REGEXEN[:validate_url_subdomain_segment] = /(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)/i
132
+ REGEXEN[:validate_url_domain_segment] = /(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)/i
133
+ REGEXEN[:validate_url_domain_tld] = /(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)/i
134
+ REGEXEN[:validate_url_domain] = /(?:(?:#{REGEXEN[:validate_url_subdomain_segment]}\.)*
135
+ (?:#{REGEXEN[:validate_url_domain_segment]}\.)
136
+ #{REGEXEN[:validate_url_domain_tld]})/iox
137
+
138
+ REGEXEN[:validate_url_host] = /(?:
139
+ #{REGEXEN[:validate_url_ip]}|
140
+ #{REGEXEN[:validate_url_domain]}
141
+ )/iox
142
+
143
+ # Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences
144
+ REGEXEN[:validate_url_unicode_subdomain_segment] =
145
+ /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9_\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
146
+ REGEXEN[:validate_url_unicode_domain_segment] =
147
+ /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
148
+ REGEXEN[:validate_url_unicode_domain_tld] =
149
+ /(?:(?:[a-z]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
150
+ REGEXEN[:validate_url_unicode_domain] = /(?:(?:#{REGEXEN[:validate_url_unicode_subdomain_segment]}\.)*
151
+ (?:#{REGEXEN[:validate_url_unicode_domain_segment]}\.)
152
+ #{REGEXEN[:validate_url_unicode_domain_tld]})/iox
153
+
154
+ REGEXEN[:validate_url_unicode_host] = /(?:
155
+ #{REGEXEN[:validate_url_ip]}|
156
+ #{REGEXEN[:validate_url_unicode_domain]}
157
+ )/iox
158
+
159
+ REGEXEN[:validate_url_port] = /[0-9]{1,5}/
160
+
161
+ REGEXEN[:validate_url_unicode_authority] = %r{
162
+ (?:(#{REGEXEN[:validate_url_userinfo]})@)? # $1 userinfo
163
+ (#{REGEXEN[:validate_url_unicode_host]}) # $2 host
164
+ (?::(#{REGEXEN[:validate_url_port]}))? # $3 port
165
+ }iox
166
+
167
+ REGEXEN[:validate_url_authority] = %r{
168
+ (?:(#{REGEXEN[:validate_url_userinfo]})@)? # $1 userinfo
169
+ (#{REGEXEN[:validate_url_host]}) # $2 host
170
+ (?::(#{REGEXEN[:validate_url_port]}))? # $3 port
171
+ }iox
172
+
173
+ REGEXEN[:validate_url_path] = %r{(/#{REGEXEN[:validate_url_pchar]}*)*}i
174
+ REGEXEN[:validate_url_query] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
175
+ REGEXEN[:validate_url_fragment] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
176
+
177
+ # Modified version of RFC 3986 Appendix B
178
+ REGEXEN[:validate_url_unencoded] = %r{
179
+ \A # Full URL
180
+ (?:
181
+ ([^:/?#]+): # $1 Scheme
182
+ )
183
+ (?://
184
+ ([^/?#]*) # $2 Authority
185
+ )
186
+ ([^?#]*) # $3 Path
187
+ (?:
188
+ \?([^#]*) # $4 Query
189
+ )?
190
+ (?:
191
+ \#(.*) # $5 Fragment
192
+ )?\Z
193
+ }ix
194
+
94
195
  REGEXEN.each_pair{|k,v| v.freeze }
95
196
 
96
197
  # Return the regular expression for a given <tt>key</tt>. If the <tt>key</tt>
data/lib/validation.rb CHANGED
@@ -46,5 +46,57 @@ module Twitter
46
46
 
47
47
  return false
48
48
  end
49
+
50
+ def valid_tweet_text?(text)
51
+ !tweet_invalid?(text)
52
+ end
53
+
54
+ def valid_username?(username)
55
+ return false if username.blank?
56
+
57
+ extracted = Twitter::Extractor.extract_mentioned_screen_names(username)
58
+ # Should extract the username minus the @ sign, hence the [1..-1]
59
+ extracted.size == 1 && extracted.first == username[1..-1]
60
+ end
61
+
62
+ VALID_LIST_RE = /\A#{Twitter::Regex[:auto_link_usernames_or_lists]}\z/o
63
+ def valid_list?(username_list)
64
+ match = username_list.match(VALID_LIST_RE)
65
+ # Must have matched and had nothing before or after
66
+ !!(match && match[1] == "" && !match[4].blank?)
67
+ end
68
+
69
+ def valid_hashtag?(hashtag)
70
+ return false if hashtag.blank?
71
+
72
+ extracted = Twitter::Extractor.extract_hashtags(hashtag)
73
+ # Should extract the hashtag minus the # sign, hence the [1..-1]
74
+ extracted.size == 1 && extracted.first == hashtag[1..-1]
75
+ end
76
+
77
+ def valid_url?(url, unicode_domains=true)
78
+ return false if url.blank?
79
+
80
+ url_parts = url.match(Twitter::Regex[:validate_url_unencoded])
81
+ return false unless (url_parts && url_parts.to_s == url)
82
+
83
+ scheme, authority, path, query, fragment = url_parts.captures
84
+
85
+ return false unless (valid_match?(scheme, Twitter::Regex[:validate_url_scheme]) && scheme.match(/\Ahttps?\Z/i) &&
86
+ valid_match?(path, Twitter::Regex[:validate_url_path]) &&
87
+ valid_match?(query, Twitter::Regex[:validate_url_query], true) &&
88
+ valid_match?(fragment, Twitter::Regex[:validate_url_fragment], true))
89
+
90
+ return (unicode_domains && valid_match?(authority, Twitter::Regex[:validate_url_unicode_authority])) ||
91
+ (!unicode_domains && valid_match?(authority, Twitter::Regex[:validate_url_authority]))
92
+ end
93
+
94
+ private
95
+
96
+ def valid_match?(string, regex, optional=false)
97
+ return (string && string.match(regex) && $~.to_s == string) unless optional
98
+
99
+ !(string && (!string.match(regex) || $~.to_s != string))
100
+ end
49
101
  end
50
102
  end
@@ -310,6 +310,17 @@ describe Twitter::Autolink do
310
310
  end
311
311
  end
312
312
 
313
+ context "with a hashtag containing an accented latin character" do
314
+ def original_text
315
+ # the hashtag is #éhashtag
316
+ "##{[0x00e9].pack('U')}hashtag"
317
+ end
318
+
319
+ it "should be linked" do
320
+ @autolinked_text.should == "<a href=\"http://twitter.com/search?q=%23éhashtag\" title=\"#éhashtag\" class=\"tweet-url hashtag\" rel=\"nofollow\">#éhashtag</a>"
321
+ end
322
+ end
323
+
313
324
  end
314
325
 
315
326
  describe "URL autolinking" do
data/spec/test_urls.rb CHANGED
@@ -21,6 +21,11 @@ module TestUrls
21
21
  "http://mrs.domain-dash.biz",
22
22
  "http://x.com/has/one/char/domain",
23
23
  "http://t.co/nwcLTFF",
24
+ "http://sub_domain-dash.twitter.com",
25
+ "http://a.b.cd",
26
+ "http://a_b.c-d.com",
27
+ "http://a-b.b.com",
28
+ "http://twitter-dash.com",
24
29
  # "t.co/nwcLTFF"
25
30
  ] unless defined?(TestUrls::VALID)
26
31
 
@@ -29,7 +34,16 @@ module TestUrls
29
34
  "http://tld-too-short.x",
30
35
  "www.foobar.com",
31
36
  "WWW.FOOBAR.COM",
32
- "http://-doman_dash.com"
37
+ "http://-doman_dash.com",
38
+ "http://_leadingunderscore.twitter.com",
39
+ "http://trailingunderscore_.twitter.com",
40
+ "http://-leadingdash.twitter.com",
41
+ "http://trailingdash-.twitter.com",
42
+ "http://-leadingdash.com",
43
+ "http://trailingdash-.com",
44
+ "http://no_underscores.com",
45
+ "http://test.c_o_m",
46
+ "http://test.c-o-m"
33
47
  ] unless defined?(TestUrls::INVALID)
34
48
 
35
49
  end
@@ -7,6 +7,7 @@ class ConformanceTest < Test::Unit::TestCase
7
7
  include Twitter::Extractor
8
8
  include Twitter::Autolink
9
9
  include Twitter::HitHighlighter
10
+ include Twitter::Validation
10
11
 
11
12
  def setup
12
13
  @conformance_dir = ENV['CONFORMANCE_DIR'] || File.join(File.dirname(__FILE__), 'twitter-text-conformance')
@@ -35,6 +36,9 @@ class ConformanceTest < Test::Unit::TestCase
35
36
  def test_url_extractor_conformance
36
37
  run_conformance_test(File.join(@conformance_dir, 'extract.yml'), :urls) do |description, expected, input|
37
38
  assert_equal expected, extract_urls(input), description
39
+ expected.each do |expected_url|
40
+ assert_equal true, valid_url?(expected_url), "expected url [#{expected_url}] not valid"
41
+ end
38
42
  end
39
43
  end
40
44
 
@@ -109,6 +113,39 @@ class ConformanceTest < Test::Unit::TestCase
109
113
  end
110
114
  include HitHighlighterConformance
111
115
 
116
+ module ValidationConformance
117
+ def test_tweet_validation_conformance
118
+ run_conformance_test(File.join(@conformance_dir, 'validate.yml'), :tweets) do |description, expected, input|
119
+ assert_equal expected, valid_tweet_text?(input), description
120
+ end
121
+ end
122
+
123
+ def test_users_validation_conformance
124
+ run_conformance_test(File.join(@conformance_dir, 'validate.yml'), :usernames) do |description, expected, input|
125
+ assert_equal expected, valid_username?(input), description
126
+ end
127
+ end
128
+
129
+ def test_lists_validation_conformance
130
+ run_conformance_test(File.join(@conformance_dir, 'validate.yml'), :lists) do |description, expected, input|
131
+ assert_equal expected, valid_list?(input), description
132
+ end
133
+ end
134
+
135
+ def test_urls_validation_conformance
136
+ run_conformance_test(File.join(@conformance_dir, 'validate.yml'), :urls) do |description, expected, input|
137
+ assert_equal expected, valid_url?(input), description
138
+ end
139
+ end
140
+
141
+ def test_hashtags_validation_conformance
142
+ run_conformance_test(File.join(@conformance_dir, 'validate.yml'), :hashtags) do |description, expected, input|
143
+ assert_equal expected, valid_hashtag?(input), description
144
+ end
145
+ end
146
+ end
147
+ include ValidationConformance
148
+
112
149
  private
113
150
 
114
151
  def run_conformance_test(file, test_type, hash_config = false, &block)
@@ -123,4 +160,4 @@ class ConformanceTest < Test::Unit::TestCase
123
160
  end
124
161
  end
125
162
  end
126
- end
163
+ end
data/twitter-text.gemspec CHANGED
@@ -1,6 +1,6 @@
1
1
  spec = Gem::Specification.new do |s|
2
2
  s.name = "twitter-text"
3
- s.version = "1.3.1"
3
+ s.version = "1.3.2"
4
4
  s.authors = ["Matt Sanford", "Patrick Ewing", "Ben Cherry", "Britt Selvitelle", "Raffi Krikorian"]
5
5
  s.email = ["matt@twitter.com", "patrick.henry.ewing@gmail.com", "bcherry@gmail.com", "bs@brittspace.com", "raffi@twitter.com"]
6
6
  s.homepage = "http://twitter.com"
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: twitter-text
3
3
  version: !ruby/object:Gem::Version
4
- hash: 25
5
- prerelease: false
4
+ hash: 31
5
+ prerelease:
6
6
  segments:
7
7
  - 1
8
8
  - 3
9
- - 1
10
- version: 1.3.1
9
+ - 2
10
+ version: 1.3.2
11
11
  platform: ruby
12
12
  authors:
13
13
  - Matt Sanford
@@ -19,11 +19,13 @@ autorequire:
19
19
  bindir: bin
20
20
  cert_chain: []
21
21
 
22
- date: 2011-01-06 00:00:00 -08:00
22
+ date: 2011-04-20 00:00:00 -07:00
23
23
  default_executable:
24
24
  dependencies:
25
25
  - !ruby/object:Gem::Dependency
26
- version_requirements: &id001 !ruby/object:Gem::Requirement
26
+ name: nokogiri
27
+ prerelease: false
28
+ requirement: &id001 !ruby/object:Gem::Requirement
27
29
  none: false
28
30
  requirements:
29
31
  - - ">="
@@ -32,12 +34,12 @@ dependencies:
32
34
  segments:
33
35
  - 0
34
36
  version: "0"
35
- requirement: *id001
36
- name: nokogiri
37
- prerelease: false
38
37
  type: :development
38
+ version_requirements: *id001
39
39
  - !ruby/object:Gem::Dependency
40
- version_requirements: &id002 !ruby/object:Gem::Requirement
40
+ name: rake
41
+ prerelease: false
42
+ requirement: &id002 !ruby/object:Gem::Requirement
41
43
  none: false
42
44
  requirements:
43
45
  - - ">="
@@ -46,12 +48,12 @@ dependencies:
46
48
  segments:
47
49
  - 0
48
50
  version: "0"
49
- requirement: *id002
50
- name: rake
51
- prerelease: false
52
51
  type: :development
52
+ version_requirements: *id002
53
53
  - !ruby/object:Gem::Dependency
54
- version_requirements: &id003 !ruby/object:Gem::Requirement
54
+ name: rspec
55
+ prerelease: false
56
+ requirement: &id003 !ruby/object:Gem::Requirement
55
57
  none: false
56
58
  requirements:
57
59
  - - ">="
@@ -60,12 +62,12 @@ dependencies:
60
62
  segments:
61
63
  - 0
62
64
  version: "0"
63
- requirement: *id003
64
- name: rspec
65
- prerelease: false
66
65
  type: :development
66
+ version_requirements: *id003
67
67
  - !ruby/object:Gem::Dependency
68
- version_requirements: &id004 !ruby/object:Gem::Requirement
68
+ name: simplecov
69
+ prerelease: false
70
+ requirement: &id004 !ruby/object:Gem::Requirement
69
71
  none: false
70
72
  requirements:
71
73
  - - ">="
@@ -74,12 +76,12 @@ dependencies:
74
76
  segments:
75
77
  - 0
76
78
  version: "0"
77
- requirement: *id004
78
- name: simplecov
79
- prerelease: false
80
79
  type: :development
80
+ version_requirements: *id004
81
81
  - !ruby/object:Gem::Dependency
82
- version_requirements: &id005 !ruby/object:Gem::Requirement
82
+ name: actionpack
83
+ prerelease: false
84
+ requirement: &id005 !ruby/object:Gem::Requirement
83
85
  none: false
84
86
  requirements:
85
87
  - - ">="
@@ -88,10 +90,8 @@ dependencies:
88
90
  segments:
89
91
  - 0
90
92
  version: "0"
91
- requirement: *id005
92
- name: actionpack
93
- prerelease: false
94
93
  type: :runtime
94
+ version_requirements: *id005
95
95
  description: A gem that provides text handling for Twitter
96
96
  email:
97
97
  - matt@twitter.com
@@ -165,7 +165,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
165
165
  requirements: []
166
166
 
167
167
  rubyforge_project:
168
- rubygems_version: 1.3.7
168
+ rubygems_version: 1.4.1
169
169
  signing_key:
170
170
  specification_version: 3
171
171
  summary: Twitter text handling library