twitter-text 1.4.13 → 1.4.14

Sign up to get free protection for your applications and to get access to all the features.
data/lib/extractor.rb CHANGED
@@ -190,6 +190,11 @@ module Twitter
190
190
  last_url[:indices][1] = end_position
191
191
  end
192
192
  else
193
+ # In the case of t.co URLs, don't allow additional path characters
194
+ if url =~ Twitter::Regex[:valid_tco_url]
195
+ url = $&
196
+ end_position = start_position + url.char_length
197
+ end
193
198
  urls << {
194
199
  :url => url,
195
200
  :indices => [start_position, end_position]
data/lib/regex.rb CHANGED
@@ -106,14 +106,14 @@ module Twitter
106
106
 
107
107
  REGEXEN[:auto_link_hashtags] = /#{HASHTAG}/io
108
108
  # Used in Extractor and Rewriter for final filtering
109
- REGEXEN[:end_hashtag_match] = /^(?:[#＃]|:\/\/)/o
109
+ REGEXEN[:end_hashtag_match] = /\A(?:[#＃]|:\/\/)/o
110
110
 
111
111
  REGEXEN[:at_signs] = /[@＠]/
112
112
  REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
113
113
  REGEXEN[:extract_mentions_or_lists] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?/o
114
114
  REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
115
115
  # Used in Extractor and Rewriter for final filtering
116
- REGEXEN[:end_screen_name_match] = /^(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o
116
+ REGEXEN[:end_screen_name_match] = /\A(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o
117
117
 
118
118
  REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_]|^|RT:?)([@＠]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?/o
119
119
  REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\&lt;\|:~\(|\}:o\{|:\-\[|\&gt;o\&lt;|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/
@@ -125,7 +125,7 @@ module Twitter
125
125
  REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
126
126
  REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
127
127
 
128
- REGEXEN[:valid_gTLD] = /(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel)(?=[^a-z]|$))/i
128
+ REGEXEN[:valid_gTLD] = /(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|xxx)(?=[^a-z]|$))/i
129
129
  REGEXEN[:valid_ccTLD] = %r{
130
130
  (?:
131
131
  (?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|
@@ -150,8 +150,11 @@ module Twitter
150
150
  (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
151
151
  /iox
152
152
 
153
+ # This is used in Extractor for stricter t.co URL extraction
154
+ REGEXEN[:valid_tco_url] = /^https?:\/\/t\.co\/[a-z0-9]+/i
155
+
153
156
  # This is used in Extractor to filter out unwanted URLs.
154
- REGEXEN[:invalid_short_domain] = /^#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_ccTLD]}$/io
157
+ REGEXEN[:invalid_short_domain] = /\A#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_ccTLD]}\Z/io
155
158
 
156
159
  REGEXEN[:valid_port_number] = /[0-9]+/
157
160
 
@@ -169,6 +169,27 @@ describe Twitter::Extractor do
169
169
  @extractor.extract_urls("http://tld-too-short.x").should == []
170
170
  end
171
171
  end
172
+
173
+ describe "t.co URLS" do
174
+ TestUrls::TCO.each do |url|
175
+ it "should only extract the t.co URL from the URL #{url}" do
176
+ extracted_urls = @extractor.extract_urls(url)
177
+ extracted_urls.size.should == 1
178
+ extracted_url = extracted_urls.first
179
+ extracted_url.should_not == url
180
+ extracted_url.should == url[0...20]
181
+ end
182
+
183
+ it "should match the t.co URL from the URL #{url} when it's embedded in other text" do
184
+ text = "Sweet url: #{url} I found. #awesome"
185
+ extracted_urls = @extractor.extract_urls(text)
186
+ extracted_urls.size.should == 1
187
+ extracted_url = extracted_urls.first
188
+ extracted_url.should_not == url
189
+ extracted_url.should == url[0...20]
190
+ end
191
+ end
192
+ end
172
193
  end
173
194
 
174
195
  describe "urls with indices" do
@@ -200,6 +221,31 @@ describe Twitter::Extractor do
200
221
  @extractor.extract_urls_with_indices("http://tld-too-short.x").should == []
201
222
  end
202
223
  end
224
+
225
+ describe "t.co URLS" do
226
+ TestUrls::TCO.each do |url|
227
+ it "should only extract the t.co URL from the URL #{url} and adjust indices correctly" do
228
+ extracted_urls = @extractor.extract_urls_with_indices(url)
229
+ extracted_urls.size.should == 1
230
+ extracted_url = extracted_urls.first
231
+ extracted_url[:url].should_not include(url)
232
+ extracted_url[:url].should include(url[0...20])
233
+ extracted_url[:indices].first.should == 0
234
+ extracted_url[:indices].last.should == 20
235
+ end
236
+
237
+ it "should match the t.co URL from the URL #{url} when it's embedded in other text" do
238
+ text = "Sweet url: #{url} I found. #awesome"
239
+ extracted_urls = @extractor.extract_urls_with_indices(text)
240
+ extracted_urls.size.should == 1
241
+ extracted_url = extracted_urls.first
242
+ extracted_url[:url].should_not include(url)
243
+ extracted_url[:url].should include(url[0...20])
244
+ extracted_url[:indices].first.should == 11
245
+ extracted_url[:indices].last.should == 31
246
+ end
247
+ end
248
+ end
203
249
  end
204
250
 
205
251
  describe "hashtags" do
data/spec/test_urls.rb CHANGED
@@ -52,6 +52,29 @@ module TestUrls
52
52
  "http://twitt#{[0x202B].pack('U')}er.com",
53
53
  "http://twitt#{[0x202C].pack('U')}er.com",
54
54
  "http://twitt#{[0x202D].pack('U')}er.com",
55
- "http://twitt#{[0x202E].pack('U')}er.com",
55
+ "http://twitt#{[0x202E].pack('U')}er.com"
56
56
  ] unless defined?(TestUrls::INVALID)
57
+
58
+ TCO = [
59
+ "http://t.co/P53cv5yO!",
60
+ "http://t.co/fQJmiPGg***",
61
+ "http://t.co/pbY2NfTZ's",
62
+ "http://t.co/2vYHpAc5;",
63
+ "http://t.co/ulYGBYSo:",
64
+ "http://t.co/GeT4bSiw=win",
65
+ "http://t.co/8MkmHU0k+fun",
66
+ "http://t.co/TKLp64dY.yes,",
67
+ "http://t.co/8vuO27cI$$",
68
+ "http://t.co/rPYTvdA8/",
69
+ "http://t.co/WvtMw5ku%",
70
+ "http://t.co/8t7G3ddS#",
71
+ "http://t.co/nfHNJDV2/#!",
72
+ "http://t.co/gK6NOXHs[good]",
73
+ "http://t.co/dMrT0o1Y]bad",
74
+ "http://t.co/FNkPfmii-",
75
+ "http://t.co/sMgS3pjI_oh",
76
+ "http://t.co/F8Dq3Plb~",
77
+ "http://t.co/ivvH58vC&help",
78
+ "http://t.co/iUBL15zD|NZ5KYLQ8"
79
+ ] unless defined?(TestUrls::TCO)
57
80
  end
data/twitter-text.gemspec CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = "twitter-text"
5
- s.version = "1.4.13"
5
+ s.version = "1.4.14"
6
6
  s.authors = ["Matt Sanford", "Patrick Ewing", "Ben Cherry", "Britt Selvitelle",
7
7
  "Raffi Krikorian", "J.P. Cummins", "Yoshimasa Niwa", "Keita Fujii"]
8
8
  s.email = ["matt@twitter.com", "patrick.henry.ewing@gmail.com", "bcherry@gmail.com", "bs@brittspace.com",
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: twitter-text
3
3
  version: !ruby/object:Gem::Version
4
- hash: 29
4
+ hash: 27
5
5
  prerelease:
6
6
  segments:
7
7
  - 1
8
8
  - 4
9
- - 13
10
- version: 1.4.13
9
+ - 14
10
+ version: 1.4.14
11
11
  platform: ruby
12
12
  authors:
13
13
  - Matt Sanford
@@ -22,7 +22,7 @@ autorequire:
22
22
  bindir: bin
23
23
  cert_chain: []
24
24
 
25
- date: 2011-11-02 00:00:00 -07:00
25
+ date: 2011-12-01 00:00:00 -08:00
26
26
  default_executable:
27
27
  dependencies:
28
28
  - !ruby/object:Gem::Dependency