twitter-text 1.4.13 → 1.4.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/extractor.rb CHANGED
@@ -190,6 +190,11 @@ module Twitter
190
190
  last_url[:indices][1] = end_position
191
191
  end
192
192
  else
193
+ # In the case of t.co URLs, don't allow additional path characters
194
+ if url =~ Twitter::Regex[:valid_tco_url]
195
+ url = $&
196
+ end_position = start_position + url.char_length
197
+ end
193
198
  urls << {
194
199
  :url => url,
195
200
  :indices => [start_position, end_position]
data/lib/regex.rb CHANGED
@@ -106,14 +106,14 @@ module Twitter
106
106
 
107
107
  REGEXEN[:auto_link_hashtags] = /#{HASHTAG}/io
108
108
  # Used in Extractor and Rewriter for final filtering
109
- REGEXEN[:end_hashtag_match] = /^(?:[##]|:\/\/)/o
109
+ REGEXEN[:end_hashtag_match] = /\A(?:[##]|:\/\/)/o
110
110
 
111
111
  REGEXEN[:at_signs] = /[@@]/
112
112
  REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
113
113
  REGEXEN[:extract_mentions_or_lists] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?/o
114
114
  REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
115
115
  # Used in Extractor and Rewriter for final filtering
116
- REGEXEN[:end_screen_name_match] = /^(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o
116
+ REGEXEN[:end_screen_name_match] = /\A(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o
117
117
 
118
118
  REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_]|^|RT:?)([@@]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?/o
119
119
  REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\&lt;\|:~\(|\}:o\{|:\-\[|\&gt;o\&lt;|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/
@@ -125,7 +125,7 @@ module Twitter
125
125
  REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
126
126
  REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
127
127
 
128
- REGEXEN[:valid_gTLD] = /(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel)(?=[^a-z]|$))/i
128
+ REGEXEN[:valid_gTLD] = /(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|xxx)(?=[^a-z]|$))/i
129
129
  REGEXEN[:valid_ccTLD] = %r{
130
130
  (?:
131
131
  (?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|
@@ -150,8 +150,11 @@ module Twitter
150
150
  (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
151
151
  /iox
152
152
 
153
+ # This is used in Extractor for stricter t.co URL extraction
154
+ REGEXEN[:valid_tco_url] = /^https?:\/\/t\.co\/[a-z0-9]+/i
155
+
153
156
  # This is used in Extractor to filter out unwanted URLs.
154
- REGEXEN[:invalid_short_domain] = /^#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_ccTLD]}$/io
157
+ REGEXEN[:invalid_short_domain] = /\A#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_ccTLD]}\Z/io
155
158
 
156
159
  REGEXEN[:valid_port_number] = /[0-9]+/
157
160
 
@@ -169,6 +169,27 @@ describe Twitter::Extractor do
169
169
  @extractor.extract_urls("http://tld-too-short.x").should == []
170
170
  end
171
171
  end
172
+
173
+ describe "t.co URLS" do
174
+ TestUrls::TCO.each do |url|
175
+ it "should only extract the t.co URL from the URL #{url}" do
176
+ extracted_urls = @extractor.extract_urls(url)
177
+ extracted_urls.size.should == 1
178
+ extracted_url = extracted_urls.first
179
+ extracted_url.should_not == url
180
+ extracted_url.should == url[0...20]
181
+ end
182
+
183
+ it "should match the t.co URL from the URL #{url} when it's embedded in other text" do
184
+ text = "Sweet url: #{url} I found. #awesome"
185
+ extracted_urls = @extractor.extract_urls(text)
186
+ extracted_urls.size.should == 1
187
+ extracted_url = extracted_urls.first
188
+ extracted_url.should_not == url
189
+ extracted_url.should == url[0...20]
190
+ end
191
+ end
192
+ end
172
193
  end
173
194
 
174
195
  describe "urls with indices" do
@@ -200,6 +221,31 @@ describe Twitter::Extractor do
200
221
  @extractor.extract_urls_with_indices("http://tld-too-short.x").should == []
201
222
  end
202
223
  end
224
+
225
+ describe "t.co URLS" do
226
+ TestUrls::TCO.each do |url|
227
+ it "should only extract the t.co URL from the URL #{url} and adjust indices correctly" do
228
+ extracted_urls = @extractor.extract_urls_with_indices(url)
229
+ extracted_urls.size.should == 1
230
+ extracted_url = extracted_urls.first
231
+ extracted_url[:url].should_not include(url)
232
+ extracted_url[:url].should include(url[0...20])
233
+ extracted_url[:indices].first.should == 0
234
+ extracted_url[:indices].last.should == 20
235
+ end
236
+
237
+ it "should match the t.co URL from the URL #{url} when it's embedded in other text" do
238
+ text = "Sweet url: #{url} I found. #awesome"
239
+ extracted_urls = @extractor.extract_urls_with_indices(text)
240
+ extracted_urls.size.should == 1
241
+ extracted_url = extracted_urls.first
242
+ extracted_url[:url].should_not include(url)
243
+ extracted_url[:url].should include(url[0...20])
244
+ extracted_url[:indices].first.should == 11
245
+ extracted_url[:indices].last.should == 31
246
+ end
247
+ end
248
+ end
203
249
  end
204
250
 
205
251
  describe "hashtags" do
data/spec/test_urls.rb CHANGED
@@ -52,6 +52,29 @@ module TestUrls
52
52
  "http://twitt#{[0x202B].pack('U')}er.com",
53
53
  "http://twitt#{[0x202C].pack('U')}er.com",
54
54
  "http://twitt#{[0x202D].pack('U')}er.com",
55
- "http://twitt#{[0x202E].pack('U')}er.com",
55
+ "http://twitt#{[0x202E].pack('U')}er.com"
56
56
  ] unless defined?(TestUrls::INVALID)
57
+
58
+ TCO = [
59
+ "http://t.co/P53cv5yO!",
60
+ "http://t.co/fQJmiPGg***",
61
+ "http://t.co/pbY2NfTZ's",
62
+ "http://t.co/2vYHpAc5;",
63
+ "http://t.co/ulYGBYSo:",
64
+ "http://t.co/GeT4bSiw=win",
65
+ "http://t.co/8MkmHU0k+fun",
66
+ "http://t.co/TKLp64dY.yes,",
67
+ "http://t.co/8vuO27cI$$",
68
+ "http://t.co/rPYTvdA8/",
69
+ "http://t.co/WvtMw5ku%",
70
+ "http://t.co/8t7G3ddS#",
71
+ "http://t.co/nfHNJDV2/#!",
72
+ "http://t.co/gK6NOXHs[good]",
73
+ "http://t.co/dMrT0o1Y]bad",
74
+ "http://t.co/FNkPfmii-",
75
+ "http://t.co/sMgS3pjI_oh",
76
+ "http://t.co/F8Dq3Plb~",
77
+ "http://t.co/ivvH58vC&help",
78
+ "http://t.co/iUBL15zD|NZ5KYLQ8"
79
+ ] unless defined?(TestUrls::TCO)
57
80
  end
data/twitter-text.gemspec CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = "twitter-text"
5
- s.version = "1.4.13"
5
+ s.version = "1.4.14"
6
6
  s.authors = ["Matt Sanford", "Patrick Ewing", "Ben Cherry", "Britt Selvitelle",
7
7
  "Raffi Krikorian", "J.P. Cummins", "Yoshimasa Niwa", "Keita Fujii"]
8
8
  s.email = ["matt@twitter.com", "patrick.henry.ewing@gmail.com", "bcherry@gmail.com", "bs@brittspace.com",
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: twitter-text
3
3
  version: !ruby/object:Gem::Version
4
- hash: 29
4
+ hash: 27
5
5
  prerelease:
6
6
  segments:
7
7
  - 1
8
8
  - 4
9
- - 13
10
- version: 1.4.13
9
+ - 14
10
+ version: 1.4.14
11
11
  platform: ruby
12
12
  authors:
13
13
  - Matt Sanford
@@ -22,7 +22,7 @@ autorequire:
22
22
  bindir: bin
23
23
  cert_chain: []
24
24
 
25
- date: 2011-11-02 00:00:00 -07:00
25
+ date: 2011-12-01 00:00:00 -08:00
26
26
  default_executable:
27
27
  dependencies:
28
28
  - !ruby/object:Gem::Dependency