twitter-text 1.4.13 → 1.4.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/extractor.rb +5 -0
- data/lib/regex.rb +7 -4
- data/spec/extractor_spec.rb +46 -0
- data/spec/test_urls.rb +24 -1
- data/twitter-text.gemspec +1 -1
- metadata +4 -4
data/lib/extractor.rb
CHANGED
@@ -190,6 +190,11 @@ module Twitter
|
|
190
190
|
last_url[:indices][1] = end_position
|
191
191
|
end
|
192
192
|
else
|
193
|
+
# In the case of t.co URLs, don't allow additional path characters
|
194
|
+
if url =~ Twitter::Regex[:valid_tco_url]
|
195
|
+
url = $&
|
196
|
+
end_position = start_position + url.char_length
|
197
|
+
end
|
193
198
|
urls << {
|
194
199
|
:url => url,
|
195
200
|
:indices => [start_position, end_position]
|
data/lib/regex.rb
CHANGED
@@ -106,14 +106,14 @@ module Twitter
|
|
106
106
|
|
107
107
|
REGEXEN[:auto_link_hashtags] = /#{HASHTAG}/io
|
108
108
|
# Used in Extractor and Rewriter for final filtering
|
109
|
-
REGEXEN[:end_hashtag_match] =
|
109
|
+
REGEXEN[:end_hashtag_match] = /\A(?:[##]|:\/\/)/o
|
110
110
|
|
111
111
|
REGEXEN[:at_signs] = /[@@]/
|
112
112
|
REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
|
113
113
|
REGEXEN[:extract_mentions_or_lists] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?/o
|
114
114
|
REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
|
115
115
|
# Used in Extractor and Rewriter for final filtering
|
116
|
-
REGEXEN[:end_screen_name_match] =
|
116
|
+
REGEXEN[:end_screen_name_match] = /\A(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o
|
117
117
|
|
118
118
|
REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_]|^|RT:?)([@@]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?/o
|
119
119
|
REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\<\|:~\(|\}:o\{|:\-\[|\>o\<|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/
|
@@ -125,7 +125,7 @@ module Twitter
|
|
125
125
|
REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
|
126
126
|
REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
|
127
127
|
|
128
|
-
REGEXEN[:valid_gTLD] = /(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel)(?=[^a-z]|$))/i
|
128
|
+
REGEXEN[:valid_gTLD] = /(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|xxx)(?=[^a-z]|$))/i
|
129
129
|
REGEXEN[:valid_ccTLD] = %r{
|
130
130
|
(?:
|
131
131
|
(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|
|
@@ -150,8 +150,11 @@ module Twitter
|
|
150
150
|
(?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
|
151
151
|
/iox
|
152
152
|
|
153
|
+
# This is used in Extractor for stricter t.co URL extraction
|
154
|
+
REGEXEN[:valid_tco_url] = /^https?:\/\/t\.co\/[a-z0-9]+/i
|
155
|
+
|
153
156
|
# This is used in Extractor to filter out unwanted URLs.
|
154
|
-
REGEXEN[:invalid_short_domain] =
|
157
|
+
REGEXEN[:invalid_short_domain] = /\A#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_ccTLD]}\Z/io
|
155
158
|
|
156
159
|
REGEXEN[:valid_port_number] = /[0-9]+/
|
157
160
|
|
data/spec/extractor_spec.rb
CHANGED
@@ -169,6 +169,27 @@ describe Twitter::Extractor do
|
|
169
169
|
@extractor.extract_urls("http://tld-too-short.x").should == []
|
170
170
|
end
|
171
171
|
end
|
172
|
+
|
173
|
+
describe "t.co URLS" do
|
174
|
+
TestUrls::TCO.each do |url|
|
175
|
+
it "should only extract the t.co URL from the URL #{url}" do
|
176
|
+
extracted_urls = @extractor.extract_urls(url)
|
177
|
+
extracted_urls.size.should == 1
|
178
|
+
extracted_url = extracted_urls.first
|
179
|
+
extracted_url.should_not == url
|
180
|
+
extracted_url.should == url[0...20]
|
181
|
+
end
|
182
|
+
|
183
|
+
it "should match the t.co URL from the URL #{url} when it's embedded in other text" do
|
184
|
+
text = "Sweet url: #{url} I found. #awesome"
|
185
|
+
extracted_urls = @extractor.extract_urls(text)
|
186
|
+
extracted_urls.size.should == 1
|
187
|
+
extracted_url = extracted_urls.first
|
188
|
+
extracted_url.should_not == url
|
189
|
+
extracted_url.should == url[0...20]
|
190
|
+
end
|
191
|
+
end
|
192
|
+
end
|
172
193
|
end
|
173
194
|
|
174
195
|
describe "urls with indices" do
|
@@ -200,6 +221,31 @@ describe Twitter::Extractor do
|
|
200
221
|
@extractor.extract_urls_with_indices("http://tld-too-short.x").should == []
|
201
222
|
end
|
202
223
|
end
|
224
|
+
|
225
|
+
describe "t.co URLS" do
|
226
|
+
TestUrls::TCO.each do |url|
|
227
|
+
it "should only extract the t.co URL from the URL #{url} and adjust indices correctly" do
|
228
|
+
extracted_urls = @extractor.extract_urls_with_indices(url)
|
229
|
+
extracted_urls.size.should == 1
|
230
|
+
extracted_url = extracted_urls.first
|
231
|
+
extracted_url[:url].should_not include(url)
|
232
|
+
extracted_url[:url].should include(url[0...20])
|
233
|
+
extracted_url[:indices].first.should == 0
|
234
|
+
extracted_url[:indices].last.should == 20
|
235
|
+
end
|
236
|
+
|
237
|
+
it "should match the t.co URL from the URL #{url} when it's embedded in other text" do
|
238
|
+
text = "Sweet url: #{url} I found. #awesome"
|
239
|
+
extracted_urls = @extractor.extract_urls_with_indices(text)
|
240
|
+
extracted_urls.size.should == 1
|
241
|
+
extracted_url = extracted_urls.first
|
242
|
+
extracted_url[:url].should_not include(url)
|
243
|
+
extracted_url[:url].should include(url[0...20])
|
244
|
+
extracted_url[:indices].first.should == 11
|
245
|
+
extracted_url[:indices].last.should == 31
|
246
|
+
end
|
247
|
+
end
|
248
|
+
end
|
203
249
|
end
|
204
250
|
|
205
251
|
describe "hashtags" do
|
data/spec/test_urls.rb
CHANGED
@@ -52,6 +52,29 @@ module TestUrls
|
|
52
52
|
"http://twitt#{[0x202B].pack('U')}er.com",
|
53
53
|
"http://twitt#{[0x202C].pack('U')}er.com",
|
54
54
|
"http://twitt#{[0x202D].pack('U')}er.com",
|
55
|
-
"http://twitt#{[0x202E].pack('U')}er.com"
|
55
|
+
"http://twitt#{[0x202E].pack('U')}er.com"
|
56
56
|
] unless defined?(TestUrls::INVALID)
|
57
|
+
|
58
|
+
TCO = [
|
59
|
+
"http://t.co/P53cv5yO!",
|
60
|
+
"http://t.co/fQJmiPGg***",
|
61
|
+
"http://t.co/pbY2NfTZ's",
|
62
|
+
"http://t.co/2vYHpAc5;",
|
63
|
+
"http://t.co/ulYGBYSo:",
|
64
|
+
"http://t.co/GeT4bSiw=win",
|
65
|
+
"http://t.co/8MkmHU0k+fun",
|
66
|
+
"http://t.co/TKLp64dY.yes,",
|
67
|
+
"http://t.co/8vuO27cI$$",
|
68
|
+
"http://t.co/rPYTvdA8/",
|
69
|
+
"http://t.co/WvtMw5ku%",
|
70
|
+
"http://t.co/8t7G3ddS#",
|
71
|
+
"http://t.co/nfHNJDV2/#!",
|
72
|
+
"http://t.co/gK6NOXHs[good]",
|
73
|
+
"http://t.co/dMrT0o1Y]bad",
|
74
|
+
"http://t.co/FNkPfmii-",
|
75
|
+
"http://t.co/sMgS3pjI_oh",
|
76
|
+
"http://t.co/F8Dq3Plb~",
|
77
|
+
"http://t.co/ivvH58vC&help",
|
78
|
+
"http://t.co/iUBL15zD|NZ5KYLQ8"
|
79
|
+
] unless defined?(TestUrls::TCO)
|
57
80
|
end
|
data/twitter-text.gemspec
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = "twitter-text"
|
5
|
-
s.version = "1.4.13"
|
5
|
+
s.version = "1.4.14"
|
6
6
|
s.authors = ["Matt Sanford", "Patrick Ewing", "Ben Cherry", "Britt Selvitelle",
|
7
7
|
"Raffi Krikorian", "J.P. Cummins", "Yoshimasa Niwa", "Keita Fujii"]
|
8
8
|
s.email = ["matt@twitter.com", "patrick.henry.ewing@gmail.com", "bcherry@gmail.com", "bs@brittspace.com",
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twitter-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 27
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
8
|
- 4
|
9
|
-
- 13
|
10
|
-
version: 1.4.13
|
9
|
+
- 14
|
10
|
+
version: 1.4.14
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Matt Sanford
|
@@ -22,7 +22,7 @@ autorequire:
|
|
22
22
|
bindir: bin
|
23
23
|
cert_chain: []
|
24
24
|
|
25
|
-
date: 2011-
|
25
|
+
date: 2011-12-01 00:00:00 -08:00
|
26
26
|
default_executable:
|
27
27
|
dependencies:
|
28
28
|
- !ruby/object:Gem::Dependency
|