twitter-text-simpleidn 3.0.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,136 @@
1
+ # Copyright 2018 Twitter, Inc.
2
+ # Licensed under the Apache License, Version 2.0
3
+ # http://www.apache.org/licenses/LICENSE-2.0
4
+
5
+ # encoding: utf-8
6
+ require File.dirname(__FILE__) + '/spec_helper'
7
+
8
# Specs for Twitter::TwitterText::Configuration: rejection of malformed
# configuration data, presence of the versioned CONFIG_* constants, and the
# values exposed by each configuration file (v1, v2 and v3).
describe Twitter::TwitterText::Configuration do
  context "configuration" do
    context "with invalid data" do
      it "should raise an exception" do
        # The first range uses booleans for "end" and "weight"; the remaining
        # ranges are numeric.
        invalid_json = '{"version":2,"maxWeightedTweetLength":280,"scale":100,"defaultWeight":200,"transformedURLLength":23,"ranges":[{"start":0,"end":true,"weight":false},{"start":8192,"end":8205,"weight":100},{"start":8208,"end":8223,"weight":100},{"start":8242,"end":8247,"weight":100}]}'
        invalid_hash = described_class.parse_string(invalid_json)

        expect { described_class.new(invalid_hash) }.to raise_error(ArgumentError)
      end
    end

    context "with defaults" do
      before do
        # default_configuration is class-level state; remember the current
        # value so it can be put back once each example has run.
        @previous_default = described_class.default_configuration
        described_class.default_configuration =
          described_class.configuration_from_file(described_class::CONFIG_V2)
      end

      after do
        # Restore the saved value so the V2 override does not leak into
        # examples that run after this context.
        described_class.default_configuration = @previous_default
      end

      it "should define version constants" do
        expect(described_class.const_defined?(:CONFIG_V1)).to be true
        expect(described_class.const_defined?(:CONFIG_V2)).to be true
        expect(described_class.const_defined?(:CONFIG_V3)).to be true
      end

      it "should define a default configuration" do
        expect(described_class.default_configuration).not_to be_nil
        expect(described_class.default_configuration.version).to eq(2)
      end
    end

    context "with v1 configuration" do
      let(:config) { described_class.configuration_from_file(described_class::CONFIG_V1) }

      it "should have a version" do
        expect(config.version).to eq(1)
      end

      it "should have a max_weighted_tweet_length" do
        expect(config.max_weighted_tweet_length).to eq(140)
      end

      it "should have a scale" do
        expect(config.scale).to eq(1)
      end

      it "should have a default_weight" do
        expect(config.default_weight).to eq(1)
      end

      it "should have a transformed_url_length" do
        expect(config.transformed_url_length).to eq(23)
      end
    end

    context "with v2 configuration" do
      let(:config) { described_class.configuration_from_file(described_class::CONFIG_V2) }

      it "should have a version" do
        expect(config.version).to eq(2)
      end

      it "should have a max_weighted_tweet_length" do
        expect(config.max_weighted_tweet_length).to eq(280)
      end

      it "should have a scale" do
        expect(config.scale).to eq(100)
      end

      it "should have a default_weight" do
        expect(config.default_weight).to eq(200)
      end

      it "should have a transformed_url_length" do
        expect(config.transformed_url_length).to eq(23)
      end

      it "should have a configured range" do
        ranges = config.ranges
        expect(ranges).to be_kind_of(Array)
        expect(ranges.count).to be > 0

        first_range = ranges[0]
        expect(first_range).to be_kind_of(Twitter::TwitterText::WeightedRange)
        expect(first_range.start).to be_kind_of(Integer)
        expect(first_range.end).to be_kind_of(Integer)
        expect(first_range.weight).to be_kind_of(Integer)
      end
    end

    context "with v3 configuration" do
      let(:config) { described_class.configuration_from_file(described_class::CONFIG_V3) }

      it "should have a version" do
        expect(config.version).to eq(3)
      end

      it "should have a max_weighted_tweet_length" do
        expect(config.max_weighted_tweet_length).to eq(280)
      end

      it "should have a scale" do
        expect(config.scale).to eq(100)
      end

      it "should have a default_weight" do
        expect(config.default_weight).to eq(200)
      end

      it "should have a transformed_url_length" do
        expect(config.transformed_url_length).to eq(23)
      end

      it "should have a configured range" do
        ranges = config.ranges
        expect(ranges).to be_kind_of(Array)
        expect(ranges.count).to be > 0

        first_range = ranges[0]
        expect(first_range).to be_kind_of(Twitter::TwitterText::WeightedRange)
        expect(first_range.start).to be_kind_of(Integer)
        expect(first_range.end).to be_kind_of(Integer)
        expect(first_range.weight).to be_kind_of(Integer)
      end

      it "should support discounting emoji" do
        expect(config.emoji_parsing_enabled).to be true
      end
    end
  end
end
@@ -0,0 +1,392 @@
1
+ # Copyright 2018 Twitter, Inc.
2
+ # Licensed under the Apache License, Version 2.0
3
+ # http://www.apache.org/licenses/LICENSE-2.0
4
+
5
+ # encoding: utf-8
6
+ require File.dirname(__FILE__) + '/spec_helper'
7
+
8
# Minimal host class that mixes in Twitter::TwitterText::Extractor so the
# specs below can call the extractor methods on an instance.
class TestExtractor
  include Twitter::TwitterText::Extractor
end
11
+
12
# Specs for the Twitter::TwitterText::Extractor mixin, exercised through a
# TestExtractor instance: mentions, replies, URLs and hashtags, each with and
# without character indices.
describe Twitter::TwitterText::Extractor do
  let(:extractor) { TestExtractor.new }

  describe "mentions" do
    context "single screen name alone " do
      it "should be linked" do
        expect(extractor.extract_mentioned_screen_names("@alice")).to eq(["alice"])
      end

      it "should be linked with _" do
        expect(extractor.extract_mentioned_screen_names("@alice_adams")).to eq(["alice_adams"])
      end

      it "should be linked if numeric" do
        expect(extractor.extract_mentioned_screen_names("@1234")).to eq(["1234"])
      end
    end

    context "multiple screen names" do
      it "should both be linked" do
        expect(extractor.extract_mentioned_screen_names("@alice @bob")).to eq(["alice", "bob"])
      end
    end

    context "screen names embedded in text" do
      it "should be linked in Latin text" do
        expect(extractor.extract_mentioned_screen_names("waiting for @alice to arrive")).to eq(["alice"])
      end

      it "should be linked in Japanese text" do
        expect(extractor.extract_mentioned_screen_names("の@aliceに到着を待っている")).to eq(["alice"])
      end

      it "should ignore mentions preceded by !, @, #, $, %, & or *" do
        ['!', '@', '#', '$', '%', '&', '*'].each do |prefix|
          expect(extractor.extract_mentioned_screen_names("f#{prefix}@kn")).to eq([])
        end
      end
    end

    it "should accept a block arugment and call it in order" do
      # Collect what the block receives and compare once, so both the order and
      # the number of calls are checked.
      yielded = []
      extractor.extract_mentioned_screen_names("@alice @bob") do |sn|
        yielded << sn
      end
      expect(yielded).to eq(["alice", "bob"])
    end
  end

  describe "mentions with indices" do
    context "single screen name alone " do
      it "should be linked and the correct indices" do
        expect(extractor.extract_mentioned_screen_names_with_indices("@alice")).to eq(
          [{:screen_name => "alice", :indices => [0, 6]}]
        )
      end

      it "should be linked with _ and the correct indices" do
        expect(extractor.extract_mentioned_screen_names_with_indices("@alice_adams")).to eq(
          [{:screen_name => "alice_adams", :indices => [0, 12]}]
        )
      end

      it "should be linked if numeric and the correct indices" do
        expect(extractor.extract_mentioned_screen_names_with_indices("@1234")).to eq(
          [{:screen_name => "1234", :indices => [0, 5]}]
        )
      end
    end

    context "multiple screen names" do
      it "should both be linked with the correct indices" do
        expect(extractor.extract_mentioned_screen_names_with_indices("@alice @bob")).to eq(
          [{:screen_name => "alice", :indices => [0, 6]},
           {:screen_name => "bob", :indices => [7, 11]}]
        )
      end

      it "should be linked with the correct indices even when repeated" do
        expect(extractor.extract_mentioned_screen_names_with_indices("@alice @alice @bob")).to eq(
          [{:screen_name => "alice", :indices => [0, 6]},
           {:screen_name => "alice", :indices => [7, 13]},
           {:screen_name => "bob", :indices => [14, 18]}]
        )
      end
    end

    context "screen names embedded in text" do
      it "should be linked in Latin text with the correct indices" do
        expect(extractor.extract_mentioned_screen_names_with_indices("waiting for @alice to arrive")).to eq(
          [{:screen_name => "alice", :indices => [12, 18]}]
        )
      end

      it "should be linked in Japanese text with the correct indices" do
        expect(extractor.extract_mentioned_screen_names_with_indices("の@aliceに到着を待っている")).to eq(
          [{:screen_name => "alice", :indices => [1, 7]}]
        )
      end
    end

    it "should accept a block arugment and call it in order" do
      # Each yield is recorded as [screen_name, start_index, end_index].
      yielded = []
      extractor.extract_mentioned_screen_names_with_indices("@alice @bob") do |sn, start_index, end_index|
        yielded << [sn, start_index, end_index]
      end
      expect(yielded).to eq([["alice", 0, 6], ["bob", 7, 11]])
    end

    it "should extract screen name in text with supplementary character" do
      # U+10400 precedes the mention; the expected indices [2, 8] count it as
      # a single character.
      expect(extractor.extract_mentioned_screen_names_with_indices("#{[0x10400].pack('U')} @alice")).to eq(
        [{:screen_name => "alice", :indices => [2, 8]}]
      )
    end
  end

  describe "replies" do
    context "should be extracted from" do
      it "should extract from lone name" do
        expect(extractor.extract_reply_screen_name("@alice")).to eq("alice")
      end

      it "should extract from the start" do
        expect(extractor.extract_reply_screen_name("@alice reply text")).to eq("alice")
      end

      it "should extract preceded by a space" do
        expect(extractor.extract_reply_screen_name(" @alice reply text")).to eq("alice")
      end

      it "should extract preceded by a full-width space" do
        expect(extractor.extract_reply_screen_name("#{[0x3000].pack('U')}@alice reply text")).to eq("alice")
      end
    end

    context "should not be extracted from" do
      it "should not be extracted when preceded by text" do
        expect(extractor.extract_reply_screen_name("reply @alice text")).to be_nil
      end

      it "should not be extracted when preceded by puctuation" do
        %w(. / _ - + # ! @).each do |punct|
          expect(extractor.extract_reply_screen_name("#{punct}@alice text")).to be_nil
        end
      end
    end

    context "should accept a block arugment" do
      it "should call the block on match" do
        # Assert outside the block: an expectation placed only inside it would
        # pass vacuously if the block were never invoked.
        yielded = []
        extractor.extract_reply_screen_name("@alice") do |sn|
          yielded << sn
        end
        expect(yielded).to eq(["alice"])
      end

      it "should not call the block on no match" do
        calls = 0
        extractor.extract_reply_screen_name("not a reply") do |_sn|
          calls += 1
        end
        expect(calls).to eq(0)
      end
    end
  end

  describe "urls" do
    describe "matching URLS" do
      TestUrls::VALID.each do |url|
        it "should extract the URL #{url} and prefix it with a protocol if missing" do
          expect(extractor.extract_urls(url).first).to include(url)
        end

        it "should match the URL #{url} when it's embedded in other text" do
          text = "Sweet url: #{url} I found. #awesome"
          expect(extractor.extract_urls(text).first).to include(url)
        end
      end
    end

    describe "invalid URLS" do
      TestUrls::INVALID.each do |url|
        it "does not extract URL from #{url}" do
          expect(extractor.extract_urls(url).first).to be_nil
        end
      end
    end

    describe "t.co URLS" do
      TestUrls::TCO.each do |url|
        it "should only extract the t.co URL from the URL #{url}" do
          extracted_urls = extractor.extract_urls(url)
          expect(extracted_urls.size).to eq(1)

          # Only the first 20 characters of the fixture are expected back.
          extracted_url = extracted_urls.first
          expect(extracted_url).not_to eq(url)
          expect(extracted_url).to eq(url[0...20])
        end

        it "should match the t.co URL from the URL #{url} when it's embedded in other text" do
          text = "Sweet url: #{url} I found. #awesome"
          extracted_urls = extractor.extract_urls(text)
          expect(extracted_urls.size).to eq(1)

          extracted_url = extracted_urls.first
          expect(extracted_url).not_to eq(url)
          expect(extracted_url).to eq(url[0...20])
        end
      end
    end
  end

  describe "urls with indices" do
    describe "matching URLS" do
      TestUrls::VALID.each do |url|
        it "should extract the URL #{url} and prefix it with a protocol if missing" do
          extracted_urls = extractor.extract_urls_with_indices(url)
          expect(extracted_urls.size).to eq(1)

          extracted_url = extracted_urls.first
          expect(extracted_url[:url]).to include(url)
          expect(extracted_url[:indices].first).to eq(0)
          expect(extracted_url[:indices].last).to eq(url.length)
        end

        it "should match the URL #{url} when it's embedded in other text" do
          # "Sweet url: " is 11 characters long, hence the offset of 11.
          text = "Sweet url: #{url} I found. #awesome"
          extracted_urls = extractor.extract_urls_with_indices(text)
          expect(extracted_urls.size).to eq(1)

          extracted_url = extracted_urls.first
          expect(extracted_url[:url]).to include(url)
          expect(extracted_url[:indices].first).to eq(11)
          expect(extracted_url[:indices].last).to eq(11 + url.length)
        end
      end

      it "should extract URL in text with supplementary character" do
        expect(extractor.extract_urls_with_indices("#{[0x10400].pack('U')} http://twitter.com")).to eq(
          [{:url => "http://twitter.com", :indices => [2, 20]}]
        )
      end
    end

    describe "invalid URLS" do
      it "does not link urls with invalid domains" do
        expect(extractor.extract_urls_with_indices("http://tld-too-short.x")).to eq([])
      end

      it "does not consider a long URL with protocol to be valid" do
        # Each repeated chunk is 32 characters: a 31-character label plus the
        # trailing dot.
        chunk = ("a" * 31) + "."
        host = chunk * (Twitter::TwitterText::Extractor::MAX_URL_LENGTH / 32)
        url = "https://" + host + "com" # longer than 4096 (MAX_URL_LENGTH) chars
        expect(extractor.is_valid_domain(url.length, url, true)).to be false
      end

      it "does not consider a long URL without protocol to be valid" do
        # Each repeated chunk is 32 characters: a 31-character label plus the
        # trailing dot.
        chunk = ("a" * 31) + "."
        url = chunk * ((Twitter::TwitterText::Extractor::MAX_URL_LENGTH / 32) - 1)
        url = url + "com" # shorter than 4096 (MAX_URL_LENGTH) chars
        expect(extractor.is_valid_domain(url.length, url, false)).to be true

        # NOTE(review): the padding is sized using "https://".length, which
        # suggests the extractor adds a protocol's length to protocol-less
        # URLs before comparing with MAX_URL_LENGTH - confirm against
        # is_valid_domain.
        url = ("a" * (31 - "https://".length)) + "." + url
        expect(extractor.is_valid_domain(url.length, url, false)).to be false
      end
    end

    describe "t.co URLS" do
      TestUrls::TCO.each do |url|
        it "should only extract the t.co URL from the URL #{url} and adjust indices correctly" do
          extracted_urls = extractor.extract_urls_with_indices(url)
          expect(extracted_urls.size).to eq(1)

          extracted_url = extracted_urls.first
          expect(extracted_url[:url]).not_to include(url)
          expect(extracted_url[:url]).to include(url[0...20])
          expect(extracted_url[:indices].first).to eq(0)
          expect(extracted_url[:indices].last).to eq(20)
        end

        it "should match the t.co URL from the URL #{url} when it's embedded in other text" do
          text = "Sweet url: #{url} I found. #awesome"
          extracted_urls = extractor.extract_urls_with_indices(text)
          expect(extracted_urls.size).to eq(1)

          extracted_url = extracted_urls.first
          expect(extracted_url[:url]).not_to include(url)
          expect(extracted_url[:url]).to include(url[0...20])
          expect(extracted_url[:indices].first).to eq(11)
          expect(extracted_url[:indices].last).to eq(31)
        end
      end
    end
  end

  describe "hashtags" do
    context "extracts latin/numeric hashtags" do
      %w(text text123 123text).each do |hashtag|
        it "should extract ##{hashtag}" do
          expect(extractor.extract_hashtags("##{hashtag}")).to eq([hashtag])
        end

        it "should extract ##{hashtag} within text" do
          expect(extractor.extract_hashtags("pre-text ##{hashtag} post-text")).to eq([hashtag])
        end
      end
    end

    context "international hashtags" do
      context "should allow accents" do
        %w(mañana café münchen).each do |hashtag|
          it "should extract ##{hashtag}" do
            expect(extractor.extract_hashtags("##{hashtag}")).to eq([hashtag])
          end

          it "should extract ##{hashtag} within text" do
            expect(extractor.extract_hashtags("pre-text ##{hashtag} post-text")).to eq([hashtag])
          end
        end

        it "should not allow the multiplication character" do
          expect(extractor.extract_hashtags("#pre#{Twitter::TwitterText::Unicode::U00D7}post")).to eq(["pre"])
        end

        it "should not allow the division character" do
          expect(extractor.extract_hashtags("#pre#{Twitter::TwitterText::Unicode::U00F7}post")).to eq(["pre"])
        end
      end
    end

    it "should not extract numeric hashtags" do
      expect(extractor.extract_hashtags("#1234")).to eq([])
    end

    it "should extract hashtag followed by punctuations" do
      expect(extractor.extract_hashtags("#test1: #test2; #test3\"")).to eq(["test1", "test2", "test3"])
    end
  end

  describe "hashtags with indices" do
    # Asserts that +text+ yields exactly one hashtag equal to +hashtag+ whose
    # indices start at +offset+ and end at offset + hashtag length + 1 (the
    # extra 1 accounts for the leading "#").
    def match_hashtag_in_text(hashtag, text, offset = 0)
      extracted_hashtags = extractor.extract_hashtags_with_indices(text)
      expect(extracted_hashtags.size).to eq(1)

      extracted_hashtag = extracted_hashtags.first
      expect(extracted_hashtag[:hashtag]).to eq(hashtag)
      expect(extracted_hashtag[:indices].first).to eq(offset)
      expect(extracted_hashtag[:indices].last).to eq(offset + hashtag.length + 1)
    end

    # Asserts that no hashtag is extracted from +text+.
    def not_match_hashtag_in_text(text)
      extracted_hashtags = extractor.extract_hashtags_with_indices(text)
      expect(extracted_hashtags.size).to eq(0)
    end

    context "extracts latin/numeric hashtags" do
      %w(text text123 123text).each do |hashtag|
        it "should extract ##{hashtag}" do
          match_hashtag_in_text(hashtag, "##{hashtag}")
        end

        it "should extract ##{hashtag} within text" do
          match_hashtag_in_text(hashtag, "pre-text ##{hashtag} post-text", 9)
        end
      end
    end

    context "international hashtags" do
      context "should allow accents" do
        %w(mañana café münchen).each do |hashtag|
          it "should extract ##{hashtag}" do
            match_hashtag_in_text(hashtag, "##{hashtag}")
          end

          it "should extract ##{hashtag} within text" do
            match_hashtag_in_text(hashtag, "pre-text ##{hashtag} post-text", 9)
          end
        end

        it "should not allow the multiplication character" do
          match_hashtag_in_text("pre", "#pre#{[0xd7].pack('U')}post", 0)
        end

        it "should not allow the division character" do
          match_hashtag_in_text("pre", "#pre#{[0xf7].pack('U')}post", 0)
        end
      end
    end

    it "should not extract numeric hashtags" do
      not_match_hashtag_in_text("#1234")
    end

    it "should extract hashtag in text with supplementary character" do
      match_hashtag_in_text("hashtag", "#{[0x10400].pack('U')} #hashtag", 2)
    end
  end
end