twitter-text-simpleidn 3.0.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gemtest +0 -0
- data/.gitignore +40 -0
- data/.gitmodules +3 -0
- data/.rspec +2 -0
- data/CHANGELOG.md +35 -0
- data/Gemfile +4 -0
- data/LICENSE +188 -0
- data/README.md +193 -0
- data/Rakefile +52 -0
- data/config/README.md +142 -0
- data/config/v1.json +8 -0
- data/config/v2.json +29 -0
- data/config/v3.json +30 -0
- data/lib/assets/tld_lib.yml +1571 -0
- data/lib/twitter-text.rb +29 -0
- data/lib/twitter-text/autolink.rb +453 -0
- data/lib/twitter-text/configuration.rb +68 -0
- data/lib/twitter-text/deprecation.rb +21 -0
- data/lib/twitter-text/emoji_regex.rb +27 -0
- data/lib/twitter-text/extractor.rb +388 -0
- data/lib/twitter-text/hash_helper.rb +27 -0
- data/lib/twitter-text/hit_highlighter.rb +92 -0
- data/lib/twitter-text/regex.rb +381 -0
- data/lib/twitter-text/rewriter.rb +69 -0
- data/lib/twitter-text/unicode.rb +31 -0
- data/lib/twitter-text/validation.rb +251 -0
- data/lib/twitter-text/weighted_range.rb +24 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/spec/autolinking_spec.rb +848 -0
- data/spec/configuration_spec.rb +136 -0
- data/spec/extractor_spec.rb +392 -0
- data/spec/hithighlighter_spec.rb +96 -0
- data/spec/regex_spec.rb +76 -0
- data/spec/rewriter_spec.rb +553 -0
- data/spec/spec_helper.rb +139 -0
- data/spec/test_urls.rb +90 -0
- data/spec/twitter_text_spec.rb +25 -0
- data/spec/unicode_spec.rb +35 -0
- data/spec/validation_spec.rb +87 -0
- data/test/conformance_test.rb +242 -0
- data/twitter-text.gemspec +35 -0
- metadata +229 -0
@@ -0,0 +1,136 @@
|
|
1
|
+
# Copyright 2018 Twitter, Inc.
|
2
|
+
# Licensed under the Apache License, Version 2.0
|
3
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
4
|
+
|
5
|
+
# encoding: utf-8
|
6
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
7
|
+
|
8
|
+
# Specs for Twitter::TwitterText::Configuration: rejection of malformed
# configuration data, the process-wide default configuration, and the
# attributes exposed by the configurations loaded from CONFIG_V1, CONFIG_V2
# and CONFIG_V3.
describe Twitter::TwitterText::Configuration do
  context "configuration" do
    context "with invalid data" do
      it "should raise an exception" do
        # The first range uses booleans for "end" and "weight" instead of integers.
        invalid_hash = Twitter::TwitterText::Configuration.parse_string("{\"version\":2,\"maxWeightedTweetLength\":280,\"scale\":100,\"defaultWeight\":200,\"transformedURLLength\":23,\"ranges\":[{\"start\":0,\"end\":true,\"weight\":false},{\"start\":8192,\"end\":8205,\"weight\":100},{\"start\":8208,\"end\":8223,\"weight\":100},{\"start\":8242,\"end\":8247,\"weight\":100}]}")
        expect { Twitter::TwitterText::Configuration.new(invalid_hash) }.to raise_error(ArgumentError)
      end
    end

    context "with defaults" do
      before do
        # default_configuration is shared, class-level state: remember the
        # current value so these examples do not leak the V2 default into
        # whichever examples run afterwards.
        @original_default_configuration = Twitter::TwitterText::Configuration.default_configuration
        Twitter::TwitterText::Configuration.default_configuration = Twitter::TwitterText::Configuration.configuration_from_file(Twitter::TwitterText::Configuration::CONFIG_V2)
      end

      after do
        Twitter::TwitterText::Configuration.default_configuration = @original_default_configuration
      end

      it "should define version constants" do
        expect(Twitter::TwitterText::Configuration.const_defined?(:CONFIG_V1)).to be true
        expect(Twitter::TwitterText::Configuration.const_defined?(:CONFIG_V2)).to be true
        expect(Twitter::TwitterText::Configuration.const_defined?(:CONFIG_V3)).to be true
      end

      it "should define a default configuration" do
        expect(Twitter::TwitterText::Configuration.default_configuration).to_not be_nil
        expect(Twitter::TwitterText::Configuration.default_configuration.version).to eq(2)
      end
    end

    context "with v1 configuration" do
      before do
        @config = Twitter::TwitterText::Configuration.configuration_from_file(Twitter::TwitterText::Configuration::CONFIG_V1)
      end

      it "should have a version" do
        expect(@config.version).to eq(1)
      end

      it "should have a max_weighted_tweet_length" do
        expect(@config.max_weighted_tweet_length).to eq(140)
      end

      it "should have a scale" do
        expect(@config.scale).to eq(1)
      end

      it "should have a default_weight" do
        expect(@config.default_weight).to eq(1)
      end

      it "should have a transformed_url_length" do
        expect(@config.transformed_url_length).to eq(23)
      end
    end

    context "with v2 configuration" do
      before do
        @config = Twitter::TwitterText::Configuration.configuration_from_file(Twitter::TwitterText::Configuration::CONFIG_V2)
      end

      it "should have a version" do
        expect(@config.version).to eq(2)
      end

      it "should have a max_weighted_tweet_length" do
        expect(@config.max_weighted_tweet_length).to eq(280)
      end

      it "should have a scale" do
        expect(@config.scale).to eq(100)
      end

      it "should have a default_weight" do
        expect(@config.default_weight).to eq(200)
      end

      it "should have a transformed_url_length" do
        expect(@config.transformed_url_length).to eq(23)
      end

      it "should have a configured range" do
        expect(@config.ranges).to be_kind_of(Array)
        expect(@config.ranges.count).to be > 0
        expect(@config.ranges[0]).to be_kind_of(Twitter::TwitterText::WeightedRange)
        # Only the first range is inspected; its fields must all be integers.
        weighted_range = @config.ranges[0]
        expect(weighted_range.start).to be_kind_of(Integer)
        expect(weighted_range.end).to be_kind_of(Integer)
        expect(weighted_range.weight).to be_kind_of(Integer)
      end
    end

    context "with v3 configuration" do
      before do
        @config = Twitter::TwitterText::Configuration.configuration_from_file(Twitter::TwitterText::Configuration::CONFIG_V3)
      end

      it "should have a version" do
        expect(@config.version).to eq(3)
      end

      it "should have a max_weighted_tweet_length" do
        expect(@config.max_weighted_tweet_length).to eq(280)
      end

      it "should have a scale" do
        expect(@config.scale).to eq(100)
      end

      it "should have a default_weight" do
        expect(@config.default_weight).to eq(200)
      end

      it "should have a transformed_url_length" do
        expect(@config.transformed_url_length).to eq(23)
      end

      it "should have a configured range" do
        expect(@config.ranges).to be_kind_of(Array)
        expect(@config.ranges.count).to be > 0
        expect(@config.ranges[0]).to be_kind_of(Twitter::TwitterText::WeightedRange)
        # Only the first range is inspected; its fields must all be integers.
        weighted_range = @config.ranges[0]
        expect(weighted_range.start).to be_kind_of(Integer)
        expect(weighted_range.end).to be_kind_of(Integer)
        expect(weighted_range.weight).to be_kind_of(Integer)
      end

      it "should support discounting emoji" do
        expect(@config.emoji_parsing_enabled).to be true
      end
    end
  end
end
|
@@ -0,0 +1,392 @@
|
|
1
|
+
# Copyright 2018 Twitter, Inc.
|
2
|
+
# Licensed under the Apache License, Version 2.0
|
3
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
4
|
+
|
5
|
+
# encoding: utf-8
|
6
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
7
|
+
|
8
|
+
# Bare host class for the Extractor mixin, so the examples below can call
# the module's extract_* methods on an ordinary instance.
class TestExtractor
  include(Twitter::TwitterText::Extractor)
end
|
11
|
+
|
12
|
+
# Specs for the Twitter::TwitterText::Extractor mixin, exercised through a
# TestExtractor instance: mention, reply, URL and hashtag extraction, each
# with and without character indices.
describe Twitter::TwitterText::Extractor do
  before do
    @extractor = TestExtractor.new
  end

  describe "mentions" do
    context "single screen name alone " do
      it "should be linked" do
        expect(@extractor.extract_mentioned_screen_names("@alice")).to be == ["alice"]
      end

      it "should be linked with _" do
        expect(@extractor.extract_mentioned_screen_names("@alice_adams")).to be == ["alice_adams"]
      end

      it "should be linked if numeric" do
        expect(@extractor.extract_mentioned_screen_names("@1234")).to be == ["1234"]
      end
    end

    context "multiple screen names" do
      it "should both be linked" do
        expect(@extractor.extract_mentioned_screen_names("@alice @bob")).to be == ["alice", "bob"]
      end
    end

    context "screen names embedded in text" do
      it "should be linked in Latin text" do
        expect(@extractor.extract_mentioned_screen_names("waiting for @alice to arrive")).to be == ["alice"]
      end

      it "should be linked in Japanese text" do
        expect(@extractor.extract_mentioned_screen_names("の@aliceに到着を待っている")).to be == ["alice"]
      end

      it "should ignore mentions preceded by !, @, #, $, %, & or *" do
        invalid_chars = ['!', '@', '#', '$', '%', '&', '*']
        invalid_chars.each do |c|
          expect(@extractor.extract_mentioned_screen_names("f#{c}@kn")).to be == []
        end
      end
    end

    it "should accept a block argument and call it in order" do
      # Each yielded name must match the head of the queue; an empty queue
      # at the end proves the block ran once per expected mention.
      needed = ["alice", "bob"]
      @extractor.extract_mentioned_screen_names("@alice @bob") do |sn|
        expect(sn).to be == needed.shift
      end
      expect(needed).to be == []
    end
  end

  describe "mentions with indices" do
    context "single screen name alone " do
      it "should be linked and the correct indices" do
        expect(@extractor.extract_mentioned_screen_names_with_indices("@alice")).to be == [{:screen_name => "alice", :indices => [0, 6]}]
      end

      it "should be linked with _ and the correct indices" do
        expect(@extractor.extract_mentioned_screen_names_with_indices("@alice_adams")).to be == [{:screen_name => "alice_adams", :indices => [0, 12]}]
      end

      it "should be linked if numeric and the correct indices" do
        expect(@extractor.extract_mentioned_screen_names_with_indices("@1234")).to be == [{:screen_name => "1234", :indices => [0, 5]}]
      end
    end

    context "multiple screen names" do
      it "should both be linked with the correct indices" do
        expect(@extractor.extract_mentioned_screen_names_with_indices("@alice @bob")).to be ==
          [{:screen_name => "alice", :indices => [0, 6]},
           {:screen_name => "bob", :indices => [7, 11]}]
      end

      it "should be linked with the correct indices even when repeated" do
        expect(@extractor.extract_mentioned_screen_names_with_indices("@alice @alice @bob")).to be ==
          [{:screen_name => "alice", :indices => [0, 6]},
           {:screen_name => "alice", :indices => [7, 13]},
           {:screen_name => "bob", :indices => [14, 18]}]
      end
    end

    context "screen names embedded in text" do
      it "should be linked in Latin text with the correct indices" do
        expect(@extractor.extract_mentioned_screen_names_with_indices("waiting for @alice to arrive")).to be == [{:screen_name => "alice", :indices => [12, 18]}]
      end

      it "should be linked in Japanese text with the correct indices" do
        expect(@extractor.extract_mentioned_screen_names_with_indices("の@aliceに到着を待っている")).to be == [{:screen_name => "alice", :indices => [1, 7]}]
      end
    end

    it "should accept a block argument and call it in order" do
      needed = [{:screen_name => "alice", :indices => [0, 6]}, {:screen_name => "bob", :indices => [7, 11]}]
      @extractor.extract_mentioned_screen_names_with_indices("@alice @bob") do |sn, start_index, end_index|
        data = needed.shift
        expect(sn).to be == data[:screen_name]
        expect(start_index).to be == data[:indices].first
        expect(end_index).to be == data[:indices].last
      end
      # Empty queue: the block was invoked for every expected mention.
      expect(needed).to be == []
    end

    it "should extract screen name in text with supplementary character" do
      # U+10400 lies outside the BMP; the expected indices count it as one position.
      expect(@extractor.extract_mentioned_screen_names_with_indices("#{[0x10400].pack('U')} @alice")).to be == [{:screen_name => "alice", :indices => [2, 8]}]
    end
  end

  describe "replies" do
    context "should be extracted from" do
      it "should extract from lone name" do
        expect(@extractor.extract_reply_screen_name("@alice")).to be == "alice"
      end

      it "should extract from the start" do
        expect(@extractor.extract_reply_screen_name("@alice reply text")).to be == "alice"
      end

      it "should extract preceded by a space" do
        expect(@extractor.extract_reply_screen_name(" @alice reply text")).to be == "alice"
      end

      it "should extract preceded by a full-width space" do
        expect(@extractor.extract_reply_screen_name("#{[0x3000].pack('U')}@alice reply text")).to be == "alice"
      end
    end

    context "should not be extracted from" do
      it "should not be extracted when preceded by text" do
        expect(@extractor.extract_reply_screen_name("reply @alice text")).to be == nil
      end

      it "should not be extracted when preceded by punctuation" do
        %w(. / _ - + # ! @).each do |punct|
          expect(@extractor.extract_reply_screen_name("#{punct}@alice text")).to be == nil
        end
      end
    end

    context "should accept a block argument" do
      it "should call the block on match" do
        # Collect what the block receives so the example fails when the
        # block is never invoked, instead of passing vacuously.
        yielded = []
        @extractor.extract_reply_screen_name("@alice") do |sn|
          yielded << sn
        end
        expect(yielded).to be == ["alice"]
      end

      it "should not call the block on no match" do
        calls = 0
        @extractor.extract_reply_screen_name("not a reply") do |_sn|
          calls += 1
        end
        expect(calls).to be == 0
      end
    end
  end

  describe "urls" do
    describe "matching URLS" do
      TestUrls::VALID.each do |url|
        it "should extract the URL #{url} and prefix it with a protocol if missing" do
          expect(@extractor.extract_urls(url).first).to include(url)
        end

        it "should match the URL #{url} when it's embedded in other text" do
          text = "Sweet url: #{url} I found. #awesome"
          expect(@extractor.extract_urls(text).first).to include(url)
        end
      end
    end

    describe "invalid URLS" do
      TestUrls::INVALID.each do |url|
        it "does not extract URL from #{url}" do
          expect(@extractor.extract_urls(url).first).to be nil
        end
      end
    end

    describe "t.co URLS" do
      TestUrls::TCO.each do |url|
        it "should only extract the t.co URL from the URL #{url}" do
          extracted_urls = @extractor.extract_urls(url)
          expect(extracted_urls.size).to be == 1
          extracted_url = extracted_urls.first
          # Only the first 20 characters of each fixture are expected back.
          expect(extracted_url).to_not be == url
          expect(extracted_url).to be == url[0...20]
        end

        it "should match the t.co URL from the URL #{url} when it's embedded in other text" do
          text = "Sweet url: #{url} I found. #awesome"
          extracted_urls = @extractor.extract_urls(text)
          expect(extracted_urls.size).to be == 1
          extracted_url = extracted_urls.first
          expect(extracted_url).to_not be == url
          expect(extracted_url).to be == url[0...20]
        end
      end
    end
  end

  describe "urls with indices" do
    describe "matching URLS" do
      TestUrls::VALID.each do |url|
        it "should extract the URL #{url} and prefix it with a protocol if missing" do
          extracted_urls = @extractor.extract_urls_with_indices(url)
          expect(extracted_urls.size).to be == 1
          extracted_url = extracted_urls.first
          expect(extracted_url[:url]).to include(url)
          expect(extracted_url[:indices].first).to be == 0
          expect(extracted_url[:indices].last).to be == url.chars.to_a.size
        end

        it "should match the URL #{url} when it's embedded in other text" do
          # "Sweet url: " is 11 characters long, hence the expected start index.
          text = "Sweet url: #{url} I found. #awesome"
          extracted_urls = @extractor.extract_urls_with_indices(text)
          expect(extracted_urls.size).to be == 1
          extracted_url = extracted_urls.first
          expect(extracted_url[:url]).to include(url)
          expect(extracted_url[:indices].first).to be == 11
          expect(extracted_url[:indices].last).to be == 11 + url.chars.to_a.size
        end
      end

      it "should extract URL in text with supplementary character" do
        # U+10400 lies outside the BMP; the expected indices count it as one position.
        expect(@extractor.extract_urls_with_indices("#{[0x10400].pack('U')} http://twitter.com")).to be == [{:url => "http://twitter.com", :indices => [2, 20]}]
      end
    end

    describe "invalid URLS" do
      it "does not link urls with invalid domains" do
        expect(@extractor.extract_urls_with_indices("http://tld-too-short.x")).to be == []
      end

      it "does not consider a long URL with protocol to be valid" do
        # maximum length of domain label is 32 chars.
        url = ("a" * 31) + "."
        url *= (Twitter::TwitterText::Extractor::MAX_URL_LENGTH / 32)
        url = "https://" + url + "com" # longer than 4096 (MAX_URL_LENGTH) chars
        expect(@extractor.is_valid_domain(url.length, url, true)).to be false
      end

      it "does not consider a long URL without protocol to be valid" do
        # maximum length of domain label is 32 chars.
        url = ("a" * 31) + "."
        url *= ((Twitter::TwitterText::Extractor::MAX_URL_LENGTH / 32) - 1)
        url = url + "com" # shorter than 4096 (MAX_URL_LENGTH) chars
        expect(@extractor.is_valid_domain(url.length, url, false)).to be true
        # Prepend one more label, shortened by the length of "https://";
        # the domain is then expected to be rejected.
        url = ("a" * (31 - "https://".length)) + "." + url
        expect(@extractor.is_valid_domain(url.length, url, false)).to be false
      end
    end

    describe "t.co URLS" do
      TestUrls::TCO.each do |url|
        it "should only extract the t.co URL from the URL #{url} and adjust indices correctly" do
          extracted_urls = @extractor.extract_urls_with_indices(url)
          expect(extracted_urls.size).to be == 1
          extracted_url = extracted_urls.first
          expect(extracted_url[:url]).to_not include(url)
          expect(extracted_url[:url]).to include(url[0...20])
          expect(extracted_url[:indices].first).to be == 0
          expect(extracted_url[:indices].last).to be == 20
        end

        it "should match the t.co URL from the URL #{url} when it's embedded in other text" do
          text = "Sweet url: #{url} I found. #awesome"
          extracted_urls = @extractor.extract_urls_with_indices(text)
          expect(extracted_urls.size).to be == 1
          extracted_url = extracted_urls.first
          expect(extracted_url[:url]).to_not include(url)
          expect(extracted_url[:url]).to include(url[0...20])
          expect(extracted_url[:indices].first).to be == 11
          expect(extracted_url[:indices].last).to be == 31
        end
      end
    end
  end

  describe "hashtags" do
    context "extracts latin/numeric hashtags" do
      %w(text text123 123text).each do |hashtag|
        it "should extract ##{hashtag}" do
          expect(@extractor.extract_hashtags("##{hashtag}")).to be == [hashtag]
        end

        it "should extract ##{hashtag} within text" do
          expect(@extractor.extract_hashtags("pre-text ##{hashtag} post-text")).to be == [hashtag]
        end
      end
    end

    context "international hashtags" do
      context "should allow accents" do
        %w(mañana café münchen).each do |hashtag|
          it "should extract ##{hashtag}" do
            expect(@extractor.extract_hashtags("##{hashtag}")).to be == [hashtag]
          end

          it "should extract ##{hashtag} within text" do
            expect(@extractor.extract_hashtags("pre-text ##{hashtag} post-text")).to be == [hashtag]
          end
        end

        it "should not allow the multiplication character" do
          expect(@extractor.extract_hashtags("#pre#{Twitter::TwitterText::Unicode::U00D7}post")).to be == ["pre"]
        end

        it "should not allow the division character" do
          expect(@extractor.extract_hashtags("#pre#{Twitter::TwitterText::Unicode::U00F7}post")).to be == ["pre"]
        end
      end

    end

    it "should not extract numeric hashtags" do
      expect(@extractor.extract_hashtags("#1234")).to be == []
    end

    it "should extract hashtag followed by punctuations" do
      expect(@extractor.extract_hashtags("#test1: #test2; #test3\"")).to be == ["test1", "test2", "test3"]
    end
  end

  describe "hashtags with indices" do
    # Asserts that +text+ yields exactly one hashtag equal to +hashtag+,
    # starting at +offset+ and ending at offset + hashtag length + 1
    # (the extra 1 accounts for the leading "#").
    def match_hashtag_in_text(hashtag, text, offset = 0)
      extracted_hashtags = @extractor.extract_hashtags_with_indices(text)
      expect(extracted_hashtags.size).to be == 1
      extracted_hashtag = extracted_hashtags.first
      expect(extracted_hashtag[:hashtag]).to be == hashtag
      expect(extracted_hashtag[:indices].first).to be == offset
      expect(extracted_hashtag[:indices].last).to be == offset + hashtag.chars.to_a.size + 1
    end

    # Asserts that no hashtag is extracted from +text+.
    def not_match_hashtag_in_text(text)
      extracted_hashtags = @extractor.extract_hashtags_with_indices(text)
      expect(extracted_hashtags.size).to be == 0
    end

    context "extracts latin/numeric hashtags" do
      %w(text text123 123text).each do |hashtag|
        it "should extract ##{hashtag}" do
          match_hashtag_in_text(hashtag, "##{hashtag}")
        end

        it "should extract ##{hashtag} within text" do
          match_hashtag_in_text(hashtag, "pre-text ##{hashtag} post-text", 9)
        end
      end
    end

    context "international hashtags" do
      context "should allow accents" do
        %w(mañana café münchen).each do |hashtag|
          it "should extract ##{hashtag}" do
            match_hashtag_in_text(hashtag, "##{hashtag}")
          end

          it "should extract ##{hashtag} within text" do
            match_hashtag_in_text(hashtag, "pre-text ##{hashtag} post-text", 9)
          end
        end

        it "should not allow the multiplication character" do
          match_hashtag_in_text("pre", "#pre#{[0xd7].pack('U')}post", 0)
        end

        it "should not allow the division character" do
          match_hashtag_in_text("pre", "#pre#{[0xf7].pack('U')}post", 0)
        end
      end
    end

    it "should not extract numeric hashtags" do
      not_match_hashtag_in_text("#1234")
    end

    it "should extract hashtag in text with supplementary character" do
      # U+10400 lies outside the BMP; the expected offset counts it as one position.
      match_hashtag_in_text("hashtag", "#{[0x10400].pack('U')} #hashtag", 2)
    end
  end
end
|