twitter-text-kow 1.3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gemtest +0 -0
- data/.gitignore +40 -0
- data/.gitmodules +3 -0
- data/.rspec +2 -0
- data/CHANGELOG.md +44 -0
- data/Gemfile +4 -0
- data/LICENSE +188 -0
- data/README.md +193 -0
- data/Rakefile +52 -0
- data/config/README.md +142 -0
- data/config/v1.json +8 -0
- data/config/v2.json +29 -0
- data/config/v3.json +30 -0
- data/lib/assets/tld_lib.yml +1577 -0
- data/lib/twitter-text/autolink.rb +455 -0
- data/lib/twitter-text/configuration.rb +68 -0
- data/lib/twitter-text/deprecation.rb +21 -0
- data/lib/twitter-text/emoji_regex.rb +27 -0
- data/lib/twitter-text/extractor.rb +388 -0
- data/lib/twitter-text/hash_helper.rb +27 -0
- data/lib/twitter-text/hit_highlighter.rb +92 -0
- data/lib/twitter-text/regex.rb +381 -0
- data/lib/twitter-text/rewriter.rb +69 -0
- data/lib/twitter-text/unicode.rb +31 -0
- data/lib/twitter-text/validation.rb +251 -0
- data/lib/twitter-text/weighted_range.rb +24 -0
- data/lib/twitter-text.rb +29 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/spec/autolinking_spec.rb +858 -0
- data/spec/configuration_spec.rb +136 -0
- data/spec/extractor_spec.rb +392 -0
- data/spec/hithighlighter_spec.rb +96 -0
- data/spec/regex_spec.rb +76 -0
- data/spec/rewriter_spec.rb +553 -0
- data/spec/spec_helper.rb +139 -0
- data/spec/test_urls.rb +90 -0
- data/spec/twitter_text_spec.rb +25 -0
- data/spec/unicode_spec.rb +35 -0
- data/spec/validation_spec.rb +87 -0
- data/test/conformance_test.rb +242 -0
- data/twitter-text.gemspec +35 -0
- metadata +228 -0
@@ -0,0 +1,136 @@
|
|
1
|
+
# Copyright 2018 Twitter, Inc.
|
2
|
+
# Licensed under the Apache License, Version 2.0
|
3
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
4
|
+
|
5
|
+
# encoding: utf-8
|
6
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
7
|
+
|
8
|
+
describe Twitter::TwitterText::Configuration do
|
9
|
+
context "configuration" do
|
10
|
+
context "with invalid data" do
  # A v2-shaped payload whose first range carries booleans in its "end" and
  # "weight" fields; building a Configuration from it must be rejected.
  let(:malformed_json) do
    '{"version":2,"maxWeightedTweetLength":280,"scale":100,"defaultWeight":200,"transformedURLLength":23,"ranges":[{"start":0,"end":true,"weight":false},{"start":8192,"end":8205,"weight":100},{"start":8208,"end":8223,"weight":100},{"start":8242,"end":8247,"weight":100}]}'
  end

  it "should raise an exception" do
    invalid_hash = Twitter::TwitterText::Configuration.parse_string(malformed_json)

    expect do
      Twitter::TwitterText::Configuration.new(invalid_hash)
    end.to raise_error(ArgumentError)
  end
end
|
16
|
+
|
17
|
+
context "with defaults" do
  # Install the v2 configuration file as the default before every example.
  before(:each) do
    config_class = Twitter::TwitterText::Configuration
    config_class.default_configuration = config_class.configuration_from_file(config_class::CONFIG_V2)
  end

  it "should define version constants" do
    [:CONFIG_V1, :CONFIG_V2, :CONFIG_V3].each do |constant_name|
      expect(Twitter::TwitterText::Configuration.const_defined?(constant_name)).to be true
    end
  end

  it "should define a default configuration" do
    expect(Twitter::TwitterText::Configuration.default_configuration).not_to be_nil
    expect(Twitter::TwitterText::Configuration.default_configuration.version).to eq(2)
  end
end
|
33
|
+
|
34
|
+
context "with v1 configuration" do
  before(:each) do
    @config = Twitter::TwitterText::Configuration.configuration_from_file(
      Twitter::TwitterText::Configuration::CONFIG_V1
    )
  end

  # Reader name => value expected from the v1 configuration file.
  # One example is generated per entry, in this order.
  {
    :version => 1,
    :max_weighted_tweet_length => 140,
    :scale => 1,
    :default_weight => 1,
    :transformed_url_length => 23
  }.each do |reader, expected|
    it "should have a #{reader}" do
      expect(@config.public_send(reader)).to eq(expected)
    end
  end
end
|
59
|
+
|
60
|
+
context "with v2 configuration" do
  before(:each) do
    @config = Twitter::TwitterText::Configuration.configuration_from_file(
      Twitter::TwitterText::Configuration::CONFIG_V2
    )
  end

  # Reader name => value expected from the v2 configuration file.
  # One example is generated per entry, in this order.
  {
    :version => 2,
    :max_weighted_tweet_length => 280,
    :scale => 100,
    :default_weight => 200,
    :transformed_url_length => 23
  }.each do |reader, expected|
    it "should have a #{reader}" do
      expect(@config.public_send(reader)).to eq(expected)
    end
  end

  it "should have a configured range" do
    expect(@config.ranges).to be_a(Array)
    expect(@config.ranges.count).to be > 0
    expect(@config.ranges[0]).to be_a(Twitter::TwitterText::WeightedRange)

    # The first range must expose integer bounds and an integer weight.
    first_range = @config.ranges[0]
    [:start, :end, :weight].each do |field|
      expect(first_range.public_send(field)).to be_a(Integer)
    end
  end
end
|
95
|
+
|
96
|
+
context "with v3 configuration" do
  before(:each) do
    @config = Twitter::TwitterText::Configuration.configuration_from_file(
      Twitter::TwitterText::Configuration::CONFIG_V3
    )
  end

  # Reader name => value expected from the v3 configuration file.
  # One example is generated per entry, in this order.
  {
    :version => 3,
    :max_weighted_tweet_length => 280,
    :scale => 100,
    :default_weight => 200,
    :transformed_url_length => 23
  }.each do |reader, expected|
    it "should have a #{reader}" do
      expect(@config.public_send(reader)).to eq(expected)
    end
  end

  it "should have a configured range" do
    expect(@config.ranges).to be_a(Array)
    expect(@config.ranges.count).to be > 0
    expect(@config.ranges[0]).to be_a(Twitter::TwitterText::WeightedRange)

    # The first range must expose integer bounds and an integer weight.
    first_range = @config.ranges[0]
    [:start, :end, :weight].each do |field|
      expect(first_range.public_send(field)).to be_a(Integer)
    end
  end

  it "should support discounting emoji" do
    expect(@config.emoji_parsing_enabled).to be true
  end
end
|
135
|
+
end
|
136
|
+
end
|
@@ -0,0 +1,392 @@
|
|
1
|
+
# Copyright 2018 Twitter, Inc.
|
2
|
+
# Licensed under the Apache License, Version 2.0
|
3
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
4
|
+
|
5
|
+
# encoding: utf-8
|
6
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
7
|
+
|
8
|
+
# Bare host class for the Extractor mixin, so the module's methods can be
# called on an instance.
class TestExtractor
  include(Twitter::TwitterText::Extractor)
end
|
11
|
+
|
12
|
+
describe Twitter::TwitterText::Extractor do
|
13
|
+
# Build a fresh extractor for every example.
before(:each) { @extractor = TestExtractor.new }
|
16
|
+
|
17
|
+
describe "mentions" do
  # Each example passes raw text to extract_mentioned_screen_names and checks
  # the returned list of screen names (without the leading "@").
  context "single screen name alone " do
    it "should be linked" do
      expect(@extractor.extract_mentioned_screen_names("@alice")).to eq(["alice"])
    end

    it "should be linked with _" do
      expect(@extractor.extract_mentioned_screen_names("@alice_adams")).to eq(["alice_adams"])
    end

    it "should be linked if numeric" do
      expect(@extractor.extract_mentioned_screen_names("@1234")).to eq(["1234"])
    end
  end

  context "multiple screen names" do
    it "should both be linked" do
      expect(@extractor.extract_mentioned_screen_names("@alice @bob")).to eq(["alice", "bob"])
    end
  end

  context "screen names embedded in text" do
    it "should be linked in Latin text" do
      expect(@extractor.extract_mentioned_screen_names("waiting for @alice to arrive")).to eq(["alice"])
    end

    it "should be linked in Japanese text" do
      expect(@extractor.extract_mentioned_screen_names("の@aliceに到着を待っている")).to eq(["alice"])
    end

    it "should ignore mentions preceded by !, @, #, $, %, & or *" do
      %w[! @ # $ % & *].each do |prefix|
        expect(@extractor.extract_mentioned_screen_names("f#{prefix}@kn")).to eq([])
      end
    end
  end

  it "should accept a block argument and call it in order" do
    # Record what the block receives, then compare the whole sequence so both
    # the order and the number of invocations are checked.
    yielded = []
    @extractor.extract_mentioned_screen_names("@alice @bob") { |sn| yielded << sn }
    expect(yielded).to eq(["alice", "bob"])
  end
end
|
63
|
+
|
64
|
+
describe "mentions with indices" do
  # Each example checks the array of {:screen_name, :indices} hashes returned
  # by extract_mentioned_screen_names_with_indices. The expected indices span
  # the mention including its leading "@".
  context "single screen name alone " do
    it "should be linked and the correct indices" do
      expect(@extractor.extract_mentioned_screen_names_with_indices("@alice")).to eq(
        [{:screen_name => "alice", :indices => [0, 6]}]
      )
    end

    it "should be linked with _ and the correct indices" do
      expect(@extractor.extract_mentioned_screen_names_with_indices("@alice_adams")).to eq(
        [{:screen_name => "alice_adams", :indices => [0, 12]}]
      )
    end

    it "should be linked if numeric and the correct indices" do
      expect(@extractor.extract_mentioned_screen_names_with_indices("@1234")).to eq(
        [{:screen_name => "1234", :indices => [0, 5]}]
      )
    end
  end

  context "multiple screen names" do
    it "should both be linked with the correct indices" do
      expect(@extractor.extract_mentioned_screen_names_with_indices("@alice @bob")).to eq(
        [{:screen_name => "alice", :indices => [0, 6]},
         {:screen_name => "bob", :indices => [7, 11]}]
      )
    end

    it "should be linked with the correct indices even when repeated" do
      expect(@extractor.extract_mentioned_screen_names_with_indices("@alice @alice @bob")).to eq(
        [{:screen_name => "alice", :indices => [0, 6]},
         {:screen_name => "alice", :indices => [7, 13]},
         {:screen_name => "bob", :indices => [14, 18]}]
      )
    end
  end

  context "screen names embedded in text" do
    it "should be linked in Latin text with the correct indices" do
      expect(@extractor.extract_mentioned_screen_names_with_indices("waiting for @alice to arrive")).to eq(
        [{:screen_name => "alice", :indices => [12, 18]}]
      )
    end

    it "should be linked in Japanese text with the correct indices" do
      expect(@extractor.extract_mentioned_screen_names_with_indices("の@aliceに到着を待っている")).to eq(
        [{:screen_name => "alice", :indices => [1, 7]}]
      )
    end
  end

  it "should accept a block argument and call it in order" do
    # Record every (screen name, start, end) triple the block receives, then
    # compare the whole sequence so order and call count are both checked.
    yielded = []
    @extractor.extract_mentioned_screen_names_with_indices("@alice @bob") do |sn, start_index, end_index|
      yielded << [sn, start_index, end_index]
    end
    expect(yielded).to eq([["alice", 0, 6], ["bob", 7, 11]])
  end

  it "should extract screen name in text with supplementary character" do
    # U+10400 lies outside the Basic Multilingual Plane; the expected indices
    # count it as a single character.
    expect(@extractor.extract_mentioned_screen_names_with_indices("#{[0x10400].pack('U')} @alice")).to eq(
      [{:screen_name => "alice", :indices => [2, 8]}]
    )
  end
end
|
119
|
+
|
120
|
+
describe "replies" do
  # extract_reply_screen_name is expected to return the screen name only when
  # the mention opens the text (optionally after whitespace), and nil otherwise.
  context "should be extracted from" do
    it "should extract from lone name" do
      expect(@extractor.extract_reply_screen_name("@alice")).to eq("alice")
    end

    it "should extract from the start" do
      expect(@extractor.extract_reply_screen_name("@alice reply text")).to eq("alice")
    end

    it "should extract preceded by a space" do
      expect(@extractor.extract_reply_screen_name(" @alice reply text")).to eq("alice")
    end

    it "should extract preceded by a full-width space" do
      expect(@extractor.extract_reply_screen_name("#{[0x3000].pack('U')}@alice reply text")).to eq("alice")
    end
  end

  context "should not be extracted from" do
    it "should not be extracted when preceded by text" do
      expect(@extractor.extract_reply_screen_name("reply @alice text")).to be_nil
    end

    it "should not be extracted when preceded by punctuation" do
      %w(. / _ - + # ! @).each do |punct|
        expect(@extractor.extract_reply_screen_name("#{punct}@alice text")).to be_nil
      end
    end
  end

  context "should accept a block argument" do
    it "should call the block on match" do
      # Collect what the block receives and assert outside it; an assertion
      # placed only inside the block would pass even if the block never ran.
      yielded = []
      @extractor.extract_reply_screen_name("@alice") { |sn| yielded << sn }
      expect(yielded).to eq(["alice"])
    end

    it "should not call the block on no match" do
      calls = 0
      @extractor.extract_reply_screen_name("not a reply") { |_sn| calls += 1 }
      expect(calls).to eq(0)
    end
  end
end
|
167
|
+
|
168
|
+
describe "urls" do
  # Examples are generated from the TestUrls fixture lists, two per URL.
  describe "matching URLS" do
    TestUrls::VALID.each do |url|
      it "should extract the URL #{url} and prefix it with a protocol if missing" do
        first_match = @extractor.extract_urls(url).first
        expect(first_match).to include(url)
      end

      it "should match the URL #{url} when it's embedded in other text" do
        text = "Sweet url: #{url} I found. #awesome"
        first_match = @extractor.extract_urls(text).first
        expect(first_match).to include(url)
      end
    end
  end

  describe "invalid URLS" do
    TestUrls::INVALID.each do |url|
      it "does not extract URL from #{url}" do
        expect(@extractor.extract_urls(url).first).to be_nil
      end
    end
  end

  describe "t.co URLS" do
    TestUrls::TCO.each do |url|
      it "should only extract the t.co URL from the URL #{url}" do
        matches = @extractor.extract_urls(url)
        expect(matches.size).to eq(1)

        # Only the first 20 characters of the fixture are expected back.
        match = matches.first
        expect(match).not_to eq(url)
        expect(match).to eq(url[0, 20])
      end

      it "should match the t.co URL from the URL #{url} when it's embedded in other text" do
        text = "Sweet url: #{url} I found. #awesome"
        matches = @extractor.extract_urls(text)
        expect(matches.size).to eq(1)

        match = matches.first
        expect(match).not_to eq(url)
        expect(match).to eq(url[0, 20])
      end
    end
  end
end
|
211
|
+
|
212
|
+
describe "urls with indices" do
  describe "matching URLS" do
    TestUrls::VALID.each do |url|
      it "should extract the URL #{url} and prefix it with a protocol if missing" do
        matches = @extractor.extract_urls_with_indices(url)
        expect(matches.size).to eq(1)

        match = matches.first
        expect(match[:url]).to include(url)
        expect(match[:indices].first).to eq(0)
        expect(match[:indices].last).to eq(url.length)
      end

      it "should match the URL #{url} when it's embedded in other text" do
        # The "Sweet url: " prefix is 11 characters long, hence the offset.
        text = "Sweet url: #{url} I found. #awesome"
        matches = @extractor.extract_urls_with_indices(text)
        expect(matches.size).to eq(1)

        match = matches.first
        expect(match[:url]).to include(url)
        expect(match[:indices].first).to eq(11)
        expect(match[:indices].last).to eq(11 + url.length)
      end
    end

    it "should extract URL in text with supplementary character" do
      text = "#{[0x10400].pack('U')} http://twitter.com"
      expect(@extractor.extract_urls_with_indices(text)).to eq(
        [{:url => "http://twitter.com", :indices => [2, 20]}]
      )
    end
  end

  describe "invalid URLS" do
    it "does not link urls with invalid domains" do
      expect(@extractor.extract_urls_with_indices("http://tld-too-short.x")).to eq([])
    end

    it "does not consider a long URL with protocol to be valid" do
      # Each repeated segment is 31 "a" characters plus a dot (32 chars).
      segment = ("a" * 31) + "."
      repeats = Twitter::TwitterText::Extractor::MAX_URL_LENGTH / 32
      # longer than 4096 (MAX_URL_LENGTH) chars
      url = "https://" + (segment * repeats) + "com"
      expect(@extractor.is_valid_domain(url.length, url, true)).to be false
    end

    it "does not consider a long URL without protocol to be valid" do
      # Each repeated segment is 31 "a" characters plus a dot (32 chars).
      segment = ("a" * 31) + "."
      repeats = (Twitter::TwitterText::Extractor::MAX_URL_LENGTH / 32) - 1
      # shorter than 4096 (MAX_URL_LENGTH) chars
      url = (segment * repeats) + "com"
      expect(@extractor.is_valid_domain(url.length, url, false)).to be true

      # Prepend one more label, shortened by the length of "https://".
      url = ("a" * (31 - "https://".length)) + "." + url
      expect(@extractor.is_valid_domain(url.length, url, false)).to be false
    end
  end

  describe "t.co URLS" do
    TestUrls::TCO.each do |url|
      it "should only extract the t.co URL from the URL #{url} and adjust indices correctly" do
        matches = @extractor.extract_urls_with_indices(url)
        expect(matches.size).to eq(1)

        match = matches.first
        expect(match[:url]).not_to include(url)
        expect(match[:url]).to include(url[0, 20])
        expect(match[:indices].first).to eq(0)
        expect(match[:indices].last).to eq(20)
      end

      it "should match the t.co URL from the URL #{url} when it's embedded in other text" do
        text = "Sweet url: #{url} I found. #awesome"
        matches = @extractor.extract_urls_with_indices(text)
        expect(matches.size).to eq(1)

        match = matches.first
        expect(match[:url]).not_to include(url)
        expect(match[:url]).to include(url[0, 20])
        expect(match[:indices].first).to eq(11)
        expect(match[:indices].last).to eq(31)
      end
    end
  end
end
|
289
|
+
|
290
|
+
describe "hashtags" do
  # extract_hashtags is expected to return the tag text without the leading "#".
  context "extracts latin/numeric hashtags" do
    %w(text text123 123text).each do |hashtag|
      it "should extract ##{hashtag}" do
        extracted = @extractor.extract_hashtags("##{hashtag}")
        expect(extracted).to eq([hashtag])
      end

      it "should extract ##{hashtag} within text" do
        extracted = @extractor.extract_hashtags("pre-text ##{hashtag} post-text")
        expect(extracted).to eq([hashtag])
      end
    end
  end

  context "international hashtags" do
    context "should allow accents" do
      %w(mañana café münchen).each do |hashtag|
        it "should extract ##{hashtag}" do
          extracted = @extractor.extract_hashtags("##{hashtag}")
          expect(extracted).to eq([hashtag])
        end

        it "should extract ##{hashtag} within text" do
          extracted = @extractor.extract_hashtags("pre-text ##{hashtag} post-text")
          expect(extracted).to eq([hashtag])
        end
      end

      # The two examples below expect the tag to stop at the inserted character.
      it "should not allow the multiplication character" do
        text = "#pre#{Twitter::TwitterText::Unicode::U00D7}post"
        expect(@extractor.extract_hashtags(text)).to eq(["pre"])
      end

      it "should not allow the division character" do
        text = "#pre#{Twitter::TwitterText::Unicode::U00F7}post"
        expect(@extractor.extract_hashtags(text)).to eq(["pre"])
      end
    end
  end

  it "should not extract numeric hashtags" do
    expect(@extractor.extract_hashtags("#1234")).to eq([])
  end

  it "should extract hashtag followed by punctuations" do
    text = "#test1: #test2; #test3\""
    expect(@extractor.extract_hashtags(text)).to eq(["test1", "test2", "test3"])
  end
end
|
334
|
+
|
335
|
+
describe "hashtags with indices" do
  # Asserts that +text+ yields exactly one hashtag equal to +hashtag+, whose
  # indices start at +offset+ and end at offset + tag length + 1 (the extra 1
  # covers the leading "#").
  def match_hashtag_in_text(hashtag, text, offset = 0)
    found = @extractor.extract_hashtags_with_indices(text)
    expect(found.size).to eq(1)

    entity = found.first
    expect(entity[:hashtag]).to eq(hashtag)
    expect(entity[:indices].first).to eq(offset)
    expect(entity[:indices].last).to eq(offset + hashtag.length + 1)
  end

  # Asserts that +text+ yields no hashtags at all.
  def not_match_hashtag_in_text(text)
    found = @extractor.extract_hashtags_with_indices(text)
    expect(found.size).to eq(0)
  end

  context "extracts latin/numeric hashtags" do
    %w(text text123 123text).each do |hashtag|
      it "should extract ##{hashtag}" do
        match_hashtag_in_text(hashtag, "##{hashtag}")
      end

      it "should extract ##{hashtag} within text" do
        # "pre-text " is 9 characters long, hence the offset.
        match_hashtag_in_text(hashtag, "pre-text ##{hashtag} post-text", 9)
      end
    end
  end

  context "international hashtags" do
    context "should allow accents" do
      %w(mañana café münchen).each do |hashtag|
        it "should extract ##{hashtag}" do
          match_hashtag_in_text(hashtag, "##{hashtag}")
        end

        it "should extract ##{hashtag} within text" do
          match_hashtag_in_text(hashtag, "pre-text ##{hashtag} post-text", 9)
        end
      end

      it "should not allow the multiplication character" do
        multiplication_sign = [0xd7].pack('U')
        match_hashtag_in_text("pre", "#pre#{multiplication_sign}post", 0)
      end

      it "should not allow the division character" do
        division_sign = [0xf7].pack('U')
        match_hashtag_in_text("pre", "#pre#{division_sign}post", 0)
      end
    end
  end

  it "should not extract numeric hashtags" do
    not_match_hashtag_in_text("#1234")
  end

  it "should extract hashtag in text with supplementary character" do
    supplementary_char = [0x10400].pack('U')
    match_hashtag_in_text("hashtag", "#{supplementary_char} #hashtag", 2)
  end
end
|
392
|
+
end
|
@@ -0,0 +1,96 @@
|
|
1
|
+
# Copyright 2018 Twitter, Inc.
|
2
|
+
# Licensed under the Apache License, Version 2.0
|
3
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
4
|
+
|
5
|
+
# encoding: utf-8
|
6
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
7
|
+
|
8
|
+
# Bare host class for the HitHighlighter mixin, so the module's methods can
# be called on an instance.
class TestHitHighlighter
  include(Twitter::TwitterText::HitHighlighter)
end
|
11
|
+
|
12
|
+
describe Twitter::TwitterText::HitHighlighter do
|
13
|
+
describe "highlight" do
|
14
|
+
# Build a fresh highlighter for every example.
before(:each) { @highlighter = TestHitHighlighter.new }
|
17
|
+
|
18
|
+
context "with options" do
  # The single hit [13, 16] covers the word "hit" in the sample text.
  before(:each) do
    @original = "Testing this hit highliter"
    @hits = [[13,16]]
  end

  it "should default to <em> tags" do
    highlighted = @highlighter.hit_highlight(@original, @hits)
    expect(highlighted).to eq("Testing this <em>hit</em> highliter")
  end

  it "should allow tag override" do
    highlighted = @highlighter.hit_highlight(@original, @hits, :tag => 'b')
    expect(highlighted).to eq("Testing this <b>hit</b> highliter")
  end
end
|
32
|
+
|
33
|
+
context "without links" do
  # Plain-text input. Each hit is a [start, end] pair, and the expected
  # output wraps the characters from start up to (not including) end in <em>.
  before do
    @original = "Hey! this is a test tweet"
  end

  it "should return original when no hits are provided" do
    expect(@highlighter.hit_highlight(@original)).to eq(@original)
  end

  # Hits are passed as plain literals; the former `hits = ...` assignments in
  # the argument lists created locals that were never read.
  it "should highlight one hit" do
    expect(@highlighter.hit_highlight(@original, [[5, 9]])).to eq("Hey! <em>this</em> is a test tweet")
  end

  it "should highlight two hits" do
    expect(@highlighter.hit_highlight(@original, [[5, 9], [15, 19]])).to eq("Hey! <em>this</em> is a <em>test</em> tweet")
  end

  it "should correctly highlight first-word hits" do
    expect(@highlighter.hit_highlight(@original, [[0, 3]])).to eq("<em>Hey</em>! this is a test tweet")
  end

  it "should correctly highlight last-word hits" do
    expect(@highlighter.hit_highlight(@original, [[20, 25]])).to eq("Hey! this is a test <em>tweet</em>")
  end
end
|
58
|
+
|
59
|
+
context "with links" do
  # Each row: example description, input markup, hits, expected output.
  # One example is generated per row, in this order. The expected outputs show
  # that hit positions do not count the characters of the existing tags.
  [
    ["should highlight with a single link",
     "@<a>bcherry</a> this was a test tweet", [[9, 13]],
     "@<a>bcherry</a> <em>this</em> was a test tweet"],
    ["should highlight with link at the end",
     "test test <a>test</a>", [[5, 9]],
     "test <em>test</em> <a>test</a>"],
    ["should highlight with a link at the beginning",
     "<a>test</a> test test", [[5, 9]],
     "<a>test</a> <em>test</em> test"],
    ["should highlight an entire link",
     "test <a>test</a> test", [[5, 9]],
     "test <a><em>test</em></a> test"],
    ["should highlight within a link",
     "test <a>test</a> test", [[6, 8]],
     "test <a>t<em>es</em>t</a> test"],
    ["should highlight around a link",
     "test <a>test</a> test", [[3, 11]],
     "tes<em>t <a>test</a> t</em>est"],
    ["should fail gracefully with bad hits",
     "test test", [[5, 20]],
     "test <em>test</em>"],
    ["should not mess up with touching tags",
     "<a>foo</a><a>foo</a>", [[3,6]],
     "<a>foo</a><a><em>foo</em></a>"]
  ].each do |description, markup, hits, expected|
    it description do
      expect(@highlighter.hit_highlight(markup, hits)).to eq(expected)
    end
  end
end
|
93
|
+
|
94
|
+
end
|
95
|
+
|
96
|
+
end
|