twitter-text 1.4.17 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.travis.yml +4 -0
- data/README.rdoc +3 -13
- data/Rakefile +1 -0
- data/lib/twitter-text/autolink.rb +436 -0
- data/lib/twitter-text/deprecation.rb +15 -0
- data/lib/{extractor.rb → twitter-text/extractor.rb} +125 -41
- data/lib/{hithighlighter.rb → twitter-text/hit_highlighter.rb} +5 -7
- data/lib/{regex.rb → twitter-text/regex.rb} +33 -23
- data/lib/twitter-text/rewriter.rb +59 -0
- data/lib/{unicode.rb → twitter-text/unicode.rb} +0 -0
- data/lib/{validation.rb → twitter-text/validation.rb} +17 -3
- data/lib/twitter-text.rb +13 -7
- data/spec/autolinking_spec.rb +192 -16
- data/spec/extractor_spec.rb +12 -0
- data/spec/rewriter_spec.rb +2 -11
- data/spec/spec_helper.rb +1 -1
- data/test/conformance_test.rb +128 -129
- data/twitter-text.gemspec +1 -1
- metadata +14 -12
- data/lib/autolink.rb +0 -266
- data/lib/rewriter.rb +0 -65
@@ -1,3 +1,5 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
1
3
|
class String
|
2
4
|
# Helper function to count the character length by first converting to an
|
3
5
|
# array. This is needed because with unicode strings, the return value
|
@@ -45,16 +47,48 @@ module Twitter
|
|
45
47
|
# A module for including Tweet parsing in a class. This module provides functions for the extraction and processing
|
46
48
|
# of usernames, lists, URLs and hashtags.
|
47
49
|
module Extractor extend self
|
50
|
+
# Remove overlapping entities.
#
# Entities are ordered by start index and scanned left to right; an entity is
# dropped when it starts before the end index of the last entity that was kept.
# Entities sharing a start index keep their relative input order, so the one
# listed first by the caller is the one that survives.
#
# entities:: Array of hashes, each carrying an <tt>:indices</tt> pair
#            <tt>[start, end]</tt>.
#
# This returns a new array with no overlapping entities; the argument itself
# is not modified.
def remove_overlapping_entities(entities)
  # sort_by is not guaranteed to be stable, so the input position is used as a
  # tie-breaker to make same-start collisions deterministic.
  ordered = entities.each_with_index.sort_by do |entity, position|
    [entity[:indices].first, position]
  end

  ordered.each_with_object([]) do |(entity, _position), kept|
    # keep the entity only if it begins at or after the end of the last kept one
    kept << entity if kept.empty? || kept.last[:indices].last <= entity[:indices].first
  end
end
|
61
|
+
|
62
|
+
# Extracts all URLs, hashtags, usernames/lists and cashtags in the Tweet
# <tt>text</tt> along with the indices for where each entity occurred.
# Overlapping entities are removed before the result is returned.
# If no entity is found an empty array will be returned.
#
# <tt>options</tt> is forwarded to the URL extraction only.
#
# If a block is given then it will be called for each entity.
def extract_entities_with_indices(text, options = {}, &block)
  # gather every entity type; hashtags skip their own URL-overlap check because
  # overlaps are resolved once for the combined list below
  groups = [
    extract_urls_with_indices(text, options),
    extract_hashtags_with_indices(text, :check_url_overlap => false),
    extract_mentions_or_lists_with_indices(text),
    extract_cashtags_with_indices(text)
  ]
  combined = groups.inject { |all, group| all + group }

  return [] if combined.empty?

  distinct = remove_overlapping_entities(combined)
  distinct.each(&block) unless block.nil?
  distinct
end
|
48
82
|
|
49
83
|
# Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>. If the
|
50
84
|
# <tt>text</tt> is <tt>nil</tt> or contains no username mentions an empty array
|
51
85
|
# will be returned.
|
52
86
|
#
|
53
87
|
# If a block is given then it will be called for each username.
|
54
|
-
def extract_mentioned_screen_names(text) # :yields: username
|
55
|
-
|
56
|
-
|
57
|
-
|
88
|
+
def extract_mentioned_screen_names(text, &block) # :yields: username
  # delegate to the indexed extractor and keep only the screen name strings
  mentions = extract_mentioned_screen_names_with_indices(text)
  names = mentions.map { |mention| mention[:screen_name] }
  names.each(&block) unless block.nil?
  names
end
|
59
93
|
|
60
94
|
# Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>
|
@@ -68,23 +102,20 @@ module Twitter
|
|
68
102
|
return [] unless text
|
69
103
|
|
70
104
|
possible_screen_names = []
|
71
|
-
text
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
possible_screen_names << {
|
78
|
-
:screen_name => sn,
|
79
|
-
:indices => [start_position, end_position]
|
80
|
-
}
|
81
|
-
end
|
105
|
+
extract_mentions_or_lists_with_indices(text) do |screen_name, list_slug, start_position, end_position|
|
106
|
+
next unless list_slug.empty?
|
107
|
+
possible_screen_names << {
|
108
|
+
:screen_name => screen_name,
|
109
|
+
:indices => [start_position, end_position]
|
110
|
+
}
|
82
111
|
end
|
112
|
+
|
83
113
|
if block_given?
|
84
114
|
possible_screen_names.each do |mention|
|
85
115
|
yield mention[:screen_name], mention[:indices].first, mention[:indices].last
|
86
116
|
end
|
87
117
|
end
|
118
|
+
|
88
119
|
possible_screen_names
|
89
120
|
end
|
90
121
|
|
@@ -97,17 +128,17 @@ module Twitter
|
|
97
128
|
# index, and the end index in the <tt>text</tt>. The list_slug will be an empty string
|
98
129
|
# if this is a username mention.
|
99
130
|
def extract_mentions_or_lists_with_indices(text) # :yields: username, list_slug, start, end
|
100
|
-
return [] unless text
|
131
|
+
return [] unless text =~ /[@@]/
|
101
132
|
|
102
133
|
possible_entries = []
|
103
|
-
text.to_s.scan(Twitter::Regex[:
|
104
|
-
|
134
|
+
text.to_s.scan(Twitter::Regex[:valid_mention_or_list]) do |before, at, screen_name, list_slug|
|
135
|
+
match_data = $~
|
105
136
|
after = $'
|
106
|
-
unless after =~ Twitter::Regex[:
|
107
|
-
start_position =
|
108
|
-
end_position =
|
137
|
+
unless after =~ Twitter::Regex[:end_mention_match]
|
138
|
+
start_position = match_data.char_begin(3) - 1
|
139
|
+
end_position = match_data.char_end(list_slug.nil? ? 3 : 4)
|
109
140
|
possible_entries << {
|
110
|
-
:screen_name =>
|
141
|
+
:screen_name => screen_name,
|
111
142
|
:list_slug => list_slug || "",
|
112
143
|
:indices => [start_position, end_position]
|
113
144
|
}
|
@@ -130,9 +161,9 @@ module Twitter
|
|
130
161
|
def extract_reply_screen_name(text) # :yields: username
|
131
162
|
return nil unless text
|
132
163
|
|
133
|
-
possible_screen_name = text.match(Twitter::Regex[:
|
164
|
+
possible_screen_name = text.match(Twitter::Regex[:valid_reply])
|
134
165
|
return unless possible_screen_name.respond_to?(:captures)
|
135
|
-
return if $' =~ Twitter::Regex[:
|
166
|
+
return if $' =~ Twitter::Regex[:end_mention_match]
|
136
167
|
screen_name = possible_screen_name.captures.first
|
137
168
|
yield screen_name if block_given?
|
138
169
|
screen_name
|
@@ -143,10 +174,10 @@ module Twitter
|
|
143
174
|
# will be returned.
|
144
175
|
#
|
145
176
|
# If a block is given then it will be called for each URL.
|
146
|
-
def extract_urls(text) # :yields: url
|
147
|
-
|
148
|
-
|
149
|
-
|
177
|
+
def extract_urls(text, &block) # :yields: url
  # delegate to the indexed extractor and keep only the URL strings
  found = extract_urls_with_indices(text)
  links = found.map { |entry| entry[:url] }
  links.each(&block) unless block.nil?
  links
end
|
151
182
|
|
152
183
|
# Extracts a list of all URLs included in the Tweet <tt>text</tt> along
|
@@ -154,10 +185,11 @@ module Twitter
|
|
154
185
|
# URLs an empty array will be returned.
|
155
186
|
#
|
156
187
|
# If a block is given then it will be called for each URL.
|
157
|
-
def extract_urls_with_indices(text) # :yields: url, start, end
|
158
|
-
return [] unless text
|
188
|
+
def extract_urls_with_indices(text, options = {:extract_url_without_protocol => true}) # :yields: url, start, end
|
189
|
+
return [] unless text && (options[:extract_url_without_protocol] ? text.index(".") : text.index(":"))
|
159
190
|
urls = []
|
160
191
|
position = 0
|
192
|
+
|
161
193
|
text.to_s.scan(Twitter::Regex[:valid_url]) do |all, before, url, protocol, domain, port, path, query|
|
162
194
|
valid_url_match_data = $~
|
163
195
|
|
@@ -167,6 +199,7 @@ module Twitter
|
|
167
199
|
# If protocol is missing and domain contains non-ASCII characters,
|
168
200
|
# extract ASCII-only domains.
|
169
201
|
if !protocol
|
202
|
+
next if !options[:extract_url_without_protocol] || before =~ Twitter::Regex[:invalid_url_without_protocol_preceding_chars]
|
170
203
|
last_url = nil
|
171
204
|
last_url_invalid_match = nil
|
172
205
|
domain.scan(Twitter::Regex[:valid_ascii_domain]) do |ascii_domain|
|
@@ -201,7 +234,7 @@ module Twitter
|
|
201
234
|
}
|
202
235
|
end
|
203
236
|
end
|
204
|
-
urls.each{|url| yield url[:url], url[:indices].first, url[:indices].last
|
237
|
+
urls.each{|url| yield url[:url], url[:indices].first, url[:indices].last} if block_given?
|
205
238
|
urls
|
206
239
|
end
|
207
240
|
|
@@ -211,10 +244,10 @@ module Twitter
|
|
211
244
|
# character.
|
212
245
|
#
|
213
246
|
# If a block is given then it will be called for each hashtag.
|
214
|
-
def extract_hashtags(text) # :yields: hashtag_text
|
215
|
-
|
216
|
-
|
217
|
-
|
247
|
+
def extract_hashtags(text, &block) # :yields: hashtag_text
  # delegate to the indexed extractor and keep only the hashtag text
  found = extract_hashtags_with_indices(text)
  names = found.map { |entry| entry[:hashtag] }
  names.each(&block) unless block.nil?
  names
end
|
219
252
|
|
220
253
|
# Extracts a list of all hashtags included in the Tweet <tt>text</tt>. If the
|
@@ -223,13 +256,14 @@ module Twitter
|
|
223
256
|
# character.
|
224
257
|
#
|
225
258
|
# If a block is given then it will be called for each hashtag.
|
226
|
-
def extract_hashtags_with_indices(text) # :yields: hashtag_text, start, end
|
227
|
-
return [] unless text
|
259
|
+
def extract_hashtags_with_indices(text, options = {:check_url_overlap => true}) # :yields: hashtag_text, start, end
|
260
|
+
return [] unless text =~ /[##]/
|
228
261
|
|
229
262
|
tags = []
|
230
|
-
text.scan(Twitter::Regex[:
|
231
|
-
|
232
|
-
|
263
|
+
text.scan(Twitter::Regex[:valid_hashtag]) do |before, hash, hash_text|
|
264
|
+
match_data = $~
|
265
|
+
start_position = match_data.char_begin(2)
|
266
|
+
end_position = match_data.char_end(3)
|
233
267
|
after = $'
|
234
268
|
unless after =~ Twitter::Regex[:end_hashtag_match]
|
235
269
|
tags << {
|
@@ -238,7 +272,57 @@ module Twitter
|
|
238
272
|
}
|
239
273
|
end
|
240
274
|
end
|
241
|
-
|
275
|
+
|
276
|
+
if options[:check_url_overlap]
|
277
|
+
# extract URLs
|
278
|
+
urls = extract_urls_with_indices(text)
|
279
|
+
unless urls.empty?
|
280
|
+
tags.concat(urls)
|
281
|
+
# remove duplicates
|
282
|
+
tags = remove_overlapping_entities(tags)
|
283
|
+
# remove URL entities
|
284
|
+
tags.reject!{|entity| !entity[:hashtag] }
|
285
|
+
end
|
286
|
+
end
|
287
|
+
|
288
|
+
tags.each{|tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last} if block_given?
|
289
|
+
tags
|
290
|
+
end
|
291
|
+
|
292
|
+
# Extracts a list of all cashtags included in the Tweet <tt>text</tt>, as
# plain strings. The strings returned do not include the leading <tt>$</tt>
# character. An empty array is returned when no cashtag is found.
#
# If a block is given then it will be called for each cashtag.
def extract_cashtags(text, &block) # :yields: cashtag_text
  # delegate to the indexed extractor and keep only the cashtag text
  found = extract_cashtags_with_indices(text)
  symbols = found.map { |entry| entry[:cashtag] }
  symbols.each(&block) unless block.nil?
  symbols
end
|
303
|
+
|
304
|
+
# Extracts all cashtags included in the Tweet <tt>text</tt> along with the
# indices for where each cashtag occurred. Each element of the returned array
# is a hash with a <tt>:cashtag</tt> entry (the text without the leading
# <tt>$</tt>) and an <tt>:indices</tt> entry <tt>[start, end]</tt>, where
# <tt>start</tt> is the position of the <tt>$</tt> character.
# If the <tt>text</tt> is <tt>nil</tt> or contains no <tt>$</tt> an empty
# array will be returned.
#
# If a block is given then it will be called for each cashtag with the
# cashtag text, the start index and the end index.
def extract_cashtags_with_indices(text) # :yields: cashtag_text, start, end
  return [] unless text =~ /\$/

  tags = []
  text.scan(Twitter::Regex[:valid_cashtag]) do |captures|
    match = $~
    tags << {
      :cashtag => captures[0],
      # capture 1 excludes the $ symbol, so the entity starts one character earlier
      :indices => [match.char_begin(1) - 1, match.char_end(1)]
    }
  end

  if block_given?
    tags.each do |tag|
      yield tag[:cashtag], tag[:indices].first, tag[:indices].last
    end
  end
  tags
end
|
244
328
|
end
|
@@ -23,9 +23,9 @@ module Twitter
|
|
23
23
|
|
24
24
|
chunks = text.split(/[<>]/)
|
25
25
|
|
26
|
-
result =
|
26
|
+
result = []
|
27
27
|
chunk_index, chunk = 0, chunks[0]
|
28
|
-
chunk_chars = chunk.
|
28
|
+
chunk_chars = chunk.to_s.to_char_a
|
29
29
|
prev_chunks_len = 0
|
30
30
|
chunk_cursor = 0
|
31
31
|
start_in_chunk = false
|
@@ -49,13 +49,13 @@ module Twitter
|
|
49
49
|
chunk_cursor = 0
|
50
50
|
chunk_index += 2
|
51
51
|
chunk = chunks[chunk_index]
|
52
|
-
chunk_chars = chunk.
|
52
|
+
chunk_chars = chunk.to_s.to_char_a
|
53
53
|
start_in_chunk = false
|
54
54
|
end
|
55
55
|
|
56
56
|
if !placed && !chunk.nil?
|
57
57
|
hit_spot = hit - prev_chunks_len
|
58
|
-
result << chunk_chars[chunk_cursor...hit_spot]
|
58
|
+
result << chunk_chars[chunk_cursor...hit_spot] << tag
|
59
59
|
chunk_cursor = hit_spot
|
60
60
|
if index % 2 == 0
|
61
61
|
start_in_chunk = true
|
@@ -80,9 +80,7 @@ module Twitter
|
|
80
80
|
end
|
81
81
|
end
|
82
82
|
|
83
|
-
result
|
84
|
-
rescue
|
85
|
-
text
|
83
|
+
result.flatten.join
|
86
84
|
end
|
87
85
|
end
|
88
86
|
end
|
@@ -1,4 +1,5 @@
|
|
1
|
-
# encoding:
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
2
3
|
module Twitter
|
3
4
|
# A collection of regular expressions for parsing Tweet text. The regular expression
|
4
5
|
# list is frozen at load time to ensure immutability. These regular expressions are
|
@@ -77,6 +78,7 @@ module Twitter
|
|
77
78
|
regex_range(0x0289),
|
78
79
|
regex_range(0x028b),
|
79
80
|
regex_range(0x02bb),
|
81
|
+
regex_range(0x0300, 0x036f),
|
80
82
|
regex_range(0x1e00, 0x1eff)
|
81
83
|
].join('').freeze
|
82
84
|
|
@@ -86,13 +88,12 @@ module Twitter
|
|
86
88
|
regex_range(0x0500, 0x0527), # Cyrillic Supplement
|
87
89
|
regex_range(0x2de0, 0x2dff), # Cyrillic Extended A
|
88
90
|
regex_range(0xa640, 0xa69f), # Cyrillic Extended B
|
89
|
-
regex_range(0x0591,
|
90
|
-
regex_range(0x05bf),
|
91
|
+
regex_range(0x0591, 0x05bf), # Hebrew
|
91
92
|
regex_range(0x05c1, 0x05c2),
|
92
93
|
regex_range(0x05c4, 0x05c5),
|
93
94
|
regex_range(0x05c7),
|
94
95
|
regex_range(0x05d0, 0x05ea),
|
95
|
-
regex_range(0x05f0,
|
96
|
+
regex_range(0x05f0, 0x05f4),
|
96
97
|
regex_range(0xfb12, 0xfb28), # Hebrew Presentation Forms
|
97
98
|
regex_range(0xfb2a, 0xfb36),
|
98
99
|
regex_range(0xfb38, 0xfb3c),
|
@@ -141,38 +142,44 @@ module Twitter
|
|
141
142
|
regex_range(0x20000, 0x2A6DF), # Kanji (CJK Extension B)
|
142
143
|
regex_range(0x2A700, 0x2B73F), # Kanji (CJK Extension C)
|
143
144
|
regex_range(0x2B740, 0x2B81F), # Kanji (CJK Extension D)
|
144
|
-
regex_range(0x2F800, 0x2FA1F), regex_range(0x3005), regex_range(0x303B) # Kanji (CJK supplement)
|
145
|
+
regex_range(0x2F800, 0x2FA1F), regex_range(0x3003), regex_range(0x3005), regex_range(0x303B) # Kanji (CJK supplement)
|
145
146
|
].join('').freeze
|
146
147
|
|
148
|
+
PUNCTUATION_CHARS = '!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~'
|
149
|
+
SPACE_CHARS = " \t\n\x0B\f\r"
|
150
|
+
CTRL_CHARS = "\x00-\x1F\x7F"
|
151
|
+
|
147
152
|
# A hashtag must contain latin characters, numbers and underscores, but not all numbers.
|
148
153
|
HASHTAG_ALPHA = /[a-z_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/io
|
149
154
|
HASHTAG_ALPHANUMERIC = /[a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/io
|
150
|
-
HASHTAG_BOUNDARY = /\A|\z|[
|
155
|
+
HASHTAG_BOUNDARY = /\A|\z|[^&a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/o
|
151
156
|
|
152
157
|
HASHTAG = /(#{HASHTAG_BOUNDARY})(#|#)(#{HASHTAG_ALPHANUMERIC}*#{HASHTAG_ALPHA}#{HASHTAG_ALPHANUMERIC}*)/io
|
153
158
|
|
154
|
-
REGEXEN[:
|
155
|
-
# Used in Extractor
|
159
|
+
REGEXEN[:valid_hashtag] = /#{HASHTAG}/io
|
160
|
+
# Used in Extractor for final filtering
|
156
161
|
REGEXEN[:end_hashtag_match] = /\A(?:[##]|:\/\/)/o
|
157
162
|
|
163
|
+
REGEXEN[:valid_mention_preceding_chars] = /(?:[^a-zA-Z0-9_!#\$%&*@@]|^|RT:?)/o
|
158
164
|
REGEXEN[:at_signs] = /[@@]/
|
159
|
-
REGEXEN[:
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
REGEXEN[:
|
166
|
-
|
165
|
+
REGEXEN[:valid_mention_or_list] = /
|
166
|
+
(#{REGEXEN[:valid_mention_preceding_chars]}) # $1: Preceeding character
|
167
|
+
(#{REGEXEN[:at_signs]}) # $2: At mark
|
168
|
+
([a-zA-Z0-9_]{1,20}) # $3: Screen name
|
169
|
+
(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})? # $4: List (optional)
|
170
|
+
/ox
|
171
|
+
REGEXEN[:valid_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
|
172
|
+
# Used in Extractor for final filtering
|
173
|
+
REGEXEN[:end_mention_match] = /\A(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o
|
167
174
|
|
168
175
|
# URL related hash regex collection
|
169
|
-
REGEXEN[:
|
170
|
-
|
171
|
-
DOMAIN_VALID_CHARS = "[
|
176
|
+
REGEXEN[:valid_url_preceding_chars] = /(?:[^A-Z0-9@@$###{INVALID_CHARACTERS.join('')}]|^)/io
|
177
|
+
REGEXEN[:invalid_url_without_protocol_preceding_chars] = /[-_.\/]$/
|
178
|
+
DOMAIN_VALID_CHARS = "[^#{PUNCTUATION_CHARS}#{SPACE_CHARS}#{CTRL_CHARS}#{INVALID_CHARACTERS.join('')}#{UNICODE_SPACES.join('')}]"
|
172
179
|
REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
|
173
180
|
REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
|
174
181
|
|
175
|
-
REGEXEN[:valid_gTLD] = /(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|xxx)(?=[^
|
182
|
+
REGEXEN[:valid_gTLD] = /(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|xxx)(?=[^0-9a-z]|$))/i
|
176
183
|
REGEXEN[:valid_ccTLD] = %r{
|
177
184
|
(?:
|
178
185
|
(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|
|
@@ -181,10 +188,10 @@ module Twitter
|
|
181
188
|
lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|
|
182
189
|
pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf|tg|th|
|
183
190
|
tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)
|
184
|
-
(?=[^
|
191
|
+
(?=[^0-9a-z]|$)
|
185
192
|
)
|
186
193
|
}ix
|
187
|
-
REGEXEN[:valid_punycode] = /(?:xn--[0-9a-z]+)/
|
194
|
+
REGEXEN[:valid_punycode] = /(?:xn--[0-9a-z]+)/i
|
188
195
|
|
189
196
|
REGEXEN[:valid_domain] = /(?:
|
190
197
|
#{REGEXEN[:valid_subdomain]}*#{REGEXEN[:valid_domain_name]}
|
@@ -226,7 +233,7 @@ module Twitter
|
|
226
233
|
REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/]/i
|
227
234
|
REGEXEN[:valid_url] = %r{
|
228
235
|
( # $1 total match
|
229
|
-
(#{REGEXEN[:
|
236
|
+
(#{REGEXEN[:valid_url_preceding_chars]}) # $2 Preceeding chracter
|
230
237
|
( # $3 URL
|
231
238
|
(https?:\/\/)? # $4 Protocol (optional)
|
232
239
|
(#{REGEXEN[:valid_domain]}) # $5 Domain(s)
|
@@ -237,6 +244,9 @@ module Twitter
|
|
237
244
|
)
|
238
245
|
}iox;
|
239
246
|
|
247
|
+
REGEXEN[:cashtag] = /[a-z]{1,6}(?:[._][a-z]{1,2})?/i
|
248
|
+
REGEXEN[:valid_cashtag] = /(?:^|#{REGEXEN[:spaces]})\$(#{REGEXEN[:cashtag]})(?=$|\s|[#{PUNCTUATION_CHARS}])/i
|
249
|
+
|
240
250
|
# These URL validation pattern strings are based on the ABNF from RFC 3986
|
241
251
|
REGEXEN[:validate_url_unreserved] = /[a-z0-9\-._~]/i
|
242
252
|
REGEXEN[:validate_url_pct_encoded] = /(?:%[0-9a-f]{2})/i
|
@@ -0,0 +1,59 @@
|
|
1
|
+
module Twitter
|
2
|
+
# A module that provides base methods for rewriting usernames, lists, hashtags and URLs.
|
3
|
+
module Rewriter extend self
|
4
|
+
# Rebuilds <tt>text</tt> with every entity replaced by the value the block
# returns for it.
#
# text::     the text to rewrite; converted with <tt>to_s</tt> and split into
#            characters with <tt>to_char_a</tt>.
# entities:: Array of hashes, each carrying an <tt>:indices</tt> pair
#            <tt>[start, end]</tt> expressed in characters. They are processed
#            in order of start index regardless of the order given.
#
# The block is called with the entity and the character array, and its return
# value is inserted in place of the characters covered by the entity.
#
# Returns the rewritten text as a String.
def rewrite_entities(text, entities)
  chars = text.to_s.to_char_a

  # sort by start index
  ordered = entities.sort_by { |entity| entity[:indices].first }

  pieces = []
  cursor = 0 # index of the first character not yet copied to the output
  ordered.each do |entity|
    pieces << chars[cursor...entity[:indices].first]
    pieces << yield(entity, chars)
    cursor = entity[:indices].last
  end
  # whatever follows the last entity is copied through untouched
  pieces << chars[cursor..-1]

  pieces.flatten.join
end
|
20
|
+
|
21
|
+
# These methods are deprecated and will be removed in a future release.
|
22
|
+
extend Deprecation
|
23
|
+
|
24
|
+
# Applies the hashtag, URL and username/list rewriters to <tt>text</tt>, in
# that order, for every key of <tt>options</tt> that maps to a block.
#
# options:: Hash whose <tt>:hashtags</tt>, <tt>:urls</tt> and
#           <tt>:usernames_or_lists</tt> entries, when present, are passed as
#           the block to the matching <tt>rewrite_*</tt> method.
#
# Returns the text after all requested rewrites, or <tt>text</tt> itself when
# no option is set.
def rewrite(text, options = {})
  # inject yields (accumulator, element); the accumulator carries the text so
  # that each rewriter works on the output of the previous one
  [:hashtags, :urls, :usernames_or_lists].inject(text) do |rewritten, key|
    handler = options[key]
    handler ? send(:"rewrite_#{key}", rewritten, &handler) : rewritten
  end
end
|
29
|
+
deprecate :rewrite, :rewrite_entities
|
30
|
+
|
31
|
+
# Rewrites every username or list mention in <tt>text</tt> with the value
# returned by the block. The block receives the character found at the
# entity's start index, the screen name, and the list slug (<tt>nil</tt> when
# the slug is empty).
def rewrite_usernames_or_lists(text)
  mentions = Extractor.extract_mentions_or_lists_with_indices(text)
  rewrite_entities(text, mentions) do |mention, chars|
    slug = mention[:list_slug]
    yield(chars[mention[:indices].first], mention[:screen_name], slug.empty? ? nil : slug)
  end
end
|
40
|
+
deprecate :rewrite_usernames_or_lists, :rewrite_entities
|
41
|
+
|
42
|
+
# Rewrites every hashtag in <tt>text</tt> with the value returned by the
# block. The block receives the character found at the entity's start index
# and the hashtag text.
def rewrite_hashtags(text)
  hashtags = Extractor.extract_hashtags_with_indices(text)
  rewrite_entities(text, hashtags) do |hashtag, chars|
    yield(chars[hashtag[:indices].first], hashtag[:hashtag])
  end
end
|
49
|
+
deprecate :rewrite_hashtags, :rewrite_entities
|
50
|
+
|
51
|
+
# Rewrites every URL in <tt>text</tt> with the value returned by the block,
# which receives the URL string. URLs are extracted with
# <tt>:extract_url_without_protocol => false</tt>.
def rewrite_urls(text)
  links = Extractor.extract_urls_with_indices(text, :extract_url_without_protocol => false)
  rewrite_entities(text, links) { |link, _chars| yield(link[:url]) }
end
|
57
|
+
deprecate :rewrite_urls, :rewrite_entities
|
58
|
+
end
|
59
|
+
end
|
File without changes
|
@@ -2,6 +2,11 @@ module Twitter
|
|
2
2
|
module Validation extend self
|
3
3
|
MAX_LENGTH = 140
|
4
4
|
|
5
|
+
DEFAULT_TCO_URL_LENGTHS = {
|
6
|
+
:short_url_length => 20,
|
7
|
+
:short_url_length_https => 21
|
8
|
+
}.freeze
|
9
|
+
|
5
10
|
# Returns the length of the string as it would be displayed. This is equivalent to the length of the Unicode NFC
|
6
11
|
# (See: http://www.unicode.org/reports/tr15). This is needed in order to consistently calculate the length of a
|
7
12
|
# string no matter which actual form was transmitted. For example:
|
@@ -14,8 +19,17 @@ module Twitter
|
|
14
19
|
#
|
15
20
|
# The string could also contain U+00E9 already, in which case the canonicalization will not change the value.
|
16
21
|
#
|
17
|
-
def tweet_length(text)
|
18
|
-
|
22
|
+
def tweet_length(text, options = {})
  # caller-supplied lengths override the defaults
  lengths = DEFAULT_TCO_URL_LENGTHS.merge(options)

  total = ActiveSupport::Multibyte::Chars.new(text).normalize(:c).length

  Twitter::Extractor.extract_urls_with_indices(text) do |url, start_position, end_position|
    # swap the URL's own length for the configured short-URL length
    short_key = url.downcase =~ /^https:\/\// ? :short_url_length_https : :short_url_length
    total += start_position - end_position
    total += lengths[short_key]
  end

  total
end
|
20
34
|
|
21
35
|
# Check the <tt>text</tt> for any reason that it may not be valid as a Tweet. This is meant as a pre-validation
|
@@ -52,7 +66,7 @@ module Twitter
|
|
52
66
|
extracted.size == 1 && extracted.first == username[1..-1]
|
53
67
|
end
|
54
68
|
|
55
|
-
VALID_LIST_RE = /\A#{Twitter::Regex[:
|
69
|
+
VALID_LIST_RE = /\A#{Twitter::Regex[:valid_mention_or_list]}\z/o
|
56
70
|
def valid_list?(username_list)
|
57
71
|
match = username_list.match(VALID_LIST_RE)
|
58
72
|
# Must have matched and had nothing before or after
|
data/lib/twitter-text.rb
CHANGED
@@ -10,11 +10,17 @@ end
|
|
10
10
|
|
11
11
|
require 'active_support'
|
12
12
|
require 'active_support/core_ext/string/multibyte.rb'
|
13
|
+
require 'active_support/core_ext/hash/keys.rb'
|
13
14
|
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
15
|
+
%w(
|
16
|
+
deprecation
|
17
|
+
regex
|
18
|
+
rewriter
|
19
|
+
autolink
|
20
|
+
extractor
|
21
|
+
unicode
|
22
|
+
validation
|
23
|
+
hit_highlighter
|
24
|
+
).each do |name|
|
25
|
+
require "twitter-text/#{name}"
|
26
|
+
end
|