twitter-text-kow 1.3.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,388 @@
1
+ # Copyright 2018 Twitter, Inc.
2
+ # Licensed under the Apache License, Version 2.0
3
+ # http://www.apache.org/licenses/LICENSE-2.0
4
+
5
+ # encoding: utf-8
6
+ require 'idn'
7
+
8
class String
  # Number of characters in this string. When the string responds to
  # +codepoints+, +length+ is used directly; otherwise the characters are
  # counted through +chars+.
  def codepoint_length
    return length if respond_to?(:codepoints)

    characters = chars
    characters.kind_of?(Enumerable) ? characters.to_a.size : characters.size
  end

  # Returns this string split into an array of one-character strings.
  def to_codepoint_a
    characters = chars
    return characters.to_a if characters.kind_of?(Enumerable)

    # Fallback for a +chars+ object that is not Enumerable: rebuild each
    # character from the value found at every index.
    (0...codepoint_length).map { |index| [characters.slice(index)].pack('U') }
  end
end
31
+
32
# Helpers that report match offsets counted in code points rather than bytes.
class MatchData
  # Start offset of capture group +n+, counted in code points.
  def char_begin(n)
    offset = self.begin(n)
    # When the string responds to +codepoints+ the offset is returned as is;
    # otherwise the prefix up to the offset is re-counted in code points.
    string.respond_to?(:codepoints) ? offset : string[0, offset].codepoint_length
  end

  # End offset (exclusive) of capture group +n+, counted in code points.
  def char_end(n)
    offset = self.end(n)
    string.respond_to?(:codepoints) ? offset : string[0, offset].codepoint_length
  end
end
50
+
51
+ module Twitter
52
+ module TwitterText
53
+ # A module for including Tweet parsing in a class. This module provides function for the extraction and processing
54
+ # of usernames, lists, URLs and hashtags.
55
+ module Extractor extend self
56
+
57
# Maximum URL length as defined by Twitter's backend.
MAX_URL_LENGTH = 4096

# The maximum t.co path length that the Twitter backend supports.
MAX_TCO_SLUG_LENGTH = 40

# Length of the "https://" prefix; is_valid_domain adds it to the measured
# length of a URL that was written without a protocol.
URL_PROTOCOL_LENGTH = 'https://'.length
64
+
65
# Remove overlapping entities.
# Returns a new array, sorted by start index, in which an entity is dropped
# when it starts before the end of the most recently kept entity. The array
# passed in is not modified.
def remove_overlapping_entities(entities)
  last_kept = nil
  entities.sort_by { |entity| entity[:indices].first }.select do |entity|
    # Overlap: this entity begins before the previously kept one has ended.
    next false if last_kept && last_kept[:indices].last > entity[:indices].first

    last_kept = entity
    true
  end
end
76
+
77
# Extracts all usernames, lists, hashtags, cashtags and URLs in the Tweet
# <tt>text</tt> along with the indices for where each entity occurred. Emoji
# are included as well when the configuration has emoji parsing enabled.
# If no entity is found an empty array is returned.
#
# The configuration is taken from <tt>options[:config]</tt>, falling back to
# the default configuration. <tt>options</tt> is also forwarded unchanged to
# the URL extraction.
#
# If a block is given then it will be called for each entity.
def extract_entities_with_indices(text, options = {}, &block)
  config = options[:config] || Twitter::TwitterText::Configuration.default_configuration

  # Gather every kind of entity into one list.
  found = []
  found.concat(extract_urls_with_indices(text, options))
  found.concat(extract_hashtags_with_indices(text, :check_url_overlap => false))
  found.concat(extract_mentions_or_lists_with_indices(text))
  found.concat(extract_cashtags_with_indices(text))
  found.concat(extract_emoji_with_indices(text)) if config.emoji_parsing_enabled

  return [] if found.empty?

  entities = remove_overlapping_entities(found)
  entities.each(&block) if block
  entities
end
100
+
101
# Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>. If the
# <tt>text</tt> is <tt>nil</tt> or contains no username mentions an empty array
# will be returned.
#
# If a block is given then it will be called for each username.
def extract_mentioned_screen_names(text, &block) # :yields: username
  mentions = extract_mentioned_screen_names_with_indices(text)
  names = mentions.map { |mention| mention[:screen_name] }
  names.each(&block) if block
  names
end
111
+
112
# Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>
# along with the indices for where the mention occurred. If the
# <tt>text</tt> is nil or contains no username mentions, an empty array
# will be returned. List mentions (those with a non-empty list slug) are
# left out.
#
# If a block is given, then it will be called with each username, the start
# index, and the end index in the <tt>text</tt>.
def extract_mentioned_screen_names_with_indices(text) # :yields: username, start, end
  return [] unless text

  mentions = []
  extract_mentions_or_lists_with_indices(text) do |screen_name, list_slug, start_position, end_position|
    # Keep plain username mentions only.
    if list_slug.empty?
      mentions << {:screen_name => screen_name, :indices => [start_position, end_position]}
    end
  end

  if block_given?
    mentions.each do |mention|
      first_index, last_index = mention[:indices].first, mention[:indices].last
      yield mention[:screen_name], first_index, last_index
    end
  end

  mentions
end
139
+
140
# Extracts a list of all usernames or lists mentioned in the Tweet <tt>text</tt>
# along with the indices for where the mention occurred. If the
# <tt>text</tt> is nil or contains no username or list mentions, an empty array
# will be returned.
#
# Each entry is a hash with <tt>:screen_name</tt>, <tt>:list_slug</tt> and
# <tt>:indices</tt> (start, end) keys.
#
# If a block is given, then it will be called with each username, list slug, the start
# index, and the end index in the <tt>text</tt>. The list_slug will be an empty string
# if this is a username mention.
def extract_mentions_or_lists_with_indices(text) # :yields: username, list_slug, start, end
  # Cheap pre-check: skip the full scan unless an at sign is present. Both the
  # ASCII "@" and the full-width at sign (U+FF20) are accepted; the escape is
  # used so the character survives copy/paste normalization.
  # NOTE(review): assumes Regex[:valid_mention_or_list] also matches U+FF20 - confirm.
  return [] unless text =~ /[@\uFF20]/

  possible_entries = []
  text.to_s.scan(Twitter::TwitterText::Regex[:valid_mention_or_list]) do |_before, _at, screen_name, list_slug|
    # Grab the match now; the regex test below replaces the last-match state.
    match_data = Regexp.last_match
    # Skip the mention when the text right after it matches :end_mention_match.
    next if match_data.post_match =~ Twitter::TwitterText::Regex[:end_mention_match]

    # Group 3 is the screen name; the entity starts one position earlier so
    # that it covers the character captured just before it (the at sign).
    start_position = match_data.char_begin(3) - 1
    # The entity ends after the list slug (group 4) when one was captured.
    end_position = match_data.char_end(list_slug.nil? ? 3 : 4)
    possible_entries << {
      :screen_name => screen_name,
      :list_slug => list_slug || "",
      :indices => [start_position, end_position]
    }
  end

  if block_given?
    possible_entries.each do |mention|
      yield mention[:screen_name], mention[:list_slug], mention[:indices].first, mention[:indices].last
    end
  end

  possible_entries
end
174
+
175
# Extracts the username replied to in the Tweet <tt>text</tt>. If the
# <tt>text</tt> is <tt>nil</tt> or is not a reply nil will be returned.
#
# If a block is given then it will be called with the username replied to (if any)
def extract_reply_screen_name(text) # :yields: username
  return nil unless text

  reply_match = text.match(Twitter::TwitterText::Regex[:valid_reply])
  return unless reply_match.respond_to?(:captures)
  # Not a reply when the text right after the match matches :end_mention_match.
  return if reply_match.post_match =~ Twitter::TwitterText::Regex[:end_mention_match]

  replied_to = reply_match.captures.first
  yield replied_to if block_given?
  replied_to
end
189
+
190
# Extracts a list of all URLs included in the Tweet <tt>text</tt>. If the
# <tt>text</tt> is <tt>nil</tt> or contains no URLs an empty array
# will be returned.
#
# If a block is given then it will be called for each URL.
def extract_urls(text, &block) # :yields: url
  entities = extract_urls_with_indices(text)
  found = entities.map { |entity| entity[:url] }
  found.each(&block) if block
  found
end
200
+
201
# Extracts a list of all URLs included in the Tweet <tt>text</tt> along
# with the indices. If the <tt>text</tt> is <tt>nil</tt> or contains no
# URLs an empty array will be returned.
#
# URLs written without a protocol are only extracted when
# <tt>options[:extract_url_without_protocol]</tt> is truthy. That is the
# case for the default argument, but not for an options hash that lacks
# the key.
#
# If a block is given then it will be called with each URL, its start index
# and its end index.
def extract_urls_with_indices(text, options = {:extract_url_without_protocol => true}) # :yields: url, start, end
  return [] unless text

  allow_bare_domains = options[:extract_url_without_protocol]
  # Cheap pre-check: a protocol-less URL needs a ".", any other URL needs a ":".
  return [] unless allow_bare_domains ? text.index(".") : text.index(":")

  found = []
  text.to_s.scan(Twitter::TwitterText::Regex[:valid_url]) do |_all, before, url, protocol, domain, _port, path, _query|
    # Grab the match now; later regex tests replace the last-match state.
    url_match = Regexp.last_match
    start_position = url_match.char_begin(3)
    end_position = url_match.char_end(3)

    if protocol
      # In the case of t.co URLs, don't allow additional path characters
      if url =~ Twitter::TwitterText::Regex[:valid_tco_url]
        slug = Regexp.last_match(1)
        next if slug && slug.length > MAX_TCO_SLUG_LENGTH

        # Keep only the part matched by the t.co pattern.
        url = Regexp.last_match(0)
        end_position = start_position + url.codepoint_length
      end

      next unless is_valid_domain(url.length, domain, protocol)

      found << {:url => url, :indices => [start_position, end_position]}
    else
      next unless allow_bare_domains
      next if before =~ Twitter::TwitterText::Regex[:invalid_url_without_protocol_preceding_chars]

      # The protocol is missing, so only the ASCII-only domains found inside
      # the matched domain are extracted.
      last_url = nil
      domain.scan(Twitter::TwitterText::Regex[:valid_ascii_domain]) do |ascii_domain|
        next unless is_valid_domain(url.length, ascii_domain, protocol)

        domain_match = Regexp.last_match
        last_url = {
          :url => ascii_domain,
          :indices => [start_position + domain_match.char_begin(0),
                       start_position + domain_match.char_end(0)]
        }
        found << last_url
      end

      # no ASCII-only domain found. Skip the entire URL
      next unless last_url

      # last_url (already in the result list) only contains the domain.
      # Extend it in place so it also covers the rest of the matched URL.
      if path
        last_url[:url] = url.sub(domain, last_url[:url])
        last_url[:indices][1] = end_position
      end
    end
  end

  if block_given?
    found.each { |entity| yield entity[:url], entity[:indices].first, entity[:indices].last }
  end
  found
end
259
+
260
# Extracts a list of all hashtags included in the Tweet <tt>text</tt>. If the
# <tt>text</tt> is <tt>nil</tt> or contains no hashtags an empty array
# will be returned. The array returned will not include the leading <tt>#</tt>
# character.
#
# If a block is given then it will be called for each hashtag.
def extract_hashtags(text, &block) # :yields: hashtag_text
  entities = extract_hashtags_with_indices(text)
  found = entities.map { |entity| entity[:hashtag] }
  found.each(&block) if block
  found
end
271
+
272
# Extracts a list of all hashtags included in the Tweet <tt>text</tt> along
# with the indices for where each hashtag occurred. If the <tt>text</tt> is
# <tt>nil</tt> or contains no hashtags an empty array will be returned. The
# <tt>:hashtag</tt> value does not include the leading <tt>#</tt> character,
# while the indices do cover it.
#
# When <tt>options[:check_url_overlap]</tt> is truthy (the default argument),
# hashtags that overlap an extracted URL are removed.
#
# If a block is given then it will be called with each hashtag text, its
# start index and its end index.
def extract_hashtags_with_indices(text, options = {:check_url_overlap => true}) # :yields: hashtag_text, start, end
  # Cheap pre-check: skip the full scan unless a hash sign is present. Both the
  # ASCII "#" and the full-width number sign (U+FF03) are accepted; the escape
  # is used so the character survives copy/paste normalization.
  # NOTE(review): assumes Regex[:valid_hashtag] also matches U+FF03 - confirm.
  return [] unless text =~ /[#\uFF03]/

  tags = []
  text.scan(Twitter::TwitterText::Regex[:valid_hashtag]) do |_before, _hash, hash_text|
    # Grab the match now; the regex test below replaces the last-match state.
    match_data = Regexp.last_match
    # Group 2 is the hash sign and group 3 the hashtag text.
    start_position = match_data.char_begin(2)
    end_position = match_data.char_end(3)
    # Skip the hashtag when the text right after it matches :end_hashtag_match.
    next if match_data.post_match =~ Twitter::TwitterText::Regex[:end_hashtag_match]

    tags << {
      :hashtag => hash_text,
      :indices => [start_position, end_position]
    }
  end

  if options[:check_url_overlap]
    urls = extract_urls_with_indices(text)
    unless urls.empty?
      # Resolve overlaps between hashtags and URLs together, then drop the
      # URL entities again so that only hashtags remain.
      tags = remove_overlapping_entities(tags + urls).select { |entity| entity[:hashtag] }
    end
  end

  tags.each { |tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last } if block_given?
  tags
end
310
+
311
# Extracts a list of all cashtags included in the Tweet <tt>text</tt>. If the
# <tt>text</tt> is <tt>nil</tt> or contains no cashtags an empty array
# will be returned. The array returned will not include the leading <tt>$</tt>
# character.
#
# If a block is given then it will be called for each cashtag.
def extract_cashtags(text, &block) # :yields: cashtag_text
  entities = extract_cashtags_with_indices(text)
  found = entities.map { |entity| entity[:cashtag] }
  found.each(&block) if block
  found
end
322
+
323
# Extracts a list of all cashtags included in the Tweet <tt>text</tt> along
# with the indices for where each cashtag occurred. If the <tt>text</tt> is
# <tt>nil</tt> or contains no cashtags an empty array will be returned. The
# <tt>:cashtag</tt> value does not include the leading <tt>$</tt> character.
#
# If a block is given then it will be called with each cashtag text, its
# start index and its end index.
def extract_cashtags_with_indices(text) # :yields: cashtag_text, start, end
  # Cheap pre-check: no dollar sign, no cashtags.
  return [] unless text =~ /\$/

  cashtags = []
  text.scan(Twitter::TwitterText::Regex[:valid_cashtag]) do |_before, _dollar, cash_text|
    cashtag_match = Regexp.last_match
    # Group 2 is the dollar sign and group 3 the cashtag text.
    span = [cashtag_match.char_begin(2), cashtag_match.char_end(3)]
    cashtags << {:cashtag => cash_text, :indices => span}
  end

  if block_given?
    cashtags.each { |entity| yield entity[:cashtag], entity[:indices].first, entity[:indices].last }
  end
  cashtags
end
346
+
347
# Extracts every match of Regex[:valid_emoji] in <tt>text</tt> along with the
# indices for where it occurred. If the <tt>text</tt> is <tt>nil</tt> or
# contains no match an empty array will be returned.
#
# If a block is given then it will be called with each emoji, its start
# index and its end index.
def extract_emoji_with_indices(text) # :yields: emoji, start, end
  # A nil text yields no entities, like the other extractors in this module.
  return [] unless text

  emoji = []
  text.scan(Twitter::TwitterText::Regex[:valid_emoji]) do |emoji_text|
    match_data = Regexp.last_match
    emoji << {
      :emoji => emoji_text,
      :indices => [match_data.char_begin(0), match_data.char_end(0)]
    }
  end

  emoji.each { |entity| yield entity[:emoji], entity[:indices].first, entity[:indices].last } if block_given?
  emoji
end
360
+
361
# Returns true when <tt>text</tt> consists of exactly one emoji entity, as
# reported by extract_emoji_with_indices, and that entity equals the whole
# <tt>text</tt>. Returns false for a <tt>nil</tt> text and when the
# extraction raises a StandardError.
def is_valid_emoji(text)
  return false unless text

  entities = extract_emoji_with_indices(text)
  entities.count == 1 && entities[0][:emoji] == text
rescue StandardError
  # On error don't consider this a valid emoji. Only StandardError is
  # rescued so that interrupts and other system-level exceptions propagate.
  false
end
371
+
372
# Returns true when a URL of <tt>url_length</tt> characters still fits within
# MAX_URL_LENGTH after accounting for the IDNA-encoded form of
# <tt>domain</tt> and, when <tt>protocol</tt> is missing, for the length of
# an added "https://" prefix. Returns false for a <tt>nil</tt> domain and
# when the encoding raises a StandardError.
def is_valid_domain(url_length, domain, protocol)
  return false unless domain

  # Only growth counts: an encoded domain that is not longer than the
  # original leaves the length unchanged.
  encoded_growth = IDN::Idna.toASCII(domain).length - domain.length
  url_length += encoded_growth if encoded_growth > 0
  url_length += URL_PROTOCOL_LENGTH unless protocol
  url_length <= MAX_URL_LENGTH
rescue StandardError
  # On error don't consider this a valid domain. Only StandardError is
  # rescued so that interrupts and other system-level exceptions propagate.
  false
end
386
+ end
387
+ end
388
+ end
@@ -0,0 +1,27 @@
1
+ # Copyright 2018 Twitter, Inc.
2
+ # Licensed under the Apache License, Version 2.0
3
+ # http://www.apache.org/licenses/LICENSE-2.0
4
+
5
module Twitter
  module TwitterText
    # Helpers for converting hash keys to symbols.
    module HashHelper
      # Return a new hash with all keys converted to symbols, as long as
      # they can be converted with +to_sym+. The hash passed in is left
      # untouched.
      #
      #   Twitter::TwitterText::HashHelper.symbolize_keys('name' => 'Rob', 'years' => '28')
      #   #=> { :name => "Rob", :years => "28" }
      def self.symbolize_keys(hash)
        copy = hash.dup
        symbolize_keys!(copy)
      end

      # Destructively convert all keys to symbols, as long as they can be
      # converted with +to_sym+. Same as +symbolize_keys+, but modifies the
      # +hash+ passed in and returns it.
      def self.symbolize_keys!(hash)
        hash.keys.each do |key|
          # Keys whose +to_sym+ raises or returns nil are kept unchanged.
          symbol = begin
            key.to_sym
          rescue StandardError
            key
          end
          hash[symbol || key] = hash.delete(key)
        end
        hash
      end
    end
  end
end
@@ -0,0 +1,92 @@
1
+ # Copyright 2018 Twitter, Inc.
2
+ # Licensed under the Apache License, Version 2.0
3
+ # http://www.apache.org/licenses/LICENSE-2.0
4
+
5
module Twitter
  module TwitterText
    # Module for doing "hit highlighting" on tweets that have been auto-linked already.
    # Useful with the results returned from the Search API.
    module HitHighlighter
      extend self

      # Default Tag used for hit highlighting
      DEFAULT_HIGHLIGHT_TAG = "em"

      # Add <tt><em></em></tt> tags around the <tt>hits</tt> provided in the <tt>text</tt>. The
      # <tt>hits</tt> should be an array of (start, end) index pairs, relative to the original
      # text, before auto-linking (but the <tt>text</tt> may already be auto-linked if desired)
      #
      # The <tt><em></em></tt> tags can be overridden using the <tt>:tag</tt> option. For example:
      #
      #  irb> hit_highlight("test hit here", [[5, 8]], :tag => 'strong')
      #  => "test <strong>hit</strong> here"
      #
      # When <tt>hits</tt> is empty the <tt>text</tt> is returned as is.
      def hit_highlight(text, hits = [], options = {})
        return text if hits.empty?

        tag_name = options[:tag] || DEFAULT_HIGHLIGHT_TAG
        # Index 0 is the opening tag, index 1 the closing tag.
        tags = ["<" + tag_name + ">", "</" + tag_name + ">"]

        # Splitting on angle brackets leaves plain text at the even positions
        # and the contents of existing markup tags at the odd positions.
        chunks = text.split(/[<>]/)

        output = []
        chunk_index = 0
        chunk = chunks[0]
        chunk_chars = chunk.to_s.to_codepoint_a
        consumed = 0             # characters in the text chunks already passed
        cursor = 0               # position reached inside the current chunk
        opened_in_chunk = false  # an opening tag was placed in the current chunk

        # Flattened, the hits alternate between start (even) and end (odd) offsets.
        hits.flatten.each_with_index do |hit, index|
          tag = tags[index % 2]
          placed = false

          # Emit whole chunks until the one that contains this offset is current.
          until chunk.nil? || hit < consumed + chunk.length
            output << chunk_chars[cursor..-1]
            if opened_in_chunk && hit == consumed + chunk_chars.length
              output << tag
              placed = true
            end

            # Re-emit the existing markup that follows this text chunk, if any.
            markup = chunks[chunk_index + 1]
            output << "<#{markup}>" if markup

            consumed += chunk_chars.length
            cursor = 0
            chunk_index += 2
            chunk = chunks[chunk_index]
            chunk_chars = chunk.to_s.to_codepoint_a
            opened_in_chunk = false
          end

          if !placed && !chunk.nil?
            offset = hit - consumed
            output << chunk_chars[cursor...offset] << tag
            cursor = offset
            opened_in_chunk = index.even?
            placed = true
          end

          # ultimate fallback, hits that run off the end get a closing tag
          output << tag unless placed
        end

        if chunk
          # Emit whatever is left of the current chunk and all following chunks.
          output << chunk_chars[cursor..-1] if cursor < chunk_chars.length
          (chunk_index + 1).upto(chunks.length - 1) do |i|
            output << (i.even? ? chunks[i] : "<#{chunks[i]}>")
          end
        end

        output.flatten.join
      end
    end
  end
end