twitter-text-kow 1.3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,388 @@
1
+ # Copyright 2018 Twitter, Inc.
2
+ # Licensed under the Apache License, Version 2.0
3
+ # http://www.apache.org/licenses/LICENSE-2.0
4
+
5
+ # encoding: utf-8
6
+ require 'idn'
7
+
8
class String
  # Number of characters (code points) in this string. When String responds
  # to +codepoints+, +length+ is returned directly; otherwise the characters
  # are counted through +chars+.
  def codepoint_length
    return length if respond_to?(:codepoints)

    legacy_chars = chars
    legacy_chars.kind_of?(Enumerable) ? legacy_chars.to_a.size : legacy_chars.size
  end

  # Splits this string into an array holding one entry per character.
  def to_codepoint_a
    return chars.to_a if chars.kind_of?(Enumerable)

    # +chars+ is not Enumerable here: rebuild each character from the value
    # found at every index with Array#pack('U').
    (0...codepoint_length).map { |index| [chars.slice(index)].pack('U') }
  end
end
31
+
32
# Adds offset helpers to MatchData that report positions in code points
# rather than bytes.
class MatchData
  # Start offset of group +n+, counted in characters. When the matched string
  # does not respond to +codepoints+, the offset is treated as a byte offset
  # and converted by measuring the prefix that precedes it.
  def char_begin(n)
    offset = self.begin(n)
    string.respond_to?(:codepoints) ? offset : string[0, offset].codepoint_length
  end

  # End offset of group +n+, counted in characters; same conversion rule as
  # +char_begin+.
  def char_end(n)
    offset = self.end(n)
    string.respond_to?(:codepoints) ? offset : string[0, offset].codepoint_length
  end
end
50
+
51
+ module Twitter
52
+ module TwitterText
53
+ # A module for including Tweet parsing in a class. This module provides function for the extraction and processing
54
+ # of usernames, lists, URLs and hashtags.
55
+ module Extractor extend self
56
+
57
# Upper bound on the length of a URL, as defined by Twitter's backend.
MAX_URL_LENGTH = 4096

# Longest t.co path (slug) that the Twitter backend supports.
MAX_TCO_SLUG_LENGTH = 40

# Number of characters in the "https://" prefix.
URL_PROTOCOL_LENGTH = "https://".size
64
+
65
# Drops entities that overlap an earlier one.
#
# The entities are ordered by their start index; an entity is kept only when
# it starts at or after the end index of the most recently kept entity.
# The input array is left untouched and a new array is returned.
def remove_overlapping_entities(entities)
  ordered = entities.sort_by { |entity| entity[:indices].first }

  last_kept = nil
  ordered.select do |candidate|
    if last_kept && last_kept[:indices].last > candidate[:indices].first
      false
    else
      last_kept = candidate
      true
    end
  end
end
76
+
77
# Collects every URL, hashtag, mention/list, cashtag and (when the
# configuration enables emoji parsing) emoji entity found in the Tweet
# <tt>text</tt>, each with the indices where it occurred. Overlapping entities
# are removed before the result is returned.
#
# The configuration is read from <tt>options[:config]</tt>, falling back to
# the default configuration. <tt>options</tt> is also forwarded to the URL
# extractor.
#
# Returns an empty array when no entity is found. If a block is given it is
# called once for each remaining entity.
def extract_entities_with_indices(text, options = {}, &block)
  config = options[:config] || Twitter::TwitterText::Configuration.default_configuration

  found = []
  found.concat(extract_urls_with_indices(text, options))
  # URL overlap is resolved for all entity kinds at once further down.
  found.concat(extract_hashtags_with_indices(text, :check_url_overlap => false))
  found.concat(extract_mentions_or_lists_with_indices(text))
  found.concat(extract_cashtags_with_indices(text))
  found.concat(extract_emoji_with_indices(text)) if config.emoji_parsing_enabled

  return [] if found.empty?

  distinct = remove_overlapping_entities(found)
  distinct.each(&block) if block_given?
  distinct
end
100
+
101
# Returns the usernames mentioned in the Tweet <tt>text</tt>, in the order
# reported by +extract_mentioned_screen_names_with_indices+. The result is an
# empty array when <tt>text</tt> is <tt>nil</tt> or mentions nobody.
#
# If a block is given then it will be called for each username.
def extract_mentioned_screen_names(text, &block) # :yields: username
  mentions = extract_mentioned_screen_names_with_indices(text)
  names = mentions.map { |mention| mention[:screen_name] }
  names.each(&block) if block_given?
  names
end
111
+
112
# Returns one hash per username mention in the Tweet <tt>text</tt>, holding
# the <tt>:screen_name</tt> and the <tt>:indices</tt> (start, end) where the
# mention occurred. List mentions (entries with a non-empty list slug) are
# left out. An empty array is returned when <tt>text</tt> is nil or contains
# no username mentions.
#
# If a block is given, then it will be called with each username, the start
# index, and the end index in the <tt>text</tt>.
def extract_mentioned_screen_names_with_indices(text) # :yields: username, start, end
  return [] unless text

  user_mentions = extract_mentions_or_lists_with_indices(text).select do |entry|
    entry[:list_slug].empty?
  end

  possible_screen_names = user_mentions.map do |entry|
    first, last = entry[:indices].first, entry[:indices].last
    {:screen_name => entry[:screen_name], :indices => [first, last]}
  end

  if block_given?
    possible_screen_names.each do |mention|
      yield mention[:screen_name], mention[:indices].first, mention[:indices].last
    end
  end

  possible_screen_names
end
139
+
140
# Extracts a list of all usernames or lists mentioned in the Tweet <tt>text</tt>
# along with the indices for where the mention occurred. If the
# <tt>text</tt> is nil or contains no username or list mentions, an empty array
# will be returned.
#
# Each entry is a hash with <tt>:screen_name</tt>, <tt>:list_slug</tt> (an
# empty string for a plain username mention) and <tt>:indices</tt>, where the
# start index points at the at-sign.
#
# If a block is given, then it will be called with each username, list slug, the start
# index, and the end index in the <tt>text</tt>. The list_slug will be an empty string
# if this is a username mention.
def extract_mentions_or_lists_with_indices(text) # :yields: username, list_slug, start, end
  # Cheap pre-check before running the full regex. Accept both the ASCII
  # at-sign and the fullwidth one (U+FF20); the latter is written as an escape
  # so the two alternatives cannot be collapsed into a duplicated character.
  return [] unless text =~ /[@\uFF20]/

  possible_entries = []
  text.to_s.scan(Twitter::TwitterText::Regex[:valid_mention_or_list]) do |_before, _at, screen_name, list_slug|
    match_data = Regexp.last_match
    # Skip candidates whose trailing text marks them as not being a mention.
    next if match_data.post_match =~ Twitter::TwitterText::Regex[:end_mention_match]

    # Group 3 is the screen name; the at-sign is the character right before it.
    start_position = match_data.char_begin(3) - 1
    end_position = match_data.char_end(list_slug.nil? ? 3 : 4)
    possible_entries << {
      :screen_name => screen_name,
      :list_slug => list_slug || "",
      :indices => [start_position, end_position]
    }
  end

  if block_given?
    possible_entries.each do |mention|
      yield mention[:screen_name], mention[:list_slug], mention[:indices].first, mention[:indices].last
    end
  end

  possible_entries
end
174
+
175
# Extracts the username replied to in the Tweet <tt>text</tt>. If the
# <tt>text</tt> is <tt>nil</tt> or is not a reply, nil will be returned.
#
# If a block is given then it will be called with the username replied to (if any)
def extract_reply_screen_name(text) # :yields: username
  return nil unless text

  reply_match = text.match(Twitter::TwitterText::Regex[:valid_reply])
  # A failed match is nil, which has no captures.
  return unless reply_match.respond_to?(:captures)
  # Reject the candidate when the text right after it rules out a mention.
  return if reply_match.post_match =~ Twitter::TwitterText::Regex[:end_mention_match]

  replied_to = reply_match.captures[0]
  yield replied_to if block_given?
  replied_to
end
189
+
190
# Returns the URLs found in the Tweet <tt>text</tt>, as reported by
# +extract_urls_with_indices+ with its default options. The result is an
# empty array when <tt>text</tt> is <tt>nil</tt> or contains no URLs.
#
# If a block is given then it will be called for each URL.
def extract_urls(text, &block) # :yields: url
  url_entities = extract_urls_with_indices(text)
  links = url_entities.map { |entity| entity[:url] }
  links.each(&block) if block_given?
  links
end
200
+
201
# Extracts a list of all URLs included in the Tweet <tt>text</tt> along
# with the indices. If the <tt>text</tt> is <tt>nil</tt> or contains no
# URLs an empty array will be returned.
#
# URLs written without a protocol are only extracted when
# <tt>options[:extract_url_without_protocol]</tt> is truthy (the default when
# no options are passed).
#
# If a block is given then it will be called with each URL, its start index
# and its end index.
def extract_urls_with_indices(text, options = {:extract_url_without_protocol => true}) # :yields: url, start, end
  return [] unless text

  allow_bare = options[:extract_url_without_protocol]
  # Cheap pre-check: a protocol-less URL needs a dot, any other URL a colon.
  return [] unless text.index(allow_bare ? "." : ":")

  regexes = Twitter::TwitterText::Regex
  found = []

  text.to_s.scan(regexes[:valid_url]) do |_all, before, url, protocol, domain, _port, path, _query|
    url_match = Regexp.last_match
    first_char = url_match.char_begin(3)
    last_char = url_match.char_end(3)

    if protocol
      # In the case of t.co URLs, don't allow additional path characters
      if url =~ regexes[:valid_tco_url]
        slug = Regexp.last_match(1)
        next if slug && slug.length > MAX_TCO_SLUG_LENGTH
        url = Regexp.last_match(0)
        last_char = first_char + url.codepoint_length
      end

      next unless is_valid_domain(url.length, domain, protocol)

      found << {:url => url, :indices => [first_char, last_char]}
    else
      next unless allow_bare
      next if before =~ regexes[:invalid_url_without_protocol_preceding_chars]

      # The protocol is missing, so only the ASCII-only domains inside the
      # matched domain are extracted; each valid one becomes its own entry.
      domain_url = nil
      domain.scan(regexes[:valid_ascii_domain]) do |ascii_domain|
        next unless is_valid_domain(url.length, ascii_domain, protocol)
        domain_match = Regexp.last_match
        domain_url = {
          :url => ascii_domain,
          :indices => [first_char + domain_match.char_begin(0),
                       first_char + domain_match.char_end(0)]
        }
        found << domain_url
      end

      # no ASCII-only domain found. Skip the entire URL
      next unless domain_url

      # The last entry only covers the domain so far; when a path was matched,
      # widen that entry (already in the result list) to the end of the URL.
      if path
        domain_url[:url] = url.sub(domain, domain_url[:url])
        domain_url[:indices][1] = last_char
      end
    end
  end

  if block_given?
    found.each { |entry| yield entry[:url], entry[:indices].first, entry[:indices].last }
  end
  found
end
259
+
260
# Returns the hashtag texts found in the Tweet <tt>text</tt>, as reported by
# +extract_hashtags_with_indices+ with its default options. The result is an
# empty array when <tt>text</tt> is <tt>nil</tt> or contains no hashtags, and
# the entries do not include the leading <tt>#</tt> character.
#
# If a block is given then it will be called for each hashtag.
def extract_hashtags(text, &block) # :yields: hashtag_text
  tag_entities = extract_hashtags_with_indices(text)
  tag_texts = tag_entities.map { |entity| entity[:hashtag] }
  tag_texts.each(&block) if block_given?
  tag_texts
end
271
+
272
# Extracts a list of all hashtags included in the Tweet <tt>text</tt> along
# with the indices where they occur. If the <tt>text</tt> is <tt>nil</tt> or
# contains no hashtags an empty array will be returned. The <tt>:hashtag</tt>
# value does not include the leading <tt>#</tt> character, while the start
# index points at it.
#
# When <tt>options[:check_url_overlap]</tt> is truthy (the default when no
# options are passed), hashtags that overlap an extracted URL are discarded.
#
# If a block is given then it will be called with each hashtag text, its
# start index and its end index.
def extract_hashtags_with_indices(text, options = {:check_url_overlap => true}) # :yields: hashtag_text, start, end
  # Cheap pre-check before running the full regex. Accept both the ASCII
  # number sign and the fullwidth one (U+FF03); the latter is written as an
  # escape so the two alternatives cannot be collapsed into a duplicate.
  return [] unless text =~ /[\uFF03#]/

  tags = []
  text.scan(Twitter::TwitterText::Regex[:valid_hashtag]) do |_before, _hash, hash_text|
    match_data = Regexp.last_match
    # Skip candidates whose trailing text marks them as not being a hashtag.
    next if match_data.post_match =~ Twitter::TwitterText::Regex[:end_hashtag_match]

    # Group 2 is the hash character, group 3 the hashtag text.
    tags << {
      :hashtag => hash_text,
      :indices => [match_data.char_begin(2), match_data.char_end(3)]
    }
  end

  if options[:check_url_overlap]
    urls = extract_urls_with_indices(text)
    unless urls.empty?
      # Resolve overlaps against the URLs, then keep only the hashtag entities.
      tags = remove_overlapping_entities(tags + urls).select { |entity| entity[:hashtag] }
    end
  end

  tags.each { |tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last } if block_given?
  tags
end
310
+
311
# Returns the cashtag texts found in the Tweet <tt>text</tt>, as reported by
# +extract_cashtags_with_indices+. The result is an empty array when
# <tt>text</tt> is <tt>nil</tt> or contains no cashtags, and the entries do
# not include the leading <tt>$</tt> character.
#
# If a block is given then it will be called for each cashtag.
def extract_cashtags(text, &block) # :yields: cashtag_text
  tag_entities = extract_cashtags_with_indices(text)
  symbols = tag_entities.map { |entity| entity[:cashtag] }
  symbols.each(&block) if block_given?
  symbols
end
322
+
323
# Extracts a list of all cashtags included in the Tweet <tt>text</tt> along
# with the indices where they occur. If the <tt>text</tt> is <tt>nil</tt> or
# contains no cashtags an empty array will be returned. The <tt>:cashtag</tt>
# value does not include the leading <tt>$</tt> character, while the start
# index points at it.
#
# If a block is given then it will be called with each cashtag text, its
# start index and its end index.
def extract_cashtags_with_indices(text) # :yields: cashtag_text, start, end
  # Cheap pre-check before running the full regex.
  return [] unless text =~ /\$/

  cashtags = []
  text.scan(Twitter::TwitterText::Regex[:valid_cashtag]) do |_before, _dollar, cash_text|
    found = Regexp.last_match
    # Group 2 is the dollar sign, group 3 the cashtag text.
    cashtags << {
      :cashtag => cash_text,
      :indices => [found.char_begin(2), found.char_end(3)]
    }
  end

  if block_given?
    cashtags.each do |entry|
      yield entry[:cashtag], entry[:indices].first, entry[:indices].last
    end
  end
  cashtags
end
346
+
347
# Extracts a list of all emoji included in the Tweet <tt>text</tt> along with
# the indices where they occur. If the <tt>text</tt> is <tt>nil</tt> or
# contains no emoji an empty array will be returned.
#
# Each entry is a hash with the scan result under <tt>:emoji</tt> and the
# start/end positions of the whole match under <tt>:indices</tt>.
#
# If a block is given then it will be called with each emoji, its start index
# and its end index.
def extract_emoji_with_indices(text) # :yields: emoji, start, end
  # Match the sibling extractors: nil text yields no entities instead of
  # raising NoMethodError on nil.scan.
  return [] unless text

  emoji = []
  text.scan(Twitter::TwitterText::Regex[:valid_emoji]) do |emoji_text|
    match_data = Regexp.last_match
    emoji << {
      :emoji => emoji_text,
      :indices => [match_data.char_begin(0), match_data.char_end(0)]
    }
  end

  emoji.each { |entry| yield entry[:emoji], entry[:indices].first, entry[:indices].last } if block_given?
  emoji
end
360
+
361
# Returns true when <tt>text</tt> consists of exactly one emoji, i.e. when
# +extract_emoji_with_indices+ finds a single entity whose <tt>:emoji</tt>
# value equals <tt>text</tt>. Returns false for <tt>nil</tt>, for any other
# text, and when extraction fails with a StandardError.
def is_valid_emoji(text)
  return false unless text

  entities = extract_emoji_with_indices(text)
  entities.count == 1 && entities[0][:emoji] == text
rescue StandardError
  # On error don't consider this a valid emoji. Only StandardError is
  # rescued so that interrupts and exit requests still propagate.
  false
end
371
+
372
# Checks that a URL stays within MAX_URL_LENGTH once its domain is
# IDNA-encoded.
#
# <tt>url_length</tt> is the length of the URL as written, <tt>domain</tt> its
# domain part and <tt>protocol</tt> the matched protocol (falsy when the URL
# was written without one). The length is increased by however much the
# encoded domain is longer than the original, and by URL_PROTOCOL_LENGTH when
# no protocol is present.
#
# Returns false for a <tt>nil</tt> domain and when encoding fails with a
# StandardError.
def is_valid_domain(url_length, domain, protocol)
  return false unless domain

  encoded_domain = IDN::Idna.toASCII(domain)
  growth = encoded_domain.length - domain.length
  url_length += growth if growth > 0
  url_length += URL_PROTOCOL_LENGTH unless protocol
  url_length <= MAX_URL_LENGTH
rescue StandardError
  # On error don't consider this a valid domain. Only StandardError is
  # rescued so that interrupts and exit requests still propagate.
  false
end
386
+ end
387
+ end
388
+ end
@@ -0,0 +1,27 @@
1
+ # Copyright 2018 Twitter, Inc.
2
+ # Licensed under the Apache License, Version 2.0
3
+ # http://www.apache.org/licenses/LICENSE-2.0
4
+
5
module Twitter
  module TwitterText
    # Helpers for converting hash keys to symbols.
    module HashHelper
      class << self
        # Return a new hash with all keys converted to symbols, as long as
        # they respond to +to_sym+. The given hash is not modified.
        #
        #   HashHelper.symbolize_keys({ 'name' => 'Rob', 'years' => '28' })
        #   #=> { :name => "Rob", :years => "28" }
        def symbolize_keys(hash)
          symbolize_keys!(hash.dup)
        end

        # Destructively convert all keys to symbols, as long as they respond
        # to +to_sym+. Same as +symbolize_keys+, but modifies +hash+ in place
        # and returns it.
        def symbolize_keys!(hash)
          hash.keys.each do |key|
            # Keys that cannot be converted are kept unchanged.
            converted = begin
              key.to_sym
            rescue StandardError
              key
            end
            hash[converted || key] = hash.delete(key)
          end
          hash
        end
      end
    end
  end
end
@@ -0,0 +1,92 @@
1
+ # Copyright 2018 Twitter, Inc.
2
+ # Licensed under the Apache License, Version 2.0
3
+ # http://www.apache.org/licenses/LICENSE-2.0
4
+
5
module Twitter
  module TwitterText
    # Module for doing "hit highlighting" on tweets that have been auto-linked already.
    # Useful with the results returned from the Search API.
    module HitHighlighter
      extend self

      # Default Tag used for hit highlighting
      DEFAULT_HIGHLIGHT_TAG = "em"

      # Add <tt><em></em></tt> tags around the <tt>hits</tt> provided in the <tt>text</tt>. The
      # <tt>hits</tt> should be an array of (start, end) index pairs, relative to the original
      # text, before auto-linking (but the <tt>text</tt> may already be auto-linked if desired)
      #
      # The <tt><em></em></tt> tags can be overridden using the <tt>:tag</tt> option. For example:
      #
      #  irb> hit_highlight("test hit here", [[5, 8]], :tag => 'strong')
      #  => "test <strong>hit</strong> here"
      #
      # Returns <tt>text</tt> unchanged when <tt>hits</tt> is empty. Hits that
      # run past the end of the text still get their tag appended.
      def hit_highlight(text, hits = [], options = {})
        return text if hits.empty?

        tag_name = options[:tag] || DEFAULT_HIGHLIGHT_TAG
        # Interpolation (rather than String#+) also accepts a Symbol tag name.
        tags = ["<#{tag_name}>", "</#{tag_name}>"]

        # Splitting on angle brackets alternates plain text (even indices)
        # with the contents of existing tags (odd indices).
        chunks = text.split(/[<>]/)

        result = []
        chunk_index = 0
        chunk = chunks[0]
        chunk_chars = chunk.to_s.to_codepoint_a
        prev_chunks_len = 0   # characters of plain text in the chunks already emitted
        chunk_cursor = 0      # position inside the current chunk already emitted
        start_in_chunk = false

        # Flattened hits alternate start, end, start, end, ... so even
        # positions take the opening tag and odd positions the closing tag.
        hits.flatten.each_with_index do |hit, index|
          tag = tags[index % 2]
          placed = false

          # Emit whole chunks until the hit falls inside the current one. All
          # lengths are measured on chunk_chars so offsets are always counted
          # in characters.
          until chunk.nil? || hit < prev_chunks_len + chunk_chars.length
            result << chunk_chars[chunk_cursor..-1]
            if start_in_chunk && hit == prev_chunks_len + chunk_chars.length
              # correctly handle highlights that end on the final character.
              result << tag
              placed = true
            end

            # Re-emit the existing tag that followed this text chunk, if any.
            tag_text = chunks[chunk_index + 1]
            result << "<#{tag_text}>" if tag_text

            prev_chunks_len += chunk_chars.length
            chunk_cursor = 0
            chunk_index += 2
            chunk = chunks[chunk_index]
            chunk_chars = chunk.to_s.to_codepoint_a
            start_in_chunk = false
          end

          if !placed && !chunk.nil?
            hit_spot = hit - prev_chunks_len
            result << chunk_chars[chunk_cursor...hit_spot] << tag
            chunk_cursor = hit_spot
            start_in_chunk = index.even?
            placed = true
          end

          # ultimate fallback, hits that run off the end get a closing tag
          result << tag unless placed
        end

        if chunk
          result << chunk_chars[chunk_cursor..-1] if chunk_cursor < chunk_chars.length
          # Emit whatever follows the current chunk, restoring tag brackets.
          (chunk_index + 1).upto(chunks.length - 1) do |i|
            result << (i.even? ? chunks[i] : "<#{chunks[i]}>")
          end
        end

        result.flatten.join
      end
    end
  end
end