twitter-text-editted 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,328 @@
+ # encoding: UTF-8
+
+ class String
+   # Helper function to count the character length by first converting to an
+   # array. This is needed because with unicode strings, the return value
+   # of length may be incorrect.
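+   # Illustrative call (editorial addition, not in the original docs; assumes
+   # a Ruby that recognizes multibyte characters, e.g. 1.9+):
+   #
+   #   "naïve".char_length  #=> 5, even where String#length would count bytes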
7
+ def char_length
8
+ if respond_to? :codepoints
9
+ length
10
+ else
11
+ chars.kind_of?(Enumerable) ? chars.to_a.size : chars.size
12
+ end
13
+ end
14
+
15
+ # Helper function to convert this string into an array of unicode characters.
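+   # For example (editorial illustration, on Ruby 1.9+):
+   #
+   #   "日本語".to_char_a  #=> ["日", "本", "語"]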
+   def to_char_a
+     @to_char_a ||= if chars.kind_of?(Enumerable)
+       chars.to_a
+     else
+       char_array = []
+       0.upto(char_length - 1) { |i| char_array << [chars.slice(i)].pack('U') }
+       char_array
+     end
+   end
+ end
+
+ # Helper functions to return character offsets instead of byte offsets.
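+ # For example (editorial illustration; on a UTF-8 Ruby 1.8 string, where
+ # MatchData#begin reports byte offsets):
+ #
+ #   "café x".match(/x/).char_begin(0)  #=> 5 (begin(0) would report 6, a byte offset)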
+ class MatchData
+   def char_begin(n)
+     if string.respond_to? :codepoints
+       self.begin(n)
+     else
+       string[0, self.begin(n)].char_length
+     end
+   end
+
+   def char_end(n)
+     if string.respond_to? :codepoints
+       self.end(n)
+     else
+       string[0, self.end(n)].char_length
+     end
+   end
+ end
+
+ module Twitter
+   # A module for including Tweet parsing in a class. This module provides functions for the
+   # extraction and processing of usernames, lists, URLs and hashtags.
+   module Extractor extend self
+     # Remove overlapping entities.
+     # This returns a new array with no overlapping entities.
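+     # A sketch of the behavior (editorial example; entity hashes trimmed to
+     # their :indices for brevity):
+     #
+     #   remove_overlapping_entities([{:indices => [0, 5]}, {:indices => [3, 9]}, {:indices => [5, 9]}])
+     #   #=> [{:indices => [0, 5]}, {:indices => [5, 9]}]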
+     def remove_overlapping_entities(entities)
+       # sort by start index
+       entities = entities.sort_by{|entity| entity[:indices].first}
+
+       # remove duplicates
+       prev = nil
+       entities.reject!{|entity| (prev && prev[:indices].last > entity[:indices].first) || (prev = entity) && false}
+       entities
+     end
+
+     # Extracts all usernames, lists, hashtags and URLs in the Tweet <tt>text</tt>
+     # along with the indices for where each entity occurred.
+     # If the <tt>text</tt> is <tt>nil</tt> or contains no entities, an empty array
+     # will be returned.
+     #
+     # If a block is given then it will be called for each entity.
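+     # Illustrative call (editorial example; indices assume the extraction
+     # rules defined in Twitter::Regex):
+     #
+     #   extract_entities_with_indices("@you http://t.co/abc")
+     #   #=> [{:screen_name => "you", :list_slug => "", :indices => [0, 4]},
+     #   #    {:url => "http://t.co/abc", :indices => [5, 20]}]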
+     def extract_entities_with_indices(text, options = {}, &block)
+       # extract all entities
+       entities = extract_urls_with_indices(text, options) +
+                  extract_hashtags_with_indices(text, :check_url_overlap => false) +
+                  extract_mentions_or_lists_with_indices(text) +
+                  extract_cashtags_with_indices(text)
+
+       return [] if entities.empty?
+
+       entities = remove_overlapping_entities(entities)
+
+       entities.each(&block) if block_given?
+       entities
+     end
+
+     # Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>. If the
+     # <tt>text</tt> is <tt>nil</tt> or contains no username mentions an empty array
+     # will be returned.
+     #
+     # If a block is given then it will be called for each username.
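+     # For example (editorial illustration):
+     #
+     #   extract_mentioned_screen_names("@alice meet @bob")  #=> ["alice", "bob"]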
+     def extract_mentioned_screen_names(text, &block) # :yields: username
+       screen_names = extract_mentioned_screen_names_with_indices(text).map{|m| m[:screen_name]}
+       screen_names.each(&block) if block_given?
+       screen_names
+     end
+
+     # Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>
+     # along with the indices for where each mention occurred. If the
+     # <tt>text</tt> is nil or contains no username mentions, an empty array
+     # will be returned.
+     #
+     # If a block is given, then it will be called with each username, the start
+     # index, and the end index in the <tt>text</tt>.
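+     # For example (editorial illustration):
+     #
+     #   extract_mentioned_screen_names_with_indices("@alice hi")
+     #   #=> [{:screen_name => "alice", :indices => [0, 6]}]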
+     def extract_mentioned_screen_names_with_indices(text) # :yields: username, start, end
+       return [] unless text
+
+       possible_screen_names = []
+       extract_mentions_or_lists_with_indices(text) do |screen_name, list_slug, start_position, end_position|
+         next unless list_slug.empty?
+         possible_screen_names << {
+           :screen_name => screen_name,
+           :indices => [start_position, end_position]
+         }
+       end
+
+       if block_given?
+         possible_screen_names.each do |mention|
+           yield mention[:screen_name], mention[:indices].first, mention[:indices].last
+         end
+       end
+
+       possible_screen_names
+     end
+
+     # Extracts a list of all usernames or lists mentioned in the Tweet <tt>text</tt>
+     # along with the indices for where each mention occurred. If the
+     # <tt>text</tt> is nil or contains no username or list mentions, an empty array
+     # will be returned.
+     #
+     # If a block is given, then it will be called with each username, list slug, the start
+     # index, and the end index in the <tt>text</tt>. The list_slug will be an empty string
+     # if this is a username mention.
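+     # For example (editorial illustration):
+     #
+     #   extract_mentions_or_lists_with_indices("@rails/core ships")
+     #   #=> [{:screen_name => "rails", :list_slug => "/core", :indices => [0, 11]}]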
+     def extract_mentions_or_lists_with_indices(text) # :yields: username, list_slug, start, end
+       return [] unless text =~ /[@@]/
+
+       possible_entries = []
+       text.to_s.scan(Twitter::Regex[:valid_mention_or_list]) do |before, at, screen_name, list_slug|
+         match_data = $~
+         after = $'
+         unless after =~ Twitter::Regex[:end_mention_match]
+           start_position = match_data.char_begin(3) - 1
+           end_position = match_data.char_end(list_slug.nil? ? 3 : 4)
+           possible_entries << {
+             :screen_name => screen_name,
+             :list_slug => list_slug || "",
+             :indices => [start_position, end_position]
+           }
+         end
+       end
+
+       if block_given?
+         possible_entries.each do |mention|
+           yield mention[:screen_name], mention[:list_slug], mention[:indices].first, mention[:indices].last
+         end
+       end
+
+       possible_entries
+     end
+
+     # Extracts the username replied to in the Tweet <tt>text</tt>. If the
+     # <tt>text</tt> is <tt>nil</tt> or is not a reply, <tt>nil</tt> will be returned.
+     #
+     # If a block is given then it will be called with the username replied to (if any).
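+     # For example (editorial illustration):
+     #
+     #   extract_reply_screen_name("@alice thanks!")  #=> "alice"
+     #   extract_reply_screen_name("thanks @alice!")  #=> nil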
+     def extract_reply_screen_name(text) # :yields: username
+       return nil unless text
+
+       possible_screen_name = text.match(Twitter::Regex[:valid_reply])
+       return unless possible_screen_name.respond_to?(:captures)
+       return if $' =~ Twitter::Regex[:end_mention_match]
+       screen_name = possible_screen_name.captures.first
+       yield screen_name if block_given?
+       screen_name
+     end
+
+     # Extracts a list of all URLs included in the Tweet <tt>text</tt>. If the
+     # <tt>text</tt> is <tt>nil</tt> or contains no URLs an empty array
+     # will be returned.
+     #
+     # If a block is given then it will be called for each URL.
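+     # For example (editorial illustration):
+     #
+     #   extract_urls("see http://example.com for details")  #=> ["http://example.com"]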
+     def extract_urls(text, &block) # :yields: url
+       urls = extract_urls_with_indices(text).map{|u| u[:url]}
+       urls.each(&block) if block_given?
+       urls
+     end
+
+     # Extracts a list of all URLs included in the Tweet <tt>text</tt> along
+     # with the indices. If the <tt>text</tt> is <tt>nil</tt> or contains no
+     # URLs an empty array will be returned.
+     #
+     # If a block is given then it will be called for each URL.
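+     # For example (editorial illustration; the protocol-less form relies on
+     # the :extract_url_without_protocol default):
+     #
+     #   extract_urls_with_indices("on example.com/foo today")
+     #   #=> [{:url => "example.com/foo", :indices => [3, 18]}]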
+     def extract_urls_with_indices(text, options = {:extract_url_without_protocol => true}) # :yields: url, start, end
+       return [] unless text && (options[:extract_url_without_protocol] ? text.index(".") : text.index(":"))
+       urls = []
+       position = 0
+
+       text.to_s.scan(Twitter::Regex[:valid_url]) do |all, before, url, protocol, domain, port, path, query|
+         valid_url_match_data = $~
+
+         start_position = valid_url_match_data.char_begin(3)
+         end_position = valid_url_match_data.char_end(3)
+
+         # If protocol is missing and domain contains non-ASCII characters,
+         # extract ASCII-only domains.
+         if !protocol
+           next if !options[:extract_url_without_protocol] || before =~ Twitter::Regex[:invalid_url_without_protocol_preceding_chars]
+           last_url = nil
+           last_url_invalid_match = nil
+           domain.scan(Twitter::Regex[:valid_ascii_domain]) do |ascii_domain|
+             last_url = {
+               :url => ascii_domain,
+               :indices => [start_position + $~.char_begin(0),
+                            start_position + $~.char_end(0)]
+             }
+             last_url_invalid_match = ascii_domain =~ Twitter::Regex[:invalid_short_domain]
+             urls << last_url unless last_url_invalid_match
+           end
+
+           # no ASCII-only domain found. Skip the entire URL.
+           next unless last_url
+
+           # last_url only contains the domain. Add the path and query if they exist.
+           if path
+             # last_url was not added above. Add it to urls here.
+             urls << last_url if last_url_invalid_match
+             last_url[:url] = url.sub(domain, last_url[:url])
+             last_url[:indices][1] = end_position
+           end
+         else
+           # In the case of t.co URLs, don't allow additional path characters
+           if url =~ Twitter::Regex[:valid_tco_url]
+             url = $&
+             end_position = start_position + url.char_length
+           end
+           urls << {
+             :url => url,
+             :indices => [start_position, end_position]
+           }
+         end
+       end
+       urls.each{|url| yield url[:url], url[:indices].first, url[:indices].last} if block_given?
+       urls
+     end
+
+     # Extracts a list of all hashtags included in the Tweet <tt>text</tt>. If the
+     # <tt>text</tt> is <tt>nil</tt> or contains no hashtags an empty array
+     # will be returned. The array returned will not include the leading <tt>#</tt>
+     # character.
+     #
+     # If a block is given then it will be called for each hashtag.
+     def extract_hashtags(text, &block) # :yields: hashtag_text
+       hashtags = extract_hashtags_with_indices(text).map{|h| h[:hashtag]}
+       hashtags.each(&block) if block_given?
+       hashtags
+     end
+
+     # Extracts a list of all hashtags included in the Tweet <tt>text</tt> along
+     # with the indices. If the <tt>text</tt> is <tt>nil</tt> or contains no
+     # hashtags an empty array will be returned. The array returned will not
+     # include the leading <tt>#</tt> character.
+     #
+     # If a block is given then it will be called for each hashtag.
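+     # For reference, upstream twitter-text behaves like this (editorial
+     # example; this package's modified HASHTAG pattern in Twitter::Regex,
+     # which defines no capture groups, may not reproduce it):
+     #
+     #   extract_hashtags_with_indices("some #ruby code")
+     #   #=> [{:hashtag => "ruby", :indices => [5, 10]}]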
+     def extract_hashtags_with_indices(text, options = {:check_url_overlap => true}) # :yields: hashtag_text, start, end
+       return [] unless text =~ /[##]/
+
+       tags = []
+       text.scan(Twitter::Regex[:valid_hashtag]) do |before, hash, hash_text|
+         match_data = $~
+         start_position = match_data.char_begin(2)
+         end_position = match_data.char_end(3)
+         after = $'
+         unless after =~ Twitter::Regex[:end_hashtag_match]
+           tags << {
+             :hashtag => hash_text,
+             :indices => [start_position, end_position]
+           }
+         end
+       end
+
+       if options[:check_url_overlap]
+         # extract URLs
+         urls = extract_urls_with_indices(text)
+         unless urls.empty?
+           tags.concat(urls)
+           # remove duplicates
+           tags = remove_overlapping_entities(tags)
+           # remove URL entities
+           tags.reject!{|entity| !entity[:hashtag] }
+         end
+       end
+
+       tags.each{|tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last} if block_given?
+       tags
+     end
+
+     # Extracts a list of all cashtags included in the Tweet <tt>text</tt>. If the
+     # <tt>text</tt> is <tt>nil</tt> or contains no cashtags an empty array
+     # will be returned. The array returned will not include the leading <tt>$</tt>
+     # character.
+     #
+     # If a block is given then it will be called for each cashtag.
+     def extract_cashtags(text, &block) # :yields: cashtag_text
+       cashtags = extract_cashtags_with_indices(text).map{|h| h[:cashtag]}
+       cashtags.each(&block) if block_given?
+       cashtags
+     end
+
+     # Extracts a list of all cashtags included in the Tweet <tt>text</tt> along
+     # with the indices. If the <tt>text</tt> is <tt>nil</tt> or contains no
+     # cashtags an empty array will be returned. The array returned will not
+     # include the leading <tt>$</tt> character.
+     #
+     # If a block is given then it will be called for each cashtag.
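+     # For example (editorial illustration):
+     #
+     #   extract_cashtags_with_indices("buy $TWTR now")
+     #   #=> [{:cashtag => "TWTR", :indices => [4, 9]}]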
+     def extract_cashtags_with_indices(text) # :yields: cashtag_text, start, end
+       return [] unless text =~ /\$/
+
+       tags = []
+       text.scan(Twitter::Regex[:valid_cashtag]) do |before, dollar, cash_text|
+         match_data = $~
+         start_position = match_data.char_begin(2)
+         end_position = match_data.char_end(3)
+         tags << {
+           :cashtag => cash_text,
+           :indices => [start_position, end_position]
+         }
+       end
+
+       tags.each{|tag| yield tag[:cashtag], tag[:indices].first, tag[:indices].last} if block_given?
+       tags
+     end
+   end
+ end
@@ -0,0 +1,21 @@
+ module Twitter
+   module HashHelper
+     # Return a new hash with all keys converted to symbols, as long as
+     # they respond to +to_sym+.
+     #
+     #   Twitter::HashHelper.symbolize_keys({ 'name' => 'Rob', 'years' => '28' })
+     #   #=> { :name => "Rob", :years => "28" }
+     def self.symbolize_keys(hash)
+       symbolize_keys!(hash.dup)
+     end
+
+     # Destructively convert all keys to symbols, as long as they respond
+     # to +to_sym+. Same as +symbolize_keys+, but modifies the given hash in place.
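+     # For example (editorial illustration):
+     #
+     #   h = { 'name' => 'Rob' }
+     #   Twitter::HashHelper.symbolize_keys!(h)
+     #   h  #=> { :name => "Rob" }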
+     def self.symbolize_keys!(hash)
+       hash.keys.each do |key|
+         hash[(key.to_sym rescue key) || key] = hash.delete(key)
+       end
+       hash
+     end
+   end
+ end
@@ -0,0 +1,86 @@
+ module Twitter
+   # Module for doing "hit highlighting" on Tweets that have been auto-linked already.
+   # Useful with the results returned from the Search API.
+   module HitHighlighter extend self
+     # Default tag used for hit highlighting
+     DEFAULT_HIGHLIGHT_TAG = "em"
+
+     # Add <tt><em></em></tt> tags around the <tt>hits</tt> provided in the <tt>text</tt>. The
+     # <tt>hits</tt> should be an array of (start, end) index pairs, relative to the original
+     # text before auto-linking (but the <tt>text</tt> may already be auto-linked if desired).
+     #
+     # The <tt><em></em></tt> tags can be overridden using the <tt>:tag</tt> option. For example:
+     #
+     #   irb> hit_highlight("test hit here", [[5, 8]], :tag => 'strong')
+     #   => "test <strong>hit</strong> here"
+     def hit_highlight(text, hits = [], options = {})
+       if hits.empty?
+         return text
+       end
+
+       tag_name = options[:tag] || DEFAULT_HIGHLIGHT_TAG
+       tags = ["<" + tag_name + ">", "</" + tag_name + ">"]
+
+       chunks = text.split(/[<>]/)
+
+       result = []
+       chunk_index, chunk = 0, chunks[0]
+       chunk_chars = chunk.to_s.to_char_a
+       prev_chunks_len = 0
+       chunk_cursor = 0
+       start_in_chunk = false
+       hits.flatten.each_with_index do |hit, index|
+         tag = tags[index % 2]
+
+         placed = false
+         until chunk.nil? || hit < prev_chunks_len + chunk.length do
+           result << chunk_chars[chunk_cursor..-1]
+           if start_in_chunk && hit == prev_chunks_len + chunk_chars.length
+             result << tag
+             placed = true
+           end
+
+           # correctly handle highlights that end on the final character.
+           if tag_text = chunks[chunk_index+1]
+             result << "<#{tag_text}>"
+           end
+
+           prev_chunks_len += chunk_chars.length
+           chunk_cursor = 0
+           chunk_index += 2
+           chunk = chunks[chunk_index]
+           chunk_chars = chunk.to_s.to_char_a
+           start_in_chunk = false
+         end
+
+         if !placed && !chunk.nil?
+           hit_spot = hit - prev_chunks_len
+           result << chunk_chars[chunk_cursor...hit_spot] << tag
+           chunk_cursor = hit_spot
+           start_in_chunk = (index % 2 == 0)
+           placed = true
+         end
+
+         # ultimate fallback: hits that run off the end get a closing tag
+         if !placed
+           result << tag
+         end
+       end
+
+       if chunk
+         if chunk_cursor < chunk_chars.length
+           result << chunk_chars[chunk_cursor..-1]
+         end
+         (chunk_index+1).upto(chunks.length-1) do |index|
+           result << (index.even? ? chunks[index] : "<#{chunks[index]}>")
+         end
+       end
+
+       result.flatten.join
+     end
+   end
+ end
@@ -0,0 +1,366 @@
+ # encoding: UTF-8
+
+ module Twitter
+   # A collection of regular expressions for parsing Tweet text. The regular expression
+   # list is frozen at load time to ensure immutability. These regular expressions are
+   # used throughout the <tt>Twitter</tt> classes. Special care has been taken to make
+   # sure these regular expressions work with Tweets in all languages.
+   class Regex
+     REGEXEN = {} # :nodoc:
+
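+     # Builds a character or character-range fragment for interpolation into
+     # a regexp (editorial note; the $RUBY_1_9 branch emits \u{...} escapes,
+     # the fallback packs literal characters):
+     #
+     #   regex_range(0x30A1, 0x30FA)  #=> "\\u{30a1}-\\u{30fa}" when $RUBY_1_9 is set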
+     def self.regex_range(from, to = nil) # :nodoc:
+       if $RUBY_1_9
+         if to
+           "\\u{#{from.to_s(16).rjust(4, '0')}}-\\u{#{to.to_s(16).rjust(4, '0')}}"
+         else
+           "\\u{#{from.to_s(16).rjust(4, '0')}}"
+         end
+       else
+         if to
+           [from].pack('U') + '-' + [to].pack('U')
+         else
+           [from].pack('U')
+         end
+       end
+     end
+
+     # Space is more than %20: U+3000, for example, is the full-width space used with Kanji. Provide a shorthand
+     # to access both the list of characters and a pattern suitable for use with String#split.
+     # Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE
+     UNICODE_SPACES = [
+       (0x0009..0x000D).to_a, # White_Space # Cc   [5] <control-0009>..<control-000D>
+       0x0020,                # White_Space # Zs       SPACE
+       0x0085,                # White_Space # Cc       <control-0085>
+       0x00A0,                # White_Space # Zs       NO-BREAK SPACE
+       0x1680,                # White_Space # Zs       OGHAM SPACE MARK
+       0x180E,                # White_Space # Zs       MONGOLIAN VOWEL SEPARATOR
+       (0x2000..0x200A).to_a, # White_Space # Zs  [11] EN QUAD..HAIR SPACE
+       0x2028,                # White_Space # Zl       LINE SEPARATOR
+       0x2029,                # White_Space # Zp       PARAGRAPH SEPARATOR
+       0x202F,                # White_Space # Zs       NARROW NO-BREAK SPACE
+       0x205F,                # White_Space # Zs       MEDIUM MATHEMATICAL SPACE
+       0x3000,                # White_Space # Zs       IDEOGRAPHIC SPACE
+     ].flatten.map{|c| [c].pack('U*')}.freeze
+     REGEXEN[:spaces] = /[#{UNICODE_SPACES.join('')}]/o
+
+     # Characters not allowed in Tweets
+     INVALID_CHARACTERS = [
+       0xFFFE, 0xFEFF, # BOM
+       0xFFFF,         # Special
+       0x202A, 0x202B, 0x202C, 0x202D, 0x202E # Directional change
+     ].map{|cp| [cp].pack('U') }.freeze
+     REGEXEN[:invalid_control_characters] = /[#{INVALID_CHARACTERS.join('')}]/o
+
+     major, minor, patch = RUBY_VERSION.split('.')
+     if major.to_i >= 2 || major.to_i == 1 && minor.to_i >= 9 || (defined?(RUBY_ENGINE) && ["jruby", "rbx"].include?(RUBY_ENGINE))
+       REGEXEN[:list_name] = /[a-zA-Z][a-zA-Z0-9_\-\u0080-\u00ff]{0,24}/
+     else
+       # This line barfs at compile time in Ruby 1.9, JRuby, or Rubinius.
+       REGEXEN[:list_name] = eval("/[a-zA-Z][a-zA-Z0-9_\\-\x80-\xff]{0,24}/")
+     end
+
+     # Latin accented characters
+     # Excludes 0xd7 from the range (the multiplication sign, confusable with "x").
+     # Also excludes 0xf7, the division sign.
+     LATIN_ACCENTS = [
+       regex_range(0xc0, 0xd6),
+       regex_range(0xd8, 0xf6),
+       regex_range(0xf8, 0xff),
+       regex_range(0x0100, 0x024f),
+       regex_range(0x0253, 0x0254),
+       regex_range(0x0256, 0x0257),
+       regex_range(0x0259),
+       regex_range(0x025b),
+       regex_range(0x0263),
+       regex_range(0x0268),
+       regex_range(0x026f),
+       regex_range(0x0272),
+       regex_range(0x0289),
+       regex_range(0x028b),
+       regex_range(0x02bb),
+       regex_range(0x0300, 0x036f),
+       regex_range(0x1e00, 0x1eff)
+     ].join('').freeze
+
+     RTL_CHARACTERS = [
+       regex_range(0x0600, 0x06FF),
+       regex_range(0x0750, 0x077F),
+       regex_range(0x0590, 0x05FF),
+       regex_range(0xFE70, 0xFEFF)
+     ].join('').freeze
+
+     NON_LATIN_HASHTAG_CHARS = [
+       # Cyrillic (Russian, Ukrainian, etc.)
+       regex_range(0x0400, 0x04ff), # Cyrillic
+       regex_range(0x0500, 0x0527), # Cyrillic Supplement
+       regex_range(0x2de0, 0x2dff), # Cyrillic Extended A
+       regex_range(0xa640, 0xa69f), # Cyrillic Extended B
+       regex_range(0x0591, 0x05bf), # Hebrew
+       regex_range(0x05c1, 0x05c2),
+       regex_range(0x05c4, 0x05c5),
+       regex_range(0x05c7),
+       regex_range(0x05d0, 0x05ea),
+       regex_range(0x05f0, 0x05f4),
+       regex_range(0xfb12, 0xfb28), # Hebrew Presentation Forms
+       regex_range(0xfb2a, 0xfb36),
+       regex_range(0xfb38, 0xfb3c),
+       regex_range(0xfb3e),
+       regex_range(0xfb40, 0xfb41),
+       regex_range(0xfb43, 0xfb44),
+       regex_range(0xfb46, 0xfb4f),
+       regex_range(0x0610, 0x061a), # Arabic
+       regex_range(0x0620, 0x065f),
+       regex_range(0x066e, 0x06d3),
+       regex_range(0x06d5, 0x06dc),
+       regex_range(0x06de, 0x06e8),
+       regex_range(0x06ea, 0x06ef),
+       regex_range(0x06fa, 0x06fc),
+       regex_range(0x06ff),
+       regex_range(0x0750, 0x077f), # Arabic Supplement
+       regex_range(0x08a0),         # Arabic Extended A
+       regex_range(0x08a2, 0x08ac),
+       regex_range(0x08e4, 0x08fe),
+       regex_range(0xfb50, 0xfbb1), # Arabic Pres. Forms A
+       regex_range(0xfbd3, 0xfd3d),
+       regex_range(0xfd50, 0xfd8f),
+       regex_range(0xfd92, 0xfdc7),
+       regex_range(0xfdf0, 0xfdfb),
+       regex_range(0xfe70, 0xfe74), # Arabic Pres. Forms B
+       regex_range(0xfe76, 0xfefc),
+       regex_range(0x200c, 0x200c), # Zero-Width Non-Joiner
+       regex_range(0x0e01, 0x0e3a), # Thai
+       regex_range(0x0e40, 0x0e4e), # Thai
+       regex_range(0x1100, 0x11ff), # Hangul Jamo (Korean)
+       regex_range(0x3130, 0x3185), # Hangul Compatibility Jamo
+       regex_range(0xA960, 0xA97F), # Hangul Jamo Extended-A
+       regex_range(0xAC00, 0xD7AF), # Hangul Syllables
+       regex_range(0xD7B0, 0xD7FF), # Hangul Jamo Extended-B
+       regex_range(0xFFA1, 0xFFDC)  # Half-width Hangul
+     ].join('').freeze
+     REGEXEN[:latin_accents] = /[#{LATIN_ACCENTS}]+/o
+
+     CJ_HASHTAG_CHARACTERS = [
+       regex_range(0x30A1, 0x30FA), regex_range(0x30FC, 0x30FE), # Katakana (full-width)
+       regex_range(0xFF66, 0xFF9F), # Katakana (half-width)
+       regex_range(0xFF10, 0xFF19), regex_range(0xFF21, 0xFF3A), regex_range(0xFF41, 0xFF5A), # Latin (full-width)
+       regex_range(0x3041, 0x3096), regex_range(0x3099, 0x309E), # Hiragana
+       regex_range(0x3400, 0x4DBF),   # Kanji (CJK Extension A)
+       regex_range(0x4E00, 0x9FFF),   # Kanji (Unified)
+       regex_range(0x20000, 0x2A6DF), # Kanji (CJK Extension B)
+       regex_range(0x2A700, 0x2B73F), # Kanji (CJK Extension C)
+       regex_range(0x2B740, 0x2B81F), # Kanji (CJK Extension D)
+       regex_range(0x2F800, 0x2FA1F), regex_range(0x3003), regex_range(0x3005), regex_range(0x303B) # Kanji (CJK supplement)
+     ].join('').freeze
155
+
156
+ PUNCTUATION_CHARS = '!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~'
157
+ SPACE_CHARS = " \t\n\x0B\f\r"
158
+ CTRL_CHARS = "\x00-\x1F\x7F"
159
+
160
+ # A hashtag must contain latin characters, numbers and underscores, but not all numbers.
161
+ HASHTAG_ALPHA = /[a-z_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/io
162
+ HASHTAG_ALPHANUMERIC = /[a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/io
163
+ HASHTAG_BOUNDARY = /\A|\z|[^&a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/o
164
+
165
+ # Original:
166
+ #HASHTAG = /(#{HASHTAG_BOUNDARY})(#|#)(#{HASHTAG_ALPHANUMERIC}*#{HASHTAG_ALPHA}#{HASHTAG_ALPHANUMERIC}*)/io
167
+
168
+ # The new one:
169
+ HASHTAG = /[#].\S+/io
170
+
171
+ REGEXEN[:valid_hashtag] = /#{HASHTAG}/io
172
+ # Used in Extractor for final filtering
173
+ REGEXEN[:end_hashtag_match] = /\A(?:[##]|:\/\/)/o
174
+
175
+ REGEXEN[:valid_mention_preceding_chars] = /(?:[^a-zA-Z0-9_!#\$%&*@@]|^|RT:?)/o
176
+ REGEXEN[:at_signs] = /[@@]/
177
+ REGEXEN[:valid_mention_or_list] = /
178
+ (#{REGEXEN[:valid_mention_preceding_chars]}) # $1: Preceeding character
179
+ (#{REGEXEN[:at_signs]}) # $2: At mark
180
+ ([a-zA-Z0-9_]{1,20}) # $3: Screen name
181
+ (\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})? # $4: List (optional)
182
+ /ox
183
+ REGEXEN[:valid_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
184
+ # Used in Extractor for final filtering
185
+ REGEXEN[:end_mention_match] = /\A(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o
186
+
187
+ # URL related hash regex collection
188
+ REGEXEN[:valid_url_preceding_chars] = /(?:[^A-Z0-9@@$###{INVALID_CHARACTERS.join('')}]|^)/io
189
+ REGEXEN[:invalid_url_without_protocol_preceding_chars] = /[-_.\/]$/
190
+ DOMAIN_VALID_CHARS = "[^#{PUNCTUATION_CHARS}#{SPACE_CHARS}#{CTRL_CHARS}#{INVALID_CHARACTERS.join('')}#{UNICODE_SPACES.join('')}]"
191
+ REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
192
+ REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
193
+
194
+ REGEXEN[:valid_gTLD] = /(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|xxx)(?=[^0-9a-z]|$))/i
195
+ REGEXEN[:valid_ccTLD] = %r{
196
+ (?:
197
+ (?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|
198
+ ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|
199
+ gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|
200
+ lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|
201
+ pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|
202
+ th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)
203
+ (?=[^0-9a-z]|$)
204
+ )
205
+ }ix
206
+ REGEXEN[:valid_punycode] = /(?:xn--[0-9a-z]+)/i
207
+
208
+ REGEXEN[:valid_domain] = /(?:
209
+ #{REGEXEN[:valid_subdomain]}*#{REGEXEN[:valid_domain_name]}
210
+ (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
211
+ )/iox
212
+
213
+ # This is used in Extractor
214
+ REGEXEN[:valid_ascii_domain] = /
215
+ (?:(?:[A-Za-z0-9\-_]|#{REGEXEN[:latin_accents]})+\.)+
216
+ (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
217
+ /iox
+
+     # This is used in Extractor for stricter t.co URL extraction
+     REGEXEN[:valid_tco_url] = /^https?:\/\/t\.co\/[a-z0-9]+/i
+
+     # This is used in Extractor to filter out unwanted URLs.
+     REGEXEN[:invalid_short_domain] = /\A#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_ccTLD]}\Z/io
+
+     REGEXEN[:valid_port_number] = /[0-9]+/
+
+     REGEXEN[:valid_general_url_path_chars] = /[a-z0-9!\*';:=\+\,\.\$\/%#\[\]\-_~&|@#{LATIN_ACCENTS}]/io
+     # Allow URL paths to contain balanced parens
+     #  1. Used in Wikipedia URLs like /Primer_(film)
+     #  2. Used in IIS sessions like /S(dfd346)/
+     REGEXEN[:valid_url_balanced_parens] = /\(#{REGEXEN[:valid_general_url_path_chars]}+\)/io
+     # Valid end-of-path characters (so /foo. does not gobble the period).
+     #  1. Allow =&# for empty URL parameters and other URL-join artifacts
+     REGEXEN[:valid_url_path_ending_chars] = /[a-z0-9=_#\/\+\-#{LATIN_ACCENTS}]|(?:#{REGEXEN[:valid_url_balanced_parens]})/io
+     REGEXEN[:valid_url_path] = /(?:
+       (?:
+         #{REGEXEN[:valid_general_url_path_chars]}*
+         (?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)*
+         #{REGEXEN[:valid_url_path_ending_chars]}
+       )|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/)
+     )/iox
+
+     REGEXEN[:valid_url_query_chars] = /[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]/i
+     REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/]/i
+     REGEXEN[:valid_url] = %r{
+       (                                                                                     # $1 total match
+         (#{REGEXEN[:valid_url_preceding_chars]})                                            # $2 Preceding character
+         (                                                                                   # $3 URL
+           (https?:\/\/)?                                                                    # $4 Protocol (optional)
+           (#{REGEXEN[:valid_domain]})                                                       # $5 Domain(s)
+           (?::(#{REGEXEN[:valid_port_number]}))?                                            # $6 Port number (optional)
+           (/#{REGEXEN[:valid_url_path]}*)?                                                  # $7 URL Path and anchor
+           (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # $8 Query String
+         )
+       )
+     }iox
+
+     REGEXEN[:cashtag] = /[a-z]{1,6}(?:[._][a-z]{1,2})?/i
+     REGEXEN[:valid_cashtag] = /(^|#{REGEXEN[:spaces]})(\$)(#{REGEXEN[:cashtag]})(?=$|\s|[#{PUNCTUATION_CHARS}])/i
+
+     # These URL validation pattern strings are based on the ABNF from RFC 3986
+     REGEXEN[:validate_url_unreserved] = /[a-z0-9\-._~]/i
+     REGEXEN[:validate_url_pct_encoded] = /(?:%[0-9a-f]{2})/i
+     REGEXEN[:validate_url_sub_delims] = /[!$&'()*+,;=]/i
+     REGEXEN[:validate_url_pchar] = /(?:
+       #{REGEXEN[:validate_url_unreserved]}|
+       #{REGEXEN[:validate_url_pct_encoded]}|
+       #{REGEXEN[:validate_url_sub_delims]}|
+       [:\|@]
+     )/iox
+
+     REGEXEN[:validate_url_scheme] = /(?:[a-z][a-z0-9+\-.]*)/i
+     REGEXEN[:validate_url_userinfo] = /(?:
+       #{REGEXEN[:validate_url_unreserved]}|
+       #{REGEXEN[:validate_url_pct_encoded]}|
+       #{REGEXEN[:validate_url_sub_delims]}|
+       :
+     )*/iox
+
+     REGEXEN[:validate_url_dec_octet] = /(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))/i
+     REGEXEN[:validate_url_ipv4] =
+       /(?:#{REGEXEN[:validate_url_dec_octet]}(?:\.#{REGEXEN[:validate_url_dec_octet]}){3})/iox
+
+     # Punting on real IPv6 validation for now
+     REGEXEN[:validate_url_ipv6] = /(?:\[[a-f0-9:\.]+\])/i
+
+     # Also punting on IPvFuture for now
+     REGEXEN[:validate_url_ip] = /(?:
+       #{REGEXEN[:validate_url_ipv4]}|
+       #{REGEXEN[:validate_url_ipv6]}
+     )/iox
+
+     # This is stricter than the RFC specifies
+     REGEXEN[:validate_url_subdomain_segment] = /(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)/i
+     REGEXEN[:validate_url_domain_segment] = /(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)/i
+     REGEXEN[:validate_url_domain_tld] = /(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)/i
+     REGEXEN[:validate_url_domain] = /(?:(?:#{REGEXEN[:validate_url_subdomain_segment]}\.)*
+                                     (?:#{REGEXEN[:validate_url_domain_segment]}\.)
+                                     #{REGEXEN[:validate_url_domain_tld]})/iox
+
+     REGEXEN[:validate_url_host] = /(?:
+       #{REGEXEN[:validate_url_ip]}|
+       #{REGEXEN[:validate_url_domain]}
+     )/iox
+
+     # Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences
+     REGEXEN[:validate_url_unicode_subdomain_segment] =
+       /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9_\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
+     REGEXEN[:validate_url_unicode_domain_segment] =
+       /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
+     REGEXEN[:validate_url_unicode_domain_tld] =
+       /(?:(?:[a-z]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
+     REGEXEN[:validate_url_unicode_domain] = /(?:(?:#{REGEXEN[:validate_url_unicode_subdomain_segment]}\.)*
+                                             (?:#{REGEXEN[:validate_url_unicode_domain_segment]}\.)
+                                             #{REGEXEN[:validate_url_unicode_domain_tld]})/iox
+
+     REGEXEN[:validate_url_unicode_host] = /(?:
+       #{REGEXEN[:validate_url_ip]}|
+       #{REGEXEN[:validate_url_unicode_domain]}
+     )/iox
+
+     REGEXEN[:validate_url_port] = /[0-9]{1,5}/
+
+     REGEXEN[:validate_url_unicode_authority] = %r{
+       (?:(#{REGEXEN[:validate_url_userinfo]})@)?   # $1 userinfo
+       (#{REGEXEN[:validate_url_unicode_host]})     # $2 host
+       (?::(#{REGEXEN[:validate_url_port]}))?       # $3 port
+     }iox
+
+     REGEXEN[:validate_url_authority] = %r{
+       (?:(#{REGEXEN[:validate_url_userinfo]})@)?   # $1 userinfo
+       (#{REGEXEN[:validate_url_host]})             # $2 host
+       (?::(#{REGEXEN[:validate_url_port]}))?       # $3 port
+     }iox
+
+     REGEXEN[:validate_url_path] = %r{(/#{REGEXEN[:validate_url_pchar]}*)*}i
+     REGEXEN[:validate_url_query] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
+     REGEXEN[:validate_url_fragment] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
+
+     # Modified version of RFC 3986 Appendix B
+     REGEXEN[:validate_url_unencoded] = %r{
+       \A                # Full URL
+       (?:
+         ([^:/?#]+)://   # $1 Scheme
+       )?
+       ([^/?#]*)         # $2 Authority
+       ([^?#]*)          # $3 Path
+       (?:
+         \?([^#]*)       # $4 Query
+       )?
+       (?:
+         \#(.*)          # $5 Fragment
+       )?\Z
+     }ix
+
+     REGEXEN[:rtl_chars] = /[#{RTL_CHARACTERS}]/io
+
+     REGEXEN.each_pair{|k,v| v.freeze }
+
+     # Return the regular expression for a given <tt>key</tt>. If the <tt>key</tt>
+     # is not a known symbol, <tt>nil</tt> will be returned.
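+     # For example (editorial illustration; :no_such_key is a hypothetical name):
+     #
+     #   Twitter::Regex[:at_signs] =~ "@user"  #=> 0
+     #   Twitter::Regex[:no_such_key]          #=> nil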
+     def self.[](key)
+       REGEXEN[key]
+     end
+   end
+ end