incollege-text 1.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,328 @@
1
+ # encoding: UTF-8
2
+
3
+ class String
4
+ # Helper function to count the character length by first converting to an
5
+ # array. This is needed because with unicode strings, the return value
6
+ # of length may be incorrect
7
+ def char_length
8
+ if respond_to? :codepoints
9
+ length
10
+ else
11
+ chars.kind_of?(Enumerable) ? chars.to_a.size : chars.size
12
+ end
13
+ end
14
+
15
+ # Helper function to convert this string into an array of unicode characters.
16
+ def to_char_a
17
+ @to_char_a ||= if chars.kind_of?(Enumerable)
18
+ chars.to_a
19
+ else
20
+ char_array = []
21
+ 0.upto(char_length - 1) { |i| char_array << [chars.slice(i)].pack('U') }
22
+ char_array
23
+ end
24
+ end
25
+ end
26
+
27
+ # Helper functions to return character offsets instead of byte offsets.
28
+ class MatchData
29
+ def char_begin(n)
30
+ if string.respond_to? :codepoints
31
+ self.begin(n)
32
+ else
33
+ string[0, self.begin(n)].char_length
34
+ end
35
+ end
36
+
37
+ def char_end(n)
38
+ if string.respond_to? :codepoints
39
+ self.end(n)
40
+ else
41
+ string[0, self.end(n)].char_length
42
+ end
43
+ end
44
+ end
45
+
46
+ module Incollege
47
+ # A module for including Tweet parsing in a class. This module provides function for the extraction and processing
48
+ # of usernames, lists, URLs and hashtags.
49
+ module Extractor extend self
50
+ # Remove overlapping entities.
51
+ # This returns a new array with no overlapping entities.
52
+ def remove_overlapping_entities(entities)
53
+ # sort by start index
54
+ entities = entities.sort_by{|entity| entity[:indices].first}
55
+
56
+ # remove duplicates
57
+ prev = nil
58
+ entities.reject!{|entity| (prev && prev[:indices].last > entity[:indices].first) || (prev = entity) && false}
59
+ entities
60
+ end
61
+
62
+ # Extracts all usernames, lists, hashtags and URLs in the Tweet <tt>text</tt>
63
+    # along with the indices for where the entity occurred
64
+ # If the <tt>text</tt> is <tt>nil</tt> or contains no entity an empty array
65
+ # will be returned.
66
+ #
67
+ # If a block is given then it will be called for each entity.
68
+ def extract_entities_with_indices(text, options = {}, &block)
69
+ # extract all entities
70
+ entities = extract_urls_with_indices(text, options) +
71
+ extract_hashtags_with_indices(text, :check_url_overlap => false) +
72
+ extract_mentions_or_lists_with_indices(text) +
73
+ extract_cashtags_with_indices(text)
74
+
75
+ return [] if entities.empty?
76
+
77
+ entities = remove_overlapping_entities(entities)
78
+
79
+ entities.each(&block) if block_given?
80
+ entities
81
+ end
82
+
83
+ # Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>. If the
84
+ # <tt>text</tt> is <tt>nil</tt> or contains no username mentions an empty array
85
+ # will be returned.
86
+ #
87
+ # If a block is given then it will be called for each username.
88
+ def extract_mentioned_screen_names(text, &block) # :yields: username
89
+ screen_names = extract_mentioned_screen_names_with_indices(text).map{|m| m[:screen_name]}
90
+ screen_names.each(&block) if block_given?
91
+ screen_names
92
+ end
93
+
94
+ # Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>
95
+    # along with the indices for where the mention occurred. If the
96
+ # <tt>text</tt> is nil or contains no username mentions, an empty array
97
+ # will be returned.
98
+ #
99
+ # If a block is given, then it will be called with each username, the start
100
+ # index, and the end index in the <tt>text</tt>.
101
+ def extract_mentioned_screen_names_with_indices(text) # :yields: username, start, end
102
+ return [] unless text
103
+
104
+ possible_screen_names = []
105
+ extract_mentions_or_lists_with_indices(text) do |screen_name, list_slug, start_position, end_position|
106
+ next unless list_slug.empty?
107
+ possible_screen_names << {
108
+ :screen_name => screen_name,
109
+ :indices => [start_position, end_position]
110
+ }
111
+ end
112
+
113
+ if block_given?
114
+ possible_screen_names.each do |mention|
115
+ yield mention[:screen_name], mention[:indices].first, mention[:indices].last
116
+ end
117
+ end
118
+
119
+ possible_screen_names
120
+ end
121
+
122
+ # Extracts a list of all usernames or lists mentioned in the Tweet <tt>text</tt>
123
+    # along with the indices for where the mention occurred. If the
124
+ # <tt>text</tt> is nil or contains no username or list mentions, an empty array
125
+ # will be returned.
126
+ #
127
+ # If a block is given, then it will be called with each username, list slug, the start
128
+    # index, and the end index in the <tt>text</tt>. The list_slug will be an empty string
129
+ # if this is a username mention.
130
+ def extract_mentions_or_lists_with_indices(text) # :yields: username, list_slug, start, end
131
+ return [] unless text =~ /[@@]/
132
+
133
+ possible_entries = []
134
+ text.to_s.scan(Incollege::Regex[:valid_mention_or_list]) do |before, at, screen_name, list_slug|
135
+ match_data = $~
136
+ after = $'
137
+ unless after =~ Incollege::Regex[:end_mention_match]
138
+ start_position = match_data.char_begin(3) - 1
139
+ end_position = match_data.char_end(list_slug.nil? ? 3 : 4)
140
+ possible_entries << {
141
+ :screen_name => screen_name,
142
+ :list_slug => list_slug || "",
143
+ :indices => [start_position, end_position]
144
+ }
145
+ end
146
+ end
147
+
148
+ if block_given?
149
+ possible_entries.each do |mention|
150
+ yield mention[:screen_name], mention[:list_slug], mention[:indices].first, mention[:indices].last
151
+ end
152
+ end
153
+
154
+ possible_entries
155
+ end
156
+
157
+    # Extracts the username replied to in the Tweet <tt>text</tt>. If the
158
+ # <tt>text</tt> is <tt>nil</tt> or is not a reply nil will be returned.
159
+ #
160
+ # If a block is given then it will be called with the username replied to (if any)
161
+ def extract_reply_screen_name(text) # :yields: username
162
+ return nil unless text
163
+
164
+ possible_screen_name = text.match(Incollege::Regex[:valid_reply])
165
+ return unless possible_screen_name.respond_to?(:captures)
166
+ return if $' =~ Incollege::Regex[:end_mention_match]
167
+ screen_name = possible_screen_name.captures.first
168
+ yield screen_name if block_given?
169
+ screen_name
170
+ end
171
+
172
+ # Extracts a list of all URLs included in the Tweet <tt>text</tt>. If the
173
+ # <tt>text</tt> is <tt>nil</tt> or contains no URLs an empty array
174
+ # will be returned.
175
+ #
176
+ # If a block is given then it will be called for each URL.
177
+ def extract_urls(text, &block) # :yields: url
178
+ urls = extract_urls_with_indices(text).map{|u| u[:url]}
179
+ urls.each(&block) if block_given?
180
+ urls
181
+ end
182
+
183
+ # Extracts a list of all URLs included in the Tweet <tt>text</tt> along
184
+ # with the indices. If the <tt>text</tt> is <tt>nil</tt> or contains no
185
+ # URLs an empty array will be returned.
186
+ #
187
+ # If a block is given then it will be called for each URL.
188
+ def extract_urls_with_indices(text, options = {:extract_url_without_protocol => true}) # :yields: url, start, end
189
+ return [] unless text && (options[:extract_url_without_protocol] ? text.index(".") : text.index(":"))
190
+ urls = []
191
+
192
+ text.to_s.scan(Incollege::Regex[:valid_url]) do |all, before, url, protocol, domain, port, path, query|
193
+ valid_url_match_data = $~
194
+
195
+ start_position = valid_url_match_data.char_begin(3)
196
+ end_position = valid_url_match_data.char_end(3)
197
+
198
+ # If protocol is missing and domain contains non-ASCII characters,
199
+ # extract ASCII-only domains.
200
+ if !protocol
201
+ next if !options[:extract_url_without_protocol] || before =~ Incollege::Regex[:invalid_url_without_protocol_preceding_chars]
202
+ last_url = nil
203
+ domain.scan(Incollege::Regex[:valid_ascii_domain]) do |ascii_domain|
204
+ last_url = {
205
+ :url => ascii_domain,
206
+ :indices => [start_position + $~.char_begin(0),
207
+ start_position + $~.char_end(0)]
208
+ }
209
+ if path ||
210
+ ascii_domain =~ Incollege::Regex[:valid_special_short_domain] ||
211
+ ascii_domain !~ Incollege::Regex[:invalid_short_domain]
212
+ urls << last_url
213
+ end
214
+ end
215
+
216
+ # no ASCII-only domain found. Skip the entire URL
217
+ next unless last_url
218
+
219
+ # last_url only contains domain. Need to add path and query if they exist.
220
+ if path
221
+ # last_url was not added. Add it to urls here.
222
+ last_url[:url] = url.sub(domain, last_url[:url])
223
+ last_url[:indices][1] = end_position
224
+ end
225
+ else
226
+ # In the case of t.co URLs, don't allow additional path characters
227
+ if url =~ Incollege::Regex[:valid_tco_url]
228
+ url = $&
229
+ end_position = start_position + url.char_length
230
+ end
231
+ urls << {
232
+ :url => url,
233
+ :indices => [start_position, end_position]
234
+ }
235
+ end
236
+ end
237
+ urls.each{|url| yield url[:url], url[:indices].first, url[:indices].last} if block_given?
238
+ urls
239
+ end
240
+
241
+ # Extracts a list of all hashtags included in the Tweet <tt>text</tt>. If the
242
+ # <tt>text</tt> is <tt>nil</tt> or contains no hashtags an empty array
243
+ # will be returned. The array returned will not include the leading <tt>#</tt>
244
+ # character.
245
+ #
246
+ # If a block is given then it will be called for each hashtag.
247
+ def extract_hashtags(text, &block) # :yields: hashtag_text
248
+ hashtags = extract_hashtags_with_indices(text).map{|h| h[:hashtag]}
249
+ hashtags.each(&block) if block_given?
250
+ hashtags
251
+ end
252
+
253
+ # Extracts a list of all hashtags included in the Tweet <tt>text</tt>. If the
254
+ # <tt>text</tt> is <tt>nil</tt> or contains no hashtags an empty array
255
+ # will be returned. The array returned will not include the leading <tt>#</tt>
256
+ # character.
257
+ #
258
+ # If a block is given then it will be called for each hashtag.
259
+ def extract_hashtags_with_indices(text, options = {:check_url_overlap => true}) # :yields: hashtag_text, start, end
260
+ return [] unless text =~ /[##]/
261
+
262
+ tags = []
263
+ text.scan(Incollege::Regex[:valid_hashtag]) do |before, hash, hash_text|
264
+ match_data = $~
265
+ start_position = match_data.char_begin(2)
266
+ end_position = match_data.char_end(3)
267
+ after = $'
268
+ unless after =~ Incollege::Regex[:end_hashtag_match]
269
+ tags << {
270
+ :hashtag => hash_text,
271
+ :indices => [start_position, end_position]
272
+ }
273
+ end
274
+ end
275
+
276
+ if options[:check_url_overlap]
277
+ # extract URLs
278
+ urls = extract_urls_with_indices(text)
279
+ unless urls.empty?
280
+ tags.concat(urls)
281
+ # remove duplicates
282
+ tags = remove_overlapping_entities(tags)
283
+ # remove URL entities
284
+ tags.reject!{|entity| !entity[:hashtag] }
285
+ end
286
+ end
287
+
288
+ tags.each{|tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last} if block_given?
289
+ tags
290
+ end
291
+
292
+ # Extracts a list of all cashtags included in the Tweet <tt>text</tt>. If the
293
+ # <tt>text</tt> is <tt>nil</tt> or contains no cashtags an empty array
294
+ # will be returned. The array returned will not include the leading <tt>$</tt>
295
+ # character.
296
+ #
297
+ # If a block is given then it will be called for each cashtag.
298
+ def extract_cashtags(text, &block) # :yields: cashtag_text
299
+ cashtags = extract_cashtags_with_indices(text).map{|h| h[:cashtag]}
300
+ cashtags.each(&block) if block_given?
301
+ cashtags
302
+ end
303
+
304
+ # Extracts a list of all cashtags included in the Tweet <tt>text</tt>. If the
305
+ # <tt>text</tt> is <tt>nil</tt> or contains no cashtags an empty array
306
+ # will be returned. The array returned will not include the leading <tt>$</tt>
307
+ # character.
308
+ #
309
+ # If a block is given then it will be called for each cashtag.
310
+ def extract_cashtags_with_indices(text) # :yields: cashtag_text, start, end
311
+ return [] unless text =~ /\$/
312
+
313
+ tags = []
314
+ text.scan(Incollege::Regex[:valid_cashtag]) do |before, dollar, cash_text|
315
+ match_data = $~
316
+ start_position = match_data.char_begin(2)
317
+ end_position = match_data.char_end(3)
318
+ tags << {
319
+ :cashtag => cash_text,
320
+ :indices => [start_position, end_position]
321
+ }
322
+ end
323
+
324
+ tags.each{|tag| yield tag[:cashtag], tag[:indices].first, tag[:indices].last} if block_given?
325
+ tags
326
+ end
327
+ end
328
+ end
@@ -0,0 +1,21 @@
1
+ module Incollege
2
+ module HashHelper
3
+ # Return a new hash with all keys converted to symbols, as long as
4
+ # they respond to +to_sym+.
5
+ #
6
+ # { 'name' => 'Rob', 'years' => '28' }.symbolize_keys
7
+ # #=> { :name => "Rob", :years => "28" }
8
+ def self.symbolize_keys(hash)
9
+ symbolize_keys!(hash.dup)
10
+ end
11
+
12
+ # Destructively convert all keys to symbols, as long as they respond
13
+ # to +to_sym+. Same as +symbolize_keys+, but modifies +self+.
14
+ def self.symbolize_keys!(hash)
15
+ hash.keys.each do |key|
16
+ hash[(key.to_sym rescue key) || key] = hash.delete(key)
17
+ end
18
+ hash
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,86 @@
1
+ module Incollege
2
+ # Module for doing "hit highlighting" on tweets that have been auto-linked already.
3
+ # Useful with the results returned from the Search API.
4
+ module HitHighlighter extend self
5
+ # Default Tag used for hit highlighting
6
+ DEFAULT_HIGHLIGHT_TAG = "em"
7
+
8
+ # Add <tt><em></em></tt> tags around the <tt>hits</tt> provided in the <tt>text</tt>. The
9
+ # <tt>hits</tt> should be an array of (start, end) index pairs, relative to the original
10
+ # text, before auto-linking (but the <tt>text</tt> may already be auto-linked if desired)
11
+ #
12
+ # The <tt><em></em></tt> tags can be overridden using the <tt>:tag</tt> option. For example:
13
+ #
14
+ # irb> hit_highlight("test hit here", [[5, 8]], :tag => 'strong')
15
+ # => "test <strong>hit</strong> here"
16
+ def hit_highlight(text, hits = [], options = {})
17
+ if hits.empty?
18
+ return text
19
+ end
20
+
21
+ tag_name = options[:tag] || DEFAULT_HIGHLIGHT_TAG
22
+ tags = ["<" + tag_name + ">", "</" + tag_name + ">"]
23
+
24
+ chunks = text.split(/[<>]/)
25
+
26
+ result = []
27
+ chunk_index, chunk = 0, chunks[0]
28
+ chunk_chars = chunk.to_s.to_char_a
29
+ prev_chunks_len = 0
30
+ chunk_cursor = 0
31
+ start_in_chunk = false
32
+ for hit, index in hits.flatten.each_with_index do
33
+ tag = tags[index % 2]
34
+
35
+ placed = false
36
+ until chunk.nil? || hit < prev_chunks_len + chunk.length do
37
+ result << chunk_chars[chunk_cursor..-1]
38
+ if start_in_chunk && hit == prev_chunks_len + chunk_chars.length
39
+ result << tag
40
+ placed = true
41
+ end
42
+
43
+ # correctly handle highlights that end on the final character.
44
+ if tag_text = chunks[chunk_index+1]
45
+ result << "<#{tag_text}>"
46
+ end
47
+
48
+ prev_chunks_len += chunk_chars.length
49
+ chunk_cursor = 0
50
+ chunk_index += 2
51
+ chunk = chunks[chunk_index]
52
+ chunk_chars = chunk.to_s.to_char_a
53
+ start_in_chunk = false
54
+ end
55
+
56
+ if !placed && !chunk.nil?
57
+ hit_spot = hit - prev_chunks_len
58
+ result << chunk_chars[chunk_cursor...hit_spot] << tag
59
+ chunk_cursor = hit_spot
60
+ if index % 2 == 0
61
+ start_in_chunk = true
62
+ else
63
+ start_in_chunk = false
64
+ end
65
+ placed = true
66
+ end
67
+
68
+ # ultimate fallback, hits that run off the end get a closing tag
69
+ if !placed
70
+ result << tag
71
+ end
72
+ end
73
+
74
+ if chunk
75
+ if chunk_cursor < chunk_chars.length
76
+ result << chunk_chars[chunk_cursor..-1]
77
+ end
78
+ (chunk_index+1).upto(chunks.length-1).each do |i|
79
+ result << (i.even? ? chunks[i] : "<#{chunks[i]}>")
80
+ end
81
+ end
82
+
83
+ result.flatten.join
84
+ end
85
+ end
86
+ end
@@ -0,0 +1,333 @@
1
+ # encoding: UTF-8
2
+
3
+ module Incollege
4
+ # A collection of regular expressions for parsing Tweet text. The regular expression
5
+ # list is frozen at load time to ensure immutability. These regular expressions are
6
+ # used throughout the <tt>Incollege</tt> classes. Special care has been taken to make
7
+   # sure these regular expressions work with Tweets in all languages.
8
+ class Regex
9
+ require 'yaml'
10
+
11
+ REGEXEN = {} # :nodoc:
12
+
13
+ def self.regex_range(from, to = nil) # :nodoc:
14
+ if $RUBY_1_9
15
+ if to
16
+ "\\u{#{from.to_s(16).rjust(4, '0')}}-\\u{#{to.to_s(16).rjust(4, '0')}}"
17
+ else
18
+ "\\u{#{from.to_s(16).rjust(4, '0')}}"
19
+ end
20
+ else
21
+ if to
22
+ [from].pack('U') + '-' + [to].pack('U')
23
+ else
24
+ [from].pack('U')
25
+ end
26
+ end
27
+ end
28
+
29
+ TLDS = YAML.load_file(
30
+ File.join(
31
+ File.expand_path('../../..', __FILE__), # project root
32
+ 'lib', 'assets', 'tld_lib.yml'
33
+ )
34
+ )
35
+
36
+ # Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand
37
+   # to access both the list of characters and a pattern suitable for use with String#split
38
+ # Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE
39
+ UNICODE_SPACES = [
40
+ (0x0009..0x000D).to_a, # White_Space # Cc [5] <control-0009>..<control-000D>
41
+ 0x0020, # White_Space # Zs SPACE
42
+ 0x0085, # White_Space # Cc <control-0085>
43
+ 0x00A0, # White_Space # Zs NO-BREAK SPACE
44
+ 0x1680, # White_Space # Zs OGHAM SPACE MARK
45
+ 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
46
+ (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE
47
+ 0x2028, # White_Space # Zl LINE SEPARATOR
48
+ 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
49
+ 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
50
+ 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
51
+ 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
52
+ ].flatten.map{|c| [c].pack('U*')}.freeze
53
+ REGEXEN[:spaces] = /[#{UNICODE_SPACES.join('')}]/o
54
+
55
+ # Character not allowed in Tweets
56
+ INVALID_CHARACTERS = [
57
+ 0xFFFE, 0xFEFF, # BOM
58
+ 0xFFFF, # Special
59
+ 0x202A, 0x202B, 0x202C, 0x202D, 0x202E # Directional change
60
+ ].map{|cp| [cp].pack('U') }.freeze
61
+ REGEXEN[:invalid_control_characters] = /[#{INVALID_CHARACTERS.join('')}]/o
62
+
63
+ major, minor, _patch = RUBY_VERSION.split('.')
64
+ if major.to_i >= 2 || major.to_i == 1 && minor.to_i >= 9 || (defined?(RUBY_ENGINE) && ["jruby", "rbx"].include?(RUBY_ENGINE))
65
+ REGEXEN[:list_name] = /[a-zA-Z][a-zA-Z0-9_\-\u0080-\u00ff]{0,24}/
66
+ else
67
+ # This line barfs at compile time in Ruby 1.9, JRuby, or Rubinius.
68
+ REGEXEN[:list_name] = eval("/[a-zA-Z][a-zA-Z0-9_\\-\x80-\xff]{0,24}/")
69
+ end
70
+
71
+ # Latin accented characters
72
+ # Excludes 0xd7 from the range (the multiplication sign, confusable with "x").
73
+ # Also excludes 0xf7, the division sign
74
+ LATIN_ACCENTS = [
75
+ regex_range(0xc0, 0xd6),
76
+ regex_range(0xd8, 0xf6),
77
+ regex_range(0xf8, 0xff),
78
+ regex_range(0x0100, 0x024f),
79
+ regex_range(0x0253, 0x0254),
80
+ regex_range(0x0256, 0x0257),
81
+ regex_range(0x0259),
82
+ regex_range(0x025b),
83
+ regex_range(0x0263),
84
+ regex_range(0x0268),
85
+ regex_range(0x026f),
86
+ regex_range(0x0272),
87
+ regex_range(0x0289),
88
+ regex_range(0x028b),
89
+ regex_range(0x02bb),
90
+ regex_range(0x0300, 0x036f),
91
+ regex_range(0x1e00, 0x1eff)
92
+ ].join('').freeze
93
+ REGEXEN[:latin_accents] = /[#{LATIN_ACCENTS}]+/o
94
+
95
+ RTL_CHARACTERS = [
96
+ regex_range(0x0600,0x06FF),
97
+ regex_range(0x0750,0x077F),
98
+ regex_range(0x0590,0x05FF),
99
+ regex_range(0xFE70,0xFEFF)
100
+ ].join('').freeze
101
+
102
+ PUNCTUATION_CHARS = '!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~'
103
+ SPACE_CHARS = " \t\n\x0B\f\r"
104
+ CTRL_CHARS = "\x00-\x1F\x7F"
105
+
106
+ # A hashtag must contain at least one unicode letter or mark, as well as numbers, underscores, and select special characters.
107
+ HASHTAG_ALPHA = /[\p{L}\p{M}]/
108
+ HASHTAG_ALPHANUMERIC = /[\p{L}\p{M}\p{Nd}_\u200c\u200d\u0482\ua673\ua67e\u05be\u05f3\u05f4\u309b\u309c\u30a0\u30fb\u3003\u0f0b\u0f0c\u00b7]/
109
+ HASHTAG_BOUNDARY = /\A|\z|[^&\p{L}\p{M}\p{Nd}_\u200c\u200d\u0482\ua673\ua67e\u05be\u05f3\u05f4\u309b\u309c\u30a0\u30fb\u3003\u0f0b\u0f0c\u00b7]/
110
+
111
+ HASHTAG = /(#{HASHTAG_BOUNDARY})(#|#)(#{HASHTAG_ALPHANUMERIC}*#{HASHTAG_ALPHA}#{HASHTAG_ALPHANUMERIC}*)/io
112
+
113
+ REGEXEN[:valid_hashtag] = /#{HASHTAG}/io
114
+ # Used in Extractor for final filtering
115
+ REGEXEN[:end_hashtag_match] = /\A(?:[##]|:\/\/)/o
116
+
117
+ REGEXEN[:valid_mention_preceding_chars] = /(?:[^a-zA-Z0-9_!#\$%&*@@]|^|(?:^|[^a-zA-Z0-9_+~.-])[rR][tT]:?)/o
118
+ REGEXEN[:at_signs] = /[@@]/
119
+ REGEXEN[:valid_mention_or_list] = /
120
+ (#{REGEXEN[:valid_mention_preceding_chars]}) # $1: Preceeding character
121
+ (#{REGEXEN[:at_signs]}) # $2: At mark
122
+ ([a-zA-Z0-9_]{1,20}) # $3: Screen name
123
+ (\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})? # $4: List (optional)
124
+ /ox
125
+ REGEXEN[:valid_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
126
+ # Used in Extractor for final filtering
127
+ REGEXEN[:end_mention_match] = /\A(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o
128
+
129
+ # URL related hash regex collection
130
+ REGEXEN[:valid_url_preceding_chars] = /(?:[^A-Z0-9@@$###{INVALID_CHARACTERS.join('')}]|^)/io
131
+ REGEXEN[:invalid_url_without_protocol_preceding_chars] = /[-_.\/]$/
132
+ DOMAIN_VALID_CHARS = "[^#{PUNCTUATION_CHARS}#{SPACE_CHARS}#{CTRL_CHARS}#{INVALID_CHARACTERS.join('')}#{UNICODE_SPACES.join('')}]"
133
+ REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
134
+ REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
135
+
136
+ REGEXEN[:valid_gTLD] = %r{
137
+ (?:
138
+ (?:#{TLDS['generic'].join('|')})
139
+ (?=[^0-9a-z@]|$)
140
+ )
141
+ }ix
142
+
143
+ REGEXEN[:valid_ccTLD] = %r{
144
+ (?:
145
+ (?:#{TLDS['country'].join('|')})
146
+ (?=[^0-9a-z@]|$)
147
+ )
148
+ }ix
149
+ REGEXEN[:valid_punycode] = /(?:xn--[0-9a-z]+)/i
150
+
151
+ REGEXEN[:valid_special_cctld] = %r{
152
+ (?:
153
+ (?:co|tv)
154
+ (?=[^0-9a-z@]|$)
155
+ )
156
+ }ix
157
+
158
+ REGEXEN[:valid_domain] = /(?:
159
+ #{REGEXEN[:valid_subdomain]}*#{REGEXEN[:valid_domain_name]}
160
+ (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
161
+ )/iox
162
+
163
+ # This is used in Extractor
164
+ REGEXEN[:valid_ascii_domain] = /
165
+ (?:(?:[A-Za-z0-9\-_]|#{REGEXEN[:latin_accents]})+\.)+
166
+ (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
167
+ /iox
168
+
169
+ # This is used in Extractor for stricter t.co URL extraction
170
+ REGEXEN[:valid_tco_url] = /^https?:\/\/t\.co\/[a-z0-9]+/i
171
+
172
+ # This is used in Extractor to filter out unwanted URLs.
173
+ REGEXEN[:invalid_short_domain] = /\A#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_ccTLD]}\Z/io
174
+ REGEXEN[:valid_special_short_domain] = /\A#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_special_cctld]}\Z/io
175
+
176
+ REGEXEN[:valid_port_number] = /[0-9]+/
177
+
178
+ REGEXEN[:valid_general_url_path_chars] = /[a-z0-9!\*';:=\+\,\.\$\/%#\[\]\-_~&|@#{LATIN_ACCENTS}]/io
179
+ # Allow URL paths to contain up to two nested levels of balanced parens
180
+ # 1. Used in Wikipedia URLs like /Primer_(film)
181
+ # 2. Used in IIS sessions like /S(dfd346)/
182
+ # 3. Used in Rdio URLs like /track/We_Up_(Album_Version_(Edited))/
183
+ REGEXEN[:valid_url_balanced_parens] = /
184
+ \(
185
+ (?:
186
+ #{REGEXEN[:valid_general_url_path_chars]}+
187
+ |
188
+ # allow one nested level of balanced parentheses
189
+ (?:
190
+ #{REGEXEN[:valid_general_url_path_chars]}*
191
+ \(
192
+ #{REGEXEN[:valid_general_url_path_chars]}+
193
+ \)
194
+ #{REGEXEN[:valid_general_url_path_chars]}*
195
+ )
196
+ )
197
+ \)
198
+ /iox
199
+     # Valid end-of-path characters (so /foo. does not gobble the period).
200
+ # 1. Allow =&# for empty URL parameters and other URL-join artifacts
201
+ REGEXEN[:valid_url_path_ending_chars] = /[a-z0-9=_#\/\+\-#{LATIN_ACCENTS}]|(?:#{REGEXEN[:valid_url_balanced_parens]})/io
202
+ REGEXEN[:valid_url_path] = /(?:
203
+ (?:
204
+ #{REGEXEN[:valid_general_url_path_chars]}*
205
+ (?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)*
206
+ #{REGEXEN[:valid_url_path_ending_chars]}
207
+ )|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/)
208
+ )/iox
209
+
210
+ REGEXEN[:valid_url_query_chars] = /[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]/i
211
+ REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/\-]/i
212
+ REGEXEN[:valid_url] = %r{
213
+ ( # $1 total match
214
+ (#{REGEXEN[:valid_url_preceding_chars]}) # $2 Preceeding chracter
215
+ ( # $3 URL
216
+ (https?:\/\/)? # $4 Protocol (optional)
217
+ (#{REGEXEN[:valid_domain]}) # $5 Domain(s)
218
+ (?::(#{REGEXEN[:valid_port_number]}))? # $6 Port number (optional)
219
+ (/#{REGEXEN[:valid_url_path]}*)? # $7 URL Path and anchor
220
+ (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # $8 Query String
221
+ )
222
+ )
223
+ }iox
224
+
225
+ REGEXEN[:cashtag] = /[a-z]{1,6}(?:[._][a-z]{1,2})?/i
226
+ REGEXEN[:valid_cashtag] = /(^|#{REGEXEN[:spaces]})(\$)(#{REGEXEN[:cashtag]})(?=$|\s|[#{PUNCTUATION_CHARS}])/i
227
+
228
+ # These URL validation pattern strings are based on the ABNF from RFC 3986
229
+ REGEXEN[:validate_url_unreserved] = /[a-z0-9\-._~]/i
230
+ REGEXEN[:validate_url_pct_encoded] = /(?:%[0-9a-f]{2})/i
231
+ REGEXEN[:validate_url_sub_delims] = /[!$&'()*+,;=]/i
232
+ REGEXEN[:validate_url_pchar] = /(?:
233
+ #{REGEXEN[:validate_url_unreserved]}|
234
+ #{REGEXEN[:validate_url_pct_encoded]}|
235
+ #{REGEXEN[:validate_url_sub_delims]}|
236
+ [:\|@]
237
+ )/iox
238
+
239
+ REGEXEN[:validate_url_scheme] = /(?:[a-z][a-z0-9+\-.]*)/i
240
+ REGEXEN[:validate_url_userinfo] = /(?:
241
+ #{REGEXEN[:validate_url_unreserved]}|
242
+ #{REGEXEN[:validate_url_pct_encoded]}|
243
+ #{REGEXEN[:validate_url_sub_delims]}|
244
+ :
245
+ )*/iox
246
+
247
+ REGEXEN[:validate_url_dec_octet] = /(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))/i
248
+ REGEXEN[:validate_url_ipv4] =
249
+ /(?:#{REGEXEN[:validate_url_dec_octet]}(?:\.#{REGEXEN[:validate_url_dec_octet]}){3})/iox
250
+
251
+ # Punting on real IPv6 validation for now
252
+ REGEXEN[:validate_url_ipv6] = /(?:\[[a-f0-9:\.]+\])/i
253
+
254
+ # Also punting on IPvFuture for now
255
+ REGEXEN[:validate_url_ip] = /(?:
256
+ #{REGEXEN[:validate_url_ipv4]}|
257
+ #{REGEXEN[:validate_url_ipv6]}
258
+ )/iox
259
+
260
+ # This is more strict than the rfc specifies
261
+ REGEXEN[:validate_url_subdomain_segment] = /(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)/i
262
+ REGEXEN[:validate_url_domain_segment] = /(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)/i
263
+ REGEXEN[:validate_url_domain_tld] = /(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)/i
264
+ REGEXEN[:validate_url_domain] = /(?:(?:#{REGEXEN[:validate_url_subdomain_segment]}\.)*
265
+ (?:#{REGEXEN[:validate_url_domain_segment]}\.)
266
+ #{REGEXEN[:validate_url_domain_tld]})/iox
267
+
268
+ REGEXEN[:validate_url_host] = /(?:
269
+ #{REGEXEN[:validate_url_ip]}|
270
+ #{REGEXEN[:validate_url_domain]}
271
+ )/iox
272
+
273
+ # Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences
274
+ REGEXEN[:validate_url_unicode_subdomain_segment] =
275
+ /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9_\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
276
+ REGEXEN[:validate_url_unicode_domain_segment] =
277
+ /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
278
+ REGEXEN[:validate_url_unicode_domain_tld] =
279
+ /(?:(?:[a-z]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
280
+ REGEXEN[:validate_url_unicode_domain] = /(?:(?:#{REGEXEN[:validate_url_unicode_subdomain_segment]}\.)*
281
+ (?:#{REGEXEN[:validate_url_unicode_domain_segment]}\.)
282
+ #{REGEXEN[:validate_url_unicode_domain_tld]})/iox
283
+
284
+ REGEXEN[:validate_url_unicode_host] = /(?:
285
+ #{REGEXEN[:validate_url_ip]}|
286
+ #{REGEXEN[:validate_url_unicode_domain]}
287
+ )/iox
288
+
289
+ REGEXEN[:validate_url_port] = /[0-9]{1,5}/
290
+
291
+ REGEXEN[:validate_url_unicode_authority] = %r{
292
+ (?:(#{REGEXEN[:validate_url_userinfo]})@)? # $1 userinfo
293
+ (#{REGEXEN[:validate_url_unicode_host]}) # $2 host
294
+ (?::(#{REGEXEN[:validate_url_port]}))? # $3 port
295
+ }iox
296
+
297
+ REGEXEN[:validate_url_authority] = %r{
298
+ (?:(#{REGEXEN[:validate_url_userinfo]})@)? # $1 userinfo
299
+ (#{REGEXEN[:validate_url_host]}) # $2 host
300
+ (?::(#{REGEXEN[:validate_url_port]}))? # $3 port
301
+ }iox
302
+
303
+ REGEXEN[:validate_url_path] = %r{(/#{REGEXEN[:validate_url_pchar]}*)*}i
304
+ REGEXEN[:validate_url_query] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
305
+ REGEXEN[:validate_url_fragment] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
306
+
307
+ # Modified version of RFC 3986 Appendix B
308
+ REGEXEN[:validate_url_unencoded] = %r{
309
+ \A # Full URL
310
+ (?:
311
+ ([^:/?#]+):// # $1 Scheme
312
+ )?
313
+ ([^/?#]*) # $2 Authority
314
+ ([^?#]*) # $3 Path
315
+ (?:
316
+ \?([^#]*) # $4 Query
317
+ )?
318
+ (?:
319
+ \#(.*) # $5 Fragment
320
+ )?\Z
321
+ }ix
322
+
323
+ REGEXEN[:rtl_chars] = /[#{RTL_CHARACTERS}]/io
324
+
325
+ REGEXEN.each_pair{|k,v| v.freeze }
326
+
327
+ # Return the regular expression for a given <tt>key</tt>. If the <tt>key</tt>
328
+ # is not a known symbol a <tt>nil</tt> will be returned.
329
+ def self.[](key)
330
+ REGEXEN[key]
331
+ end
332
+ end
333
+ end