twitter-text-simpleidn 3.0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,27 @@
1
+ # Copyright 2018 Twitter, Inc.
2
+ # Licensed under the Apache License, Version 2.0
3
+ # http://www.apache.org/licenses/LICENSE-2.0
4
+
5
module Twitter
  module TwitterText
    # Module-level helpers for converting Hash keys to symbols. Nothing here
    # patches Hash: the hash to work on is always passed in as an argument.
    module HashHelper
      # Return a new hash with all keys converted to symbols, as long as
      # they respond to +to_sym+. The +hash+ argument is left untouched
      # (the conversion runs on a shallow +dup+).
      #
      #   HashHelper.symbolize_keys({ 'name' => 'Rob', 'years' => '28' })
      #   #=> { :name => "Rob", :years => "28" }
      def self.symbolize_keys(hash)
        symbolize_keys!(hash.dup)
      end

      # Destructively convert all keys of +hash+ to symbols, as long as they
      # respond to +to_sym+. Same as +symbolize_keys+, but modifies the
      # +hash+ that was passed in and returns that same object.
      def self.symbolize_keys!(hash)
        # Walk a snapshot of the keys, because the hash is mutated in the loop.
        # Every entry is deleted and re-inserted, so relative order is kept.
        hash.keys.each do |key|
          hash[symbolized_key(key)] = hash.delete(key)
        end
        hash
      end

      # Symbol form of +key+, or +key+ itself when it cannot be converted:
      # it does not respond to +to_sym+, +to_sym+ returns nil/false, or
      # +to_sym+ raises a StandardError.
      def self.symbolized_key(key)
        return key unless key.respond_to?(:to_sym)

        key.to_sym || key
      rescue StandardError
        # Best effort, as before: a key whose +to_sym+ raises is kept as-is.
        key
      end
      private_class_method :symbolized_key
    end
  end
end
@@ -0,0 +1,92 @@
1
+ # Copyright 2018 Twitter, Inc.
2
+ # Licensed under the Apache License, Version 2.0
3
+ # http://www.apache.org/licenses/LICENSE-2.0
4
+
5
module Twitter
  module TwitterText
    # Module for doing "hit highlighting" on tweets that have been auto-linked already.
    # Useful with the results returned from the Search API.
    module HitHighlighter extend self
      # Default Tag used for hit highlighting
      DEFAULT_HIGHLIGHT_TAG = "em"

      # Add <tt><em></em></tt> tags around the <tt>hits</tt> provided in the <tt>text</tt>. The
      # <tt>hits</tt> should be an array of (start, end) index pairs, relative to the original
      # text, before auto-linking (but the <tt>text</tt> may already be auto-linked if desired).
      #
      # The <tt><em></em></tt> tags can be overridden using the <tt>:tag</tt> option
      # (a String or a Symbol). For example:
      #
      #   irb> hit_highlight("test hit here", [[5, 8]], :tag => 'strong')
      #   => "test <strong>hit</strong> here"
      #
      # Returns <tt>text</tt> unchanged when <tt>hits</tt> is nil or empty, otherwise a
      # new String. Hits that run past the end of the text still get a closing tag.
      def hit_highlight(text, hits = [], options = {})
        return text if hits.nil? || hits.empty?

        tag_name = options[:tag] || DEFAULT_HIGHLIGHT_TAG
        # Interpolation (rather than String#+) also accepts a Symbol tag name.
        tags = ["<#{tag_name}>", "</#{tag_name}>"]

        # Split on angle brackets: even-indexed chunks are treated as visible text,
        # odd-indexed chunks as the inside of a tag and re-wrapped in "<...>" below.
        chunks = text.split(/[<>]/)

        result = []
        chunk_index = 0
        chunk = chunks[0]
        chunk_chars = chunk.to_s.to_codepoint_a
        prev_chunks_len = 0 # visible characters in the text chunks already emitted
        chunk_cursor = 0    # first character of the current chunk not yet emitted
        start_in_chunk = false

        # Flattened hits alternate start, end, start, end...: even index => opening tag.
        hits.flatten.each_with_index do |hit, index|
          tag = tags[index % 2]
          placed = false

          # Advance through text chunks until the hit offset falls inside the current
          # one. All length math uses chunk_chars so offsets are counted in the same
          # units as the slices taken from it.
          until chunk.nil? || hit < prev_chunks_len + chunk_chars.length
            result << chunk_chars[chunk_cursor..-1]

            # Correctly handle highlights that end on the final character of a chunk:
            # close the highlight before the following markup is emitted.
            if start_in_chunk && hit == prev_chunks_len + chunk_chars.length
              result << tag
              placed = true
            end

            # Re-emit the tag that sat between this text chunk and the next one.
            tag_text = chunks[chunk_index + 1]
            result << "<#{tag_text}>" if tag_text

            prev_chunks_len += chunk_chars.length
            chunk_cursor = 0
            chunk_index += 2
            chunk = chunks[chunk_index]
            chunk_chars = chunk.to_s.to_codepoint_a
            start_in_chunk = false
          end

          if !placed && !chunk.nil?
            hit_spot = hit - prev_chunks_len
            result << chunk_chars[chunk_cursor...hit_spot] << tag
            chunk_cursor = hit_spot
            # Remember whether a highlight was opened (and not yet closed) in this chunk.
            start_in_chunk = index.even?
            placed = true
          end

          # Ultimate fallback: hits that run off the end still get their tag.
          result << tag unless placed
        end

        if chunk
          # Flush the rest of the current chunk and everything after it.
          result << chunk_chars[chunk_cursor..-1] if chunk_cursor < chunk_chars.length
          (chunk_index + 1).upto(chunks.length - 1) do |i|
            result << (i.even? ? chunks[i] : "<#{chunks[i]}>")
          end
        end

        result.flatten.join
      end
    end
  end
end
@@ -0,0 +1,381 @@
1
+ # Copyright 2018 Twitter, Inc.
2
+ # Licensed under the Apache License, Version 2.0
3
+ # http://www.apache.org/licenses/LICENSE-2.0
4
+
5
+ # encoding: utf-8
6
+
7
+ module Twitter
8
+ module TwitterText
9
+ # A collection of regular expressions for parsing Tweet text. The regular expression
10
+ # list is frozen at load time to ensure immutability. These regular expressions are
11
+ # used throughout the <tt>TwitterText</tt> classes. Special care has been taken to make
12
+ # sure these regular expressions work with Tweets in all languages.
13
+ class Regex
14
+ require 'yaml'
15
+
16
+ REGEXEN = {} # :nodoc:
17
+
18
+ def self.regex_range(from, to = nil) # :nodoc:
19
+ if $RUBY_1_9 # build a textual escape; this global is not assigned in this file - presumably set elsewhere in the gem, TODO confirm
20
+ if to # both endpoints given: produce a "from-to" range
21
+ "\\u{#{from.to_s(16).rjust(4, '0')}}-\\u{#{to.to_s(16).rjust(4, '0')}}"
22
+ else # single code point, hex zero-padded to at least four digits
23
+ "\\u{#{from.to_s(16).rjust(4, '0')}}"
24
+ end
25
+ else # otherwise embed the code point(s) as literal UTF-8 characters
26
+ if to # both endpoints given: literal characters joined by '-'
27
+ [from].pack('U') + '-' + [to].pack('U')
28
+ else # single literal character
29
+ [from].pack('U')
30
+ end
31
+ end
32
+ end
33
+
34
+ TLDS = YAML.load_file(
35
+ File.join(
36
+ File.expand_path('../../..', __FILE__), # project root
37
+ 'lib', 'assets', 'tld_lib.yml'
38
+ )
39
+ )
40
+
41
+ # Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand
42
+ # to access both the list of characters and a pattern suitable for use with String#split
43
+ # Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE
44
+ UNICODE_SPACES = [
45
+ (0x0009..0x000D).to_a, # White_Space # Cc [5] <control-0009>..<control-000D>
46
+ 0x0020, # White_Space # Zs SPACE
47
+ 0x0085, # White_Space # Cc <control-0085>
48
+ 0x00A0, # White_Space # Zs NO-BREAK SPACE
49
+ 0x1680, # White_Space # Zs OGHAM SPACE MARK
50
+ 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
51
+ (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE
52
+ 0x2028, # White_Space # Zl LINE SEPARATOR
53
+ 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
54
+ 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
55
+ 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
56
+ 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
57
+ ].flatten.map{|c| [c].pack('U*')}.freeze
58
+ REGEXEN[:spaces] = /[#{UNICODE_SPACES.join('')}]/o
59
+
60
+ DIRECTIONAL_CHARACTERS = [
61
+ 0x061C, # ARABIC LETTER MARK (ALM)
62
+ 0x200E, # LEFT-TO-RIGHT MARK (LRM)
63
+ 0x200F, # RIGHT-TO-LEFT MARK (RLM)
64
+ 0x202A, # LEFT-TO-RIGHT EMBEDDING (LRE)
65
+ 0x202B, # RIGHT-TO-LEFT EMBEDDING (RLE)
66
+ 0x202C, # POP DIRECTIONAL FORMATTING (PDF)
67
+ 0x202D, # LEFT-TO-RIGHT OVERRIDE (LRO)
68
+ 0x202E, # RIGHT-TO-LEFT OVERRIDE (RLO)
69
+ 0x2066, # LEFT-TO-RIGHT ISOLATE (LRI)
70
+ 0x2067, # RIGHT-TO-LEFT ISOLATE (RLI)
71
+ 0x2068, # FIRST STRONG ISOLATE (FSI)
72
+ 0x2069, # POP DIRECTIONAL ISOLATE (PDI)
73
+ ].map{|cp| [cp].pack('U')}.freeze
74
+ REGEXEN[:directional_characters] = /[#{DIRECTIONAL_CHARACTERS.join('')}]/o
75
+
76
+ # Character not allowed in Tweets
77
+ INVALID_CHARACTERS = [
78
+ 0xFFFE, 0xFEFF, # BOM
79
+ 0xFFFF, # Special
80
+ ].map{|cp| [cp].pack('U') }.freeze
81
+ REGEXEN[:invalid_control_characters] = /[#{INVALID_CHARACTERS.join('')}]/o
82
+
83
+ major, minor, _patch = RUBY_VERSION.split('.')
84
+ if major.to_i >= 2 || major.to_i == 1 && minor.to_i >= 9 || (defined?(RUBY_ENGINE) && ["jruby", "rbx"].include?(RUBY_ENGINE))
85
+ REGEXEN[:list_name] = /[a-z][a-z0-9_\-\u0080-\u00ff]{0,24}/i
86
+ else
87
+ # This line barfs at compile time in Ruby 1.9, JRuby, or Rubinius.
88
+ REGEXEN[:list_name] = eval("/[a-z][a-z0-9_\\-\x80-\xff]{0,24}/i")
89
+ end
90
+
91
+ # Latin accented characters
92
+ # Excludes 0xd7 from the range (the multiplication sign, confusable with "x").
93
+ # Also excludes 0xf7, the division sign
94
+ LATIN_ACCENTS = [
95
+ regex_range(0xc0, 0xd6),
96
+ regex_range(0xd8, 0xf6),
97
+ regex_range(0xf8, 0xff),
98
+ regex_range(0x0100, 0x024f),
99
+ regex_range(0x0253, 0x0254),
100
+ regex_range(0x0256, 0x0257),
101
+ regex_range(0x0259),
102
+ regex_range(0x025b),
103
+ regex_range(0x0263),
104
+ regex_range(0x0268),
105
+ regex_range(0x026f),
106
+ regex_range(0x0272),
107
+ regex_range(0x0289),
108
+ regex_range(0x028b),
109
+ regex_range(0x02bb),
110
+ regex_range(0x0300, 0x036f),
111
+ regex_range(0x1e00, 0x1eff)
112
+ ].join('').freeze
113
+ REGEXEN[:latin_accents] = /[#{LATIN_ACCENTS}]+/o
114
+
115
+ RTL_CHARACTERS = [
116
+ regex_range(0x0600,0x06FF),
117
+ regex_range(0x0750,0x077F),
118
+ regex_range(0x0590,0x05FF),
119
+ regex_range(0xFE70,0xFEFF)
120
+ ].join('').freeze
121
+
122
+ PUNCTUATION_CHARS = '!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~'
123
+ SPACE_CHARS = " \t\n\x0B\f\r"
124
+ CTRL_CHARS = "\x00-\x1F\x7F"
125
+
126
+ # Generated from unicode_regex/unicode_regex_groups.scala, more inclusive than Ruby's \p{L}\p{M}
127
+ HASHTAG_LETTERS_AND_MARKS = "\\p{L}\\p{M}" +
128
+ "\u037f\u0528-\u052f\u08a0-\u08b2\u08e4-\u08ff\u0978\u0980\u0c00\u0c34\u0c81\u0d01\u0ede\u0edf" +
129
+ "\u10c7\u10cd\u10fd-\u10ff\u16f1-\u16f8\u17b4\u17b5\u191d\u191e\u1ab0-\u1abe\u1bab-\u1bad\u1bba-" +
130
+ "\u1bbf\u1cf3-\u1cf6\u1cf8\u1cf9\u1de7-\u1df5\u2cf2\u2cf3\u2d27\u2d2d\u2d66\u2d67\u9fcc\ua674-" +
131
+ "\ua67b\ua698-\ua69d\ua69f\ua792-\ua79f\ua7aa-\ua7ad\ua7b0\ua7b1\ua7f7-\ua7f9\ua9e0-\ua9ef\ua9fa-" +
132
+ "\ua9fe\uaa7c-\uaa7f\uaae0-\uaaef\uaaf2-\uaaf6\uab30-\uab5a\uab5c-\uab5f\uab64\uab65\uf870-\uf87f" +
133
+ "\uf882\uf884-\uf89f\uf8b8\uf8c1-\uf8d6\ufa2e\ufa2f\ufe27-\ufe2d\u{102e0}\u{1031f}\u{10350}-\u{1037a}" +
134
+ "\u{10500}-\u{10527}\u{10530}-\u{10563}\u{10600}-\u{10736}\u{10740}-\u{10755}\u{10760}-\u{10767}" +
135
+ "\u{10860}-\u{10876}\u{10880}-\u{1089e}\u{10980}-\u{109b7}\u{109be}\u{109bf}\u{10a80}-\u{10a9c}" +
136
+ "\u{10ac0}-\u{10ac7}\u{10ac9}-\u{10ae6}\u{10b80}-\u{10b91}\u{1107f}\u{110d0}-\u{110e8}\u{11100}-" +
137
+ "\u{11134}\u{11150}-\u{11173}\u{11176}\u{11180}-\u{111c4}\u{111da}\u{11200}-\u{11211}\u{11213}-" +
138
+ "\u{11237}\u{112b0}-\u{112ea}\u{11301}-\u{11303}\u{11305}-\u{1130c}\u{1130f}\u{11310}\u{11313}-" +
139
+ "\u{11328}\u{1132a}-\u{11330}\u{11332}\u{11333}\u{11335}-\u{11339}\u{1133c}-\u{11344}\u{11347}" +
140
+ "\u{11348}\u{1134b}-\u{1134d}\u{11357}\u{1135d}-\u{11363}\u{11366}-\u{1136c}\u{11370}-\u{11374}" +
141
+ "\u{11480}-\u{114c5}\u{114c7}\u{11580}-\u{115b5}\u{115b8}-\u{115c0}\u{11600}-\u{11640}\u{11644}" +
142
+ "\u{11680}-\u{116b7}\u{118a0}-\u{118df}\u{118ff}\u{11ac0}-\u{11af8}\u{1236f}-\u{12398}\u{16a40}-" +
143
+ "\u{16a5e}\u{16ad0}-\u{16aed}\u{16af0}-\u{16af4}\u{16b00}-\u{16b36}\u{16b40}-\u{16b43}\u{16b63}-" +
144
+ "\u{16b77}\u{16b7d}-\u{16b8f}\u{16f00}-\u{16f44}\u{16f50}-\u{16f7e}\u{16f8f}-\u{16f9f}\u{1bc00}-" +
145
+ "\u{1bc6a}\u{1bc70}-\u{1bc7c}\u{1bc80}-\u{1bc88}\u{1bc90}-\u{1bc99}\u{1bc9d}\u{1bc9e}\u{1e800}-" +
146
+ "\u{1e8c4}\u{1e8d0}-\u{1e8d6}\u{1ee00}-\u{1ee03}\u{1ee05}-\u{1ee1f}\u{1ee21}\u{1ee22}\u{1ee24}" +
147
+ "\u{1ee27}\u{1ee29}-\u{1ee32}\u{1ee34}-\u{1ee37}\u{1ee39}\u{1ee3b}\u{1ee42}\u{1ee47}\u{1ee49}" +
148
+ "\u{1ee4b}\u{1ee4d}-\u{1ee4f}\u{1ee51}\u{1ee52}\u{1ee54}\u{1ee57}\u{1ee59}\u{1ee5b}\u{1ee5d}\u{1ee5f}" +
149
+ "\u{1ee61}\u{1ee62}\u{1ee64}\u{1ee67}-\u{1ee6a}\u{1ee6c}-\u{1ee72}\u{1ee74}-\u{1ee77}\u{1ee79}-" +
150
+ "\u{1ee7c}\u{1ee7e}\u{1ee80}-\u{1ee89}\u{1ee8b}-\u{1ee9b}\u{1eea1}-\u{1eea3}\u{1eea5}-\u{1eea9}" +
151
+ "\u{1eeab}-\u{1eebb}"
152
+
153
+ # Generated from unicode_regex/unicode_regex_groups.scala, more inclusive than Ruby's \p{Nd}
154
+ HASHTAG_NUMERALS = "\\p{Nd}" +
155
+ "\u0de6-\u0def\ua9f0-\ua9f9\u{110f0}-\u{110f9}\u{11136}-\u{1113f}\u{111d0}-\u{111d9}\u{112f0}-" +
156
+ "\u{112f9}\u{114d0}-\u{114d9}\u{11650}-\u{11659}\u{116c0}-\u{116c9}\u{118e0}-\u{118e9}\u{16a60}-" +
157
+ "\u{16a69}\u{16b50}-\u{16b59}"
158
+
159
+ HASHTAG_SPECIAL_CHARS = "_\u200c\u200d\ua67e\u05be\u05f3\u05f4\uff5e\u301c\u309b\u309c\u30a0\u30fb\u3003\u0f0b\u0f0c\u00b7"
160
+
161
+ HASHTAG_LETTERS_NUMERALS = "#{HASHTAG_LETTERS_AND_MARKS}#{HASHTAG_NUMERALS}#{HASHTAG_SPECIAL_CHARS}"
162
+ HASHTAG_LETTERS_NUMERALS_SET = "[#{HASHTAG_LETTERS_NUMERALS}]"
163
+ HASHTAG_LETTERS_SET = "[#{HASHTAG_LETTERS_AND_MARKS}]"
164
+
165
# Hashtag pattern: $1 preceding context, $2 the hash mark, $3 the tag text (which
# must contain at least one letter/mark, not only numerals or special characters).
#
# NOTE(review): as listed, the hash/at alternatives in this group were exact
# duplicates ("(#|#)", "[##]", "[@@]"), which is redundant. They look like the
# full-width forms U+FF03 / U+FF20 flattened to ASCII; they are written as \u
# escapes here so they survive re-encoding. Confirm against the released gem.
HASHTAG = /(\A|\ufe0e|\ufe0f|[^&#{HASHTAG_LETTERS_NUMERALS}])(#|\uff03)(?!\ufe0f|\u20e3)(#{HASHTAG_LETTERS_NUMERALS_SET}*#{HASHTAG_LETTERS_SET}#{HASHTAG_LETTERS_NUMERALS_SET}*)/io

REGEXEN[:valid_hashtag] = /#{HASHTAG}/io
# Used in Extractor for final filtering
REGEXEN[:end_hashtag_match] = /\A(?:[#\uff03]|:\/\/)/o

# Context allowed before a mention: anything other than the listed characters,
# a line start, or an "RT" / "RT:" marker.
REGEXEN[:valid_mention_preceding_chars] = /(?:[^a-z0-9_!#\$%&*@\uff20]|^|(?:^|[^a-z0-9_+~.-])[rR][tT]:?)/io
# ASCII and full-width at signs.
REGEXEN[:at_signs] = /[@\uff20]/
173
+ REGEXEN[:valid_mention_or_list] = /
174
+ (#{REGEXEN[:valid_mention_preceding_chars]}) # $1: Preceeding character
175
+ (#{REGEXEN[:at_signs]}) # $2: At mark
176
+ ([a-z0-9_]{1,20}) # $3: Screen name
177
+ (\/[a-z][a-zA-Z0-9_\-]{0,24})? # $4: List (optional)
178
+ /iox
179
# Leading reply mention: optional Unicode whitespace / directional marks, an at
# sign, then the screen name ($1). The arrays are joined explicitly: interpolating
# an Array directly inserts its #inspect text on Ruby 1.9+, which put the quote,
# comma and space characters of that text into the character class as well.
REGEXEN[:valid_reply] = /^(?:[#{UNICODE_SPACES.join('')}#{DIRECTIONAL_CHARACTERS.join('')}])*#{REGEXEN[:at_signs]}([a-z0-9_]{1,20})/io
180
+ # Used in Extractor for final filtering
181
+ REGEXEN[:end_mention_match] = /\A(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/io
182
+
183
+ # URL related hash regex collection
184
# Context allowed before a URL: a directional mark, a line start, or any character
# other than ASCII letters/digits, "@", "$", "#" and the invalid characters.
# NOTE(review): the listing showed "@@" and "##" here; the duplicates look like
# full-width U+FF20 / U+FF03 flattened to ASCII, restored below as \u escapes.
# Confirm against the released gem.
REGEXEN[:valid_url_preceding_chars] = /(?:[^A-Z0-9@\uff20$#\uff03#{INVALID_CHARACTERS.join('')}]|[#{DIRECTIONAL_CHARACTERS.join('')}]|^)/io
185
+ REGEXEN[:invalid_url_without_protocol_preceding_chars] = /[-_.\/]$/
186
+
187
+ DOMAIN_VALID_CHARS = "[^#{DIRECTIONAL_CHARACTERS.join('')}#{PUNCTUATION_CHARS}#{SPACE_CHARS}#{CTRL_CHARS}#{INVALID_CHARACTERS.join('')}#{UNICODE_SPACES.join('')}]"
188
+ # "[a-z0-9#{LATIN_ACCENTS}]"
189
+ REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
190
+ REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
191
+
192
+ REGEXEN[:valid_gTLD] = %r{
193
+ (?:
194
+ (?:#{TLDS['generic'].join('|')})
195
+ (?=[^0-9a-z@]|$)
196
+ )
197
+ }ix
198
+
199
+ REGEXEN[:valid_ccTLD] = %r{
200
+ (?:
201
+ (?:#{TLDS['country'].join('|')})
202
+ (?=[^0-9a-z@]|$)
203
+ )
204
+ }ix
205
+ REGEXEN[:valid_punycode] = /(?:xn--[0-9a-z]+)/i
206
+
207
+ REGEXEN[:valid_domain] = /(?:
208
+ #{REGEXEN[:valid_subdomain]}*#{REGEXEN[:valid_domain_name]}
209
+ (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
210
+ )/iox
211
+
212
+ # This is used in Extractor
213
+ REGEXEN[:valid_ascii_domain] = /
214
+ (?:(?:[a-z0-9\-_]|#{REGEXEN[:latin_accents]})+\.)+
215
+ (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
216
+ /iox
217
+
218
+ # This is used in Extractor for stricter t.co URL extraction
219
+ REGEXEN[:valid_tco_url] = /^https?:\/\/t\.co\/([a-z0-9]+)/i
220
+
221
+ REGEXEN[:valid_port_number] = /[0-9]+/
222
+
223
+ REGEXEN[:valid_general_url_path_chars] = /[a-z\p{Cyrillic}0-9!\*';:=\+\,\.\$\/%#\[\]\p{Pd}_~&\|@#{LATIN_ACCENTS}]/io
224
+ # Allow URL paths to contain up to two nested levels of balanced parens
225
+ # 1. Used in Wikipedia URLs like /Primer_(film)
226
+ # 2. Used in IIS sessions like /S(dfd346)/
227
+ # 3. Used in Rdio URLs like /track/We_Up_(Album_Version_(Edited))/
228
+ REGEXEN[:valid_url_balanced_parens] = /
229
+ \(
230
+ (?:
231
+ #{REGEXEN[:valid_general_url_path_chars]}+
232
+ |
233
+ # allow one nested level of balanced parentheses
234
+ (?:
235
+ #{REGEXEN[:valid_general_url_path_chars]}*
236
+ \(
237
+ #{REGEXEN[:valid_general_url_path_chars]}+
238
+ \)
239
+ #{REGEXEN[:valid_general_url_path_chars]}*
240
+ )
241
+ )
242
+ \)
243
+ /iox
244
+ # Valid end-of-path characters (so /foo. does not gobble the period).
245
+ # 1. Allow =&# for empty URL parameters and other URL-join artifacts
246
+ REGEXEN[:valid_url_path_ending_chars] = /[a-z\p{Cyrillic}0-9=_#\/\+\-#{LATIN_ACCENTS}]|(?:#{REGEXEN[:valid_url_balanced_parens]})/io
247
+ REGEXEN[:valid_url_path] = /(?:
248
+ (?:
249
+ #{REGEXEN[:valid_general_url_path_chars]}*
250
+ (?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)*
251
+ #{REGEXEN[:valid_url_path_ending_chars]}
252
+ )|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/)
253
+ )/iox
254
+
255
+ REGEXEN[:valid_url_query_chars] = /[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]/i
256
+ REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/\-]/i
257
+ REGEXEN[:valid_url] = %r{
258
+ ( # $1 total match
259
+ (#{REGEXEN[:valid_url_preceding_chars]}) # $2 Preceeding chracter
260
+ ( # $3 URL
261
+ (https?:\/\/)? # $4 Protocol (optional)
262
+ (#{REGEXEN[:valid_domain]}) # $5 Domain(s)
263
+ (?::(#{REGEXEN[:valid_port_number]}))? # $6 Port number (optional)
264
+ (/#{REGEXEN[:valid_url_path]}*)? # $7 URL Path and anchor
265
+ (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # $8 Query String
266
+ )
267
+ )
268
+ }iox
269
+
270
+ REGEXEN[:cashtag] = /[a-z]{1,6}(?:[._][a-z]{1,2})?/i
271
# Cashtag: $1 preceding context (line start, Unicode whitespace or a directional
# mark), $2 the dollar sign, $3 the symbol; must be followed by end of line,
# whitespace or punctuation. The arrays are joined explicitly: interpolating an
# Array directly inserts its #inspect text on Ruby 1.9+, which also let quote and
# comma characters count as valid preceding context.
REGEXEN[:valid_cashtag] = /(^|[#{UNICODE_SPACES.join('')}#{DIRECTIONAL_CHARACTERS.join('')}])(\$)(#{REGEXEN[:cashtag]})(?=$|\s|[#{PUNCTUATION_CHARS}])/i
272
+
273
+ # These URL validation pattern strings are based on the ABNF from RFC 3986
274
+ REGEXEN[:validate_url_unreserved] = /[a-z\p{Cyrillic}0-9\p{Pd}._~]/i
275
+ REGEXEN[:validate_url_pct_encoded] = /(?:%[0-9a-f]{2})/i
276
+ REGEXEN[:validate_url_sub_delims] = /[!$&'()*+,;=]/i
277
+ REGEXEN[:validate_url_pchar] = /(?:
278
+ #{REGEXEN[:validate_url_unreserved]}|
279
+ #{REGEXEN[:validate_url_pct_encoded]}|
280
+ #{REGEXEN[:validate_url_sub_delims]}|
281
+ [:\|@]
282
+ )/iox
283
+
284
+ REGEXEN[:validate_url_scheme] = /(?:[a-z][a-z0-9+\-.]*)/i
285
+ REGEXEN[:validate_url_userinfo] = /(?:
286
+ #{REGEXEN[:validate_url_unreserved]}|
287
+ #{REGEXEN[:validate_url_pct_encoded]}|
288
+ #{REGEXEN[:validate_url_sub_delims]}|
289
+ :
290
+ )*/iox
291
+
292
+ REGEXEN[:validate_url_dec_octet] = /(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))/i
293
+ REGEXEN[:validate_url_ipv4] =
294
+ /(?:#{REGEXEN[:validate_url_dec_octet]}(?:\.#{REGEXEN[:validate_url_dec_octet]}){3})/iox
295
+
296
+ # Punting on real IPv6 validation for now
297
+ REGEXEN[:validate_url_ipv6] = /(?:\[[a-f0-9:\.]+\])/i
298
+
299
+ # Also punting on IPvFuture for now
300
+ REGEXEN[:validate_url_ip] = /(?:
301
+ #{REGEXEN[:validate_url_ipv4]}|
302
+ #{REGEXEN[:validate_url_ipv6]}
303
+ )/iox
304
+
305
+ # This is more strict than the rfc specifies
306
+ REGEXEN[:validate_url_subdomain_segment] = /(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)/i
307
+ REGEXEN[:validate_url_domain_segment] = /(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)/i
308
+ REGEXEN[:validate_url_domain_tld] = /(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)/i
309
+ REGEXEN[:validate_url_domain] = /(?:(?:#{REGEXEN[:validate_url_subdomain_segment]}\.)*
310
+ (?:#{REGEXEN[:validate_url_domain_segment]}\.)
311
+ #{REGEXEN[:validate_url_domain_tld]})/iox
312
+
313
+ REGEXEN[:validate_url_host] = /(?:
314
+ #{REGEXEN[:validate_url_ip]}|
315
+ #{REGEXEN[:validate_url_domain]}
316
+ )/iox
317
+
318
+ # Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences
319
+ REGEXEN[:validate_url_unicode_subdomain_segment] =
320
+ /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9_\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
321
+ REGEXEN[:validate_url_unicode_domain_segment] =
322
+ /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
323
+ REGEXEN[:validate_url_unicode_domain_tld] =
324
+ /(?:(?:[a-z]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
325
+ REGEXEN[:validate_url_unicode_domain] = /(?:(?:#{REGEXEN[:validate_url_unicode_subdomain_segment]}\.)*
326
+ (?:#{REGEXEN[:validate_url_unicode_domain_segment]}\.)
327
+ #{REGEXEN[:validate_url_unicode_domain_tld]})/iox
328
+
329
+ REGEXEN[:validate_url_unicode_host] = /(?:
330
+ #{REGEXEN[:validate_url_ip]}|
331
+ #{REGEXEN[:validate_url_unicode_domain]}
332
+ )/iox
333
+
334
+ REGEXEN[:validate_url_port] = /[0-9]{1,5}/
335
+
336
+ REGEXEN[:validate_url_unicode_authority] = %r{
337
+ (?:(#{REGEXEN[:validate_url_userinfo]})@)? # $1 userinfo
338
+ (#{REGEXEN[:validate_url_unicode_host]}) # $2 host
339
+ (?::(#{REGEXEN[:validate_url_port]}))? # $3 port
340
+ }iox
341
+
342
+ REGEXEN[:validate_url_authority] = %r{
343
+ (?:(#{REGEXEN[:validate_url_userinfo]})@)? # $1 userinfo
344
+ (#{REGEXEN[:validate_url_host]}) # $2 host
345
+ (?::(#{REGEXEN[:validate_url_port]}))? # $3 port
346
+ }iox
347
+
348
+ REGEXEN[:validate_url_path] = %r{(/#{REGEXEN[:validate_url_pchar]}*)*}i
349
+ REGEXEN[:validate_url_query] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
350
+ REGEXEN[:validate_url_fragment] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
351
+
352
+ REGEXEN[:valid_emoji] = Twitter::TwitterText::Regex::Emoji[:valid_emoji]
353
+
354
+ # Modified version of RFC 3986 Appendix B
355
+ REGEXEN[:validate_url_unencoded] = %r{
356
+ \A # Full URL
357
+ (?:
358
+ ([^:/?#]+):// # $1 Scheme
359
+ )?
360
+ ([^/?#]*) # $2 Authority
361
+ ([^?#]*) # $3 Path
362
+ (?:
363
+ \?([^#]*) # $4 Query
364
+ )?
365
+ (?:
366
+ \#(.*) # $5 Fragment
367
+ )?\Z
368
+ }ix
369
+
370
+ REGEXEN[:rtl_chars] = /[#{RTL_CHARACTERS}]/io
371
+
372
+ REGEXEN.each_pair{|k,v| v.freeze }
373
+
374
+ # Return the regular expression stored in REGEXEN under <tt>key</tt>.
375
+ # If <tt>key</tt> is not present, <tt>nil</tt> is returned (plain Hash lookup).
376
+ def self.[](key)
377
+ REGEXEN[key]
378
+ end
379
+ end
380
+ end
381
+ end