twitter-text-kow 1.3.1.0

@@ -0,0 +1,381 @@
+ # Copyright 2018 Twitter, Inc.
+ # Licensed under the Apache License, Version 2.0
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # encoding: utf-8
+
+ module Twitter
+ module TwitterText
+ # A collection of regular expressions for parsing Tweet text. The regular expression
+ # list is frozen at load time to ensure immutability. These regular expressions are
+ # used throughout the <tt>TwitterText</tt> classes. Special care has been taken to make
+ # sure these regular expressions work with Tweets in all languages.
+ class Regex
+ require 'yaml'
+
+ REGEXEN = {} # :nodoc:
+
+ def self.regex_range(from, to = nil) # :nodoc:
+ if $RUBY_1_9
+ if to
+ "\\u{#{from.to_s(16).rjust(4, '0')}}-\\u{#{to.to_s(16).rjust(4, '0')}}"
+ else
+ "\\u{#{from.to_s(16).rjust(4, '0')}}"
+ end
+ else
+ if to
+ [from].pack('U') + '-' + [to].pack('U')
+ else
+ [from].pack('U')
+ end
+ end
+ end
+
+ TLDS = YAML.load_file(
+ File.join(
+ File.expand_path('../../..', __FILE__), # project root
+ 'lib', 'assets', 'tld_lib.yml'
+ )
+ )
+
+ # Space is more than %20; U+3000, for example, is the full-width space used with Kanji. Provides a shorthand
+ # to access both the list of characters and a pattern suitable for use with String#split
+ # Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE
+ UNICODE_SPACES = [
+ (0x0009..0x000D).to_a, # White_Space # Cc [5] <control-0009>..<control-000D>
+ 0x0020, # White_Space # Zs SPACE
+ 0x0085, # White_Space # Cc <control-0085>
+ 0x00A0, # White_Space # Zs NO-BREAK SPACE
+ 0x1680, # White_Space # Zs OGHAM SPACE MARK
+ 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
+ (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE
+ 0x2028, # White_Space # Zl LINE SEPARATOR
+ 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
+ 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
+ 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
+ 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
+ ].flatten.map{|c| [c].pack('U*')}.freeze
+ REGEXEN[:spaces] = /[#{UNICODE_SPACES.join('')}]/o
+
+ DIRECTIONAL_CHARACTERS = [
+ 0x061C, # ARABIC LETTER MARK (ALM)
+ 0x200E, # LEFT-TO-RIGHT MARK (LRM)
+ 0x200F, # RIGHT-TO-LEFT MARK (RLM)
+ 0x202A, # LEFT-TO-RIGHT EMBEDDING (LRE)
+ 0x202B, # RIGHT-TO-LEFT EMBEDDING (RLE)
+ 0x202C, # POP DIRECTIONAL FORMATTING (PDF)
+ 0x202D, # LEFT-TO-RIGHT OVERRIDE (LRO)
+ 0x202E, # RIGHT-TO-LEFT OVERRIDE (RLO)
+ 0x2066, # LEFT-TO-RIGHT ISOLATE (LRI)
+ 0x2067, # RIGHT-TO-LEFT ISOLATE (RLI)
+ 0x2068, # FIRST STRONG ISOLATE (FSI)
+ 0x2069, # POP DIRECTIONAL ISOLATE (PDI)
+ ].map{|cp| [cp].pack('U')}.freeze
+ REGEXEN[:directional_characters] = /[#{DIRECTIONAL_CHARACTERS.join('')}]/o
+
+ # Characters not allowed in Tweets
+ INVALID_CHARACTERS = [
+ 0xFFFE, 0xFEFF, # BOM
+ 0xFFFF, # Special
+ ].map{|cp| [cp].pack('U') }.freeze
+ REGEXEN[:invalid_control_characters] = /[#{INVALID_CHARACTERS.join('')}]/o
+
+ major, minor, _patch = RUBY_VERSION.split('.')
+ if major.to_i >= 2 || major.to_i == 1 && minor.to_i >= 9 || (defined?(RUBY_ENGINE) && ["jruby", "rbx"].include?(RUBY_ENGINE))
+ REGEXEN[:list_name] = /[a-z][a-z0-9_\-\u0080-\u00ff]{0,24}/i
+ else
+ # This line barfs at compile time in Ruby 1.9, JRuby, or Rubinius.
+ REGEXEN[:list_name] = eval("/[a-z][a-z0-9_\\-\x80-\xff]{0,24}/i")
+ end
+
+ # Latin accented characters
+ # Excludes 0xd7 from the range (the multiplication sign, confusable with "x").
+ # Also excludes 0xf7, the division sign
+ LATIN_ACCENTS = [
+ regex_range(0xc0, 0xd6),
+ regex_range(0xd8, 0xf6),
+ regex_range(0xf8, 0xff),
+ regex_range(0x0100, 0x024f),
+ regex_range(0x0253, 0x0254),
+ regex_range(0x0256, 0x0257),
+ regex_range(0x0259),
+ regex_range(0x025b),
+ regex_range(0x0263),
+ regex_range(0x0268),
+ regex_range(0x026f),
+ regex_range(0x0272),
+ regex_range(0x0289),
+ regex_range(0x028b),
+ regex_range(0x02bb),
+ regex_range(0x0300, 0x036f),
+ regex_range(0x1e00, 0x1eff)
+ ].join('').freeze
+ REGEXEN[:latin_accents] = /[#{LATIN_ACCENTS}]+/o
+
+ RTL_CHARACTERS = [
+ regex_range(0x0600,0x06FF),
+ regex_range(0x0750,0x077F),
+ regex_range(0x0590,0x05FF),
+ regex_range(0xFE70,0xFEFF)
+ ].join('').freeze
+
+ PUNCTUATION_CHARS = '!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~'
+ SPACE_CHARS = " \t\n\x0B\f\r"
+ CTRL_CHARS = "\x00-\x1F\x7F"
+
+ # Generated from unicode_regex/unicode_regex_groups.scala, more inclusive than Ruby's \p{L}\p{M}
+ HASHTAG_LETTERS_AND_MARKS = "\\p{L}\\p{M}" +
+ "\u037f\u0528-\u052f\u08a0-\u08b2\u08e4-\u08ff\u0978\u0980\u0c00\u0c34\u0c81\u0d01\u0ede\u0edf" +
+ "\u10c7\u10cd\u10fd-\u10ff\u16f1-\u16f8\u17b4\u17b5\u191d\u191e\u1ab0-\u1abe\u1bab-\u1bad\u1bba-" +
+ "\u1bbf\u1cf3-\u1cf6\u1cf8\u1cf9\u1de7-\u1df5\u2cf2\u2cf3\u2d27\u2d2d\u2d66\u2d67\u9fcc\ua674-" +
+ "\ua67b\ua698-\ua69d\ua69f\ua792-\ua79f\ua7aa-\ua7ad\ua7b0\ua7b1\ua7f7-\ua7f9\ua9e0-\ua9ef\ua9fa-" +
+ "\ua9fe\uaa7c-\uaa7f\uaae0-\uaaef\uaaf2-\uaaf6\uab30-\uab5a\uab5c-\uab5f\uab64\uab65\uf870-\uf87f" +
+ "\uf882\uf884-\uf89f\uf8b8\uf8c1-\uf8d6\ufa2e\ufa2f\ufe27-\ufe2d\u{102e0}\u{1031f}\u{10350}-\u{1037a}" +
+ "\u{10500}-\u{10527}\u{10530}-\u{10563}\u{10600}-\u{10736}\u{10740}-\u{10755}\u{10760}-\u{10767}" +
+ "\u{10860}-\u{10876}\u{10880}-\u{1089e}\u{10980}-\u{109b7}\u{109be}\u{109bf}\u{10a80}-\u{10a9c}" +
+ "\u{10ac0}-\u{10ac7}\u{10ac9}-\u{10ae6}\u{10b80}-\u{10b91}\u{1107f}\u{110d0}-\u{110e8}\u{11100}-" +
+ "\u{11134}\u{11150}-\u{11173}\u{11176}\u{11180}-\u{111c4}\u{111da}\u{11200}-\u{11211}\u{11213}-" +
+ "\u{11237}\u{112b0}-\u{112ea}\u{11301}-\u{11303}\u{11305}-\u{1130c}\u{1130f}\u{11310}\u{11313}-" +
+ "\u{11328}\u{1132a}-\u{11330}\u{11332}\u{11333}\u{11335}-\u{11339}\u{1133c}-\u{11344}\u{11347}" +
+ "\u{11348}\u{1134b}-\u{1134d}\u{11357}\u{1135d}-\u{11363}\u{11366}-\u{1136c}\u{11370}-\u{11374}" +
+ "\u{11480}-\u{114c5}\u{114c7}\u{11580}-\u{115b5}\u{115b8}-\u{115c0}\u{11600}-\u{11640}\u{11644}" +
+ "\u{11680}-\u{116b7}\u{118a0}-\u{118df}\u{118ff}\u{11ac0}-\u{11af8}\u{1236f}-\u{12398}\u{16a40}-" +
+ "\u{16a5e}\u{16ad0}-\u{16aed}\u{16af0}-\u{16af4}\u{16b00}-\u{16b36}\u{16b40}-\u{16b43}\u{16b63}-" +
+ "\u{16b77}\u{16b7d}-\u{16b8f}\u{16f00}-\u{16f44}\u{16f50}-\u{16f7e}\u{16f8f}-\u{16f9f}\u{1bc00}-" +
+ "\u{1bc6a}\u{1bc70}-\u{1bc7c}\u{1bc80}-\u{1bc88}\u{1bc90}-\u{1bc99}\u{1bc9d}\u{1bc9e}\u{1e800}-" +
+ "\u{1e8c4}\u{1e8d0}-\u{1e8d6}\u{1ee00}-\u{1ee03}\u{1ee05}-\u{1ee1f}\u{1ee21}\u{1ee22}\u{1ee24}" +
+ "\u{1ee27}\u{1ee29}-\u{1ee32}\u{1ee34}-\u{1ee37}\u{1ee39}\u{1ee3b}\u{1ee42}\u{1ee47}\u{1ee49}" +
+ "\u{1ee4b}\u{1ee4d}-\u{1ee4f}\u{1ee51}\u{1ee52}\u{1ee54}\u{1ee57}\u{1ee59}\u{1ee5b}\u{1ee5d}\u{1ee5f}" +
+ "\u{1ee61}\u{1ee62}\u{1ee64}\u{1ee67}-\u{1ee6a}\u{1ee6c}-\u{1ee72}\u{1ee74}-\u{1ee77}\u{1ee79}-" +
+ "\u{1ee7c}\u{1ee7e}\u{1ee80}-\u{1ee89}\u{1ee8b}-\u{1ee9b}\u{1eea1}-\u{1eea3}\u{1eea5}-\u{1eea9}" +
+ "\u{1eeab}-\u{1eebb}"
+
+ # Generated from unicode_regex/unicode_regex_groups.scala, more inclusive than Ruby's \p{Nd}
+ HASHTAG_NUMERALS = "\\p{Nd}" +
+ "\u0de6-\u0def\ua9f0-\ua9f9\u{110f0}-\u{110f9}\u{11136}-\u{1113f}\u{111d0}-\u{111d9}\u{112f0}-" +
+ "\u{112f9}\u{114d0}-\u{114d9}\u{11650}-\u{11659}\u{116c0}-\u{116c9}\u{118e0}-\u{118e9}\u{16a60}-" +
+ "\u{16a69}\u{16b50}-\u{16b59}"
+
+ HASHTAG_SPECIAL_CHARS = "_\u200c\u200d\ua67e\u05be\u05f3\u05f4\uff5e\u301c\u309b\u309c\u30a0\u30fb\u3003\u0f0b\u0f0c\u00b7"
+
+ HASHTAG_LETTERS_NUMERALS = "#{HASHTAG_LETTERS_AND_MARKS}#{HASHTAG_NUMERALS}#{HASHTAG_SPECIAL_CHARS}"
+ HASHTAG_LETTERS_NUMERALS_SET = "[#{HASHTAG_LETTERS_NUMERALS}]"
+ HASHTAG_LETTERS_SET = "[#{HASHTAG_LETTERS_AND_MARKS}]"
+
+ HASHTAG = /(\A|\ufe0e|\ufe0f|[^&#{HASHTAG_LETTERS_NUMERALS}])(#|＃)(?!\ufe0f|\u20e3)(#{HASHTAG_LETTERS_NUMERALS_SET}*#{HASHTAG_LETTERS_SET}#{HASHTAG_LETTERS_NUMERALS_SET}*)/io
+
+ REGEXEN[:valid_hashtag] = /#{HASHTAG}/io
+ # Used in Extractor for final filtering
+ REGEXEN[:end_hashtag_match] = /\A(?:[#＃]|:\/\/)/o
+
+ REGEXEN[:valid_mention_preceding_chars] = /(?:[^a-z0-9_!#\$%&*@＠]|^|(?:^|[^a-z0-9_+~.-])[rR][tT]:?)/io
+ REGEXEN[:at_signs] = /[@＠]/
+ REGEXEN[:valid_mention_or_list] = /
+ (#{REGEXEN[:valid_mention_preceding_chars]}) # $1: Preceding character
+ (#{REGEXEN[:at_signs]}) # $2: At mark
+ ([a-z0-9_]{1,20}) # $3: Screen name
+ (\/[a-z][a-zA-Z0-9_\-]{0,24})? # $4: List (optional)
+ /iox
+ REGEXEN[:valid_reply] = /^(?:[#{UNICODE_SPACES}#{DIRECTIONAL_CHARACTERS}])*#{REGEXEN[:at_signs]}([a-z0-9_]{1,20})/io
+ # Used in Extractor for final filtering
+ REGEXEN[:end_mention_match] = /\A(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/io
+
+ # URL related hash regex collection
+ REGEXEN[:valid_url_preceding_chars] = /(?:[^A-Z0-9@＠$#＃#{INVALID_CHARACTERS.join('')}]|[#{DIRECTIONAL_CHARACTERS.join('')}]|^)/io
+ REGEXEN[:invalid_url_without_protocol_preceding_chars] = /[-_.\/]$/
+
+ DOMAIN_VALID_CHARS = "[^#{DIRECTIONAL_CHARACTERS.join('')}#{PUNCTUATION_CHARS}#{SPACE_CHARS}#{CTRL_CHARS}#{INVALID_CHARACTERS.join('')}#{UNICODE_SPACES.join('')}]"
+ # "[a-z0-9#{LATIN_ACCENTS}]"
+ REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
+ REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
+
+ REGEXEN[:valid_gTLD] = %r{
+ (?:
+ (?:#{TLDS['generic'].join('|')})
+ (?=[^0-9a-z@+-]|$)
+ )
+ }ix
+
+ REGEXEN[:valid_ccTLD] = %r{
+ (?:
+ (?:#{TLDS['country'].join('|')})
+ (?=[^0-9a-z@+-]|$)
+ )
+ }ix
+ REGEXEN[:valid_punycode] = /(?:xn--[0-9a-z]+)/i
+
+ REGEXEN[:valid_domain] = /(?:
+ #{REGEXEN[:valid_subdomain]}*#{REGEXEN[:valid_domain_name]}
+ (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
+ )/iox
+
+ # This is used in Extractor
+ REGEXEN[:valid_ascii_domain] = /
+ (?:(?:[a-z0-9\-_]|#{REGEXEN[:latin_accents]})+\.)+
+ (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
+ /iox
+
+ # This is used in Extractor for stricter t.co URL extraction
+ REGEXEN[:valid_tco_url] = /^https?:\/\/t\.co\/([a-z0-9]+)/i
+
+ REGEXEN[:valid_port_number] = /[0-9]+/
+
+ REGEXEN[:valid_general_url_path_chars] = /[a-z\p{Cyrillic}0-9!\*';:=\+\,\.\$\/%#\[\]\p{Pd}_~&\|@#{LATIN_ACCENTS}]/io
+ # Allow URL paths to contain up to two nested levels of balanced parens
+ # 1. Used in Wikipedia URLs like /Primer_(film)
+ # 2. Used in IIS sessions like /S(dfd346)/
+ # 3. Used in Rdio URLs like /track/We_Up_(Album_Version_(Edited))/
+ REGEXEN[:valid_url_balanced_parens] = /
+ \(
+ (?:
+ #{REGEXEN[:valid_general_url_path_chars]}+
+ |
+ # allow one nested level of balanced parentheses
+ (?:
+ #{REGEXEN[:valid_general_url_path_chars]}*
+ \(
+ #{REGEXEN[:valid_general_url_path_chars]}+
+ \)
+ #{REGEXEN[:valid_general_url_path_chars]}*
+ )
+ )
+ \)
+ /iox
+ # Valid end-of-path characters (so /foo. does not gobble the period).
+ # 1. Allow =&# for empty URL parameters and other URL-join artifacts
+ REGEXEN[:valid_url_path_ending_chars] = /[a-z\p{Cyrillic}0-9=_#\/\+\-#{LATIN_ACCENTS}]|(?:#{REGEXEN[:valid_url_balanced_parens]})/io
+ REGEXEN[:valid_url_path] = /(?:
+ (?:
+ #{REGEXEN[:valid_general_url_path_chars]}*
+ (?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)*
+ #{REGEXEN[:valid_url_path_ending_chars]}
+ )|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/)
+ )/iox
+
+ REGEXEN[:valid_url_query_chars] = /[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]/i
+ REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/\-]/i
+ REGEXEN[:valid_url] = %r{
+ ( # $1 total match
+ (#{REGEXEN[:valid_url_preceding_chars]}) # $2 Preceding character
+ ( # $3 URL
+ (https?:\/\/)? # $4 Protocol (optional)
+ (#{REGEXEN[:valid_domain]}) # $5 Domain(s)
+ (?::(#{REGEXEN[:valid_port_number]}))? # $6 Port number (optional)
+ (/#{REGEXEN[:valid_url_path]}*)? # $7 URL Path and anchor
+ (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # $8 Query String
+ )
+ )
+ }iox
+
+ REGEXEN[:cashtag] = /[a-z]{1,6}(?:[._][a-z]{1,2})?/i
+ REGEXEN[:valid_cashtag] = /(^|[#{UNICODE_SPACES}#{DIRECTIONAL_CHARACTERS}])(\$)(#{REGEXEN[:cashtag]})(?=$|\s|[#{PUNCTUATION_CHARS}])/i
+
+ # These URL validation pattern strings are based on the ABNF from RFC 3986
+ REGEXEN[:validate_url_unreserved] = /[a-z\p{Cyrillic}0-9\p{Pd}._~]/i
+ REGEXEN[:validate_url_pct_encoded] = /(?:%[0-9a-f]{2})/i
+ REGEXEN[:validate_url_sub_delims] = /[!$&'()*+,;=]/i
+ REGEXEN[:validate_url_pchar] = /(?:
+ #{REGEXEN[:validate_url_unreserved]}|
+ #{REGEXEN[:validate_url_pct_encoded]}|
+ #{REGEXEN[:validate_url_sub_delims]}|
+ [:\|@]
+ )/iox
+
+ REGEXEN[:validate_url_scheme] = /(?:[a-z][a-z0-9+\-.]*)/i
+ REGEXEN[:validate_url_userinfo] = /(?:
+ #{REGEXEN[:validate_url_unreserved]}|
+ #{REGEXEN[:validate_url_pct_encoded]}|
+ #{REGEXEN[:validate_url_sub_delims]}|
+ :
+ )*/iox
+
+ REGEXEN[:validate_url_dec_octet] = /(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))/i
+ REGEXEN[:validate_url_ipv4] =
+ /(?:#{REGEXEN[:validate_url_dec_octet]}(?:\.#{REGEXEN[:validate_url_dec_octet]}){3})/iox
+
+ # Punting on real IPv6 validation for now
+ REGEXEN[:validate_url_ipv6] = /(?:\[[a-f0-9:\.]+\])/i
+
+ # Also punting on IPvFuture for now
+ REGEXEN[:validate_url_ip] = /(?:
+ #{REGEXEN[:validate_url_ipv4]}|
+ #{REGEXEN[:validate_url_ipv6]}
+ )/iox
+
+ # This is more strict than the RFC specifies
+ REGEXEN[:validate_url_subdomain_segment] = /(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)/i
+ REGEXEN[:validate_url_domain_segment] = /(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)/i
+ REGEXEN[:validate_url_domain_tld] = /(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)/i
+ REGEXEN[:validate_url_domain] = /(?:(?:#{REGEXEN[:validate_url_subdomain_segment]}\.)*
+ (?:#{REGEXEN[:validate_url_domain_segment]}\.)
+ #{REGEXEN[:validate_url_domain_tld]})/iox
+
+ REGEXEN[:validate_url_host] = /(?:
+ #{REGEXEN[:validate_url_ip]}|
+ #{REGEXEN[:validate_url_domain]}
+ )/iox
+
+ # Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences
+ REGEXEN[:validate_url_unicode_subdomain_segment] =
+ /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9_\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
+ REGEXEN[:validate_url_unicode_domain_segment] =
+ /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
+ REGEXEN[:validate_url_unicode_domain_tld] =
+ /(?:(?:[a-z]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
+ REGEXEN[:validate_url_unicode_domain] = /(?:(?:#{REGEXEN[:validate_url_unicode_subdomain_segment]}\.)*
+ (?:#{REGEXEN[:validate_url_unicode_domain_segment]}\.)
+ #{REGEXEN[:validate_url_unicode_domain_tld]})/iox
+
+ REGEXEN[:validate_url_unicode_host] = /(?:
+ #{REGEXEN[:validate_url_ip]}|
+ #{REGEXEN[:validate_url_unicode_domain]}
+ )/iox
+
+ REGEXEN[:validate_url_port] = /[0-9]{1,5}/
+
+ REGEXEN[:validate_url_unicode_authority] = %r{
+ (?:(#{REGEXEN[:validate_url_userinfo]})@)? # $1 userinfo
+ (#{REGEXEN[:validate_url_unicode_host]}) # $2 host
+ (?::(#{REGEXEN[:validate_url_port]}))? # $3 port
+ }iox
+
+ REGEXEN[:validate_url_authority] = %r{
+ (?:(#{REGEXEN[:validate_url_userinfo]})@)? # $1 userinfo
+ (#{REGEXEN[:validate_url_host]}) # $2 host
+ (?::(#{REGEXEN[:validate_url_port]}))? # $3 port
+ }iox
+
+ REGEXEN[:validate_url_path] = %r{(/#{REGEXEN[:validate_url_pchar]}*)*}i
+ REGEXEN[:validate_url_query] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
+ REGEXEN[:validate_url_fragment] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
+
+ REGEXEN[:valid_emoji] = Twitter::TwitterText::Regex::Emoji[:valid_emoji]
+
+ # Modified version of RFC 3986 Appendix B
+ REGEXEN[:validate_url_unencoded] = %r{
+ \A # Full URL
+ (?:
+ ([^:/?#]+):// # $1 Scheme
+ )?
+ ([^/?#]*) # $2 Authority
+ ([^?#]*) # $3 Path
+ (?:
+ \?([^#]*) # $4 Query
+ )?
+ (?:
+ \#(.*) # $5 Fragment
+ )?\Z
+ }ix
+
+ REGEXEN[:rtl_chars] = /[#{RTL_CHARACTERS}]/io
+
+ REGEXEN.each_pair{|k,v| v.freeze }
+
+ # Return the regular expression for a given <tt>key</tt>. If the <tt>key</tt>
+ # is not a known symbol, <tt>nil</tt> will be returned.
+ def self.[](key)
+ REGEXEN[key]
+ end
+ end
+ end
+ end
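
A minimal usage sketch (editorial, not part of the diff above), assuming the gem is already loaded via something like require 'twitter-text': patterns are looked up through the Regex.[] accessor defined above, and the hashtag text lands in the third capture group of the HASHTAG pattern; the sample string is made up for illustration.

    require 'twitter-text'

    hashtag_re = Twitter::TwitterText::Regex[:valid_hashtag]

    # With a grouped pattern, String#scan yields the capture groups:
    # $1 preceding char, $2 the hash sign, $3 the hashtag text.
    "Testing #ruby and #twitter_text".scan(hashtag_re) do |_before, _hash, tag|
      puts tag                                  # => "ruby", then "twitter_text"
    end

    Twitter::TwitterText::Regex[:no_such_key]   # => nil for unknown keys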
@@ -0,0 +1,69 @@
+ # Copyright 2018 Twitter, Inc.
+ # Licensed under the Apache License, Version 2.0
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ module Twitter
+ module TwitterText
+ # A module that provides base methods to rewrite usernames, lists, hashtags, and URLs.
+ module Rewriter extend self
+ def rewrite_entities(text, entities)
+ codepoints = text.to_s.to_codepoint_a
+
+ # sort by start index
+ entities = entities.sort_by do |entity|
+ indices = entity.respond_to?(:indices) ? entity.indices : entity[:indices]
+ indices.first
+ end
+
+ result = []
+ last_index = entities.inject(0) do |index, entity|
+ indices = entity.respond_to?(:indices) ? entity.indices : entity[:indices]
+ result << codepoints[index...indices.first]
+ result << yield(entity, codepoints)
+ indices.last
+ end
+ result << codepoints[last_index..-1]
+
+ result.flatten.join
+ end
+
+ # These methods are deprecated and will be removed in a future release.
+ extend Deprecation
+
+ def rewrite(text, options = {})
+ [:hashtags, :urls, :usernames_or_lists].inject(text) do |text, key|
+ options[key] ? send(:"rewrite_#{key}", text, &options[key]) : text
+ end
+ end
+ deprecate :rewrite, :rewrite_entities
+
+ def rewrite_usernames_or_lists(text)
+ entities = Extractor.extract_mentions_or_lists_with_indices(text)
+ rewrite_entities(text, entities) do |entity, codepoints|
+ at = codepoints[entity[:indices].first]
+ list_slug = entity[:list_slug]
+ list_slug = nil if list_slug.empty?
+ yield(at, entity[:screen_name], list_slug)
+ end
+ end
+ deprecate :rewrite_usernames_or_lists, :rewrite_entities
+
+ def rewrite_hashtags(text)
+ entities = Extractor.extract_hashtags_with_indices(text)
+ rewrite_entities(text, entities) do |entity, codepoints|
+ hash = codepoints[entity[:indices].first]
+ yield(hash, entity[:hashtag])
+ end
+ end
+ deprecate :rewrite_hashtags, :rewrite_entities
+
+ def rewrite_urls(text)
+ entities = Extractor.extract_urls_with_indices(text, :extract_url_without_protocol => false)
+ rewrite_entities(text, entities) do |entity, codepoints|
+ yield(entity[:url])
+ end
+ end
+ deprecate :rewrite_urls, :rewrite_entities
+ end
+ end
+ end
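
A hedged usage sketch for rewrite_entities (editorial, not part of the diff): the entity hash and link target below are hand-built for illustration, whereas in practice the Extractor calls shown in the deprecated wrappers above supply the entities; it also assumes the gem's String#to_codepoint_a core extension has been loaded.

    text = "Hello @alice!"
    # "@alice" occupies codepoints 6...12 (end-exclusive), mirroring Extractor indices.
    entities = [{ :indices => [6, 12], :screen_name => "alice" }]

    linked = Twitter::TwitterText::Rewriter.rewrite_entities(text, entities) do |entity, _codepoints|
      name = entity[:screen_name]
      "<a href=\"https://twitter.com/#{name}\">@#{name}</a>"
    end
    # linked == "Hello <a href=\"https://twitter.com/alice\">@alice</a>!"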
@@ -0,0 +1,31 @@
+ # Copyright 2018 Twitter, Inc.
+ # Licensed under the Apache License, Version 2.0
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ module Twitter
+ module TwitterText
+ # This module lazily defines constants of the form Uxxxx for all Unicode
+ # codepoints from U0000 to U10FFFF. The value of each constant is the
+ # UTF-8 string for the codepoint.
+ # Examples:
+ # copyright = Unicode::U00A9
+ # euro = Unicode::U20AC
+ # infinity = Unicode::U221E
+ #
+ module Unicode
+ CODEPOINT_REGEX = /^U_?([0-9a-fA-F]{4,5}|10[0-9a-fA-F]{4})$/
+
+ def self.const_missing(name)
+ # Check that the constant name is of the right form: U0000 to U10FFFF
+ if name.to_s =~ CODEPOINT_REGEX
+ # Convert the codepoint to an immutable UTF-8 string,
+ # define a real constant for that value and return the value
+ #p name, name.class
+ const_set(name, [$1.to_i(16)].pack("U").freeze)
+ else # Raise an error for constants that are not Unicode.
+ raise NameError, "Uninitialized constant: Unicode::#{name}"
+ end
+ end
+ end
+ end
+ end
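
A brief sketch of the lazy constant lookup above (editorial, not part of the diff): the first reference to a well-formed Uxxxx name goes through const_missing, which defines the constant and returns the frozen UTF-8 string, while a name that does not match CODEPOINT_REGEX raises NameError.

    Twitter::TwitterText::Unicode::U00A9   # => "©"
    Twitter::TwitterText::Unicode::U1F600  # => "😀" (five-digit codepoint)
    Twitter::TwitterText::Unicode::FOO     # raises NameError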