twitter-text-kow 1.3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,381 @@
1
+ # Copyright 2018 Twitter, Inc.
2
+ # Licensed under the Apache License, Version 2.0
3
+ # http://www.apache.org/licenses/LICENSE-2.0
4
+
5
+ # encoding: utf-8
6
+
7
+ module Twitter
8
+ module TwitterText
9
+ # A collection of regular expressions for parsing Tweet text. The regular expression
10
+ # list is frozen at load time to ensure immutability. These regular expressions are
11
+ # used throughout the <tt>TwitterText</tt> classes. Special care has been taken to make
12
+ # sure these regular expressions work with Tweets in all languages.
13
+ class Regex
14
+ require 'yaml'
15
+
16
+ REGEXEN = {} # :nodoc:
17
+
18
# Build a character-class fragment for one code point or an inclusive range of code points.
#
# from - Integer code point (the only member, or the lower bound of the range).
# to   - optional Integer upper bound; when given the result is "<from>-<to>".
#
# When the global $RUBY_1_9 is truthy each code point is rendered as a textual
# "\u{xxxx}" escape (hex, left-padded with zeros to at least four digits). Otherwise
# it is rendered as the literal character produced by Array#pack('U').
#
# Returns a String meant to be interpolated between "[" and "]" in a pattern.
def self.regex_range(from, to = nil) # :nodoc:
  encode =
    if $RUBY_1_9
      lambda { |codepoint| "\\u{#{codepoint.to_s(16).rjust(4, '0')}}" }
    else
      lambda { |codepoint| [codepoint].pack('U') }
    end

  # Single code point: no range separator.
  return encode.call(from) unless to

  encode.call(from) + '-' + encode.call(to)
end
33
+
34
+ TLDS = YAML.load_file(
35
+ File.join(
36
+ File.expand_path('../../..', __FILE__), # project root
37
+ 'lib', 'assets', 'tld_lib.yml'
38
+ )
39
+ )
40
+
41
+ # Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand
42
+ # to access both the list of characters and a pattern suitable for use with String#split
43
+ # Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE
44
+ UNICODE_SPACES = [
45
+ (0x0009..0x000D).to_a, # White_Space # Cc [5] <control-0009>..<control-000D>
46
+ 0x0020, # White_Space # Zs SPACE
47
+ 0x0085, # White_Space # Cc <control-0085>
48
+ 0x00A0, # White_Space # Zs NO-BREAK SPACE
49
+ 0x1680, # White_Space # Zs OGHAM SPACE MARK
50
+ 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
51
+ (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE
52
+ 0x2028, # White_Space # Zl LINE SEPARATOR
53
+ 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
54
+ 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
55
+ 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
56
+ 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
57
+ ].flatten.map{|c| [c].pack('U*')}.freeze
58
+ REGEXEN[:spaces] = /[#{UNICODE_SPACES.join('')}]/o
59
+
60
+ DIRECTIONAL_CHARACTERS = [
61
+ 0x061C, # ARABIC LETTER MARK (ALM)
62
+ 0x200E, # LEFT-TO-RIGHT MARK (LRM)
63
+ 0x200F, # RIGHT-TO-LEFT MARK (RLM)
64
+ 0x202A, # LEFT-TO-RIGHT EMBEDDING (LRE)
65
+ 0x202B, # RIGHT-TO-LEFT EMBEDDING (RLE)
66
+ 0x202C, # POP DIRECTIONAL FORMATTING (PDF)
67
+ 0x202D, # LEFT-TO-RIGHT OVERRIDE (LRO)
68
+ 0x202E, # RIGHT-TO-LEFT OVERRIDE (RLO)
69
+ 0x2066, # LEFT-TO-RIGHT ISOLATE (LRI)
70
+ 0x2067, # RIGHT-TO-LEFT ISOLATE (RLI)
71
+ 0x2068, # FIRST STRONG ISOLATE (FSI)
72
+ 0x2069, # POP DIRECTIONAL ISOLATE (PDI)
73
+ ].map{|cp| [cp].pack('U')}.freeze
74
+ REGEXEN[:directional_characters] = /[#{DIRECTIONAL_CHARACTERS.join('')}]/o
75
+
76
+ # Characters not allowed in Tweets
77
+ INVALID_CHARACTERS = [
78
+ 0xFFFE, 0xFEFF, # BOM
79
+ 0xFFFF, # Special
80
+ ].map{|cp| [cp].pack('U') }.freeze
81
+ REGEXEN[:invalid_control_characters] = /[#{INVALID_CHARACTERS.join('')}]/o
82
+
83
+ major, minor, _patch = RUBY_VERSION.split('.')
84
+ if major.to_i >= 2 || major.to_i == 1 && minor.to_i >= 9 || (defined?(RUBY_ENGINE) && ["jruby", "rbx"].include?(RUBY_ENGINE))
85
+ REGEXEN[:list_name] = /[a-z][a-z0-9_\-\u0080-\u00ff]{0,24}/i
86
+ else
87
+ # This line barfs at compile time in Ruby 1.9, JRuby, or Rubinius.
88
+ REGEXEN[:list_name] = eval("/[a-z][a-z0-9_\\-\x80-\xff]{0,24}/i")
89
+ end
90
+
91
+ # Latin accented characters
92
+ # Excludes 0xd7 from the range (the multiplication sign, confusable with "x").
93
+ # Also excludes 0xf7, the division sign
94
+ LATIN_ACCENTS = [
95
+ regex_range(0xc0, 0xd6),
96
+ regex_range(0xd8, 0xf6),
97
+ regex_range(0xf8, 0xff),
98
+ regex_range(0x0100, 0x024f),
99
+ regex_range(0x0253, 0x0254),
100
+ regex_range(0x0256, 0x0257),
101
+ regex_range(0x0259),
102
+ regex_range(0x025b),
103
+ regex_range(0x0263),
104
+ regex_range(0x0268),
105
+ regex_range(0x026f),
106
+ regex_range(0x0272),
107
+ regex_range(0x0289),
108
+ regex_range(0x028b),
109
+ regex_range(0x02bb),
110
+ regex_range(0x0300, 0x036f),
111
+ regex_range(0x1e00, 0x1eff)
112
+ ].join('').freeze
113
+ REGEXEN[:latin_accents] = /[#{LATIN_ACCENTS}]+/o
114
+
115
+ RTL_CHARACTERS = [
116
+ regex_range(0x0600,0x06FF),
117
+ regex_range(0x0750,0x077F),
118
+ regex_range(0x0590,0x05FF),
119
+ regex_range(0xFE70,0xFEFF)
120
+ ].join('').freeze
121
+
122
+ PUNCTUATION_CHARS = '!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~'
123
+ SPACE_CHARS = " \t\n\x0B\f\r"
124
+ CTRL_CHARS = "\x00-\x1F\x7F"
125
+
126
+ # Generated from unicode_regex/unicode_regex_groups.scala, more inclusive than Ruby's \p{L}\p{M}
127
+ HASHTAG_LETTERS_AND_MARKS = "\\p{L}\\p{M}" +
128
+ "\u037f\u0528-\u052f\u08a0-\u08b2\u08e4-\u08ff\u0978\u0980\u0c00\u0c34\u0c81\u0d01\u0ede\u0edf" +
129
+ "\u10c7\u10cd\u10fd-\u10ff\u16f1-\u16f8\u17b4\u17b5\u191d\u191e\u1ab0-\u1abe\u1bab-\u1bad\u1bba-" +
130
+ "\u1bbf\u1cf3-\u1cf6\u1cf8\u1cf9\u1de7-\u1df5\u2cf2\u2cf3\u2d27\u2d2d\u2d66\u2d67\u9fcc\ua674-" +
131
+ "\ua67b\ua698-\ua69d\ua69f\ua792-\ua79f\ua7aa-\ua7ad\ua7b0\ua7b1\ua7f7-\ua7f9\ua9e0-\ua9ef\ua9fa-" +
132
+ "\ua9fe\uaa7c-\uaa7f\uaae0-\uaaef\uaaf2-\uaaf6\uab30-\uab5a\uab5c-\uab5f\uab64\uab65\uf870-\uf87f" +
133
+ "\uf882\uf884-\uf89f\uf8b8\uf8c1-\uf8d6\ufa2e\ufa2f\ufe27-\ufe2d\u{102e0}\u{1031f}\u{10350}-\u{1037a}" +
134
+ "\u{10500}-\u{10527}\u{10530}-\u{10563}\u{10600}-\u{10736}\u{10740}-\u{10755}\u{10760}-\u{10767}" +
135
+ "\u{10860}-\u{10876}\u{10880}-\u{1089e}\u{10980}-\u{109b7}\u{109be}\u{109bf}\u{10a80}-\u{10a9c}" +
136
+ "\u{10ac0}-\u{10ac7}\u{10ac9}-\u{10ae6}\u{10b80}-\u{10b91}\u{1107f}\u{110d0}-\u{110e8}\u{11100}-" +
137
+ "\u{11134}\u{11150}-\u{11173}\u{11176}\u{11180}-\u{111c4}\u{111da}\u{11200}-\u{11211}\u{11213}-" +
138
+ "\u{11237}\u{112b0}-\u{112ea}\u{11301}-\u{11303}\u{11305}-\u{1130c}\u{1130f}\u{11310}\u{11313}-" +
139
+ "\u{11328}\u{1132a}-\u{11330}\u{11332}\u{11333}\u{11335}-\u{11339}\u{1133c}-\u{11344}\u{11347}" +
140
+ "\u{11348}\u{1134b}-\u{1134d}\u{11357}\u{1135d}-\u{11363}\u{11366}-\u{1136c}\u{11370}-\u{11374}" +
141
+ "\u{11480}-\u{114c5}\u{114c7}\u{11580}-\u{115b5}\u{115b8}-\u{115c0}\u{11600}-\u{11640}\u{11644}" +
142
+ "\u{11680}-\u{116b7}\u{118a0}-\u{118df}\u{118ff}\u{11ac0}-\u{11af8}\u{1236f}-\u{12398}\u{16a40}-" +
143
+ "\u{16a5e}\u{16ad0}-\u{16aed}\u{16af0}-\u{16af4}\u{16b00}-\u{16b36}\u{16b40}-\u{16b43}\u{16b63}-" +
144
+ "\u{16b77}\u{16b7d}-\u{16b8f}\u{16f00}-\u{16f44}\u{16f50}-\u{16f7e}\u{16f8f}-\u{16f9f}\u{1bc00}-" +
145
+ "\u{1bc6a}\u{1bc70}-\u{1bc7c}\u{1bc80}-\u{1bc88}\u{1bc90}-\u{1bc99}\u{1bc9d}\u{1bc9e}\u{1e800}-" +
146
+ "\u{1e8c4}\u{1e8d0}-\u{1e8d6}\u{1ee00}-\u{1ee03}\u{1ee05}-\u{1ee1f}\u{1ee21}\u{1ee22}\u{1ee24}" +
147
+ "\u{1ee27}\u{1ee29}-\u{1ee32}\u{1ee34}-\u{1ee37}\u{1ee39}\u{1ee3b}\u{1ee42}\u{1ee47}\u{1ee49}" +
148
+ "\u{1ee4b}\u{1ee4d}-\u{1ee4f}\u{1ee51}\u{1ee52}\u{1ee54}\u{1ee57}\u{1ee59}\u{1ee5b}\u{1ee5d}\u{1ee5f}" +
149
+ "\u{1ee61}\u{1ee62}\u{1ee64}\u{1ee67}-\u{1ee6a}\u{1ee6c}-\u{1ee72}\u{1ee74}-\u{1ee77}\u{1ee79}-" +
150
+ "\u{1ee7c}\u{1ee7e}\u{1ee80}-\u{1ee89}\u{1ee8b}-\u{1ee9b}\u{1eea1}-\u{1eea3}\u{1eea5}-\u{1eea9}" +
151
+ "\u{1eeab}-\u{1eebb}"
152
+
153
+ # Generated from unicode_regex/unicode_regex_groups.scala, more inclusive than Ruby's \p{Nd}
154
+ HASHTAG_NUMERALS = "\\p{Nd}" +
155
+ "\u0de6-\u0def\ua9f0-\ua9f9\u{110f0}-\u{110f9}\u{11136}-\u{1113f}\u{111d0}-\u{111d9}\u{112f0}-" +
156
+ "\u{112f9}\u{114d0}-\u{114d9}\u{11650}-\u{11659}\u{116c0}-\u{116c9}\u{118e0}-\u{118e9}\u{16a60}-" +
157
+ "\u{16a69}\u{16b50}-\u{16b59}"
158
+
159
+ HASHTAG_SPECIAL_CHARS = "_\u200c\u200d\ua67e\u05be\u05f3\u05f4\uff5e\u301c\u309b\u309c\u30a0\u30fb\u3003\u0f0b\u0f0c\u00b7"
160
+
161
+ HASHTAG_LETTERS_NUMERALS = "#{HASHTAG_LETTERS_AND_MARKS}#{HASHTAG_NUMERALS}#{HASHTAG_SPECIAL_CHARS}"
162
+ HASHTAG_LETTERS_NUMERALS_SET = "[#{HASHTAG_LETTERS_NUMERALS}]"
163
+ HASHTAG_LETTERS_SET = "[#{HASHTAG_LETTERS_AND_MARKS}]"
164
+
165
# Hashtag pattern. Capture 1 is the boundary before the tag, capture 2 the hash sign and
# capture 3 the tag text, which must contain at least one letter/mark character.
# The hash sign may be ASCII "#" or FULLWIDTH NUMBER SIGN (U+FF03); the fullwidth form is
# written as an escape so the two alternatives cannot collapse into the same character.
HASHTAG = /(\A|\ufe0e|\ufe0f|[^&#{HASHTAG_LETTERS_NUMERALS}])(#|\uff03)(?!\ufe0f|\u20e3)(#{HASHTAG_LETTERS_NUMERALS_SET}*#{HASHTAG_LETTERS_SET}#{HASHTAG_LETTERS_NUMERALS_SET}*)/io

REGEXEN[:valid_hashtag] = /#{HASHTAG}/io
# Used in Extractor for final filtering: text that starts with a hash sign
# (ASCII or fullwidth U+FF03) or with "://".
REGEXEN[:end_hashtag_match] = /\A(?:[#\uff03]|:\/\/)/o

# Text allowed immediately before a mention. Both the ASCII at sign and
# FULLWIDTH COMMERCIAL AT (U+FF20) are excluded from the preceding character.
REGEXEN[:valid_mention_preceding_chars] = /(?:[^a-z0-9_!#\$%&*@\uff20]|^|(?:^|[^a-z0-9_+~.-])[rR][tT]:?)/io
# An at sign: ASCII "@" or FULLWIDTH COMMERCIAL AT (U+FF20).
REGEXEN[:at_signs] = /[@\uff20]/
173
+ REGEXEN[:valid_mention_or_list] = /
174
+ (#{REGEXEN[:valid_mention_preceding_chars]}) # $1: Preceeding character
175
+ (#{REGEXEN[:at_signs]}) # $2: At mark
176
+ ([a-z0-9_]{1,20}) # $3: Screen name
177
+ (\/[a-z][a-zA-Z0-9_\-]{0,24})? # $4: List (optional)
178
+ /iox
179
# A reply: optional leading whitespace/directional marks, an at sign, then the screen
# name (capture 1). UNICODE_SPACES and DIRECTIONAL_CHARACTERS are Arrays, so they are
# joined before interpolation; interpolating an Array directly inserts its inspect form
# (quotes, commas, brackets) into the character class, which then also accepts '"' and ','.
REGEXEN[:valid_reply] = /^(?:[#{UNICODE_SPACES.join('')}#{DIRECTIONAL_CHARACTERS.join('')}])*#{REGEXEN[:at_signs]}([a-z0-9_]{1,20})/io
180
+ # Used in Extractor for final filtering
181
+ REGEXEN[:end_mention_match] = /\A(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/io
182
+
183
+ # URL related hash regex collection
184
# What may come right before a URL: any character other than an ASCII letter or digit,
# "@", "$", "#", their fullwidth forms U+FF20 / U+FF03 or one of INVALID_CHARACTERS; or a
# directional mark; or the start of a line. The fullwidth signs are written as escapes so
# they stay distinct from their ASCII counterparts.
REGEXEN[:valid_url_preceding_chars] = /(?:[^A-Z0-9@\uff20$#\uff03#{INVALID_CHARACTERS.join('')}]|[#{DIRECTIONAL_CHARACTERS.join('')}]|^)/io
185
+ REGEXEN[:invalid_url_without_protocol_preceding_chars] = /[-_.\/]$/
186
+
187
+ DOMAIN_VALID_CHARS = "[^#{DIRECTIONAL_CHARACTERS.join('')}#{PUNCTUATION_CHARS}#{SPACE_CHARS}#{CTRL_CHARS}#{INVALID_CHARACTERS.join('')}#{UNICODE_SPACES.join('')}]"
188
+ # "[a-z0-9#{LATIN_ACCENTS}]"
189
+ REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
190
+ REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
191
+
192
+ REGEXEN[:valid_gTLD] = %r{
193
+ (?:
194
+ (?:#{TLDS['generic'].join('|')})
195
+ (?=[^0-9a-z@+-]|$)
196
+ )
197
+ }ix
198
+
199
+ REGEXEN[:valid_ccTLD] = %r{
200
+ (?:
201
+ (?:#{TLDS['country'].join('|')})
202
+ (?=[^0-9a-z@+-]|$)
203
+ )
204
+ }ix
205
+ REGEXEN[:valid_punycode] = /(?:xn--[0-9a-z]+)/i
206
+
207
+ REGEXEN[:valid_domain] = /(?:
208
+ #{REGEXEN[:valid_subdomain]}*#{REGEXEN[:valid_domain_name]}
209
+ (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
210
+ )/iox
211
+
212
+ # This is used in Extractor
213
+ REGEXEN[:valid_ascii_domain] = /
214
+ (?:(?:[a-z0-9\-_]|#{REGEXEN[:latin_accents]})+\.)+
215
+ (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
216
+ /iox
217
+
218
+ # This is used in Extractor for stricter t.co URL extraction
219
+ REGEXEN[:valid_tco_url] = /^https?:\/\/t\.co\/([a-z0-9]+)/i
220
+
221
+ REGEXEN[:valid_port_number] = /[0-9]+/
222
+
223
+ REGEXEN[:valid_general_url_path_chars] = /[a-z\p{Cyrillic}0-9!\*';:=\+\,\.\$\/%#\[\]\p{Pd}_~&\|@#{LATIN_ACCENTS}]/io
224
+ # Allow URL paths to contain up to two nested levels of balanced parens
225
+ # 1. Used in Wikipedia URLs like /Primer_(film)
226
+ # 2. Used in IIS sessions like /S(dfd346)/
227
+ # 3. Used in Rdio URLs like /track/We_Up_(Album_Version_(Edited))/
228
+ REGEXEN[:valid_url_balanced_parens] = /
229
+ \(
230
+ (?:
231
+ #{REGEXEN[:valid_general_url_path_chars]}+
232
+ |
233
+ # allow one nested level of balanced parentheses
234
+ (?:
235
+ #{REGEXEN[:valid_general_url_path_chars]}*
236
+ \(
237
+ #{REGEXEN[:valid_general_url_path_chars]}+
238
+ \)
239
+ #{REGEXEN[:valid_general_url_path_chars]}*
240
+ )
241
+ )
242
+ \)
243
+ /iox
244
+ # Valid end-of-path characters (so /foo. does not gobble the period).
245
+ # 1. Allow =&# for empty URL parameters and other URL-join artifacts
246
+ REGEXEN[:valid_url_path_ending_chars] = /[a-z\p{Cyrillic}0-9=_#\/\+\-#{LATIN_ACCENTS}]|(?:#{REGEXEN[:valid_url_balanced_parens]})/io
247
+ REGEXEN[:valid_url_path] = /(?:
248
+ (?:
249
+ #{REGEXEN[:valid_general_url_path_chars]}*
250
+ (?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)*
251
+ #{REGEXEN[:valid_url_path_ending_chars]}
252
+ )|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/)
253
+ )/iox
254
+
255
+ REGEXEN[:valid_url_query_chars] = /[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]/i
256
+ REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/\-]/i
257
+ REGEXEN[:valid_url] = %r{
258
+ ( # $1 total match
259
+ (#{REGEXEN[:valid_url_preceding_chars]}) # $2 Preceeding chracter
260
+ ( # $3 URL
261
+ (https?:\/\/)? # $4 Protocol (optional)
262
+ (#{REGEXEN[:valid_domain]}) # $5 Domain(s)
263
+ (?::(#{REGEXEN[:valid_port_number]}))? # $6 Port number (optional)
264
+ (/#{REGEXEN[:valid_url_path]}*)? # $7 URL Path and anchor
265
+ (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # $8 Query String
266
+ )
267
+ )
268
+ }iox
269
+
270
# Cashtag symbol body: 1-6 letters, optionally followed by "." or "_" and 1-2 letters.
REGEXEN[:cashtag] = /[a-z]{1,6}(?:[._][a-z]{1,2})?/i
# Full cashtag: capture 1 is the preceding boundary (line start, whitespace or a
# directional mark), capture 2 the "$", capture 3 the symbol; it must be followed by end of
# line, whitespace or punctuation. UNICODE_SPACES and DIRECTIONAL_CHARACTERS are Arrays and
# are joined before interpolation; interpolating them directly would insert their inspect
# form into the character class and let '"' and ',' count as a valid preceding character.
REGEXEN[:valid_cashtag] = /(^|[#{UNICODE_SPACES.join('')}#{DIRECTIONAL_CHARACTERS.join('')}])(\$)(#{REGEXEN[:cashtag]})(?=$|\s|[#{PUNCTUATION_CHARS}])/i
272
+
273
+ # These URL validation pattern strings are based on the ABNF from RFC 3986
274
+ REGEXEN[:validate_url_unreserved] = /[a-z\p{Cyrillic}0-9\p{Pd}._~]/i
275
+ REGEXEN[:validate_url_pct_encoded] = /(?:%[0-9a-f]{2})/i
276
+ REGEXEN[:validate_url_sub_delims] = /[!$&'()*+,;=]/i
277
+ REGEXEN[:validate_url_pchar] = /(?:
278
+ #{REGEXEN[:validate_url_unreserved]}|
279
+ #{REGEXEN[:validate_url_pct_encoded]}|
280
+ #{REGEXEN[:validate_url_sub_delims]}|
281
+ [:\|@]
282
+ )/iox
283
+
284
+ REGEXEN[:validate_url_scheme] = /(?:[a-z][a-z0-9+\-.]*)/i
285
+ REGEXEN[:validate_url_userinfo] = /(?:
286
+ #{REGEXEN[:validate_url_unreserved]}|
287
+ #{REGEXEN[:validate_url_pct_encoded]}|
288
+ #{REGEXEN[:validate_url_sub_delims]}|
289
+ :
290
+ )*/iox
291
+
292
+ REGEXEN[:validate_url_dec_octet] = /(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))/i
293
+ REGEXEN[:validate_url_ipv4] =
294
+ /(?:#{REGEXEN[:validate_url_dec_octet]}(?:\.#{REGEXEN[:validate_url_dec_octet]}){3})/iox
295
+
296
+ # Punting on real IPv6 validation for now
297
+ REGEXEN[:validate_url_ipv6] = /(?:\[[a-f0-9:\.]+\])/i
298
+
299
+ # Also punting on IPvFuture for now
300
+ REGEXEN[:validate_url_ip] = /(?:
301
+ #{REGEXEN[:validate_url_ipv4]}|
302
+ #{REGEXEN[:validate_url_ipv6]}
303
+ )/iox
304
+
305
+ # This is more strict than the rfc specifies
306
+ REGEXEN[:validate_url_subdomain_segment] = /(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)/i
307
+ REGEXEN[:validate_url_domain_segment] = /(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)/i
308
+ REGEXEN[:validate_url_domain_tld] = /(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)/i
309
+ REGEXEN[:validate_url_domain] = /(?:(?:#{REGEXEN[:validate_url_subdomain_segment]}\.)*
310
+ (?:#{REGEXEN[:validate_url_domain_segment]}\.)
311
+ #{REGEXEN[:validate_url_domain_tld]})/iox
312
+
313
+ REGEXEN[:validate_url_host] = /(?:
314
+ #{REGEXEN[:validate_url_ip]}|
315
+ #{REGEXEN[:validate_url_domain]}
316
+ )/iox
317
+
318
+ # Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences
319
+ REGEXEN[:validate_url_unicode_subdomain_segment] =
320
+ /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9_\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
321
+ REGEXEN[:validate_url_unicode_domain_segment] =
322
+ /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
323
+ REGEXEN[:validate_url_unicode_domain_tld] =
324
+ /(?:(?:[a-z]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
325
+ REGEXEN[:validate_url_unicode_domain] = /(?:(?:#{REGEXEN[:validate_url_unicode_subdomain_segment]}\.)*
326
+ (?:#{REGEXEN[:validate_url_unicode_domain_segment]}\.)
327
+ #{REGEXEN[:validate_url_unicode_domain_tld]})/iox
328
+
329
+ REGEXEN[:validate_url_unicode_host] = /(?:
330
+ #{REGEXEN[:validate_url_ip]}|
331
+ #{REGEXEN[:validate_url_unicode_domain]}
332
+ )/iox
333
+
334
+ REGEXEN[:validate_url_port] = /[0-9]{1,5}/
335
+
336
+ REGEXEN[:validate_url_unicode_authority] = %r{
337
+ (?:(#{REGEXEN[:validate_url_userinfo]})@)? # $1 userinfo
338
+ (#{REGEXEN[:validate_url_unicode_host]}) # $2 host
339
+ (?::(#{REGEXEN[:validate_url_port]}))? # $3 port
340
+ }iox
341
+
342
+ REGEXEN[:validate_url_authority] = %r{
343
+ (?:(#{REGEXEN[:validate_url_userinfo]})@)? # $1 userinfo
344
+ (#{REGEXEN[:validate_url_host]}) # $2 host
345
+ (?::(#{REGEXEN[:validate_url_port]}))? # $3 port
346
+ }iox
347
+
348
+ REGEXEN[:validate_url_path] = %r{(/#{REGEXEN[:validate_url_pchar]}*)*}i
349
+ REGEXEN[:validate_url_query] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
350
+ REGEXEN[:validate_url_fragment] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
351
+
352
+ REGEXEN[:valid_emoji] = Twitter::TwitterText::Regex::Emoji[:valid_emoji]
353
+
354
+ # Modified version of RFC 3986 Appendix B
355
+ REGEXEN[:validate_url_unencoded] = %r{
356
+ \A # Full URL
357
+ (?:
358
+ ([^:/?#]+):// # $1 Scheme
359
+ )?
360
+ ([^/?#]*) # $2 Authority
361
+ ([^?#]*) # $3 Path
362
+ (?:
363
+ \?([^#]*) # $4 Query
364
+ )?
365
+ (?:
366
+ \#(.*) # $5 Fragment
367
+ )?\Z
368
+ }ix
369
+
370
+ REGEXEN[:rtl_chars] = /[#{RTL_CHARACTERS}]/io
371
+
372
+ REGEXEN.each_pair{|k,v| v.freeze }
373
+
374
# Look up one of the compiled patterns by name.
#
# key - the name the pattern is stored under in REGEXEN (the table is keyed by Symbols).
#
# Returns the stored pattern, or <tt>nil</tt> when nothing is stored under <tt>key</tt>.
def self.[](key)
  REGEXEN.fetch(key, nil)
end
379
+ end
380
+ end
381
+ end
@@ -0,0 +1,69 @@
1
+ # Copyright 2018 Twitter, Inc.
2
+ # Licensed under the Apache License, Version 2.0
3
+ # http://www.apache.org/licenses/LICENSE-2.0
4
+
5
+ module Twitter
6
+ module TwitterText
7
+ # A module that provides base methods to rewrite usernames, lists, hashtags and URLs.
8
+ module Rewriter extend self
9
# Replace each entity's span of +text+ with whatever the given block returns for it.
#
# text     - the text to rewrite; it is converted with to_s and split via to_codepoint_a.
# entities - a collection of entities; each either responds to #indices or is indexable
#            with :indices, yielding a pair whose first/last members delimit the span
#            (the character at the last index is kept, i.e. the end is exclusive).
#
# Yields each entity, ordered by start index, together with the full codepoint array.
# Returns the rewritten String: untouched stretches of the text interleaved with the
# block's results.
def rewrite_entities(text, entities)
  codepoints = text.to_s.to_codepoint_a
  span_of = lambda { |entity| entity.respond_to?(:indices) ? entity.indices : entity[:indices] }

  # Walk the entities left to right so the cursor only ever moves forward.
  ordered = entities.sort_by { |entity| span_of.call(entity).first }

  pieces = []
  cursor = 0
  ordered.each do |entity|
    span = span_of.call(entity)
    pieces << codepoints[cursor...span.first] # text between the previous entity and this one
    pieces << yield(entity, codepoints)       # replacement supplied by the caller
    cursor = span.last
  end
  pieces << codepoints[cursor..-1]            # whatever follows the final entity

  pieces.flatten.join
end
29
+
30
+ # These methods are deprecated, will be removed in future.
31
+ extend Deprecation
32
+
33
# Run the hashtag, URL and mention/list rewriters one after another.
#
# text    - the text to rewrite.
# options - Hash that may map :hashtags, :urls and/or :usernames_or_lists to a callable.
#           Each callable that is present is passed as the block to the matching
#           rewrite_<key> method; stages without a callable are skipped.
#
# Each stage receives the output of the previous stage. Returns the rewritten text, or
# +text+ itself when no callable is supplied.
def rewrite(text, options = {})
  [:hashtags, :urls, :usernames_or_lists].inject(text) do |rewritten, key|
    # inject yields (accumulator, element). With a single block parameter the accumulator
    # was bound to `key`, so no option was ever found and the text came back untouched.
    handler = options[key]
    handler ? send(:"rewrite_#{key}", rewritten, &handler) : rewritten
  end
end
38
+ deprecate :rewrite, :rewrite_entities
39
+
40
# Rewrite every mention or list reference found in +text+.
#
# text - the text to scan; entities come from
#        Extractor.extract_mentions_or_lists_with_indices.
#
# Yields three values per entity: the character at the entity's start index (presumably
# the at sign - confirm against Extractor), the entity's :screen_name, and its :list_slug,
# or nil when the slug is missing or empty.
# Returns the text with each entity replaced by the block's result (see rewrite_entities).
def rewrite_usernames_or_lists(text)
  mentions = Extractor.extract_mentions_or_lists_with_indices(text)
  rewrite_entities(text, mentions) do |mention, codepoints|
    at_sign = codepoints[mention[:indices].first]
    slug = mention[:list_slug]
    # Treat an absent slug like an empty one instead of calling #empty? on nil.
    slug = nil if slug.nil? || slug.empty?
    yield(at_sign, mention[:screen_name], slug)
  end
end
49
+ deprecate :rewrite_usernames_or_lists, :rewrite_entities
50
+
51
# Rewrite every hashtag found in +text+.
#
# text - the text to scan; entities come from Extractor.extract_hashtags_with_indices.
#
# Yields two values per entity: the character at the entity's start index and the
# entity's :hashtag value.
# Returns the text with each entity replaced by the block's result (see rewrite_entities).
def rewrite_hashtags(text)
  hashtags = Extractor.extract_hashtags_with_indices(text)
  rewrite_entities(text, hashtags) do |tag, codepoints|
    start = tag[:indices].first
    yield(codepoints[start], tag[:hashtag])
  end
end
58
+ deprecate :rewrite_hashtags, :rewrite_entities
59
+
60
# Rewrite every URL found in +text+.
#
# text - the text to scan; entities come from Extractor.extract_urls_with_indices, called
#        with :extract_url_without_protocol => false.
#
# Yields the entity's :url value.
# Returns the text with each entity replaced by the block's result (see rewrite_entities).
def rewrite_urls(text)
  urls = Extractor.extract_urls_with_indices(text, :extract_url_without_protocol => false)
  # The codepoint array is not needed here; only the extracted URL is handed on.
  rewrite_entities(text, urls) { |url_entity, _codepoints| yield(url_entity[:url]) }
end
66
+ deprecate :rewrite_urls, :rewrite_entities
67
+ end
68
+ end
69
+ end
@@ -0,0 +1,31 @@
1
+ # Copyright 2018 Twitter, Inc.
2
+ # Licensed under the Apache License, Version 2.0
3
+ # http://www.apache.org/licenses/LICENSE-2.0
4
+
5
+ module Twitter
6
+ module TwitterText
7
+ # This module lazily defines constants of the form Uxxxx for all Unicode
8
+ # codepoints from U0000 to U10FFFF. The value of each constant is the
9
+ # UTF-8 string for the codepoint.
10
+ # Examples:
11
+ # copyright = Unicode::U00A9
12
+ # euro = Unicode::U20AC
13
+ # infinity = Unicode::U221E
14
+ #
15
module Unicode
  # Constant names treated as code points: "U", an optional underscore, then the code
  # point in hexadecimal - four or five digits, or "10" followed by four digits.
  # Capture 1 holds the hex digits.
  CODEPOINT_REGEX = /^U_?([0-9a-fA-F]{4,5}|10[0-9a-fA-F]{4})$/

  # Ruby's hook for a reference to a constant this module does not define yet.
  #
  # name - the missing constant's name.
  #
  # If the name has the code point form, the matching character is packed into a frozen
  # String, stored as a real constant under that name (so the hook is not needed for it
  # again) and returned. Any other name raises NameError.
  def self.const_missing(name)
    hex_digits = name.to_s[CODEPOINT_REGEX, 1]
    # Raise an error for constants that are not Unicode.
    raise NameError, "Uninitialized constant: Unicode::#{name}" unless hex_digits

    const_set(name, [hex_digits.to_i(16)].pack("U").freeze)
  end
end
30
+ end
31
+ end