twitter-text-relative 1.6.2.pre.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,362 @@
1
+ # encoding: UTF-8
2
+
3
+ module Twitter
4
+ # A collection of regular expressions for parsing Tweet text. The regular expression
5
+ # list is frozen at load time to ensure immutability. These regular expressions are
6
+ # used throughout the <tt>Twitter</tt> classes. Special care has been taken to make
7
+ # sure these regular expressions work with Tweets in all languages.
8
+ class Regex
9
+ REGEXEN = {} # :nodoc:
10
+
11
# Builds a character-range fragment suitable for interpolation into a
# character class. On Ruby 1.9+ ($RUBY_1_9 truthy) it emits \u{...}
# escapes; on 1.8 it emits the raw UTF-8 bytes for the codepoint(s).
# With only +from+ given, a single character/escape is returned.
def self.regex_range(from, to = nil) # :nodoc:
  if $RUBY_1_9
    # Lowercase hex, zero-padded to at least four digits — matches
    # to_s(16).rjust(4, '0') exactly.
    first = "\\u{%04x}" % from
    to ? "#{first}-#{"\\u{%04x}" % to}" : first
  else
    first = [from].pack('U')
    to ? "#{first}-#{[to].pack('U')}" : first
  end
end
26
+
27
+ # Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand
28
+ # to access both the list of characters and a pattern suitable for use with String#split
29
+ # Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE
30
+ UNICODE_SPACES = [
31
+ (0x0009..0x000D).to_a, # White_Space # Cc [5] <control-0009>..<control-000D>
32
+ 0x0020, # White_Space # Zs SPACE
33
+ 0x0085, # White_Space # Cc <control-0085>
34
+ 0x00A0, # White_Space # Zs NO-BREAK SPACE
35
+ 0x1680, # White_Space # Zs OGHAM SPACE MARK
36
+ 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
37
+ (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE
38
+ 0x2028, # White_Space # Zl LINE SEPARATOR
39
+ 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
40
+ 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
41
+ 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
42
+ 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
43
+ ].flatten.map{|c| [c].pack('U*')}.freeze
44
+ REGEXEN[:spaces] = /[#{UNICODE_SPACES.join('')}]/o
45
+
46
+ # Character not allowed in Tweets
47
+ INVALID_CHARACTERS = [
48
+ 0xFFFE, 0xFEFF, # BOM
49
+ 0xFFFF, # Special
50
+ 0x202A, 0x202B, 0x202C, 0x202D, 0x202E # Directional change
51
+ ].map{|cp| [cp].pack('U') }.freeze
52
+ REGEXEN[:invalid_control_characters] = /[#{INVALID_CHARACTERS.join('')}]/o
53
+
54
+ major, minor, patch = RUBY_VERSION.split('.')
55
+ if major.to_i >= 2 || major.to_i == 1 && minor.to_i >= 9 || (defined?(RUBY_ENGINE) && ["jruby", "rbx"].include?(RUBY_ENGINE))
56
+ REGEXEN[:list_name] = /[a-zA-Z][a-zA-Z0-9_\-\u0080-\u00ff]{0,24}/
57
+ else
58
+ # This line barfs at compile time in Ruby 1.9, JRuby, or Rubinius.
59
+ REGEXEN[:list_name] = eval("/[a-zA-Z][a-zA-Z0-9_\\-\x80-\xff]{0,24}/")
60
+ end
61
+
62
+ # Latin accented characters
63
+ # Excludes 0xd7 from the range (the multiplication sign, confusable with "x").
64
+ # Also excludes 0xf7, the division sign
65
+ LATIN_ACCENTS = [
66
+ regex_range(0xc0, 0xd6),
67
+ regex_range(0xd8, 0xf6),
68
+ regex_range(0xf8, 0xff),
69
+ regex_range(0x0100, 0x024f),
70
+ regex_range(0x0253, 0x0254),
71
+ regex_range(0x0256, 0x0257),
72
+ regex_range(0x0259),
73
+ regex_range(0x025b),
74
+ regex_range(0x0263),
75
+ regex_range(0x0268),
76
+ regex_range(0x026f),
77
+ regex_range(0x0272),
78
+ regex_range(0x0289),
79
+ regex_range(0x028b),
80
+ regex_range(0x02bb),
81
+ regex_range(0x0300, 0x036f),
82
+ regex_range(0x1e00, 0x1eff)
83
+ ].join('').freeze
84
+
85
+ RTL_CHARACTERS = [
86
+ regex_range(0x0600,0x06FF),
87
+ regex_range(0x0750,0x077F),
88
+ regex_range(0x0590,0x05FF),
89
+ regex_range(0xFE70,0xFEFF)
90
+ ].join('').freeze
91
+
92
+
93
+ NON_LATIN_HASHTAG_CHARS = [
94
+ # Cyrillic (Russian, Ukrainian, etc.)
95
+ regex_range(0x0400, 0x04ff), # Cyrillic
96
+ regex_range(0x0500, 0x0527), # Cyrillic Supplement
97
+ regex_range(0x2de0, 0x2dff), # Cyrillic Extended A
98
+ regex_range(0xa640, 0xa69f), # Cyrillic Extended B
99
+ regex_range(0x0591, 0x05bf), # Hebrew
100
+ regex_range(0x05c1, 0x05c2),
101
+ regex_range(0x05c4, 0x05c5),
102
+ regex_range(0x05c7),
103
+ regex_range(0x05d0, 0x05ea),
104
+ regex_range(0x05f0, 0x05f4),
105
+ regex_range(0xfb12, 0xfb28), # Hebrew Presentation Forms
106
+ regex_range(0xfb2a, 0xfb36),
107
+ regex_range(0xfb38, 0xfb3c),
108
+ regex_range(0xfb3e),
109
+ regex_range(0xfb40, 0xfb41),
110
+ regex_range(0xfb43, 0xfb44),
111
+ regex_range(0xfb46, 0xfb4f),
112
+ regex_range(0x0610, 0x061a), # Arabic
113
+ regex_range(0x0620, 0x065f),
114
+ regex_range(0x066e, 0x06d3),
115
+ regex_range(0x06d5, 0x06dc),
116
+ regex_range(0x06de, 0x06e8),
117
+ regex_range(0x06ea, 0x06ef),
118
+ regex_range(0x06fa, 0x06fc),
119
+ regex_range(0x06ff),
120
+ regex_range(0x0750, 0x077f), # Arabic Supplement
121
+ regex_range(0x08a0), # Arabic Extended A
122
+ regex_range(0x08a2, 0x08ac),
123
+ regex_range(0x08e4, 0x08fe),
124
+ regex_range(0xfb50, 0xfbb1), # Arabic Pres. Forms A
125
+ regex_range(0xfbd3, 0xfd3d),
126
+ regex_range(0xfd50, 0xfd8f),
127
+ regex_range(0xfd92, 0xfdc7),
128
+ regex_range(0xfdf0, 0xfdfb),
129
+ regex_range(0xfe70, 0xfe74), # Arabic Pres. Forms B
130
+ regex_range(0xfe76, 0xfefc),
131
+ regex_range(0x200c, 0x200c), # Zero-Width Non-Joiner
132
+ regex_range(0x0e01, 0x0e3a), # Thai
133
+ regex_range(0x0e40, 0x0e4e), # Thai (vowels and tone marks); Hangul (Korean) ranges follow
134
+ regex_range(0x1100, 0x11ff), # Hangul Jamo
135
+ regex_range(0x3130, 0x3185), # Hangul Compatibility Jamo
136
+ regex_range(0xA960, 0xA97F), # Hangul Jamo Extended-A
137
+ regex_range(0xAC00, 0xD7AF), # Hangul Syllables
138
+ regex_range(0xD7B0, 0xD7FF), # Hangul Jamo Extended-B
139
+ regex_range(0xFFA1, 0xFFDC) # Half-width Hangul
140
+ ].join('').freeze
141
+ REGEXEN[:latin_accents] = /[#{LATIN_ACCENTS}]+/o
142
+
143
+ CJ_HASHTAG_CHARACTERS = [
144
+ regex_range(0x30A1, 0x30FA), regex_range(0x30FC, 0x30FE), # Katakana (full-width)
145
+ regex_range(0xFF66, 0xFF9F), # Katakana (half-width)
146
+ regex_range(0xFF10, 0xFF19), regex_range(0xFF21, 0xFF3A), regex_range(0xFF41, 0xFF5A), # Latin (full-width)
147
+ regex_range(0x3041, 0x3096), regex_range(0x3099, 0x309E), # Hiragana
148
+ regex_range(0x3400, 0x4DBF), # Kanji (CJK Extension A)
149
+ regex_range(0x4E00, 0x9FFF), # Kanji (Unified)
150
+ regex_range(0x20000, 0x2A6DF), # Kanji (CJK Extension B)
151
+ regex_range(0x2A700, 0x2B73F), # Kanji (CJK Extension C)
152
+ regex_range(0x2B740, 0x2B81F), # Kanji (CJK Extension D)
153
+ regex_range(0x2F800, 0x2FA1F), regex_range(0x3003), regex_range(0x3005), regex_range(0x303B) # Kanji (CJK supplement)
154
+ ].join('').freeze
155
+
156
+ PUNCTUATION_CHARS = '!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~'
157
+ SPACE_CHARS = " \t\n\x0B\f\r"
158
+ CTRL_CHARS = "\x00-\x1F\x7F"
159
+
160
+ # A hashtag must contain latin characters, numbers and underscores, but not all numbers.
161
+ HASHTAG_ALPHA = /[a-z_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/io
162
+ HASHTAG_ALPHANUMERIC = /[a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/io
163
+ HASHTAG_BOUNDARY = /\A|\z|[^&a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/o
164
+
165
+ HASHTAG = /(#{HASHTAG_BOUNDARY})(#|#)(#{HASHTAG_ALPHANUMERIC}*#{HASHTAG_ALPHA}#{HASHTAG_ALPHANUMERIC}*)/io
166
+
167
+ REGEXEN[:valid_hashtag] = /#{HASHTAG}/io
168
+ # Used in Extractor for final filtering
169
+ REGEXEN[:end_hashtag_match] = /\A(?:[##]|:\/\/)/o
170
+
171
+ REGEXEN[:valid_mention_preceding_chars] = /(?:[^a-zA-Z0-9_!#\$%&*@@]|^|[rR][tT]:?)/o
172
+ REGEXEN[:at_signs] = /[@@]/
173
+ REGEXEN[:valid_mention_or_list] = /
174
+ (#{REGEXEN[:valid_mention_preceding_chars]}) # $1: Preceeding character
175
+ (#{REGEXEN[:at_signs]}) # $2: At mark
176
+ ([a-zA-Z0-9_]{1,20}) # $3: Screen name
177
+ (\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})? # $4: List (optional)
178
+ /ox
179
+ REGEXEN[:valid_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
180
+ # Used in Extractor for final filtering
181
+ REGEXEN[:end_mention_match] = /\A(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o
182
+
183
+ # URL related hash regex collection
184
+ REGEXEN[:valid_url_preceding_chars] = /(?:[^A-Z0-9@@$###{INVALID_CHARACTERS.join('')}]|^)/io
185
+ REGEXEN[:invalid_url_without_protocol_preceding_chars] = /[-_.\/]$/
186
+ DOMAIN_VALID_CHARS = "[^#{PUNCTUATION_CHARS}#{SPACE_CHARS}#{CTRL_CHARS}#{INVALID_CHARACTERS.join('')}#{UNICODE_SPACES.join('')}]"
187
+ REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
188
+ REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
189
+
190
+ REGEXEN[:valid_gTLD] = /(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|xxx)(?=[^0-9a-z]|$))/i
191
+ REGEXEN[:valid_ccTLD] = %r{
192
+ (?:
193
+ (?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|
194
+ ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|
195
+ gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|
196
+ lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|
197
+ pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|
198
+ th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)
199
+ (?=[^0-9a-z]|$)
200
+ )
201
+ }ix
202
+ REGEXEN[:valid_punycode] = /(?:xn--[0-9a-z]+)/i
203
+
204
+ REGEXEN[:valid_domain] = /(?:
205
+ #{REGEXEN[:valid_subdomain]}*#{REGEXEN[:valid_domain_name]}
206
+ (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
207
+ )/iox
208
+
209
+ # This is used in Extractor
210
+ REGEXEN[:valid_ascii_domain] = /
211
+ (?:(?:[A-Za-z0-9\-_]|#{REGEXEN[:latin_accents]})+\.)+
212
+ (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
213
+ /iox
214
+
215
+ # This is used in Extractor for stricter t.co URL extraction
216
+ REGEXEN[:valid_tco_url] = /^https?:\/\/t\.co\/[a-z0-9]+/i
217
+
218
+ # This is used in Extractor to filter out unwanted URLs.
219
+ REGEXEN[:invalid_short_domain] = /\A#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_ccTLD]}\Z/io
220
+
221
+ REGEXEN[:valid_port_number] = /[0-9]+/
222
+
223
+ REGEXEN[:valid_general_url_path_chars] = /[a-z0-9!\*';:=\+\,\.\$\/%#\[\]\-_~&|@#{LATIN_ACCENTS}]/io
224
+ # Allow URL paths to contain balanced parens
225
+ # 1. Used in Wikipedia URLs like /Primer_(film)
226
+ # 2. Used in IIS sessions like /S(dfd346)/
227
+ REGEXEN[:valid_url_balanced_parens] = /\(#{REGEXEN[:valid_general_url_path_chars]}+\)/io
228
+ # Valid end-of-path characters (so /foo. does not gobble the period).
229
+ # 1. Allow =&# for empty URL parameters and other URL-join artifacts
230
+ REGEXEN[:valid_url_path_ending_chars] = /[a-z0-9=_#\/\+\-#{LATIN_ACCENTS}]|(?:#{REGEXEN[:valid_url_balanced_parens]})/io
231
+ REGEXEN[:valid_url_path] = /(?:
232
+ (?:
233
+ #{REGEXEN[:valid_general_url_path_chars]}*
234
+ (?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)*
235
+ #{REGEXEN[:valid_url_path_ending_chars]}
236
+ )|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/)
237
+ )/iox
238
+
239
+ REGEXEN[:valid_url_query_chars] = /[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]/i
240
+ REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/]/i
241
+ REGEXEN[:valid_url] = %r{
242
+ ( # $1 total match
243
+ (#{REGEXEN[:valid_url_preceding_chars]}) # $2 Preceeding chracter
244
+ ( # $3 URL
245
+ (https?:\/\/)? # $4 Protocol (optional)
246
+ (#{REGEXEN[:valid_domain]}) # $5 Domain(s)
247
+ (?::(#{REGEXEN[:valid_port_number]}))? # $6 Port number (optional)
248
+ (/#{REGEXEN[:valid_url_path]}*)? # $7 URL Path and anchor
249
+ (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # $8 Query String
250
+ )
251
+ )
252
+ }iox;
253
+
254
+ REGEXEN[:cashtag] = /[a-z]{1,6}(?:[._][a-z]{1,2})?/i
255
+ REGEXEN[:valid_cashtag] = /(^|#{REGEXEN[:spaces]})(\$)(#{REGEXEN[:cashtag]})(?=$|\s|[#{PUNCTUATION_CHARS}])/i
256
+
257
+ # These URL validation pattern strings are based on the ABNF from RFC 3986
258
+ REGEXEN[:validate_url_unreserved] = /[a-z0-9\-._~]/i
259
+ REGEXEN[:validate_url_pct_encoded] = /(?:%[0-9a-f]{2})/i
260
+ REGEXEN[:validate_url_sub_delims] = /[!$&'()*+,;=]/i
261
+ REGEXEN[:validate_url_pchar] = /(?:
262
+ #{REGEXEN[:validate_url_unreserved]}|
263
+ #{REGEXEN[:validate_url_pct_encoded]}|
264
+ #{REGEXEN[:validate_url_sub_delims]}|
265
+ [:\|@]
266
+ )/iox
267
+
268
+ REGEXEN[:validate_url_scheme] = /(?:[a-z][a-z0-9+\-.]*)/i
269
+ REGEXEN[:validate_url_userinfo] = /(?:
270
+ #{REGEXEN[:validate_url_unreserved]}|
271
+ #{REGEXEN[:validate_url_pct_encoded]}|
272
+ #{REGEXEN[:validate_url_sub_delims]}|
273
+ :
274
+ )*/iox
275
+
276
+ REGEXEN[:validate_url_dec_octet] = /(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))/i
277
+ REGEXEN[:validate_url_ipv4] =
278
+ /(?:#{REGEXEN[:validate_url_dec_octet]}(?:\.#{REGEXEN[:validate_url_dec_octet]}){3})/iox
279
+
280
+ # Punting on real IPv6 validation for now
281
+ REGEXEN[:validate_url_ipv6] = /(?:\[[a-f0-9:\.]+\])/i
282
+
283
+ # Also punting on IPvFuture for now
284
+ REGEXEN[:validate_url_ip] = /(?:
285
+ #{REGEXEN[:validate_url_ipv4]}|
286
+ #{REGEXEN[:validate_url_ipv6]}
287
+ )/iox
288
+
289
+ # This is more strict than the rfc specifies
290
+ REGEXEN[:validate_url_subdomain_segment] = /(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)/i
291
+ REGEXEN[:validate_url_domain_segment] = /(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)/i
292
+ REGEXEN[:validate_url_domain_tld] = /(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)/i
293
+ REGEXEN[:validate_url_domain] = /(?:(?:#{REGEXEN[:validate_url_subdomain_segment]}\.)*
294
+ (?:#{REGEXEN[:validate_url_domain_segment]}\.)
295
+ #{REGEXEN[:validate_url_domain_tld]})/iox
296
+
297
+ REGEXEN[:validate_url_host] = /(?:
298
+ #{REGEXEN[:validate_url_ip]}|
299
+ #{REGEXEN[:validate_url_domain]}
300
+ )/iox
301
+
302
+ # Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences
303
+ REGEXEN[:validate_url_unicode_subdomain_segment] =
304
+ /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9_\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
305
+ REGEXEN[:validate_url_unicode_domain_segment] =
306
+ /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
307
+ REGEXEN[:validate_url_unicode_domain_tld] =
308
+ /(?:(?:[a-z]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
309
+ REGEXEN[:validate_url_unicode_domain] = /(?:(?:#{REGEXEN[:validate_url_unicode_subdomain_segment]}\.)*
310
+ (?:#{REGEXEN[:validate_url_unicode_domain_segment]}\.)
311
+ #{REGEXEN[:validate_url_unicode_domain_tld]})/iox
312
+
313
+ REGEXEN[:validate_url_unicode_host] = /(?:
314
+ #{REGEXEN[:validate_url_ip]}|
315
+ #{REGEXEN[:validate_url_unicode_domain]}
316
+ )/iox
317
+
318
+ REGEXEN[:validate_url_port] = /[0-9]{1,5}/
319
+
320
+ REGEXEN[:validate_url_unicode_authority] = %r{
321
+ (?:(#{REGEXEN[:validate_url_userinfo]})@)? # $1 userinfo
322
+ (#{REGEXEN[:validate_url_unicode_host]}) # $2 host
323
+ (?::(#{REGEXEN[:validate_url_port]}))? # $3 port
324
+ }iox
325
+
326
+ REGEXEN[:validate_url_authority] = %r{
327
+ (?:(#{REGEXEN[:validate_url_userinfo]})@)? # $1 userinfo
328
+ (#{REGEXEN[:validate_url_host]}) # $2 host
329
+ (?::(#{REGEXEN[:validate_url_port]}))? # $3 port
330
+ }iox
331
+
332
+ REGEXEN[:validate_url_path] = %r{(/#{REGEXEN[:validate_url_pchar]}*)*}i
333
+ REGEXEN[:validate_url_query] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
334
+ REGEXEN[:validate_url_fragment] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
335
+
336
+ # Modified version of RFC 3986 Appendix B
337
+ REGEXEN[:validate_url_unencoded] = %r{
338
+ \A # Full URL
339
+ (?:
340
+ ([^:/?#]+):// # $1 Scheme
341
+ )?
342
+ ([^/?#]*) # $2 Authority
343
+ ([^?#]*) # $3 Path
344
+ (?:
345
+ \?([^#]*) # $4 Query
346
+ )?
347
+ (?:
348
+ \#(.*) # $5 Fragment
349
+ )?\Z
350
+ }ix
351
+
352
+ REGEXEN[:rtl_chars] = /[#{RTL_CHARACTERS}]/io
353
+
354
+ REGEXEN.each_pair{|k,v| v.freeze }
355
+
356
+ # Return the regular expression for a given <tt>key</tt>. If the <tt>key</tt>
357
+ # is not a known symbol a <tt>nil</tt> will be returned.
358
+ def self.[](key)
359
+ REGEXEN[key]
360
+ end
361
+ end
362
+ end
@@ -0,0 +1,59 @@
1
+ module Twitter
2
+ # A module provides base methods to rewrite usernames, lists, hashtags and URLs.
3
+ module Rewriter extend self
4
# Splices the output of the given block into +text+ at the positions
# described by +entities+ (each entity carries an [start, end] pair
# under :indices). Text outside the entities is copied through
# unchanged; the pieces are flattened and joined into one string.
def rewrite_entities(text, entities)
  chars = text.to_s.to_char_a

  # Process entities left-to-right by their start offset.
  ordered = entities.sort_by { |entity| entity[:indices].first }

  pieces = []
  cursor = 0
  ordered.each do |entity|
    start_index = entity[:indices].first
    pieces << chars[cursor...start_index]
    pieces << yield(entity, chars)
    cursor = entity[:indices].last
  end
  pieces << chars[cursor..-1]

  pieces.flatten.join
end
20
+
21
+ # These methods are deprecated, will be removed in future.
22
+ extend Deprecation
23
+
24
# Rewrites hashtags, URLs, and usernames/lists in +text+ using the
# callbacks supplied in +options+ under :hashtags, :urls, and
# :usernames_or_lists. Each rewriter receives the text produced by the
# previous one, so the accumulator must be threaded through the fold.
def rewrite(text, options = {})
  [:hashtags, :urls, :usernames_or_lists].inject(text) do |current_text, key|
    # BUG FIX: the original block took a single parameter, which bound
    # the inject accumulator to +key+; options[key] was then looked up
    # with the text string itself and no callback ever ran.
    options[key] ? send(:"rewrite_#{key}", current_text, &options[key]) : current_text
  end
end
29
+ deprecate :rewrite, :rewrite_entities
30
+
31
# Rewrites @mentions and @user/list references via the caller's block,
# which receives the at-mark character, the screen name, and the list
# slug (nil when the mention has no list part).
def rewrite_usernames_or_lists(text)
  entities = Extractor.extract_mentions_or_lists_with_indices(text)
  rewrite_entities(text, entities) do |entity, chars|
    at_mark = chars[entity[:indices].first]
    slug = entity[:list_slug]
    slug = nil if slug.empty?
    yield(at_mark, entity[:screen_name], slug)
  end
end
40
+ deprecate :rewrite_usernames_or_lists, :rewrite_entities
41
+
42
# Rewrites #hashtags via the caller's block, which receives the hash
# character and the hashtag text (without the leading #).
def rewrite_hashtags(text)
  entities = Extractor.extract_hashtags_with_indices(text)
  rewrite_entities(text, entities) do |entity, chars|
    hash_mark = chars[entity[:indices].first]
    yield(hash_mark, entity[:hashtag])
  end
end
49
+ deprecate :rewrite_hashtags, :rewrite_entities
50
+
51
# Rewrites URLs via the caller's block, which receives each extracted
# URL. Protocol-less URLs are deliberately not extracted here.
def rewrite_urls(text)
  entities = Extractor.extract_urls_with_indices(text, :extract_url_without_protocol => false)
  rewrite_entities(text, entities) do |entity, _chars|
    yield(entity[:url])
  end
end
57
+ deprecate :rewrite_urls, :rewrite_entities
58
+ end
59
+ end
@@ -0,0 +1,26 @@
1
+ module Twitter
2
+ # This module lazily defines constants of the form Uxxxx for all Unicode
3
+ # codepoints from U0000 to U10FFFF. The value of each constant is the
4
+ # UTF-8 string for the codepoint.
5
+ # Examples:
6
+ # copyright = Unicode::U00A9
7
+ # euro = Unicode::U20AC
8
+ # infinity = Unicode::U221E
9
+ #
10
module Unicode
  # Accepts "U" (optionally "U_") followed by 4-5 hex digits, or a
  # supplementary-plane codepoint 10xxxx (up to U+10FFFF).
  # Anchored with \A/\z so the ENTIRE name must match; the original
  # ^/$ anchors match per-line and could be fooled by a multiline
  # string passed through const_get.
  CODEPOINT_REGEX = /\AU_?([0-9a-fA-F]{4,5}|10[0-9a-fA-F]{4})\z/

  # Lazily resolves constants of the form Uxxxx. For names matching
  # CODEPOINT_REGEX, defines a real constant holding the frozen UTF-8
  # string for the codepoint (memoizing it) and returns that value.
  # Raises NameError for any other constant name.
  def self.const_missing(name)
    if name.to_s =~ CODEPOINT_REGEX
      const_set(name, [$1.to_i(16)].pack("U").freeze)
    else # Raise an error for constants that are not Unicode.
      raise NameError, "Uninitialized constant: Unicode::#{name}"
    end
  end
end
25
+
26
+ end
@@ -0,0 +1,113 @@
1
+ require 'unf'
2
+
3
+ module Twitter
4
+ module Validation extend self
5
+ MAX_LENGTH = 140
6
+
7
+ DEFAULT_TCO_URL_LENGTHS = {
8
+ :short_url_length => 22,
9
+ :short_url_length_https => 23,
10
+ :characters_reserved_per_media => 22
11
+ }.freeze
12
+
13
+ # Returns the length of the string as it would be displayed. This is equivalent to the length of the Unicode NFC
14
+ # (See: http://www.unicode.org/reports/tr15). This is needed in order to consistently calculate the length of a
15
+ # string no matter which actual form was transmitted. For example:
16
+ #
17
+ # U+0065 Latin Small Letter E
18
+ # + U+0301 Combining Acute Accent
19
+ # ----------
20
+ # = 2 bytes, 2 characters, displayed as é (1 visual glyph)
21
+ # … The NFC of {U+0065, U+0301} is {U+00E9}, which is a single character and a +display_length+ of 1
22
+ #
23
+ # The string could also contain U+00E9 already, in which case the canonicalization will not change the value.
24
+ #
25
# Returns the display length of +text+: the number of Unicode
# codepoints in its NFC normalization, with every extracted URL
# counted at the fixed t.co wrapped length instead of its own length.
# +options+ may override the t.co lengths from DEFAULT_TCO_URL_LENGTHS.
def tweet_length(text, options = {})
  opts = DEFAULT_TCO_URL_LENGTHS.merge(options)

  length = text.to_nfc.unpack("U*").length

  Twitter::Extractor.extract_urls_with_indices(text) do |url, start_pos, end_pos|
    # Remove the URL's own characters, then add the fixed t.co length
    # (https links wrap one character longer than http links).
    length -= (end_pos - start_pos)
    length += (url.downcase =~ /^https:\/\//) ? opts[:short_url_length_https] : opts[:short_url_length]
  end

  length
end
37
+
38
+ # Check the <tt>text</tt> for any reason that it may not be valid as a Tweet. This is meant as a pre-validation
39
+ # before posting to api.twitter.com. There are several server-side reasons for Tweets to fail but this pre-validation
40
+ # will allow quicker feedback.
41
+ #
42
+ # Returns <tt>false</tt> if this <tt>text</tt> is valid. Otherwise one of the following Symbols will be returned:
43
+ #
44
+ # <tt>:too_long</tt>:: if the <tt>text</tt> is too long
45
+ # <tt>:empty</tt>:: if the <tt>text</tt> is nil or empty
46
+ # <tt>:invalid_characters</tt>:: if the <tt>text</tt> contains non-Unicode or any of the disallowed Unicode characters
47
# Pre-validates +text+ as Tweet content before posting to the API.
# Returns false when the text is valid; otherwise one of
# :empty, :too_long, or :invalid_characters.
def tweet_invalid?(text)
  return :empty if !text || text.empty?

  begin
    return :too_long if tweet_length(text) > MAX_LENGTH
    contains_invalid = Twitter::Regex::INVALID_CHARACTERS.any? { |bad_char| text.include?(bad_char) }
    return :invalid_characters if contains_invalid
  rescue ArgumentError
    # Raised for non-Unicode byte sequences.
    return :invalid_characters
  end

  false
end
59
+
60
# True when tweet_invalid? reports no problem with +text+.
def valid_tweet_text?(text)
  tweet_invalid?(text) ? false : true
end
63
+
64
# Returns true when +username+ is exactly one extractable @mention,
# i.e. "@" followed by the extracted screen name.
def valid_username?(username)
  return false if !username || username.empty?

  names = Twitter::Extractor.extract_mentioned_screen_names(username)
  # Extraction drops the leading @ sign, hence username[1..-1].
  names.length == 1 && names[0] == username[1..-1]
end
71
+
72
# Anchored so the whole string must be a single mention-or-list.
VALID_LIST_RE = /\A#{Twitter::Regex[:valid_mention_or_list]}\z/o
# Returns true only for a complete "@screen_name/list_slug" string:
# no preceding characters ($1 empty) and a non-empty list part ($4).
def valid_list?(username_list)
  match = username_list.match(VALID_LIST_RE)
  return false if match.nil?
  match[1] == "" && !match[4].nil? && !match[4].empty?
end
78
+
79
# Returns true when +hashtag+ is exactly one extractable hashtag,
# i.e. "#" followed by the extracted hashtag text.
def valid_hashtag?(hashtag)
  return false if !hashtag || hashtag.empty?

  tags = Twitter::Extractor.extract_hashtags(hashtag)
  # Extraction drops the leading # sign, hence hashtag[1..-1].
  tags.length == 1 && tags[0] == hashtag[1..-1]
end
86
+
87
# Validates +url+ against the RFC-3986-based patterns. When
# +require_protocol+ is set, only http/https schemes are accepted.
# +unicode_domains+ selects between the unicode-aware and ASCII-only
# authority patterns. Returns a boolean.
def valid_url?(url, unicode_domains=true, require_protocol=true)
  return false if !url || url.empty?

  parts = url.match(Twitter::Regex[:validate_url_unencoded])
  # The pattern must consume the entire input.
  return false unless parts && parts.to_s == url

  scheme, authority, path, query, fragment = parts.captures

  if require_protocol
    scheme_ok = valid_match?(scheme, Twitter::Regex[:validate_url_scheme]) &&
                scheme.match(/\Ahttps?\Z/i)
    return false unless scheme_ok
  end
  return false unless valid_match?(path, Twitter::Regex[:validate_url_path])
  # Query and fragment are optional components.
  return false unless valid_match?(query, Twitter::Regex[:validate_url_query], true)
  return false unless valid_match?(fragment, Twitter::Regex[:validate_url_fragment], true)

  if unicode_domains
    !!valid_match?(authority, Twitter::Regex[:validate_url_unicode_authority])
  else
    !!valid_match?(authority, Twitter::Regex[:validate_url_authority])
  end
end
104
+
105
+ private
106
+
107
# Does +regex+ match ALL of +string+? When +optional+ is set, a nil
# string is also considered valid (the component may be absent).
def valid_match?(string, regex, optional=false)
  if optional
    !(string && (!string.match(regex) || $~.to_s != string))
  else
    string && string.match(regex) && $~.to_s == string
  end
end
112
+ end
113
+ end