ritter 0.0.87 → 0.0.88

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,366 @@
1
+ # encoding: UTF-8
2
+
3
+ module Twitter
4
+ # A collection of regular expressions for parsing Tweet text. The regular expression
5
+ # list is frozen at load time to ensure immutability. These regular expressions are
6
+ # used throughout the <tt>Twitter</tt> classes. Special care has been taken to make
7
+ # sure these regular expressions work with Tweets in all languages.
8
+ class Regex
9
+ REGEXEN = {} # :nodoc:
10
+
11
+ def self.regex_range(from, to = nil) # :nodoc:
12
+ if $RUBY_1_9
13
+ if to
14
+ "\\u{#{from.to_s(16).rjust(4, '0')}}-\\u{#{to.to_s(16).rjust(4, '0')}}"
15
+ else
16
+ "\\u{#{from.to_s(16).rjust(4, '0')}}"
17
+ end
18
+ else
19
+ if to
20
+ [from].pack('U') + '-' + [to].pack('U')
21
+ else
22
+ [from].pack('U')
23
+ end
24
+ end
25
+ end
26
+
27
+ # Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand
28
+ # to access both the list of characters and a pattern suitable for use with String#split
29
+ # Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE
30
+ UNICODE_SPACES = [
31
+ (0x0009..0x000D).to_a, # White_Space # Cc [5] <control-0009>..<control-000D>
32
+ 0x0020, # White_Space # Zs SPACE
33
+ 0x0085, # White_Space # Cc <control-0085>
34
+ 0x00A0, # White_Space # Zs NO-BREAK SPACE
35
+ 0x1680, # White_Space # Zs OGHAM SPACE MARK
36
+ 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
37
+ (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE
38
+ 0x2028, # White_Space # Zl LINE SEPARATOR
39
+ 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
40
+ 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
41
+ 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
42
+ 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
43
+ ].flatten.map{|c| [c].pack('U*')}.freeze
44
+ REGEXEN[:spaces] = /[#{UNICODE_SPACES.join('')}]/o
45
+
46
+ # Characters not allowed in Tweets
47
+ INVALID_CHARACTERS = [
48
+ 0xFFFE, 0xFEFF, # BOM
49
+ 0xFFFF, # Special
50
+ 0x202A, 0x202B, 0x202C, 0x202D, 0x202E # Directional change
51
+ ].map{|cp| [cp].pack('U') }.freeze
52
+ REGEXEN[:invalid_control_characters] = /[#{INVALID_CHARACTERS.join('')}]/o
53
+
54
+ major, minor, patch = RUBY_VERSION.split('.')
55
+ if major.to_i >= 2 || major.to_i == 1 && minor.to_i >= 9 || (defined?(RUBY_ENGINE) && ["jruby", "rbx"].include?(RUBY_ENGINE))
56
+ REGEXEN[:list_name] = /[a-zA-Z][a-zA-Z0-9_\-\u0080-\u00ff]{0,24}/
57
+ else
58
+ # This line barfs at compile time in Ruby 1.9, JRuby, or Rubinius.
59
+ REGEXEN[:list_name] = eval("/[a-zA-Z][a-zA-Z0-9_\\-\x80-\xff]{0,24}/")
60
+ end
61
+
62
+ # Latin accented characters
63
+ # Excludes 0xd7 from the range (the multiplication sign, confusable with "x").
64
+ # Also excludes 0xf7, the division sign
65
+ LATIN_ACCENTS = [
66
+ regex_range(0xc0, 0xd6),
67
+ regex_range(0xd8, 0xf6),
68
+ regex_range(0xf8, 0xff),
69
+ regex_range(0x0100, 0x024f),
70
+ regex_range(0x0253, 0x0254),
71
+ regex_range(0x0256, 0x0257),
72
+ regex_range(0x0259),
73
+ regex_range(0x025b),
74
+ regex_range(0x0263),
75
+ regex_range(0x0268),
76
+ regex_range(0x026f),
77
+ regex_range(0x0272),
78
+ regex_range(0x0289),
79
+ regex_range(0x028b),
80
+ regex_range(0x02bb),
81
+ regex_range(0x0300, 0x036f),
82
+ regex_range(0x1e00, 0x1eff)
83
+ ].join('').freeze
84
+
85
+ RTL_CHARACTERS = [
86
+ regex_range(0x0600,0x06FF),
87
+ regex_range(0x0750,0x077F),
88
+ regex_range(0x0590,0x05FF),
89
+ regex_range(0xFE70,0xFEFF)
90
+ ].join('').freeze
91
+
92
+
93
+ NON_LATIN_HASHTAG_CHARS = [
94
+ # Cyrillic (Russian, Ukrainian, etc.)
95
+ regex_range(0x0400, 0x04ff), # Cyrillic
96
+ regex_range(0x0500, 0x0527), # Cyrillic Supplement
97
+ regex_range(0x2de0, 0x2dff), # Cyrillic Extended A
98
+ regex_range(0xa640, 0xa69f), # Cyrillic Extended B
99
+ regex_range(0x0591, 0x05bf), # Hebrew
100
+ regex_range(0x05c1, 0x05c2),
101
+ regex_range(0x05c4, 0x05c5),
102
+ regex_range(0x05c7),
103
+ regex_range(0x05d0, 0x05ea),
104
+ regex_range(0x05f0, 0x05f4),
105
+ regex_range(0xfb12, 0xfb28), # Hebrew Presentation Forms
106
+ regex_range(0xfb2a, 0xfb36),
107
+ regex_range(0xfb38, 0xfb3c),
108
+ regex_range(0xfb3e),
109
+ regex_range(0xfb40, 0xfb41),
110
+ regex_range(0xfb43, 0xfb44),
111
+ regex_range(0xfb46, 0xfb4f),
112
+ regex_range(0x0610, 0x061a), # Arabic
113
+ regex_range(0x0620, 0x065f),
114
+ regex_range(0x066e, 0x06d3),
115
+ regex_range(0x06d5, 0x06dc),
116
+ regex_range(0x06de, 0x06e8),
117
+ regex_range(0x06ea, 0x06ef),
118
+ regex_range(0x06fa, 0x06fc),
119
+ regex_range(0x06ff),
120
+ regex_range(0x0750, 0x077f), # Arabic Supplement
121
+ regex_range(0x08a0), # Arabic Extended A
122
+ regex_range(0x08a2, 0x08ac),
123
+ regex_range(0x08e4, 0x08fe),
124
+ regex_range(0xfb50, 0xfbb1), # Arabic Pres. Forms A
125
+ regex_range(0xfbd3, 0xfd3d),
126
+ regex_range(0xfd50, 0xfd8f),
127
+ regex_range(0xfd92, 0xfdc7),
128
+ regex_range(0xfdf0, 0xfdfb),
129
+ regex_range(0xfe70, 0xfe74), # Arabic Pres. Forms B
130
+ regex_range(0xfe76, 0xfefc),
131
+ regex_range(0x200c, 0x200c), # Zero-Width Non-Joiner
132
+ regex_range(0x0e01, 0x0e3a), # Thai
133
+ regex_range(0x0e40, 0x0e4e), # Thai (continued)
134
+ regex_range(0x1100, 0x11ff), # Hangul Jamo
135
+ regex_range(0x3130, 0x3185), # Hangul Compatibility Jamo
136
+ regex_range(0xA960, 0xA97F), # Hangul Jamo Extended-A
137
+ regex_range(0xAC00, 0xD7AF), # Hangul Syllables
138
+ regex_range(0xD7B0, 0xD7FF), # Hangul Jamo Extended-B
139
+ regex_range(0xFFA1, 0xFFDC) # Half-width Hangul
140
+ ].join('').freeze
141
+ REGEXEN[:latin_accents] = /[#{LATIN_ACCENTS}]+/o
142
+
143
+ CJ_HASHTAG_CHARACTERS = [
144
+ regex_range(0x30A1, 0x30FA), regex_range(0x30FC, 0x30FE), # Katakana (full-width)
145
+ regex_range(0xFF66, 0xFF9F), # Katakana (half-width)
146
+ regex_range(0xFF10, 0xFF19), regex_range(0xFF21, 0xFF3A), regex_range(0xFF41, 0xFF5A), # Latin (full-width)
147
+ regex_range(0x3041, 0x3096), regex_range(0x3099, 0x309E), # Hiragana
148
+ regex_range(0x3400, 0x4DBF), # Kanji (CJK Extension A)
149
+ regex_range(0x4E00, 0x9FFF), # Kanji (Unified)
150
+ regex_range(0x20000, 0x2A6DF), # Kanji (CJK Extension B)
151
+ regex_range(0x2A700, 0x2B73F), # Kanji (CJK Extension C)
152
+ regex_range(0x2B740, 0x2B81F), # Kanji (CJK Extension D)
153
+ regex_range(0x2F800, 0x2FA1F), regex_range(0x3003), regex_range(0x3005), regex_range(0x303B) # Kanji (CJK supplement)
154
+ ].join('').freeze
155
+
156
+ PUNCTUATION_CHARS = '!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~'
157
+ SPACE_CHARS = " \t\n\x0B\f\r"
158
+ CTRL_CHARS = "\x00-\x1F\x7F"
159
+
160
+ # A hashtag must contain latin characters, numbers and underscores, but not all numbers.
161
+ HASHTAG_ALPHA = /[a-z_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/io
162
+ HASHTAG_ALPHANUMERIC = /[a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/io
163
+ HASHTAG_BOUNDARY = /\A|\z|[^&a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/o
164
+
165
+ # Original:
166
+ #HASHTAG = /(#{HASHTAG_BOUNDARY})(#|#)(#{HASHTAG_ALPHANUMERIC}*#{HASHTAG_ALPHA}#{HASHTAG_ALPHANUMERIC}*)/io
167
+
168
+ # The new one:
169
+ HASHTAG = /[#].\S+/io
170
+
171
+ REGEXEN[:valid_hashtag] = /#{HASHTAG}/io
172
+ # Used in Extractor for final filtering
173
+ REGEXEN[:end_hashtag_match] = /\A(?:[##]|:\/\/)/o
174
+
175
+ REGEXEN[:valid_mention_preceding_chars] = /(?:[^a-zA-Z0-9_!#\$%&*@@]|^|RT:?)/o
176
+ REGEXEN[:at_signs] = /[@@]/
177
+ REGEXEN[:valid_mention_or_list] = /
178
+ (#{REGEXEN[:valid_mention_preceding_chars]}) # $1: Preceding character
179
+ (#{REGEXEN[:at_signs]}) # $2: At mark
180
+ ([a-zA-Z0-9_]{1,20}) # $3: Screen name
181
+ (\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})? # $4: List (optional)
182
+ /ox
183
+ REGEXEN[:valid_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
184
+ # Used in Extractor for final filtering
185
+ REGEXEN[:end_mention_match] = /\A(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o
186
+
187
+ # URL related hash regex collection
188
+ REGEXEN[:valid_url_preceding_chars] = /(?:[^A-Z0-9@@$###{INVALID_CHARACTERS.join('')}]|^)/io
189
+ REGEXEN[:invalid_url_without_protocol_preceding_chars] = /[-_.\/]$/
190
+ DOMAIN_VALID_CHARS = "[^#{PUNCTUATION_CHARS}#{SPACE_CHARS}#{CTRL_CHARS}#{INVALID_CHARACTERS.join('')}#{UNICODE_SPACES.join('')}]"
191
+ REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
192
+ REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
193
+
194
+ REGEXEN[:valid_gTLD] = /(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|xxx)(?=[^0-9a-z]|$))/i
195
+ REGEXEN[:valid_ccTLD] = %r{
196
+ (?:
197
+ (?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|
198
+ ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|
199
+ gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|
200
+ lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|
201
+ pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|
202
+ th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)
203
+ (?=[^0-9a-z]|$)
204
+ )
205
+ }ix
206
+ REGEXEN[:valid_punycode] = /(?:xn--[0-9a-z]+)/i
207
+
208
+ REGEXEN[:valid_domain] = /(?:
209
+ #{REGEXEN[:valid_subdomain]}*#{REGEXEN[:valid_domain_name]}
210
+ (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
211
+ )/iox
212
+
213
+ # This is used in Extractor
214
+ REGEXEN[:valid_ascii_domain] = /
215
+ (?:(?:[A-Za-z0-9\-_]|#{REGEXEN[:latin_accents]})+\.)+
216
+ (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
217
+ /iox
218
+
219
+ # This is used in Extractor for stricter t.co URL extraction
220
+ REGEXEN[:valid_tco_url] = /^https?:\/\/t\.co\/[a-z0-9]+/i
221
+
222
+ # This is used in Extractor to filter out unwanted URLs.
223
+ REGEXEN[:invalid_short_domain] = /\A#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_ccTLD]}\Z/io
224
+
225
+ REGEXEN[:valid_port_number] = /[0-9]+/
226
+
227
+ REGEXEN[:valid_general_url_path_chars] = /[a-z0-9!\*';:=\+\,\.\$\/%#\[\]\-_~&|@#{LATIN_ACCENTS}]/io
228
+ # Allow URL paths to contain balanced parens
229
+ # 1. Used in Wikipedia URLs like /Primer_(film)
230
+ # 2. Used in IIS sessions like /S(dfd346)/
231
+ REGEXEN[:valid_url_balanced_parens] = /\(#{REGEXEN[:valid_general_url_path_chars]}+\)/io
232
+ # Valid end-of-path characters (so /foo. does not gobble the period).
233
+ # 1. Allow =&# for empty URL parameters and other URL-join artifacts
234
+ REGEXEN[:valid_url_path_ending_chars] = /[a-z0-9=_#\/\+\-#{LATIN_ACCENTS}]|(?:#{REGEXEN[:valid_url_balanced_parens]})/io
235
+ REGEXEN[:valid_url_path] = /(?:
236
+ (?:
237
+ #{REGEXEN[:valid_general_url_path_chars]}*
238
+ (?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)*
239
+ #{REGEXEN[:valid_url_path_ending_chars]}
240
+ )|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/)
241
+ )/iox
242
+
243
+ REGEXEN[:valid_url_query_chars] = /[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]/i
244
+ REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/]/i
245
+ REGEXEN[:valid_url] = %r{
246
+ ( # $1 total match
247
+ (#{REGEXEN[:valid_url_preceding_chars]}) # $2 Preceding character
248
+ ( # $3 URL
249
+ (https?:\/\/)? # $4 Protocol (optional)
250
+ (#{REGEXEN[:valid_domain]}) # $5 Domain(s)
251
+ (?::(#{REGEXEN[:valid_port_number]}))? # $6 Port number (optional)
252
+ (/#{REGEXEN[:valid_url_path]}*)? # $7 URL Path and anchor
253
+ (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # $8 Query String
254
+ )
255
+ )
256
+ }iox;
257
+
258
+ REGEXEN[:cashtag] = /[a-z]{1,6}(?:[._][a-z]{1,2})?/i
259
+ REGEXEN[:valid_cashtag] = /(^|#{REGEXEN[:spaces]})(\$)(#{REGEXEN[:cashtag]})(?=$|\s|[#{PUNCTUATION_CHARS}])/i
260
+
261
+ # These URL validation pattern strings are based on the ABNF from RFC 3986
262
+ REGEXEN[:validate_url_unreserved] = /[a-z0-9\-._~]/i
263
+ REGEXEN[:validate_url_pct_encoded] = /(?:%[0-9a-f]{2})/i
264
+ REGEXEN[:validate_url_sub_delims] = /[!$&'()*+,;=]/i
265
+ REGEXEN[:validate_url_pchar] = /(?:
266
+ #{REGEXEN[:validate_url_unreserved]}|
267
+ #{REGEXEN[:validate_url_pct_encoded]}|
268
+ #{REGEXEN[:validate_url_sub_delims]}|
269
+ [:\|@]
270
+ )/iox
271
+
272
+ REGEXEN[:validate_url_scheme] = /(?:[a-z][a-z0-9+\-.]*)/i
273
+ REGEXEN[:validate_url_userinfo] = /(?:
274
+ #{REGEXEN[:validate_url_unreserved]}|
275
+ #{REGEXEN[:validate_url_pct_encoded]}|
276
+ #{REGEXEN[:validate_url_sub_delims]}|
277
+ :
278
+ )*/iox
279
+
280
+ REGEXEN[:validate_url_dec_octet] = /(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))/i
281
+ REGEXEN[:validate_url_ipv4] =
282
+ /(?:#{REGEXEN[:validate_url_dec_octet]}(?:\.#{REGEXEN[:validate_url_dec_octet]}){3})/iox
283
+
284
+ # Punting on real IPv6 validation for now
285
+ REGEXEN[:validate_url_ipv6] = /(?:\[[a-f0-9:\.]+\])/i
286
+
287
+ # Also punting on IPvFuture for now
288
+ REGEXEN[:validate_url_ip] = /(?:
289
+ #{REGEXEN[:validate_url_ipv4]}|
290
+ #{REGEXEN[:validate_url_ipv6]}
291
+ )/iox
292
+
293
+ # This is more strict than the rfc specifies
294
+ REGEXEN[:validate_url_subdomain_segment] = /(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)/i
295
+ REGEXEN[:validate_url_domain_segment] = /(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)/i
296
+ REGEXEN[:validate_url_domain_tld] = /(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)/i
297
+ REGEXEN[:validate_url_domain] = /(?:(?:#{REGEXEN[:validate_url_subdomain_segment]}\.)*
298
+ (?:#{REGEXEN[:validate_url_domain_segment]}\.)
299
+ #{REGEXEN[:validate_url_domain_tld]})/iox
300
+
301
+ REGEXEN[:validate_url_host] = /(?:
302
+ #{REGEXEN[:validate_url_ip]}|
303
+ #{REGEXEN[:validate_url_domain]}
304
+ )/iox
305
+
306
+ # Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences
307
+ REGEXEN[:validate_url_unicode_subdomain_segment] =
308
+ /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9_\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
309
+ REGEXEN[:validate_url_unicode_domain_segment] =
310
+ /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
311
+ REGEXEN[:validate_url_unicode_domain_tld] =
312
+ /(?:(?:[a-z]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
313
+ REGEXEN[:validate_url_unicode_domain] = /(?:(?:#{REGEXEN[:validate_url_unicode_subdomain_segment]}\.)*
314
+ (?:#{REGEXEN[:validate_url_unicode_domain_segment]}\.)
315
+ #{REGEXEN[:validate_url_unicode_domain_tld]})/iox
316
+
317
+ REGEXEN[:validate_url_unicode_host] = /(?:
318
+ #{REGEXEN[:validate_url_ip]}|
319
+ #{REGEXEN[:validate_url_unicode_domain]}
320
+ )/iox
321
+
322
+ REGEXEN[:validate_url_port] = /[0-9]{1,5}/
323
+
324
+ REGEXEN[:validate_url_unicode_authority] = %r{
325
+ (?:(#{REGEXEN[:validate_url_userinfo]})@)? # $1 userinfo
326
+ (#{REGEXEN[:validate_url_unicode_host]}) # $2 host
327
+ (?::(#{REGEXEN[:validate_url_port]}))? # $3 port
328
+ }iox
329
+
330
+ REGEXEN[:validate_url_authority] = %r{
331
+ (?:(#{REGEXEN[:validate_url_userinfo]})@)? # $1 userinfo
332
+ (#{REGEXEN[:validate_url_host]}) # $2 host
333
+ (?::(#{REGEXEN[:validate_url_port]}))? # $3 port
334
+ }iox
335
+
336
+ REGEXEN[:validate_url_path] = %r{(/#{REGEXEN[:validate_url_pchar]}*)*}i
337
+ REGEXEN[:validate_url_query] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
338
+ REGEXEN[:validate_url_fragment] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
339
+
340
+ # Modified version of RFC 3986 Appendix B
341
+ REGEXEN[:validate_url_unencoded] = %r{
342
+ \A # Full URL
343
+ (?:
344
+ ([^:/?#]+):// # $1 Scheme
345
+ )?
346
+ ([^/?#]*) # $2 Authority
347
+ ([^?#]*) # $3 Path
348
+ (?:
349
+ \?([^#]*) # $4 Query
350
+ )?
351
+ (?:
352
+ \#(.*) # $5 Fragment
353
+ )?\Z
354
+ }ix
355
+
356
+ REGEXEN[:rtl_chars] = /[#{RTL_CHARACTERS}]/io
357
+
358
+ REGEXEN.each_pair{|k,v| v.freeze }
359
+
360
+ # Return the regular expression for a given <tt>key</tt>. If the <tt>key</tt>
361
+ # is not a known symbol a <tt>nil</tt> will be returned.
362
+ def self.[](key)
363
+ REGEXEN[key]
364
+ end
365
+ end
366
+ end
@@ -0,0 +1,59 @@
1
+ module Twitter
2
+ # A module that provides base methods to rewrite usernames, lists, hashtags and URLs.
3
+ module Rewriter extend self
4
+ def rewrite_entities(text, entities)
5
+ chars = text.to_s.to_char_a
6
+
7
+ # sort by start index
8
+ entities = entities.sort_by{|entity| entity[:indices].first}
9
+
10
+ result = []
11
+ last_index = entities.inject(0) do |last_index, entity|
12
+ result << chars[last_index...entity[:indices].first]
13
+ result << yield(entity, chars)
14
+ entity[:indices].last
15
+ end
16
+ result << chars[last_index..-1]
17
+
18
+ result.flatten.join
19
+ end
20
+
21
+ # These methods are deprecated, will be removed in future.
22
+ extend Deprecation
23
+
24
+ def rewrite(text, options = {})
25
+ [:hashtags, :urls, :usernames_or_lists].inject(text) do |key|
26
+ options[key] ? send(:"rewrite_#{key}", text, &options[key]) : text
27
+ end
28
+ end
29
+ deprecate :rewrite, :rewrite_entities
30
+
31
+ def rewrite_usernames_or_lists(text)
32
+ entities = Extractor.extract_mentions_or_lists_with_indices(text)
33
+ rewrite_entities(text, entities) do |entity, chars|
34
+ at = chars[entity[:indices].first]
35
+ list_slug = entity[:list_slug]
36
+ list_slug = nil if list_slug.empty?
37
+ yield(at, entity[:screen_name], list_slug)
38
+ end
39
+ end
40
+ deprecate :rewrite_usernames_or_lists, :rewrite_entities
41
+
42
+ def rewrite_hashtags(text)
43
+ entities = Extractor.extract_hashtags_with_indices(text)
44
+ rewrite_entities(text, entities) do |entity, chars|
45
+ hash = chars[entity[:indices].first]
46
+ yield(hash, entity[:hashtag])
47
+ end
48
+ end
49
+ deprecate :rewrite_hashtags, :rewrite_entities
50
+
51
+ def rewrite_urls(text)
52
+ entities = Extractor.extract_urls_with_indices(text, :extract_url_without_protocol => false)
53
+ rewrite_entities(text, entities) do |entity, chars|
54
+ yield(entity[:url])
55
+ end
56
+ end
57
+ deprecate :rewrite_urls, :rewrite_entities
58
+ end
59
+ end
@@ -0,0 +1,26 @@
1
+ module Twitter
2
+ # This module lazily defines constants of the form Uxxxx for all Unicode
3
+ # codepoints from U0000 to U10FFFF. The value of each constant is the
4
+ # UTF-8 string for the codepoint.
5
+ # Examples:
6
+ # copyright = Unicode::U00A9
7
+ # euro = Unicode::U20AC
8
+ # infinity = Unicode::U221E
9
+ #
10
+ module Unicode
11
+ CODEPOINT_REGEX = /^U_?([0-9a-fA-F]{4,5}|10[0-9a-fA-F]{4})$/
12
+
13
+ def self.const_missing(name)
14
+ # Check that the constant name is of the right form: U0000 to U10FFFF
15
+ if name.to_s =~ CODEPOINT_REGEX
16
+ # Convert the codepoint to an immutable UTF-8 string,
17
+ # define a real constant for that value and return the value
18
+ #p name, name.class
19
+ const_set(name, [$1.to_i(16)].pack("U").freeze)
20
+ else # Raise an error for constants that are not Unicode.
21
+ raise NameError, "Uninitialized constant: Unicode::#{name}"
22
+ end
23
+ end
24
+ end
25
+
26
+ end
@@ -0,0 +1,113 @@
1
+ require 'unf'
2
+
3
+ module Twitter
4
+ module Validation extend self
5
+ MAX_LENGTH = 140
6
+
7
+ DEFAULT_TCO_URL_LENGTHS = {
8
+ :short_url_length => 22,
9
+ :short_url_length_https => 23,
10
+ :characters_reserved_per_media => 22
11
+ }.freeze
12
+
13
+ # Returns the length of the string as it would be displayed. This is equivalent to the length of the Unicode NFC
14
+ # (See: http://www.unicode.org/reports/tr15). This is needed in order to consistently calculate the length of a
15
+ # string no matter which actual form was transmitted. For example:
16
+ #
17
+ # U+0065 Latin Small Letter E
18
+ # + U+0301 Combining Acute Accent
19
+ # ----------
20
+ # = 2 bytes, 2 characters, displayed as é (1 visual glyph)
21
+ # … The NFC of {U+0065, U+0301} is {U+00E9}, which is a single character and a +display_length+ of 1
22
+ #
23
+ # The string could also contain U+00E9 already, in which case the canonicalization will not change the value.
24
+ #
25
+ def tweet_length(text, options = {})
26
+ options = DEFAULT_TCO_URL_LENGTHS.merge(options)
27
+
28
+ length = text.to_nfc.unpack("U*").length
29
+
30
+ Twitter::Extractor.extract_urls_with_indices(text) do |url, start_position, end_position|
31
+ length += start_position - end_position
32
+ length += url.downcase =~ /^https:\/\// ? options[:short_url_length_https] : options[:short_url_length]
33
+ end
34
+
35
+ length
36
+ end
37
+
38
+ # Check the <tt>text</tt> for any reason that it may not be valid as a Tweet. This is meant as a pre-validation
39
+ # before posting to api.twitter.com. There are several server-side reasons for Tweets to fail but this pre-validation
40
+ # will allow quicker feedback.
41
+ #
42
+ # Returns <tt>false</tt> if this <tt>text</tt> is valid. Otherwise one of the following Symbols will be returned:
43
+ #
44
+ # <tt>:too_long</tt>:: if the <tt>text</tt> is too long
45
+ # <tt>:empty</tt>:: if the <tt>text</tt> is nil or empty
46
+ # <tt>:invalid_characters</tt>:: if the <tt>text</tt> contains non-Unicode or any of the disallowed Unicode characters
47
+ def tweet_invalid?(text)
48
+ return :empty if !text || text.empty?
49
+ begin
50
+ return :too_long if tweet_length(text) > MAX_LENGTH
51
+ return :invalid_characters if Twitter::Regex::INVALID_CHARACTERS.any?{|invalid_char| text.include?(invalid_char) }
52
+ rescue ArgumentError => e
53
+ # non-Unicode value.
54
+ return :invalid_characters
55
+ end
56
+
57
+ return false
58
+ end
59
+
60
+ def valid_tweet_text?(text)
61
+ !tweet_invalid?(text)
62
+ end
63
+
64
+ def valid_username?(username)
65
+ return false if !username || username.empty?
66
+
67
+ extracted = Twitter::Extractor.extract_mentioned_screen_names(username)
68
+ # Should extract the username minus the @ sign, hence the [1..-1]
69
+ extracted.size == 1 && extracted.first == username[1..-1]
70
+ end
71
+
72
+ VALID_LIST_RE = /\A#{Twitter::Regex[:valid_mention_or_list]}\z/o
73
+ def valid_list?(username_list)
74
+ match = username_list.match(VALID_LIST_RE)
75
+ # Must have matched and had nothing before or after
76
+ !!(match && match[1] == "" && match[4] && !match[4].empty?)
77
+ end
78
+
79
+ def valid_hashtag?(hashtag)
80
+ return false if !hashtag || hashtag.empty?
81
+
82
+ extracted = Twitter::Extractor.extract_hashtags(hashtag)
83
+ # Should extract the hashtag minus the # sign, hence the [1..-1]
84
+ extracted.size == 1 && extracted.first == hashtag[1..-1]
85
+ end
86
+
87
+ def valid_url?(url, unicode_domains=true, require_protocol=true)
88
+ return false if !url || url.empty?
89
+
90
+ url_parts = url.match(Twitter::Regex[:validate_url_unencoded])
91
+ return false unless (url_parts && url_parts.to_s == url)
92
+
93
+ scheme, authority, path, query, fragment = url_parts.captures
94
+
95
+ return false unless ((!require_protocol ||
96
+ (valid_match?(scheme, Twitter::Regex[:validate_url_scheme]) && scheme.match(/\Ahttps?\Z/i))) &&
97
+ valid_match?(path, Twitter::Regex[:validate_url_path]) &&
98
+ valid_match?(query, Twitter::Regex[:validate_url_query], true) &&
99
+ valid_match?(fragment, Twitter::Regex[:validate_url_fragment], true))
100
+
101
+ return (unicode_domains && valid_match?(authority, Twitter::Regex[:validate_url_unicode_authority])) ||
102
+ (!unicode_domains && valid_match?(authority, Twitter::Regex[:validate_url_authority]))
103
+ end
104
+
105
+ private
106
+
107
+ def valid_match?(string, regex, optional=false)
108
+ return (string && string.match(regex) && $~.to_s == string) unless optional
109
+
110
+ !(string && (!string.match(regex) || $~.to_s != string))
111
+ end
112
+ end
113
+ end
@@ -0,0 +1,22 @@
1
+ major, minor, patch = RUBY_VERSION.split('.')
2
+
3
+ $RUBY_1_9 = if major.to_i == 1 && minor.to_i < 9
4
+ # Ruby 1.8 KCODE check. Not needed on 1.9 and later.
5
+ raise("twitter-text requires the $KCODE variable be set to 'UTF8' or 'u'") unless $KCODE[0].chr =~ /u/i
6
+ false
7
+ else
8
+ true
9
+ end
10
+
11
+ %w(
12
+ deprecation
13
+ regex
14
+ rewriter
15
+ autolink
16
+ extractor
17
+ unicode
18
+ validation
19
+ hit_highlighter
20
+ ).each do |name|
21
+ require "twitter-text/#{name}"
22
+ end
data/script/destroy ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+ APP_ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..'))
3
+
4
+ begin
5
+ require 'rubigen'
6
+ rescue LoadError
7
+ require 'rubygems'
8
+ require 'rubigen'
9
+ end
10
+ require 'rubigen/scripts/destroy'
11
+
12
+ ARGV.shift if ['--help', '-h'].include?(ARGV[0])
13
+ RubiGen::Base.use_component_sources! [:newgem_simple, :test_unit]
14
+ RubiGen::Scripts::Destroy.new.run(ARGV)
data/script/generate ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+ APP_ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..'))
3
+
4
+ begin
5
+ require 'rubigen'
6
+ rescue LoadError
7
+ require 'rubygems'
8
+ require 'rubigen'
9
+ end
10
+ require 'rubigen/scripts/generate'
11
+
12
+ ARGV.shift if ['--help', '-h'].include?(ARGV[0])
13
+ RubiGen::Base.use_component_sources! [:newgem_simple, :test_unit]
14
+ RubiGen::Scripts::Generate.new.run(ARGV)