ritter 0.0.87 → 0.0.88

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,366 @@
1
+ # encoding: UTF-8
2
+
3
+ module Twitter
4
+ # A collection of regular expressions for parsing Tweet text. The regular expression
5
+ # list is frozen at load time to ensure immutability. These regular expressions are
6
+ # used throughout the <tt>Twitter</tt> classes. Special care has been taken to make
7
+ # sure these regular expressions work with Tweets in all languages.
8
+ class Regex
9
+ REGEXEN = {} # :nodoc:
10
+
11
+ def self.regex_range(from, to = nil) # :nodoc:
12
+ if $RUBY_1_9
13
+ if to
14
+ "\\u{#{from.to_s(16).rjust(4, '0')}}-\\u{#{to.to_s(16).rjust(4, '0')}}"
15
+ else
16
+ "\\u{#{from.to_s(16).rjust(4, '0')}}"
17
+ end
18
+ else
19
+ if to
20
+ [from].pack('U') + '-' + [to].pack('U')
21
+ else
22
+ [from].pack('U')
23
+ end
24
+ end
25
+ end
26
+
27
+ # Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand
28
+ # to access both the list of characters and a pattern suitable for use with String#split
29
+ # Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE
30
+ UNICODE_SPACES = [
31
+ (0x0009..0x000D).to_a, # White_Space # Cc [5] <control-0009>..<control-000D>
32
+ 0x0020, # White_Space # Zs SPACE
33
+ 0x0085, # White_Space # Cc <control-0085>
34
+ 0x00A0, # White_Space # Zs NO-BREAK SPACE
35
+ 0x1680, # White_Space # Zs OGHAM SPACE MARK
36
+ 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
37
+ (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE
38
+ 0x2028, # White_Space # Zl LINE SEPARATOR
39
+ 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
40
+ 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
41
+ 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
42
+ 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
43
+ ].flatten.map{|c| [c].pack('U*')}.freeze
44
+ REGEXEN[:spaces] = /[#{UNICODE_SPACES.join('')}]/o
45
+
46
+ # Characters not allowed in Tweets
47
+ INVALID_CHARACTERS = [
48
+ 0xFFFE, 0xFEFF, # BOM
49
+ 0xFFFF, # Special
50
+ 0x202A, 0x202B, 0x202C, 0x202D, 0x202E # Directional change
51
+ ].map{|cp| [cp].pack('U') }.freeze
52
+ REGEXEN[:invalid_control_characters] = /[#{INVALID_CHARACTERS.join('')}]/o
53
+
54
+ major, minor, patch = RUBY_VERSION.split('.')
55
+ if major.to_i >= 2 || major.to_i == 1 && minor.to_i >= 9 || (defined?(RUBY_ENGINE) && ["jruby", "rbx"].include?(RUBY_ENGINE))
56
+ REGEXEN[:list_name] = /[a-zA-Z][a-zA-Z0-9_\-\u0080-\u00ff]{0,24}/
57
+ else
58
+ # This line barfs at compile time in Ruby 1.9, JRuby, or Rubinius.
59
+ REGEXEN[:list_name] = eval("/[a-zA-Z][a-zA-Z0-9_\\-\x80-\xff]{0,24}/")
60
+ end
61
+
62
+ # Latin accented characters
63
+ # Excludes 0xd7 from the range (the multiplication sign, confusable with "x").
64
+ # Also excludes 0xf7, the division sign
65
+ LATIN_ACCENTS = [
66
+ regex_range(0xc0, 0xd6),
67
+ regex_range(0xd8, 0xf6),
68
+ regex_range(0xf8, 0xff),
69
+ regex_range(0x0100, 0x024f),
70
+ regex_range(0x0253, 0x0254),
71
+ regex_range(0x0256, 0x0257),
72
+ regex_range(0x0259),
73
+ regex_range(0x025b),
74
+ regex_range(0x0263),
75
+ regex_range(0x0268),
76
+ regex_range(0x026f),
77
+ regex_range(0x0272),
78
+ regex_range(0x0289),
79
+ regex_range(0x028b),
80
+ regex_range(0x02bb),
81
+ regex_range(0x0300, 0x036f),
82
+ regex_range(0x1e00, 0x1eff)
83
+ ].join('').freeze
84
+
85
+ RTL_CHARACTERS = [
86
+ regex_range(0x0600,0x06FF),
87
+ regex_range(0x0750,0x077F),
88
+ regex_range(0x0590,0x05FF),
89
+ regex_range(0xFE70,0xFEFF)
90
+ ].join('').freeze
91
+
92
+
93
+ NON_LATIN_HASHTAG_CHARS = [
94
+ # Cyrillic (Russian, Ukrainian, etc.)
95
+ regex_range(0x0400, 0x04ff), # Cyrillic
96
+ regex_range(0x0500, 0x0527), # Cyrillic Supplement
97
+ regex_range(0x2de0, 0x2dff), # Cyrillic Extended A
98
+ regex_range(0xa640, 0xa69f), # Cyrillic Extended B
99
+ regex_range(0x0591, 0x05bf), # Hebrew
100
+ regex_range(0x05c1, 0x05c2),
101
+ regex_range(0x05c4, 0x05c5),
102
+ regex_range(0x05c7),
103
+ regex_range(0x05d0, 0x05ea),
104
+ regex_range(0x05f0, 0x05f4),
105
+ regex_range(0xfb12, 0xfb28), # Hebrew Presentation Forms
106
+ regex_range(0xfb2a, 0xfb36),
107
+ regex_range(0xfb38, 0xfb3c),
108
+ regex_range(0xfb3e),
109
+ regex_range(0xfb40, 0xfb41),
110
+ regex_range(0xfb43, 0xfb44),
111
+ regex_range(0xfb46, 0xfb4f),
112
+ regex_range(0x0610, 0x061a), # Arabic
113
+ regex_range(0x0620, 0x065f),
114
+ regex_range(0x066e, 0x06d3),
115
+ regex_range(0x06d5, 0x06dc),
116
+ regex_range(0x06de, 0x06e8),
117
+ regex_range(0x06ea, 0x06ef),
118
+ regex_range(0x06fa, 0x06fc),
119
+ regex_range(0x06ff),
120
+ regex_range(0x0750, 0x077f), # Arabic Supplement
121
+ regex_range(0x08a0), # Arabic Extended A
122
+ regex_range(0x08a2, 0x08ac),
123
+ regex_range(0x08e4, 0x08fe),
124
+ regex_range(0xfb50, 0xfbb1), # Arabic Pres. Forms A
125
+ regex_range(0xfbd3, 0xfd3d),
126
+ regex_range(0xfd50, 0xfd8f),
127
+ regex_range(0xfd92, 0xfdc7),
128
+ regex_range(0xfdf0, 0xfdfb),
129
+ regex_range(0xfe70, 0xfe74), # Arabic Pres. Forms B
130
+ regex_range(0xfe76, 0xfefc),
131
+ regex_range(0x200c, 0x200c), # Zero-Width Non-Joiner
132
+ regex_range(0x0e01, 0x0e3a), # Thai
133
+ regex_range(0x0e40, 0x0e4e), # Hangul (Korean)
134
+ regex_range(0x1100, 0x11ff), # Hangul Jamo
135
+ regex_range(0x3130, 0x3185), # Hangul Compatibility Jamo
136
+ regex_range(0xA960, 0xA97F), # Hangul Jamo Extended-A
137
+ regex_range(0xAC00, 0xD7AF), # Hangul Syllables
138
+ regex_range(0xD7B0, 0xD7FF), # Hangul Jamo Extended-B
139
+ regex_range(0xFFA1, 0xFFDC) # Half-width Hangul
140
+ ].join('').freeze
141
+ REGEXEN[:latin_accents] = /[#{LATIN_ACCENTS}]+/o
142
+
143
+ CJ_HASHTAG_CHARACTERS = [
144
+ regex_range(0x30A1, 0x30FA), regex_range(0x30FC, 0x30FE), # Katakana (full-width)
145
+ regex_range(0xFF66, 0xFF9F), # Katakana (half-width)
146
+ regex_range(0xFF10, 0xFF19), regex_range(0xFF21, 0xFF3A), regex_range(0xFF41, 0xFF5A), # Latin (full-width)
147
+ regex_range(0x3041, 0x3096), regex_range(0x3099, 0x309E), # Hiragana
148
+ regex_range(0x3400, 0x4DBF), # Kanji (CJK Extension A)
149
+ regex_range(0x4E00, 0x9FFF), # Kanji (Unified)
150
+ regex_range(0x20000, 0x2A6DF), # Kanji (CJK Extension B)
151
+ regex_range(0x2A700, 0x2B73F), # Kanji (CJK Extension C)
152
+ regex_range(0x2B740, 0x2B81F), # Kanji (CJK Extension D)
153
+ regex_range(0x2F800, 0x2FA1F), regex_range(0x3003), regex_range(0x3005), regex_range(0x303B) # Kanji (CJK supplement)
154
+ ].join('').freeze
155
+
156
+ PUNCTUATION_CHARS = '!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~'
157
+ SPACE_CHARS = " \t\n\x0B\f\r"
158
+ CTRL_CHARS = "\x00-\x1F\x7F"
159
+
160
+ # A hashtag must contain latin characters, numbers and underscores, but not all numbers.
161
+ HASHTAG_ALPHA = /[a-z_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/io
162
+ HASHTAG_ALPHANUMERIC = /[a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/io
163
+ HASHTAG_BOUNDARY = /\A|\z|[^&a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/o
164
+
165
+ # Original:
166
+ #HASHTAG = /(#{HASHTAG_BOUNDARY})(#|#)(#{HASHTAG_ALPHANUMERIC}*#{HASHTAG_ALPHA}#{HASHTAG_ALPHANUMERIC}*)/io
167
+
168
+ # The new one:
169
+ HASHTAG = /[#].\S+/io
170
+
171
+ REGEXEN[:valid_hashtag] = /#{HASHTAG}/io
172
+ # Used in Extractor for final filtering
173
+ REGEXEN[:end_hashtag_match] = /\A(?:[##]|:\/\/)/o
174
+
175
+ REGEXEN[:valid_mention_preceding_chars] = /(?:[^a-zA-Z0-9_!#\$%&*@@]|^|RT:?)/o
176
+ REGEXEN[:at_signs] = /[@@]/
177
+ REGEXEN[:valid_mention_or_list] = /
178
+ (#{REGEXEN[:valid_mention_preceding_chars]}) # $1: Preceding character
179
+ (#{REGEXEN[:at_signs]}) # $2: At mark
180
+ ([a-zA-Z0-9_]{1,20}) # $3: Screen name
181
+ (\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})? # $4: List (optional)
182
+ /ox
183
+ REGEXEN[:valid_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
184
+ # Used in Extractor for final filtering
185
+ REGEXEN[:end_mention_match] = /\A(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o
186
+
187
+ # URL related hash regex collection
188
+ REGEXEN[:valid_url_preceding_chars] = /(?:[^A-Z0-9@@$###{INVALID_CHARACTERS.join('')}]|^)/io
189
+ REGEXEN[:invalid_url_without_protocol_preceding_chars] = /[-_.\/]$/
190
+ DOMAIN_VALID_CHARS = "[^#{PUNCTUATION_CHARS}#{SPACE_CHARS}#{CTRL_CHARS}#{INVALID_CHARACTERS.join('')}#{UNICODE_SPACES.join('')}]"
191
+ REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
192
+ REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
193
+
194
+ REGEXEN[:valid_gTLD] = /(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|xxx)(?=[^0-9a-z]|$))/i
195
+ REGEXEN[:valid_ccTLD] = %r{
196
+ (?:
197
+ (?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|
198
+ ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|
199
+ gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|
200
+ lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|
201
+ pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|
202
+ th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)
203
+ (?=[^0-9a-z]|$)
204
+ )
205
+ }ix
206
+ REGEXEN[:valid_punycode] = /(?:xn--[0-9a-z]+)/i
207
+
208
+ REGEXEN[:valid_domain] = /(?:
209
+ #{REGEXEN[:valid_subdomain]}*#{REGEXEN[:valid_domain_name]}
210
+ (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
211
+ )/iox
212
+
213
+ # This is used in Extractor
214
+ REGEXEN[:valid_ascii_domain] = /
215
+ (?:(?:[A-Za-z0-9\-_]|#{REGEXEN[:latin_accents]})+\.)+
216
+ (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
217
+ /iox
218
+
219
+ # This is used in Extractor for stricter t.co URL extraction
220
+ REGEXEN[:valid_tco_url] = /^https?:\/\/t\.co\/[a-z0-9]+/i
221
+
222
+ # This is used in Extractor to filter out unwanted URLs.
223
+ REGEXEN[:invalid_short_domain] = /\A#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_ccTLD]}\Z/io
224
+
225
+ REGEXEN[:valid_port_number] = /[0-9]+/
226
+
227
+ REGEXEN[:valid_general_url_path_chars] = /[a-z0-9!\*';:=\+\,\.\$\/%#\[\]\-_~&|@#{LATIN_ACCENTS}]/io
228
+ # Allow URL paths to contain balanced parens
229
+ # 1. Used in Wikipedia URLs like /Primer_(film)
230
+ # 2. Used in IIS sessions like /S(dfd346)/
231
+ REGEXEN[:valid_url_balanced_parens] = /\(#{REGEXEN[:valid_general_url_path_chars]}+\)/io
232
+ # Valid end-of-path characters (so /foo. does not gobble the period).
233
+ # 1. Allow =&# for empty URL parameters and other URL-join artifacts
234
+ REGEXEN[:valid_url_path_ending_chars] = /[a-z0-9=_#\/\+\-#{LATIN_ACCENTS}]|(?:#{REGEXEN[:valid_url_balanced_parens]})/io
235
+ REGEXEN[:valid_url_path] = /(?:
236
+ (?:
237
+ #{REGEXEN[:valid_general_url_path_chars]}*
238
+ (?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)*
239
+ #{REGEXEN[:valid_url_path_ending_chars]}
240
+ )|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/)
241
+ )/iox
242
+
243
+ REGEXEN[:valid_url_query_chars] = /[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]/i
244
+ REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/]/i
245
+ REGEXEN[:valid_url] = %r{
246
+ ( # $1 total match
247
+ (#{REGEXEN[:valid_url_preceding_chars]}) # $2 Preceding character
248
+ ( # $3 URL
249
+ (https?:\/\/)? # $4 Protocol (optional)
250
+ (#{REGEXEN[:valid_domain]}) # $5 Domain(s)
251
+ (?::(#{REGEXEN[:valid_port_number]}))? # $6 Port number (optional)
252
+ (/#{REGEXEN[:valid_url_path]}*)? # $7 URL Path and anchor
253
+ (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # $8 Query String
254
+ )
255
+ )
256
+ }iox;
257
+
258
+ REGEXEN[:cashtag] = /[a-z]{1,6}(?:[._][a-z]{1,2})?/i
259
+ REGEXEN[:valid_cashtag] = /(^|#{REGEXEN[:spaces]})(\$)(#{REGEXEN[:cashtag]})(?=$|\s|[#{PUNCTUATION_CHARS}])/i
260
+
261
+ # These URL validation pattern strings are based on the ABNF from RFC 3986
262
+ REGEXEN[:validate_url_unreserved] = /[a-z0-9\-._~]/i
263
+ REGEXEN[:validate_url_pct_encoded] = /(?:%[0-9a-f]{2})/i
264
+ REGEXEN[:validate_url_sub_delims] = /[!$&'()*+,;=]/i
265
+ REGEXEN[:validate_url_pchar] = /(?:
266
+ #{REGEXEN[:validate_url_unreserved]}|
267
+ #{REGEXEN[:validate_url_pct_encoded]}|
268
+ #{REGEXEN[:validate_url_sub_delims]}|
269
+ [:\|@]
270
+ )/iox
271
+
272
+ REGEXEN[:validate_url_scheme] = /(?:[a-z][a-z0-9+\-.]*)/i
273
+ REGEXEN[:validate_url_userinfo] = /(?:
274
+ #{REGEXEN[:validate_url_unreserved]}|
275
+ #{REGEXEN[:validate_url_pct_encoded]}|
276
+ #{REGEXEN[:validate_url_sub_delims]}|
277
+ :
278
+ )*/iox
279
+
280
+ REGEXEN[:validate_url_dec_octet] = /(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))/i
281
+ REGEXEN[:validate_url_ipv4] =
282
+ /(?:#{REGEXEN[:validate_url_dec_octet]}(?:\.#{REGEXEN[:validate_url_dec_octet]}){3})/iox
283
+
284
+ # Punting on real IPv6 validation for now
285
+ REGEXEN[:validate_url_ipv6] = /(?:\[[a-f0-9:\.]+\])/i
286
+
287
+ # Also punting on IPvFuture for now
288
+ REGEXEN[:validate_url_ip] = /(?:
289
+ #{REGEXEN[:validate_url_ipv4]}|
290
+ #{REGEXEN[:validate_url_ipv6]}
291
+ )/iox
292
+
293
+ # This is more strict than the rfc specifies
294
+ REGEXEN[:validate_url_subdomain_segment] = /(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)/i
295
+ REGEXEN[:validate_url_domain_segment] = /(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)/i
296
+ REGEXEN[:validate_url_domain_tld] = /(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)/i
297
+ REGEXEN[:validate_url_domain] = /(?:(?:#{REGEXEN[:validate_url_subdomain_segment]}\.)*
298
+ (?:#{REGEXEN[:validate_url_domain_segment]}\.)
299
+ #{REGEXEN[:validate_url_domain_tld]})/iox
300
+
301
+ REGEXEN[:validate_url_host] = /(?:
302
+ #{REGEXEN[:validate_url_ip]}|
303
+ #{REGEXEN[:validate_url_domain]}
304
+ )/iox
305
+
306
+ # Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences
307
+ REGEXEN[:validate_url_unicode_subdomain_segment] =
308
+ /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9_\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
309
+ REGEXEN[:validate_url_unicode_domain_segment] =
310
+ /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
311
+ REGEXEN[:validate_url_unicode_domain_tld] =
312
+ /(?:(?:[a-z]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
313
+ REGEXEN[:validate_url_unicode_domain] = /(?:(?:#{REGEXEN[:validate_url_unicode_subdomain_segment]}\.)*
314
+ (?:#{REGEXEN[:validate_url_unicode_domain_segment]}\.)
315
+ #{REGEXEN[:validate_url_unicode_domain_tld]})/iox
316
+
317
+ REGEXEN[:validate_url_unicode_host] = /(?:
318
+ #{REGEXEN[:validate_url_ip]}|
319
+ #{REGEXEN[:validate_url_unicode_domain]}
320
+ )/iox
321
+
322
+ REGEXEN[:validate_url_port] = /[0-9]{1,5}/
323
+
324
+ REGEXEN[:validate_url_unicode_authority] = %r{
325
+ (?:(#{REGEXEN[:validate_url_userinfo]})@)? # $1 userinfo
326
+ (#{REGEXEN[:validate_url_unicode_host]}) # $2 host
327
+ (?::(#{REGEXEN[:validate_url_port]}))? # $3 port
328
+ }iox
329
+
330
+ REGEXEN[:validate_url_authority] = %r{
331
+ (?:(#{REGEXEN[:validate_url_userinfo]})@)? # $1 userinfo
332
+ (#{REGEXEN[:validate_url_host]}) # $2 host
333
+ (?::(#{REGEXEN[:validate_url_port]}))? # $3 port
334
+ }iox
335
+
336
+ REGEXEN[:validate_url_path] = %r{(/#{REGEXEN[:validate_url_pchar]}*)*}i
337
+ REGEXEN[:validate_url_query] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
338
+ REGEXEN[:validate_url_fragment] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
339
+
340
+ # Modified version of RFC 3986 Appendix B
341
+ REGEXEN[:validate_url_unencoded] = %r{
342
+ \A # Full URL
343
+ (?:
344
+ ([^:/?#]+):// # $1 Scheme
345
+ )?
346
+ ([^/?#]*) # $2 Authority
347
+ ([^?#]*) # $3 Path
348
+ (?:
349
+ \?([^#]*) # $4 Query
350
+ )?
351
+ (?:
352
+ \#(.*) # $5 Fragment
353
+ )?\Z
354
+ }ix
355
+
356
+ REGEXEN[:rtl_chars] = /[#{RTL_CHARACTERS}]/io
357
+
358
+ REGEXEN.each_pair{|k,v| v.freeze }
359
+
360
+ # Return the regular expression for a given <tt>key</tt>. If the <tt>key</tt>
361
+ # is not a known symbol a <tt>nil</tt> will be returned.
362
+ def self.[](key)
363
+ REGEXEN[key]
364
+ end
365
+ end
366
+ end
@@ -0,0 +1,59 @@
1
+ module Twitter
2
+ # A module provides base methods to rewrite usernames, lists, hashtags and URLs.
3
+ module Rewriter extend self
4
+ def rewrite_entities(text, entities)
5
+ chars = text.to_s.to_char_a
6
+
7
+ # sort by start index
8
+ entities = entities.sort_by{|entity| entity[:indices].first}
9
+
10
+ result = []
11
+ last_index = entities.inject(0) do |last_index, entity|
12
+ result << chars[last_index...entity[:indices].first]
13
+ result << yield(entity, chars)
14
+ entity[:indices].last
15
+ end
16
+ result << chars[last_index..-1]
17
+
18
+ result.flatten.join
19
+ end
20
+
21
+ # These methods are deprecated, will be removed in future.
22
+ extend Deprecation
23
+
24
+ def rewrite(text, options = {})
25
+ [:hashtags, :urls, :usernames_or_lists].inject(text) do |key|
26
+ options[key] ? send(:"rewrite_#{key}", text, &options[key]) : text
27
+ end
28
+ end
29
+ deprecate :rewrite, :rewrite_entities
30
+
31
+ def rewrite_usernames_or_lists(text)
32
+ entities = Extractor.extract_mentions_or_lists_with_indices(text)
33
+ rewrite_entities(text, entities) do |entity, chars|
34
+ at = chars[entity[:indices].first]
35
+ list_slug = entity[:list_slug]
36
+ list_slug = nil if list_slug.empty?
37
+ yield(at, entity[:screen_name], list_slug)
38
+ end
39
+ end
40
+ deprecate :rewrite_usernames_or_lists, :rewrite_entities
41
+
42
+ def rewrite_hashtags(text)
43
+ entities = Extractor.extract_hashtags_with_indices(text)
44
+ rewrite_entities(text, entities) do |entity, chars|
45
+ hash = chars[entity[:indices].first]
46
+ yield(hash, entity[:hashtag])
47
+ end
48
+ end
49
+ deprecate :rewrite_hashtags, :rewrite_entities
50
+
51
+ def rewrite_urls(text)
52
+ entities = Extractor.extract_urls_with_indices(text, :extract_url_without_protocol => false)
53
+ rewrite_entities(text, entities) do |entity, chars|
54
+ yield(entity[:url])
55
+ end
56
+ end
57
+ deprecate :rewrite_urls, :rewrite_entities
58
+ end
59
+ end
@@ -0,0 +1,26 @@
1
+ module Twitter
2
+ # This module lazily defines constants of the form Uxxxx for all Unicode
3
+ # codepoints from U0000 to U10FFFF. The value of each constant is the
4
+ # UTF-8 string for the codepoint.
5
+ # Examples:
6
+ # copyright = Unicode::U00A9
7
+ # euro = Unicode::U20AC
8
+ # infinity = Unicode::U221E
9
+ #
10
+ module Unicode
11
+ CODEPOINT_REGEX = /^U_?([0-9a-fA-F]{4,5}|10[0-9a-fA-F]{4})$/
12
+
13
+ def self.const_missing(name)
14
+ # Check that the constant name is of the right form: U0000 to U10FFFF
15
+ if name.to_s =~ CODEPOINT_REGEX
16
+ # Convert the codepoint to an immutable UTF-8 string,
17
+ # define a real constant for that value and return the value
18
+ #p name, name.class
19
+ const_set(name, [$1.to_i(16)].pack("U").freeze)
20
+ else # Raise an error for constants that are not Unicode.
21
+ raise NameError, "Uninitialized constant: Unicode::#{name}"
22
+ end
23
+ end
24
+ end
25
+
26
+ end
@@ -0,0 +1,113 @@
1
+ require 'unf'
2
+
3
+ module Twitter
4
+ module Validation extend self
5
+ MAX_LENGTH = 140
6
+
7
+ DEFAULT_TCO_URL_LENGTHS = {
8
+ :short_url_length => 22,
9
+ :short_url_length_https => 23,
10
+ :characters_reserved_per_media => 22
11
+ }.freeze
12
+
13
+ # Returns the length of the string as it would be displayed. This is equivalent to the length of the Unicode NFC
14
+ # (See: http://www.unicode.org/reports/tr15). This is needed in order to consistently calculate the length of a
15
+ # string no matter which actual form was transmitted. For example:
16
+ #
17
+ # U+0065 Latin Small Letter E
18
+ # + U+0301 Combining Acute Accent
19
+ # ----------
20
+ # = 2 bytes, 2 characters, displayed as é (1 visual glyph)
21
+ # … The NFC of {U+0065, U+0301} is {U+00E9}, which is a single character and a +display_length+ of 1
22
+ #
23
+ # The string could also contain U+00E9 already, in which case the canonicalization will not change the value.
24
+ #
25
+ def tweet_length(text, options = {})
26
+ options = DEFAULT_TCO_URL_LENGTHS.merge(options)
27
+
28
+ length = text.to_nfc.unpack("U*").length
29
+
30
+ Twitter::Extractor.extract_urls_with_indices(text) do |url, start_position, end_position|
31
+ length += start_position - end_position
32
+ length += url.downcase =~ /^https:\/\// ? options[:short_url_length_https] : options[:short_url_length]
33
+ end
34
+
35
+ length
36
+ end
37
+
38
+ # Check the <tt>text</tt> for any reason that it may not be valid as a Tweet. This is meant as a pre-validation
39
+ # before posting to api.twitter.com. There are several server-side reasons for Tweets to fail but this pre-validation
40
+ # will allow quicker feedback.
41
+ #
42
+ # Returns <tt>false</tt> if this <tt>text</tt> is valid. Otherwise one of the following Symbols will be returned:
43
+ #
44
+ # <tt>:too_long</tt>:: if the <tt>text</tt> is too long
45
+ # <tt>:empty</tt>:: if the <tt>text</tt> is nil or empty
46
+ # <tt>:invalid_characters</tt>:: if the <tt>text</tt> contains non-Unicode or any of the disallowed Unicode characters
47
+ def tweet_invalid?(text)
48
+ return :empty if !text || text.empty?
49
+ begin
50
+ return :too_long if tweet_length(text) > MAX_LENGTH
51
+ return :invalid_characters if Twitter::Regex::INVALID_CHARACTERS.any?{|invalid_char| text.include?(invalid_char) }
52
+ rescue ArgumentError => e
53
+ # non-Unicode value.
54
+ return :invalid_characters
55
+ end
56
+
57
+ return false
58
+ end
59
+
60
+ def valid_tweet_text?(text)
61
+ !tweet_invalid?(text)
62
+ end
63
+
64
+ def valid_username?(username)
65
+ return false if !username || username.empty?
66
+
67
+ extracted = Twitter::Extractor.extract_mentioned_screen_names(username)
68
+ # Should extract the username minus the @ sign, hence the [1..-1]
69
+ extracted.size == 1 && extracted.first == username[1..-1]
70
+ end
71
+
72
+ VALID_LIST_RE = /\A#{Twitter::Regex[:valid_mention_or_list]}\z/o
73
+ def valid_list?(username_list)
74
+ match = username_list.match(VALID_LIST_RE)
75
+ # Must have matched and had nothing before or after
76
+ !!(match && match[1] == "" && match[4] && !match[4].empty?)
77
+ end
78
+
79
+ def valid_hashtag?(hashtag)
80
+ return false if !hashtag || hashtag.empty?
81
+
82
+ extracted = Twitter::Extractor.extract_hashtags(hashtag)
83
+ # Should extract the hashtag minus the # sign, hence the [1..-1]
84
+ extracted.size == 1 && extracted.first == hashtag[1..-1]
85
+ end
86
+
87
+ def valid_url?(url, unicode_domains=true, require_protocol=true)
88
+ return false if !url || url.empty?
89
+
90
+ url_parts = url.match(Twitter::Regex[:validate_url_unencoded])
91
+ return false unless (url_parts && url_parts.to_s == url)
92
+
93
+ scheme, authority, path, query, fragment = url_parts.captures
94
+
95
+ return false unless ((!require_protocol ||
96
+ (valid_match?(scheme, Twitter::Regex[:validate_url_scheme]) && scheme.match(/\Ahttps?\Z/i))) &&
97
+ valid_match?(path, Twitter::Regex[:validate_url_path]) &&
98
+ valid_match?(query, Twitter::Regex[:validate_url_query], true) &&
99
+ valid_match?(fragment, Twitter::Regex[:validate_url_fragment], true))
100
+
101
+ return (unicode_domains && valid_match?(authority, Twitter::Regex[:validate_url_unicode_authority])) ||
102
+ (!unicode_domains && valid_match?(authority, Twitter::Regex[:validate_url_authority]))
103
+ end
104
+
105
+ private
106
+
107
+ def valid_match?(string, regex, optional=false)
108
+ return (string && string.match(regex) && $~.to_s == string) unless optional
109
+
110
+ !(string && (!string.match(regex) || $~.to_s != string))
111
+ end
112
+ end
113
+ end
@@ -0,0 +1,22 @@
1
+ major, minor, patch = RUBY_VERSION.split('.')
2
+
3
+ $RUBY_1_9 = if major.to_i == 1 && minor.to_i < 9
4
+ # Ruby 1.8 KCODE check. Not needed on 1.9 and later.
5
+ raise("twitter-text requires the $KCODE variable be set to 'UTF8' or 'u'") unless $KCODE[0].chr =~ /u/i
6
+ false
7
+ else
8
+ true
9
+ end
10
+
11
+ %w(
12
+ deprecation
13
+ regex
14
+ rewriter
15
+ autolink
16
+ extractor
17
+ unicode
18
+ validation
19
+ hit_highlighter
20
+ ).each do |name|
21
+ require "twitter-text/#{name}"
22
+ end
data/script/destroy ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+ APP_ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..'))
3
+
4
+ begin
5
+ require 'rubigen'
6
+ rescue LoadError
7
+ require 'rubygems'
8
+ require 'rubigen'
9
+ end
10
+ require 'rubigen/scripts/destroy'
11
+
12
+ ARGV.shift if ['--help', '-h'].include?(ARGV[0])
13
+ RubiGen::Base.use_component_sources! [:newgem_simple, :test_unit]
14
+ RubiGen::Scripts::Destroy.new.run(ARGV)
data/script/generate ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+ APP_ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..'))
3
+
4
+ begin
5
+ require 'rubigen'
6
+ rescue LoadError
7
+ require 'rubygems'
8
+ require 'rubigen'
9
+ end
10
+ require 'rubigen/scripts/generate'
11
+
12
+ ARGV.shift if ['--help', '-h'].include?(ARGV[0])
13
+ RubiGen::Base.use_component_sources! [:newgem_simple, :test_unit]
14
+ RubiGen::Scripts::Generate.new.run(ARGV)