twitter-text-relative 1.6.2.pre.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/.gemtest +0 -0
- data/.gitignore +40 -0
- data/.gitmodules +3 -0
- data/.rspec +2 -0
- data/.travis.yml +4 -0
- data/Gemfile +4 -0
- data/LICENSE +188 -0
- data/README.rdoc +123 -0
- data/Rakefile +64 -0
- data/lib/twitter-text-relative.rb +22 -0
- data/lib/twitter-text-relative/autolink.rb +443 -0
- data/lib/twitter-text-relative/deprecation.rb +15 -0
- data/lib/twitter-text-relative/extractor.rb +328 -0
- data/lib/twitter-text-relative/hash_helper.rb +21 -0
- data/lib/twitter-text-relative/hit_highlighter.rb +86 -0
- data/lib/twitter-text-relative/regex.rb +362 -0
- data/lib/twitter-text-relative/rewriter.rb +59 -0
- data/lib/twitter-text-relative/unicode.rb +26 -0
- data/lib/twitter-text-relative/validation.rb +113 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/spec/autolinking_spec.rb +826 -0
- data/spec/extractor_spec.rb +368 -0
- data/spec/hithighlighter_spec.rb +92 -0
- data/spec/regex_spec.rb +38 -0
- data/spec/rewriter_spec.rb +548 -0
- data/spec/spec_helper.rb +127 -0
- data/spec/test_urls.rb +80 -0
- data/spec/twitter_text_spec.rb +21 -0
- data/spec/unicode_spec.rb +31 -0
- data/spec/validation_spec.rb +43 -0
- data/test/conformance_test.rb +182 -0
- data/twitter-text-relative.gemspec +30 -0
- metadata +203 -0
@@ -0,0 +1,362 @@
|
|
1
|
+
# encoding: UTF-8

module Twitter
  # A collection of regular expressions for parsing Tweet text. The regular expression
  # list is frozen at load time to ensure immutability. These regular expressions are
  # used throughout the <tt>Twitter</tt> classes. Special care has been taken to make
  # sure these regular expressions work with Tweets in all languages.
  class Regex
    # Registry of all named patterns; every pattern is frozen at the bottom of this file.
    REGEXEN = {} # :nodoc:

    # Returns a regex-source fragment matching the codepoint +from+, or the
    # inclusive codepoint range +from+..+to+ when +to+ is given. Emits \u{...}
    # escapes on 1.9-style interpreters and raw UTF-8 characters otherwise.
    # NOTE(review): $RUBY_1_9 is presumably set by the library's entry point
    # before this file loads — confirm against lib/twitter-text-relative.rb.
    def self.regex_range(from, to = nil) # :nodoc:
      if $RUBY_1_9
        if to
          "\\u{#{from.to_s(16).rjust(4, '0')}}-\\u{#{to.to_s(16).rjust(4, '0')}}"
        else
          "\\u{#{from.to_s(16).rjust(4, '0')}}"
        end
      else
        if to
          [from].pack('U') + '-' + [to].pack('U')
        else
          [from].pack('U')
        end
      end
    end

    # Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand
    # to access both the list of characters and a pattern suitable for use with String#split
    # Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE
    UNICODE_SPACES = [
      (0x0009..0x000D).to_a, # White_Space # Cc   [5] <control-0009>..<control-000D>
      0x0020,                # White_Space # Zs       SPACE
      0x0085,                # White_Space # Cc       <control-0085>
      0x00A0,                # White_Space # Zs       NO-BREAK SPACE
      0x1680,                # White_Space # Zs       OGHAM SPACE MARK
      0x180E,                # White_Space # Zs       MONGOLIAN VOWEL SEPARATOR
      (0x2000..0x200A).to_a, # White_Space # Zs  [11] EN QUAD..HAIR SPACE
      0x2028,                # White_Space # Zl       LINE SEPARATOR
      0x2029,                # White_Space # Zp       PARAGRAPH SEPARATOR
      0x202F,                # White_Space # Zs       NARROW NO-BREAK SPACE
      0x205F,                # White_Space # Zs       MEDIUM MATHEMATICAL SPACE
      0x3000,                # White_Space # Zs       IDEOGRAPHIC SPACE
    ].flatten.map{|c| [c].pack('U*')}.freeze
    REGEXEN[:spaces] = /[#{UNICODE_SPACES.join('')}]/o

    # Characters not allowed in Tweets
    INVALID_CHARACTERS = [
      0xFFFE, 0xFEFF, # BOM
      0xFFFF, # Special
      0x202A, 0x202B, 0x202C, 0x202D, 0x202E # Directional change
    ].map{|cp| [cp].pack('U') }.freeze
    REGEXEN[:invalid_control_characters] = /[#{INVALID_CHARACTERS.join('')}]/o

    # Pick a :list_name pattern the running interpreter can compile: \uXXXX
    # escapes need 1.9+/JRuby/Rubinius; older MRI gets raw bytes via eval.
    major, minor, patch = RUBY_VERSION.split('.')
    if major.to_i >= 2 || major.to_i == 1 && minor.to_i >= 9 || (defined?(RUBY_ENGINE) && ["jruby", "rbx"].include?(RUBY_ENGINE))
      REGEXEN[:list_name] = /[a-zA-Z][a-zA-Z0-9_\-\u0080-\u00ff]{0,24}/
    else
      # This line barfs at compile time in Ruby 1.9, JRuby, or Rubinius.
      REGEXEN[:list_name] = eval("/[a-zA-Z][a-zA-Z0-9_\\-\x80-\xff]{0,24}/")
    end

    # Latin accented characters
    # Excludes 0xd7 from the range (the multiplication sign, confusable with "x").
    # Also excludes 0xf7, the division sign
    LATIN_ACCENTS = [
      regex_range(0xc0, 0xd6),
      regex_range(0xd8, 0xf6),
      regex_range(0xf8, 0xff),
      regex_range(0x0100, 0x024f),
      regex_range(0x0253, 0x0254),
      regex_range(0x0256, 0x0257),
      regex_range(0x0259),
      regex_range(0x025b),
      regex_range(0x0263),
      regex_range(0x0268),
      regex_range(0x026f),
      regex_range(0x0272),
      regex_range(0x0289),
      regex_range(0x028b),
      regex_range(0x02bb),
      regex_range(0x0300, 0x036f),
      regex_range(0x1e00, 0x1eff)
    ].join('').freeze

    # Right-to-left script ranges (Arabic, Hebrew and their presentation forms).
    RTL_CHARACTERS = [
      regex_range(0x0600,0x06FF),
      regex_range(0x0750,0x077F),
      regex_range(0x0590,0x05FF),
      regex_range(0xFE70,0xFEFF)
    ].join('').freeze


    NON_LATIN_HASHTAG_CHARS = [
      # Cyrillic (Russian, Ukrainian, etc.)
      regex_range(0x0400, 0x04ff), # Cyrillic
      regex_range(0x0500, 0x0527), # Cyrillic Supplement
      regex_range(0x2de0, 0x2dff), # Cyrillic Extended A
      regex_range(0xa640, 0xa69f), # Cyrillic Extended B
      regex_range(0x0591, 0x05bf), # Hebrew
      regex_range(0x05c1, 0x05c2),
      regex_range(0x05c4, 0x05c5),
      regex_range(0x05c7),
      regex_range(0x05d0, 0x05ea),
      regex_range(0x05f0, 0x05f4),
      regex_range(0xfb12, 0xfb28), # Hebrew Presentation Forms
      regex_range(0xfb2a, 0xfb36),
      regex_range(0xfb38, 0xfb3c),
      regex_range(0xfb3e),
      regex_range(0xfb40, 0xfb41),
      regex_range(0xfb43, 0xfb44),
      regex_range(0xfb46, 0xfb4f),
      regex_range(0x0610, 0x061a), # Arabic
      regex_range(0x0620, 0x065f),
      regex_range(0x066e, 0x06d3),
      regex_range(0x06d5, 0x06dc),
      regex_range(0x06de, 0x06e8),
      regex_range(0x06ea, 0x06ef),
      regex_range(0x06fa, 0x06fc),
      regex_range(0x06ff),
      regex_range(0x0750, 0x077f), # Arabic Supplement
      regex_range(0x08a0),         # Arabic Extended A
      regex_range(0x08a2, 0x08ac),
      regex_range(0x08e4, 0x08fe),
      regex_range(0xfb50, 0xfbb1), # Arabic Pres. Forms A
      regex_range(0xfbd3, 0xfd3d),
      regex_range(0xfd50, 0xfd8f),
      regex_range(0xfd92, 0xfdc7),
      regex_range(0xfdf0, 0xfdfb),
      regex_range(0xfe70, 0xfe74), # Arabic Pres. Forms B
      regex_range(0xfe76, 0xfefc),
      regex_range(0x200c, 0x200c), # Zero-Width Non-Joiner
      regex_range(0x0e01, 0x0e3a), # Thai
      regex_range(0x0e40, 0x0e4e), # Thai (U+0E40..U+0E4E; was mislabeled "Hangul")
      regex_range(0x1100, 0x11ff), # Hangul Jamo
      regex_range(0x3130, 0x3185), # Hangul Compatibility Jamo
      regex_range(0xA960, 0xA97F), # Hangul Jamo Extended-A
      regex_range(0xAC00, 0xD7AF), # Hangul Syllables
      regex_range(0xD7B0, 0xD7FF), # Hangul Jamo Extended-B
      regex_range(0xFFA1, 0xFFDC)  # Half-width Hangul
    ].join('').freeze
    REGEXEN[:latin_accents] = /[#{LATIN_ACCENTS}]+/o

    CJ_HASHTAG_CHARACTERS = [
      regex_range(0x30A1, 0x30FA), regex_range(0x30FC, 0x30FE), # Katakana (full-width)
      regex_range(0xFF66, 0xFF9F), # Katakana (half-width)
      regex_range(0xFF10, 0xFF19), regex_range(0xFF21, 0xFF3A), regex_range(0xFF41, 0xFF5A), # Latin (full-width)
      regex_range(0x3041, 0x3096), regex_range(0x3099, 0x309E), # Hiragana
      regex_range(0x3400, 0x4DBF), # Kanji (CJK Extension A)
      regex_range(0x4E00, 0x9FFF), # Kanji (Unified)
      regex_range(0x20000, 0x2A6DF), # Kanji (CJK Extension B)
      regex_range(0x2A700, 0x2B73F), # Kanji (CJK Extension C)
      regex_range(0x2B740, 0x2B81F), # Kanji (CJK Extension D)
      regex_range(0x2F800, 0x2FA1F), regex_range(0x3003), regex_range(0x3005), regex_range(0x303B) # Kanji (CJK supplement)
    ].join('').freeze

    # Character-class building blocks reused in the patterns below.
    PUNCTUATION_CHARS = '!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~'
    SPACE_CHARS = " \t\n\x0B\f\r"
    CTRL_CHARS = "\x00-\x1F\x7F"

    # A hashtag must contain latin characters, numbers and underscores, but not all numbers.
    HASHTAG_ALPHA = /[a-z_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/io
    HASHTAG_ALPHANUMERIC = /[a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/io
    HASHTAG_BOUNDARY = /\A|\z|[^&a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/o

    HASHTAG = /(#{HASHTAG_BOUNDARY})(#|#)(#{HASHTAG_ALPHANUMERIC}*#{HASHTAG_ALPHA}#{HASHTAG_ALPHANUMERIC}*)/io

    REGEXEN[:valid_hashtag] = /#{HASHTAG}/io
    # Used in Extractor for final filtering
    REGEXEN[:end_hashtag_match] = /\A(?:[##]|:\/\/)/o

    REGEXEN[:valid_mention_preceding_chars] = /(?:[^a-zA-Z0-9_!#\$%&*@@]|^|[rR][tT]:?)/o
    REGEXEN[:at_signs] = /[@@]/
    REGEXEN[:valid_mention_or_list] = /
      (#{REGEXEN[:valid_mention_preceding_chars]})  # $1: Preceeding character
      (#{REGEXEN[:at_signs]})                       # $2: At mark
      ([a-zA-Z0-9_]{1,20})                          # $3: Screen name
      (\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?             # $4: List (optional)
    /ox
    REGEXEN[:valid_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
    # Used in Extractor for final filtering
    REGEXEN[:end_mention_match] = /\A(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o

    # URL related hash regex collection
    REGEXEN[:valid_url_preceding_chars] = /(?:[^A-Z0-9@@$###{INVALID_CHARACTERS.join('')}]|^)/io
    REGEXEN[:invalid_url_without_protocol_preceding_chars] = /[-_.\/]$/
    DOMAIN_VALID_CHARS = "[^#{PUNCTUATION_CHARS}#{SPACE_CHARS}#{CTRL_CHARS}#{INVALID_CHARACTERS.join('')}#{UNICODE_SPACES.join('')}]"
    REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
    REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io

    REGEXEN[:valid_gTLD] = /(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|xxx)(?=[^0-9a-z]|$))/i
    REGEXEN[:valid_ccTLD] = %r{
      (?:
        (?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|
           ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|
           gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|
           lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|
           pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|
           th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)
        (?=[^0-9a-z]|$)
      )
    }ix
    REGEXEN[:valid_punycode] = /(?:xn--[0-9a-z]+)/i

    REGEXEN[:valid_domain] = /(?:
      #{REGEXEN[:valid_subdomain]}*#{REGEXEN[:valid_domain_name]}
      (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
    )/iox

    # This is used in Extractor
    REGEXEN[:valid_ascii_domain] = /
      (?:(?:[A-Za-z0-9\-_]|#{REGEXEN[:latin_accents]})+\.)+
      (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
    /iox

    # This is used in Extractor for stricter t.co URL extraction
    REGEXEN[:valid_tco_url] = /^https?:\/\/t\.co\/[a-z0-9]+/i

    # This is used in Extractor to filter out unwanted URLs.
    REGEXEN[:invalid_short_domain] = /\A#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_ccTLD]}\Z/io

    REGEXEN[:valid_port_number] = /[0-9]+/

    REGEXEN[:valid_general_url_path_chars] = /[a-z0-9!\*';:=\+\,\.\$\/%#\[\]\-_~&|@#{LATIN_ACCENTS}]/io
    # Allow URL paths to contain balanced parens
    #  1. Used in Wikipedia URLs like /Primer_(film)
    #  2. Used in IIS sessions like /S(dfd346)/
    REGEXEN[:valid_url_balanced_parens] = /\(#{REGEXEN[:valid_general_url_path_chars]}+\)/io
    # Valid end-of-path characters (so /foo. does not gobble the period).
    #   1. Allow =&# for empty URL parameters and other URL-join artifacts
    REGEXEN[:valid_url_path_ending_chars] = /[a-z0-9=_#\/\+\-#{LATIN_ACCENTS}]|(?:#{REGEXEN[:valid_url_balanced_parens]})/io
    REGEXEN[:valid_url_path] = /(?:
      (?:
        #{REGEXEN[:valid_general_url_path_chars]}*
        (?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)*
        #{REGEXEN[:valid_url_path_ending_chars]}
      )|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/)
    )/iox

    REGEXEN[:valid_url_query_chars] = /[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]/i
    REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/]/i
    REGEXEN[:valid_url] = %r{
      (                                                                                     #   $1 total match
        (#{REGEXEN[:valid_url_preceding_chars]})                                            #   $2 Preceeding chracter
        (                                                                                   #   $3 URL
          (https?:\/\/)?                                                                    #   $4 Protocol (optional)
          (#{REGEXEN[:valid_domain]})                                                       #   $5 Domain(s)
          (?::(#{REGEXEN[:valid_port_number]}))?                                            #   $6 Port number (optional)
          (/#{REGEXEN[:valid_url_path]}*)?                                                  #   $7 URL Path and anchor
          (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? #   $8 Query String
        )
      )
    }iox;

    REGEXEN[:cashtag] = /[a-z]{1,6}(?:[._][a-z]{1,2})?/i
    REGEXEN[:valid_cashtag] = /(^|#{REGEXEN[:spaces]})(\$)(#{REGEXEN[:cashtag]})(?=$|\s|[#{PUNCTUATION_CHARS}])/i

    # These URL validation pattern strings are based on the ABNF from RFC 3986
    REGEXEN[:validate_url_unreserved] = /[a-z0-9\-._~]/i
    REGEXEN[:validate_url_pct_encoded] = /(?:%[0-9a-f]{2})/i
    REGEXEN[:validate_url_sub_delims] = /[!$&'()*+,;=]/i
    REGEXEN[:validate_url_pchar] = /(?:
      #{REGEXEN[:validate_url_unreserved]}|
      #{REGEXEN[:validate_url_pct_encoded]}|
      #{REGEXEN[:validate_url_sub_delims]}|
      [:\|@]
    )/iox

    REGEXEN[:validate_url_scheme] = /(?:[a-z][a-z0-9+\-.]*)/i
    REGEXEN[:validate_url_userinfo] = /(?:
      #{REGEXEN[:validate_url_unreserved]}|
      #{REGEXEN[:validate_url_pct_encoded]}|
      #{REGEXEN[:validate_url_sub_delims]}|
      :
    )*/iox

    REGEXEN[:validate_url_dec_octet] = /(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))/i
    REGEXEN[:validate_url_ipv4] =
      /(?:#{REGEXEN[:validate_url_dec_octet]}(?:\.#{REGEXEN[:validate_url_dec_octet]}){3})/iox

    # Punting on real IPv6 validation for now
    REGEXEN[:validate_url_ipv6] = /(?:\[[a-f0-9:\.]+\])/i

    # Also punting on IPvFuture for now
    REGEXEN[:validate_url_ip] = /(?:
      #{REGEXEN[:validate_url_ipv4]}|
      #{REGEXEN[:validate_url_ipv6]}
    )/iox

    # This is more strict than the rfc specifies
    REGEXEN[:validate_url_subdomain_segment] = /(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)/i
    REGEXEN[:validate_url_domain_segment] = /(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)/i
    REGEXEN[:validate_url_domain_tld] = /(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)/i
    REGEXEN[:validate_url_domain] = /(?:(?:#{REGEXEN[:validate_url_subdomain_segment]}\.)*
                                    (?:#{REGEXEN[:validate_url_domain_segment]}\.)
                                    #{REGEXEN[:validate_url_domain_tld]})/iox

    REGEXEN[:validate_url_host] = /(?:
      #{REGEXEN[:validate_url_ip]}|
      #{REGEXEN[:validate_url_domain]}
    )/iox

    # Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences
    REGEXEN[:validate_url_unicode_subdomain_segment] =
      /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9_\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
    REGEXEN[:validate_url_unicode_domain_segment] =
      /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
    REGEXEN[:validate_url_unicode_domain_tld] =
      /(?:(?:[a-z]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
    REGEXEN[:validate_url_unicode_domain] = /(?:(?:#{REGEXEN[:validate_url_unicode_subdomain_segment]}\.)*
                                            (?:#{REGEXEN[:validate_url_unicode_domain_segment]}\.)
                                            #{REGEXEN[:validate_url_unicode_domain_tld]})/iox

    REGEXEN[:validate_url_unicode_host] = /(?:
      #{REGEXEN[:validate_url_ip]}|
      #{REGEXEN[:validate_url_unicode_domain]}
    )/iox

    REGEXEN[:validate_url_port] = /[0-9]{1,5}/

    REGEXEN[:validate_url_unicode_authority] = %r{
      (?:(#{REGEXEN[:validate_url_userinfo]})@)?     #  $1 userinfo
      (#{REGEXEN[:validate_url_unicode_host]})       #  $2 host
      (?::(#{REGEXEN[:validate_url_port]}))?         #  $3 port
    }iox

    REGEXEN[:validate_url_authority] = %r{
      (?:(#{REGEXEN[:validate_url_userinfo]})@)?     #  $1 userinfo
      (#{REGEXEN[:validate_url_host]})               #  $2 host
      (?::(#{REGEXEN[:validate_url_port]}))?         #  $3 port
    }iox

    REGEXEN[:validate_url_path] = %r{(/#{REGEXEN[:validate_url_pchar]}*)*}i
    REGEXEN[:validate_url_query] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
    REGEXEN[:validate_url_fragment] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i

    # Modified version of RFC 3986 Appendix B
    REGEXEN[:validate_url_unencoded] = %r{
      \A                                #  Full URL
      (?:
        ([^:/?#]+)://                   #  $1 Scheme
      )?
      ([^/?#]*)                         #  $2 Authority
      ([^?#]*)                          #  $3 Path
      (?:
        \?([^#]*)                       #  $4 Query
      )?
      (?:
        \#(.*)                          #  $5 Fragment
      )?\Z
    }ix

    REGEXEN[:rtl_chars] = /[#{RTL_CHARACTERS}]/io

    # Freeze each compiled pattern so the registry is effectively immutable.
    REGEXEN.each_pair{|k,v| v.freeze }

    # Return the regular expression for a given <tt>key</tt>. If the <tt>key</tt>
    # is not a known symbol a <tt>nil</tt> will be returned.
    def self.[](key)
      REGEXEN[key]
    end
  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
module Twitter
  # A module that provides base methods to rewrite usernames, lists, hashtags and URLs.
  module Rewriter extend self
    # Rebuilds +text+ with each extracted entity replaced by whatever the given
    # block returns. +entities+ is an array of hashes, each carrying an
    # <tt>:indices</tt> pair ([start, end] offsets in characters). The block is
    # called with (entity, chars), where chars is the full character array.
    def rewrite_entities(text, entities)
      chars = text.to_s.to_char_a

      # sort by start index so the output can be assembled left to right
      entities = entities.sort_by{|entity| entity[:indices].first}

      result = []
      last_index = entities.inject(0) do |index, entity|
        result << chars[index...entity[:indices].first] # untouched text before the entity
        result << yield(entity, chars)                  # the entity's replacement
        entity[:indices].last
      end
      result << chars[last_index..-1] # trailing text after the final entity

      result.flatten.join
    end

    # These methods are deprecated, will be removed in future.
    extend Deprecation

    # Applies the rewriters selected in +options+ (:hashtags, :urls,
    # :usernames_or_lists => block) to +text+, feeding the output of each
    # step into the next.
    def rewrite(text, options = {})
      # FIX: the inject block previously declared a single parameter (|key|),
      # which bound the accumulator instead of the entity type, so options[key]
      # was never a known symbol and no rewriting ever took effect. Bind both
      # the accumulator and the key, and thread the rewritten text through.
      [:hashtags, :urls, :usernames_or_lists].inject(text) do |current_text, key|
        options[key] ? send(:"rewrite_#{key}", current_text, &options[key]) : current_text
      end
    end
    deprecate :rewrite, :rewrite_entities

    # Rewrites @mentions and @user/list references via the given block, which
    # receives (at_sign_char, screen_name, list_slug_or_nil).
    def rewrite_usernames_or_lists(text)
      entities = Extractor.extract_mentions_or_lists_with_indices(text)
      rewrite_entities(text, entities) do |entity, chars|
        at = chars[entity[:indices].first]
        list_slug = entity[:list_slug]
        list_slug = nil if list_slug.empty?
        yield(at, entity[:screen_name], list_slug)
      end
    end
    deprecate :rewrite_usernames_or_lists, :rewrite_entities

    # Rewrites #hashtags via the given block, which receives
    # (hash_char, hashtag_text).
    def rewrite_hashtags(text)
      entities = Extractor.extract_hashtags_with_indices(text)
      rewrite_entities(text, entities) do |entity, chars|
        hash = chars[entity[:indices].first]
        yield(hash, entity[:hashtag])
      end
    end
    deprecate :rewrite_hashtags, :rewrite_entities

    # Rewrites URLs (only those with an explicit protocol) via the given
    # block, which receives the matched URL string.
    def rewrite_urls(text)
      entities = Extractor.extract_urls_with_indices(text, :extract_url_without_protocol => false)
      rewrite_entities(text, entities) do |entity, chars|
        yield(entity[:url])
      end
    end
    deprecate :rewrite_urls, :rewrite_entities
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module Twitter
  # Lazily materializes constants named Uxxxx (or U_xxxx) for every Unicode
  # codepoint from U0000 through U10FFFF. Each constant holds the frozen UTF-8
  # string for its codepoint.
  #
  # Examples:
  #      copyright = Unicode::U00A9
  #      euro      = Unicode::U20AC
  #      infinity  = Unicode::U221E
  #
  module Unicode
    # Accepts 4-5 hex digits, or a 6-digit value in the 10xxxx plane.
    CODEPOINT_REGEX = /^U_?([0-9a-fA-F]{4,5}|10[0-9a-fA-F]{4})$/

    # Called on first reference to an undefined constant. Well-formed
    # codepoint names are converted to a frozen UTF-8 string, installed as a
    # real constant (so this hook fires only once per name), and returned.
    # Anything else raises NameError.
    def self.const_missing(name)
      match = CODEPOINT_REGEX.match(name.to_s)
      raise NameError, "Uninitialized constant: Unicode::#{name}" unless match

      codepoint = match[1].to_i(16)
      const_set(name, [codepoint].pack("U").freeze)
    end
  end

end
|
@@ -0,0 +1,113 @@
|
|
1
|
+
require 'unf'

module Twitter
  # Pre-validation helpers for Tweet text, usernames, lists, hashtags and URLs.
  module Validation extend self
    MAX_LENGTH = 140

    DEFAULT_TCO_URL_LENGTHS = {
      :short_url_length => 22,
      :short_url_length_https => 23,
      :characters_reserved_per_media => 22
    }.freeze

    # Returns the display length of +text+: the number of Unicode codepoints
    # after NFC normalization (see http://www.unicode.org/reports/tr15), with
    # every extracted URL counted at its t.co-wrapped length rather than its
    # raw length. Normalization makes the count independent of the transmitted
    # form; e.g. U+0065 + U+0301 normalizes to the single codepoint U+00E9 and
    # counts as 1, the same as sending U+00E9 directly.
    def tweet_length(text, options = {})
      url_lengths = DEFAULT_TCO_URL_LENGTHS.merge(options)

      count = text.to_nfc.unpack("U*").length

      Twitter::Extractor.extract_urls_with_indices(text) do |url, start_position, end_position|
        # Swap the URL's actual character span for its shortened t.co length.
        count -= end_position - start_position
        count += url.downcase =~ /^https:\/\// ? url_lengths[:short_url_length_https] : url_lengths[:short_url_length]
      end

      count
    end

    # Checks +text+ for any reason it could not be posted as a Tweet; a quick
    # client-side pre-validation before hitting api.twitter.com.
    #
    # Returns false when the text is valid, otherwise one of:
    #   :too_long           - longer than MAX_LENGTH display characters
    #   :empty              - nil or empty string
    #   :invalid_characters - non-Unicode input or a disallowed codepoint
    def tweet_invalid?(text)
      return :empty if !text || text.empty?

      begin
        return :too_long if tweet_length(text) > MAX_LENGTH

        has_forbidden_char = Twitter::Regex::INVALID_CHARACTERS.any? do |forbidden|
          text.include?(forbidden)
        end
        return :invalid_characters if has_forbidden_char
      rescue ArgumentError
        # Input that is not valid Unicode cannot be measured or scanned.
        return :invalid_characters
      end

      false
    end

    # True when +text+ passes tweet_invalid? with no complaints.
    def valid_tweet_text?(text)
      !tweet_invalid?(text)
    end

    # True when +username+ is exactly one well-formed @mention.
    def valid_username?(username)
      return false if !username || username.empty?

      mentions = Twitter::Extractor.extract_mentioned_screen_names(username)
      # Extraction drops the leading @ sign, hence the [1..-1] comparison.
      mentions.size == 1 && mentions.first == username[1..-1]
    end

    VALID_LIST_RE = /\A#{Twitter::Regex[:valid_mention_or_list]}\z/o

    # True when +username_list+ is exactly one @user/list reference carrying a
    # non-empty list slug.
    def valid_list?(username_list)
      match_data = username_list.match(VALID_LIST_RE)
      # Must match the whole string ($1 empty => nothing precedes the @) and
      # must include a list slug ($4).
      !!(match_data && match_data[1] == "" && match_data[4] && !match_data[4].empty?)
    end

    # True when +hashtag+ is exactly one well-formed #hashtag.
    def valid_hashtag?(hashtag)
      return false if !hashtag || hashtag.empty?

      tags = Twitter::Extractor.extract_hashtags(hashtag)
      # Extraction drops the leading # sign, hence the [1..-1] comparison.
      tags.size == 1 && tags.first == hashtag[1..-1]
    end

    # Validates +url+ against the RFC 3986-derived patterns in Twitter::Regex.
    # +unicode_domains+ selects the IDN-aware authority pattern;
    # +require_protocol+ demands an explicit http/https scheme.
    def valid_url?(url, unicode_domains=true, require_protocol=true)
      return false if !url || url.empty?

      parts = url.match(Twitter::Regex[:validate_url_unencoded])
      return false unless parts && parts.to_s == url

      scheme, authority, path, query, fragment = parts.captures

      if require_protocol
        return false unless valid_match?(scheme, Twitter::Regex[:validate_url_scheme]) && scheme.match(/\Ahttps?\Z/i)
      end
      return false unless valid_match?(path, Twitter::Regex[:validate_url_path])
      return false unless valid_match?(query, Twitter::Regex[:validate_url_query], true)
      return false unless valid_match?(fragment, Twitter::Regex[:validate_url_fragment], true)

      authority_regex = unicode_domains ? Twitter::Regex[:validate_url_unicode_authority] : Twitter::Regex[:validate_url_authority]
      !!valid_match?(authority, authority_regex)
    end

    private

    # When +optional+ is false: +candidate+ must exist and match +pattern+ in
    # full. When true: a nil candidate is acceptable, but a present one must
    # still match in full.
    def valid_match?(candidate, pattern, optional=false)
      unless optional
        return candidate && candidate.match(pattern) && $~.to_s == candidate
      end

      !(candidate && (!candidate.match(pattern) || $~.to_s != candidate))
    end
  end
end
|