ritter 0.0.87 → 0.0.88
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/.gemtest +0 -0
- data/.gitignore +40 -0
- data/.gitmodules +3 -0
- data/.rspec +2 -0
- data/.travis.yml +4 -0
- data/Gemfile +4 -0
- data/LICENSE +188 -0
- data/README.rdoc +112 -0
- data/Rakefile +64 -0
- data/lib/twitter-text/autolink.rb +443 -0
- data/lib/twitter-text/deprecation.rb +15 -0
- data/lib/twitter-text/extractor.rb +328 -0
- data/lib/twitter-text/hash_helper.rb +21 -0
- data/lib/twitter-text/hit_highlighter.rb +86 -0
- data/lib/twitter-text/regex.rb +366 -0
- data/lib/twitter-text/rewriter.rb +59 -0
- data/lib/twitter-text/unicode.rb +26 -0
- data/lib/twitter-text/validation.rb +113 -0
- data/lib/twitter-text.rb +22 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/spec/autolinking_spec.rb +826 -0
- data/spec/extractor_spec.rb +368 -0
- data/spec/hithighlighter_spec.rb +92 -0
- data/spec/regex_spec.rb +38 -0
- data/spec/rewriter_spec.rb +548 -0
- data/spec/spec_helper.rb +127 -0
- data/spec/test_urls.rb +80 -0
- data/spec/twitter_text_spec.rb +21 -0
- data/spec/unicode_spec.rb +31 -0
- data/spec/validation_spec.rb +43 -0
- data/test/conformance_test.rb +182 -0
- data/twitter-text.gemspec +29 -0
- metadata +42 -8
@@ -0,0 +1,366 @@
|
|
1
|
+
# encoding: UTF-8

module Twitter
  # A collection of regular expressions for parsing Tweet text. The regular expression
  # list is frozen at load time to ensure immutability. These regular expressions are
  # used throughout the <tt>Twitter</tt> classes. Special care has been taken to make
  # sure these regular expressions work with Tweets in all languages.
  class Regex
    REGEXEN = {} # :nodoc:

    # Build a regex-source fragment covering a single codepoint or an inclusive
    # codepoint range. Emits \u{...} escapes when $RUBY_1_9 is truthy; otherwise
    # packs the codepoints into raw UTF-8 bytes (Ruby 1.8 compatibility).
    def self.regex_range(from, to = nil) # :nodoc:
      if $RUBY_1_9
        if to
          "\\u{#{from.to_s(16).rjust(4, '0')}}-\\u{#{to.to_s(16).rjust(4, '0')}}"
        else
          "\\u{#{from.to_s(16).rjust(4, '0')}}"
        end
      else
        if to
          [from].pack('U') + '-' + [to].pack('U')
        else
          [from].pack('U')
        end
      end
    end

    # Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand
    # to access both the list of characters and a pattern suitable for use with String#split
    # Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE
    UNICODE_SPACES = [
      (0x0009..0x000D).to_a, # White_Space # Cc   [5] <control-0009>..<control-000D>
      0x0020,                # White_Space # Zs       SPACE
      0x0085,                # White_Space # Cc       <control-0085>
      0x00A0,                # White_Space # Zs       NO-BREAK SPACE
      0x1680,                # White_Space # Zs       OGHAM SPACE MARK
      0x180E,                # White_Space # Zs       MONGOLIAN VOWEL SEPARATOR
      (0x2000..0x200A).to_a, # White_Space # Zs  [11] EN QUAD..HAIR SPACE
      0x2028,                # White_Space # Zl       LINE SEPARATOR
      0x2029,                # White_Space # Zp       PARAGRAPH SEPARATOR
      0x202F,                # White_Space # Zs       NARROW NO-BREAK SPACE
      0x205F,                # White_Space # Zs       MEDIUM MATHEMATICAL SPACE
      0x3000,                # White_Space # Zs       IDEOGRAPHIC SPACE
    ].flatten.map{|c| [c].pack('U*')}.freeze
    REGEXEN[:spaces] = /[#{UNICODE_SPACES.join('')}]/o

    # Characters not allowed in Tweets
    INVALID_CHARACTERS = [
      0xFFFE, 0xFEFF, # BOM
      0xFFFF,         # Special
      0x202A, 0x202B, 0x202C, 0x202D, 0x202E # Directional change
    ].map{|cp| [cp].pack('U') }.freeze
    REGEXEN[:invalid_control_characters] = /[#{INVALID_CHARACTERS.join('')}]/o

    major, minor, patch = RUBY_VERSION.split('.')
    if major.to_i >= 2 || major.to_i == 1 && minor.to_i >= 9 || (defined?(RUBY_ENGINE) && ["jruby", "rbx"].include?(RUBY_ENGINE))
      REGEXEN[:list_name] = /[a-zA-Z][a-zA-Z0-9_\-\u0080-\u00ff]{0,24}/
    else
      # This line barfs at compile time in Ruby 1.9, JRuby, or Rubinius.
      REGEXEN[:list_name] = eval("/[a-zA-Z][a-zA-Z0-9_\\-\x80-\xff]{0,24}/")
    end

    # Latin accented characters
    # Excludes 0xd7 from the range (the multiplication sign, confusable with "x").
    # Also excludes 0xf7, the division sign
    LATIN_ACCENTS = [
      regex_range(0xc0, 0xd6),
      regex_range(0xd8, 0xf6),
      regex_range(0xf8, 0xff),
      regex_range(0x0100, 0x024f),
      regex_range(0x0253, 0x0254),
      regex_range(0x0256, 0x0257),
      regex_range(0x0259),
      regex_range(0x025b),
      regex_range(0x0263),
      regex_range(0x0268),
      regex_range(0x026f),
      regex_range(0x0272),
      regex_range(0x0289),
      regex_range(0x028b),
      regex_range(0x02bb),
      regex_range(0x0300, 0x036f),
      regex_range(0x1e00, 0x1eff)
    ].join('').freeze

    RTL_CHARACTERS = [
      regex_range(0x0600,0x06FF),
      regex_range(0x0750,0x077F),
      regex_range(0x0590,0x05FF),
      regex_range(0xFE70,0xFEFF)
    ].join('').freeze

    NON_LATIN_HASHTAG_CHARS = [
      # Cyrillic (Russian, Ukrainian, etc.)
      regex_range(0x0400, 0x04ff), # Cyrillic
      regex_range(0x0500, 0x0527), # Cyrillic Supplement
      regex_range(0x2de0, 0x2dff), # Cyrillic Extended A
      regex_range(0xa640, 0xa69f), # Cyrillic Extended B
      regex_range(0x0591, 0x05bf), # Hebrew
      regex_range(0x05c1, 0x05c2),
      regex_range(0x05c4, 0x05c5),
      regex_range(0x05c7),
      regex_range(0x05d0, 0x05ea),
      regex_range(0x05f0, 0x05f4),
      regex_range(0xfb12, 0xfb28), # Hebrew Presentation Forms
      regex_range(0xfb2a, 0xfb36),
      regex_range(0xfb38, 0xfb3c),
      regex_range(0xfb3e),
      regex_range(0xfb40, 0xfb41),
      regex_range(0xfb43, 0xfb44),
      regex_range(0xfb46, 0xfb4f),
      regex_range(0x0610, 0x061a), # Arabic
      regex_range(0x0620, 0x065f),
      regex_range(0x066e, 0x06d3),
      regex_range(0x06d5, 0x06dc),
      regex_range(0x06de, 0x06e8),
      regex_range(0x06ea, 0x06ef),
      regex_range(0x06fa, 0x06fc),
      regex_range(0x06ff),
      regex_range(0x0750, 0x077f), # Arabic Supplement
      regex_range(0x08a0),         # Arabic Extended A
      regex_range(0x08a2, 0x08ac),
      regex_range(0x08e4, 0x08fe),
      regex_range(0xfb50, 0xfbb1), # Arabic Pres. Forms A
      regex_range(0xfbd3, 0xfd3d),
      regex_range(0xfd50, 0xfd8f),
      regex_range(0xfd92, 0xfdc7),
      regex_range(0xfdf0, 0xfdfb),
      regex_range(0xfe70, 0xfe74), # Arabic Pres. Forms B
      regex_range(0xfe76, 0xfefc),
      regex_range(0x200c, 0x200c), # Zero-Width Non-Joiner
      regex_range(0x0e01, 0x0e3a), # Thai
      regex_range(0x0e40, 0x0e4e), # Hangul (Korean)
      regex_range(0x1100, 0x11ff), # Hangul Jamo
      regex_range(0x3130, 0x3185), # Hangul Compatibility Jamo
      regex_range(0xA960, 0xA97F), # Hangul Jamo Extended-A
      regex_range(0xAC00, 0xD7AF), # Hangul Syllables
      regex_range(0xD7B0, 0xD7FF), # Hangul Jamo Extended-B
      regex_range(0xFFA1, 0xFFDC)  # Half-width Hangul
    ].join('').freeze
    REGEXEN[:latin_accents] = /[#{LATIN_ACCENTS}]+/o

    CJ_HASHTAG_CHARACTERS = [
      regex_range(0x30A1, 0x30FA), regex_range(0x30FC, 0x30FE), # Katakana (full-width)
      regex_range(0xFF66, 0xFF9F), # Katakana (half-width)
      regex_range(0xFF10, 0xFF19), regex_range(0xFF21, 0xFF3A), regex_range(0xFF41, 0xFF5A), # Latin (full-width)
      regex_range(0x3041, 0x3096), regex_range(0x3099, 0x309E), # Hiragana
      regex_range(0x3400, 0x4DBF), # Kanji (CJK Extension A)
      regex_range(0x4E00, 0x9FFF), # Kanji (Unified)
      regex_range(0x20000, 0x2A6DF), # Kanji (CJK Extension B)
      regex_range(0x2A700, 0x2B73F), # Kanji (CJK Extension C)
      regex_range(0x2B740, 0x2B81F), # Kanji (CJK Extension D)
      regex_range(0x2F800, 0x2FA1F), regex_range(0x3003), regex_range(0x3005), regex_range(0x303B) # Kanji (CJK supplement)
    ].join('').freeze

    PUNCTUATION_CHARS = '!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~'
    SPACE_CHARS = " \t\n\x0B\f\r"
    CTRL_CHARS = "\x00-\x1F\x7F"

    # A hashtag must contain latin characters, numbers and underscores, but not all numbers.
    HASHTAG_ALPHA = /[a-z_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/io
    HASHTAG_ALPHANUMERIC = /[a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/io
    HASHTAG_BOUNDARY = /\A|\z|[^&a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/o

    # NOTE(review): this release had replaced the pattern below with the
    # far-too-permissive /[#].\S+/, which matches ANY run of non-space
    # characters after '#' and contradicts the rule documented above.
    # Restored the strict boundary + alpha-requiring pattern. Capture groups:
    # $1 boundary, $2 hash sign (ASCII # or full-width U+FF03), $3 hashtag text.
    HASHTAG = /(#{HASHTAG_BOUNDARY})([#\uFF03])(#{HASHTAG_ALPHANUMERIC}*#{HASHTAG_ALPHA}#{HASHTAG_ALPHANUMERIC}*)/io

    REGEXEN[:valid_hashtag] = /#{HASHTAG}/io
    # Used in Extractor for final filtering
    REGEXEN[:end_hashtag_match] = /\A(?:[#\uFF03]|:\/\/)/o

    REGEXEN[:valid_mention_preceding_chars] = /(?:[^a-zA-Z0-9_!#\$%&*@\uFF20]|^|RT:?)/o
    REGEXEN[:at_signs] = /[@\uFF20]/
    REGEXEN[:valid_mention_or_list] = /
      (#{REGEXEN[:valid_mention_preceding_chars]})  # $1: Preceding character
      (#{REGEXEN[:at_signs]})                       # $2: At mark
      ([a-zA-Z0-9_]{1,20})                          # $3: Screen name
      (\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?             # $4: List (optional)
    /ox
    REGEXEN[:valid_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
    # Used in Extractor for final filtering
    REGEXEN[:end_mention_match] = /\A(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o

    # URL related hash regex collection
    REGEXEN[:valid_url_preceding_chars] = /(?:[^A-Z0-9@\uFF20$#\uFF03#{INVALID_CHARACTERS.join('')}]|^)/io
    REGEXEN[:invalid_url_without_protocol_preceding_chars] = /[-_.\/]$/
    DOMAIN_VALID_CHARS = "[^#{PUNCTUATION_CHARS}#{SPACE_CHARS}#{CTRL_CHARS}#{INVALID_CHARACTERS.join('')}#{UNICODE_SPACES.join('')}]"
    REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
    REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io

    REGEXEN[:valid_gTLD] = /(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|xxx)(?=[^0-9a-z]|$))/i
    REGEXEN[:valid_ccTLD] = %r{
      (?:
        (?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|
        ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|
        gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|
        lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|
        pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|
        th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)
        (?=[^0-9a-z]|$)
      )
    }ix
    REGEXEN[:valid_punycode] = /(?:xn--[0-9a-z]+)/i

    REGEXEN[:valid_domain] = /(?:
      #{REGEXEN[:valid_subdomain]}*#{REGEXEN[:valid_domain_name]}
      (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
    )/iox

    # This is used in Extractor
    REGEXEN[:valid_ascii_domain] = /
      (?:(?:[A-Za-z0-9\-_]|#{REGEXEN[:latin_accents]})+\.)+
      (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
    /iox

    # This is used in Extractor for stricter t.co URL extraction
    REGEXEN[:valid_tco_url] = /^https?:\/\/t\.co\/[a-z0-9]+/i

    # This is used in Extractor to filter out unwanted URLs.
    REGEXEN[:invalid_short_domain] = /\A#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_ccTLD]}\Z/io

    REGEXEN[:valid_port_number] = /[0-9]+/

    REGEXEN[:valid_general_url_path_chars] = /[a-z0-9!\*';:=\+\,\.\$\/%#\[\]\-_~&|@#{LATIN_ACCENTS}]/io
    # Allow URL paths to contain balanced parens
    #  1. Used in Wikipedia URLs like /Primer_(film)
    #  2. Used in IIS sessions like /S(dfd346)/
    REGEXEN[:valid_url_balanced_parens] = /\(#{REGEXEN[:valid_general_url_path_chars]}+\)/io
    # Valid end-of-path characters (so /foo. does not gobble the period).
    #  1. Allow =&# for empty URL parameters and other URL-join artifacts
    REGEXEN[:valid_url_path_ending_chars] = /[a-z0-9=_#\/\+\-#{LATIN_ACCENTS}]|(?:#{REGEXEN[:valid_url_balanced_parens]})/io
    REGEXEN[:valid_url_path] = /(?:
      (?:
        #{REGEXEN[:valid_general_url_path_chars]}*
        (?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)*
        #{REGEXEN[:valid_url_path_ending_chars]}
      )|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/)
    )/iox

    REGEXEN[:valid_url_query_chars] = /[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]/i
    REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/]/i
    REGEXEN[:valid_url] = %r{
      (                                                                                     #   $1 total match
        (#{REGEXEN[:valid_url_preceding_chars]})                                            #   $2 Preceding character
        (                                                                                   #   $3 URL
          (https?:\/\/)?                                                                    #   $4 Protocol (optional)
          (#{REGEXEN[:valid_domain]})                                                       #   $5 Domain(s)
          (?::(#{REGEXEN[:valid_port_number]}))?                                            #   $6 Port number (optional)
          (/#{REGEXEN[:valid_url_path]}*)?                                                  #   $7 URL Path and anchor
          (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? #   $8 Query String
        )
      )
    }iox

    REGEXEN[:cashtag] = /[a-z]{1,6}(?:[._][a-z]{1,2})?/i
    REGEXEN[:valid_cashtag] = /(^|#{REGEXEN[:spaces]})(\$)(#{REGEXEN[:cashtag]})(?=$|\s|[#{PUNCTUATION_CHARS}])/i

    # These URL validation pattern strings are based on the ABNF from RFC 3986
    REGEXEN[:validate_url_unreserved] = /[a-z0-9\-._~]/i
    REGEXEN[:validate_url_pct_encoded] = /(?:%[0-9a-f]{2})/i
    REGEXEN[:validate_url_sub_delims] = /[!$&'()*+,;=]/i
    REGEXEN[:validate_url_pchar] = /(?:
      #{REGEXEN[:validate_url_unreserved]}|
      #{REGEXEN[:validate_url_pct_encoded]}|
      #{REGEXEN[:validate_url_sub_delims]}|
      [:\|@]
    )/iox

    REGEXEN[:validate_url_scheme] = /(?:[a-z][a-z0-9+\-.]*)/i
    REGEXEN[:validate_url_userinfo] = /(?:
      #{REGEXEN[:validate_url_unreserved]}|
      #{REGEXEN[:validate_url_pct_encoded]}|
      #{REGEXEN[:validate_url_sub_delims]}|
      :
    )*/iox

    REGEXEN[:validate_url_dec_octet] = /(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))/i
    REGEXEN[:validate_url_ipv4] =
      /(?:#{REGEXEN[:validate_url_dec_octet]}(?:\.#{REGEXEN[:validate_url_dec_octet]}){3})/iox

    # Punting on real IPv6 validation for now
    REGEXEN[:validate_url_ipv6] = /(?:\[[a-f0-9:\.]+\])/i

    # Also punting on IPvFuture for now
    REGEXEN[:validate_url_ip] = /(?:
      #{REGEXEN[:validate_url_ipv4]}|
      #{REGEXEN[:validate_url_ipv6]}
    )/iox

    # This is more strict than the rfc specifies
    REGEXEN[:validate_url_subdomain_segment] = /(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)/i
    REGEXEN[:validate_url_domain_segment] = /(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)/i
    REGEXEN[:validate_url_domain_tld] = /(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)/i
    REGEXEN[:validate_url_domain] = /(?:(?:#{REGEXEN[:validate_url_subdomain_segment]}\.)*
                                     (?:#{REGEXEN[:validate_url_domain_segment]}\.)
                                     #{REGEXEN[:validate_url_domain_tld]})/iox

    REGEXEN[:validate_url_host] = /(?:
      #{REGEXEN[:validate_url_ip]}|
      #{REGEXEN[:validate_url_domain]}
    )/iox

    # Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences
    REGEXEN[:validate_url_unicode_subdomain_segment] =
      /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9_\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
    REGEXEN[:validate_url_unicode_domain_segment] =
      /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
    REGEXEN[:validate_url_unicode_domain_tld] =
      /(?:(?:[a-z]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
    REGEXEN[:validate_url_unicode_domain] = /(?:(?:#{REGEXEN[:validate_url_unicode_subdomain_segment]}\.)*
                                             (?:#{REGEXEN[:validate_url_unicode_domain_segment]}\.)
                                             #{REGEXEN[:validate_url_unicode_domain_tld]})/iox

    REGEXEN[:validate_url_unicode_host] = /(?:
      #{REGEXEN[:validate_url_ip]}|
      #{REGEXEN[:validate_url_unicode_domain]}
    )/iox

    REGEXEN[:validate_url_port] = /[0-9]{1,5}/

    REGEXEN[:validate_url_unicode_authority] = %r{
      (?:(#{REGEXEN[:validate_url_userinfo]})@)?     #  $1 userinfo
      (#{REGEXEN[:validate_url_unicode_host]})       #  $2 host
      (?::(#{REGEXEN[:validate_url_port]}))?         #  $3 port
    }iox

    REGEXEN[:validate_url_authority] = %r{
      (?:(#{REGEXEN[:validate_url_userinfo]})@)?     #  $1 userinfo
      (#{REGEXEN[:validate_url_host]})               #  $2 host
      (?::(#{REGEXEN[:validate_url_port]}))?         #  $3 port
    }iox

    REGEXEN[:validate_url_path] = %r{(/#{REGEXEN[:validate_url_pchar]}*)*}i
    REGEXEN[:validate_url_query] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
    REGEXEN[:validate_url_fragment] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i

    # Modified version of RFC 3986 Appendix B
    REGEXEN[:validate_url_unencoded] = %r{
      \A                                #  Full URL
      (?:
        ([^:/?#]+)://                   #  $1 Scheme
      )?
      ([^/?#]*)                         #  $2 Authority
      ([^?#]*)                          #  $3 Path
      (?:
        \?([^#]*)                       #  $4 Query
      )?
      (?:
        \#(.*)                          #  $5 Fragment
      )?\Z
    }ix

    REGEXEN[:rtl_chars] = /[#{RTL_CHARACTERS}]/io

    REGEXEN.each_pair{|k,v| v.freeze }

    # Return the regular expression for a given <tt>key</tt>. If the <tt>key</tt>
    # is not a known symbol a <tt>nil</tt> will be returned.
    def self.[](key)
      REGEXEN[key]
    end
  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
module Twitter
  # A module that provides base methods to rewrite usernames, lists, hashtags and URLs.
  module Rewriter extend self
    # Rewrite each extracted entity in +text+ via the supplied block.
    #
    # entities - array of hashes, each carrying a half-open [start, end)
    #            character span under :indices. The block is called with
    #            (entity, chars) and its return value replaces that span.
    #
    # Returns the rewritten string.
    def rewrite_entities(text, entities)
      chars = text.to_s.to_char_a

      # sort by start index
      entities = entities.sort_by{|entity| entity[:indices].first}

      result = []
      last_index = entities.inject(0) do |last_index, entity|
        result << chars[last_index...entity[:indices].first]
        result << yield(entity, chars)
        entity[:indices].last
      end
      result << chars[last_index..-1]

      result.flatten.join
    end

    # These methods are deprecated, will be removed in future.
    extend Deprecation

    # Apply the hashtag, URL, and username/list rewriters in turn, threading
    # the progressively rewritten text through each enabled step.
    def rewrite(text, options = {})
      # BUGFIX: the inject block previously declared a single parameter
      # (|key|), so it received the accumulator (the text) instead of the
      # entity kind; options[key] was then always nil and the method was a
      # silent no-op. The block must take |text, key|.
      [:hashtags, :urls, :usernames_or_lists].inject(text) do |text, key|
        options[key] ? send(:"rewrite_#{key}", text, &options[key]) : text
      end
    end
    deprecate :rewrite, :rewrite_entities

    # Rewrite @mentions and @user/list spans; yields (at_char, screen_name, list_slug_or_nil).
    def rewrite_usernames_or_lists(text)
      entities = Extractor.extract_mentions_or_lists_with_indices(text)
      rewrite_entities(text, entities) do |entity, chars|
        at = chars[entity[:indices].first]
        list_slug = entity[:list_slug]
        list_slug = nil if list_slug.empty?
        yield(at, entity[:screen_name], list_slug)
      end
    end
    deprecate :rewrite_usernames_or_lists, :rewrite_entities

    # Rewrite #hashtags; yields (hash_char, hashtag_text).
    def rewrite_hashtags(text)
      entities = Extractor.extract_hashtags_with_indices(text)
      rewrite_entities(text, entities) do |entity, chars|
        hash = chars[entity[:indices].first]
        yield(hash, entity[:hashtag])
      end
    end
    deprecate :rewrite_hashtags, :rewrite_entities

    # Rewrite URLs (protocol required); yields (url).
    def rewrite_urls(text)
      entities = Extractor.extract_urls_with_indices(text, :extract_url_without_protocol => false)
      rewrite_entities(text, entities) do |entity, chars|
        yield(entity[:url])
      end
    end
    deprecate :rewrite_urls, :rewrite_entities
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module Twitter
  # This module lazily defines constants of the form Uxxxx for all Unicode
  # codepoints from U0000 to U10FFFF. The value of each constant is the
  # UTF-8 string for the codepoint.
  # Examples:
  #   copyright = Unicode::U00A9
  #   euro      = Unicode::U20AC
  #   infinity  = Unicode::U221E
  #
  module Unicode
    # Accepts U0000..UFFFFF and U100000..U10FFFF (optional underscore after U).
    CODEPOINT_REGEX = /^U_?([0-9a-fA-F]{4,5}|10[0-9a-fA-F]{4})$/

    # Called the first time an undefined Uxxxx constant is referenced.
    # Defines a real (frozen, UTF-8) constant for the codepoint and returns it,
    # so subsequent lookups bypass const_missing entirely.
    def self.const_missing(name)
      codepoint = CODEPOINT_REGEX.match(name.to_s)
      # Anything that is not a well-formed codepoint name is a genuine miss.
      raise NameError, "Uninitialized constant: Unicode::#{name}" unless codepoint
      const_set(name, [codepoint[1].to_i(16)].pack("U").freeze)
    end
  end
end
|
@@ -0,0 +1,113 @@
|
|
1
|
+
require 'unf'

module Twitter
  module Validation extend self
    # Hard limit on the displayed length of a Tweet.
    MAX_LENGTH = 140

    # t.co wrapping lengths used when counting URLs inside a Tweet.
    DEFAULT_TCO_URL_LENGTHS = {
      :short_url_length => 22,
      :short_url_length_https => 23,
      :characters_reserved_per_media => 22
    }.freeze

    # Returns the length of the string as it would be displayed. This is equivalent to the length of the Unicode NFC
    # (See: http://www.unicode.org/reports/tr15). This is needed in order to consistently calculate the length of a
    # string no matter which actual form was transmitted. For example:
    #
    #     U+0065  Latin Small Letter E
    # +   U+0301  Combining Acute Accent
    # ----------
    # =   2 bytes, 2 characters, displayed as é (1 visual glyph)
    #     … The NFC of {U+0065, U+0301} is {U+00E9}, which is a single character and a +display_length+ of 1
    #
    # The string could also contain U+00E9 already, in which case the canonicalization will not change the value.
    #
    def tweet_length(text, options = {})
      options = DEFAULT_TCO_URL_LENGTHS.merge(options)

      # Codepoint count of the NFC-normalized text (to_nfc comes from unf).
      length = text.to_nfc.unpack("U*").length

      # Each URL counts as its t.co-wrapped length, not its literal length:
      # subtract the literal span, then add the fixed t.co length.
      Twitter::Extractor.extract_urls_with_indices(text) do |url, start_position, end_position|
        length += start_position - end_position
        length += url.downcase =~ /^https:\/\// ? options[:short_url_length_https] : options[:short_url_length]
      end

      length
    end

    # Check the <tt>text</tt> for any reason that it may not be valid as a Tweet. This is meant as a pre-validation
    # before posting to api.twitter.com. There are several server-side reasons for Tweets to fail but this pre-validation
    # will allow quicker feedback.
    #
    # Returns <tt>false</tt> if this <tt>text</tt> is valid. Otherwise one of the following Symbols will be returned:
    #
    #   <tt>:too_long</tt>:: if the <tt>text</tt> is too long
    #   <tt>:empty</tt>:: if the <tt>text</tt> is nil or empty
    #   <tt>:invalid_characters</tt>:: if the <tt>text</tt> contains non-Unicode or any of the disallowed Unicode characters
    def tweet_invalid?(text)
      return :empty if !text || text.empty?
      begin
        return :too_long if tweet_length(text) > MAX_LENGTH
        return :invalid_characters if Twitter::Regex::INVALID_CHARACTERS.any?{|invalid_char| text.include?(invalid_char) }
      rescue ArgumentError
        # Length/containment checks blew up: the string holds a non-Unicode value.
        return :invalid_characters
      end

      false
    end

    # True when +text+ passes all pre-validation checks.
    def valid_tweet_text?(text)
      !tweet_invalid?(text)
    end

    # True when +username+ is exactly one @mention and nothing more.
    def valid_username?(username)
      return false if !username || username.empty?

      found = Twitter::Extractor.extract_mentioned_screen_names(username)
      # Extraction drops the @ sign, hence comparing against username[1..-1].
      found.size == 1 && found.first == username[1..-1]
    end

    VALID_LIST_RE = /\A#{Twitter::Regex[:valid_mention_or_list]}\z/o
    # True when +username_list+ is exactly one @user/list reference.
    def valid_list?(username_list)
      match = username_list.match(VALID_LIST_RE)
      # Must have matched, had nothing before the @, and carried a list slug.
      !!(match && match[1] == "" && match[4] && !match[4].empty?)
    end

    # True when +hashtag+ is exactly one #hashtag and nothing more.
    def valid_hashtag?(hashtag)
      return false if !hashtag || hashtag.empty?

      found = Twitter::Extractor.extract_hashtags(hashtag)
      # Extraction drops the # sign, hence comparing against hashtag[1..-1].
      found.size == 1 && found.first == hashtag[1..-1]
    end

    # Validates +url+ against the RFC-3986-derived patterns in Twitter::Regex.
    # unicode_domains  - allow unencoded internationalized hostnames
    # require_protocol - demand an explicit http/https scheme
    def valid_url?(url, unicode_domains=true, require_protocol=true)
      return false if !url || url.empty?

      parts = url.match(Twitter::Regex[:validate_url_unencoded])
      # The pattern must consume the entire string, not a prefix.
      return false unless (parts && parts.to_s == url)

      scheme, authority, path, query, fragment = parts.captures

      scheme_ok = !require_protocol ||
        (valid_match?(scheme, Twitter::Regex[:validate_url_scheme]) && scheme.match(/\Ahttps?\Z/i))
      return false unless scheme_ok &&
        valid_match?(path, Twitter::Regex[:validate_url_path]) &&
        valid_match?(query, Twitter::Regex[:validate_url_query], true) &&
        valid_match?(fragment, Twitter::Regex[:validate_url_fragment], true)

      (unicode_domains && valid_match?(authority, Twitter::Regex[:validate_url_unicode_authority])) ||
        (!unicode_domains && valid_match?(authority, Twitter::Regex[:validate_url_authority]))
    end

    private

    # Full-string match helper. When +optional+ is true, a nil/absent value is
    # acceptable; when present it must still match in its entirety.
    def valid_match?(string, regex, optional=false)
      if optional
        !(string && (!string.match(regex) || $~.to_s != string))
      else
        string && string.match(regex) && $~.to_s == string
      end
    end
  end
end
|
data/lib/twitter-text.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# Detect pre-1.9 rubies, which need $KCODE configured for UTF-8 handling.
ruby_major, ruby_minor, = RUBY_VERSION.split('.').map { |part| part.to_i }

$RUBY_1_9 =
  if ruby_major == 1 && ruby_minor < 9
    # Ruby 1.8 KCODE check. Not needed on 1.9 and later.
    raise("twitter-text requires the $KCODE variable be set to 'UTF8' or 'u'") unless $KCODE[0].chr =~ /u/i
    false
  else
    true
  end

# Load the library pieces in dependency order.
[
  'deprecation',
  'regex',
  'rewriter',
  'autolink',
  'extractor',
  'unicode',
  'validation',
  'hit_highlighter'
].each { |name| require "twitter-text/#{name}" }
|
data/script/destroy
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Runs the RubiGen "destroy" command for this project (removes generated code).
APP_ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..'))

begin
  require 'rubigen'
rescue LoadError
  # rubigen is not on the load path directly; pull it in via rubygems.
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/destroy'

# Strip a leading help flag so RubiGen can emit its own usage text.
ARGV.shift if %w[--help -h].include?(ARGV[0])
RubiGen::Base.use_component_sources! [:newgem_simple, :test_unit]
RubiGen::Scripts::Destroy.new.run(ARGV)
|
data/script/generate
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Runs the RubiGen "generate" command for this project (scaffolds new code).
APP_ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..'))

begin
  require 'rubigen'
rescue LoadError
  # rubigen is not on the load path directly; pull it in via rubygems.
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/generate'

# Strip a leading help flag so RubiGen can emit its own usage text.
ARGV.shift if %w[--help -h].include?(ARGV[0])
RubiGen::Base.use_component_sources! [:newgem_simple, :test_unit]
RubiGen::Scripts::Generate.new.run(ARGV)
|