twitter-text-kow 1.3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gemtest +0 -0
- data/.gitignore +40 -0
- data/.gitmodules +3 -0
- data/.rspec +2 -0
- data/CHANGELOG.md +44 -0
- data/Gemfile +4 -0
- data/LICENSE +188 -0
- data/README.md +193 -0
- data/Rakefile +52 -0
- data/config/README.md +142 -0
- data/config/v1.json +8 -0
- data/config/v2.json +29 -0
- data/config/v3.json +30 -0
- data/lib/assets/tld_lib.yml +1577 -0
- data/lib/twitter-text/autolink.rb +455 -0
- data/lib/twitter-text/configuration.rb +68 -0
- data/lib/twitter-text/deprecation.rb +21 -0
- data/lib/twitter-text/emoji_regex.rb +27 -0
- data/lib/twitter-text/extractor.rb +388 -0
- data/lib/twitter-text/hash_helper.rb +27 -0
- data/lib/twitter-text/hit_highlighter.rb +92 -0
- data/lib/twitter-text/regex.rb +381 -0
- data/lib/twitter-text/rewriter.rb +69 -0
- data/lib/twitter-text/unicode.rb +31 -0
- data/lib/twitter-text/validation.rb +251 -0
- data/lib/twitter-text/weighted_range.rb +24 -0
- data/lib/twitter-text.rb +29 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/spec/autolinking_spec.rb +858 -0
- data/spec/configuration_spec.rb +136 -0
- data/spec/extractor_spec.rb +392 -0
- data/spec/hithighlighter_spec.rb +96 -0
- data/spec/regex_spec.rb +76 -0
- data/spec/rewriter_spec.rb +553 -0
- data/spec/spec_helper.rb +139 -0
- data/spec/test_urls.rb +90 -0
- data/spec/twitter_text_spec.rb +25 -0
- data/spec/unicode_spec.rb +35 -0
- data/spec/validation_spec.rb +87 -0
- data/test/conformance_test.rb +242 -0
- data/twitter-text.gemspec +35 -0
- metadata +228 -0
@@ -0,0 +1,381 @@
|
|
1
|
+
# Copyright 2018 Twitter, Inc.
|
2
|
+
# Licensed under the Apache License, Version 2.0
|
3
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
4
|
+
|
5
|
+
# encoding: utf-8
|
6
|
+
|
7
|
+
module Twitter
|
8
|
+
module TwitterText
|
9
|
+
# A collection of regular expressions for parsing Tweet text. The regular expression
|
10
|
+
# list is frozen at load time to ensure immutability. These regular expressions are
|
11
|
+
# used throughout the <tt>TwitterText</tt> classes. Special care has been taken to make
|
12
|
+
# sure these regular expressions work with Tweets in all languages.
|
13
|
+
class Regex
|
14
|
+
require 'yaml'
|
15
|
+
|
16
|
+
REGEXEN = {} # :nodoc:
|
17
|
+
|
18
|
+
# Builds a fragment for use inside a regex character class.
#
# from - Integer code point (start of the range, or the only code point).
# to   - optional Integer code point ending the range.
#
# Returns "X" for a single code point or "X-Y" for a range. When the global
# $RUBY_1_9 is truthy each code point is rendered as the escape text
# \u{hhhh} (hex, zero-padded to at least four digits); otherwise it is
# rendered as the literal UTF-8 character produced by Array#pack('U').
def self.regex_range(from, to = nil) # :nodoc:
  endpoints = to ? [from, to] : [from]

  rendered =
    if $RUBY_1_9
      endpoints.map { |codepoint| "\\u{#{codepoint.to_s(16).rjust(4, '0')}}" }
    else
      endpoints.map { |codepoint| [codepoint].pack('U') }
    end

  rendered.join('-')
end
|
33
|
+
|
34
|
+
TLDS = YAML.load_file(
|
35
|
+
File.join(
|
36
|
+
File.expand_path('../../..', __FILE__), # project root
|
37
|
+
'lib', 'assets', 'tld_lib.yml'
|
38
|
+
)
|
39
|
+
)
|
40
|
+
|
41
|
+
# Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand
|
42
|
+
# to access both the list of characters and a pattern suitable for use with String#split
|
43
|
+
# Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE
|
44
|
+
UNICODE_SPACES = [
|
45
|
+
(0x0009..0x000D).to_a, # White_Space # Cc [5] <control-0009>..<control-000D>
|
46
|
+
0x0020, # White_Space # Zs SPACE
|
47
|
+
0x0085, # White_Space # Cc <control-0085>
|
48
|
+
0x00A0, # White_Space # Zs NO-BREAK SPACE
|
49
|
+
0x1680, # White_Space # Zs OGHAM SPACE MARK
|
50
|
+
0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
|
51
|
+
(0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE
|
52
|
+
0x2028, # White_Space # Zl LINE SEPARATOR
|
53
|
+
0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
|
54
|
+
0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
|
55
|
+
0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
|
56
|
+
0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
|
57
|
+
].flatten.map{|c| [c].pack('U*')}.freeze
|
58
|
+
REGEXEN[:spaces] = /[#{UNICODE_SPACES.join('')}]/o
|
59
|
+
|
60
|
+
DIRECTIONAL_CHARACTERS = [
|
61
|
+
0x061C, # ARABIC LETTER MARK (ALM)
|
62
|
+
0x200E, # LEFT-TO-RIGHT MARK (LRM)
|
63
|
+
0x200F, # RIGHT-TO-LEFT MARK (RLM)
|
64
|
+
0x202A, # LEFT-TO-RIGHT EMBEDDING (LRE)
|
65
|
+
0x202B, # RIGHT-TO-LEFT EMBEDDING (RLE)
|
66
|
+
0x202C, # POP DIRECTIONAL FORMATTING (PDF)
|
67
|
+
0x202D, # LEFT-TO-RIGHT OVERRIDE (LRO)
|
68
|
+
0x202E, # RIGHT-TO-LEFT OVERRIDE (RLO)
|
69
|
+
0x2066, # LEFT-TO-RIGHT ISOLATE (LRI)
|
70
|
+
0x2067, # RIGHT-TO-LEFT ISOLATE (RLI)
|
71
|
+
0x2068, # FIRST STRONG ISOLATE (FSI)
|
72
|
+
0x2069, # POP DIRECTIONAL ISOLATE (PDI)
|
73
|
+
].map{|cp| [cp].pack('U')}.freeze
|
74
|
+
REGEXEN[:directional_characters] = /[#{DIRECTIONAL_CHARACTERS.join('')}]/o
|
75
|
+
|
76
|
+
# Characters not allowed in Tweets
|
77
|
+
INVALID_CHARACTERS = [
|
78
|
+
0xFFFE, 0xFEFF, # BOM
|
79
|
+
0xFFFF, # Special
|
80
|
+
].map{|cp| [cp].pack('U') }.freeze
|
81
|
+
REGEXEN[:invalid_control_characters] = /[#{INVALID_CHARACTERS.join('')}]/o
|
82
|
+
|
83
|
+
major, minor, _patch = RUBY_VERSION.split('.')
|
84
|
+
if major.to_i >= 2 || major.to_i == 1 && minor.to_i >= 9 || (defined?(RUBY_ENGINE) && ["jruby", "rbx"].include?(RUBY_ENGINE))
|
85
|
+
REGEXEN[:list_name] = /[a-z][a-z0-9_\-\u0080-\u00ff]{0,24}/i
|
86
|
+
else
|
87
|
+
# This line barfs at compile time in Ruby 1.9, JRuby, or Rubinius.
|
88
|
+
REGEXEN[:list_name] = eval("/[a-z][a-z0-9_\\-\x80-\xff]{0,24}/i")
|
89
|
+
end
|
90
|
+
|
91
|
+
# Latin accented characters
|
92
|
+
# Excludes 0xd7 from the range (the multiplication sign, confusable with "x").
|
93
|
+
# Also excludes 0xf7, the division sign
|
94
|
+
LATIN_ACCENTS = [
|
95
|
+
regex_range(0xc0, 0xd6),
|
96
|
+
regex_range(0xd8, 0xf6),
|
97
|
+
regex_range(0xf8, 0xff),
|
98
|
+
regex_range(0x0100, 0x024f),
|
99
|
+
regex_range(0x0253, 0x0254),
|
100
|
+
regex_range(0x0256, 0x0257),
|
101
|
+
regex_range(0x0259),
|
102
|
+
regex_range(0x025b),
|
103
|
+
regex_range(0x0263),
|
104
|
+
regex_range(0x0268),
|
105
|
+
regex_range(0x026f),
|
106
|
+
regex_range(0x0272),
|
107
|
+
regex_range(0x0289),
|
108
|
+
regex_range(0x028b),
|
109
|
+
regex_range(0x02bb),
|
110
|
+
regex_range(0x0300, 0x036f),
|
111
|
+
regex_range(0x1e00, 0x1eff)
|
112
|
+
].join('').freeze
|
113
|
+
REGEXEN[:latin_accents] = /[#{LATIN_ACCENTS}]+/o
|
114
|
+
|
115
|
+
RTL_CHARACTERS = [
|
116
|
+
regex_range(0x0600,0x06FF),
|
117
|
+
regex_range(0x0750,0x077F),
|
118
|
+
regex_range(0x0590,0x05FF),
|
119
|
+
regex_range(0xFE70,0xFEFF)
|
120
|
+
].join('').freeze
|
121
|
+
|
122
|
+
PUNCTUATION_CHARS = '!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~'
|
123
|
+
SPACE_CHARS = " \t\n\x0B\f\r"
|
124
|
+
CTRL_CHARS = "\x00-\x1F\x7F"
|
125
|
+
|
126
|
+
# Generated from unicode_regex/unicode_regex_groups.scala, more inclusive than Ruby's \p{L}\p{M}
|
127
|
+
HASHTAG_LETTERS_AND_MARKS = "\\p{L}\\p{M}" +
|
128
|
+
"\u037f\u0528-\u052f\u08a0-\u08b2\u08e4-\u08ff\u0978\u0980\u0c00\u0c34\u0c81\u0d01\u0ede\u0edf" +
|
129
|
+
"\u10c7\u10cd\u10fd-\u10ff\u16f1-\u16f8\u17b4\u17b5\u191d\u191e\u1ab0-\u1abe\u1bab-\u1bad\u1bba-" +
|
130
|
+
"\u1bbf\u1cf3-\u1cf6\u1cf8\u1cf9\u1de7-\u1df5\u2cf2\u2cf3\u2d27\u2d2d\u2d66\u2d67\u9fcc\ua674-" +
|
131
|
+
"\ua67b\ua698-\ua69d\ua69f\ua792-\ua79f\ua7aa-\ua7ad\ua7b0\ua7b1\ua7f7-\ua7f9\ua9e0-\ua9ef\ua9fa-" +
|
132
|
+
"\ua9fe\uaa7c-\uaa7f\uaae0-\uaaef\uaaf2-\uaaf6\uab30-\uab5a\uab5c-\uab5f\uab64\uab65\uf870-\uf87f" +
|
133
|
+
"\uf882\uf884-\uf89f\uf8b8\uf8c1-\uf8d6\ufa2e\ufa2f\ufe27-\ufe2d\u{102e0}\u{1031f}\u{10350}-\u{1037a}" +
|
134
|
+
"\u{10500}-\u{10527}\u{10530}-\u{10563}\u{10600}-\u{10736}\u{10740}-\u{10755}\u{10760}-\u{10767}" +
|
135
|
+
"\u{10860}-\u{10876}\u{10880}-\u{1089e}\u{10980}-\u{109b7}\u{109be}\u{109bf}\u{10a80}-\u{10a9c}" +
|
136
|
+
"\u{10ac0}-\u{10ac7}\u{10ac9}-\u{10ae6}\u{10b80}-\u{10b91}\u{1107f}\u{110d0}-\u{110e8}\u{11100}-" +
|
137
|
+
"\u{11134}\u{11150}-\u{11173}\u{11176}\u{11180}-\u{111c4}\u{111da}\u{11200}-\u{11211}\u{11213}-" +
|
138
|
+
"\u{11237}\u{112b0}-\u{112ea}\u{11301}-\u{11303}\u{11305}-\u{1130c}\u{1130f}\u{11310}\u{11313}-" +
|
139
|
+
"\u{11328}\u{1132a}-\u{11330}\u{11332}\u{11333}\u{11335}-\u{11339}\u{1133c}-\u{11344}\u{11347}" +
|
140
|
+
"\u{11348}\u{1134b}-\u{1134d}\u{11357}\u{1135d}-\u{11363}\u{11366}-\u{1136c}\u{11370}-\u{11374}" +
|
141
|
+
"\u{11480}-\u{114c5}\u{114c7}\u{11580}-\u{115b5}\u{115b8}-\u{115c0}\u{11600}-\u{11640}\u{11644}" +
|
142
|
+
"\u{11680}-\u{116b7}\u{118a0}-\u{118df}\u{118ff}\u{11ac0}-\u{11af8}\u{1236f}-\u{12398}\u{16a40}-" +
|
143
|
+
"\u{16a5e}\u{16ad0}-\u{16aed}\u{16af0}-\u{16af4}\u{16b00}-\u{16b36}\u{16b40}-\u{16b43}\u{16b63}-" +
|
144
|
+
"\u{16b77}\u{16b7d}-\u{16b8f}\u{16f00}-\u{16f44}\u{16f50}-\u{16f7e}\u{16f8f}-\u{16f9f}\u{1bc00}-" +
|
145
|
+
"\u{1bc6a}\u{1bc70}-\u{1bc7c}\u{1bc80}-\u{1bc88}\u{1bc90}-\u{1bc99}\u{1bc9d}\u{1bc9e}\u{1e800}-" +
|
146
|
+
"\u{1e8c4}\u{1e8d0}-\u{1e8d6}\u{1ee00}-\u{1ee03}\u{1ee05}-\u{1ee1f}\u{1ee21}\u{1ee22}\u{1ee24}" +
|
147
|
+
"\u{1ee27}\u{1ee29}-\u{1ee32}\u{1ee34}-\u{1ee37}\u{1ee39}\u{1ee3b}\u{1ee42}\u{1ee47}\u{1ee49}" +
|
148
|
+
"\u{1ee4b}\u{1ee4d}-\u{1ee4f}\u{1ee51}\u{1ee52}\u{1ee54}\u{1ee57}\u{1ee59}\u{1ee5b}\u{1ee5d}\u{1ee5f}" +
|
149
|
+
"\u{1ee61}\u{1ee62}\u{1ee64}\u{1ee67}-\u{1ee6a}\u{1ee6c}-\u{1ee72}\u{1ee74}-\u{1ee77}\u{1ee79}-" +
|
150
|
+
"\u{1ee7c}\u{1ee7e}\u{1ee80}-\u{1ee89}\u{1ee8b}-\u{1ee9b}\u{1eea1}-\u{1eea3}\u{1eea5}-\u{1eea9}" +
|
151
|
+
"\u{1eeab}-\u{1eebb}"
|
152
|
+
|
153
|
+
# Generated from unicode_regex/unicode_regex_groups.scala, more inclusive than Ruby's \p{Nd}
|
154
|
+
HASHTAG_NUMERALS = "\\p{Nd}" +
|
155
|
+
"\u0de6-\u0def\ua9f0-\ua9f9\u{110f0}-\u{110f9}\u{11136}-\u{1113f}\u{111d0}-\u{111d9}\u{112f0}-" +
|
156
|
+
"\u{112f9}\u{114d0}-\u{114d9}\u{11650}-\u{11659}\u{116c0}-\u{116c9}\u{118e0}-\u{118e9}\u{16a60}-" +
|
157
|
+
"\u{16a69}\u{16b50}-\u{16b59}"
|
158
|
+
|
159
|
+
HASHTAG_SPECIAL_CHARS = "_\u200c\u200d\ua67e\u05be\u05f3\u05f4\uff5e\u301c\u309b\u309c\u30a0\u30fb\u3003\u0f0b\u0f0c\u00b7"
|
160
|
+
|
161
|
+
HASHTAG_LETTERS_NUMERALS = "#{HASHTAG_LETTERS_AND_MARKS}#{HASHTAG_NUMERALS}#{HASHTAG_SPECIAL_CHARS}"
|
162
|
+
HASHTAG_LETTERS_NUMERALS_SET = "[#{HASHTAG_LETTERS_NUMERALS}]"
|
163
|
+
HASHTAG_LETTERS_SET = "[#{HASHTAG_LETTERS_AND_MARKS}]"
|
164
|
+
|
165
|
+
# Hashtag pattern.
#   $1 - boundary before the hash sign: start of text, a variation selector
#        (U+FE0E / U+FE0F), or any character that is neither "&" nor a
#        hashtag character.
#   $2 - the hash sign: ASCII "#" or FULLWIDTH NUMBER SIGN (U+FF03).
#   $3 - the hashtag text; it must contain at least one letter or mark.
# The negative lookahead rejects "#" followed by U+FE0F or U+20E3
# (the keycap emoji sequence).
HASHTAG = /(\A|\ufe0e|\ufe0f|[^&#{HASHTAG_LETTERS_NUMERALS}])(#|\uff03)(?!\ufe0f|\u20e3)(#{HASHTAG_LETTERS_NUMERALS_SET}*#{HASHTAG_LETTERS_SET}#{HASHTAG_LETTERS_NUMERALS_SET}*)/io

REGEXEN[:valid_hashtag] = /#{HASHTAG}/io
# Used in Extractor for final filtering: text right after a candidate hashtag
# that starts with a hash sign (ASCII or fullwidth) or "://".
REGEXEN[:end_hashtag_match] = /\A(?:[#\uff03]|:\/\/)/o
|
170
|
+
|
171
|
+
# What may come directly before a mention: any character outside the listed
# set (letters, digits, "_", "!", "#", "$", "%", "&", "*", and both at signs),
# a line start, or an "RT" / "RT:" marker.
# Both the ASCII "@" and FULLWIDTH COMMERCIAL AT (U+FF20) count as at signs.
REGEXEN[:valid_mention_preceding_chars] = /(?:[^a-z0-9_!#\$%&*@\uff20]|^|(?:^|[^a-z0-9_+~.-])[rR][tT]:?)/io
# At sign that introduces a mention: ASCII "@" or fullwidth U+FF20.
REGEXEN[:at_signs] = /[@\uff20]/
|
173
|
+
REGEXEN[:valid_mention_or_list] = /
|
174
|
+
(#{REGEXEN[:valid_mention_preceding_chars]}) # $1: Preceeding character
|
175
|
+
(#{REGEXEN[:at_signs]}) # $2: At mark
|
176
|
+
([a-z0-9_]{1,20}) # $3: Screen name
|
177
|
+
(\/[a-z][a-zA-Z0-9_\-]{0,24})? # $4: List (optional)
|
178
|
+
/iox
|
179
|
+
# A reply: optional leading whitespace / directional marks, an at sign, then
# the screen name ($1). The two character lists are Arrays, so they are joined
# before interpolation; interpolating an Array directly inserts its inspect
# form (quotes, commas, spaces, brackets) into the character class.
REGEXEN[:valid_reply] = /^(?:[#{UNICODE_SPACES.join('')}#{DIRECTIONAL_CHARACTERS.join('')}])*#{REGEXEN[:at_signs]}([a-z0-9_]{1,20})/io
|
180
|
+
# Used in Extractor for final filtering
|
181
|
+
REGEXEN[:end_mention_match] = /\A(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/io
|
182
|
+
|
183
|
+
# URL related hash regex collection
|
184
|
+
# What may come directly before a URL: any character that is not a letter,
# digit, at sign (ASCII or fullwidth U+FF20), "$", hash sign (ASCII or
# fullwidth U+FF03) or an invalid character; or a directional mark; or a
# line start.
REGEXEN[:valid_url_preceding_chars] = /(?:[^A-Z0-9@\uff20$#\uff03#{INVALID_CHARACTERS.join('')}]|[#{DIRECTIONAL_CHARACTERS.join('')}]|^)/io
|
185
|
+
REGEXEN[:invalid_url_without_protocol_preceding_chars] = /[-_.\/]$/
|
186
|
+
|
187
|
+
DOMAIN_VALID_CHARS = "[^#{DIRECTIONAL_CHARACTERS.join('')}#{PUNCTUATION_CHARS}#{SPACE_CHARS}#{CTRL_CHARS}#{INVALID_CHARACTERS.join('')}#{UNICODE_SPACES.join('')}]"
|
188
|
+
# "[a-z0-9#{LATIN_ACCENTS}]"
|
189
|
+
REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
|
190
|
+
REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
|
191
|
+
|
192
|
+
REGEXEN[:valid_gTLD] = %r{
|
193
|
+
(?:
|
194
|
+
(?:#{TLDS['generic'].join('|')})
|
195
|
+
(?=[^0-9a-z@+-]|$)
|
196
|
+
)
|
197
|
+
}ix
|
198
|
+
|
199
|
+
REGEXEN[:valid_ccTLD] = %r{
|
200
|
+
(?:
|
201
|
+
(?:#{TLDS['country'].join('|')})
|
202
|
+
(?=[^0-9a-z@+-]|$)
|
203
|
+
)
|
204
|
+
}ix
|
205
|
+
REGEXEN[:valid_punycode] = /(?:xn--[0-9a-z]+)/i
|
206
|
+
|
207
|
+
REGEXEN[:valid_domain] = /(?:
|
208
|
+
#{REGEXEN[:valid_subdomain]}*#{REGEXEN[:valid_domain_name]}
|
209
|
+
(?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
|
210
|
+
)/iox
|
211
|
+
|
212
|
+
# This is used in Extractor
|
213
|
+
REGEXEN[:valid_ascii_domain] = /
|
214
|
+
(?:(?:[a-z0-9\-_]|#{REGEXEN[:latin_accents]})+\.)+
|
215
|
+
(?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
|
216
|
+
/iox
|
217
|
+
|
218
|
+
# This is used in Extractor for stricter t.co URL extraction
|
219
|
+
REGEXEN[:valid_tco_url] = /^https?:\/\/t\.co\/([a-z0-9]+)/i
|
220
|
+
|
221
|
+
REGEXEN[:valid_port_number] = /[0-9]+/
|
222
|
+
|
223
|
+
REGEXEN[:valid_general_url_path_chars] = /[a-z\p{Cyrillic}0-9!\*';:=\+\,\.\$\/%#\[\]\p{Pd}_~&\|@#{LATIN_ACCENTS}]/io
|
224
|
+
# Allow URL paths to contain up to two nested levels of balanced parens
|
225
|
+
# 1. Used in Wikipedia URLs like /Primer_(film)
|
226
|
+
# 2. Used in IIS sessions like /S(dfd346)/
|
227
|
+
# 3. Used in Rdio URLs like /track/We_Up_(Album_Version_(Edited))/
|
228
|
+
REGEXEN[:valid_url_balanced_parens] = /
|
229
|
+
\(
|
230
|
+
(?:
|
231
|
+
#{REGEXEN[:valid_general_url_path_chars]}+
|
232
|
+
|
|
233
|
+
# allow one nested level of balanced parentheses
|
234
|
+
(?:
|
235
|
+
#{REGEXEN[:valid_general_url_path_chars]}*
|
236
|
+
\(
|
237
|
+
#{REGEXEN[:valid_general_url_path_chars]}+
|
238
|
+
\)
|
239
|
+
#{REGEXEN[:valid_general_url_path_chars]}*
|
240
|
+
)
|
241
|
+
)
|
242
|
+
\)
|
243
|
+
/iox
|
244
|
+
# Valid end-of-path characters (so /foo. does not gobble the period).
|
245
|
+
# 1. Allow =&# for empty URL parameters and other URL-join artifacts
|
246
|
+
REGEXEN[:valid_url_path_ending_chars] = /[a-z\p{Cyrillic}0-9=_#\/\+\-#{LATIN_ACCENTS}]|(?:#{REGEXEN[:valid_url_balanced_parens]})/io
|
247
|
+
REGEXEN[:valid_url_path] = /(?:
|
248
|
+
(?:
|
249
|
+
#{REGEXEN[:valid_general_url_path_chars]}*
|
250
|
+
(?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)*
|
251
|
+
#{REGEXEN[:valid_url_path_ending_chars]}
|
252
|
+
)|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/)
|
253
|
+
)/iox
|
254
|
+
|
255
|
+
REGEXEN[:valid_url_query_chars] = /[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]/i
|
256
|
+
REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/\-]/i
|
257
|
+
REGEXEN[:valid_url] = %r{
|
258
|
+
( # $1 total match
|
259
|
+
(#{REGEXEN[:valid_url_preceding_chars]}) # $2 Preceeding chracter
|
260
|
+
( # $3 URL
|
261
|
+
(https?:\/\/)? # $4 Protocol (optional)
|
262
|
+
(#{REGEXEN[:valid_domain]}) # $5 Domain(s)
|
263
|
+
(?::(#{REGEXEN[:valid_port_number]}))? # $6 Port number (optional)
|
264
|
+
(/#{REGEXEN[:valid_url_path]}*)? # $7 URL Path and anchor
|
265
|
+
(\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # $8 Query String
|
266
|
+
)
|
267
|
+
)
|
268
|
+
}iox
|
269
|
+
|
270
|
+
REGEXEN[:cashtag] = /[a-z]{1,6}(?:[._][a-z]{1,2})?/i
|
271
|
+
# A cashtag: $1 is the boundary before it (line start, whitespace or a
# directional mark), $2 the "$" sign, $3 the symbol. It must be followed by
# end of line, whitespace or punctuation. The two character lists are Arrays,
# so they are joined before interpolation; interpolating an Array directly
# inserts its inspect form (quotes, commas, brackets) into the character class.
REGEXEN[:valid_cashtag] = /(^|[#{UNICODE_SPACES.join('')}#{DIRECTIONAL_CHARACTERS.join('')}])(\$)(#{REGEXEN[:cashtag]})(?=$|\s|[#{PUNCTUATION_CHARS}])/i
|
272
|
+
|
273
|
+
# These URL validation pattern strings are based on the ABNF from RFC 3986
|
274
|
+
REGEXEN[:validate_url_unreserved] = /[a-z\p{Cyrillic}0-9\p{Pd}._~]/i
|
275
|
+
REGEXEN[:validate_url_pct_encoded] = /(?:%[0-9a-f]{2})/i
|
276
|
+
REGEXEN[:validate_url_sub_delims] = /[!$&'()*+,;=]/i
|
277
|
+
REGEXEN[:validate_url_pchar] = /(?:
|
278
|
+
#{REGEXEN[:validate_url_unreserved]}|
|
279
|
+
#{REGEXEN[:validate_url_pct_encoded]}|
|
280
|
+
#{REGEXEN[:validate_url_sub_delims]}|
|
281
|
+
[:\|@]
|
282
|
+
)/iox
|
283
|
+
|
284
|
+
REGEXEN[:validate_url_scheme] = /(?:[a-z][a-z0-9+\-.]*)/i
|
285
|
+
REGEXEN[:validate_url_userinfo] = /(?:
|
286
|
+
#{REGEXEN[:validate_url_unreserved]}|
|
287
|
+
#{REGEXEN[:validate_url_pct_encoded]}|
|
288
|
+
#{REGEXEN[:validate_url_sub_delims]}|
|
289
|
+
:
|
290
|
+
)*/iox
|
291
|
+
|
292
|
+
REGEXEN[:validate_url_dec_octet] = /(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))/i
|
293
|
+
REGEXEN[:validate_url_ipv4] =
|
294
|
+
/(?:#{REGEXEN[:validate_url_dec_octet]}(?:\.#{REGEXEN[:validate_url_dec_octet]}){3})/iox
|
295
|
+
|
296
|
+
# Punting on real IPv6 validation for now
|
297
|
+
REGEXEN[:validate_url_ipv6] = /(?:\[[a-f0-9:\.]+\])/i
|
298
|
+
|
299
|
+
# Also punting on IPvFuture for now
|
300
|
+
REGEXEN[:validate_url_ip] = /(?:
|
301
|
+
#{REGEXEN[:validate_url_ipv4]}|
|
302
|
+
#{REGEXEN[:validate_url_ipv6]}
|
303
|
+
)/iox
|
304
|
+
|
305
|
+
# This is more strict than the rfc specifies
|
306
|
+
REGEXEN[:validate_url_subdomain_segment] = /(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)/i
|
307
|
+
REGEXEN[:validate_url_domain_segment] = /(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)/i
|
308
|
+
REGEXEN[:validate_url_domain_tld] = /(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)/i
|
309
|
+
REGEXEN[:validate_url_domain] = /(?:(?:#{REGEXEN[:validate_url_subdomain_segment]}\.)*
|
310
|
+
(?:#{REGEXEN[:validate_url_domain_segment]}\.)
|
311
|
+
#{REGEXEN[:validate_url_domain_tld]})/iox
|
312
|
+
|
313
|
+
REGEXEN[:validate_url_host] = /(?:
|
314
|
+
#{REGEXEN[:validate_url_ip]}|
|
315
|
+
#{REGEXEN[:validate_url_domain]}
|
316
|
+
)/iox
|
317
|
+
|
318
|
+
# Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences
|
319
|
+
REGEXEN[:validate_url_unicode_subdomain_segment] =
|
320
|
+
/(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9_\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
|
321
|
+
REGEXEN[:validate_url_unicode_domain_segment] =
|
322
|
+
/(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
|
323
|
+
REGEXEN[:validate_url_unicode_domain_tld] =
|
324
|
+
/(?:(?:[a-z]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
|
325
|
+
REGEXEN[:validate_url_unicode_domain] = /(?:(?:#{REGEXEN[:validate_url_unicode_subdomain_segment]}\.)*
|
326
|
+
(?:#{REGEXEN[:validate_url_unicode_domain_segment]}\.)
|
327
|
+
#{REGEXEN[:validate_url_unicode_domain_tld]})/iox
|
328
|
+
|
329
|
+
REGEXEN[:validate_url_unicode_host] = /(?:
|
330
|
+
#{REGEXEN[:validate_url_ip]}|
|
331
|
+
#{REGEXEN[:validate_url_unicode_domain]}
|
332
|
+
)/iox
|
333
|
+
|
334
|
+
REGEXEN[:validate_url_port] = /[0-9]{1,5}/
|
335
|
+
|
336
|
+
REGEXEN[:validate_url_unicode_authority] = %r{
|
337
|
+
(?:(#{REGEXEN[:validate_url_userinfo]})@)? # $1 userinfo
|
338
|
+
(#{REGEXEN[:validate_url_unicode_host]}) # $2 host
|
339
|
+
(?::(#{REGEXEN[:validate_url_port]}))? # $3 port
|
340
|
+
}iox
|
341
|
+
|
342
|
+
REGEXEN[:validate_url_authority] = %r{
|
343
|
+
(?:(#{REGEXEN[:validate_url_userinfo]})@)? # $1 userinfo
|
344
|
+
(#{REGEXEN[:validate_url_host]}) # $2 host
|
345
|
+
(?::(#{REGEXEN[:validate_url_port]}))? # $3 port
|
346
|
+
}iox
|
347
|
+
|
348
|
+
REGEXEN[:validate_url_path] = %r{(/#{REGEXEN[:validate_url_pchar]}*)*}i
|
349
|
+
REGEXEN[:validate_url_query] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
|
350
|
+
REGEXEN[:validate_url_fragment] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
|
351
|
+
|
352
|
+
REGEXEN[:valid_emoji] = Twitter::TwitterText::Regex::Emoji[:valid_emoji]
|
353
|
+
|
354
|
+
# Modified version of RFC 3986 Appendix B
|
355
|
+
REGEXEN[:validate_url_unencoded] = %r{
|
356
|
+
\A # Full URL
|
357
|
+
(?:
|
358
|
+
([^:/?#]+):// # $1 Scheme
|
359
|
+
)?
|
360
|
+
([^/?#]*) # $2 Authority
|
361
|
+
([^?#]*) # $3 Path
|
362
|
+
(?:
|
363
|
+
\?([^#]*) # $4 Query
|
364
|
+
)?
|
365
|
+
(?:
|
366
|
+
\#(.*) # $5 Fragment
|
367
|
+
)?\Z
|
368
|
+
}ix
|
369
|
+
|
370
|
+
REGEXEN[:rtl_chars] = /[#{RTL_CHARACTERS}]/io
|
371
|
+
|
372
|
+
REGEXEN.each_pair{|k,v| v.freeze }
|
373
|
+
|
374
|
+
# Return the regular expression for a given <tt>key</tt>. If the <tt>key</tt>
|
375
|
+
# is not a known symbol a <tt>nil</tt> will be returned.
|
376
|
+
# Looks up a regular expression in REGEXEN by its symbol key.
# Returns nil when the key is not registered.
def self.[](key)
  REGEXEN.fetch(key, nil)
end
|
379
|
+
end
|
380
|
+
end
|
381
|
+
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
# Copyright 2018 Twitter, Inc.
|
2
|
+
# Licensed under the Apache License, Version 2.0
|
3
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
4
|
+
|
5
|
+
module Twitter
|
6
|
+
module TwitterText
|
7
|
+
# A module provides base methods to rewrite usernames, lists, hashtags and URLs.
|
8
|
+
module Rewriter extend self
|
9
|
+
# Replaces each entity's span in +text+ with whatever the block returns.
#
# text     - the source text (converted with to_s, then split with
#            String#to_codepoint_a, which is defined elsewhere in the gem;
#            presumably one element per code point - confirm there).
# entities - objects exposing their span either as #indices or as
#            entity[:indices]; the span's first element is the start index
#            and its last element is the index where untouched text resumes.
#
# Yields (entity, codepoints) for every entity in ascending start order.
# Returns the untouched text and the block results joined into one String.
def rewrite_entities(text, entities)
  codepoints = text.to_s.to_codepoint_a
  span_of = lambda do |entity|
    entity.respond_to?(:indices) ? entity.indices : entity[:indices]
  end

  # Process entities left to right.
  ordered = entities.sort_by { |entity| span_of.call(entity).first }

  pieces = []
  cursor = 0
  ordered.each do |entity|
    span = span_of.call(entity)
    pieces << codepoints[cursor...span.first] # text before the entity
    pieces << yield(entity, codepoints)       # the replacement
    cursor = span.last
  end
  pieces << codepoints[cursor..-1]            # text after the last entity

  pieces.flatten.join
end
|
29
|
+
|
30
|
+
# These methods are deprecated and will be removed in a future release.
|
31
|
+
extend Deprecation
|
32
|
+
|
33
|
+
# Runs the hashtag, URL and mention/list rewriters over +text+, in that order.
#
# text    - the text to rewrite.
# options - Hash that may map :hashtags, :urls and :usernames_or_lists to a
#           callable; each callable is passed as the block of the matching
#           rewrite_<key> method. Keys that are absent (or falsy) are skipped.
#
# Returns the text after every requested rewriter has been applied; each
# rewriter receives the output of the previous one.
def rewrite(text, options = {})
  # inject yields (accumulator, element): the block must take both, otherwise
  # the single parameter is bound to the accumulated text, not to the key.
  [:hashtags, :urls, :usernames_or_lists].inject(text) do |rewritten, key|
    handler = options[key]
    handler ? send(:"rewrite_#{key}", rewritten, &handler) : rewritten
  end
end
|
38
|
+
deprecate :rewrite, :rewrite_entities
|
39
|
+
|
40
|
+
# Rewrites every mention or list found by
# Extractor.extract_mentions_or_lists_with_indices.
#
# Yields (at_sign, screen_name, list_slug) for each entity, where at_sign is
# the character at the entity's start index and list_slug is nil when the
# entity's :list_slug is empty. The block's return value replaces the entity.
def rewrite_usernames_or_lists(text)
  mentions = Extractor.extract_mentions_or_lists_with_indices(text)
  rewrite_entities(text, mentions) do |mention, codepoints|
    at_sign = codepoints[mention[:indices].first]
    raw_slug = mention[:list_slug]
    slug = raw_slug.empty? ? nil : raw_slug
    yield(at_sign, mention[:screen_name], slug)
  end
end
|
49
|
+
deprecate :rewrite_usernames_or_lists, :rewrite_entities
|
50
|
+
|
51
|
+
# Rewrites every hashtag found by Extractor.extract_hashtags_with_indices.
#
# Yields (hash_sign, hashtag) for each entity, where hash_sign is the
# character at the entity's start index. The block's return value replaces
# the entity in the text.
def rewrite_hashtags(text)
  tags = Extractor.extract_hashtags_with_indices(text)
  rewrite_entities(text, tags) do |tag, codepoints|
    start = tag[:indices].first
    yield(codepoints[start], tag[:hashtag])
  end
end
|
58
|
+
deprecate :rewrite_hashtags, :rewrite_entities
|
59
|
+
|
60
|
+
# Rewrites every URL found by Extractor.extract_urls_with_indices, called
# with :extract_url_without_protocol => false.
#
# Yields the entity's :url; the block's return value replaces the URL.
def rewrite_urls(text)
  links = Extractor.extract_urls_with_indices(text, :extract_url_without_protocol => false)
  rewrite_entities(text, links) { |link, _codepoints| yield(link[:url]) }
end
|
66
|
+
deprecate :rewrite_urls, :rewrite_entities
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
# Copyright 2018 Twitter, Inc.
|
2
|
+
# Licensed under the Apache License, Version 2.0
|
3
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
4
|
+
|
5
|
+
module Twitter
|
6
|
+
module TwitterText
|
7
|
+
# This module lazily defines constants of the form Uxxxx for all Unicode
|
8
|
+
# codepoints from U0000 to U10FFFF. The value of each constant is the
|
9
|
+
# UTF-8 string for the codepoint.
|
10
|
+
# Examples:
|
11
|
+
# copyright = Unicode::U00A9
|
12
|
+
# euro = Unicode::U20AC
|
13
|
+
# infinity = Unicode::U221E
|
14
|
+
#
|
15
|
+
module Unicode
  # Constant names handled by const_missing: "U", an optional "_", then
  # 4-5 hex digits or "10" followed by 4 hex digits. $1 is the hex code point.
  CODEPOINT_REGEX = /^U_?([0-9a-fA-F]{4,5}|10[0-9a-fA-F]{4})$/

  # Defines Uxxxx constants on first use.
  #
  # name - the missing constant's name.
  #
  # Returns the frozen UTF-8 String for the code point, after storing it as a
  # real constant so later lookups do not reach this hook.
  # Raises NameError when the name does not match CODEPOINT_REGEX.
  def self.const_missing(name)
    match = CODEPOINT_REGEX.match(name.to_s)
    raise NameError, "Uninitialized constant: Unicode::#{name}" unless match

    const_set(name, [match[1].hex].pack("U").freeze)
  end
end
|
30
|
+
end
|
31
|
+
end
|