twitter-text 2.0.2 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +25 -0
- data/README.md +5 -5
- data/lib/twitter-text/autolink.rb +386 -385
- data/lib/twitter-text/configuration.rb +48 -47
- data/lib/twitter-text/deprecation.rb +11 -9
- data/lib/twitter-text/extractor.rb +270 -268
- data/lib/twitter-text/hash_helper.rb +17 -15
- data/lib/twitter-text/hit_highlighter.rb +69 -67
- data/lib/twitter-text/regex.rb +342 -340
- data/lib/twitter-text/rewriter.rb +51 -49
- data/lib/twitter-text/unicode.rb +21 -20
- data/lib/twitter-text/validation.rb +185 -183
- data/lib/twitter-text/weighted_range.rb +12 -10
- data/spec/autolinking_spec.rb +2 -2
- data/spec/configuration_spec.rb +11 -11
- data/spec/extractor_spec.rb +6 -6
- data/spec/hithighlighter_spec.rb +2 -2
- data/spec/regex_spec.rb +3 -3
- data/spec/rewriter_spec.rb +7 -7
- data/spec/spec_helper.rb +2 -2
- data/spec/unicode_spec.rb +11 -11
- data/spec/validation_spec.rb +7 -7
- data/test/conformance_test.rb +4 -4
- data/twitter-text.gemspec +1 -1
- metadata +3 -2
data/lib/twitter-text/hash_helper.rb
CHANGED
@@ -1,21 +1,23 @@
 module Twitter
+  module TwitterText
+    module HashHelper
+      # Return a new hash with all keys converted to symbols, as long as
+      # they respond to +to_sym+.
+      #
+      #   { 'name' => 'Rob', 'years' => '28' }.symbolize_keys
+      #   #=> { :name => "Rob", :years => "28" }
+      def self.symbolize_keys(hash)
+        symbolize_keys!(hash.dup)
+      end

+      # Destructively convert all keys to symbols, as long as they respond
+      # to +to_sym+. Same as +symbolize_keys+, but modifies +self+.
+      def self.symbolize_keys!(hash)
+        hash.keys.each do |key|
+          hash[(key.to_sym rescue key) || key] = hash.delete(key)
+        end
+        hash
       end
     end
   end
 end
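For downstream callers, the visible effect of this hunk is that the helper now sits one module deeper, moving from Twitter::HashHelper to Twitter::TwitterText::HashHelper. A minimal usage sketch (assuming the gem is loaded via require 'twitter-text'; the sample hash mirrors the RDoc comment above):

    require 'twitter-text'

    hash = { 'name' => 'Rob', 'years' => '28' }

    # 2.1.0 namespace; 2.0.2 exposed the same method as Twitter::HashHelper.symbolize_keys.
    Twitter::TwitterText::HashHelper.symbolize_keys(hash)
    # => { :name => "Rob", :years => "28" }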
data/lib/twitter-text/hit_highlighter.rb
CHANGED
@@ -1,86 +1,88 @@
 module Twitter
+  module TwitterText
+    # Module for doing "hit highlighting" on tweets that have been auto-linked already.
+    # Useful with the results returned from the Search API.
+    module HitHighlighter extend self
+      # Default Tag used for hit highlighting
+      DEFAULT_HIGHLIGHT_TAG = "em"

+      # Add <tt><em></em></tt> tags around the <tt>hits</tt> provided in the <tt>text</tt>. The
+      # <tt>hits</tt> should be an array of (start, end) index pairs, relative to the original
+      # text, before auto-linking (but the <tt>text</tt> may already be auto-linked if desired)
+      #
+      # The <tt><em></em></tt> tags can be overridden using the <tt>:tag</tt> option. For example:
+      #
+      #   irb> hit_highlight("test hit here", [[5, 8]], :tag => 'strong')
+      #   => "test <strong>hit</strong> here"
+      def hit_highlight(text, hits = [], options = {})
+        if hits.empty?
+          return text
+        end

+        tag_name = options[:tag] || DEFAULT_HIGHLIGHT_TAG
+        tags = ["<" + tag_name + ">", "</" + tag_name + ">"]

+        chunks = text.split(/[<>]/)

+        result = []
+        chunk_index, chunk = 0, chunks[0]
+        chunk_chars = chunk.to_s.to_char_a
+        prev_chunks_len = 0
+        chunk_cursor = 0
+        start_in_chunk = false
+        for hit, index in hits.flatten.each_with_index do
+          tag = tags[index % 2]

+          placed = false
+          until chunk.nil? || hit < prev_chunks_len + chunk.length do
+            result << chunk_chars[chunk_cursor..-1]
+            if start_in_chunk && hit == prev_chunks_len + chunk_chars.length
+              result << tag
+              placed = true
+            end

+            # correctly handle highlights that end on the final character.
+            if tag_text = chunks[chunk_index+1]
+              result << "<#{tag_text}>"
+            end
+
+            prev_chunks_len += chunk_chars.length
+            chunk_cursor = 0
+            chunk_index += 2
+            chunk = chunks[chunk_index]
+            chunk_chars = chunk.to_s.to_char_a
+            start_in_chunk = false
           end

+          if !placed && !chunk.nil?
+            hit_spot = hit - prev_chunks_len
+            result << chunk_chars[chunk_cursor...hit_spot] << tag
+            chunk_cursor = hit_spot
+            if index % 2 == 0
+              start_in_chunk = true
+            else
+              start_in_chunk = false
+            end
+            placed = true
+          end

+          # ultimate fallback, hits that run off the end get a closing tag
+          if !placed
+            result << tag
           end
         end

+        if chunk
+          if chunk_cursor < chunk_chars.length
+            result << chunk_chars[chunk_cursor..-1]
+          end
+          (chunk_index+1).upto(chunks.length-1).each do |i|
+            result << (i.even? ? chunks[i] : "<#{chunks[i]}>")
+          end
         end

+        result.flatten.join
       end
     end
   end
 end
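The RDoc in this hunk already documents the call pattern; a short sketch under the 2.1.0 namespace (HitHighlighter extends itself, so its methods can be called on the module directly):

    require 'twitter-text'

    # Wrap the hit spanning character indices 5..8 in the default <em> tag.
    Twitter::TwitterText::HitHighlighter.hit_highlight("test hit here", [[5, 8]])
    # => "test <em>hit</em> here"

    # Override the tag, as in the irb example from the comment above.
    Twitter::TwitterText::HitHighlighter.hit_highlight("test hit here", [[5, 8]], :tag => 'strong')
    # => "test <strong>hit</strong> here"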
data/lib/twitter-text/regex.rb
CHANGED
@@ -1,249 +1,250 @@
 # encoding: utf-8

 module Twitter
+  module TwitterText
+    # A collection of regular expressions for parsing Tweet text. The regular expression
+    # list is frozen at load time to ensure immutability. These regular expressions are
+    # used throughout the <tt>TwitterText</tt> classes. Special care has been taken to make
+    # sure these reular expressions work with Tweets in all languages.
+    class Regex
+      require 'yaml'
+
+      REGEXEN = {} # :nodoc:
+
+      def self.regex_range(from, to = nil) # :nodoc:
+        if $RUBY_1_9
+          if to
+            "\\u{#{from.to_s(16).rjust(4, '0')}}-\\u{#{to.to_s(16).rjust(4, '0')}}"
+          else
+            "\\u{#{from.to_s(16).rjust(4, '0')}}"
+          end
         else
+          if to
+            [from].pack('U') + '-' + [to].pack('U')
+          else
+            [from].pack('U')
+          end
         end
       end

+      TLDS = YAML.load_file(
+        File.join(
+          File.expand_path('../../..', __FILE__), # project root
+          'lib', 'assets', 'tld_lib.yml'
+        )
       )

+      # Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand
+      # to access both the list of characters and a pattern suitible for use with String#split
+      # Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE
+      UNICODE_SPACES = [
+        (0x0009..0x000D).to_a, # White_Space # Cc [5] <control-0009>..<control-000D>
+        0x0020, # White_Space # Zs SPACE
+        0x0085, # White_Space # Cc <control-0085>
+        0x00A0, # White_Space # Zs NO-BREAK SPACE
+        0x1680, # White_Space # Zs OGHAM SPACE MARK
+        0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
+        (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE
+        0x2028, # White_Space # Zl LINE SEPARATOR
+        0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
+        0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
+        0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
+        0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
+      ].flatten.map{|c| [c].pack('U*')}.freeze
+      REGEXEN[:spaces] = /[#{UNICODE_SPACES.join('')}]/o
+
+      # Character not allowed in Tweets
+      INVALID_CHARACTERS = [
+        0xFFFE, 0xFEFF, # BOM
+        0xFFFF, # Special
+        0x202A, 0x202B, 0x202C, 0x202D, 0x202E # Directional change
+      ].map{|cp| [cp].pack('U') }.freeze
+      REGEXEN[:invalid_control_characters] = /[#{INVALID_CHARACTERS.join('')}]/o
+
+      major, minor, _patch = RUBY_VERSION.split('.')
+      if major.to_i >= 2 || major.to_i == 1 && minor.to_i >= 9 || (defined?(RUBY_ENGINE) && ["jruby", "rbx"].include?(RUBY_ENGINE))
+        REGEXEN[:list_name] = /[a-z][a-z0-9_\-\u0080-\u00ff]{0,24}/i
+      else
+        # This line barfs at compile time in Ruby 1.9, JRuby, or Rubinius.
+        REGEXEN[:list_name] = eval("/[a-z][a-z0-9_\\-\x80-\xff]{0,24}/i")
+      end

+      # Latin accented characters
+      # Excludes 0xd7 from the range (the multiplication sign, confusable with "x").
+      # Also excludes 0xf7, the division sign
+      LATIN_ACCENTS = [
+        regex_range(0xc0, 0xd6),
+        regex_range(0xd8, 0xf6),
+        regex_range(0xf8, 0xff),
+        regex_range(0x0100, 0x024f),
+        regex_range(0x0253, 0x0254),
+        regex_range(0x0256, 0x0257),
+        regex_range(0x0259),
+        regex_range(0x025b),
+        regex_range(0x0263),
+        regex_range(0x0268),
+        regex_range(0x026f),
+        regex_range(0x0272),
+        regex_range(0x0289),
+        regex_range(0x028b),
+        regex_range(0x02bb),
+        regex_range(0x0300, 0x036f),
+        regex_range(0x1e00, 0x1eff)
+      ].join('').freeze
+      REGEXEN[:latin_accents] = /[#{LATIN_ACCENTS}]+/o
+
+      RTL_CHARACTERS = [
+        regex_range(0x0600,0x06FF),
+        regex_range(0x0750,0x077F),
+        regex_range(0x0590,0x05FF),
+        regex_range(0xFE70,0xFEFF)
+      ].join('').freeze
+
+      PUNCTUATION_CHARS = '!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~'
+      SPACE_CHARS = " \t\n\x0B\f\r"
+      CTRL_CHARS = "\x00-\x1F\x7F"
+
+      # Generated from unicode_regex/unicode_regex_groups.scala, more inclusive than Ruby's \p{L}\p{M}
+      HASHTAG_LETTERS_AND_MARKS = "\\p{L}\\p{M}" +
+        "\u037f\u0528-\u052f\u08a0-\u08b2\u08e4-\u08ff\u0978\u0980\u0c00\u0c34\u0c81\u0d01\u0ede\u0edf" +
+        "\u10c7\u10cd\u10fd-\u10ff\u16f1-\u16f8\u17b4\u17b5\u191d\u191e\u1ab0-\u1abe\u1bab-\u1bad\u1bba-" +
+        "\u1bbf\u1cf3-\u1cf6\u1cf8\u1cf9\u1de7-\u1df5\u2cf2\u2cf3\u2d27\u2d2d\u2d66\u2d67\u9fcc\ua674-" +
+        "\ua67b\ua698-\ua69d\ua69f\ua792-\ua79f\ua7aa-\ua7ad\ua7b0\ua7b1\ua7f7-\ua7f9\ua9e0-\ua9ef\ua9fa-" +
+        "\ua9fe\uaa7c-\uaa7f\uaae0-\uaaef\uaaf2-\uaaf6\uab30-\uab5a\uab5c-\uab5f\uab64\uab65\uf870-\uf87f" +
+        "\uf882\uf884-\uf89f\uf8b8\uf8c1-\uf8d6\ufa2e\ufa2f\ufe27-\ufe2d\u{102e0}\u{1031f}\u{10350}-\u{1037a}" +
+        "\u{10500}-\u{10527}\u{10530}-\u{10563}\u{10600}-\u{10736}\u{10740}-\u{10755}\u{10760}-\u{10767}" +
+        "\u{10860}-\u{10876}\u{10880}-\u{1089e}\u{10980}-\u{109b7}\u{109be}\u{109bf}\u{10a80}-\u{10a9c}" +
+        "\u{10ac0}-\u{10ac7}\u{10ac9}-\u{10ae6}\u{10b80}-\u{10b91}\u{1107f}\u{110d0}-\u{110e8}\u{11100}-" +
+        "\u{11134}\u{11150}-\u{11173}\u{11176}\u{11180}-\u{111c4}\u{111da}\u{11200}-\u{11211}\u{11213}-" +
+        "\u{11237}\u{112b0}-\u{112ea}\u{11301}-\u{11303}\u{11305}-\u{1130c}\u{1130f}\u{11310}\u{11313}-" +
+        "\u{11328}\u{1132a}-\u{11330}\u{11332}\u{11333}\u{11335}-\u{11339}\u{1133c}-\u{11344}\u{11347}" +
+        "\u{11348}\u{1134b}-\u{1134d}\u{11357}\u{1135d}-\u{11363}\u{11366}-\u{1136c}\u{11370}-\u{11374}" +
+        "\u{11480}-\u{114c5}\u{114c7}\u{11580}-\u{115b5}\u{115b8}-\u{115c0}\u{11600}-\u{11640}\u{11644}" +
+        "\u{11680}-\u{116b7}\u{118a0}-\u{118df}\u{118ff}\u{11ac0}-\u{11af8}\u{1236f}-\u{12398}\u{16a40}-" +
+        "\u{16a5e}\u{16ad0}-\u{16aed}\u{16af0}-\u{16af4}\u{16b00}-\u{16b36}\u{16b40}-\u{16b43}\u{16b63}-" +
+        "\u{16b77}\u{16b7d}-\u{16b8f}\u{16f00}-\u{16f44}\u{16f50}-\u{16f7e}\u{16f8f}-\u{16f9f}\u{1bc00}-" +
+        "\u{1bc6a}\u{1bc70}-\u{1bc7c}\u{1bc80}-\u{1bc88}\u{1bc90}-\u{1bc99}\u{1bc9d}\u{1bc9e}\u{1e800}-" +
+        "\u{1e8c4}\u{1e8d0}-\u{1e8d6}\u{1ee00}-\u{1ee03}\u{1ee05}-\u{1ee1f}\u{1ee21}\u{1ee22}\u{1ee24}" +
+        "\u{1ee27}\u{1ee29}-\u{1ee32}\u{1ee34}-\u{1ee37}\u{1ee39}\u{1ee3b}\u{1ee42}\u{1ee47}\u{1ee49}" +
+        "\u{1ee4b}\u{1ee4d}-\u{1ee4f}\u{1ee51}\u{1ee52}\u{1ee54}\u{1ee57}\u{1ee59}\u{1ee5b}\u{1ee5d}\u{1ee5f}" +
+        "\u{1ee61}\u{1ee62}\u{1ee64}\u{1ee67}-\u{1ee6a}\u{1ee6c}-\u{1ee72}\u{1ee74}-\u{1ee77}\u{1ee79}-" +
+        "\u{1ee7c}\u{1ee7e}\u{1ee80}-\u{1ee89}\u{1ee8b}-\u{1ee9b}\u{1eea1}-\u{1eea3}\u{1eea5}-\u{1eea9}" +
+        "\u{1eeab}-\u{1eebb}"
+
+      # Generated from unicode_regex/unicode_regex_groups.scala, more inclusive than Ruby's \p{Nd}
+      HASHTAG_NUMERALS = "\\p{Nd}" +
+        "\u0de6-\u0def\ua9f0-\ua9f9\u{110f0}-\u{110f9}\u{11136}-\u{1113f}\u{111d0}-\u{111d9}\u{112f0}-" +
+        "\u{112f9}\u{114d0}-\u{114d9}\u{11650}-\u{11659}\u{116c0}-\u{116c9}\u{118e0}-\u{118e9}\u{16a60}-" +
+        "\u{16a69}\u{16b50}-\u{16b59}"
+
+      HASHTAG_SPECIAL_CHARS = "_\u200c\u200d\ua67e\u05be\u05f3\u05f4\uff5e\u301c\u309b\u309c\u30a0\u30fb\u3003\u0f0b\u0f0c\u00b7"
+
+      HASHTAG_LETTERS_NUMERALS = "#{HASHTAG_LETTERS_AND_MARKS}#{HASHTAG_NUMERALS}#{HASHTAG_SPECIAL_CHARS}"
+      HASHTAG_LETTERS_NUMERALS_SET = "[#{HASHTAG_LETTERS_NUMERALS}]"
+      HASHTAG_LETTERS_SET = "[#{HASHTAG_LETTERS_AND_MARKS}]"
+
+      HASHTAG = /(\A|\ufe0e|\ufe0f|[^&#{HASHTAG_LETTERS_NUMERALS}])(#|#)(?!\ufe0f|\u20e3)(#{HASHTAG_LETTERS_NUMERALS_SET}*#{HASHTAG_LETTERS_SET}#{HASHTAG_LETTERS_NUMERALS_SET}*)/io
+
+      REGEXEN[:valid_hashtag] = /#{HASHTAG}/io
+      # Used in Extractor for final filtering
+      REGEXEN[:end_hashtag_match] = /\A(?:[##]|:\/\/)/o
+
+      REGEXEN[:valid_mention_preceding_chars] = /(?:[^a-z0-9_!#\$%&*@@]|^|(?:^|[^a-z0-9_+~.-])[rR][tT]:?)/io
+      REGEXEN[:at_signs] = /[@@]/
+      REGEXEN[:valid_mention_or_list] = /
+        (#{REGEXEN[:valid_mention_preceding_chars]}) # $1: Preceeding character
+        (#{REGEXEN[:at_signs]}) # $2: At mark
+        ([a-z0-9_]{1,20}) # $3: Screen name
+        (\/[a-z][a-zA-Z0-9_\-]{0,24})? # $4: List (optional)
+      /iox
+      REGEXEN[:valid_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-z0-9_]{1,20})/io
+      # Used in Extractor for final filtering
+      REGEXEN[:end_mention_match] = /\A(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/io
+
+      # URL related hash regex collection
+      REGEXEN[:valid_url_preceding_chars] = /(?:[^A-Z0-9@@$###{INVALID_CHARACTERS.join('')}]|^)/io
+      REGEXEN[:invalid_url_without_protocol_preceding_chars] = /[-_.\/]$/
+      DOMAIN_VALID_CHARS = "[^#{PUNCTUATION_CHARS}#{SPACE_CHARS}#{CTRL_CHARS}#{INVALID_CHARACTERS.join('')}#{UNICODE_SPACES.join('')}]"
+      REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
+      REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
+
+      REGEXEN[:valid_gTLD] = %r{
+        (?:
+          (?:#{TLDS['generic'].join('|')})
+          (?=[^0-9a-z@]|$)
+        )
+      }ix

+      REGEXEN[:valid_ccTLD] = %r{
         (?:
+          (?:#{TLDS['country'].join('|')})
+          (?=[^0-9a-z@]|$)
+        )
+      }ix
+      REGEXEN[:valid_punycode] = /(?:xn--[0-9a-z]+)/i
+
+      REGEXEN[:valid_special_cctld] = %r{
+        (?:
+          (?:co|tv)
+          (?=[^0-9a-z@]|$)
+        )
+      }ix
+
+      REGEXEN[:valid_domain] = /(?:
+        #{REGEXEN[:valid_subdomain]}*#{REGEXEN[:valid_domain_name]}
+        (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
+      )/iox
+
+      # This is used in Extractor
+      REGEXEN[:valid_ascii_domain] = /
+        (?:(?:[a-z0-9\-_]|#{REGEXEN[:latin_accents]})+\.)+
+        (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
+      /iox
+
+      # This is used in Extractor for stricter t.co URL extraction
+      REGEXEN[:valid_tco_url] = /^https?:\/\/t\.co\/([a-z0-9]+)/i
+
+      # This is used in Extractor to filter out unwanted URLs.
+      REGEXEN[:invalid_short_domain] = /\A#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_ccTLD]}\Z/io
+      REGEXEN[:valid_special_short_domain] = /\A#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_special_cctld]}\Z/io
+
+      REGEXEN[:valid_port_number] = /[0-9]+/
+
+      REGEXEN[:valid_general_url_path_chars] = /[a-z\p{Cyrillic}0-9!\*';:=\+\,\.\$\/%#\[\]\p{Pd}_~&\|@#{LATIN_ACCENTS}]/io
+      # Allow URL paths to contain up to two nested levels of balanced parens
+      # 1. Used in Wikipedia URLs like /Primer_(film)
+      # 2. Used in IIS sessions like /S(dfd346)/
+      # 3. Used in Rdio URLs like /track/We_Up_(Album_Version_(Edited))/
+      REGEXEN[:valid_url_balanced_parens] = /
+        \(
         (?:
+          #{REGEXEN[:valid_general_url_path_chars]}+
+          |
+          # allow one nested level of balanced parentheses
+          (?:
+            #{REGEXEN[:valid_general_url_path_chars]}*
+            \(
+              #{REGEXEN[:valid_general_url_path_chars]}+
+            \)
+            #{REGEXEN[:valid_general_url_path_chars]}*
+          )
         )
+        \)
+      /iox
+      # Valid end-of-path chracters (so /foo. does not gobble the period).
+      # 1. Allow =&# for empty URL parameters and other URL-join artifacts
+      REGEXEN[:valid_url_path_ending_chars] = /[a-z\p{Cyrillic}0-9=_#\/\+\-#{LATIN_ACCENTS}]|(?:#{REGEXEN[:valid_url_balanced_parens]})/io
+      REGEXEN[:valid_url_path] = /(?:
+        (?:
+          #{REGEXEN[:valid_general_url_path_chars]}*
+          (?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)*
+          #{REGEXEN[:valid_url_path_ending_chars]}
+        )|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/)
+      )/iox
+
+      REGEXEN[:valid_url_query_chars] = /[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]/i
+      REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/\-]/i
+      REGEXEN[:valid_url] = %r{
        ( # $1 total match
          (#{REGEXEN[:valid_url_preceding_chars]}) # $2 Preceeding chracter
          ( # $3 URL
@@ -254,114 +255,115 @@ module Twitter
            (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # $8 Query String
          )
        )
+      }iox
+
+      REGEXEN[:cashtag] = /[a-z]{1,6}(?:[._][a-z]{1,2})?/i
+      REGEXEN[:valid_cashtag] = /(^|#{REGEXEN[:spaces]})(\$)(#{REGEXEN[:cashtag]})(?=$|\s|[#{PUNCTUATION_CHARS}])/i
+
+      # These URL validation pattern strings are based on the ABNF from RFC 3986
+      REGEXEN[:validate_url_unreserved] = /[a-z\p{Cyrillic}0-9\p{Pd}._~]/i
+      REGEXEN[:validate_url_pct_encoded] = /(?:%[0-9a-f]{2})/i
+      REGEXEN[:validate_url_sub_delims] = /[!$&'()*+,;=]/i
+      REGEXEN[:validate_url_pchar] = /(?:
+        #{REGEXEN[:validate_url_unreserved]}|
+        #{REGEXEN[:validate_url_pct_encoded]}|
+        #{REGEXEN[:validate_url_sub_delims]}|
+        [:\|@]
+      )/iox
+
+      REGEXEN[:validate_url_scheme] = /(?:[a-z][a-z0-9+\-.]*)/i
+      REGEXEN[:validate_url_userinfo] = /(?:
+        #{REGEXEN[:validate_url_unreserved]}|
+        #{REGEXEN[:validate_url_pct_encoded]}|
+        #{REGEXEN[:validate_url_sub_delims]}|
+        :
+      )*/iox
+
+      REGEXEN[:validate_url_dec_octet] = /(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))/i
+      REGEXEN[:validate_url_ipv4] =
+        /(?:#{REGEXEN[:validate_url_dec_octet]}(?:\.#{REGEXEN[:validate_url_dec_octet]}){3})/iox
+
+      # Punting on real IPv6 validation for now
+      REGEXEN[:validate_url_ipv6] = /(?:\[[a-f0-9:\.]+\])/i
+
+      # Also punting on IPvFuture for now
+      REGEXEN[:validate_url_ip] = /(?:
+        #{REGEXEN[:validate_url_ipv4]}|
+        #{REGEXEN[:validate_url_ipv6]}
+      )/iox
+
+      # This is more strict than the rfc specifies
+      REGEXEN[:validate_url_subdomain_segment] = /(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)/i
+      REGEXEN[:validate_url_domain_segment] = /(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)/i
+      REGEXEN[:validate_url_domain_tld] = /(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)/i
+      REGEXEN[:validate_url_domain] = /(?:(?:#{REGEXEN[:validate_url_subdomain_segment]}\.)*
+        (?:#{REGEXEN[:validate_url_domain_segment]}\.)
+        #{REGEXEN[:validate_url_domain_tld]})/iox
+
+      REGEXEN[:validate_url_host] = /(?:
+        #{REGEXEN[:validate_url_ip]}|
+        #{REGEXEN[:validate_url_domain]}
+      )/iox
+
+      # Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences
+      REGEXEN[:validate_url_unicode_subdomain_segment] =
+        /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9_\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
+      REGEXEN[:validate_url_unicode_domain_segment] =
+        /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
+      REGEXEN[:validate_url_unicode_domain_tld] =
+        /(?:(?:[a-z]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
+      REGEXEN[:validate_url_unicode_domain] = /(?:(?:#{REGEXEN[:validate_url_unicode_subdomain_segment]}\.)*
+        (?:#{REGEXEN[:validate_url_unicode_domain_segment]}\.)
+        #{REGEXEN[:validate_url_unicode_domain_tld]})/iox
+
+      REGEXEN[:validate_url_unicode_host] = /(?:
+        #{REGEXEN[:validate_url_ip]}|
+        #{REGEXEN[:validate_url_unicode_domain]}
+      )/iox
+
+      REGEXEN[:validate_url_port] = /[0-9]{1,5}/
+
+      REGEXEN[:validate_url_unicode_authority] = %r{
+        (?:(#{REGEXEN[:validate_url_userinfo]})@)? # $1 userinfo
+        (#{REGEXEN[:validate_url_unicode_host]}) # $2 host
+        (?::(#{REGEXEN[:validate_url_port]}))? # $3 port
+      }iox
+
+      REGEXEN[:validate_url_authority] = %r{
+        (?:(#{REGEXEN[:validate_url_userinfo]})@)? # $1 userinfo
+        (#{REGEXEN[:validate_url_host]}) # $2 host
+        (?::(#{REGEXEN[:validate_url_port]}))? # $3 port
+      }iox
+
+      REGEXEN[:validate_url_path] = %r{(/#{REGEXEN[:validate_url_pchar]}*)*}i
+      REGEXEN[:validate_url_query] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
+      REGEXEN[:validate_url_fragment] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
+
+      # Modified version of RFC 3986 Appendix B
+      REGEXEN[:validate_url_unencoded] = %r{
+        \A # Full URL
+        (?:
+          ([^:/?#]+):// # $1 Scheme
+        )?
+        ([^/?#]*) # $2 Authority
+        ([^?#]*) # $3 Path
+        (?:
+          \?([^#]*) # $4 Query
+        )?
+        (?:
+          \#(.*) # $5 Fragment
+        )?\Z
+      }ix
+
+      REGEXEN[:rtl_chars] = /[#{RTL_CHARACTERS}]/io
+
+      REGEXEN.each_pair{|k,v| v.freeze }
+
+      # Return the regular expression for a given <tt>key</tt>. If the <tt>key</tt>
+      # is not a known symbol a <tt>nil</tt> will be returned.
+      def self.[](key)
+        REGEXEN[key]
+      end
     end
   end
 end
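Because the REGEXEN hash and the class-level [] reader are carried over unchanged, code that looks patterns up by key only needs the new namespace. A hedged sketch (the sample text and expected captures are illustrative, not from the package's own tests):

    require 'twitter-text'

    hashtag_re = Twitter::TwitterText::Regex[:valid_hashtag]  # unknown keys return nil

    # $3 of each match is the hashtag body (see the HASHTAG pattern in the first hunk).
    "Loving #ruby and #regex".scan(hashtag_re).map { |m| m[2] }
    # => ["ruby", "regex"]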