twitter-text 2.0.2 → 2.1.0

@@ -1,21 +1,23 @@
  module Twitter
- module HashHelper
-   # Return a new hash with all keys converted to symbols, as long as
-   # they respond to +to_sym+.
-   #
-   #   { 'name' => 'Rob', 'years' => '28' }.symbolize_keys
-   #   #=> { :name => "Rob", :years => "28" }
-   def self.symbolize_keys(hash)
-     symbolize_keys!(hash.dup)
-   end
+ module TwitterText
+   module HashHelper
+     # Return a new hash with all keys converted to symbols, as long as
+     # they respond to +to_sym+.
+     #
+     #   { 'name' => 'Rob', 'years' => '28' }.symbolize_keys
+     #   #=> { :name => "Rob", :years => "28" }
+     def self.symbolize_keys(hash)
+       symbolize_keys!(hash.dup)
+     end

-   # Destructively convert all keys to symbols, as long as they respond
-   # to +to_sym+. Same as +symbolize_keys+, but modifies +self+.
-   def self.symbolize_keys!(hash)
-     hash.keys.each do |key|
-       hash[(key.to_sym rescue key) || key] = hash.delete(key)
+     # Destructively convert all keys to symbols, as long as they respond
+     # to +to_sym+. Same as +symbolize_keys+, but modifies +self+.
+     def self.symbolize_keys!(hash)
+       hash.keys.each do |key|
+         hash[(key.to_sym rescue key) || key] = hash.delete(key)
+       end
+       hash
  end
-     hash
  end
  end
  end
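The hunk above is the core 2.1.0 change: code that used to live directly under Twitter:: is nested under a new Twitter::TwitterText:: module. A minimal usage sketch, assuming the gem is installed as twitter-text 2.1.0 and required via its usual 'twitter-text' entry point (HashHelper is an internal helper; the call path below is inferred from the hunk, not from documented public API):

    require 'twitter-text'

    # Keys that respond to #to_sym are symbolized; others are left alone.
    Twitter::TwitterText::HashHelper.symbolize_keys({ 'name' => 'Rob', 'years' => '28' })
    # => { :name => "Rob", :years => "28" }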
@@ -1,86 +1,88 @@
  module Twitter
- # Module for doing "hit highlighting" on tweets that have been auto-linked already.
- # Useful with the results returned from the Search API.
- module HitHighlighter extend self
-   # Default Tag used for hit highlighting
-   DEFAULT_HIGHLIGHT_TAG = "em"
+ module TwitterText
+   # Module for doing "hit highlighting" on tweets that have been auto-linked already.
+   # Useful with the results returned from the Search API.
+   module HitHighlighter extend self
+     # Default Tag used for hit highlighting
+     DEFAULT_HIGHLIGHT_TAG = "em"

-   # Add <tt><em></em></tt> tags around the <tt>hits</tt> provided in the <tt>text</tt>. The
-   # <tt>hits</tt> should be an array of (start, end) index pairs, relative to the original
-   # text, before auto-linking (but the <tt>text</tt> may already be auto-linked if desired)
-   #
-   # The <tt><em></em></tt> tags can be overridden using the <tt>:tag</tt> option. For example:
-   #
-   #   irb> hit_highlight("test hit here", [[5, 8]], :tag => 'strong')
-   #   => "test <strong>hit</strong> here"
-   def hit_highlight(text, hits = [], options = {})
-     if hits.empty?
-       return text
-     end
+     # Add <tt><em></em></tt> tags around the <tt>hits</tt> provided in the <tt>text</tt>. The
+     # <tt>hits</tt> should be an array of (start, end) index pairs, relative to the original
+     # text, before auto-linking (but the <tt>text</tt> may already be auto-linked if desired)
+     #
+     # The <tt><em></em></tt> tags can be overridden using the <tt>:tag</tt> option. For example:
+     #
+     #   irb> hit_highlight("test hit here", [[5, 8]], :tag => 'strong')
+     #   => "test <strong>hit</strong> here"
+     def hit_highlight(text, hits = [], options = {})
+       if hits.empty?
+         return text
+       end

-     tag_name = options[:tag] || DEFAULT_HIGHLIGHT_TAG
-     tags = ["<" + tag_name + ">", "</" + tag_name + ">"]
+       tag_name = options[:tag] || DEFAULT_HIGHLIGHT_TAG
+       tags = ["<" + tag_name + ">", "</" + tag_name + ">"]

-     chunks = text.split(/[<>]/)
+       chunks = text.split(/[<>]/)

-     result = []
-     chunk_index, chunk = 0, chunks[0]
-     chunk_chars = chunk.to_s.to_char_a
-     prev_chunks_len = 0
-     chunk_cursor = 0
-     start_in_chunk = false
-     for hit, index in hits.flatten.each_with_index do
-       tag = tags[index % 2]
+       result = []
+       chunk_index, chunk = 0, chunks[0]
+       chunk_chars = chunk.to_s.to_char_a
+       prev_chunks_len = 0
+       chunk_cursor = 0
+       start_in_chunk = false
+       for hit, index in hits.flatten.each_with_index do
+         tag = tags[index % 2]

-       placed = false
-       until chunk.nil? || hit < prev_chunks_len + chunk.length do
-         result << chunk_chars[chunk_cursor..-1]
-         if start_in_chunk && hit == prev_chunks_len + chunk_chars.length
-           result << tag
-           placed = true
-         end
+         placed = false
+         until chunk.nil? || hit < prev_chunks_len + chunk.length do
+           result << chunk_chars[chunk_cursor..-1]
+           if start_in_chunk && hit == prev_chunks_len + chunk_chars.length
+             result << tag
+             placed = true
+           end

-         # correctly handle highlights that end on the final character.
-         if tag_text = chunks[chunk_index+1]
-           result << "<#{tag_text}>"
+           # correctly handle highlights that end on the final character.
+           if tag_text = chunks[chunk_index+1]
+             result << "<#{tag_text}>"
+           end
+
+           prev_chunks_len += chunk_chars.length
+           chunk_cursor = 0
+           chunk_index += 2
+           chunk = chunks[chunk_index]
+           chunk_chars = chunk.to_s.to_char_a
+           start_in_chunk = false
  end

-         prev_chunks_len += chunk_chars.length
-         chunk_cursor = 0
-         chunk_index += 2
-         chunk = chunks[chunk_index]
-         chunk_chars = chunk.to_s.to_char_a
-         start_in_chunk = false
-       end
+         if !placed && !chunk.nil?
+           hit_spot = hit - prev_chunks_len
+           result << chunk_chars[chunk_cursor...hit_spot] << tag
+           chunk_cursor = hit_spot
+           if index % 2 == 0
+             start_in_chunk = true
+           else
+             start_in_chunk = false
+           end
+           placed = true
+         end

-       if !placed && !chunk.nil?
-         hit_spot = hit - prev_chunks_len
-         result << chunk_chars[chunk_cursor...hit_spot] << tag
-         chunk_cursor = hit_spot
-         if index % 2 == 0
-           start_in_chunk = true
-         else
-           start_in_chunk = false
+         # ultimate fallback, hits that run off the end get a closing tag
+         if !placed
+           result << tag
  end
-         placed = true
  end

-       # ultimate fallback, hits that run off the end get a closing tag
-       if !placed
-         result << tag
+       if chunk
+         if chunk_cursor < chunk_chars.length
+           result << chunk_chars[chunk_cursor..-1]
+         end
+         (chunk_index+1).upto(chunks.length-1).each do |i|
+           result << (i.even? ? chunks[i] : "<#{chunks[i]}>")
+         end
  end
-     end

-     if chunk
-       if chunk_cursor < chunk_chars.length
-         result << chunk_chars[chunk_cursor..-1]
-       end
-       (chunk_index+1).upto(chunks.length-1).each do |i|
-         result << (i.even? ? chunks[i] : "<#{chunks[i]}>")
-       end
+       result.flatten.join
  end
-
-     result.flatten.join
  end
  end
  end
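The HitHighlighter hunk keeps the module's behaviour and only moves it under the nested namespace. A minimal usage sketch under the same assumptions as above (the :tag => 'strong' result is taken from the module's own doc comment; the default <em> result follows from DEFAULT_HIGHLIGHT_TAG):

    require 'twitter-text'

    # Wrap the hit ranges (start, end index pairs) in the default <em> tag.
    Twitter::TwitterText::HitHighlighter.hit_highlight("test hit here", [[5, 8]])
    # => "test <em>hit</em> here"

    # Or override the tag per the doc comment in the hunk above.
    Twitter::TwitterText::HitHighlighter.hit_highlight("test hit here", [[5, 8]], :tag => 'strong')
    # => "test <strong>hit</strong> here"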
@@ -1,249 +1,250 @@
  # encoding: utf-8

  module Twitter
- # A collection of regular expressions for parsing Tweet text. The regular expression
- # list is frozen at load time to ensure immutability. These regular expressions are
- # used throughout the <tt>Twitter</tt> classes. Special care has been taken to make
- # sure these reular expressions work with Tweets in all languages.
- class Regex
-   require 'yaml'
-
-   REGEXEN = {} # :nodoc:
-
-   def self.regex_range(from, to = nil) # :nodoc:
-     if $RUBY_1_9
-       if to
-         "\\u{#{from.to_s(16).rjust(4, '0')}}-\\u{#{to.to_s(16).rjust(4, '0')}}"
+ module TwitterText
+   # A collection of regular expressions for parsing Tweet text. The regular expression
+   # list is frozen at load time to ensure immutability. These regular expressions are
+   # used throughout the <tt>TwitterText</tt> classes. Special care has been taken to make
+   # sure these reular expressions work with Tweets in all languages.
+   class Regex
+     require 'yaml'
+
+     REGEXEN = {} # :nodoc:
+
+     def self.regex_range(from, to = nil) # :nodoc:
+       if $RUBY_1_9
+         if to
+           "\\u{#{from.to_s(16).rjust(4, '0')}}-\\u{#{to.to_s(16).rjust(4, '0')}}"
+         else
+           "\\u{#{from.to_s(16).rjust(4, '0')}}"
+         end
  else
-         "\\u{#{from.to_s(16).rjust(4, '0')}}"
-       end
-     else
-       if to
-         [from].pack('U') + '-' + [to].pack('U')
-       else
-         [from].pack('U')
+         if to
+           [from].pack('U') + '-' + [to].pack('U')
+         else
+           [from].pack('U')
+         end
  end
  end
- end

-   TLDS = YAML.load_file(
-     File.join(
-       File.expand_path('../../..', __FILE__), # project root
-       'lib', 'assets', 'tld_lib.yml'
+     TLDS = YAML.load_file(
+       File.join(
+         File.expand_path('../../..', __FILE__), # project root
+         'lib', 'assets', 'tld_lib.yml'
+       )
  )
-   )
-
-   # Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand
-   # to access both the list of characters and a pattern suitible for use with String#split
-   # Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE
-   UNICODE_SPACES = [
-     (0x0009..0x000D).to_a, # White_Space # Cc [5] <control-0009>..<control-000D>
-     0x0020, # White_Space # Zs SPACE
-     0x0085, # White_Space # Cc <control-0085>
-     0x00A0, # White_Space # Zs NO-BREAK SPACE
-     0x1680, # White_Space # Zs OGHAM SPACE MARK
-     0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
-     (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE
-     0x2028, # White_Space # Zl LINE SEPARATOR
-     0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
-     0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
-     0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
-     0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
-   ].flatten.map{|c| [c].pack('U*')}.freeze
-   REGEXEN[:spaces] = /[#{UNICODE_SPACES.join('')}]/o
-
-   # Character not allowed in Tweets
-   INVALID_CHARACTERS = [
-     0xFFFE, 0xFEFF, # BOM
-     0xFFFF, # Special
-     0x202A, 0x202B, 0x202C, 0x202D, 0x202E # Directional change
-   ].map{|cp| [cp].pack('U') }.freeze
-   REGEXEN[:invalid_control_characters] = /[#{INVALID_CHARACTERS.join('')}]/o
-
-   major, minor, _patch = RUBY_VERSION.split('.')
-   if major.to_i >= 2 || major.to_i == 1 && minor.to_i >= 9 || (defined?(RUBY_ENGINE) && ["jruby", "rbx"].include?(RUBY_ENGINE))
-     REGEXEN[:list_name] = /[a-z][a-z0-9_\-\u0080-\u00ff]{0,24}/i
-   else
-     # This line barfs at compile time in Ruby 1.9, JRuby, or Rubinius.
-     REGEXEN[:list_name] = eval("/[a-z][a-z0-9_\\-\x80-\xff]{0,24}/i")
-   end

-   # Latin accented characters
-   # Excludes 0xd7 from the range (the multiplication sign, confusable with "x").
-   # Also excludes 0xf7, the division sign
-   LATIN_ACCENTS = [
-     regex_range(0xc0, 0xd6),
-     regex_range(0xd8, 0xf6),
-     regex_range(0xf8, 0xff),
-     regex_range(0x0100, 0x024f),
-     regex_range(0x0253, 0x0254),
-     regex_range(0x0256, 0x0257),
-     regex_range(0x0259),
-     regex_range(0x025b),
-     regex_range(0x0263),
-     regex_range(0x0268),
-     regex_range(0x026f),
-     regex_range(0x0272),
-     regex_range(0x0289),
-     regex_range(0x028b),
-     regex_range(0x02bb),
-     regex_range(0x0300, 0x036f),
-     regex_range(0x1e00, 0x1eff)
-   ].join('').freeze
-   REGEXEN[:latin_accents] = /[#{LATIN_ACCENTS}]+/o
-
-   RTL_CHARACTERS = [
-     regex_range(0x0600,0x06FF),
-     regex_range(0x0750,0x077F),
-     regex_range(0x0590,0x05FF),
-     regex_range(0xFE70,0xFEFF)
-   ].join('').freeze
-
-   PUNCTUATION_CHARS = '!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~'
-   SPACE_CHARS = " \t\n\x0B\f\r"
-   CTRL_CHARS = "\x00-\x1F\x7F"
-
-   # Generated from unicode_regex/unicode_regex_groups.scala, more inclusive than Ruby's \p{L}\p{M}
-   HASHTAG_LETTERS_AND_MARKS = "\\p{L}\\p{M}" +
-     "\u037f\u0528-\u052f\u08a0-\u08b2\u08e4-\u08ff\u0978\u0980\u0c00\u0c34\u0c81\u0d01\u0ede\u0edf" +
-     "\u10c7\u10cd\u10fd-\u10ff\u16f1-\u16f8\u17b4\u17b5\u191d\u191e\u1ab0-\u1abe\u1bab-\u1bad\u1bba-" +
-     "\u1bbf\u1cf3-\u1cf6\u1cf8\u1cf9\u1de7-\u1df5\u2cf2\u2cf3\u2d27\u2d2d\u2d66\u2d67\u9fcc\ua674-" +
-     "\ua67b\ua698-\ua69d\ua69f\ua792-\ua79f\ua7aa-\ua7ad\ua7b0\ua7b1\ua7f7-\ua7f9\ua9e0-\ua9ef\ua9fa-" +
-     "\ua9fe\uaa7c-\uaa7f\uaae0-\uaaef\uaaf2-\uaaf6\uab30-\uab5a\uab5c-\uab5f\uab64\uab65\uf870-\uf87f" +
-     "\uf882\uf884-\uf89f\uf8b8\uf8c1-\uf8d6\ufa2e\ufa2f\ufe27-\ufe2d\u{102e0}\u{1031f}\u{10350}-\u{1037a}" +
-     "\u{10500}-\u{10527}\u{10530}-\u{10563}\u{10600}-\u{10736}\u{10740}-\u{10755}\u{10760}-\u{10767}" +
-     "\u{10860}-\u{10876}\u{10880}-\u{1089e}\u{10980}-\u{109b7}\u{109be}\u{109bf}\u{10a80}-\u{10a9c}" +
-     "\u{10ac0}-\u{10ac7}\u{10ac9}-\u{10ae6}\u{10b80}-\u{10b91}\u{1107f}\u{110d0}-\u{110e8}\u{11100}-" +
-     "\u{11134}\u{11150}-\u{11173}\u{11176}\u{11180}-\u{111c4}\u{111da}\u{11200}-\u{11211}\u{11213}-" +
-     "\u{11237}\u{112b0}-\u{112ea}\u{11301}-\u{11303}\u{11305}-\u{1130c}\u{1130f}\u{11310}\u{11313}-" +
-     "\u{11328}\u{1132a}-\u{11330}\u{11332}\u{11333}\u{11335}-\u{11339}\u{1133c}-\u{11344}\u{11347}" +
-     "\u{11348}\u{1134b}-\u{1134d}\u{11357}\u{1135d}-\u{11363}\u{11366}-\u{1136c}\u{11370}-\u{11374}" +
-     "\u{11480}-\u{114c5}\u{114c7}\u{11580}-\u{115b5}\u{115b8}-\u{115c0}\u{11600}-\u{11640}\u{11644}" +
-     "\u{11680}-\u{116b7}\u{118a0}-\u{118df}\u{118ff}\u{11ac0}-\u{11af8}\u{1236f}-\u{12398}\u{16a40}-" +
-     "\u{16a5e}\u{16ad0}-\u{16aed}\u{16af0}-\u{16af4}\u{16b00}-\u{16b36}\u{16b40}-\u{16b43}\u{16b63}-" +
-     "\u{16b77}\u{16b7d}-\u{16b8f}\u{16f00}-\u{16f44}\u{16f50}-\u{16f7e}\u{16f8f}-\u{16f9f}\u{1bc00}-" +
-     "\u{1bc6a}\u{1bc70}-\u{1bc7c}\u{1bc80}-\u{1bc88}\u{1bc90}-\u{1bc99}\u{1bc9d}\u{1bc9e}\u{1e800}-" +
-     "\u{1e8c4}\u{1e8d0}-\u{1e8d6}\u{1ee00}-\u{1ee03}\u{1ee05}-\u{1ee1f}\u{1ee21}\u{1ee22}\u{1ee24}" +
-     "\u{1ee27}\u{1ee29}-\u{1ee32}\u{1ee34}-\u{1ee37}\u{1ee39}\u{1ee3b}\u{1ee42}\u{1ee47}\u{1ee49}" +
-     "\u{1ee4b}\u{1ee4d}-\u{1ee4f}\u{1ee51}\u{1ee52}\u{1ee54}\u{1ee57}\u{1ee59}\u{1ee5b}\u{1ee5d}\u{1ee5f}" +
-     "\u{1ee61}\u{1ee62}\u{1ee64}\u{1ee67}-\u{1ee6a}\u{1ee6c}-\u{1ee72}\u{1ee74}-\u{1ee77}\u{1ee79}-" +
-     "\u{1ee7c}\u{1ee7e}\u{1ee80}-\u{1ee89}\u{1ee8b}-\u{1ee9b}\u{1eea1}-\u{1eea3}\u{1eea5}-\u{1eea9}" +
-     "\u{1eeab}-\u{1eebb}"
-
-   # Generated from unicode_regex/unicode_regex_groups.scala, more inclusive than Ruby's \p{Nd}
-   HASHTAG_NUMERALS = "\\p{Nd}" +
-     "\u0de6-\u0def\ua9f0-\ua9f9\u{110f0}-\u{110f9}\u{11136}-\u{1113f}\u{111d0}-\u{111d9}\u{112f0}-" +
-     "\u{112f9}\u{114d0}-\u{114d9}\u{11650}-\u{11659}\u{116c0}-\u{116c9}\u{118e0}-\u{118e9}\u{16a60}-" +
-     "\u{16a69}\u{16b50}-\u{16b59}"
-
-   HASHTAG_SPECIAL_CHARS = "_\u200c\u200d\ua67e\u05be\u05f3\u05f4\uff5e\u301c\u309b\u309c\u30a0\u30fb\u3003\u0f0b\u0f0c\u00b7"
-
-   HASHTAG_LETTERS_NUMERALS = "#{HASHTAG_LETTERS_AND_MARKS}#{HASHTAG_NUMERALS}#{HASHTAG_SPECIAL_CHARS}"
-   HASHTAG_LETTERS_NUMERALS_SET = "[#{HASHTAG_LETTERS_NUMERALS}]"
-   HASHTAG_LETTERS_SET = "[#{HASHTAG_LETTERS_AND_MARKS}]"
-
-   HASHTAG = /(\A|\ufe0e|\ufe0f|[^&#{HASHTAG_LETTERS_NUMERALS}])(#|#)(?!\ufe0f|\u20e3)(#{HASHTAG_LETTERS_NUMERALS_SET}*#{HASHTAG_LETTERS_SET}#{HASHTAG_LETTERS_NUMERALS_SET}*)/io
-
-   REGEXEN[:valid_hashtag] = /#{HASHTAG}/io
-   # Used in Extractor for final filtering
-   REGEXEN[:end_hashtag_match] = /\A(?:[##]|:\/\/)/o
-
-   REGEXEN[:valid_mention_preceding_chars] = /(?:[^a-z0-9_!#\$%&*@@]|^|(?:^|[^a-z0-9_+~.-])[rR][tT]:?)/io
-   REGEXEN[:at_signs] = /[@@]/
-   REGEXEN[:valid_mention_or_list] = /
-     (#{REGEXEN[:valid_mention_preceding_chars]}) # $1: Preceeding character
-     (#{REGEXEN[:at_signs]}) # $2: At mark
-     ([a-z0-9_]{1,20}) # $3: Screen name
-     (\/[a-z][a-zA-Z0-9_\-]{0,24})? # $4: List (optional)
-   /iox
-   REGEXEN[:valid_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-z0-9_]{1,20})/io
-   # Used in Extractor for final filtering
-   REGEXEN[:end_mention_match] = /\A(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/io
-
-   # URL related hash regex collection
-   REGEXEN[:valid_url_preceding_chars] = /(?:[^A-Z0-9@@$###{INVALID_CHARACTERS.join('')}]|^)/io
-   REGEXEN[:invalid_url_without_protocol_preceding_chars] = /[-_.\/]$/
-   DOMAIN_VALID_CHARS = "[^#{PUNCTUATION_CHARS}#{SPACE_CHARS}#{CTRL_CHARS}#{INVALID_CHARACTERS.join('')}#{UNICODE_SPACES.join('')}]"
-   REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
-   REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
-
-   REGEXEN[:valid_gTLD] = %r{
-     (?:
-       (?:#{TLDS['generic'].join('|')})
-       (?=[^0-9a-z@]|$)
-     )
-   }ix
+     # Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand
+     # to access both the list of characters and a pattern suitible for use with String#split
+     # Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE
+     UNICODE_SPACES = [
+       (0x0009..0x000D).to_a, # White_Space # Cc [5] <control-0009>..<control-000D>
+       0x0020, # White_Space # Zs SPACE
+       0x0085, # White_Space # Cc <control-0085>
+       0x00A0, # White_Space # Zs NO-BREAK SPACE
+       0x1680, # White_Space # Zs OGHAM SPACE MARK
+       0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
+       (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE
+       0x2028, # White_Space # Zl LINE SEPARATOR
+       0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
+       0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
+       0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
+       0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
+     ].flatten.map{|c| [c].pack('U*')}.freeze
+     REGEXEN[:spaces] = /[#{UNICODE_SPACES.join('')}]/o
+
+     # Character not allowed in Tweets
+     INVALID_CHARACTERS = [
+       0xFFFE, 0xFEFF, # BOM
+       0xFFFF, # Special
+       0x202A, 0x202B, 0x202C, 0x202D, 0x202E # Directional change
+     ].map{|cp| [cp].pack('U') }.freeze
+     REGEXEN[:invalid_control_characters] = /[#{INVALID_CHARACTERS.join('')}]/o
+
+     major, minor, _patch = RUBY_VERSION.split('.')
+     if major.to_i >= 2 || major.to_i == 1 && minor.to_i >= 9 || (defined?(RUBY_ENGINE) && ["jruby", "rbx"].include?(RUBY_ENGINE))
+       REGEXEN[:list_name] = /[a-z][a-z0-9_\-\u0080-\u00ff]{0,24}/i
+     else
+       # This line barfs at compile time in Ruby 1.9, JRuby, or Rubinius.
+       REGEXEN[:list_name] = eval("/[a-z][a-z0-9_\\-\x80-\xff]{0,24}/i")
+     end

-   REGEXEN[:valid_ccTLD] = %r{
-     (?:
-       (?:#{TLDS['country'].join('|')})
-       (?=[^0-9a-z@]|$)
-     )
-   }ix
-   REGEXEN[:valid_punycode] = /(?:xn--[0-9a-z]+)/i
+     # Latin accented characters
+     # Excludes 0xd7 from the range (the multiplication sign, confusable with "x").
+     # Also excludes 0xf7, the division sign
+     LATIN_ACCENTS = [
+       regex_range(0xc0, 0xd6),
+       regex_range(0xd8, 0xf6),
+       regex_range(0xf8, 0xff),
+       regex_range(0x0100, 0x024f),
+       regex_range(0x0253, 0x0254),
+       regex_range(0x0256, 0x0257),
+       regex_range(0x0259),
+       regex_range(0x025b),
+       regex_range(0x0263),
+       regex_range(0x0268),
+       regex_range(0x026f),
+       regex_range(0x0272),
+       regex_range(0x0289),
+       regex_range(0x028b),
+       regex_range(0x02bb),
+       regex_range(0x0300, 0x036f),
+       regex_range(0x1e00, 0x1eff)
+     ].join('').freeze
+     REGEXEN[:latin_accents] = /[#{LATIN_ACCENTS}]+/o
+
+     RTL_CHARACTERS = [
+       regex_range(0x0600,0x06FF),
+       regex_range(0x0750,0x077F),
+       regex_range(0x0590,0x05FF),
+       regex_range(0xFE70,0xFEFF)
+     ].join('').freeze
+
+     PUNCTUATION_CHARS = '!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~'
+     SPACE_CHARS = " \t\n\x0B\f\r"
+     CTRL_CHARS = "\x00-\x1F\x7F"
+
+     # Generated from unicode_regex/unicode_regex_groups.scala, more inclusive than Ruby's \p{L}\p{M}
+     HASHTAG_LETTERS_AND_MARKS = "\\p{L}\\p{M}" +
+       "\u037f\u0528-\u052f\u08a0-\u08b2\u08e4-\u08ff\u0978\u0980\u0c00\u0c34\u0c81\u0d01\u0ede\u0edf" +
+       "\u10c7\u10cd\u10fd-\u10ff\u16f1-\u16f8\u17b4\u17b5\u191d\u191e\u1ab0-\u1abe\u1bab-\u1bad\u1bba-" +
+       "\u1bbf\u1cf3-\u1cf6\u1cf8\u1cf9\u1de7-\u1df5\u2cf2\u2cf3\u2d27\u2d2d\u2d66\u2d67\u9fcc\ua674-" +
+       "\ua67b\ua698-\ua69d\ua69f\ua792-\ua79f\ua7aa-\ua7ad\ua7b0\ua7b1\ua7f7-\ua7f9\ua9e0-\ua9ef\ua9fa-" +
+       "\ua9fe\uaa7c-\uaa7f\uaae0-\uaaef\uaaf2-\uaaf6\uab30-\uab5a\uab5c-\uab5f\uab64\uab65\uf870-\uf87f" +
+       "\uf882\uf884-\uf89f\uf8b8\uf8c1-\uf8d6\ufa2e\ufa2f\ufe27-\ufe2d\u{102e0}\u{1031f}\u{10350}-\u{1037a}" +
+       "\u{10500}-\u{10527}\u{10530}-\u{10563}\u{10600}-\u{10736}\u{10740}-\u{10755}\u{10760}-\u{10767}" +
+       "\u{10860}-\u{10876}\u{10880}-\u{1089e}\u{10980}-\u{109b7}\u{109be}\u{109bf}\u{10a80}-\u{10a9c}" +
+       "\u{10ac0}-\u{10ac7}\u{10ac9}-\u{10ae6}\u{10b80}-\u{10b91}\u{1107f}\u{110d0}-\u{110e8}\u{11100}-" +
+       "\u{11134}\u{11150}-\u{11173}\u{11176}\u{11180}-\u{111c4}\u{111da}\u{11200}-\u{11211}\u{11213}-" +
+       "\u{11237}\u{112b0}-\u{112ea}\u{11301}-\u{11303}\u{11305}-\u{1130c}\u{1130f}\u{11310}\u{11313}-" +
+       "\u{11328}\u{1132a}-\u{11330}\u{11332}\u{11333}\u{11335}-\u{11339}\u{1133c}-\u{11344}\u{11347}" +
+       "\u{11348}\u{1134b}-\u{1134d}\u{11357}\u{1135d}-\u{11363}\u{11366}-\u{1136c}\u{11370}-\u{11374}" +
+       "\u{11480}-\u{114c5}\u{114c7}\u{11580}-\u{115b5}\u{115b8}-\u{115c0}\u{11600}-\u{11640}\u{11644}" +
+       "\u{11680}-\u{116b7}\u{118a0}-\u{118df}\u{118ff}\u{11ac0}-\u{11af8}\u{1236f}-\u{12398}\u{16a40}-" +
+       "\u{16a5e}\u{16ad0}-\u{16aed}\u{16af0}-\u{16af4}\u{16b00}-\u{16b36}\u{16b40}-\u{16b43}\u{16b63}-" +
+       "\u{16b77}\u{16b7d}-\u{16b8f}\u{16f00}-\u{16f44}\u{16f50}-\u{16f7e}\u{16f8f}-\u{16f9f}\u{1bc00}-" +
+       "\u{1bc6a}\u{1bc70}-\u{1bc7c}\u{1bc80}-\u{1bc88}\u{1bc90}-\u{1bc99}\u{1bc9d}\u{1bc9e}\u{1e800}-" +
+       "\u{1e8c4}\u{1e8d0}-\u{1e8d6}\u{1ee00}-\u{1ee03}\u{1ee05}-\u{1ee1f}\u{1ee21}\u{1ee22}\u{1ee24}" +
+       "\u{1ee27}\u{1ee29}-\u{1ee32}\u{1ee34}-\u{1ee37}\u{1ee39}\u{1ee3b}\u{1ee42}\u{1ee47}\u{1ee49}" +
+       "\u{1ee4b}\u{1ee4d}-\u{1ee4f}\u{1ee51}\u{1ee52}\u{1ee54}\u{1ee57}\u{1ee59}\u{1ee5b}\u{1ee5d}\u{1ee5f}" +
+       "\u{1ee61}\u{1ee62}\u{1ee64}\u{1ee67}-\u{1ee6a}\u{1ee6c}-\u{1ee72}\u{1ee74}-\u{1ee77}\u{1ee79}-" +
+       "\u{1ee7c}\u{1ee7e}\u{1ee80}-\u{1ee89}\u{1ee8b}-\u{1ee9b}\u{1eea1}-\u{1eea3}\u{1eea5}-\u{1eea9}" +
+       "\u{1eeab}-\u{1eebb}"
+
+     # Generated from unicode_regex/unicode_regex_groups.scala, more inclusive than Ruby's \p{Nd}
+     HASHTAG_NUMERALS = "\\p{Nd}" +
+       "\u0de6-\u0def\ua9f0-\ua9f9\u{110f0}-\u{110f9}\u{11136}-\u{1113f}\u{111d0}-\u{111d9}\u{112f0}-" +
+       "\u{112f9}\u{114d0}-\u{114d9}\u{11650}-\u{11659}\u{116c0}-\u{116c9}\u{118e0}-\u{118e9}\u{16a60}-" +
+       "\u{16a69}\u{16b50}-\u{16b59}"
+
+     HASHTAG_SPECIAL_CHARS = "_\u200c\u200d\ua67e\u05be\u05f3\u05f4\uff5e\u301c\u309b\u309c\u30a0\u30fb\u3003\u0f0b\u0f0c\u00b7"
+
+     HASHTAG_LETTERS_NUMERALS = "#{HASHTAG_LETTERS_AND_MARKS}#{HASHTAG_NUMERALS}#{HASHTAG_SPECIAL_CHARS}"
+     HASHTAG_LETTERS_NUMERALS_SET = "[#{HASHTAG_LETTERS_NUMERALS}]"
+     HASHTAG_LETTERS_SET = "[#{HASHTAG_LETTERS_AND_MARKS}]"
+
+     HASHTAG = /(\A|\ufe0e|\ufe0f|[^&#{HASHTAG_LETTERS_NUMERALS}])(#|#)(?!\ufe0f|\u20e3)(#{HASHTAG_LETTERS_NUMERALS_SET}*#{HASHTAG_LETTERS_SET}#{HASHTAG_LETTERS_NUMERALS_SET}*)/io
+
+     REGEXEN[:valid_hashtag] = /#{HASHTAG}/io
+     # Used in Extractor for final filtering
+     REGEXEN[:end_hashtag_match] = /\A(?:[##]|:\/\/)/o
+
+     REGEXEN[:valid_mention_preceding_chars] = /(?:[^a-z0-9_!#\$%&*@@]|^|(?:^|[^a-z0-9_+~.-])[rR][tT]:?)/io
+     REGEXEN[:at_signs] = /[@@]/
+     REGEXEN[:valid_mention_or_list] = /
+       (#{REGEXEN[:valid_mention_preceding_chars]}) # $1: Preceeding character
+       (#{REGEXEN[:at_signs]}) # $2: At mark
+       ([a-z0-9_]{1,20}) # $3: Screen name
+       (\/[a-z][a-zA-Z0-9_\-]{0,24})? # $4: List (optional)
+     /iox
+     REGEXEN[:valid_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-z0-9_]{1,20})/io
+     # Used in Extractor for final filtering
+     REGEXEN[:end_mention_match] = /\A(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/io
+
+     # URL related hash regex collection
+     REGEXEN[:valid_url_preceding_chars] = /(?:[^A-Z0-9@@$###{INVALID_CHARACTERS.join('')}]|^)/io
+     REGEXEN[:invalid_url_without_protocol_preceding_chars] = /[-_.\/]$/
+     DOMAIN_VALID_CHARS = "[^#{PUNCTUATION_CHARS}#{SPACE_CHARS}#{CTRL_CHARS}#{INVALID_CHARACTERS.join('')}#{UNICODE_SPACES.join('')}]"
+     REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
+     REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
+
+     REGEXEN[:valid_gTLD] = %r{
+       (?:
+         (?:#{TLDS['generic'].join('|')})
+         (?=[^0-9a-z@]|$)
+       )
+     }ix

-   REGEXEN[:valid_special_cctld] = %r{
-     (?:
-       (?:co|tv)
-       (?=[^0-9a-z@]|$)
-     )
-   }ix
-
-   REGEXEN[:valid_domain] = /(?:
-     #{REGEXEN[:valid_subdomain]}*#{REGEXEN[:valid_domain_name]}
-     (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
-   )/iox
-
-   # This is used in Extractor
-   REGEXEN[:valid_ascii_domain] = /
-     (?:(?:[a-z0-9\-_]|#{REGEXEN[:latin_accents]})+\.)+
-     (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
-   /iox
-
-   # This is used in Extractor for stricter t.co URL extraction
-   REGEXEN[:valid_tco_url] = /^https?:\/\/t\.co\/([a-z0-9]+)/i
-
-   # This is used in Extractor to filter out unwanted URLs.
-   REGEXEN[:invalid_short_domain] = /\A#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_ccTLD]}\Z/io
-   REGEXEN[:valid_special_short_domain] = /\A#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_special_cctld]}\Z/io
-
-   REGEXEN[:valid_port_number] = /[0-9]+/
-
-   REGEXEN[:valid_general_url_path_chars] = /[a-z\p{Cyrillic}0-9!\*';:=\+\,\.\$\/%#\[\]\p{Pd}_~&\|@#{LATIN_ACCENTS}]/io
-   # Allow URL paths to contain up to two nested levels of balanced parens
-   # 1. Used in Wikipedia URLs like /Primer_(film)
-   # 2. Used in IIS sessions like /S(dfd346)/
-   # 3. Used in Rdio URLs like /track/We_Up_(Album_Version_(Edited))/
-   REGEXEN[:valid_url_balanced_parens] = /
-     \(
+     REGEXEN[:valid_ccTLD] = %r{
  (?:
-       #{REGEXEN[:valid_general_url_path_chars]}+
-       |
-       # allow one nested level of balanced parentheses
+         (?:#{TLDS['country'].join('|')})
+         (?=[^0-9a-z@]|$)
+       )
+     }ix
+     REGEXEN[:valid_punycode] = /(?:xn--[0-9a-z]+)/i
+
+     REGEXEN[:valid_special_cctld] = %r{
+       (?:
+         (?:co|tv)
+         (?=[^0-9a-z@]|$)
+       )
+     }ix
+
+     REGEXEN[:valid_domain] = /(?:
+       #{REGEXEN[:valid_subdomain]}*#{REGEXEN[:valid_domain_name]}
+       (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
+     )/iox
+
+     # This is used in Extractor
+     REGEXEN[:valid_ascii_domain] = /
+       (?:(?:[a-z0-9\-_]|#{REGEXEN[:latin_accents]})+\.)+
+       (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
+     /iox
+
+     # This is used in Extractor for stricter t.co URL extraction
+     REGEXEN[:valid_tco_url] = /^https?:\/\/t\.co\/([a-z0-9]+)/i
+
+     # This is used in Extractor to filter out unwanted URLs.
+     REGEXEN[:invalid_short_domain] = /\A#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_ccTLD]}\Z/io
+     REGEXEN[:valid_special_short_domain] = /\A#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_special_cctld]}\Z/io
+
+     REGEXEN[:valid_port_number] = /[0-9]+/
+
+     REGEXEN[:valid_general_url_path_chars] = /[a-z\p{Cyrillic}0-9!\*';:=\+\,\.\$\/%#\[\]\p{Pd}_~&\|@#{LATIN_ACCENTS}]/io
+     # Allow URL paths to contain up to two nested levels of balanced parens
+     # 1. Used in Wikipedia URLs like /Primer_(film)
+     # 2. Used in IIS sessions like /S(dfd346)/
+     # 3. Used in Rdio URLs like /track/We_Up_(Album_Version_(Edited))/
+     REGEXEN[:valid_url_balanced_parens] = /
+       \(
  (?:
-       #{REGEXEN[:valid_general_url_path_chars]}*
-       \(
-         #{REGEXEN[:valid_general_url_path_chars]}+
-       \)
-       #{REGEXEN[:valid_general_url_path_chars]}*
+         #{REGEXEN[:valid_general_url_path_chars]}+
+         |
+         # allow one nested level of balanced parentheses
+         (?:
+           #{REGEXEN[:valid_general_url_path_chars]}*
+           \(
+             #{REGEXEN[:valid_general_url_path_chars]}+
+           \)
+           #{REGEXEN[:valid_general_url_path_chars]}*
+         )
  )
-     )
-     \)
-   /iox
-   # Valid end-of-path chracters (so /foo. does not gobble the period).
-   # 1. Allow =&# for empty URL parameters and other URL-join artifacts
-   REGEXEN[:valid_url_path_ending_chars] = /[a-z\p{Cyrillic}0-9=_#\/\+\-#{LATIN_ACCENTS}]|(?:#{REGEXEN[:valid_url_balanced_parens]})/io
-   REGEXEN[:valid_url_path] = /(?:
-     (?:
-       #{REGEXEN[:valid_general_url_path_chars]}*
-       (?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)*
-       #{REGEXEN[:valid_url_path_ending_chars]}
-     )|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/)
-   )/iox
-
-   REGEXEN[:valid_url_query_chars] = /[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]/i
-   REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/\-]/i
-   REGEXEN[:valid_url] = %r{
+       \)
+     /iox
+     # Valid end-of-path chracters (so /foo. does not gobble the period).
+     # 1. Allow =&# for empty URL parameters and other URL-join artifacts
+     REGEXEN[:valid_url_path_ending_chars] = /[a-z\p{Cyrillic}0-9=_#\/\+\-#{LATIN_ACCENTS}]|(?:#{REGEXEN[:valid_url_balanced_parens]})/io
+     REGEXEN[:valid_url_path] = /(?:
+       (?:
+         #{REGEXEN[:valid_general_url_path_chars]}*
+         (?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)*
+         #{REGEXEN[:valid_url_path_ending_chars]}
+       )|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/)
+     )/iox
+
+     REGEXEN[:valid_url_query_chars] = /[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]/i
+     REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/\-]/i
+     REGEXEN[:valid_url] = %r{
  ( # $1 total match
  (#{REGEXEN[:valid_url_preceding_chars]}) # $2 Preceeding chracter
  ( # $3 URL
@@ -254,114 +255,115 @@ module Twitter
  (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # $8 Query String
  )
  )
- }iox
-
- REGEXEN[:cashtag] = /[a-z]{1,6}(?:[._][a-z]{1,2})?/i
- REGEXEN[:valid_cashtag] = /(^|#{REGEXEN[:spaces]})(\$)(#{REGEXEN[:cashtag]})(?=$|\s|[#{PUNCTUATION_CHARS}])/i
-
- # These URL validation pattern strings are based on the ABNF from RFC 3986
- REGEXEN[:validate_url_unreserved] = /[a-z\p{Cyrillic}0-9\p{Pd}._~]/i
- REGEXEN[:validate_url_pct_encoded] = /(?:%[0-9a-f]{2})/i
- REGEXEN[:validate_url_sub_delims] = /[!$&'()*+,;=]/i
- REGEXEN[:validate_url_pchar] = /(?:
-   #{REGEXEN[:validate_url_unreserved]}|
-   #{REGEXEN[:validate_url_pct_encoded]}|
-   #{REGEXEN[:validate_url_sub_delims]}|
-   [:\|@]
- )/iox
-
- REGEXEN[:validate_url_scheme] = /(?:[a-z][a-z0-9+\-.]*)/i
- REGEXEN[:validate_url_userinfo] = /(?:
-   #{REGEXEN[:validate_url_unreserved]}|
-   #{REGEXEN[:validate_url_pct_encoded]}|
-   #{REGEXEN[:validate_url_sub_delims]}|
-   :
- )*/iox
-
- REGEXEN[:validate_url_dec_octet] = /(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))/i
- REGEXEN[:validate_url_ipv4] =
-   /(?:#{REGEXEN[:validate_url_dec_octet]}(?:\.#{REGEXEN[:validate_url_dec_octet]}){3})/iox
-
- # Punting on real IPv6 validation for now
- REGEXEN[:validate_url_ipv6] = /(?:\[[a-f0-9:\.]+\])/i
-
- # Also punting on IPvFuture for now
- REGEXEN[:validate_url_ip] = /(?:
-   #{REGEXEN[:validate_url_ipv4]}|
-   #{REGEXEN[:validate_url_ipv6]}
- )/iox
-
- # This is more strict than the rfc specifies
- REGEXEN[:validate_url_subdomain_segment] = /(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)/i
- REGEXEN[:validate_url_domain_segment] = /(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)/i
- REGEXEN[:validate_url_domain_tld] = /(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)/i
- REGEXEN[:validate_url_domain] = /(?:(?:#{REGEXEN[:validate_url_subdomain_segment]}\.)*
-   (?:#{REGEXEN[:validate_url_domain_segment]}\.)
-   #{REGEXEN[:validate_url_domain_tld]})/iox
-
- REGEXEN[:validate_url_host] = /(?:
-   #{REGEXEN[:validate_url_ip]}|
-   #{REGEXEN[:validate_url_domain]}
- )/iox
-
- # Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences
- REGEXEN[:validate_url_unicode_subdomain_segment] =
-   /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9_\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
- REGEXEN[:validate_url_unicode_domain_segment] =
-   /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
- REGEXEN[:validate_url_unicode_domain_tld] =
-   /(?:(?:[a-z]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
- REGEXEN[:validate_url_unicode_domain] = /(?:(?:#{REGEXEN[:validate_url_unicode_subdomain_segment]}\.)*
-   (?:#{REGEXEN[:validate_url_unicode_domain_segment]}\.)
-   #{REGEXEN[:validate_url_unicode_domain_tld]})/iox
-
- REGEXEN[:validate_url_unicode_host] = /(?:
-   #{REGEXEN[:validate_url_ip]}|
-   #{REGEXEN[:validate_url_unicode_domain]}
- )/iox
-
- REGEXEN[:validate_url_port] = /[0-9]{1,5}/
-
- REGEXEN[:validate_url_unicode_authority] = %r{
-   (?:(#{REGEXEN[:validate_url_userinfo]})@)? # $1 userinfo
-   (#{REGEXEN[:validate_url_unicode_host]}) # $2 host
-   (?::(#{REGEXEN[:validate_url_port]}))? # $3 port
- }iox
-
- REGEXEN[:validate_url_authority] = %r{
-   (?:(#{REGEXEN[:validate_url_userinfo]})@)? # $1 userinfo
-   (#{REGEXEN[:validate_url_host]}) # $2 host
-   (?::(#{REGEXEN[:validate_url_port]}))? # $3 port
- }iox
-
- REGEXEN[:validate_url_path] = %r{(/#{REGEXEN[:validate_url_pchar]}*)*}i
- REGEXEN[:validate_url_query] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
- REGEXEN[:validate_url_fragment] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
-
- # Modified version of RFC 3986 Appendix B
- REGEXEN[:validate_url_unencoded] = %r{
-   \A # Full URL
-   (?:
-     ([^:/?#]+):// # $1 Scheme
-   )?
-   ([^/?#]*) # $2 Authority
-   ([^?#]*) # $3 Path
-   (?:
-     \?([^#]*) # $4 Query
-   )?
-   (?:
-     \#(.*) # $5 Fragment
-   )?\Z
- }ix
-
- REGEXEN[:rtl_chars] = /[#{RTL_CHARACTERS}]/io
-
- REGEXEN.each_pair{|k,v| v.freeze }
-
- # Return the regular expression for a given <tt>key</tt>. If the <tt>key</tt>
- # is not a known symbol a <tt>nil</tt> will be returned.
- def self.[](key)
-   REGEXEN[key]
+ }iox
+
+ REGEXEN[:cashtag] = /[a-z]{1,6}(?:[._][a-z]{1,2})?/i
+ REGEXEN[:valid_cashtag] = /(^|#{REGEXEN[:spaces]})(\$)(#{REGEXEN[:cashtag]})(?=$|\s|[#{PUNCTUATION_CHARS}])/i
+
+ # These URL validation pattern strings are based on the ABNF from RFC 3986
+ REGEXEN[:validate_url_unreserved] = /[a-z\p{Cyrillic}0-9\p{Pd}._~]/i
+ REGEXEN[:validate_url_pct_encoded] = /(?:%[0-9a-f]{2})/i
+ REGEXEN[:validate_url_sub_delims] = /[!$&'()*+,;=]/i
+ REGEXEN[:validate_url_pchar] = /(?:
+   #{REGEXEN[:validate_url_unreserved]}|
+   #{REGEXEN[:validate_url_pct_encoded]}|
+   #{REGEXEN[:validate_url_sub_delims]}|
+   [:\|@]
+ )/iox
+
+ REGEXEN[:validate_url_scheme] = /(?:[a-z][a-z0-9+\-.]*)/i
+ REGEXEN[:validate_url_userinfo] = /(?:
+   #{REGEXEN[:validate_url_unreserved]}|
+   #{REGEXEN[:validate_url_pct_encoded]}|
+   #{REGEXEN[:validate_url_sub_delims]}|
+   :
+ )*/iox
+
+ REGEXEN[:validate_url_dec_octet] = /(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))/i
+ REGEXEN[:validate_url_ipv4] =
+   /(?:#{REGEXEN[:validate_url_dec_octet]}(?:\.#{REGEXEN[:validate_url_dec_octet]}){3})/iox
+
+ # Punting on real IPv6 validation for now
+ REGEXEN[:validate_url_ipv6] = /(?:\[[a-f0-9:\.]+\])/i
+
+ # Also punting on IPvFuture for now
+ REGEXEN[:validate_url_ip] = /(?:
+   #{REGEXEN[:validate_url_ipv4]}|
+   #{REGEXEN[:validate_url_ipv6]}
+ )/iox
+
+ # This is more strict than the rfc specifies
+ REGEXEN[:validate_url_subdomain_segment] = /(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)/i
+ REGEXEN[:validate_url_domain_segment] = /(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)/i
+ REGEXEN[:validate_url_domain_tld] = /(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)/i
+ REGEXEN[:validate_url_domain] = /(?:(?:#{REGEXEN[:validate_url_subdomain_segment]}\.)*
+   (?:#{REGEXEN[:validate_url_domain_segment]}\.)
+   #{REGEXEN[:validate_url_domain_tld]})/iox
+
+ REGEXEN[:validate_url_host] = /(?:
+   #{REGEXEN[:validate_url_ip]}|
+   #{REGEXEN[:validate_url_domain]}
+ )/iox
+
+ # Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences
+ REGEXEN[:validate_url_unicode_subdomain_segment] =
+   /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9_\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
+ REGEXEN[:validate_url_unicode_domain_segment] =
+   /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
+ REGEXEN[:validate_url_unicode_domain_tld] =
+   /(?:(?:[a-z]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
+ REGEXEN[:validate_url_unicode_domain] = /(?:(?:#{REGEXEN[:validate_url_unicode_subdomain_segment]}\.)*
+   (?:#{REGEXEN[:validate_url_unicode_domain_segment]}\.)
+   #{REGEXEN[:validate_url_unicode_domain_tld]})/iox
+
+ REGEXEN[:validate_url_unicode_host] = /(?:
+   #{REGEXEN[:validate_url_ip]}|
+   #{REGEXEN[:validate_url_unicode_domain]}
+ )/iox
+
+ REGEXEN[:validate_url_port] = /[0-9]{1,5}/
+
+ REGEXEN[:validate_url_unicode_authority] = %r{
+   (?:(#{REGEXEN[:validate_url_userinfo]})@)? # $1 userinfo
+   (#{REGEXEN[:validate_url_unicode_host]}) # $2 host
+   (?::(#{REGEXEN[:validate_url_port]}))? # $3 port
+ }iox
+
+ REGEXEN[:validate_url_authority] = %r{
+   (?:(#{REGEXEN[:validate_url_userinfo]})@)? # $1 userinfo
+   (#{REGEXEN[:validate_url_host]}) # $2 host
+   (?::(#{REGEXEN[:validate_url_port]}))? # $3 port
+ }iox
+
+ REGEXEN[:validate_url_path] = %r{(/#{REGEXEN[:validate_url_pchar]}*)*}i
+ REGEXEN[:validate_url_query] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
+ REGEXEN[:validate_url_fragment] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
+
+ # Modified version of RFC 3986 Appendix B
+ REGEXEN[:validate_url_unencoded] = %r{
+   \A # Full URL
+   (?:
+     ([^:/?#]+):// # $1 Scheme
+   )?
+   ([^/?#]*) # $2 Authority
+   ([^?#]*) # $3 Path
+   (?:
+     \?([^#]*) # $4 Query
+   )?
+   (?:
+     \#(.*) # $5 Fragment
+   )?\Z
+ }ix
+
+ REGEXEN[:rtl_chars] = /[#{RTL_CHARACTERS}]/io
+
+ REGEXEN.each_pair{|k,v| v.freeze }
+
+ # Return the regular expression for a given <tt>key</tt>. If the <tt>key</tt>
+ # is not a known symbol a <tt>nil</tt> will be returned.
+ def self.[](key)
+   REGEXEN[key]
+ end
  end
  end
  end
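Regex itself is unchanged in behaviour: Regex.[] still looks up a frozen pattern in REGEXEN and returns nil for unknown keys; only the namespace moves in 2.1.0. A minimal sketch under the same assumptions (:no_such_key is a made-up key used only to show the nil case):

    require 'twitter-text'

    # Known keys return the frozen Regexp registered at load time.
    Twitter::TwitterText::Regex[:valid_hashtag].is_a?(Regexp)  # => true

    # Unknown keys are simply not in REGEXEN, so the lookup returns nil.
    Twitter::TwitterText::Regex[:no_such_key]                  # => nil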