twitter-text 2.0.2 → 2.1.0

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
@@ -1,21 +1,23 @@
  module Twitter
- module HashHelper
- # Return a new hash with all keys converted to symbols, as long as
- # they respond to +to_sym+.
- #
- # { 'name' => 'Rob', 'years' => '28' }.symbolize_keys
- # #=> { :name => "Rob", :years => "28" }
- def self.symbolize_keys(hash)
- symbolize_keys!(hash.dup)
- end
+ module TwitterText
+ module HashHelper
+ # Return a new hash with all keys converted to symbols, as long as
+ # they respond to +to_sym+.
+ #
+ # { 'name' => 'Rob', 'years' => '28' }.symbolize_keys
+ # #=> { :name => "Rob", :years => "28" }
+ def self.symbolize_keys(hash)
+ symbolize_keys!(hash.dup)
+ end
 
- # Destructively convert all keys to symbols, as long as they respond
- # to +to_sym+. Same as +symbolize_keys+, but modifies +self+.
- def self.symbolize_keys!(hash)
- hash.keys.each do |key|
- hash[(key.to_sym rescue key) || key] = hash.delete(key)
+ # Destructively convert all keys to symbols, as long as they respond
+ # to +to_sym+. Same as +symbolize_keys+, but modifies +self+.
+ def self.symbolize_keys!(hash)
+ hash.keys.each do |key|
+ hash[(key.to_sym rescue key) || key] = hash.delete(key)
+ end
+ hash
  end
- hash
  end
  end
  end
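
The hunk above moves HashHelper from the Twitter module into the new Twitter::TwitterText namespace without changing its behavior. A minimal sketch of what that means for callers (assuming the gem is loaded with require 'twitter-text'; the sample hash mirrors the example in the module's own comments):

    require 'twitter-text'

    # 2.0.2 exposed this as Twitter::HashHelper.symbolize_keys;
    # 2.1.0 nests the same helper under Twitter::TwitterText.
    Twitter::TwitterText::HashHelper.symbolize_keys('name' => 'Rob', 'years' => '28')
    # => { :name => "Rob", :years => "28" }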
@@ -1,86 +1,88 @@
  module Twitter
- # Module for doing "hit highlighting" on tweets that have been auto-linked already.
- # Useful with the results returned from the Search API.
- module HitHighlighter extend self
- # Default Tag used for hit highlighting
- DEFAULT_HIGHLIGHT_TAG = "em"
+ module TwitterText
+ # Module for doing "hit highlighting" on tweets that have been auto-linked already.
+ # Useful with the results returned from the Search API.
+ module HitHighlighter extend self
+ # Default Tag used for hit highlighting
+ DEFAULT_HIGHLIGHT_TAG = "em"
 
- # Add <tt><em></em></tt> tags around the <tt>hits</tt> provided in the <tt>text</tt>. The
- # <tt>hits</tt> should be an array of (start, end) index pairs, relative to the original
- # text, before auto-linking (but the <tt>text</tt> may already be auto-linked if desired)
- #
- # The <tt><em></em></tt> tags can be overridden using the <tt>:tag</tt> option. For example:
- #
- # irb> hit_highlight("test hit here", [[5, 8]], :tag => 'strong')
- # => "test <strong>hit</strong> here"
- def hit_highlight(text, hits = [], options = {})
- if hits.empty?
- return text
- end
+ # Add <tt><em></em></tt> tags around the <tt>hits</tt> provided in the <tt>text</tt>. The
+ # <tt>hits</tt> should be an array of (start, end) index pairs, relative to the original
+ # text, before auto-linking (but the <tt>text</tt> may already be auto-linked if desired)
+ #
+ # The <tt><em></em></tt> tags can be overridden using the <tt>:tag</tt> option. For example:
+ #
+ # irb> hit_highlight("test hit here", [[5, 8]], :tag => 'strong')
+ # => "test <strong>hit</strong> here"
+ def hit_highlight(text, hits = [], options = {})
+ if hits.empty?
+ return text
+ end
 
- tag_name = options[:tag] || DEFAULT_HIGHLIGHT_TAG
- tags = ["<" + tag_name + ">", "</" + tag_name + ">"]
+ tag_name = options[:tag] || DEFAULT_HIGHLIGHT_TAG
+ tags = ["<" + tag_name + ">", "</" + tag_name + ">"]
 
- chunks = text.split(/[<>]/)
+ chunks = text.split(/[<>]/)
 
- result = []
- chunk_index, chunk = 0, chunks[0]
- chunk_chars = chunk.to_s.to_char_a
- prev_chunks_len = 0
- chunk_cursor = 0
- start_in_chunk = false
- for hit, index in hits.flatten.each_with_index do
- tag = tags[index % 2]
+ result = []
+ chunk_index, chunk = 0, chunks[0]
+ chunk_chars = chunk.to_s.to_char_a
+ prev_chunks_len = 0
+ chunk_cursor = 0
+ start_in_chunk = false
+ for hit, index in hits.flatten.each_with_index do
+ tag = tags[index % 2]
 
- placed = false
- until chunk.nil? || hit < prev_chunks_len + chunk.length do
- result << chunk_chars[chunk_cursor..-1]
- if start_in_chunk && hit == prev_chunks_len + chunk_chars.length
- result << tag
- placed = true
- end
+ placed = false
+ until chunk.nil? || hit < prev_chunks_len + chunk.length do
+ result << chunk_chars[chunk_cursor..-1]
+ if start_in_chunk && hit == prev_chunks_len + chunk_chars.length
+ result << tag
+ placed = true
+ end
 
- # correctly handle highlights that end on the final character.
- if tag_text = chunks[chunk_index+1]
- result << "<#{tag_text}>"
+ # correctly handle highlights that end on the final character.
+ if tag_text = chunks[chunk_index+1]
+ result << "<#{tag_text}>"
+ end
+
+ prev_chunks_len += chunk_chars.length
+ chunk_cursor = 0
+ chunk_index += 2
+ chunk = chunks[chunk_index]
+ chunk_chars = chunk.to_s.to_char_a
+ start_in_chunk = false
  end
 
- prev_chunks_len += chunk_chars.length
- chunk_cursor = 0
- chunk_index += 2
- chunk = chunks[chunk_index]
- chunk_chars = chunk.to_s.to_char_a
- start_in_chunk = false
- end
+ if !placed && !chunk.nil?
+ hit_spot = hit - prev_chunks_len
+ result << chunk_chars[chunk_cursor...hit_spot] << tag
+ chunk_cursor = hit_spot
+ if index % 2 == 0
+ start_in_chunk = true
+ else
+ start_in_chunk = false
+ end
+ placed = true
+ end
 
- if !placed && !chunk.nil?
- hit_spot = hit - prev_chunks_len
- result << chunk_chars[chunk_cursor...hit_spot] << tag
- chunk_cursor = hit_spot
- if index % 2 == 0
- start_in_chunk = true
- else
- start_in_chunk = false
+ # ultimate fallback, hits that run off the end get a closing tag
+ if !placed
+ result << tag
  end
- placed = true
  end
 
- # ultimate fallback, hits that run off the end get a closing tag
- if !placed
- result << tag
+ if chunk
+ if chunk_cursor < chunk_chars.length
+ result << chunk_chars[chunk_cursor..-1]
+ end
+ (chunk_index+1).upto(chunks.length-1).each do |i|
+ result << (i.even? ? chunks[i] : "<#{chunks[i]}>")
+ end
  end
- end
 
- if chunk
- if chunk_cursor < chunk_chars.length
- result << chunk_chars[chunk_cursor..-1]
- end
- (chunk_index+1).upto(chunks.length-1).each do |i|
- result << (i.even? ? chunks[i] : "<#{chunks[i]}>")
- end
+ result.flatten.join
  end
-
- result.flatten.join
  end
  end
  end
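
HitHighlighter gets the same namespace treatment, and because the module uses extend self its methods remain callable at module level. A short usage sketch based on the example in the module's own comments (the new Twitter::TwitterText prefix is the only assumption beyond what the diff shows):

    require 'twitter-text'

    # hit_highlight wraps each (start, end) hit in <em> tags by default;
    # the :tag option overrides the tag name, as documented above.
    Twitter::TwitterText::HitHighlighter.hit_highlight("test hit here", [[5, 8]])
    # => "test <em>hit</em> here"
    Twitter::TwitterText::HitHighlighter.hit_highlight("test hit here", [[5, 8]], :tag => 'strong')
    # => "test <strong>hit</strong> here"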
@@ -1,249 +1,250 @@
  # encoding: utf-8
 
  module Twitter
- # A collection of regular expressions for parsing Tweet text. The regular expression
- # list is frozen at load time to ensure immutability. These regular expressions are
- # used throughout the <tt>Twitter</tt> classes. Special care has been taken to make
- # sure these reular expressions work with Tweets in all languages.
- class Regex
- require 'yaml'
-
- REGEXEN = {} # :nodoc:
-
- def self.regex_range(from, to = nil) # :nodoc:
- if $RUBY_1_9
- if to
- "\\u{#{from.to_s(16).rjust(4, '0')}}-\\u{#{to.to_s(16).rjust(4, '0')}}"
+ module TwitterText
+ # A collection of regular expressions for parsing Tweet text. The regular expression
+ # list is frozen at load time to ensure immutability. These regular expressions are
+ # used throughout the <tt>TwitterText</tt> classes. Special care has been taken to make
+ # sure these reular expressions work with Tweets in all languages.
+ class Regex
+ require 'yaml'
+
+ REGEXEN = {} # :nodoc:
+
+ def self.regex_range(from, to = nil) # :nodoc:
+ if $RUBY_1_9
+ if to
+ "\\u{#{from.to_s(16).rjust(4, '0')}}-\\u{#{to.to_s(16).rjust(4, '0')}}"
+ else
+ "\\u{#{from.to_s(16).rjust(4, '0')}}"
+ end
  else
- "\\u{#{from.to_s(16).rjust(4, '0')}}"
- end
- else
- if to
- [from].pack('U') + '-' + [to].pack('U')
- else
- [from].pack('U')
+ if to
+ [from].pack('U') + '-' + [to].pack('U')
+ else
+ [from].pack('U')
+ end
  end
  end
- end
 
- TLDS = YAML.load_file(
- File.join(
- File.expand_path('../../..', __FILE__), # project root
- 'lib', 'assets', 'tld_lib.yml'
+ TLDS = YAML.load_file(
+ File.join(
+ File.expand_path('../../..', __FILE__), # project root
+ 'lib', 'assets', 'tld_lib.yml'
+ )
  )
- )
-
- # Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand
- # to access both the list of characters and a pattern suitible for use with String#split
- # Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE
- UNICODE_SPACES = [
- (0x0009..0x000D).to_a, # White_Space # Cc [5] <control-0009>..<control-000D>
- 0x0020, # White_Space # Zs SPACE
- 0x0085, # White_Space # Cc <control-0085>
- 0x00A0, # White_Space # Zs NO-BREAK SPACE
- 0x1680, # White_Space # Zs OGHAM SPACE MARK
- 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
- (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE
- 0x2028, # White_Space # Zl LINE SEPARATOR
- 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
- 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
- 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
- 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
- ].flatten.map{|c| [c].pack('U*')}.freeze
- REGEXEN[:spaces] = /[#{UNICODE_SPACES.join('')}]/o
-
- # Character not allowed in Tweets
- INVALID_CHARACTERS = [
- 0xFFFE, 0xFEFF, # BOM
- 0xFFFF, # Special
- 0x202A, 0x202B, 0x202C, 0x202D, 0x202E # Directional change
- ].map{|cp| [cp].pack('U') }.freeze
- REGEXEN[:invalid_control_characters] = /[#{INVALID_CHARACTERS.join('')}]/o
-
- major, minor, _patch = RUBY_VERSION.split('.')
- if major.to_i >= 2 || major.to_i == 1 && minor.to_i >= 9 || (defined?(RUBY_ENGINE) && ["jruby", "rbx"].include?(RUBY_ENGINE))
- REGEXEN[:list_name] = /[a-z][a-z0-9_\-\u0080-\u00ff]{0,24}/i
- else
- # This line barfs at compile time in Ruby 1.9, JRuby, or Rubinius.
- REGEXEN[:list_name] = eval("/[a-z][a-z0-9_\\-\x80-\xff]{0,24}/i")
- end
 
- # Latin accented characters
- # Excludes 0xd7 from the range (the multiplication sign, confusable with "x").
- # Also excludes 0xf7, the division sign
- LATIN_ACCENTS = [
- regex_range(0xc0, 0xd6),
- regex_range(0xd8, 0xf6),
- regex_range(0xf8, 0xff),
- regex_range(0x0100, 0x024f),
- regex_range(0x0253, 0x0254),
- regex_range(0x0256, 0x0257),
- regex_range(0x0259),
- regex_range(0x025b),
- regex_range(0x0263),
- regex_range(0x0268),
- regex_range(0x026f),
- regex_range(0x0272),
- regex_range(0x0289),
- regex_range(0x028b),
- regex_range(0x02bb),
- regex_range(0x0300, 0x036f),
- regex_range(0x1e00, 0x1eff)
- ].join('').freeze
- REGEXEN[:latin_accents] = /[#{LATIN_ACCENTS}]+/o
-
- RTL_CHARACTERS = [
- regex_range(0x0600,0x06FF),
- regex_range(0x0750,0x077F),
- regex_range(0x0590,0x05FF),
- regex_range(0xFE70,0xFEFF)
- ].join('').freeze
-
- PUNCTUATION_CHARS = '!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~'
- SPACE_CHARS = " \t\n\x0B\f\r"
- CTRL_CHARS = "\x00-\x1F\x7F"
-
- # Generated from unicode_regex/unicode_regex_groups.scala, more inclusive than Ruby's \p{L}\p{M}
- HASHTAG_LETTERS_AND_MARKS = "\\p{L}\\p{M}" +
- "\u037f\u0528-\u052f\u08a0-\u08b2\u08e4-\u08ff\u0978\u0980\u0c00\u0c34\u0c81\u0d01\u0ede\u0edf" +
- "\u10c7\u10cd\u10fd-\u10ff\u16f1-\u16f8\u17b4\u17b5\u191d\u191e\u1ab0-\u1abe\u1bab-\u1bad\u1bba-" +
- "\u1bbf\u1cf3-\u1cf6\u1cf8\u1cf9\u1de7-\u1df5\u2cf2\u2cf3\u2d27\u2d2d\u2d66\u2d67\u9fcc\ua674-" +
- "\ua67b\ua698-\ua69d\ua69f\ua792-\ua79f\ua7aa-\ua7ad\ua7b0\ua7b1\ua7f7-\ua7f9\ua9e0-\ua9ef\ua9fa-" +
- "\ua9fe\uaa7c-\uaa7f\uaae0-\uaaef\uaaf2-\uaaf6\uab30-\uab5a\uab5c-\uab5f\uab64\uab65\uf870-\uf87f" +
- "\uf882\uf884-\uf89f\uf8b8\uf8c1-\uf8d6\ufa2e\ufa2f\ufe27-\ufe2d\u{102e0}\u{1031f}\u{10350}-\u{1037a}" +
- "\u{10500}-\u{10527}\u{10530}-\u{10563}\u{10600}-\u{10736}\u{10740}-\u{10755}\u{10760}-\u{10767}" +
- "\u{10860}-\u{10876}\u{10880}-\u{1089e}\u{10980}-\u{109b7}\u{109be}\u{109bf}\u{10a80}-\u{10a9c}" +
- "\u{10ac0}-\u{10ac7}\u{10ac9}-\u{10ae6}\u{10b80}-\u{10b91}\u{1107f}\u{110d0}-\u{110e8}\u{11100}-" +
- "\u{11134}\u{11150}-\u{11173}\u{11176}\u{11180}-\u{111c4}\u{111da}\u{11200}-\u{11211}\u{11213}-" +
- "\u{11237}\u{112b0}-\u{112ea}\u{11301}-\u{11303}\u{11305}-\u{1130c}\u{1130f}\u{11310}\u{11313}-" +
- "\u{11328}\u{1132a}-\u{11330}\u{11332}\u{11333}\u{11335}-\u{11339}\u{1133c}-\u{11344}\u{11347}" +
- "\u{11348}\u{1134b}-\u{1134d}\u{11357}\u{1135d}-\u{11363}\u{11366}-\u{1136c}\u{11370}-\u{11374}" +
- "\u{11480}-\u{114c5}\u{114c7}\u{11580}-\u{115b5}\u{115b8}-\u{115c0}\u{11600}-\u{11640}\u{11644}" +
- "\u{11680}-\u{116b7}\u{118a0}-\u{118df}\u{118ff}\u{11ac0}-\u{11af8}\u{1236f}-\u{12398}\u{16a40}-" +
- "\u{16a5e}\u{16ad0}-\u{16aed}\u{16af0}-\u{16af4}\u{16b00}-\u{16b36}\u{16b40}-\u{16b43}\u{16b63}-" +
- "\u{16b77}\u{16b7d}-\u{16b8f}\u{16f00}-\u{16f44}\u{16f50}-\u{16f7e}\u{16f8f}-\u{16f9f}\u{1bc00}-" +
- "\u{1bc6a}\u{1bc70}-\u{1bc7c}\u{1bc80}-\u{1bc88}\u{1bc90}-\u{1bc99}\u{1bc9d}\u{1bc9e}\u{1e800}-" +
- "\u{1e8c4}\u{1e8d0}-\u{1e8d6}\u{1ee00}-\u{1ee03}\u{1ee05}-\u{1ee1f}\u{1ee21}\u{1ee22}\u{1ee24}" +
- "\u{1ee27}\u{1ee29}-\u{1ee32}\u{1ee34}-\u{1ee37}\u{1ee39}\u{1ee3b}\u{1ee42}\u{1ee47}\u{1ee49}" +
- "\u{1ee4b}\u{1ee4d}-\u{1ee4f}\u{1ee51}\u{1ee52}\u{1ee54}\u{1ee57}\u{1ee59}\u{1ee5b}\u{1ee5d}\u{1ee5f}" +
- "\u{1ee61}\u{1ee62}\u{1ee64}\u{1ee67}-\u{1ee6a}\u{1ee6c}-\u{1ee72}\u{1ee74}-\u{1ee77}\u{1ee79}-" +
- "\u{1ee7c}\u{1ee7e}\u{1ee80}-\u{1ee89}\u{1ee8b}-\u{1ee9b}\u{1eea1}-\u{1eea3}\u{1eea5}-\u{1eea9}" +
- "\u{1eeab}-\u{1eebb}"
-
- # Generated from unicode_regex/unicode_regex_groups.scala, more inclusive than Ruby's \p{Nd}
- HASHTAG_NUMERALS = "\\p{Nd}" +
- "\u0de6-\u0def\ua9f0-\ua9f9\u{110f0}-\u{110f9}\u{11136}-\u{1113f}\u{111d0}-\u{111d9}\u{112f0}-" +
- "\u{112f9}\u{114d0}-\u{114d9}\u{11650}-\u{11659}\u{116c0}-\u{116c9}\u{118e0}-\u{118e9}\u{16a60}-" +
- "\u{16a69}\u{16b50}-\u{16b59}"
-
- HASHTAG_SPECIAL_CHARS = "_\u200c\u200d\ua67e\u05be\u05f3\u05f4\uff5e\u301c\u309b\u309c\u30a0\u30fb\u3003\u0f0b\u0f0c\u00b7"
-
- HASHTAG_LETTERS_NUMERALS = "#{HASHTAG_LETTERS_AND_MARKS}#{HASHTAG_NUMERALS}#{HASHTAG_SPECIAL_CHARS}"
- HASHTAG_LETTERS_NUMERALS_SET = "[#{HASHTAG_LETTERS_NUMERALS}]"
- HASHTAG_LETTERS_SET = "[#{HASHTAG_LETTERS_AND_MARKS}]"
-
- HASHTAG = /(\A|\ufe0e|\ufe0f|[^&#{HASHTAG_LETTERS_NUMERALS}])(#|#)(?!\ufe0f|\u20e3)(#{HASHTAG_LETTERS_NUMERALS_SET}*#{HASHTAG_LETTERS_SET}#{HASHTAG_LETTERS_NUMERALS_SET}*)/io
-
- REGEXEN[:valid_hashtag] = /#{HASHTAG}/io
- # Used in Extractor for final filtering
- REGEXEN[:end_hashtag_match] = /\A(?:[##]|:\/\/)/o
-
- REGEXEN[:valid_mention_preceding_chars] = /(?:[^a-z0-9_!#\$%&*@@]|^|(?:^|[^a-z0-9_+~.-])[rR][tT]:?)/io
- REGEXEN[:at_signs] = /[@@]/
- REGEXEN[:valid_mention_or_list] = /
- (#{REGEXEN[:valid_mention_preceding_chars]}) # $1: Preceeding character
- (#{REGEXEN[:at_signs]}) # $2: At mark
- ([a-z0-9_]{1,20}) # $3: Screen name
- (\/[a-z][a-zA-Z0-9_\-]{0,24})? # $4: List (optional)
- /iox
- REGEXEN[:valid_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-z0-9_]{1,20})/io
- # Used in Extractor for final filtering
- REGEXEN[:end_mention_match] = /\A(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/io
-
- # URL related hash regex collection
- REGEXEN[:valid_url_preceding_chars] = /(?:[^A-Z0-9@@$###{INVALID_CHARACTERS.join('')}]|^)/io
- REGEXEN[:invalid_url_without_protocol_preceding_chars] = /[-_.\/]$/
- DOMAIN_VALID_CHARS = "[^#{PUNCTUATION_CHARS}#{SPACE_CHARS}#{CTRL_CHARS}#{INVALID_CHARACTERS.join('')}#{UNICODE_SPACES.join('')}]"
- REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
- REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
-
- REGEXEN[:valid_gTLD] = %r{
- (?:
- (?:#{TLDS['generic'].join('|')})
- (?=[^0-9a-z@]|$)
- )
- }ix
+ # Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand
+ # to access both the list of characters and a pattern suitible for use with String#split
+ # Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE
+ UNICODE_SPACES = [
+ (0x0009..0x000D).to_a, # White_Space # Cc [5] <control-0009>..<control-000D>
+ 0x0020, # White_Space # Zs SPACE
+ 0x0085, # White_Space # Cc <control-0085>
+ 0x00A0, # White_Space # Zs NO-BREAK SPACE
+ 0x1680, # White_Space # Zs OGHAM SPACE MARK
+ 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
+ (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE
+ 0x2028, # White_Space # Zl LINE SEPARATOR
+ 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
+ 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
+ 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
+ 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
+ ].flatten.map{|c| [c].pack('U*')}.freeze
+ REGEXEN[:spaces] = /[#{UNICODE_SPACES.join('')}]/o
+
+ # Character not allowed in Tweets
+ INVALID_CHARACTERS = [
+ 0xFFFE, 0xFEFF, # BOM
+ 0xFFFF, # Special
+ 0x202A, 0x202B, 0x202C, 0x202D, 0x202E # Directional change
+ ].map{|cp| [cp].pack('U') }.freeze
+ REGEXEN[:invalid_control_characters] = /[#{INVALID_CHARACTERS.join('')}]/o
+
+ major, minor, _patch = RUBY_VERSION.split('.')
+ if major.to_i >= 2 || major.to_i == 1 && minor.to_i >= 9 || (defined?(RUBY_ENGINE) && ["jruby", "rbx"].include?(RUBY_ENGINE))
+ REGEXEN[:list_name] = /[a-z][a-z0-9_\-\u0080-\u00ff]{0,24}/i
+ else
+ # This line barfs at compile time in Ruby 1.9, JRuby, or Rubinius.
+ REGEXEN[:list_name] = eval("/[a-z][a-z0-9_\\-\x80-\xff]{0,24}/i")
+ end
 
- REGEXEN[:valid_ccTLD] = %r{
- (?:
- (?:#{TLDS['country'].join('|')})
- (?=[^0-9a-z@]|$)
- )
- }ix
- REGEXEN[:valid_punycode] = /(?:xn--[0-9a-z]+)/i
+ # Latin accented characters
+ # Excludes 0xd7 from the range (the multiplication sign, confusable with "x").
+ # Also excludes 0xf7, the division sign
+ LATIN_ACCENTS = [
+ regex_range(0xc0, 0xd6),
+ regex_range(0xd8, 0xf6),
+ regex_range(0xf8, 0xff),
+ regex_range(0x0100, 0x024f),
+ regex_range(0x0253, 0x0254),
+ regex_range(0x0256, 0x0257),
+ regex_range(0x0259),
+ regex_range(0x025b),
+ regex_range(0x0263),
+ regex_range(0x0268),
+ regex_range(0x026f),
+ regex_range(0x0272),
+ regex_range(0x0289),
+ regex_range(0x028b),
+ regex_range(0x02bb),
+ regex_range(0x0300, 0x036f),
+ regex_range(0x1e00, 0x1eff)
+ ].join('').freeze
+ REGEXEN[:latin_accents] = /[#{LATIN_ACCENTS}]+/o
+
+ RTL_CHARACTERS = [
+ regex_range(0x0600,0x06FF),
+ regex_range(0x0750,0x077F),
+ regex_range(0x0590,0x05FF),
+ regex_range(0xFE70,0xFEFF)
+ ].join('').freeze
+
+ PUNCTUATION_CHARS = '!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~'
+ SPACE_CHARS = " \t\n\x0B\f\r"
+ CTRL_CHARS = "\x00-\x1F\x7F"
+
+ # Generated from unicode_regex/unicode_regex_groups.scala, more inclusive than Ruby's \p{L}\p{M}
+ HASHTAG_LETTERS_AND_MARKS = "\\p{L}\\p{M}" +
+ "\u037f\u0528-\u052f\u08a0-\u08b2\u08e4-\u08ff\u0978\u0980\u0c00\u0c34\u0c81\u0d01\u0ede\u0edf" +
+ "\u10c7\u10cd\u10fd-\u10ff\u16f1-\u16f8\u17b4\u17b5\u191d\u191e\u1ab0-\u1abe\u1bab-\u1bad\u1bba-" +
+ "\u1bbf\u1cf3-\u1cf6\u1cf8\u1cf9\u1de7-\u1df5\u2cf2\u2cf3\u2d27\u2d2d\u2d66\u2d67\u9fcc\ua674-" +
+ "\ua67b\ua698-\ua69d\ua69f\ua792-\ua79f\ua7aa-\ua7ad\ua7b0\ua7b1\ua7f7-\ua7f9\ua9e0-\ua9ef\ua9fa-" +
+ "\ua9fe\uaa7c-\uaa7f\uaae0-\uaaef\uaaf2-\uaaf6\uab30-\uab5a\uab5c-\uab5f\uab64\uab65\uf870-\uf87f" +
+ "\uf882\uf884-\uf89f\uf8b8\uf8c1-\uf8d6\ufa2e\ufa2f\ufe27-\ufe2d\u{102e0}\u{1031f}\u{10350}-\u{1037a}" +
+ "\u{10500}-\u{10527}\u{10530}-\u{10563}\u{10600}-\u{10736}\u{10740}-\u{10755}\u{10760}-\u{10767}" +
+ "\u{10860}-\u{10876}\u{10880}-\u{1089e}\u{10980}-\u{109b7}\u{109be}\u{109bf}\u{10a80}-\u{10a9c}" +
+ "\u{10ac0}-\u{10ac7}\u{10ac9}-\u{10ae6}\u{10b80}-\u{10b91}\u{1107f}\u{110d0}-\u{110e8}\u{11100}-" +
+ "\u{11134}\u{11150}-\u{11173}\u{11176}\u{11180}-\u{111c4}\u{111da}\u{11200}-\u{11211}\u{11213}-" +
+ "\u{11237}\u{112b0}-\u{112ea}\u{11301}-\u{11303}\u{11305}-\u{1130c}\u{1130f}\u{11310}\u{11313}-" +
+ "\u{11328}\u{1132a}-\u{11330}\u{11332}\u{11333}\u{11335}-\u{11339}\u{1133c}-\u{11344}\u{11347}" +
+ "\u{11348}\u{1134b}-\u{1134d}\u{11357}\u{1135d}-\u{11363}\u{11366}-\u{1136c}\u{11370}-\u{11374}" +
+ "\u{11480}-\u{114c5}\u{114c7}\u{11580}-\u{115b5}\u{115b8}-\u{115c0}\u{11600}-\u{11640}\u{11644}" +
+ "\u{11680}-\u{116b7}\u{118a0}-\u{118df}\u{118ff}\u{11ac0}-\u{11af8}\u{1236f}-\u{12398}\u{16a40}-" +
+ "\u{16a5e}\u{16ad0}-\u{16aed}\u{16af0}-\u{16af4}\u{16b00}-\u{16b36}\u{16b40}-\u{16b43}\u{16b63}-" +
+ "\u{16b77}\u{16b7d}-\u{16b8f}\u{16f00}-\u{16f44}\u{16f50}-\u{16f7e}\u{16f8f}-\u{16f9f}\u{1bc00}-" +
+ "\u{1bc6a}\u{1bc70}-\u{1bc7c}\u{1bc80}-\u{1bc88}\u{1bc90}-\u{1bc99}\u{1bc9d}\u{1bc9e}\u{1e800}-" +
+ "\u{1e8c4}\u{1e8d0}-\u{1e8d6}\u{1ee00}-\u{1ee03}\u{1ee05}-\u{1ee1f}\u{1ee21}\u{1ee22}\u{1ee24}" +
+ "\u{1ee27}\u{1ee29}-\u{1ee32}\u{1ee34}-\u{1ee37}\u{1ee39}\u{1ee3b}\u{1ee42}\u{1ee47}\u{1ee49}" +
+ "\u{1ee4b}\u{1ee4d}-\u{1ee4f}\u{1ee51}\u{1ee52}\u{1ee54}\u{1ee57}\u{1ee59}\u{1ee5b}\u{1ee5d}\u{1ee5f}" +
+ "\u{1ee61}\u{1ee62}\u{1ee64}\u{1ee67}-\u{1ee6a}\u{1ee6c}-\u{1ee72}\u{1ee74}-\u{1ee77}\u{1ee79}-" +
+ "\u{1ee7c}\u{1ee7e}\u{1ee80}-\u{1ee89}\u{1ee8b}-\u{1ee9b}\u{1eea1}-\u{1eea3}\u{1eea5}-\u{1eea9}" +
+ "\u{1eeab}-\u{1eebb}"
+
+ # Generated from unicode_regex/unicode_regex_groups.scala, more inclusive than Ruby's \p{Nd}
+ HASHTAG_NUMERALS = "\\p{Nd}" +
+ "\u0de6-\u0def\ua9f0-\ua9f9\u{110f0}-\u{110f9}\u{11136}-\u{1113f}\u{111d0}-\u{111d9}\u{112f0}-" +
+ "\u{112f9}\u{114d0}-\u{114d9}\u{11650}-\u{11659}\u{116c0}-\u{116c9}\u{118e0}-\u{118e9}\u{16a60}-" +
+ "\u{16a69}\u{16b50}-\u{16b59}"
+
+ HASHTAG_SPECIAL_CHARS = "_\u200c\u200d\ua67e\u05be\u05f3\u05f4\uff5e\u301c\u309b\u309c\u30a0\u30fb\u3003\u0f0b\u0f0c\u00b7"
+
+ HASHTAG_LETTERS_NUMERALS = "#{HASHTAG_LETTERS_AND_MARKS}#{HASHTAG_NUMERALS}#{HASHTAG_SPECIAL_CHARS}"
+ HASHTAG_LETTERS_NUMERALS_SET = "[#{HASHTAG_LETTERS_NUMERALS}]"
+ HASHTAG_LETTERS_SET = "[#{HASHTAG_LETTERS_AND_MARKS}]"
+
+ HASHTAG = /(\A|\ufe0e|\ufe0f|[^&#{HASHTAG_LETTERS_NUMERALS}])(#|#)(?!\ufe0f|\u20e3)(#{HASHTAG_LETTERS_NUMERALS_SET}*#{HASHTAG_LETTERS_SET}#{HASHTAG_LETTERS_NUMERALS_SET}*)/io
+
+ REGEXEN[:valid_hashtag] = /#{HASHTAG}/io
+ # Used in Extractor for final filtering
+ REGEXEN[:end_hashtag_match] = /\A(?:[##]|:\/\/)/o
+
+ REGEXEN[:valid_mention_preceding_chars] = /(?:[^a-z0-9_!#\$%&*@@]|^|(?:^|[^a-z0-9_+~.-])[rR][tT]:?)/io
+ REGEXEN[:at_signs] = /[@@]/
+ REGEXEN[:valid_mention_or_list] = /
+ (#{REGEXEN[:valid_mention_preceding_chars]}) # $1: Preceeding character
+ (#{REGEXEN[:at_signs]}) # $2: At mark
+ ([a-z0-9_]{1,20}) # $3: Screen name
+ (\/[a-z][a-zA-Z0-9_\-]{0,24})? # $4: List (optional)
+ /iox
+ REGEXEN[:valid_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-z0-9_]{1,20})/io
+ # Used in Extractor for final filtering
+ REGEXEN[:end_mention_match] = /\A(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/io
+
+ # URL related hash regex collection
+ REGEXEN[:valid_url_preceding_chars] = /(?:[^A-Z0-9@@$###{INVALID_CHARACTERS.join('')}]|^)/io
+ REGEXEN[:invalid_url_without_protocol_preceding_chars] = /[-_.\/]$/
+ DOMAIN_VALID_CHARS = "[^#{PUNCTUATION_CHARS}#{SPACE_CHARS}#{CTRL_CHARS}#{INVALID_CHARACTERS.join('')}#{UNICODE_SPACES.join('')}]"
+ REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
+ REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
+
+ REGEXEN[:valid_gTLD] = %r{
+ (?:
+ (?:#{TLDS['generic'].join('|')})
+ (?=[^0-9a-z@]|$)
+ )
+ }ix
 
- REGEXEN[:valid_special_cctld] = %r{
- (?:
- (?:co|tv)
- (?=[^0-9a-z@]|$)
- )
- }ix
-
- REGEXEN[:valid_domain] = /(?:
- #{REGEXEN[:valid_subdomain]}*#{REGEXEN[:valid_domain_name]}
- (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
- )/iox
-
- # This is used in Extractor
- REGEXEN[:valid_ascii_domain] = /
- (?:(?:[a-z0-9\-_]|#{REGEXEN[:latin_accents]})+\.)+
- (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
- /iox
-
- # This is used in Extractor for stricter t.co URL extraction
- REGEXEN[:valid_tco_url] = /^https?:\/\/t\.co\/([a-z0-9]+)/i
-
- # This is used in Extractor to filter out unwanted URLs.
- REGEXEN[:invalid_short_domain] = /\A#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_ccTLD]}\Z/io
- REGEXEN[:valid_special_short_domain] = /\A#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_special_cctld]}\Z/io
-
- REGEXEN[:valid_port_number] = /[0-9]+/
-
- REGEXEN[:valid_general_url_path_chars] = /[a-z\p{Cyrillic}0-9!\*';:=\+\,\.\$\/%#\[\]\p{Pd}_~&\|@#{LATIN_ACCENTS}]/io
- # Allow URL paths to contain up to two nested levels of balanced parens
- # 1. Used in Wikipedia URLs like /Primer_(film)
- # 2. Used in IIS sessions like /S(dfd346)/
- # 3. Used in Rdio URLs like /track/We_Up_(Album_Version_(Edited))/
- REGEXEN[:valid_url_balanced_parens] = /
- \(
+ REGEXEN[:valid_ccTLD] = %r{
  (?:
- #{REGEXEN[:valid_general_url_path_chars]}+
- |
- # allow one nested level of balanced parentheses
+ (?:#{TLDS['country'].join('|')})
+ (?=[^0-9a-z@]|$)
+ )
+ }ix
+ REGEXEN[:valid_punycode] = /(?:xn--[0-9a-z]+)/i
+
+ REGEXEN[:valid_special_cctld] = %r{
+ (?:
+ (?:co|tv)
+ (?=[^0-9a-z@]|$)
+ )
+ }ix
+
+ REGEXEN[:valid_domain] = /(?:
+ #{REGEXEN[:valid_subdomain]}*#{REGEXEN[:valid_domain_name]}
+ (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
+ )/iox
+
+ # This is used in Extractor
+ REGEXEN[:valid_ascii_domain] = /
+ (?:(?:[a-z0-9\-_]|#{REGEXEN[:latin_accents]})+\.)+
+ (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
+ /iox
+
+ # This is used in Extractor for stricter t.co URL extraction
+ REGEXEN[:valid_tco_url] = /^https?:\/\/t\.co\/([a-z0-9]+)/i
+
+ # This is used in Extractor to filter out unwanted URLs.
+ REGEXEN[:invalid_short_domain] = /\A#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_ccTLD]}\Z/io
+ REGEXEN[:valid_special_short_domain] = /\A#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_special_cctld]}\Z/io
+
+ REGEXEN[:valid_port_number] = /[0-9]+/
+
+ REGEXEN[:valid_general_url_path_chars] = /[a-z\p{Cyrillic}0-9!\*';:=\+\,\.\$\/%#\[\]\p{Pd}_~&\|@#{LATIN_ACCENTS}]/io
+ # Allow URL paths to contain up to two nested levels of balanced parens
+ # 1. Used in Wikipedia URLs like /Primer_(film)
+ # 2. Used in IIS sessions like /S(dfd346)/
+ # 3. Used in Rdio URLs like /track/We_Up_(Album_Version_(Edited))/
+ REGEXEN[:valid_url_balanced_parens] = /
+ \(
  (?:
- #{REGEXEN[:valid_general_url_path_chars]}*
- \(
- #{REGEXEN[:valid_general_url_path_chars]}+
- \)
- #{REGEXEN[:valid_general_url_path_chars]}*
+ #{REGEXEN[:valid_general_url_path_chars]}+
+ |
+ # allow one nested level of balanced parentheses
+ (?:
+ #{REGEXEN[:valid_general_url_path_chars]}*
+ \(
+ #{REGEXEN[:valid_general_url_path_chars]}+
+ \)
+ #{REGEXEN[:valid_general_url_path_chars]}*
+ )
  )
- )
- \)
- /iox
- # Valid end-of-path chracters (so /foo. does not gobble the period).
- # 1. Allow =&# for empty URL parameters and other URL-join artifacts
- REGEXEN[:valid_url_path_ending_chars] = /[a-z\p{Cyrillic}0-9=_#\/\+\-#{LATIN_ACCENTS}]|(?:#{REGEXEN[:valid_url_balanced_parens]})/io
- REGEXEN[:valid_url_path] = /(?:
- (?:
- #{REGEXEN[:valid_general_url_path_chars]}*
- (?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)*
- #{REGEXEN[:valid_url_path_ending_chars]}
- )|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/)
- )/iox
-
- REGEXEN[:valid_url_query_chars] = /[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]/i
- REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/\-]/i
- REGEXEN[:valid_url] = %r{
+ \)
+ /iox
+ # Valid end-of-path chracters (so /foo. does not gobble the period).
+ # 1. Allow =&# for empty URL parameters and other URL-join artifacts
+ REGEXEN[:valid_url_path_ending_chars] = /[a-z\p{Cyrillic}0-9=_#\/\+\-#{LATIN_ACCENTS}]|(?:#{REGEXEN[:valid_url_balanced_parens]})/io
+ REGEXEN[:valid_url_path] = /(?:
+ (?:
+ #{REGEXEN[:valid_general_url_path_chars]}*
+ (?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)*
+ #{REGEXEN[:valid_url_path_ending_chars]}
+ )|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/)
+ )/iox
+
+ REGEXEN[:valid_url_query_chars] = /[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]/i
+ REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/\-]/i
+ REGEXEN[:valid_url] = %r{
  ( # $1 total match
  (#{REGEXEN[:valid_url_preceding_chars]}) # $2 Preceeding chracter
  ( # $3 URL
@@ -254,114 +255,115 @@ module Twitter
  (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # $8 Query String
  )
  )
- }iox
-
- REGEXEN[:cashtag] = /[a-z]{1,6}(?:[._][a-z]{1,2})?/i
- REGEXEN[:valid_cashtag] = /(^|#{REGEXEN[:spaces]})(\$)(#{REGEXEN[:cashtag]})(?=$|\s|[#{PUNCTUATION_CHARS}])/i
-
- # These URL validation pattern strings are based on the ABNF from RFC 3986
- REGEXEN[:validate_url_unreserved] = /[a-z\p{Cyrillic}0-9\p{Pd}._~]/i
- REGEXEN[:validate_url_pct_encoded] = /(?:%[0-9a-f]{2})/i
- REGEXEN[:validate_url_sub_delims] = /[!$&'()*+,;=]/i
- REGEXEN[:validate_url_pchar] = /(?:
- #{REGEXEN[:validate_url_unreserved]}|
- #{REGEXEN[:validate_url_pct_encoded]}|
- #{REGEXEN[:validate_url_sub_delims]}|
- [:\|@]
- )/iox
-
- REGEXEN[:validate_url_scheme] = /(?:[a-z][a-z0-9+\-.]*)/i
- REGEXEN[:validate_url_userinfo] = /(?:
- #{REGEXEN[:validate_url_unreserved]}|
- #{REGEXEN[:validate_url_pct_encoded]}|
- #{REGEXEN[:validate_url_sub_delims]}|
- :
- )*/iox
-
- REGEXEN[:validate_url_dec_octet] = /(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))/i
- REGEXEN[:validate_url_ipv4] =
- /(?:#{REGEXEN[:validate_url_dec_octet]}(?:\.#{REGEXEN[:validate_url_dec_octet]}){3})/iox
-
- # Punting on real IPv6 validation for now
- REGEXEN[:validate_url_ipv6] = /(?:\[[a-f0-9:\.]+\])/i
-
- # Also punting on IPvFuture for now
- REGEXEN[:validate_url_ip] = /(?:
- #{REGEXEN[:validate_url_ipv4]}|
- #{REGEXEN[:validate_url_ipv6]}
- )/iox
-
- # This is more strict than the rfc specifies
- REGEXEN[:validate_url_subdomain_segment] = /(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)/i
- REGEXEN[:validate_url_domain_segment] = /(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)/i
- REGEXEN[:validate_url_domain_tld] = /(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)/i
- REGEXEN[:validate_url_domain] = /(?:(?:#{REGEXEN[:validate_url_subdomain_segment]}\.)*
- (?:#{REGEXEN[:validate_url_domain_segment]}\.)
- #{REGEXEN[:validate_url_domain_tld]})/iox
-
- REGEXEN[:validate_url_host] = /(?:
- #{REGEXEN[:validate_url_ip]}|
- #{REGEXEN[:validate_url_domain]}
- )/iox
-
- # Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences
- REGEXEN[:validate_url_unicode_subdomain_segment] =
- /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9_\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
- REGEXEN[:validate_url_unicode_domain_segment] =
- /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
- REGEXEN[:validate_url_unicode_domain_tld] =
- /(?:(?:[a-z]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
- REGEXEN[:validate_url_unicode_domain] = /(?:(?:#{REGEXEN[:validate_url_unicode_subdomain_segment]}\.)*
- (?:#{REGEXEN[:validate_url_unicode_domain_segment]}\.)
- #{REGEXEN[:validate_url_unicode_domain_tld]})/iox
-
- REGEXEN[:validate_url_unicode_host] = /(?:
- #{REGEXEN[:validate_url_ip]}|
- #{REGEXEN[:validate_url_unicode_domain]}
- )/iox
-
- REGEXEN[:validate_url_port] = /[0-9]{1,5}/
-
- REGEXEN[:validate_url_unicode_authority] = %r{
- (?:(#{REGEXEN[:validate_url_userinfo]})@)? # $1 userinfo
- (#{REGEXEN[:validate_url_unicode_host]}) # $2 host
- (?::(#{REGEXEN[:validate_url_port]}))? # $3 port
- }iox
-
- REGEXEN[:validate_url_authority] = %r{
- (?:(#{REGEXEN[:validate_url_userinfo]})@)? # $1 userinfo
- (#{REGEXEN[:validate_url_host]}) # $2 host
- (?::(#{REGEXEN[:validate_url_port]}))? # $3 port
- }iox
-
- REGEXEN[:validate_url_path] = %r{(/#{REGEXEN[:validate_url_pchar]}*)*}i
- REGEXEN[:validate_url_query] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
- REGEXEN[:validate_url_fragment] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
-
- # Modified version of RFC 3986 Appendix B
- REGEXEN[:validate_url_unencoded] = %r{
- \A # Full URL
- (?:
- ([^:/?#]+):// # $1 Scheme
- )?
- ([^/?#]*) # $2 Authority
- ([^?#]*) # $3 Path
- (?:
- \?([^#]*) # $4 Query
- )?
- (?:
- \#(.*) # $5 Fragment
- )?\Z
- }ix
-
- REGEXEN[:rtl_chars] = /[#{RTL_CHARACTERS}]/io
-
- REGEXEN.each_pair{|k,v| v.freeze }
-
- # Return the regular expression for a given <tt>key</tt>. If the <tt>key</tt>
- # is not a known symbol a <tt>nil</tt> will be returned.
- def self.[](key)
- REGEXEN[key]
+ }iox
+
+ REGEXEN[:cashtag] = /[a-z]{1,6}(?:[._][a-z]{1,2})?/i
+ REGEXEN[:valid_cashtag] = /(^|#{REGEXEN[:spaces]})(\$)(#{REGEXEN[:cashtag]})(?=$|\s|[#{PUNCTUATION_CHARS}])/i
+
+ # These URL validation pattern strings are based on the ABNF from RFC 3986
+ REGEXEN[:validate_url_unreserved] = /[a-z\p{Cyrillic}0-9\p{Pd}._~]/i
+ REGEXEN[:validate_url_pct_encoded] = /(?:%[0-9a-f]{2})/i
+ REGEXEN[:validate_url_sub_delims] = /[!$&'()*+,;=]/i
+ REGEXEN[:validate_url_pchar] = /(?:
+ #{REGEXEN[:validate_url_unreserved]}|
+ #{REGEXEN[:validate_url_pct_encoded]}|
+ #{REGEXEN[:validate_url_sub_delims]}|
+ [:\|@]
+ )/iox
+
+ REGEXEN[:validate_url_scheme] = /(?:[a-z][a-z0-9+\-.]*)/i
+ REGEXEN[:validate_url_userinfo] = /(?:
+ #{REGEXEN[:validate_url_unreserved]}|
+ #{REGEXEN[:validate_url_pct_encoded]}|
+ #{REGEXEN[:validate_url_sub_delims]}|
+ :
+ )*/iox
+
+ REGEXEN[:validate_url_dec_octet] = /(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))/i
+ REGEXEN[:validate_url_ipv4] =
+ /(?:#{REGEXEN[:validate_url_dec_octet]}(?:\.#{REGEXEN[:validate_url_dec_octet]}){3})/iox
+
+ # Punting on real IPv6 validation for now
+ REGEXEN[:validate_url_ipv6] = /(?:\[[a-f0-9:\.]+\])/i
+
+ # Also punting on IPvFuture for now
+ REGEXEN[:validate_url_ip] = /(?:
+ #{REGEXEN[:validate_url_ipv4]}|
+ #{REGEXEN[:validate_url_ipv6]}
+ )/iox
+
+ # This is more strict than the rfc specifies
+ REGEXEN[:validate_url_subdomain_segment] = /(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)/i
+ REGEXEN[:validate_url_domain_segment] = /(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)/i
+ REGEXEN[:validate_url_domain_tld] = /(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)/i
+ REGEXEN[:validate_url_domain] = /(?:(?:#{REGEXEN[:validate_url_subdomain_segment]}\.)*
+ (?:#{REGEXEN[:validate_url_domain_segment]}\.)
+ #{REGEXEN[:validate_url_domain_tld]})/iox
+
+ REGEXEN[:validate_url_host] = /(?:
+ #{REGEXEN[:validate_url_ip]}|
+ #{REGEXEN[:validate_url_domain]}
+ )/iox
+
+ # Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences
+ REGEXEN[:validate_url_unicode_subdomain_segment] =
+ /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9_\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
+ REGEXEN[:validate_url_unicode_domain_segment] =
+ /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
+ REGEXEN[:validate_url_unicode_domain_tld] =
+ /(?:(?:[a-z]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix
+ REGEXEN[:validate_url_unicode_domain] = /(?:(?:#{REGEXEN[:validate_url_unicode_subdomain_segment]}\.)*
+ (?:#{REGEXEN[:validate_url_unicode_domain_segment]}\.)
+ #{REGEXEN[:validate_url_unicode_domain_tld]})/iox
+
+ REGEXEN[:validate_url_unicode_host] = /(?:
+ #{REGEXEN[:validate_url_ip]}|
+ #{REGEXEN[:validate_url_unicode_domain]}
+ )/iox
+
+ REGEXEN[:validate_url_port] = /[0-9]{1,5}/
+
+ REGEXEN[:validate_url_unicode_authority] = %r{
+ (?:(#{REGEXEN[:validate_url_userinfo]})@)? # $1 userinfo
+ (#{REGEXEN[:validate_url_unicode_host]}) # $2 host
+ (?::(#{REGEXEN[:validate_url_port]}))? # $3 port
+ }iox
+
+ REGEXEN[:validate_url_authority] = %r{
+ (?:(#{REGEXEN[:validate_url_userinfo]})@)? # $1 userinfo
+ (#{REGEXEN[:validate_url_host]}) # $2 host
+ (?::(#{REGEXEN[:validate_url_port]}))? # $3 port
+ }iox
+
+ REGEXEN[:validate_url_path] = %r{(/#{REGEXEN[:validate_url_pchar]}*)*}i
+ REGEXEN[:validate_url_query] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
+ REGEXEN[:validate_url_fragment] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
+
+ # Modified version of RFC 3986 Appendix B
+ REGEXEN[:validate_url_unencoded] = %r{
+ \A # Full URL
+ (?:
+ ([^:/?#]+):// # $1 Scheme
+ )?
+ ([^/?#]*) # $2 Authority
+ ([^?#]*) # $3 Path
+ (?:
+ \?([^#]*) # $4 Query
+ )?
+ (?:
+ \#(.*) # $5 Fragment
+ )?\Z
+ }ix
+
+ REGEXEN[:rtl_chars] = /[#{RTL_CHARACTERS}]/io
+
+ REGEXEN.each_pair{|k,v| v.freeze }
+
+ # Return the regular expression for a given <tt>key</tt>. If the <tt>key</tt>
+ # is not a known symbol a <tt>nil</tt> will be returned.
+ def self.[](key)
+ REGEXEN[key]
+ end
  end
  end
  end
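
Regex follows the same pattern: apart from the wrapping module, the class and its frozen REGEXEN table are unchanged, and lookups still go through Regex.[]. A hedged sketch of how the relocated patterns are reached (the sample text is illustrative; only the class accessor shown in the diff is relied on):

    require 'twitter-text'

    # Regex.[] returns the frozen regular expression registered for a known
    # key, or nil for an unknown key, exactly as documented in the class.
    hashtag_re = Twitter::TwitterText::Regex[:valid_hashtag]
    "Try the #ruby gem" =~ hashtag_re      # => match offset, or nil if no hashtag
    Twitter::TwitterText::Regex[:no_such_key]  # => nil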