twitter-text 1.4.12 → 1.4.13

data/lib/extractor.rb CHANGED
@@ -68,8 +68,9 @@ module Twitter
       return [] unless text

       possible_screen_names = []
-      text.to_s.scan(Twitter::Regex[:extract_mentions]) do |before, sn, after|
+      text.to_s.scan(Twitter::Regex[:extract_mentions]) do |before, sn|
         extract_mentions_match_data = $~
+        after = $'
         unless after =~ Twitter::Regex[:end_screen_name_match]
           start_position = extract_mentions_match_data.char_begin(2) - 1
           end_position = extract_mentions_match_data.char_end(2)
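
Note on the hunk above: the block now receives only |before, sn| because the trailing (?=(.|$)) capture was dropped from :extract_mentions (see the regex.rb changes below); the text following the match is read from Ruby's post-match variable $' instead, and mentions followed by another @, a latin accent, or "://" are rejected by :end_screen_name_match. A minimal sketch of the $' technique, with simplified stand-in patterns rather than the gem's actual regexes:

    # Stand-ins for Twitter::Regex[:extract_mentions] and [:end_screen_name_match].
    MENTION   = /(^|[^a-zA-Z0-9_])@([a-zA-Z0-9_]{1,20})/
    END_MATCH = /^(?:[@＠]|:\/\/)/    # the real filter also rejects latin accents

    def mentions(text)
      found = []
      text.scan(MENTION) do |_before, screen_name|
        found << screen_name unless $' =~ END_MATCH   # $' = text after the current match
      end
      found
    end

    p mentions("cc @alice and @bob@example.org")   # => ["alice"]
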
@@ -99,8 +100,9 @@ module Twitter
       return [] unless text

       possible_entries = []
-      text.to_s.scan(Twitter::Regex[:extract_mentions_or_lists]) do |before, sn, list_slug, after|
+      text.to_s.scan(Twitter::Regex[:extract_mentions_or_lists]) do |before, sn, list_slug|
         extract_mentions_match_data = $~
+        after = $'
         unless after =~ Twitter::Regex[:end_screen_name_match]
           start_position = extract_mentions_match_data.char_begin(2) - 1
           end_position = extract_mentions_match_data.char_end(list_slug.nil? ? 2 : 3)
@@ -130,6 +132,7 @@ module Twitter

       possible_screen_name = text.match(Twitter::Regex[:extract_reply])
       return unless possible_screen_name.respond_to?(:captures)
+      return if $' =~ Twitter::Regex[:end_screen_name_match]
       screen_name = possible_screen_name.captures.first
       yield screen_name if block_given?
       screen_name
@@ -161,18 +164,32 @@ module Twitter
         start_position = valid_url_match_data.char_begin(3)
         end_position = valid_url_match_data.char_end(3)

-        # If protocol is missing, check against valid_ascii_domain
+        # If protocol is missing and domain contains non-ASCII characters,
+        # extract ASCII-only domains.
         if !protocol
-          next unless domain =~ Twitter::Regex[:valid_ascii_domain]
-          if $~.char_begin(0)
-            start_position += $~.char_begin(0)
-            url.sub!(domain, $~.to_s())
+          last_url = nil
+          last_url_invalid_match = nil
+          domain.scan(Twitter::Regex[:valid_ascii_domain]) do |ascii_domain|
+            last_url = {
+              :url => ascii_domain,
+              :indices => [start_position + $~.char_begin(0),
+                           start_position + $~.char_end(0)]
+            }
+            last_url_invalid_match = ascii_domain =~ Twitter::Regex[:invalid_short_domain]
+            urls << last_url unless last_url_invalid_match
           end
-        end

-        # Regex in Ruby 1.8 doesn't support lookbehind, so we need to manually filter out
-        # the short URLs without protocol and path, i.e., [domain].[ccTLD]
-        unless !protocol && !path && domain =~ Twitter::Regex[:valid_short_domain]
+          # no ASCII-only domain found. Skip the entire URL
+          next unless last_url
+
+          # last_url only contains domain. Need to add path and query if they exist.
+          if path
+            # last_url was not added. Add it to urls here.
+            urls << last_url if last_url_invalid_match
+            last_url[:url] = url.sub(domain, last_url[:url])
+            last_url[:indices][1] = end_position
+          end
+        else
           urls << {
             :url => url,
             :indices => [start_position, end_position]
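
Note on the hunk above: previously the whole protocol-less domain was tested once against :valid_ascii_domain; now it is scanned for every ASCII-only domain it contains, indices are taken from $~.char_begin(0)/char_end(0) (the gem's multibyte-aware offsets), bare candidates matching the renamed :invalid_short_domain are skipped, and any path/query is re-attached to the last candidate found. A rough sketch of the scan-with-offsets idea, using a stand-in pattern and plain MatchData#begin/#end (character offsets on Ruby 1.9+):

    ASCII_DOMAIN = /(?:[A-Za-z0-9\-_]+\.)+(?:com|net|org)/   # stand-in for :valid_ascii_domain

    domain = "テスト.twitter.com"    # mixed Unicode + ASCII, no protocol
    candidates = []
    domain.scan(ASCII_DOMAIN) do |ascii_domain|
      candidates << { :url => ascii_domain, :indices => [$~.begin(0), $~.end(0)] }
    end
    p candidates   # => [{:url=>"twitter.com", :indices=>[4, 15]}]
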
@@ -208,10 +225,13 @@ module Twitter
       text.scan(Twitter::Regex[:auto_link_hashtags]) do |before, hash, hash_text|
         start_position = $~.char_begin(2)
         end_position = $~.char_end(3)
-        tags << {
-          :hashtag => hash_text,
-          :indices => [start_position, end_position]
-        }
+        after = $'
+        unless after =~ Twitter::Regex[:end_hashtag_match]
+          tags << {
+            :hashtag => hash_text,
+            :indices => [start_position, end_position]
+          }
+        end
       end
       tags.each{|tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last } if block_given?
       tags
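
Note on the hunk above: hashtag candidates are now checked against the new :end_hashtag_match, so a hashtag immediately followed by another "#" or by "://" is discarded. A small sketch with simplified, ASCII-only stand-in patterns:

    HASHTAG     = /(^|[^&\/a-zA-Z0-9_])#([a-zA-Z0-9_]*[a-zA-Z_][a-zA-Z0-9_]*)/
    END_HASHTAG = /^(?:[#＃]|:\/\/)/

    def hashtags(text)
      tags = []
      text.scan(HASHTAG) do |_before, tag|
        tags << tag unless $' =~ END_HASHTAG   # rejects "#tag#tag" and "#scheme://..."
      end
      tags
    end

    p hashtags("#ruby #foo#bar #http://example.com")   # => ["ruby"]
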
data/lib/regex.rb CHANGED
@@ -50,11 +50,6 @@ module Twitter
     ].map{|cp| [cp].pack('U') }.freeze
     REGEXEN[:invalid_control_characters] = /[#{INVALID_CHARACTERS.join('')}]/o

-    REGEXEN[:at_signs] = /[@＠]/
-    REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(?=(.|$))/o
-    REGEXEN[:extract_mentions_or_lists] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?(?=(.|$))/o
-    REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
-
     major, minor, patch = RUBY_VERSION.split('.')
     if major.to_i >= 2 || major.to_i == 1 && minor.to_i >= 9 || (defined?(RUBY_ENGINE) && ["jruby", "rbx"].include?(RUBY_ENGINE))
       REGEXEN[:list_name] = /[a-zA-Z][a-zA-Z0-9_\-\u0080-\u00ff]{0,24}/
@@ -89,8 +84,6 @@ module Twitter
     ].join('').freeze
     REGEXEN[:latin_accents] = /[#{LATIN_ACCENTS}]+/o

-    REGEXEN[:end_screen_name_match] = /^(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o
-
     CJ_HASHTAG_CHARACTERS = [
       regex_range(0x30A1, 0x30FA), regex_range(0x30FC, 0x30FE), # Katakana (full-width)
       regex_range(0xFF66, 0xFF9F), # Katakana (half-width)
@@ -104,27 +97,35 @@ module Twitter
       regex_range(0x2F800, 0x2FA1F), regex_range(0x3005), regex_range(0x303B) # Kanji (CJK supplement)
     ].join('').freeze

-    HASHTAG_BOUNDARY = /(?:\A|\z|#{REGEXEN[:spaces]}|[「」。、.,!?！？:;"'])/o
-
     # A hashtag must contain latin characters, numbers and underscores, but not all numbers.
     HASHTAG_ALPHA = /[a-z_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/io
     HASHTAG_ALPHANUMERIC = /[a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/io
+    HASHTAG_BOUNDARY = /\A|\z|[^&\/a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/o

     HASHTAG = /(#{HASHTAG_BOUNDARY})(#|＃)(#{HASHTAG_ALPHANUMERIC}*#{HASHTAG_ALPHA}#{HASHTAG_ALPHANUMERIC}*)/io

     REGEXEN[:auto_link_hashtags] = /#{HASHTAG}/io
+    # Used in Extractor and Rewriter for final filtering
+    REGEXEN[:end_hashtag_match] = /^(?:[#＃]|:\/\/)/o
+
+    REGEXEN[:at_signs] = /[@＠]/
+    REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
+    REGEXEN[:extract_mentions_or_lists] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?/o
+    REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
+    # Used in Extractor and Rewriter for final filtering
+    REGEXEN[:end_screen_name_match] = /^(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o

     REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_]|^|RT:?)([@＠]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?/o
     REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\&lt;\|:~\(|\}:o\{|:\-\[|\&gt;o\&lt;|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/

     # URL related hash regex collection
-    REGEXEN[:valid_preceding_chars] = /(?:[^-\/"'!=A-Z0-9_@＠\.#{INVALID_CHARACTERS.join('')}]|^)/io
+    REGEXEN[:valid_preceding_chars] = /(?:[^-\/"'!=A-Z0-9_@＠#＃\.#{INVALID_CHARACTERS.join('')}]|^)/io

     DOMAIN_VALID_CHARS = "[^[:punct:][:space:][:blank:][:cntrl:]#{INVALID_CHARACTERS.join('')}#{UNICODE_SPACES.join('')}]"
     REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
     REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io

-    REGEXEN[:valid_gTLD] = /(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel)(?=[^[:alpha:]]|$))/i
+    REGEXEN[:valid_gTLD] = /(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel)(?=[^a-z]|$))/i
     REGEXEN[:valid_ccTLD] = %r{
       (?:
         (?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|
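
Note on the hunk above: HASHTAG_BOUNDARY is now a negated character class (anything that is not '&', '/', or a hashtag character may precede the '#') instead of the old whitelist of spaces and punctuation, and the mention helpers (:at_signs, :extract_mentions, :extract_mentions_or_lists, :extract_reply, :end_screen_name_match) have moved here and lost their trailing (?=(.|$)) lookaheads now that the Extractor reads $' instead. A sketch of what the boundary change means, with simplified ASCII-only classes:

    BOUNDARY = /\A|[^&\/a-zA-Z0-9_]/   # stand-in for the new HASHTAG_BOUNDARY
    HASHTAG  = /(#{BOUNDARY})#([a-zA-Z0-9_]*[a-zA-Z_][a-zA-Z0-9_]*)/

    p "(#ruby)".scan(HASHTAG).map(&:last)            # => ["ruby"]  ('(' now counts as a boundary)
    p "AT&T &#xa9; a/#b".scan(HASHTAG).map(&:last)   # => []        ('&' and '/' still block the '#')
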
@@ -133,7 +134,7 @@ module Twitter
         lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|
         pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf|tg|th|
         tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)
-        (?=[^[:alpha:]]|$)
+        (?=[^a-z]|$)
       )
     }ix
     REGEXEN[:valid_punycode] = /(?:xn--[0-9a-z]+)/
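
Note on this hunk and the matching :valid_gTLD change above: the TLD lookahead moved from [^[:alpha:]] to [^a-z] because the POSIX class matches Unicode letters, so a domain directly followed by, say, Japanese text failed the old check; with /i, [^a-z] only excludes ASCII letters. Illustration with stand-alone patterns:

    old_tld = /com(?=[^[:alpha:]]|$)/i
    new_tld = /com(?=[^a-z]|$)/i

    text = "twitter.comで検索"     # TLD immediately followed by Japanese text
    p(text =~ old_tld)            # => nil (で counts as a letter under [[:alpha:]])
    p(text =~ new_tld)            # => 8   (で is not an ASCII letter)
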
@@ -145,12 +146,12 @@ module Twitter

     # This is used in Extractor
     REGEXEN[:valid_ascii_domain] = /
-      (?:(?:[[:alnum:]\-_]|#{REGEXEN[:latin_accents]})+\.)+
+      (?:(?:[A-Za-z0-9\-_]|#{REGEXEN[:latin_accents]})+\.)+
       (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
     /iox

     # This is used in Extractor to filter out unwanted URLs.
-    REGEXEN[:valid_short_domain] = /^#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_ccTLD]}$/io
+    REGEXEN[:invalid_short_domain] = /^#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_ccTLD]}$/io

     REGEXEN[:valid_port_number] = /[0-9]+/

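
Note on the hunk above: :valid_ascii_domain swaps [[:alnum:]] for an explicit [A-Za-z0-9], since the POSIX class also matches non-ASCII letters and digits; :valid_short_domain is renamed to :invalid_short_domain with the pattern unchanged, because the extractor (see the extractor.rb hunk above) now treats a match as a reason to skip a bare, protocol-less domain.ccTLD instead of a reason to keep it. Quick illustration of the character-class point with stand-in patterns:

    old_ascii = /\A(?:[[:alnum:]\-_]+\.)+com\z/i
    new_ascii = /\A(?:[A-Za-z0-9\-_]+\.)+com\z/i

    p("日本語.com" =~ old_ascii)   # => 0   ([[:alnum:]] accepts the non-ASCII label)
    p("日本語.com" =~ new_ascii)   # => nil (ASCII-only now)
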
@@ -171,7 +172,7 @@ module Twitter
       )|(?:@#{REGEXEN[:valid_general_url_path_chars]}+\/)
     )/iox

-    REGEXEN[:valid_url_query_chars] = /[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|]/i
+    REGEXEN[:valid_url_query_chars] = /[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|]/i
     REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/]/i
     REGEXEN[:valid_url] = %r{
       (                                           #   $1 total match
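
Note on the hunk above: '?' joins the characters allowed inside a URL query, so a literal question mark in the query no longer cuts extraction short. Illustration with stand-in patterns for the query portion only:

    old_query = /\?[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|]*[a-z0-9_&=#\/]/i
    new_query = /\?[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|]*[a-z0-9_&=#\/]/i

    url = "http://example.com/search?q=what?&lang=en"
    p url[old_query]   # => "?q=what"
    p url[new_query]   # => "?q=what?&lang=en"
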
data/lib/rewriter.rb CHANGED
@@ -42,10 +42,12 @@ module Twitter

     def rewrite_hashtags(text)
       text.to_s.gsub(Twitter::Regex[:auto_link_hashtags]) do
-        before = $1
-        hash = $2
-        hashtag = $3
-        "#{before}#{yield(hash, hashtag)}"
+        before, hash, hashtag, after = $1, $2, $3, $'
+        if after =~ Twitter::Regex[:end_hashtag_match]
+          "#{before}#{hash}#{hashtag}"
+        else
+          "#{before}#{yield(hash, hashtag)}"
+        end
       end
     end

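
Note on the hunk above: Rewriter now applies the same :end_hashtag_match guard as the Extractor, but because it rewrites text in place it re-emits the original "#tag" untouched instead of yielding it to the caller. A minimal sketch of that gsub pattern, again with simplified stand-in regexes (the block stands in for whatever markup the caller builds):

    HASHTAG     = /(^|[^&\/a-zA-Z0-9_])#([a-zA-Z0-9_]*[a-zA-Z_][a-zA-Z0-9_]*)/
    END_HASHTAG = /^(?:[#＃]|:\/\/)/

    def rewrite_hashtags(text)
      text.gsub(HASHTAG) do
        before, tag, after = $1, $2, $'
        after =~ END_HASHTAG ? "#{before}##{tag}" : "#{before}#{yield(tag)}"
      end
    end

    puts rewrite_hashtags("#ruby and #http://example.com") { |tag| "<#{tag}>" }
    # prints: <ruby> and #http://example.com
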
data/twitter-text.gemspec CHANGED
@@ -2,7 +2,7 @@

 Gem::Specification.new do |s|
   s.name = "twitter-text"
-  s.version = "1.4.12"
+  s.version = "1.4.13"
   s.authors = ["Matt Sanford", "Patrick Ewing", "Ben Cherry", "Britt Selvitelle",
                "Raffi Krikorian", "J.P. Cummins", "Yoshimasa Niwa", "Keita Fujii"]
   s.email = ["matt@twitter.com", "patrick.henry.ewing@gmail.com", "bcherry@gmail.com", "bs@brittspace.com",
metadata CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: twitter-text
 version: !ruby/object:Gem::Version
-  hash: 31
+  hash: 29
   prerelease:
   segments:
   - 1
   - 4
-  - 12
-  version: 1.4.12
+  - 13
+  version: 1.4.13
 platform: ruby
 authors:
 - Matt Sanford
@@ -22,7 +22,7 @@ autorequire:
 bindir: bin
 cert_chain: []

-date: 2011-10-04 00:00:00 -07:00
+date: 2011-11-02 00:00:00 -07:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency