twitter-text 1.4.12 → 1.4.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/extractor.rb CHANGED
@@ -68,8 +68,9 @@ module Twitter
68
68
  return [] unless text
69
69
 
70
70
  possible_screen_names = []
71
- text.to_s.scan(Twitter::Regex[:extract_mentions]) do |before, sn, after|
71
+ text.to_s.scan(Twitter::Regex[:extract_mentions]) do |before, sn|
72
72
  extract_mentions_match_data = $~
73
+ after = $'
73
74
  unless after =~ Twitter::Regex[:end_screen_name_match]
74
75
  start_position = extract_mentions_match_data.char_begin(2) - 1
75
76
  end_position = extract_mentions_match_data.char_end(2)
@@ -99,8 +100,9 @@ module Twitter
99
100
  return [] unless text
100
101
 
101
102
  possible_entries = []
102
- text.to_s.scan(Twitter::Regex[:extract_mentions_or_lists]) do |before, sn, list_slug, after|
103
+ text.to_s.scan(Twitter::Regex[:extract_mentions_or_lists]) do |before, sn, list_slug|
103
104
  extract_mentions_match_data = $~
105
+ after = $'
104
106
  unless after =~ Twitter::Regex[:end_screen_name_match]
105
107
  start_position = extract_mentions_match_data.char_begin(2) - 1
106
108
  end_position = extract_mentions_match_data.char_end(list_slug.nil? ? 2 : 3)
@@ -130,6 +132,7 @@ module Twitter
130
132
 
131
133
  possible_screen_name = text.match(Twitter::Regex[:extract_reply])
132
134
  return unless possible_screen_name.respond_to?(:captures)
135
+ return if $' =~ Twitter::Regex[:end_screen_name_match]
133
136
  screen_name = possible_screen_name.captures.first
134
137
  yield screen_name if block_given?
135
138
  screen_name
@@ -161,18 +164,32 @@ module Twitter
161
164
  start_position = valid_url_match_data.char_begin(3)
162
165
  end_position = valid_url_match_data.char_end(3)
163
166
 
164
- # If protocol is missing, check against valid_ascii_domain
167
+ # If protocol is missing and domain contains non-ASCII characters,
168
+ # extract ASCII-only domains.
165
169
  if !protocol
166
- next unless domain =~ Twitter::Regex[:valid_ascii_domain]
167
- if $~.char_begin(0)
168
- start_position += $~.char_begin(0)
169
- url.sub!(domain, $~.to_s())
170
+ last_url = nil
171
+ last_url_invalid_match = nil
172
+ domain.scan(Twitter::Regex[:valid_ascii_domain]) do |ascii_domain|
173
+ last_url = {
174
+ :url => ascii_domain,
175
+ :indices => [start_position + $~.char_begin(0),
176
+ start_position + $~.char_end(0)]
177
+ }
178
+ last_url_invalid_match = ascii_domain =~ Twitter::Regex[:invalid_short_domain]
179
+ urls << last_url unless last_url_invalid_match
170
180
  end
171
- end
172
181
 
173
- # Regex in Ruby 1.8 doesn't support lookbehind, so we need to manually filter out
174
- # the short URLs without protocol and path, i.e., [domain].[ccTLD]
175
- unless !protocol && !path && domain =~ Twitter::Regex[:valid_short_domain]
182
+ # no ASCII-only domain found. Skip the entire URL
183
+ next unless last_url
184
+
185
+ # last_url only contains domain. Need to add path and query if they exist.
186
+ if path
187
+ # last_url was not added. Add it to urls here.
188
+ urls << last_url if last_url_invalid_match
189
+ last_url[:url] = url.sub(domain, last_url[:url])
190
+ last_url[:indices][1] = end_position
191
+ end
192
+ else
176
193
  urls << {
177
194
  :url => url,
178
195
  :indices => [start_position, end_position]
@@ -208,10 +225,13 @@ module Twitter
208
225
  text.scan(Twitter::Regex[:auto_link_hashtags]) do |before, hash, hash_text|
209
226
  start_position = $~.char_begin(2)
210
227
  end_position = $~.char_end(3)
211
- tags << {
212
- :hashtag => hash_text,
213
- :indices => [start_position, end_position]
214
- }
228
+ after = $'
229
+ unless after =~ Twitter::Regex[:end_hashtag_match]
230
+ tags << {
231
+ :hashtag => hash_text,
232
+ :indices => [start_position, end_position]
233
+ }
234
+ end
215
235
  end
216
236
  tags.each{|tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last } if block_given?
217
237
  tags
data/lib/regex.rb CHANGED
@@ -50,11 +50,6 @@ module Twitter
50
50
  ].map{|cp| [cp].pack('U') }.freeze
51
51
  REGEXEN[:invalid_control_characters] = /[#{INVALID_CHARACTERS.join('')}]/o
52
52
 
53
- REGEXEN[:at_signs] = /[@＠]/
54
- REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(?=(.|$))/o
55
- REGEXEN[:extract_mentions_or_lists] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?(?=(.|$))/o
56
- REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
57
-
58
53
  major, minor, patch = RUBY_VERSION.split('.')
59
54
  if major.to_i >= 2 || major.to_i == 1 && minor.to_i >= 9 || (defined?(RUBY_ENGINE) && ["jruby", "rbx"].include?(RUBY_ENGINE))
60
55
  REGEXEN[:list_name] = /[a-zA-Z][a-zA-Z0-9_\-\u0080-\u00ff]{0,24}/
@@ -89,8 +84,6 @@ module Twitter
89
84
  ].join('').freeze
90
85
  REGEXEN[:latin_accents] = /[#{LATIN_ACCENTS}]+/o
91
86
 
92
- REGEXEN[:end_screen_name_match] = /^(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o
93
-
94
87
  CJ_HASHTAG_CHARACTERS = [
95
88
  regex_range(0x30A1, 0x30FA), regex_range(0x30FC, 0x30FE), # Katakana (full-width)
96
89
  regex_range(0xFF66, 0xFF9F), # Katakana (half-width)
@@ -104,27 +97,35 @@ module Twitter
104
97
  regex_range(0x2F800, 0x2FA1F), regex_range(0x3005), regex_range(0x303B) # Kanji (CJK supplement)
105
98
  ].join('').freeze
106
99
 
107
- HASHTAG_BOUNDARY = /(?:\A|\z|#{REGEXEN[:spaces]}|[「」。、.,!！?？:;"'])/o
108
-
109
100
  # A hashtag must contain latin characters, numbers and underscores, but not all numbers.
110
101
  HASHTAG_ALPHA = /[a-z_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/io
111
102
  HASHTAG_ALPHANUMERIC = /[a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/io
103
+ HASHTAG_BOUNDARY = /\A|\z|[^&\/a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/o
112
104
 
113
105
  HASHTAG = /(#{HASHTAG_BOUNDARY})(#|＃)(#{HASHTAG_ALPHANUMERIC}*#{HASHTAG_ALPHA}#{HASHTAG_ALPHANUMERIC}*)/io
114
106
 
115
107
  REGEXEN[:auto_link_hashtags] = /#{HASHTAG}/io
108
+ # Used in Extractor and Rewriter for final filtering
109
+ REGEXEN[:end_hashtag_match] = /^(?:[#＃]|:\/\/)/o
110
+
111
+ REGEXEN[:at_signs] = /[@＠]/
112
+ REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
113
+ REGEXEN[:extract_mentions_or_lists] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?/o
114
+ REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
115
+ # Used in Extractor and Rewriter for final filtering
116
+ REGEXEN[:end_screen_name_match] = /^(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o
116
117
 
117
118
  REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_]|^|RT:?)([@＠]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?/o
118
119
  REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\&lt;\|:~\(|\}:o\{|:\-\[|\&gt;o\&lt;|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/
119
120
 
120
121
  # URL related hash regex collection
121
- REGEXEN[:valid_preceding_chars] = /(?:[^-\/"'!=A-Z0-9_@＠\.#{INVALID_CHARACTERS.join('')}]|^)/io
122
+ REGEXEN[:valid_preceding_chars] = /(?:[^-\/"'!=A-Z0-9_@＠#＃\.#{INVALID_CHARACTERS.join('')}]|^)/io
122
123
 
123
124
  DOMAIN_VALID_CHARS = "[^[:punct:][:space:][:blank:][:cntrl:]#{INVALID_CHARACTERS.join('')}#{UNICODE_SPACES.join('')}]"
124
125
  REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
125
126
  REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
126
127
 
127
- REGEXEN[:valid_gTLD] = /(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel)(?=[^[:alpha:]]|$))/i
128
+ REGEXEN[:valid_gTLD] = /(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel)(?=[^a-z]|$))/i
128
129
  REGEXEN[:valid_ccTLD] = %r{
129
130
  (?:
130
131
  (?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|
@@ -133,7 +134,7 @@ module Twitter
133
134
  lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|
134
135
  pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf|tg|th|
135
136
  tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)
136
- (?=[^[:alpha:]]|$)
137
+ (?=[^a-z]|$)
137
138
  )
138
139
  }ix
139
140
  REGEXEN[:valid_punycode] = /(?:xn--[0-9a-z]+)/
@@ -145,12 +146,12 @@ module Twitter
145
146
 
146
147
  # This is used in Extractor
147
148
  REGEXEN[:valid_ascii_domain] = /
148
- (?:(?:[[:alnum:]\-_]|#{REGEXEN[:latin_accents]})+\.)+
149
+ (?:(?:[A-Za-z0-9\-_]|#{REGEXEN[:latin_accents]})+\.)+
149
150
  (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
150
151
  /iox
151
152
 
152
153
  # This is used in Extractor to filter out unwanted URLs.
153
- REGEXEN[:valid_short_domain] = /^#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_ccTLD]}$/io
154
+ REGEXEN[:invalid_short_domain] = /^#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_ccTLD]}$/io
154
155
 
155
156
  REGEXEN[:valid_port_number] = /[0-9]+/
156
157
 
@@ -171,7 +172,7 @@ module Twitter
171
172
  )|(?:@#{REGEXEN[:valid_general_url_path_chars]}+\/)
172
173
  )/iox
173
174
 
174
- REGEXEN[:valid_url_query_chars] = /[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|]/i
175
+ REGEXEN[:valid_url_query_chars] = /[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|]/i
175
176
  REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/]/i
176
177
  REGEXEN[:valid_url] = %r{
177
178
  ( # $1 total match
data/lib/rewriter.rb CHANGED
@@ -42,10 +42,12 @@ module Twitter
42
42
 
43
43
  def rewrite_hashtags(text)
44
44
  text.to_s.gsub(Twitter::Regex[:auto_link_hashtags]) do
45
- before = $1
46
- hash = $2
47
- hashtag = $3
48
- "#{before}#{yield(hash, hashtag)}"
45
+ before, hash, hashtag, after = $1, $2, $3, $'
46
+ if after =~ Twitter::Regex[:end_hashtag_match]
47
+ "#{before}#{hash}#{hashtag}"
48
+ else
49
+ "#{before}#{yield(hash, hashtag)}"
50
+ end
49
51
  end
50
52
  end
51
53
 
data/twitter-text.gemspec CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = "twitter-text"
5
- s.version = "1.4.12"
5
+ s.version = "1.4.13"
6
6
  s.authors = ["Matt Sanford", "Patrick Ewing", "Ben Cherry", "Britt Selvitelle",
7
7
  "Raffi Krikorian", "J.P. Cummins", "Yoshimasa Niwa", "Keita Fujii"]
8
8
  s.email = ["matt@twitter.com", "patrick.henry.ewing@gmail.com", "bcherry@gmail.com", "bs@brittspace.com",
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: twitter-text
3
3
  version: !ruby/object:Gem::Version
4
- hash: 31
4
+ hash: 29
5
5
  prerelease:
6
6
  segments:
7
7
  - 1
8
8
  - 4
9
- - 12
10
- version: 1.4.12
9
+ - 13
10
+ version: 1.4.13
11
11
  platform: ruby
12
12
  authors:
13
13
  - Matt Sanford
@@ -22,7 +22,7 @@ autorequire:
22
22
  bindir: bin
23
23
  cert_chain: []
24
24
 
25
- date: 2011-10-04 00:00:00 -07:00
25
+ date: 2011-11-02 00:00:00 -07:00
26
26
  default_executable:
27
27
  dependencies:
28
28
  - !ruby/object:Gem::Dependency