twitter-text 1.4.17 → 1.5.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,3 +1,5 @@
1
+ # encoding: UTF-8
2
+
1
3
  class String
2
4
  # Helper function to count the character length by first converting to an
3
5
  # array. This is needed because with unicode strings, the return value
@@ -45,16 +47,48 @@ module Twitter
45
47
  # A module for including Tweet parsing in a class. This module provides function for the extraction and processing
46
48
  # of usernames, lists, URLs and hashtags.
47
49
  module Extractor extend self
50
+ # Remove overlapping entities.
51
+ # This returns a new array with no overlapping entities.
52
+ def remove_overlapping_entities(entities)
53
+ # sort by start index
54
+ entities = entities.sort_by{|entity| entity[:indices].first}
55
+
56
+ # remove duplicates
57
+ prev = nil
58
+ entities.reject!{|entity| (prev && prev[:indices].last > entity[:indices].first) || (prev = entity) && false}
59
+ entities
60
+ end
61
+
62
+ # Extracts all usernames, lists, hashtags and URLs in the Tweet <tt>text</tt>
63
+ # along with the indices for where the entity occurred
64
+ # If the <tt>text</tt> is <tt>nil</tt> or contains no entity an empty array
65
+ # will be returned.
66
+ #
67
+ # If a block is given then it will be called for each entity.
68
+ def extract_entities_with_indices(text, options = {}, &block)
69
+ # extract all entities
70
+ entities = extract_urls_with_indices(text, options) +
71
+ extract_hashtags_with_indices(text, :check_url_overlap => false) +
72
+ extract_mentions_or_lists_with_indices(text) +
73
+ extract_cashtags_with_indices(text)
74
+
75
+ return [] if entities.empty?
76
+
77
+ entities = remove_overlapping_entities(entities)
78
+
79
+ entities.each(&block) if block_given?
80
+ entities
81
+ end
48
82
 
49
83
  # Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>. If the
50
84
  # <tt>text</tt> is <tt>nil</tt> or contains no username mentions an empty array
51
85
  # will be returned.
52
86
  #
53
87
  # If a block is given then it will be called for each username.
54
- def extract_mentioned_screen_names(text) # :yields: username
55
- screen_names_only = extract_mentioned_screen_names_with_indices(text).map{|mention| mention[:screen_name] }
56
- screen_names_only.each{|mention| yield mention } if block_given?
57
- screen_names_only
88
+ def extract_mentioned_screen_names(text, &block) # :yields: username
89
+ screen_names = extract_mentioned_screen_names_with_indices(text).map{|m| m[:screen_name]}
90
+ screen_names.each(&block) if block_given?
91
+ screen_names
58
92
  end
59
93
 
60
94
  # Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>
@@ -68,23 +102,20 @@ module Twitter
68
102
  return [] unless text
69
103
 
70
104
  possible_screen_names = []
71
- text.to_s.scan(Twitter::Regex[:extract_mentions]) do |before, sn|
72
- extract_mentions_match_data = $~
73
- after = $'
74
- unless after =~ Twitter::Regex[:end_screen_name_match]
75
- start_position = extract_mentions_match_data.char_begin(2) - 1
76
- end_position = extract_mentions_match_data.char_end(2)
77
- possible_screen_names << {
78
- :screen_name => sn,
79
- :indices => [start_position, end_position]
80
- }
81
- end
105
+ extract_mentions_or_lists_with_indices(text) do |screen_name, list_slug, start_position, end_position|
106
+ next unless list_slug.empty?
107
+ possible_screen_names << {
108
+ :screen_name => screen_name,
109
+ :indices => [start_position, end_position]
110
+ }
82
111
  end
112
+
83
113
  if block_given?
84
114
  possible_screen_names.each do |mention|
85
115
  yield mention[:screen_name], mention[:indices].first, mention[:indices].last
86
116
  end
87
117
  end
118
+
88
119
  possible_screen_names
89
120
  end
90
121
 
@@ -97,17 +128,17 @@ module Twitter
97
128
  # index, and the end index in the <tt>text</tt>. The list_slug will be an empty string
98
129
  # if this is a username mention.
99
130
  def extract_mentions_or_lists_with_indices(text) # :yields: username, list_slug, start, end
100
- return [] unless text
131
+ return [] unless text =~ /[@@]/
101
132
 
102
133
  possible_entries = []
103
- text.to_s.scan(Twitter::Regex[:extract_mentions_or_lists]) do |before, sn, list_slug|
104
- extract_mentions_match_data = $~
134
+ text.to_s.scan(Twitter::Regex[:valid_mention_or_list]) do |before, at, screen_name, list_slug|
135
+ match_data = $~
105
136
  after = $'
106
- unless after =~ Twitter::Regex[:end_screen_name_match]
107
- start_position = extract_mentions_match_data.char_begin(2) - 1
108
- end_position = extract_mentions_match_data.char_end(list_slug.nil? ? 2 : 3)
137
+ unless after =~ Twitter::Regex[:end_mention_match]
138
+ start_position = match_data.char_begin(3) - 1
139
+ end_position = match_data.char_end(list_slug.nil? ? 3 : 4)
109
140
  possible_entries << {
110
- :screen_name => sn,
141
+ :screen_name => screen_name,
111
142
  :list_slug => list_slug || "",
112
143
  :indices => [start_position, end_position]
113
144
  }
@@ -130,9 +161,9 @@ module Twitter
130
161
  def extract_reply_screen_name(text) # :yields: username
131
162
  return nil unless text
132
163
 
133
- possible_screen_name = text.match(Twitter::Regex[:extract_reply])
164
+ possible_screen_name = text.match(Twitter::Regex[:valid_reply])
134
165
  return unless possible_screen_name.respond_to?(:captures)
135
- return if $' =~ Twitter::Regex[:end_screen_name_match]
166
+ return if $' =~ Twitter::Regex[:end_mention_match]
136
167
  screen_name = possible_screen_name.captures.first
137
168
  yield screen_name if block_given?
138
169
  screen_name
@@ -143,10 +174,10 @@ module Twitter
143
174
  # will be returned.
144
175
  #
145
176
  # If a block is given then it will be called for each URL.
146
- def extract_urls(text) # :yields: url
147
- urls_only = extract_urls_with_indices(text).map{|url| url[:url] }
148
- urls_only.each{|url| yield url } if block_given?
149
- urls_only
177
+ def extract_urls(text, &block) # :yields: url
178
+ urls = extract_urls_with_indices(text).map{|u| u[:url]}
179
+ urls.each(&block) if block_given?
180
+ urls
150
181
  end
151
182
 
152
183
  # Extracts a list of all URLs included in the Tweet <tt>text</tt> along
@@ -154,10 +185,11 @@ module Twitter
154
185
  # URLs an empty array will be returned.
155
186
  #
156
187
  # If a block is given then it will be called for each URL.
157
- def extract_urls_with_indices(text) # :yields: url, start, end
158
- return [] unless text
188
+ def extract_urls_with_indices(text, options = {:extract_url_without_protocol => true}) # :yields: url, start, end
189
+ return [] unless text && (options[:extract_url_without_protocol] ? text.index(".") : text.index(":"))
159
190
  urls = []
160
191
  position = 0
192
+
161
193
  text.to_s.scan(Twitter::Regex[:valid_url]) do |all, before, url, protocol, domain, port, path, query|
162
194
  valid_url_match_data = $~
163
195
 
@@ -167,6 +199,7 @@ module Twitter
167
199
  # If protocol is missing and domain contains non-ASCII characters,
168
200
  # extract ASCII-only domains.
169
201
  if !protocol
202
+ next if !options[:extract_url_without_protocol] || before =~ Twitter::Regex[:invalid_url_without_protocol_preceding_chars]
170
203
  last_url = nil
171
204
  last_url_invalid_match = nil
172
205
  domain.scan(Twitter::Regex[:valid_ascii_domain]) do |ascii_domain|
@@ -201,7 +234,7 @@ module Twitter
201
234
  }
202
235
  end
203
236
  end
204
- urls.each{|url| yield url[:url], url[:indices].first, url[:indices].last } if block_given?
237
+ urls.each{|url| yield url[:url], url[:indices].first, url[:indices].last} if block_given?
205
238
  urls
206
239
  end
207
240
 
@@ -211,10 +244,10 @@ module Twitter
211
244
  # character.
212
245
  #
213
246
  # If a block is given then it will be called for each hashtag.
214
- def extract_hashtags(text) # :yields: hashtag_text
215
- hashtags_only = extract_hashtags_with_indices(text).map{|hash| hash[:hashtag] }
216
- hashtags_only.each{|hash| yield hash } if block_given?
217
- hashtags_only
247
+ def extract_hashtags(text, &block) # :yields: hashtag_text
248
+ hashtags = extract_hashtags_with_indices(text).map{|h| h[:hashtag]}
249
+ hashtags.each(&block) if block_given?
250
+ hashtags
218
251
  end
219
252
 
220
253
  # Extracts a list of all hashtags included in the Tweet <tt>text</tt>. If the
@@ -223,13 +256,14 @@ module Twitter
223
256
  # character.
224
257
  #
225
258
  # If a block is given then it will be called for each hashtag.
226
- def extract_hashtags_with_indices(text) # :yields: hashtag_text, start, end
227
- return [] unless text
259
+ def extract_hashtags_with_indices(text, options = {:check_url_overlap => true}) # :yields: hashtag_text, start, end
260
+ return [] unless text =~ /[##]/
228
261
 
229
262
  tags = []
230
- text.scan(Twitter::Regex[:auto_link_hashtags]) do |before, hash, hash_text|
231
- start_position = $~.char_begin(2)
232
- end_position = $~.char_end(3)
263
+ text.scan(Twitter::Regex[:valid_hashtag]) do |before, hash, hash_text|
264
+ match_data = $~
265
+ start_position = match_data.char_begin(2)
266
+ end_position = match_data.char_end(3)
233
267
  after = $'
234
268
  unless after =~ Twitter::Regex[:end_hashtag_match]
235
269
  tags << {
@@ -238,7 +272,57 @@ module Twitter
238
272
  }
239
273
  end
240
274
  end
241
- tags.each{|tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last } if block_given?
275
+
276
+ if options[:check_url_overlap]
277
+ # extract URLs
278
+ urls = extract_urls_with_indices(text)
279
+ unless urls.empty?
280
+ tags.concat(urls)
281
+ # remove duplicates
282
+ tags = remove_overlapping_entities(tags)
283
+ # remove URL entities
284
+ tags.reject!{|entity| !entity[:hashtag] }
285
+ end
286
+ end
287
+
288
+ tags.each{|tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last} if block_given?
289
+ tags
290
+ end
291
+
292
+ # Extracts a list of all cashtags included in the Tweet <tt>text</tt>. If the
293
+ # <tt>text</tt> is <tt>nil</tt> or contains no cashtags an empty array
294
+ # will be returned. The array returned will not include the leading <tt>$</tt>
295
+ # character.
296
+ #
297
+ # If a block is given then it will be called for each cashtag.
298
+ def extract_cashtags(text, &block) # :yields: cashtag_text
299
+ cashtags = extract_cashtags_with_indices(text).map{|h| h[:cashtag]}
300
+ cashtags.each(&block) if block_given?
301
+ cashtags
302
+ end
303
+
304
+ # Extracts a list of all cashtags included in the Tweet <tt>text</tt>. If the
305
+ # <tt>text</tt> is <tt>nil</tt> or contains no cashtags an empty array
306
+ # will be returned. The array returned will not include the leading <tt>$</tt>
307
+ # character.
308
+ #
309
+ # If a block is given then it will be called for each cashtag.
310
+ def extract_cashtags_with_indices(text) # :yields: cashtag_text, start, end
311
+ return [] unless text =~ /\$/
312
+
313
+ tags = []
314
+ text.scan(Twitter::Regex[:valid_cashtag]) do |cash_text|
315
+ match_data = $~
316
+ # cash_text doesn't contain $ symbol, so need to decrement start_position by one
317
+ start_position = match_data.char_begin(1) - 1
318
+ end_position = match_data.char_end(1)
319
+ tags << {
320
+ :cashtag => cash_text[0],
321
+ :indices => [start_position, end_position]
322
+ }
323
+ end
324
+
325
+ tags.each{|tag| yield tag[:cashtag], tag[:indices].first, tag[:indices].last} if block_given?
242
326
  tags
243
327
  end
244
328
  end
@@ -23,9 +23,9 @@ module Twitter
23
23
 
24
24
  chunks = text.split(/[<>]/)
25
25
 
26
- result = ""
26
+ result = []
27
27
  chunk_index, chunk = 0, chunks[0]
28
- chunk_chars = chunk.respond_to?("mb_chars") ? chunk.mb_chars : chunk.respond_to?("chars") && chunk.chars.respond_to?("[]") ? chunk.chars : chunk
28
+ chunk_chars = chunk.to_s.to_char_a
29
29
  prev_chunks_len = 0
30
30
  chunk_cursor = 0
31
31
  start_in_chunk = false
@@ -49,13 +49,13 @@ module Twitter
49
49
  chunk_cursor = 0
50
50
  chunk_index += 2
51
51
  chunk = chunks[chunk_index]
52
- chunk_chars = chunk.respond_to?("mb_chars") ? chunk.mb_chars : chunk.respond_to?("chars") && chunk.chars.respond_to?("[]") ? chunk.chars : chunk
52
+ chunk_chars = chunk.to_s.to_char_a
53
53
  start_in_chunk = false
54
54
  end
55
55
 
56
56
  if !placed && !chunk.nil?
57
57
  hit_spot = hit - prev_chunks_len
58
- result << chunk_chars[chunk_cursor...hit_spot].to_s + tag
58
+ result << chunk_chars[chunk_cursor...hit_spot] << tag
59
59
  chunk_cursor = hit_spot
60
60
  if index % 2 == 0
61
61
  start_in_chunk = true
@@ -80,9 +80,7 @@ module Twitter
80
80
  end
81
81
  end
82
82
 
83
- result
84
- rescue
85
- text
83
+ result.flatten.join
86
84
  end
87
85
  end
88
86
  end
@@ -1,4 +1,5 @@
1
- # encoding: utf-8
1
+ # encoding: UTF-8
2
+
2
3
  module Twitter
3
4
  # A collection of regular expressions for parsing Tweet text. The regular expression
4
5
  # list is frozen at load time to ensure immutability. These regular expressions are
@@ -77,6 +78,7 @@ module Twitter
77
78
  regex_range(0x0289),
78
79
  regex_range(0x028b),
79
80
  regex_range(0x02bb),
81
+ regex_range(0x0300, 0x036f),
80
82
  regex_range(0x1e00, 0x1eff)
81
83
  ].join('').freeze
82
84
 
@@ -86,13 +88,12 @@ module Twitter
86
88
  regex_range(0x0500, 0x0527), # Cyrillic Supplement
87
89
  regex_range(0x2de0, 0x2dff), # Cyrillic Extended A
88
90
  regex_range(0xa640, 0xa69f), # Cyrillic Extended B
89
- regex_range(0x0591, 0x05bd), # Hebrew
90
- regex_range(0x05bf),
91
+ regex_range(0x0591, 0x05bf), # Hebrew
91
92
  regex_range(0x05c1, 0x05c2),
92
93
  regex_range(0x05c4, 0x05c5),
93
94
  regex_range(0x05c7),
94
95
  regex_range(0x05d0, 0x05ea),
95
- regex_range(0x05f0, 0x05f2),
96
+ regex_range(0x05f0, 0x05f4),
96
97
  regex_range(0xfb12, 0xfb28), # Hebrew Presentation Forms
97
98
  regex_range(0xfb2a, 0xfb36),
98
99
  regex_range(0xfb38, 0xfb3c),
@@ -141,38 +142,44 @@ module Twitter
141
142
  regex_range(0x20000, 0x2A6DF), # Kanji (CJK Extension B)
142
143
  regex_range(0x2A700, 0x2B73F), # Kanji (CJK Extension C)
143
144
  regex_range(0x2B740, 0x2B81F), # Kanji (CJK Extension D)
144
- regex_range(0x2F800, 0x2FA1F), regex_range(0x3005), regex_range(0x303B) # Kanji (CJK supplement)
145
+ regex_range(0x2F800, 0x2FA1F), regex_range(0x3003), regex_range(0x3005), regex_range(0x303B) # Kanji (CJK supplement)
145
146
  ].join('').freeze
146
147
 
148
+ PUNCTUATION_CHARS = '!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~'
149
+ SPACE_CHARS = " \t\n\x0B\f\r"
150
+ CTRL_CHARS = "\x00-\x1F\x7F"
151
+
147
152
  # A hashtag must contain latin characters, numbers and underscores, but not all numbers.
148
153
  HASHTAG_ALPHA = /[a-z_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/io
149
154
  HASHTAG_ALPHANUMERIC = /[a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/io
150
- HASHTAG_BOUNDARY = /\A|\z|[^&\/a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/o
155
+ HASHTAG_BOUNDARY = /\A|\z|[^&a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/o
151
156
 
152
157
  HASHTAG = /(#{HASHTAG_BOUNDARY})(#|#)(#{HASHTAG_ALPHANUMERIC}*#{HASHTAG_ALPHA}#{HASHTAG_ALPHANUMERIC}*)/io
153
158
 
154
- REGEXEN[:auto_link_hashtags] = /#{HASHTAG}/io
155
- # Used in Extractor and Rewriter for final filtering
159
+ REGEXEN[:valid_hashtag] = /#{HASHTAG}/io
160
+ # Used in Extractor for final filtering
156
161
  REGEXEN[:end_hashtag_match] = /\A(?:[##]|:\/\/)/o
157
162
 
163
+ REGEXEN[:valid_mention_preceding_chars] = /(?:[^a-zA-Z0-9_!#\$%&*@@]|^|RT:?)/o
158
164
  REGEXEN[:at_signs] = /[@@]/
159
- REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_!#\$%&*@@])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
160
- REGEXEN[:extract_mentions_or_lists] = /(^|[^a-zA-Z0-9_!#\$%&*@@])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?/o
161
- REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
162
- # Used in Extractor and Rewriter for final filtering
163
- REGEXEN[:end_screen_name_match] = /\A(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o
164
-
165
- REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_!#\$%&*@@]|^|RT:?)([@@]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?/o
166
- REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\&lt;\|:~\(|\}:o\{|:\-\[|\&gt;o\&lt;|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/
165
+ REGEXEN[:valid_mention_or_list] = /
166
+ (#{REGEXEN[:valid_mention_preceding_chars]}) # $1: Preceding character
167
+ (#{REGEXEN[:at_signs]}) # $2: At mark
168
+ ([a-zA-Z0-9_]{1,20}) # $3: Screen name
169
+ (\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})? # $4: List (optional)
170
+ /ox
171
+ REGEXEN[:valid_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
172
+ # Used in Extractor for final filtering
173
+ REGEXEN[:end_mention_match] = /\A(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o
167
174
 
168
175
  # URL related hash regex collection
169
- REGEXEN[:valid_preceding_chars] = /(?:[^-\/"'!=A-Z0-9_@@\$##\.#{INVALID_CHARACTERS.join('')}]|^)/io
170
-
171
- DOMAIN_VALID_CHARS = "[^[:punct:][:space:][:blank:][:cntrl:]#{INVALID_CHARACTERS.join('')}#{UNICODE_SPACES.join('')}]"
176
+ REGEXEN[:valid_url_preceding_chars] = /(?:[^A-Z0-9@@$###{INVALID_CHARACTERS.join('')}]|^)/io
177
+ REGEXEN[:invalid_url_without_protocol_preceding_chars] = /[-_.\/]$/
178
+ DOMAIN_VALID_CHARS = "[^#{PUNCTUATION_CHARS}#{SPACE_CHARS}#{CTRL_CHARS}#{INVALID_CHARACTERS.join('')}#{UNICODE_SPACES.join('')}]"
172
179
  REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
173
180
  REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
174
181
 
175
- REGEXEN[:valid_gTLD] = /(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|xxx)(?=[^a-z]|$))/i
182
+ REGEXEN[:valid_gTLD] = /(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|xxx)(?=[^0-9a-z]|$))/i
176
183
  REGEXEN[:valid_ccTLD] = %r{
177
184
  (?:
178
185
  (?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|
@@ -181,10 +188,10 @@ module Twitter
181
188
  lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|
182
189
  pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf|tg|th|
183
190
  tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)
184
- (?=[^a-z]|$)
191
+ (?=[^0-9a-z]|$)
185
192
  )
186
193
  }ix
187
- REGEXEN[:valid_punycode] = /(?:xn--[0-9a-z]+)/
194
+ REGEXEN[:valid_punycode] = /(?:xn--[0-9a-z]+)/i
188
195
 
189
196
  REGEXEN[:valid_domain] = /(?:
190
197
  #{REGEXEN[:valid_subdomain]}*#{REGEXEN[:valid_domain_name]}
@@ -226,7 +233,7 @@ module Twitter
226
233
  REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/]/i
227
234
  REGEXEN[:valid_url] = %r{
228
235
  ( # $1 total match
229
- (#{REGEXEN[:valid_preceding_chars]}) # $2 Preceeding chracter
236
+ (#{REGEXEN[:valid_url_preceding_chars]}) # $2 Preceding character
230
237
  ( # $3 URL
231
238
  (https?:\/\/)? # $4 Protocol (optional)
232
239
  (#{REGEXEN[:valid_domain]}) # $5 Domain(s)
@@ -237,6 +244,9 @@ module Twitter
237
244
  )
238
245
  }iox;
239
246
 
247
+ REGEXEN[:cashtag] = /[a-z]{1,6}(?:[._][a-z]{1,2})?/i
248
+ REGEXEN[:valid_cashtag] = /(?:^|#{REGEXEN[:spaces]})\$(#{REGEXEN[:cashtag]})(?=$|\s|[#{PUNCTUATION_CHARS}])/i
249
+
240
250
  # These URL validation pattern strings are based on the ABNF from RFC 3986
241
251
  REGEXEN[:validate_url_unreserved] = /[a-z0-9\-._~]/i
242
252
  REGEXEN[:validate_url_pct_encoded] = /(?:%[0-9a-f]{2})/i
@@ -0,0 +1,59 @@
1
+ module Twitter
2
+ # A module that provides base methods to rewrite usernames, lists, hashtags and URLs.
3
+ module Rewriter extend self
4
+ def rewrite_entities(text, entities)
5
+ chars = text.to_s.to_char_a
6
+
7
+ # sort by start index
8
+ entities = entities.sort_by{|entity| entity[:indices].first}
9
+
10
+ result = []
11
+ last_index = entities.inject(0) do |last_index, entity|
12
+ result << chars[last_index...entity[:indices].first]
13
+ result << yield(entity, chars)
14
+ entity[:indices].last
15
+ end
16
+ result << chars[last_index..-1]
17
+
18
+ result.flatten.join
19
+ end
20
+
21
+ # These methods are deprecated, will be removed in future.
22
+ extend Deprecation
23
+
24
+ def rewrite(text, options = {})
25
+ [:hashtags, :urls, :usernames_or_lists].inject(text) do |key|
26
+ options[key] ? send(:"rewrite_#{key}", text, &options[key]) : text
27
+ end
28
+ end
29
+ deprecate :rewrite, :rewrite_entities
30
+
31
+ def rewrite_usernames_or_lists(text)
32
+ entities = Extractor.extract_mentions_or_lists_with_indices(text)
33
+ rewrite_entities(text, entities) do |entity, chars|
34
+ at = chars[entity[:indices].first]
35
+ list_slug = entity[:list_slug]
36
+ list_slug = nil if list_slug.empty?
37
+ yield(at, entity[:screen_name], list_slug)
38
+ end
39
+ end
40
+ deprecate :rewrite_usernames_or_lists, :rewrite_entities
41
+
42
+ def rewrite_hashtags(text)
43
+ entities = Extractor.extract_hashtags_with_indices(text)
44
+ rewrite_entities(text, entities) do |entity, chars|
45
+ hash = chars[entity[:indices].first]
46
+ yield(hash, entity[:hashtag])
47
+ end
48
+ end
49
+ deprecate :rewrite_hashtags, :rewrite_entities
50
+
51
+ def rewrite_urls(text)
52
+ entities = Extractor.extract_urls_with_indices(text, :extract_url_without_protocol => false)
53
+ rewrite_entities(text, entities) do |entity, chars|
54
+ yield(entity[:url])
55
+ end
56
+ end
57
+ deprecate :rewrite_urls, :rewrite_entities
58
+ end
59
+ end
File without changes
@@ -2,6 +2,11 @@ module Twitter
2
2
  module Validation extend self
3
3
  MAX_LENGTH = 140
4
4
 
5
+ DEFAULT_TCO_URL_LENGTHS = {
6
+ :short_url_length => 20,
7
+ :short_url_length_https => 21
8
+ }.freeze
9
+
5
10
  # Returns the length of the string as it would be displayed. This is equivalent to the length of the Unicode NFC
6
11
  # (See: http://www.unicode.org/reports/tr15). This is needed in order to consistently calculate the length of a
7
12
  # string no matter which actual form was transmitted. For example:
@@ -14,8 +19,17 @@ module Twitter
14
19
  #
15
20
  # The string could also contain U+00E9 already, in which case the canonicalization will not change the value.
16
21
  #
17
- def tweet_length(text)
18
- ActiveSupport::Multibyte::Chars.new(text).normalize(:c).length
22
+ def tweet_length(text, options = {})
23
+ options = DEFAULT_TCO_URL_LENGTHS.merge(options)
24
+
25
+ length = ActiveSupport::Multibyte::Chars.new(text).normalize(:c).length
26
+
27
+ Twitter::Extractor.extract_urls_with_indices(text) do |url, start_position, end_position|
28
+ length += start_position - end_position
29
+ length += url.downcase =~ /^https:\/\// ? options[:short_url_length_https] : options[:short_url_length]
30
+ end
31
+
32
+ length
19
33
  end
20
34
 
21
35
  # Check the <tt>text</tt> for any reason that it may not be valid as a Tweet. This is meant as a pre-validation
@@ -52,7 +66,7 @@ module Twitter
52
66
  extracted.size == 1 && extracted.first == username[1..-1]
53
67
  end
54
68
 
55
- VALID_LIST_RE = /\A#{Twitter::Regex[:auto_link_usernames_or_lists]}\z/o
69
+ VALID_LIST_RE = /\A#{Twitter::Regex[:valid_mention_or_list]}\z/o
56
70
  def valid_list?(username_list)
57
71
  match = username_list.match(VALID_LIST_RE)
58
72
  # Must have matched and had nothing before or after
data/lib/twitter-text.rb CHANGED
@@ -10,11 +10,17 @@ end
10
10
 
11
11
  require 'active_support'
12
12
  require 'active_support/core_ext/string/multibyte.rb'
13
+ require 'active_support/core_ext/hash/keys.rb'
13
14
 
14
- require File.join(File.dirname(__FILE__), 'regex')
15
- require File.join(File.dirname(__FILE__), 'rewriter')
16
- require File.join(File.dirname(__FILE__), 'autolink')
17
- require File.join(File.dirname(__FILE__), 'extractor')
18
- require File.join(File.dirname(__FILE__), 'unicode')
19
- require File.join(File.dirname(__FILE__), 'validation')
20
- require File.join(File.dirname(__FILE__), 'hithighlighter')
15
+ %w(
16
+ deprecation
17
+ regex
18
+ rewriter
19
+ autolink
20
+ extractor
21
+ unicode
22
+ validation
23
+ hit_highlighter
24
+ ).each do |name|
25
+ require "twitter-text/#{name}"
26
+ end