twitter-text 1.4.17 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,5 @@
1
+ # encoding: UTF-8
2
+
1
3
  class String
2
4
  # Helper function to count the character length by first converting to an
3
5
  # array. This is needed because with unicode strings, the return value
@@ -45,16 +47,48 @@ module Twitter
45
47
  # A module for including Tweet parsing in a class. This module provides functions for the extraction and processing
46
48
  # of usernames, lists, URLs and hashtags.
47
49
  module Extractor extend self
50
+ # Remove overlapping entities.
51
+ # This returns a new array with no overlapping entities.
52
+ def remove_overlapping_entities(entities)
53
+ # sort by start index
54
+ entities = entities.sort_by{|entity| entity[:indices].first}
55
+
56
+ # remove duplicates
57
+ prev = nil
58
+ entities.reject!{|entity| (prev && prev[:indices].last > entity[:indices].first) || (prev = entity) && false}
59
+ entities
60
+ end
61
+
62
+ # Extracts all usernames, lists, hashtags and URLs in the Tweet <tt>text</tt>
63
+ # along with the indices for where the entity occurred
64
+ # If the <tt>text</tt> is <tt>nil</tt> or contains no entity an empty array
65
+ # will be returned.
66
+ #
67
+ # If a block is given then it will be called for each entity.
68
+ def extract_entities_with_indices(text, options = {}, &block)
69
+ # extract all entities
70
+ entities = extract_urls_with_indices(text, options) +
71
+ extract_hashtags_with_indices(text, :check_url_overlap => false) +
72
+ extract_mentions_or_lists_with_indices(text) +
73
+ extract_cashtags_with_indices(text)
74
+
75
+ return [] if entities.empty?
76
+
77
+ entities = remove_overlapping_entities(entities)
78
+
79
+ entities.each(&block) if block_given?
80
+ entities
81
+ end
48
82
 
49
83
  # Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>. If the
50
84
  # <tt>text</tt> is <tt>nil</tt> or contains no username mentions an empty array
51
85
  # will be returned.
52
86
  #
53
87
  # If a block is given then it will be called for each username.
54
- def extract_mentioned_screen_names(text) # :yields: username
55
- screen_names_only = extract_mentioned_screen_names_with_indices(text).map{|mention| mention[:screen_name] }
56
- screen_names_only.each{|mention| yield mention } if block_given?
57
- screen_names_only
88
+ def extract_mentioned_screen_names(text, &block) # :yields: username
89
+ screen_names = extract_mentioned_screen_names_with_indices(text).map{|m| m[:screen_name]}
90
+ screen_names.each(&block) if block_given?
91
+ screen_names
58
92
  end
59
93
 
60
94
  # Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>
@@ -68,23 +102,20 @@ module Twitter
68
102
  return [] unless text
69
103
 
70
104
  possible_screen_names = []
71
- text.to_s.scan(Twitter::Regex[:extract_mentions]) do |before, sn|
72
- extract_mentions_match_data = $~
73
- after = $'
74
- unless after =~ Twitter::Regex[:end_screen_name_match]
75
- start_position = extract_mentions_match_data.char_begin(2) - 1
76
- end_position = extract_mentions_match_data.char_end(2)
77
- possible_screen_names << {
78
- :screen_name => sn,
79
- :indices => [start_position, end_position]
80
- }
81
- end
105
+ extract_mentions_or_lists_with_indices(text) do |screen_name, list_slug, start_position, end_position|
106
+ next unless list_slug.empty?
107
+ possible_screen_names << {
108
+ :screen_name => screen_name,
109
+ :indices => [start_position, end_position]
110
+ }
82
111
  end
112
+
83
113
  if block_given?
84
114
  possible_screen_names.each do |mention|
85
115
  yield mention[:screen_name], mention[:indices].first, mention[:indices].last
86
116
  end
87
117
  end
118
+
88
119
  possible_screen_names
89
120
  end
90
121
 
@@ -97,17 +128,17 @@ module Twitter
97
128
  # index, and the end index in the <tt>text</tt>. The list_slug will be an empty string
98
129
  # if this is a username mention.
99
130
  def extract_mentions_or_lists_with_indices(text) # :yields: username, list_slug, start, end
100
- return [] unless text
131
+ return [] unless text =~ /[@@]/
101
132
 
102
133
  possible_entries = []
103
- text.to_s.scan(Twitter::Regex[:extract_mentions_or_lists]) do |before, sn, list_slug|
104
- extract_mentions_match_data = $~
134
+ text.to_s.scan(Twitter::Regex[:valid_mention_or_list]) do |before, at, screen_name, list_slug|
135
+ match_data = $~
105
136
  after = $'
106
- unless after =~ Twitter::Regex[:end_screen_name_match]
107
- start_position = extract_mentions_match_data.char_begin(2) - 1
108
- end_position = extract_mentions_match_data.char_end(list_slug.nil? ? 2 : 3)
137
+ unless after =~ Twitter::Regex[:end_mention_match]
138
+ start_position = match_data.char_begin(3) - 1
139
+ end_position = match_data.char_end(list_slug.nil? ? 3 : 4)
109
140
  possible_entries << {
110
- :screen_name => sn,
141
+ :screen_name => screen_name,
111
142
  :list_slug => list_slug || "",
112
143
  :indices => [start_position, end_position]
113
144
  }
@@ -130,9 +161,9 @@ module Twitter
130
161
  def extract_reply_screen_name(text) # :yields: username
131
162
  return nil unless text
132
163
 
133
- possible_screen_name = text.match(Twitter::Regex[:extract_reply])
164
+ possible_screen_name = text.match(Twitter::Regex[:valid_reply])
134
165
  return unless possible_screen_name.respond_to?(:captures)
135
- return if $' =~ Twitter::Regex[:end_screen_name_match]
166
+ return if $' =~ Twitter::Regex[:end_mention_match]
136
167
  screen_name = possible_screen_name.captures.first
137
168
  yield screen_name if block_given?
138
169
  screen_name
@@ -143,10 +174,10 @@ module Twitter
143
174
  # will be returned.
144
175
  #
145
176
  # If a block is given then it will be called for each URL.
146
- def extract_urls(text) # :yields: url
147
- urls_only = extract_urls_with_indices(text).map{|url| url[:url] }
148
- urls_only.each{|url| yield url } if block_given?
149
- urls_only
177
+ def extract_urls(text, &block) # :yields: url
178
+ urls = extract_urls_with_indices(text).map{|u| u[:url]}
179
+ urls.each(&block) if block_given?
180
+ urls
150
181
  end
151
182
 
152
183
  # Extracts a list of all URLs included in the Tweet <tt>text</tt> along
@@ -154,10 +185,11 @@ module Twitter
154
185
  # URLs an empty array will be returned.
155
186
  #
156
187
  # If a block is given then it will be called for each URL.
157
- def extract_urls_with_indices(text) # :yields: url, start, end
158
- return [] unless text
188
+ def extract_urls_with_indices(text, options = {:extract_url_without_protocol => true}) # :yields: url, start, end
189
+ return [] unless text && (options[:extract_url_without_protocol] ? text.index(".") : text.index(":"))
159
190
  urls = []
160
191
  position = 0
192
+
161
193
  text.to_s.scan(Twitter::Regex[:valid_url]) do |all, before, url, protocol, domain, port, path, query|
162
194
  valid_url_match_data = $~
163
195
 
@@ -167,6 +199,7 @@ module Twitter
167
199
  # If protocol is missing and domain contains non-ASCII characters,
168
200
  # extract ASCII-only domains.
169
201
  if !protocol
202
+ next if !options[:extract_url_without_protocol] || before =~ Twitter::Regex[:invalid_url_without_protocol_preceding_chars]
170
203
  last_url = nil
171
204
  last_url_invalid_match = nil
172
205
  domain.scan(Twitter::Regex[:valid_ascii_domain]) do |ascii_domain|
@@ -201,7 +234,7 @@ module Twitter
201
234
  }
202
235
  end
203
236
  end
204
- urls.each{|url| yield url[:url], url[:indices].first, url[:indices].last } if block_given?
237
+ urls.each{|url| yield url[:url], url[:indices].first, url[:indices].last} if block_given?
205
238
  urls
206
239
  end
207
240
 
@@ -211,10 +244,10 @@ module Twitter
211
244
  # character.
212
245
  #
213
246
  # If a block is given then it will be called for each hashtag.
214
- def extract_hashtags(text) # :yields: hashtag_text
215
- hashtags_only = extract_hashtags_with_indices(text).map{|hash| hash[:hashtag] }
216
- hashtags_only.each{|hash| yield hash } if block_given?
217
- hashtags_only
247
+ def extract_hashtags(text, &block) # :yields: hashtag_text
248
+ hashtags = extract_hashtags_with_indices(text).map{|h| h[:hashtag]}
249
+ hashtags.each(&block) if block_given?
250
+ hashtags
218
251
  end
219
252
 
220
253
  # Extracts a list of all hashtags included in the Tweet <tt>text</tt>. If the
@@ -223,13 +256,14 @@ module Twitter
223
256
  # character.
224
257
  #
225
258
  # If a block is given then it will be called for each hashtag.
226
- def extract_hashtags_with_indices(text) # :yields: hashtag_text, start, end
227
- return [] unless text
259
+ def extract_hashtags_with_indices(text, options = {:check_url_overlap => true}) # :yields: hashtag_text, start, end
260
+ return [] unless text =~ /[##]/
228
261
 
229
262
  tags = []
230
- text.scan(Twitter::Regex[:auto_link_hashtags]) do |before, hash, hash_text|
231
- start_position = $~.char_begin(2)
232
- end_position = $~.char_end(3)
263
+ text.scan(Twitter::Regex[:valid_hashtag]) do |before, hash, hash_text|
264
+ match_data = $~
265
+ start_position = match_data.char_begin(2)
266
+ end_position = match_data.char_end(3)
233
267
  after = $'
234
268
  unless after =~ Twitter::Regex[:end_hashtag_match]
235
269
  tags << {
@@ -238,7 +272,57 @@ module Twitter
238
272
  }
239
273
  end
240
274
  end
241
- tags.each{|tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last } if block_given?
275
+
276
+ if options[:check_url_overlap]
277
+ # extract URLs
278
+ urls = extract_urls_with_indices(text)
279
+ unless urls.empty?
280
+ tags.concat(urls)
281
+ # remove duplicates
282
+ tags = remove_overlapping_entities(tags)
283
+ # remove URL entities
284
+ tags.reject!{|entity| !entity[:hashtag] }
285
+ end
286
+ end
287
+
288
+ tags.each{|tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last} if block_given?
289
+ tags
290
+ end
291
+
292
+ # Extracts a list of all cashtags included in the Tweet <tt>text</tt>. If the
293
+ # <tt>text</tt> is <tt>nil</tt> or contains no cashtags an empty array
294
+ # will be returned. The array returned will not include the leading <tt>$</tt>
295
+ # character.
296
+ #
297
+ # If a block is given then it will be called for each cashtag.
298
+ def extract_cashtags(text, &block) # :yields: cashtag_text
299
+ cashtags = extract_cashtags_with_indices(text).map{|h| h[:cashtag]}
300
+ cashtags.each(&block) if block_given?
301
+ cashtags
302
+ end
303
+
304
+ # Extracts a list of all cashtags included in the Tweet <tt>text</tt>. If the
305
+ # <tt>text</tt> is <tt>nil</tt> or contains no cashtags an empty array
306
+ # will be returned. The array returned will not include the leading <tt>$</tt>
307
+ # character.
308
+ #
309
+ # If a block is given then it will be called for each cashtag.
310
+ def extract_cashtags_with_indices(text) # :yields: cashtag_text, start, end
311
+ return [] unless text =~ /\$/
312
+
313
+ tags = []
314
+ text.scan(Twitter::Regex[:valid_cashtag]) do |cash_text|
315
+ match_data = $~
316
+ # cash_text doesn't contain $ symbol, so need to decrement start_position by one
317
+ start_position = match_data.char_begin(1) - 1
318
+ end_position = match_data.char_end(1)
319
+ tags << {
320
+ :cashtag => cash_text[0],
321
+ :indices => [start_position, end_position]
322
+ }
323
+ end
324
+
325
+ tags.each{|tag| yield tag[:cashtag], tag[:indices].first, tag[:indices].last} if block_given?
242
326
  tags
243
327
  end
244
328
  end
@@ -23,9 +23,9 @@ module Twitter
23
23
 
24
24
  chunks = text.split(/[<>]/)
25
25
 
26
- result = ""
26
+ result = []
27
27
  chunk_index, chunk = 0, chunks[0]
28
- chunk_chars = chunk.respond_to?("mb_chars") ? chunk.mb_chars : chunk.respond_to?("chars") && chunk.chars.respond_to?("[]") ? chunk.chars : chunk
28
+ chunk_chars = chunk.to_s.to_char_a
29
29
  prev_chunks_len = 0
30
30
  chunk_cursor = 0
31
31
  start_in_chunk = false
@@ -49,13 +49,13 @@ module Twitter
49
49
  chunk_cursor = 0
50
50
  chunk_index += 2
51
51
  chunk = chunks[chunk_index]
52
- chunk_chars = chunk.respond_to?("mb_chars") ? chunk.mb_chars : chunk.respond_to?("chars") && chunk.chars.respond_to?("[]") ? chunk.chars : chunk
52
+ chunk_chars = chunk.to_s.to_char_a
53
53
  start_in_chunk = false
54
54
  end
55
55
 
56
56
  if !placed && !chunk.nil?
57
57
  hit_spot = hit - prev_chunks_len
58
- result << chunk_chars[chunk_cursor...hit_spot].to_s + tag
58
+ result << chunk_chars[chunk_cursor...hit_spot] << tag
59
59
  chunk_cursor = hit_spot
60
60
  if index % 2 == 0
61
61
  start_in_chunk = true
@@ -80,9 +80,7 @@ module Twitter
80
80
  end
81
81
  end
82
82
 
83
- result
84
- rescue
85
- text
83
+ result.flatten.join
86
84
  end
87
85
  end
88
86
  end
@@ -1,4 +1,5 @@
1
- # encoding: utf-8
1
+ # encoding: UTF-8
2
+
2
3
  module Twitter
3
4
  # A collection of regular expressions for parsing Tweet text. The regular expression
4
5
  # list is frozen at load time to ensure immutability. These regular expressions are
@@ -77,6 +78,7 @@ module Twitter
77
78
  regex_range(0x0289),
78
79
  regex_range(0x028b),
79
80
  regex_range(0x02bb),
81
+ regex_range(0x0300, 0x036f),
80
82
  regex_range(0x1e00, 0x1eff)
81
83
  ].join('').freeze
82
84
 
@@ -86,13 +88,12 @@ module Twitter
86
88
  regex_range(0x0500, 0x0527), # Cyrillic Supplement
87
89
  regex_range(0x2de0, 0x2dff), # Cyrillic Extended A
88
90
  regex_range(0xa640, 0xa69f), # Cyrillic Extended B
89
- regex_range(0x0591, 0x05bd), # Hebrew
90
- regex_range(0x05bf),
91
+ regex_range(0x0591, 0x05bf), # Hebrew
91
92
  regex_range(0x05c1, 0x05c2),
92
93
  regex_range(0x05c4, 0x05c5),
93
94
  regex_range(0x05c7),
94
95
  regex_range(0x05d0, 0x05ea),
95
- regex_range(0x05f0, 0x05f2),
96
+ regex_range(0x05f0, 0x05f4),
96
97
  regex_range(0xfb12, 0xfb28), # Hebrew Presentation Forms
97
98
  regex_range(0xfb2a, 0xfb36),
98
99
  regex_range(0xfb38, 0xfb3c),
@@ -141,38 +142,44 @@ module Twitter
141
142
  regex_range(0x20000, 0x2A6DF), # Kanji (CJK Extension B)
142
143
  regex_range(0x2A700, 0x2B73F), # Kanji (CJK Extension C)
143
144
  regex_range(0x2B740, 0x2B81F), # Kanji (CJK Extension D)
144
- regex_range(0x2F800, 0x2FA1F), regex_range(0x3005), regex_range(0x303B) # Kanji (CJK supplement)
145
+ regex_range(0x2F800, 0x2FA1F), regex_range(0x3003), regex_range(0x3005), regex_range(0x303B) # Kanji (CJK supplement)
145
146
  ].join('').freeze
146
147
 
148
+ PUNCTUATION_CHARS = '!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~'
149
+ SPACE_CHARS = " \t\n\x0B\f\r"
150
+ CTRL_CHARS = "\x00-\x1F\x7F"
151
+
147
152
  # A hashtag must contain latin characters, numbers and underscores, but not all numbers.
148
153
  HASHTAG_ALPHA = /[a-z_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/io
149
154
  HASHTAG_ALPHANUMERIC = /[a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/io
150
- HASHTAG_BOUNDARY = /\A|\z|[^&\/a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/o
155
+ HASHTAG_BOUNDARY = /\A|\z|[^&a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/o
151
156
 
152
157
  HASHTAG = /(#{HASHTAG_BOUNDARY})(#|#)(#{HASHTAG_ALPHANUMERIC}*#{HASHTAG_ALPHA}#{HASHTAG_ALPHANUMERIC}*)/io
153
158
 
154
- REGEXEN[:auto_link_hashtags] = /#{HASHTAG}/io
155
- # Used in Extractor and Rewriter for final filtering
159
+ REGEXEN[:valid_hashtag] = /#{HASHTAG}/io
160
+ # Used in Extractor for final filtering
156
161
  REGEXEN[:end_hashtag_match] = /\A(?:[##]|:\/\/)/o
157
162
 
163
+ REGEXEN[:valid_mention_preceding_chars] = /(?:[^a-zA-Z0-9_!#\$%&*@@]|^|RT:?)/o
158
164
  REGEXEN[:at_signs] = /[@@]/
159
- REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_!#\$%&*@@])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
160
- REGEXEN[:extract_mentions_or_lists] = /(^|[^a-zA-Z0-9_!#\$%&*@@])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?/o
161
- REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
162
- # Used in Extractor and Rewriter for final filtering
163
- REGEXEN[:end_screen_name_match] = /\A(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o
164
-
165
- REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_!#\$%&*@@]|^|RT:?)([@@]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?/o
166
- REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\&lt;\|:~\(|\}:o\{|:\-\[|\&gt;o\&lt;|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/
165
+ REGEXEN[:valid_mention_or_list] = /
166
+ (#{REGEXEN[:valid_mention_preceding_chars]}) # $1: Preceding character
167
+ (#{REGEXEN[:at_signs]}) # $2: At mark
168
+ ([a-zA-Z0-9_]{1,20}) # $3: Screen name
169
+ (\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})? # $4: List (optional)
170
+ /ox
171
+ REGEXEN[:valid_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
172
+ # Used in Extractor for final filtering
173
+ REGEXEN[:end_mention_match] = /\A(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o
167
174
 
168
175
  # URL related hash regex collection
169
- REGEXEN[:valid_preceding_chars] = /(?:[^-\/"'!=A-Z0-9_@@\$##\.#{INVALID_CHARACTERS.join('')}]|^)/io
170
-
171
- DOMAIN_VALID_CHARS = "[^[:punct:][:space:][:blank:][:cntrl:]#{INVALID_CHARACTERS.join('')}#{UNICODE_SPACES.join('')}]"
176
+ REGEXEN[:valid_url_preceding_chars] = /(?:[^A-Z0-9@@$###{INVALID_CHARACTERS.join('')}]|^)/io
177
+ REGEXEN[:invalid_url_without_protocol_preceding_chars] = /[-_.\/]$/
178
+ DOMAIN_VALID_CHARS = "[^#{PUNCTUATION_CHARS}#{SPACE_CHARS}#{CTRL_CHARS}#{INVALID_CHARACTERS.join('')}#{UNICODE_SPACES.join('')}]"
172
179
  REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
173
180
  REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
174
181
 
175
- REGEXEN[:valid_gTLD] = /(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|xxx)(?=[^a-z]|$))/i
182
+ REGEXEN[:valid_gTLD] = /(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|xxx)(?=[^0-9a-z]|$))/i
176
183
  REGEXEN[:valid_ccTLD] = %r{
177
184
  (?:
178
185
  (?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|
@@ -181,10 +188,10 @@ module Twitter
181
188
  lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|
182
189
  pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf|tg|th|
183
190
  tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)
184
- (?=[^a-z]|$)
191
+ (?=[^0-9a-z]|$)
185
192
  )
186
193
  }ix
187
- REGEXEN[:valid_punycode] = /(?:xn--[0-9a-z]+)/
194
+ REGEXEN[:valid_punycode] = /(?:xn--[0-9a-z]+)/i
188
195
 
189
196
  REGEXEN[:valid_domain] = /(?:
190
197
  #{REGEXEN[:valid_subdomain]}*#{REGEXEN[:valid_domain_name]}
@@ -226,7 +233,7 @@ module Twitter
226
233
  REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/]/i
227
234
  REGEXEN[:valid_url] = %r{
228
235
  ( # $1 total match
229
- (#{REGEXEN[:valid_preceding_chars]}) # $2 Preceeding chracter
236
+ (#{REGEXEN[:valid_url_preceding_chars]}) # $2 Preceeding chracter
230
237
  ( # $3 URL
231
238
  (https?:\/\/)? # $4 Protocol (optional)
232
239
  (#{REGEXEN[:valid_domain]}) # $5 Domain(s)
@@ -237,6 +244,9 @@ module Twitter
237
244
  )
238
245
  }iox;
239
246
 
247
+ REGEXEN[:cashtag] = /[a-z]{1,6}(?:[._][a-z]{1,2})?/i
248
+ REGEXEN[:valid_cashtag] = /(?:^|#{REGEXEN[:spaces]})\$(#{REGEXEN[:cashtag]})(?=$|\s|[#{PUNCTUATION_CHARS}])/i
249
+
240
250
  # These URL validation pattern strings are based on the ABNF from RFC 3986
241
251
  REGEXEN[:validate_url_unreserved] = /[a-z0-9\-._~]/i
242
252
  REGEXEN[:validate_url_pct_encoded] = /(?:%[0-9a-f]{2})/i
@@ -0,0 +1,59 @@
1
+ module Twitter
2
+ # A module that provides base methods to rewrite usernames, lists, hashtags and URLs.
3
+ module Rewriter extend self
4
+ def rewrite_entities(text, entities)
5
+ chars = text.to_s.to_char_a
6
+
7
+ # sort by start index
8
+ entities = entities.sort_by{|entity| entity[:indices].first}
9
+
10
+ result = []
11
+ last_index = entities.inject(0) do |last_index, entity|
12
+ result << chars[last_index...entity[:indices].first]
13
+ result << yield(entity, chars)
14
+ entity[:indices].last
15
+ end
16
+ result << chars[last_index..-1]
17
+
18
+ result.flatten.join
19
+ end
20
+
21
+ # These methods are deprecated, will be removed in future.
22
+ extend Deprecation
23
+
24
+ def rewrite(text, options = {})
25
+ [:hashtags, :urls, :usernames_or_lists].inject(text) do |key|
26
+ options[key] ? send(:"rewrite_#{key}", text, &options[key]) : text
27
+ end
28
+ end
29
+ deprecate :rewrite, :rewrite_entities
30
+
31
+ def rewrite_usernames_or_lists(text)
32
+ entities = Extractor.extract_mentions_or_lists_with_indices(text)
33
+ rewrite_entities(text, entities) do |entity, chars|
34
+ at = chars[entity[:indices].first]
35
+ list_slug = entity[:list_slug]
36
+ list_slug = nil if list_slug.empty?
37
+ yield(at, entity[:screen_name], list_slug)
38
+ end
39
+ end
40
+ deprecate :rewrite_usernames_or_lists, :rewrite_entities
41
+
42
+ def rewrite_hashtags(text)
43
+ entities = Extractor.extract_hashtags_with_indices(text)
44
+ rewrite_entities(text, entities) do |entity, chars|
45
+ hash = chars[entity[:indices].first]
46
+ yield(hash, entity[:hashtag])
47
+ end
48
+ end
49
+ deprecate :rewrite_hashtags, :rewrite_entities
50
+
51
+ def rewrite_urls(text)
52
+ entities = Extractor.extract_urls_with_indices(text, :extract_url_without_protocol => false)
53
+ rewrite_entities(text, entities) do |entity, chars|
54
+ yield(entity[:url])
55
+ end
56
+ end
57
+ deprecate :rewrite_urls, :rewrite_entities
58
+ end
59
+ end
File without changes
@@ -2,6 +2,11 @@ module Twitter
2
2
  module Validation extend self
3
3
  MAX_LENGTH = 140
4
4
 
5
+ DEFAULT_TCO_URL_LENGTHS = {
6
+ :short_url_length => 20,
7
+ :short_url_length_https => 21
8
+ }.freeze
9
+
5
10
  # Returns the length of the string as it would be displayed. This is equivalent to the length of the Unicode NFC
6
11
  # (See: http://www.unicode.org/reports/tr15). This is needed in order to consistently calculate the length of a
7
12
  # string no matter which actual form was transmitted. For example:
@@ -14,8 +19,17 @@ module Twitter
14
19
  #
15
20
  # The string could also contain U+00E9 already, in which case the canonicalization will not change the value.
16
21
  #
17
- def tweet_length(text)
18
- ActiveSupport::Multibyte::Chars.new(text).normalize(:c).length
22
+ def tweet_length(text, options = {})
23
+ options = DEFAULT_TCO_URL_LENGTHS.merge(options)
24
+
25
+ length = ActiveSupport::Multibyte::Chars.new(text).normalize(:c).length
26
+
27
+ Twitter::Extractor.extract_urls_with_indices(text) do |url, start_position, end_position|
28
+ length += start_position - end_position
29
+ length += url.downcase =~ /^https:\/\// ? options[:short_url_length_https] : options[:short_url_length]
30
+ end
31
+
32
+ length
19
33
  end
20
34
 
21
35
  # Check the <tt>text</tt> for any reason that it may not be valid as a Tweet. This is meant as a pre-validation
@@ -52,7 +66,7 @@ module Twitter
52
66
  extracted.size == 1 && extracted.first == username[1..-1]
53
67
  end
54
68
 
55
- VALID_LIST_RE = /\A#{Twitter::Regex[:auto_link_usernames_or_lists]}\z/o
69
+ VALID_LIST_RE = /\A#{Twitter::Regex[:valid_mention_or_list]}\z/o
56
70
  def valid_list?(username_list)
57
71
  match = username_list.match(VALID_LIST_RE)
58
72
  # Must have matched and had nothing before or after
data/lib/twitter-text.rb CHANGED
@@ -10,11 +10,17 @@ end
10
10
 
11
11
  require 'active_support'
12
12
  require 'active_support/core_ext/string/multibyte.rb'
13
+ require 'active_support/core_ext/hash/keys.rb'
13
14
 
14
- require File.join(File.dirname(__FILE__), 'regex')
15
- require File.join(File.dirname(__FILE__), 'rewriter')
16
- require File.join(File.dirname(__FILE__), 'autolink')
17
- require File.join(File.dirname(__FILE__), 'extractor')
18
- require File.join(File.dirname(__FILE__), 'unicode')
19
- require File.join(File.dirname(__FILE__), 'validation')
20
- require File.join(File.dirname(__FILE__), 'hithighlighter')
15
+ %w(
16
+ deprecation
17
+ regex
18
+ rewriter
19
+ autolink
20
+ extractor
21
+ unicode
22
+ validation
23
+ hit_highlighter
24
+ ).each do |name|
25
+ require "twitter-text/#{name}"
26
+ end