twitter-text 2.0.2 → 2.1.0

This diff shows the changes between publicly released versions of this package as they appear in their respective public registries. It is provided for informational purposes only.
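
Every hunk below makes the same structural change: code that lived directly under the `Twitter` module in 2.0.2 (`Twitter::Configuration`, `Twitter::Deprecation`, `Twitter::Extractor`, plus the `Twitter::Regex` and `Twitter::WeightedRange` references) is wrapped in a new `Twitter::TwitterText` namespace in 2.1.0. The Ruby sketch below is illustrative only and is not part of the diff; it assumes the gem is loaded with `require 'twitter-text'` and shows how calling code refers to the constants before and after the upgrade.

# Illustrative sketch, not part of the diff: constant paths before and after
# the namespace change. Assumes `require 'twitter-text'` loads the gem.
require 'twitter-text'

# twitter-text 2.0.2
# config   = Twitter::Configuration.default_configuration
# hashtags = Twitter::Extractor.extract_hashtags("ship it #ruby")

# twitter-text 2.1.0
config   = Twitter::TwitterText::Configuration.default_configuration
hashtags = Twitter::TwitterText::Extractor.extract_hashtags("ship it #ruby")

puts config.max_weighted_tweet_length   # attr_reader defined in the first hunk
puts hashtags.inspect                   # => ["ruby"]
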
@@ -1,53 +1,54 @@
 # encoding: UTF-8

 module Twitter
-  class Configuration
-    require 'json'
-
-    PARSER_VERSION_CLASSIC = "v1"
-    PARSER_VERSION_DEFAULT = "v2"
-
-    class << self
-      attr_accessor :default_configuration
-    end
-
-    attr_reader :version, :max_weighted_tweet_length, :scale
-    attr_reader :default_weight, :transformed_url_length, :ranges
-
-    CONFIG_V1 = File.join(
-      File.expand_path('../../../config', __FILE__), # project root
-      "#{PARSER_VERSION_CLASSIC}.json"
-    )
-
-    CONFIG_V2 = File.join(
-      File.expand_path('../../../config', __FILE__), # project root
-      "#{PARSER_VERSION_DEFAULT}.json"
-    )
-
-    def self.parse_string(string, options = {})
-      JSON.parse(string, options.merge(symbolize_names: true))
+  module TwitterText
+    class Configuration
+      require 'json'
+
+      PARSER_VERSION_CLASSIC = "v1"
+      PARSER_VERSION_DEFAULT = "v2"
+
+      class << self
+        attr_accessor :default_configuration
+      end
+
+      attr_reader :version, :max_weighted_tweet_length, :scale
+      attr_reader :default_weight, :transformed_url_length, :ranges
+
+      CONFIG_V1 = File.join(
+        File.expand_path('../../../config', __FILE__), # project root
+        "#{PARSER_VERSION_CLASSIC}.json"
+      )
+
+      CONFIG_V2 = File.join(
+        File.expand_path('../../../config', __FILE__), # project root
+        "#{PARSER_VERSION_DEFAULT}.json"
+      )
+
+      def self.parse_string(string, options = {})
+        JSON.parse(string, options.merge(symbolize_names: true))
+      end
+
+      def self.parse_file(filename)
+        string = File.open(filename, 'rb') { |f| f.read }
+        parse_string(string)
+      end
+
+      def self.configuration_from_file(filename)
+        config = parse_file(filename)
+        config ? self.new(config) : nil
+      end
+
+      def initialize(config = {})
+        @version = config[:version]
+        @max_weighted_tweet_length = config[:maxWeightedTweetLength]
+        @scale = config[:scale]
+        @default_weight = config[:defaultWeight]
+        @transformed_url_length = config[:transformedURLLength]
+        @ranges = config[:ranges].map { |range| Twitter::TwitterText::WeightedRange.new(range) } if config.key?(:ranges) && config[:ranges].is_a?(Array)
+      end
+
+      self.default_configuration = self.configuration_from_file(CONFIG_V2)
     end
-
-    def self.parse_file(filename)
-      string = File.open(filename, 'rb') { |f| f.read }
-      parse_string(string)
-    end
-
-    def self.configuration_from_file(filename)
-      config = parse_file(filename)
-      config ? Twitter::Configuration.new(config) : nil
-    end
-
-    def initialize(config = {})
-      @version = config[:version]
-      @max_weighted_tweet_length = config[:maxWeightedTweetLength]
-      @scale = config[:scale]
-      @default_weight = config[:defaultWeight]
-      @transformed_url_length = config[:transformedURLLength]
-      @ranges = config[:ranges].map { |range| Twitter::WeightedRange.new(range) } if config.key?(:ranges) && config[:ranges].is_a?(Array)
-    end
-
-    self.default_configuration = Twitter::Configuration.configuration_from_file(Twitter::Configuration::CONFIG_V2)
   end
 end
-
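
Aside from the added `module TwitterText` nesting, the hunk above only rewrites the class's self-references: `configuration_from_file` now builds instances with `self.new(config)` and the default configuration is assigned via `self.configuration_from_file(CONFIG_V2)` rather than through the fully qualified `Twitter::Configuration` constant. Behaviour is unchanged; the v2 JSON configuration is still loaded when the file is required. A hedged sketch, using only constants and methods visible above, of switching to the classic v1 parser configuration under the new namespace:

# Illustrative only: select the classic "v1" configuration after upgrading.
classic = Twitter::TwitterText::Configuration.configuration_from_file(
  Twitter::TwitterText::Configuration::CONFIG_V1
)
Twitter::TwitterText::Configuration.default_configuration = classic
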
@@ -1,14 +1,16 @@
 module Twitter
-  module Deprecation
-    def deprecate(method, new_method = nil)
-      deprecated_method = :"deprecated_#{method}"
-      message = "Deprecation: `#{method}` is deprecated."
-      message << " Please use `#{new_method}` instead." if new_method
+  module TwitterText
+    module Deprecation
+      def deprecate(method, new_method = nil)
+        deprecated_method = :"deprecated_#{method}"
+        message = "Deprecation: `#{method}` is deprecated."
+        message << " Please use `#{new_method}` instead." if new_method

-      alias_method(deprecated_method, method)
-      define_method method do |*args, &block|
-        warn message unless $TESTING
-        send(deprecated_method, *args, &block)
+        alias_method(deprecated_method, method)
+        define_method method do |*args, &block|
+          warn message unless $TESTING
+          send(deprecated_method, *args, &block)
+        end
       end
     end
   end
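
The deprecation helper is unchanged apart from the extra nesting: `deprecate` aliases the original method and redefines it to print a warning (suppressed when `$TESTING` is set) before delegating to the alias. A hypothetical usage sketch follows; the class and method names are invented for illustration and are not part of the gem.

# Hypothetical example of extending the module shown above.
class LegacyFormatter
  extend Twitter::TwitterText::Deprecation

  def old_format(text)
    text.strip
  end

  def new_format(text)
    text.strip
  end

  # Aliases old_format and reinstalls it with a deprecation warning.
  deprecate :old_format, :new_format
end

LegacyFormatter.new.old_format(" hi ")  # warns unless $TESTING is set
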
@@ -45,313 +45,315 @@ class MatchData
 end

 module Twitter
-  # A module for including Tweet parsing in a class. This module provides function for the extraction and processing
-  # of usernames, lists, URLs and hashtags.
-  module Extractor extend self
+  module TwitterText
+    # A module for including Tweet parsing in a class. This module provides function for the extraction and processing
+    # of usernames, lists, URLs and hashtags.
+    module Extractor extend self

-    # Maximum URL length as defined by Twitter's backend.
-    MAX_URL_LENGTH = 4096
+      # Maximum URL length as defined by Twitter's backend.
+      MAX_URL_LENGTH = 4096

-    # The maximum t.co path length that the Twitter backend supports.
-    MAX_TCO_SLUG_LENGTH = 40
+      # The maximum t.co path length that the Twitter backend supports.
+      MAX_TCO_SLUG_LENGTH = 40

-    URL_PROTOCOL_LENGTH = "https://".length
+      URL_PROTOCOL_LENGTH = "https://".length

-    # Remove overlapping entities.
-    # This returns a new array with no overlapping entities.
-    def remove_overlapping_entities(entities)
-      # sort by start index
-      entities = entities.sort_by{|entity| entity[:indices].first}
+      # Remove overlapping entities.
+      # This returns a new array with no overlapping entities.
+      def remove_overlapping_entities(entities)
+        # sort by start index
+        entities = entities.sort_by{|entity| entity[:indices].first}

-      # remove duplicates
-      prev = nil
-      entities.reject!{|entity| (prev && prev[:indices].last > entity[:indices].first) || (prev = entity) && false}
-      entities
-    end
-
-    # Extracts all usernames, lists, hashtags and URLs in the Tweet <tt>text</tt>
-    # along with the indices for where the entity ocurred
-    # If the <tt>text</tt> is <tt>nil</tt> or contains no entity an empty array
-    # will be returned.
-    #
-    # If a block is given then it will be called for each entity.
-    def extract_entities_with_indices(text, options = {}, &block)
-      # extract all entities
-      entities = extract_urls_with_indices(text, options) +
-                 extract_hashtags_with_indices(text, :check_url_overlap => false) +
-                 extract_mentions_or_lists_with_indices(text) +
-                 extract_cashtags_with_indices(text)
-
-      return [] if entities.empty?
-
-      entities = remove_overlapping_entities(entities)
-
-      entities.each(&block) if block_given?
-      entities
-    end
-
-    # Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>. If the
-    # <tt>text</tt> is <tt>nil</tt> or contains no username mentions an empty array
-    # will be returned.
-    #
-    # If a block is given then it will be called for each username.
-    def extract_mentioned_screen_names(text, &block) # :yields: username
-      screen_names = extract_mentioned_screen_names_with_indices(text).map{|m| m[:screen_name]}
-      screen_names.each(&block) if block_given?
-      screen_names
-    end
-
-    # Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>
-    # along with the indices for where the mention ocurred. If the
-    # <tt>text</tt> is nil or contains no username mentions, an empty array
-    # will be returned.
-    #
-    # If a block is given, then it will be called with each username, the start
-    # index, and the end index in the <tt>text</tt>.
-    def extract_mentioned_screen_names_with_indices(text) # :yields: username, start, end
-      return [] unless text
-
-      possible_screen_names = []
-      extract_mentions_or_lists_with_indices(text) do |screen_name, list_slug, start_position, end_position|
-        next unless list_slug.empty?
-        possible_screen_names << {
-          :screen_name => screen_name,
-          :indices => [start_position, end_position]
-        }
+        # remove duplicates
+        prev = nil
+        entities.reject!{|entity| (prev && prev[:indices].last > entity[:indices].first) || (prev = entity) && false}
+        entities
       end

-      if block_given?
-        possible_screen_names.each do |mention|
-          yield mention[:screen_name], mention[:indices].first, mention[:indices].last
-        end
+      # Extracts all usernames, lists, hashtags and URLs in the Tweet <tt>text</tt>
+      # along with the indices for where the entity ocurred
+      # If the <tt>text</tt> is <tt>nil</tt> or contains no entity an empty array
+      # will be returned.
+      #
+      # If a block is given then it will be called for each entity.
+      def extract_entities_with_indices(text, options = {}, &block)
+        # extract all entities
+        entities = extract_urls_with_indices(text, options) +
+                   extract_hashtags_with_indices(text, :check_url_overlap => false) +
+                   extract_mentions_or_lists_with_indices(text) +
+                   extract_cashtags_with_indices(text)
+
+        return [] if entities.empty?
+
+        entities = remove_overlapping_entities(entities)
+
+        entities.each(&block) if block_given?
+        entities
       end

-      possible_screen_names
-    end
+      # Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>. If the
+      # <tt>text</tt> is <tt>nil</tt> or contains no username mentions an empty array
+      # will be returned.
+      #
+      # If a block is given then it will be called for each username.
+      def extract_mentioned_screen_names(text, &block) # :yields: username
+        screen_names = extract_mentioned_screen_names_with_indices(text).map{|m| m[:screen_name]}
+        screen_names.each(&block) if block_given?
+        screen_names
+      end

-    # Extracts a list of all usernames or lists mentioned in the Tweet <tt>text</tt>
-    # along with the indices for where the mention ocurred. If the
-    # <tt>text</tt> is nil or contains no username or list mentions, an empty array
-    # will be returned.
-    #
-    # If a block is given, then it will be called with each username, list slug, the start
-    # index, and the end index in the <tt>text</tt>. The list_slug will be an empty stirng
-    # if this is a username mention.
-    def extract_mentions_or_lists_with_indices(text) # :yields: username, list_slug, start, end
-      return [] unless text =~ /[@＠]/
-
-      possible_entries = []
-      text.to_s.scan(Twitter::Regex[:valid_mention_or_list]) do |before, at, screen_name, list_slug|
-        match_data = $~
-        after = $'
-        unless after =~ Twitter::Regex[:end_mention_match]
-          start_position = match_data.char_begin(3) - 1
-          end_position = match_data.char_end(list_slug.nil? ? 3 : 4)
-          possible_entries << {
+      # Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>
+      # along with the indices for where the mention ocurred. If the
+      # <tt>text</tt> is nil or contains no username mentions, an empty array
+      # will be returned.
+      #
+      # If a block is given, then it will be called with each username, the start
+      # index, and the end index in the <tt>text</tt>.
+      def extract_mentioned_screen_names_with_indices(text) # :yields: username, start, end
+        return [] unless text
+
+        possible_screen_names = []
+        extract_mentions_or_lists_with_indices(text) do |screen_name, list_slug, start_position, end_position|
+          next unless list_slug.empty?
+          possible_screen_names << {
             :screen_name => screen_name,
-            :list_slug => list_slug || "",
             :indices => [start_position, end_position]
           }
         end
-      end

-      if block_given?
-        possible_entries.each do |mention|
-          yield mention[:screen_name], mention[:list_slug], mention[:indices].first, mention[:indices].last
+        if block_given?
+          possible_screen_names.each do |mention|
+            yield mention[:screen_name], mention[:indices].first, mention[:indices].last
+          end
         end
+
+        possible_screen_names
       end

-      possible_entries
-    end
+      # Extracts a list of all usernames or lists mentioned in the Tweet <tt>text</tt>
+      # along with the indices for where the mention ocurred. If the
+      # <tt>text</tt> is nil or contains no username or list mentions, an empty array
+      # will be returned.
+      #
+      # If a block is given, then it will be called with each username, list slug, the start
+      # index, and the end index in the <tt>text</tt>. The list_slug will be an empty stirng
+      # if this is a username mention.
+      def extract_mentions_or_lists_with_indices(text) # :yields: username, list_slug, start, end
+        return [] unless text =~ /[@＠]/
+
+        possible_entries = []
+        text.to_s.scan(Twitter::TwitterText::Regex[:valid_mention_or_list]) do |before, at, screen_name, list_slug|
+          match_data = $~
+          after = $'
+          unless after =~ Twitter::TwitterText::Regex[:end_mention_match]
+            start_position = match_data.char_begin(3) - 1
+            end_position = match_data.char_end(list_slug.nil? ? 3 : 4)
+            possible_entries << {
+              :screen_name => screen_name,
+              :list_slug => list_slug || "",
+              :indices => [start_position, end_position]
+            }
+          end
+        end

-    # Extracts the username username replied to in the Tweet <tt>text</tt>. If the
-    # <tt>text</tt> is <tt>nil</tt> or is not a reply nil will be returned.
-    #
-    # If a block is given then it will be called with the username replied to (if any)
-    def extract_reply_screen_name(text) # :yields: username
-      return nil unless text
-
-      possible_screen_name = text.match(Twitter::Regex[:valid_reply])
-      return unless possible_screen_name.respond_to?(:captures)
-      return if $' =~ Twitter::Regex[:end_mention_match]
-      screen_name = possible_screen_name.captures.first
-      yield screen_name if block_given?
-      screen_name
-    end
+        if block_given?
+          possible_entries.each do |mention|
+            yield mention[:screen_name], mention[:list_slug], mention[:indices].first, mention[:indices].last
+          end
+        end

-    # Extracts a list of all URLs included in the Tweet <tt>text</tt>. If the
-    # <tt>text</tt> is <tt>nil</tt> or contains no URLs an empty array
-    # will be returned.
-    #
-    # If a block is given then it will be called for each URL.
-    def extract_urls(text, &block) # :yields: url
-      urls = extract_urls_with_indices(text).map{|u| u[:url]}
-      urls.each(&block) if block_given?
-      urls
-    end
+        possible_entries
+      end

-    # Extracts a list of all URLs included in the Tweet <tt>text</tt> along
-    # with the indices. If the <tt>text</tt> is <tt>nil</tt> or contains no
-    # URLs an empty array will be returned.
-    #
-    # If a block is given then it will be called for each URL.
-    def extract_urls_with_indices(text, options = {:extract_url_without_protocol => true}) # :yields: url, start, end
-      return [] unless text && (options[:extract_url_without_protocol] ? text.index(".") : text.index(":"))
-      urls = []
-
-      text.to_s.scan(Twitter::Regex[:valid_url]) do |all, before, url, protocol, domain, port, path, query|
-        valid_url_match_data = $~
-
-        start_position = valid_url_match_data.char_begin(3)
-        end_position = valid_url_match_data.char_end(3)
-
-        # If protocol is missing and domain contains non-ASCII characters,
-        # extract ASCII-only domains.
-        if !protocol
-          next if !options[:extract_url_without_protocol] || before =~ Twitter::Regex[:invalid_url_without_protocol_preceding_chars]
-          last_url = nil
-          domain.scan(Twitter::Regex[:valid_ascii_domain]) do |ascii_domain|
-            next unless is_valid_domain(url.length, ascii_domain, protocol)
-            last_url = {
-              :url => ascii_domain,
-              :indices => [start_position + $~.char_begin(0),
-                           start_position + $~.char_end(0)]
-            }
-            if path ||
-                ascii_domain =~ Twitter::Regex[:valid_special_short_domain] ||
-                ascii_domain !~ Twitter::Regex[:invalid_short_domain]
-              urls << last_url
+      # Extracts the username username replied to in the Tweet <tt>text</tt>. If the
+      # <tt>text</tt> is <tt>nil</tt> or is not a reply nil will be returned.
+      #
+      # If a block is given then it will be called with the username replied to (if any)
+      def extract_reply_screen_name(text) # :yields: username
+        return nil unless text
+
+        possible_screen_name = text.match(Twitter::TwitterText::Regex[:valid_reply])
+        return unless possible_screen_name.respond_to?(:captures)
+        return if $' =~ Twitter::TwitterText::Regex[:end_mention_match]
+        screen_name = possible_screen_name.captures.first
+        yield screen_name if block_given?
+        screen_name
+      end
+
+      # Extracts a list of all URLs included in the Tweet <tt>text</tt>. If the
+      # <tt>text</tt> is <tt>nil</tt> or contains no URLs an empty array
+      # will be returned.
+      #
+      # If a block is given then it will be called for each URL.
+      def extract_urls(text, &block) # :yields: url
+        urls = extract_urls_with_indices(text).map{|u| u[:url]}
+        urls.each(&block) if block_given?
+        urls
+      end
+
+      # Extracts a list of all URLs included in the Tweet <tt>text</tt> along
+      # with the indices. If the <tt>text</tt> is <tt>nil</tt> or contains no
+      # URLs an empty array will be returned.
+      #
+      # If a block is given then it will be called for each URL.
+      def extract_urls_with_indices(text, options = {:extract_url_without_protocol => true}) # :yields: url, start, end
+        return [] unless text && (options[:extract_url_without_protocol] ? text.index(".") : text.index(":"))
+        urls = []
+
+        text.to_s.scan(Twitter::TwitterText::Regex[:valid_url]) do |all, before, url, protocol, domain, port, path, query|
+          valid_url_match_data = $~
+
+          start_position = valid_url_match_data.char_begin(3)
+          end_position = valid_url_match_data.char_end(3)
+
+          # If protocol is missing and domain contains non-ASCII characters,
+          # extract ASCII-only domains.
+          if !protocol
+            next if !options[:extract_url_without_protocol] || before =~ Twitter::TwitterText::Regex[:invalid_url_without_protocol_preceding_chars]
+            last_url = nil
+            domain.scan(Twitter::TwitterText::Regex[:valid_ascii_domain]) do |ascii_domain|
+              next unless is_valid_domain(url.length, ascii_domain, protocol)
+              last_url = {
+                :url => ascii_domain,
+                :indices => [start_position + $~.char_begin(0),
+                             start_position + $~.char_end(0)]
+              }
+              if path ||
+                  ascii_domain =~ Twitter::TwitterText::Regex[:valid_special_short_domain] ||
+                  ascii_domain !~ Twitter::TwitterText::Regex[:invalid_short_domain]
+                urls << last_url
+              end
             end
-          end

-          # no ASCII-only domain found. Skip the entire URL
-          next unless last_url
+            # no ASCII-only domain found. Skip the entire URL
+            next unless last_url

-          # last_url only contains domain. Need to add path and query if they exist.
-          if path
-            # last_url was not added. Add it to urls here.
-            last_url[:url] = url.sub(domain, last_url[:url])
-            last_url[:indices][1] = end_position
-          end
-        else
-          # In the case of t.co URLs, don't allow additional path characters
-          if url =~ Twitter::Regex[:valid_tco_url]
-            next if $1 && $1.length > MAX_TCO_SLUG_LENGTH
-            url = $&
-            end_position = start_position + url.char_length
-          end
+            # last_url only contains domain. Need to add path and query if they exist.
+            if path
+              # last_url was not added. Add it to urls here.
+              last_url[:url] = url.sub(domain, last_url[:url])
+              last_url[:indices][1] = end_position
+            end
+          else
+            # In the case of t.co URLs, don't allow additional path characters
+            if url =~ Twitter::TwitterText::Regex[:valid_tco_url]
+              next if $1 && $1.length > MAX_TCO_SLUG_LENGTH
+              url = $&
+              end_position = start_position + url.char_length
+            end

-          next unless is_valid_domain(url.length, domain, protocol)
+            next unless is_valid_domain(url.length, domain, protocol)

-          urls << {
-            :url => url,
-            :indices => [start_position, end_position]
-          }
+            urls << {
+              :url => url,
+              :indices => [start_position, end_position]
+            }
+          end
         end
+        urls.each{|url| yield url[:url], url[:indices].first, url[:indices].last} if block_given?
+        urls
       end
-      urls.each{|url| yield url[:url], url[:indices].first, url[:indices].last} if block_given?
-      urls
-    end

-    # Extracts a list of all hashtags included in the Tweet <tt>text</tt>. If the
-    # <tt>text</tt> is <tt>nil</tt> or contains no hashtags an empty array
-    # will be returned. The array returned will not include the leading <tt>#</tt>
-    # character.
-    #
-    # If a block is given then it will be called for each hashtag.
-    def extract_hashtags(text, &block) # :yields: hashtag_text
-      hashtags = extract_hashtags_with_indices(text).map{|h| h[:hashtag]}
-      hashtags.each(&block) if block_given?
-      hashtags
-    end
+      # Extracts a list of all hashtags included in the Tweet <tt>text</tt>. If the
+      # <tt>text</tt> is <tt>nil</tt> or contains no hashtags an empty array
+      # will be returned. The array returned will not include the leading <tt>#</tt>
+      # character.
+      #
+      # If a block is given then it will be called for each hashtag.
+      def extract_hashtags(text, &block) # :yields: hashtag_text
+        hashtags = extract_hashtags_with_indices(text).map{|h| h[:hashtag]}
+        hashtags.each(&block) if block_given?
+        hashtags
+      end

-    # Extracts a list of all hashtags included in the Tweet <tt>text</tt>. If the
-    # <tt>text</tt> is <tt>nil</tt> or contains no hashtags an empty array
-    # will be returned. The array returned will not include the leading <tt>#</tt>
-    # character.
-    #
-    # If a block is given then it will be called for each hashtag.
-    def extract_hashtags_with_indices(text, options = {:check_url_overlap => true}) # :yields: hashtag_text, start, end
-      return [] unless text =~ /[#＃]/
-
-      tags = []
-      text.scan(Twitter::Regex[:valid_hashtag]) do |before, hash, hash_text|
-        match_data = $~
-        start_position = match_data.char_begin(2)
-        end_position = match_data.char_end(3)
-        after = $'
-        unless after =~ Twitter::Regex[:end_hashtag_match]
-          tags << {
-            :hashtag => hash_text,
-            :indices => [start_position, end_position]
-          }
+      # Extracts a list of all hashtags included in the Tweet <tt>text</tt>. If the
+      # <tt>text</tt> is <tt>nil</tt> or contains no hashtags an empty array
+      # will be returned. The array returned will not include the leading <tt>#</tt>
+      # character.
+      #
+      # If a block is given then it will be called for each hashtag.
+      def extract_hashtags_with_indices(text, options = {:check_url_overlap => true}) # :yields: hashtag_text, start, end
+        return [] unless text =~ /[#＃]/
+
+        tags = []
+        text.scan(Twitter::TwitterText::Regex[:valid_hashtag]) do |before, hash, hash_text|
+          match_data = $~
+          start_position = match_data.char_begin(2)
+          end_position = match_data.char_end(3)
+          after = $'
+          unless after =~ Twitter::TwitterText::Regex[:end_hashtag_match]
+            tags << {
+              :hashtag => hash_text,
+              :indices => [start_position, end_position]
+            }
+          end
         end
-      end

-      if options[:check_url_overlap]
-        # extract URLs
-        urls = extract_urls_with_indices(text)
-        unless urls.empty?
-          tags.concat(urls)
-          # remove duplicates
-          tags = remove_overlapping_entities(tags)
-          # remove URL entities
-          tags.reject!{|entity| !entity[:hashtag] }
+        if options[:check_url_overlap]
+          # extract URLs
+          urls = extract_urls_with_indices(text)
+          unless urls.empty?
+            tags.concat(urls)
+            # remove duplicates
+            tags = remove_overlapping_entities(tags)
+            # remove URL entities
+            tags.reject!{|entity| !entity[:hashtag] }
+          end
         end
+
+        tags.each{|tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last} if block_given?
+        tags
       end

-      tags.each{|tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last} if block_given?
-      tags
-    end
+      # Extracts a list of all cashtags included in the Tweet <tt>text</tt>. If the
+      # <tt>text</tt> is <tt>nil</tt> or contains no cashtags an empty array
+      # will be returned. The array returned will not include the leading <tt>$</tt>
+      # character.
+      #
+      # If a block is given then it will be called for each cashtag.
+      def extract_cashtags(text, &block) # :yields: cashtag_text
+        cashtags = extract_cashtags_with_indices(text).map{|h| h[:cashtag]}
+        cashtags.each(&block) if block_given?
+        cashtags
+      end

-    # Extracts a list of all cashtags included in the Tweet <tt>text</tt>. If the
-    # <tt>text</tt> is <tt>nil</tt> or contains no cashtags an empty array
-    # will be returned. The array returned will not include the leading <tt>$</tt>
-    # character.
-    #
-    # If a block is given then it will be called for each cashtag.
-    def extract_cashtags(text, &block) # :yields: cashtag_text
-      cashtags = extract_cashtags_with_indices(text).map{|h| h[:cashtag]}
-      cashtags.each(&block) if block_given?
-      cashtags
-    end
+      # Extracts a list of all cashtags included in the Tweet <tt>text</tt>. If the
+      # <tt>text</tt> is <tt>nil</tt> or contains no cashtags an empty array
+      # will be returned. The array returned will not include the leading <tt>$</tt>
+      # character.
+      #
+      # If a block is given then it will be called for each cashtag.
+      def extract_cashtags_with_indices(text) # :yields: cashtag_text, start, end
+        return [] unless text =~ /\$/
+
+        tags = []
+        text.scan(Twitter::TwitterText::Regex[:valid_cashtag]) do |before, dollar, cash_text|
+          match_data = $~
+          start_position = match_data.char_begin(2)
+          end_position = match_data.char_end(3)
+          tags << {
+            :cashtag => cash_text,
+            :indices => [start_position, end_position]
+          }
+        end

-    # Extracts a list of all cashtags included in the Tweet <tt>text</tt>. If the
-    # <tt>text</tt> is <tt>nil</tt> or contains no cashtags an empty array
-    # will be returned. The array returned will not include the leading <tt>$</tt>
-    # character.
-    #
-    # If a block is given then it will be called for each cashtag.
-    def extract_cashtags_with_indices(text) # :yields: cashtag_text, start, end
-      return [] unless text =~ /\$/
-
-      tags = []
-      text.scan(Twitter::Regex[:valid_cashtag]) do |before, dollar, cash_text|
-        match_data = $~
-        start_position = match_data.char_begin(2)
-        end_position = match_data.char_end(3)
-        tags << {
-          :cashtag => cash_text,
-          :indices => [start_position, end_position]
-        }
+        tags.each{|tag| yield tag[:cashtag], tag[:indices].first, tag[:indices].last} if block_given?
+        tags
       end

-      tags.each{|tag| yield tag[:cashtag], tag[:indices].first, tag[:indices].last} if block_given?
-      tags
-    end
-
-    def is_valid_domain(url_length, domain, protocol)
-      begin
-        raise ArgumentError.new("invalid empty domain") unless domain
-        original_domain_length = domain.length
-        encoded_domain = IDN::Idna.toASCII(domain)
-        updated_domain_length = encoded_domain.length
-        url_length += (updated_domain_length - original_domain_length) if (updated_domain_length > original_domain_length)
-        url_length += URL_PROTOCOL_LENGTH unless protocol
-        url_length <= MAX_URL_LENGTH
-      rescue Exception
-        # On error don't consider this a valid domain.
-        return false
+      def is_valid_domain(url_length, domain, protocol)
+        begin
+          raise ArgumentError.new("invalid empty domain") unless domain
+          original_domain_length = domain.length
+          encoded_domain = IDN::Idna.toASCII(domain)
+          updated_domain_length = encoded_domain.length
+          url_length += (updated_domain_length - original_domain_length) if (updated_domain_length > original_domain_length)
+          url_length += URL_PROTOCOL_LENGTH unless protocol
+          url_length <= MAX_URL_LENGTH
+        rescue Exception
+          # On error don't consider this a valid domain.
+          return false
+        end
       end
     end
   end
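
As in the other files, the extractor hunk is the namespace wrap plus the rewrite of every `Twitter::Regex[...]` lookup to `Twitter::TwitterText::Regex[...]`; the extraction logic itself is untouched. An illustrative sketch of the public entry points shown above under the new namespace (the sample text is invented for the example):

# Illustrative only; expected results follow from the methods shown above.
extractor = Twitter::TwitterText::Extractor

text = "@alice check https://example.com #launch $TWTR"
extractor.extract_mentioned_screen_names(text)  # => ["alice"]
extractor.extract_urls(text)                    # => ["https://example.com"]
extractor.extract_hashtags(text)                # => ["launch"]
extractor.extract_cashtags(text)                # => ["TWTR"]

# The *_with_indices variants also yield character offsets:
extractor.extract_hashtags_with_indices(text) do |hashtag, start_pos, end_pos|
  puts "##{hashtag} at #{start_pos}...#{end_pos}"
end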