twitter-text 2.0.2 → 2.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,53 +1,54 @@
1
1
  # encoding: UTF-8
2
2
 
3
3
  module Twitter
4
- class Configuration
5
- require 'json'
6
-
7
- PARSER_VERSION_CLASSIC = "v1"
8
- PARSER_VERSION_DEFAULT = "v2"
9
-
10
- class << self
11
- attr_accessor :default_configuration
12
- end
13
-
14
- attr_reader :version, :max_weighted_tweet_length, :scale
15
- attr_reader :default_weight, :transformed_url_length, :ranges
16
-
17
- CONFIG_V1 = File.join(
18
- File.expand_path('../../../config', __FILE__), # project root
19
- "#{PARSER_VERSION_CLASSIC}.json"
20
- )
21
-
22
- CONFIG_V2 = File.join(
23
- File.expand_path('../../../config', __FILE__), # project root
24
- "#{PARSER_VERSION_DEFAULT}.json"
25
- )
26
-
27
- def self.parse_string(string, options = {})
28
- JSON.parse(string, options.merge(symbolize_names: true))
4
+ module TwitterText
5
+ class Configuration
6
+ require 'json'
7
+
8
+ PARSER_VERSION_CLASSIC = "v1"
9
+ PARSER_VERSION_DEFAULT = "v2"
10
+
11
+ class << self
12
+ attr_accessor :default_configuration
13
+ end
14
+
15
+ attr_reader :version, :max_weighted_tweet_length, :scale
16
+ attr_reader :default_weight, :transformed_url_length, :ranges
17
+
18
+ CONFIG_V1 = File.join(
19
+ File.expand_path('../../../config', __FILE__), # project root
20
+ "#{PARSER_VERSION_CLASSIC}.json"
21
+ )
22
+
23
+ CONFIG_V2 = File.join(
24
+ File.expand_path('../../../config', __FILE__), # project root
25
+ "#{PARSER_VERSION_DEFAULT}.json"
26
+ )
27
+
28
+ def self.parse_string(string, options = {})
29
+ JSON.parse(string, options.merge(symbolize_names: true))
30
+ end
31
+
32
+ def self.parse_file(filename)
33
+ string = File.open(filename, 'rb') { |f| f.read }
34
+ parse_string(string)
35
+ end
36
+
37
+ def self.configuration_from_file(filename)
38
+ config = parse_file(filename)
39
+ config ? self.new(config) : nil
40
+ end
41
+
42
+ def initialize(config = {})
43
+ @version = config[:version]
44
+ @max_weighted_tweet_length = config[:maxWeightedTweetLength]
45
+ @scale = config[:scale]
46
+ @default_weight = config[:defaultWeight]
47
+ @transformed_url_length = config[:transformedURLLength]
48
+ @ranges = config[:ranges].map { |range| Twitter::TwitterText::WeightedRange.new(range) } if config.key?(:ranges) && config[:ranges].is_a?(Array)
49
+ end
50
+
51
+ self.default_configuration = self.configuration_from_file(CONFIG_V2)
29
52
  end
30
-
31
- def self.parse_file(filename)
32
- string = File.open(filename, 'rb') { |f| f.read }
33
- parse_string(string)
34
- end
35
-
36
- def self.configuration_from_file(filename)
37
- config = parse_file(filename)
38
- config ? Twitter::Configuration.new(config) : nil
39
- end
40
-
41
- def initialize(config = {})
42
- @version = config[:version]
43
- @max_weighted_tweet_length = config[:maxWeightedTweetLength]
44
- @scale = config[:scale]
45
- @default_weight = config[:defaultWeight]
46
- @transformed_url_length = config[:transformedURLLength]
47
- @ranges = config[:ranges].map { |range| Twitter::WeightedRange.new(range) } if config.key?(:ranges) && config[:ranges].is_a?(Array)
48
- end
49
-
50
- self.default_configuration = Twitter::Configuration.configuration_from_file(Twitter::Configuration::CONFIG_V2)
51
53
  end
52
54
  end
53
-
@@ -1,14 +1,16 @@
1
1
  module Twitter
2
- module Deprecation
3
- def deprecate(method, new_method = nil)
4
- deprecated_method = :"deprecated_#{method}"
5
- message = "Deprecation: `#{method}` is deprecated."
6
- message << " Please use `#{new_method}` instead." if new_method
2
+ module TwitterText
3
+ module Deprecation
4
+ def deprecate(method, new_method = nil)
5
+ deprecated_method = :"deprecated_#{method}"
6
+ message = "Deprecation: `#{method}` is deprecated."
7
+ message << " Please use `#{new_method}` instead." if new_method
7
8
 
8
- alias_method(deprecated_method, method)
9
- define_method method do |*args, &block|
10
- warn message unless $TESTING
11
- send(deprecated_method, *args, &block)
9
+ alias_method(deprecated_method, method)
10
+ define_method method do |*args, &block|
11
+ warn message unless $TESTING
12
+ send(deprecated_method, *args, &block)
13
+ end
12
14
  end
13
15
  end
14
16
  end
@@ -45,313 +45,315 @@ class MatchData
45
45
  end
46
46
 
47
47
  module Twitter
48
- # A module for including Tweet parsing in a class. This module provides functions for the extraction and processing
49
- # of usernames, lists, URLs and hashtags.
50
- module Extractor extend self
48
+ module TwitterText
49
+ # A module for including Tweet parsing in a class. This module provides functions for the extraction and processing
50
+ # of usernames, lists, URLs and hashtags.
51
+ module Extractor extend self
51
52
 
52
- # Maximum URL length as defined by Twitter's backend.
53
- MAX_URL_LENGTH = 4096
53
+ # Maximum URL length as defined by Twitter's backend.
54
+ MAX_URL_LENGTH = 4096
54
55
 
55
- # The maximum t.co path length that the Twitter backend supports.
56
- MAX_TCO_SLUG_LENGTH = 40
56
+ # The maximum t.co path length that the Twitter backend supports.
57
+ MAX_TCO_SLUG_LENGTH = 40
57
58
 
58
- URL_PROTOCOL_LENGTH = "https://".length
59
+ URL_PROTOCOL_LENGTH = "https://".length
59
60
 
60
- # Remove overlapping entities.
61
- # This returns a new array with no overlapping entities.
62
- def remove_overlapping_entities(entities)
63
- # sort by start index
64
- entities = entities.sort_by{|entity| entity[:indices].first}
61
+ # Remove overlapping entities.
62
+ # This returns a new array with no overlapping entities.
63
+ def remove_overlapping_entities(entities)
64
+ # sort by start index
65
+ entities = entities.sort_by{|entity| entity[:indices].first}
65
66
 
66
- # remove duplicates
67
- prev = nil
68
- entities.reject!{|entity| (prev && prev[:indices].last > entity[:indices].first) || (prev = entity) && false}
69
- entities
70
- end
71
-
72
- # Extracts all usernames, lists, hashtags and URLs in the Tweet <tt>text</tt>
73
- # along with the indices for where the entity occurred
74
- # If the <tt>text</tt> is <tt>nil</tt> or contains no entity an empty array
75
- # will be returned.
76
- #
77
- # If a block is given then it will be called for each entity.
78
- def extract_entities_with_indices(text, options = {}, &block)
79
- # extract all entities
80
- entities = extract_urls_with_indices(text, options) +
81
- extract_hashtags_with_indices(text, :check_url_overlap => false) +
82
- extract_mentions_or_lists_with_indices(text) +
83
- extract_cashtags_with_indices(text)
84
-
85
- return [] if entities.empty?
86
-
87
- entities = remove_overlapping_entities(entities)
88
-
89
- entities.each(&block) if block_given?
90
- entities
91
- end
92
-
93
- # Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>. If the
94
- # <tt>text</tt> is <tt>nil</tt> or contains no username mentions an empty array
95
- # will be returned.
96
- #
97
- # If a block is given then it will be called for each username.
98
- def extract_mentioned_screen_names(text, &block) # :yields: username
99
- screen_names = extract_mentioned_screen_names_with_indices(text).map{|m| m[:screen_name]}
100
- screen_names.each(&block) if block_given?
101
- screen_names
102
- end
103
-
104
- # Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>
105
- # along with the indices for where the mention occurred. If the
106
- # <tt>text</tt> is nil or contains no username mentions, an empty array
107
- # will be returned.
108
- #
109
- # If a block is given, then it will be called with each username, the start
110
- # index, and the end index in the <tt>text</tt>.
111
- def extract_mentioned_screen_names_with_indices(text) # :yields: username, start, end
112
- return [] unless text
113
-
114
- possible_screen_names = []
115
- extract_mentions_or_lists_with_indices(text) do |screen_name, list_slug, start_position, end_position|
116
- next unless list_slug.empty?
117
- possible_screen_names << {
118
- :screen_name => screen_name,
119
- :indices => [start_position, end_position]
120
- }
67
+ # remove duplicates
68
+ prev = nil
69
+ entities.reject!{|entity| (prev && prev[:indices].last > entity[:indices].first) || (prev = entity) && false}
70
+ entities
121
71
  end
122
72
 
123
- if block_given?
124
- possible_screen_names.each do |mention|
125
- yield mention[:screen_name], mention[:indices].first, mention[:indices].last
126
- end
73
+ # Extracts all usernames, lists, hashtags and URLs in the Tweet <tt>text</tt>
74
+ # along with the indices for where the entity occurred
75
+ # If the <tt>text</tt> is <tt>nil</tt> or contains no entity an empty array
76
+ # will be returned.
77
+ #
78
+ # If a block is given then it will be called for each entity.
79
+ def extract_entities_with_indices(text, options = {}, &block)
80
+ # extract all entities
81
+ entities = extract_urls_with_indices(text, options) +
82
+ extract_hashtags_with_indices(text, :check_url_overlap => false) +
83
+ extract_mentions_or_lists_with_indices(text) +
84
+ extract_cashtags_with_indices(text)
85
+
86
+ return [] if entities.empty?
87
+
88
+ entities = remove_overlapping_entities(entities)
89
+
90
+ entities.each(&block) if block_given?
91
+ entities
127
92
  end
128
93
 
129
- possible_screen_names
130
- end
94
+ # Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>. If the
95
+ # <tt>text</tt> is <tt>nil</tt> or contains no username mentions an empty array
96
+ # will be returned.
97
+ #
98
+ # If a block is given then it will be called for each username.
99
+ def extract_mentioned_screen_names(text, &block) # :yields: username
100
+ screen_names = extract_mentioned_screen_names_with_indices(text).map{|m| m[:screen_name]}
101
+ screen_names.each(&block) if block_given?
102
+ screen_names
103
+ end
131
104
 
132
- # Extracts a list of all usernames or lists mentioned in the Tweet <tt>text</tt>
133
- # along with the indices for where the mention occurred. If the
134
- # <tt>text</tt> is nil or contains no username or list mentions, an empty array
135
- # will be returned.
136
- #
137
- # If a block is given, then it will be called with each username, list slug, the start
138
- # index, and the end index in the <tt>text</tt>. The list_slug will be an empty string
139
- # if this is a username mention.
140
- def extract_mentions_or_lists_with_indices(text) # :yields: username, list_slug, start, end
141
- return [] unless text =~ /[@@]/
142
-
143
- possible_entries = []
144
- text.to_s.scan(Twitter::Regex[:valid_mention_or_list]) do |before, at, screen_name, list_slug|
145
- match_data = $~
146
- after = $'
147
- unless after =~ Twitter::Regex[:end_mention_match]
148
- start_position = match_data.char_begin(3) - 1
149
- end_position = match_data.char_end(list_slug.nil? ? 3 : 4)
150
- possible_entries << {
105
+ # Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>
106
+ # along with the indices for where the mention occurred. If the
107
+ # <tt>text</tt> is nil or contains no username mentions, an empty array
108
+ # will be returned.
109
+ #
110
+ # If a block is given, then it will be called with each username, the start
111
+ # index, and the end index in the <tt>text</tt>.
112
+ def extract_mentioned_screen_names_with_indices(text) # :yields: username, start, end
113
+ return [] unless text
114
+
115
+ possible_screen_names = []
116
+ extract_mentions_or_lists_with_indices(text) do |screen_name, list_slug, start_position, end_position|
117
+ next unless list_slug.empty?
118
+ possible_screen_names << {
151
119
  :screen_name => screen_name,
152
- :list_slug => list_slug || "",
153
120
  :indices => [start_position, end_position]
154
121
  }
155
122
  end
156
- end
157
123
 
158
- if block_given?
159
- possible_entries.each do |mention|
160
- yield mention[:screen_name], mention[:list_slug], mention[:indices].first, mention[:indices].last
124
+ if block_given?
125
+ possible_screen_names.each do |mention|
126
+ yield mention[:screen_name], mention[:indices].first, mention[:indices].last
127
+ end
161
128
  end
129
+
130
+ possible_screen_names
162
131
  end
163
132
 
164
- possible_entries
165
- end
133
+ # Extracts a list of all usernames or lists mentioned in the Tweet <tt>text</tt>
134
+ # along with the indices for where the mention occurred. If the
135
+ # <tt>text</tt> is nil or contains no username or list mentions, an empty array
136
+ # will be returned.
137
+ #
138
+ # If a block is given, then it will be called with each username, list slug, the start
139
+ # index, and the end index in the <tt>text</tt>. The list_slug will be an empty string
140
+ # if this is a username mention.
141
+ def extract_mentions_or_lists_with_indices(text) # :yields: username, list_slug, start, end
142
+ return [] unless text =~ /[@@]/
143
+
144
+ possible_entries = []
145
+ text.to_s.scan(Twitter::TwitterText::Regex[:valid_mention_or_list]) do |before, at, screen_name, list_slug|
146
+ match_data = $~
147
+ after = $'
148
+ unless after =~ Twitter::TwitterText::Regex[:end_mention_match]
149
+ start_position = match_data.char_begin(3) - 1
150
+ end_position = match_data.char_end(list_slug.nil? ? 3 : 4)
151
+ possible_entries << {
152
+ :screen_name => screen_name,
153
+ :list_slug => list_slug || "",
154
+ :indices => [start_position, end_position]
155
+ }
156
+ end
157
+ end
166
158
 
167
- # Extracts the username replied to in the Tweet <tt>text</tt>. If the
168
- # <tt>text</tt> is <tt>nil</tt> or is not a reply nil will be returned.
169
- #
170
- # If a block is given then it will be called with the username replied to (if any)
171
- def extract_reply_screen_name(text) # :yields: username
172
- return nil unless text
173
-
174
- possible_screen_name = text.match(Twitter::Regex[:valid_reply])
175
- return unless possible_screen_name.respond_to?(:captures)
176
- return if $' =~ Twitter::Regex[:end_mention_match]
177
- screen_name = possible_screen_name.captures.first
178
- yield screen_name if block_given?
179
- screen_name
180
- end
159
+ if block_given?
160
+ possible_entries.each do |mention|
161
+ yield mention[:screen_name], mention[:list_slug], mention[:indices].first, mention[:indices].last
162
+ end
163
+ end
181
164
 
182
- # Extracts a list of all URLs included in the Tweet <tt>text</tt>. If the
183
- # <tt>text</tt> is <tt>nil</tt> or contains no URLs an empty array
184
- # will be returned.
185
- #
186
- # If a block is given then it will be called for each URL.
187
- def extract_urls(text, &block) # :yields: url
188
- urls = extract_urls_with_indices(text).map{|u| u[:url]}
189
- urls.each(&block) if block_given?
190
- urls
191
- end
165
+ possible_entries
166
+ end
192
167
 
193
- # Extracts a list of all URLs included in the Tweet <tt>text</tt> along
194
- # with the indices. If the <tt>text</tt> is <tt>nil</tt> or contains no
195
- # URLs an empty array will be returned.
196
- #
197
- # If a block is given then it will be called for each URL.
198
- def extract_urls_with_indices(text, options = {:extract_url_without_protocol => true}) # :yields: url, start, end
199
- return [] unless text && (options[:extract_url_without_protocol] ? text.index(".") : text.index(":"))
200
- urls = []
201
-
202
- text.to_s.scan(Twitter::Regex[:valid_url]) do |all, before, url, protocol, domain, port, path, query|
203
- valid_url_match_data = $~
204
-
205
- start_position = valid_url_match_data.char_begin(3)
206
- end_position = valid_url_match_data.char_end(3)
207
-
208
- # If protocol is missing and domain contains non-ASCII characters,
209
- # extract ASCII-only domains.
210
- if !protocol
211
- next if !options[:extract_url_without_protocol] || before =~ Twitter::Regex[:invalid_url_without_protocol_preceding_chars]
212
- last_url = nil
213
- domain.scan(Twitter::Regex[:valid_ascii_domain]) do |ascii_domain|
214
- next unless is_valid_domain(url.length, ascii_domain, protocol)
215
- last_url = {
216
- :url => ascii_domain,
217
- :indices => [start_position + $~.char_begin(0),
218
- start_position + $~.char_end(0)]
219
- }
220
- if path ||
221
- ascii_domain =~ Twitter::Regex[:valid_special_short_domain] ||
222
- ascii_domain !~ Twitter::Regex[:invalid_short_domain]
223
- urls << last_url
168
+ # Extracts the username replied to in the Tweet <tt>text</tt>. If the
169
+ # <tt>text</tt> is <tt>nil</tt> or is not a reply nil will be returned.
170
+ #
171
+ # If a block is given then it will be called with the username replied to (if any)
172
+ def extract_reply_screen_name(text) # :yields: username
173
+ return nil unless text
174
+
175
+ possible_screen_name = text.match(Twitter::TwitterText::Regex[:valid_reply])
176
+ return unless possible_screen_name.respond_to?(:captures)
177
+ return if $' =~ Twitter::TwitterText::Regex[:end_mention_match]
178
+ screen_name = possible_screen_name.captures.first
179
+ yield screen_name if block_given?
180
+ screen_name
181
+ end
182
+
183
+ # Extracts a list of all URLs included in the Tweet <tt>text</tt>. If the
184
+ # <tt>text</tt> is <tt>nil</tt> or contains no URLs an empty array
185
+ # will be returned.
186
+ #
187
+ # If a block is given then it will be called for each URL.
188
+ def extract_urls(text, &block) # :yields: url
189
+ urls = extract_urls_with_indices(text).map{|u| u[:url]}
190
+ urls.each(&block) if block_given?
191
+ urls
192
+ end
193
+
194
+ # Extracts a list of all URLs included in the Tweet <tt>text</tt> along
195
+ # with the indices. If the <tt>text</tt> is <tt>nil</tt> or contains no
196
+ # URLs an empty array will be returned.
197
+ #
198
+ # If a block is given then it will be called for each URL.
199
+ def extract_urls_with_indices(text, options = {:extract_url_without_protocol => true}) # :yields: url, start, end
200
+ return [] unless text && (options[:extract_url_without_protocol] ? text.index(".") : text.index(":"))
201
+ urls = []
202
+
203
+ text.to_s.scan(Twitter::TwitterText::Regex[:valid_url]) do |all, before, url, protocol, domain, port, path, query|
204
+ valid_url_match_data = $~
205
+
206
+ start_position = valid_url_match_data.char_begin(3)
207
+ end_position = valid_url_match_data.char_end(3)
208
+
209
+ # If protocol is missing and domain contains non-ASCII characters,
210
+ # extract ASCII-only domains.
211
+ if !protocol
212
+ next if !options[:extract_url_without_protocol] || before =~ Twitter::TwitterText::Regex[:invalid_url_without_protocol_preceding_chars]
213
+ last_url = nil
214
+ domain.scan(Twitter::TwitterText::Regex[:valid_ascii_domain]) do |ascii_domain|
215
+ next unless is_valid_domain(url.length, ascii_domain, protocol)
216
+ last_url = {
217
+ :url => ascii_domain,
218
+ :indices => [start_position + $~.char_begin(0),
219
+ start_position + $~.char_end(0)]
220
+ }
221
+ if path ||
222
+ ascii_domain =~ Twitter::TwitterText::Regex[:valid_special_short_domain] ||
223
+ ascii_domain !~ Twitter::TwitterText::Regex[:invalid_short_domain]
224
+ urls << last_url
225
+ end
224
226
  end
225
- end
226
227
 
227
- # no ASCII-only domain found. Skip the entire URL
228
- next unless last_url
228
+ # no ASCII-only domain found. Skip the entire URL
229
+ next unless last_url
229
230
 
230
- # last_url only contains domain. Need to add path and query if they exist.
231
- if path
232
- # last_url was not added. Add it to urls here.
233
- last_url[:url] = url.sub(domain, last_url[:url])
234
- last_url[:indices][1] = end_position
235
- end
236
- else
237
- # In the case of t.co URLs, don't allow additional path characters
238
- if url =~ Twitter::Regex[:valid_tco_url]
239
- next if $1 && $1.length > MAX_TCO_SLUG_LENGTH
240
- url = $&
241
- end_position = start_position + url.char_length
242
- end
231
+ # last_url only contains domain. Need to add path and query if they exist.
232
+ if path
233
+ # last_url was not added. Add it to urls here.
234
+ last_url[:url] = url.sub(domain, last_url[:url])
235
+ last_url[:indices][1] = end_position
236
+ end
237
+ else
238
+ # In the case of t.co URLs, don't allow additional path characters
239
+ if url =~ Twitter::TwitterText::Regex[:valid_tco_url]
240
+ next if $1 && $1.length > MAX_TCO_SLUG_LENGTH
241
+ url = $&
242
+ end_position = start_position + url.char_length
243
+ end
243
244
 
244
- next unless is_valid_domain(url.length, domain, protocol)
245
+ next unless is_valid_domain(url.length, domain, protocol)
245
246
 
246
- urls << {
247
- :url => url,
248
- :indices => [start_position, end_position]
249
- }
247
+ urls << {
248
+ :url => url,
249
+ :indices => [start_position, end_position]
250
+ }
251
+ end
250
252
  end
253
+ urls.each{|url| yield url[:url], url[:indices].first, url[:indices].last} if block_given?
254
+ urls
251
255
  end
252
- urls.each{|url| yield url[:url], url[:indices].first, url[:indices].last} if block_given?
253
- urls
254
- end
255
256
 
256
- # Extracts a list of all hashtags included in the Tweet <tt>text</tt>. If the
257
- # <tt>text</tt> is <tt>nil</tt> or contains no hashtags an empty array
258
- # will be returned. The array returned will not include the leading <tt>#</tt>
259
- # character.
260
- #
261
- # If a block is given then it will be called for each hashtag.
262
- def extract_hashtags(text, &block) # :yields: hashtag_text
263
- hashtags = extract_hashtags_with_indices(text).map{|h| h[:hashtag]}
264
- hashtags.each(&block) if block_given?
265
- hashtags
266
- end
257
+ # Extracts a list of all hashtags included in the Tweet <tt>text</tt>. If the
258
+ # <tt>text</tt> is <tt>nil</tt> or contains no hashtags an empty array
259
+ # will be returned. The array returned will not include the leading <tt>#</tt>
260
+ # character.
261
+ #
262
+ # If a block is given then it will be called for each hashtag.
263
+ def extract_hashtags(text, &block) # :yields: hashtag_text
264
+ hashtags = extract_hashtags_with_indices(text).map{|h| h[:hashtag]}
265
+ hashtags.each(&block) if block_given?
266
+ hashtags
267
+ end
267
268
 
268
- # Extracts a list of all hashtags included in the Tweet <tt>text</tt>. If the
269
- # <tt>text</tt> is <tt>nil</tt> or contains no hashtags an empty array
270
- # will be returned. The array returned will not include the leading <tt>#</tt>
271
- # character.
272
- #
273
- # If a block is given then it will be called for each hashtag.
274
- def extract_hashtags_with_indices(text, options = {:check_url_overlap => true}) # :yields: hashtag_text, start, end
275
- return [] unless text =~ /[##]/
276
-
277
- tags = []
278
- text.scan(Twitter::Regex[:valid_hashtag]) do |before, hash, hash_text|
279
- match_data = $~
280
- start_position = match_data.char_begin(2)
281
- end_position = match_data.char_end(3)
282
- after = $'
283
- unless after =~ Twitter::Regex[:end_hashtag_match]
284
- tags << {
285
- :hashtag => hash_text,
286
- :indices => [start_position, end_position]
287
- }
269
+ # Extracts a list of all hashtags included in the Tweet <tt>text</tt>. If the
270
+ # <tt>text</tt> is <tt>nil</tt> or contains no hashtags an empty array
271
+ # will be returned. The array returned will not include the leading <tt>#</tt>
272
+ # character.
273
+ #
274
+ # If a block is given then it will be called for each hashtag.
275
+ def extract_hashtags_with_indices(text, options = {:check_url_overlap => true}) # :yields: hashtag_text, start, end
276
+ return [] unless text =~ /[##]/
277
+
278
+ tags = []
279
+ text.scan(Twitter::TwitterText::Regex[:valid_hashtag]) do |before, hash, hash_text|
280
+ match_data = $~
281
+ start_position = match_data.char_begin(2)
282
+ end_position = match_data.char_end(3)
283
+ after = $'
284
+ unless after =~ Twitter::TwitterText::Regex[:end_hashtag_match]
285
+ tags << {
286
+ :hashtag => hash_text,
287
+ :indices => [start_position, end_position]
288
+ }
289
+ end
288
290
  end
289
- end
290
291
 
291
- if options[:check_url_overlap]
292
- # extract URLs
293
- urls = extract_urls_with_indices(text)
294
- unless urls.empty?
295
- tags.concat(urls)
296
- # remove duplicates
297
- tags = remove_overlapping_entities(tags)
298
- # remove URL entities
299
- tags.reject!{|entity| !entity[:hashtag] }
292
+ if options[:check_url_overlap]
293
+ # extract URLs
294
+ urls = extract_urls_with_indices(text)
295
+ unless urls.empty?
296
+ tags.concat(urls)
297
+ # remove duplicates
298
+ tags = remove_overlapping_entities(tags)
299
+ # remove URL entities
300
+ tags.reject!{|entity| !entity[:hashtag] }
301
+ end
300
302
  end
303
+
304
+ tags.each{|tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last} if block_given?
305
+ tags
301
306
  end
302
307
 
303
- tags.each{|tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last} if block_given?
304
- tags
305
- end
308
+ # Extracts a list of all cashtags included in the Tweet <tt>text</tt>. If the
309
+ # <tt>text</tt> is <tt>nil</tt> or contains no cashtags an empty array
310
+ # will be returned. The array returned will not include the leading <tt>$</tt>
311
+ # character.
312
+ #
313
+ # If a block is given then it will be called for each cashtag.
314
+ def extract_cashtags(text, &block) # :yields: cashtag_text
315
+ cashtags = extract_cashtags_with_indices(text).map{|h| h[:cashtag]}
316
+ cashtags.each(&block) if block_given?
317
+ cashtags
318
+ end
306
319
 
307
- # Extracts a list of all cashtags included in the Tweet <tt>text</tt>. If the
308
- # <tt>text</tt> is <tt>nil</tt> or contains no cashtags an empty array
309
- # will be returned. The array returned will not include the leading <tt>$</tt>
310
- # character.
311
- #
312
- # If a block is given then it will be called for each cashtag.
313
- def extract_cashtags(text, &block) # :yields: cashtag_text
314
- cashtags = extract_cashtags_with_indices(text).map{|h| h[:cashtag]}
315
- cashtags.each(&block) if block_given?
316
- cashtags
317
- end
320
+ # Extracts a list of all cashtags included in the Tweet <tt>text</tt>. If the
321
+ # <tt>text</tt> is <tt>nil</tt> or contains no cashtags an empty array
322
+ # will be returned. The array returned will not include the leading <tt>$</tt>
323
+ # character.
324
+ #
325
+ # If a block is given then it will be called for each cashtag.
326
+ def extract_cashtags_with_indices(text) # :yields: cashtag_text, start, end
327
+ return [] unless text =~ /\$/
328
+
329
+ tags = []
330
+ text.scan(Twitter::TwitterText::Regex[:valid_cashtag]) do |before, dollar, cash_text|
331
+ match_data = $~
332
+ start_position = match_data.char_begin(2)
333
+ end_position = match_data.char_end(3)
334
+ tags << {
335
+ :cashtag => cash_text,
336
+ :indices => [start_position, end_position]
337
+ }
338
+ end
318
339
 
319
- # Extracts a list of all cashtags included in the Tweet <tt>text</tt>. If the
320
- # <tt>text</tt> is <tt>nil</tt> or contains no cashtags an empty array
321
- # will be returned. The array returned will not include the leading <tt>$</tt>
322
- # character.
323
- #
324
- # If a block is given then it will be called for each cashtag.
325
- def extract_cashtags_with_indices(text) # :yields: cashtag_text, start, end
326
- return [] unless text =~ /\$/
327
-
328
- tags = []
329
- text.scan(Twitter::Regex[:valid_cashtag]) do |before, dollar, cash_text|
330
- match_data = $~
331
- start_position = match_data.char_begin(2)
332
- end_position = match_data.char_end(3)
333
- tags << {
334
- :cashtag => cash_text,
335
- :indices => [start_position, end_position]
336
- }
340
+ tags.each{|tag| yield tag[:cashtag], tag[:indices].first, tag[:indices].last} if block_given?
341
+ tags
337
342
  end
338
343
 
339
- tags.each{|tag| yield tag[:cashtag], tag[:indices].first, tag[:indices].last} if block_given?
340
- tags
341
- end
342
-
343
- def is_valid_domain(url_length, domain, protocol)
344
- begin
345
- raise ArgumentError.new("invalid empty domain") unless domain
346
- original_domain_length = domain.length
347
- encoded_domain = IDN::Idna.toASCII(domain)
348
- updated_domain_length = encoded_domain.length
349
- url_length += (updated_domain_length - original_domain_length) if (updated_domain_length > original_domain_length)
350
- url_length += URL_PROTOCOL_LENGTH unless protocol
351
- url_length <= MAX_URL_LENGTH
352
- rescue Exception
353
- # On error don't consider this a valid domain.
354
- return false
344
+ def is_valid_domain(url_length, domain, protocol)
345
+ begin
346
+ raise ArgumentError.new("invalid empty domain") unless domain
347
+ original_domain_length = domain.length
348
+ encoded_domain = IDN::Idna.toASCII(domain)
349
+ updated_domain_length = encoded_domain.length
350
+ url_length += (updated_domain_length - original_domain_length) if (updated_domain_length > original_domain_length)
351
+ url_length += URL_PROTOCOL_LENGTH unless protocol
352
+ url_length <= MAX_URL_LENGTH
353
+ rescue Exception
354
+ # On error don't consider this a valid domain.
355
+ return false
356
+ end
355
357
  end
356
358
  end
357
359
  end