twitter-text 2.0.2 → 2.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +25 -0
- data/README.md +5 -5
- data/lib/twitter-text/autolink.rb +386 -385
- data/lib/twitter-text/configuration.rb +48 -47
- data/lib/twitter-text/deprecation.rb +11 -9
- data/lib/twitter-text/extractor.rb +270 -268
- data/lib/twitter-text/hash_helper.rb +17 -15
- data/lib/twitter-text/hit_highlighter.rb +69 -67
- data/lib/twitter-text/regex.rb +342 -340
- data/lib/twitter-text/rewriter.rb +51 -49
- data/lib/twitter-text/unicode.rb +21 -20
- data/lib/twitter-text/validation.rb +185 -183
- data/lib/twitter-text/weighted_range.rb +12 -10
- data/spec/autolinking_spec.rb +2 -2
- data/spec/configuration_spec.rb +11 -11
- data/spec/extractor_spec.rb +6 -6
- data/spec/hithighlighter_spec.rb +2 -2
- data/spec/regex_spec.rb +3 -3
- data/spec/rewriter_spec.rb +7 -7
- data/spec/spec_helper.rb +2 -2
- data/spec/unicode_spec.rb +11 -11
- data/spec/validation_spec.rb +7 -7
- data/test/conformance_test.rb +4 -4
- data/twitter-text.gemspec +1 -1
- metadata +3 -2
@@ -1,53 +1,54 @@
|
|
1
1
|
# encoding: UTF-8
|
2
2
|
|
3
3
|
module Twitter
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
File.
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
File.
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
4
|
+
module TwitterText
|
5
|
+
class Configuration
|
6
|
+
require 'json'
|
7
|
+
|
8
|
+
PARSER_VERSION_CLASSIC = "v1"
|
9
|
+
PARSER_VERSION_DEFAULT = "v2"
|
10
|
+
|
11
|
+
class << self
|
12
|
+
attr_accessor :default_configuration
|
13
|
+
end
|
14
|
+
|
15
|
+
attr_reader :version, :max_weighted_tweet_length, :scale
|
16
|
+
attr_reader :default_weight, :transformed_url_length, :ranges
|
17
|
+
|
18
|
+
CONFIG_V1 = File.join(
|
19
|
+
File.expand_path('../../../config', __FILE__), # project root
|
20
|
+
"#{PARSER_VERSION_CLASSIC}.json"
|
21
|
+
)
|
22
|
+
|
23
|
+
CONFIG_V2 = File.join(
|
24
|
+
File.expand_path('../../../config', __FILE__), # project root
|
25
|
+
"#{PARSER_VERSION_DEFAULT}.json"
|
26
|
+
)
|
27
|
+
|
28
|
+
def self.parse_string(string, options = {})
|
29
|
+
JSON.parse(string, options.merge(symbolize_names: true))
|
30
|
+
end
|
31
|
+
|
32
|
+
def self.parse_file(filename)
|
33
|
+
string = File.open(filename, 'rb') { |f| f.read }
|
34
|
+
parse_string(string)
|
35
|
+
end
|
36
|
+
|
37
|
+
def self.configuration_from_file(filename)
|
38
|
+
config = parse_file(filename)
|
39
|
+
config ? self.new(config) : nil
|
40
|
+
end
|
41
|
+
|
42
|
+
def initialize(config = {})
|
43
|
+
@version = config[:version]
|
44
|
+
@max_weighted_tweet_length = config[:maxWeightedTweetLength]
|
45
|
+
@scale = config[:scale]
|
46
|
+
@default_weight = config[:defaultWeight]
|
47
|
+
@transformed_url_length = config[:transformedURLLength]
|
48
|
+
@ranges = config[:ranges].map { |range| Twitter::TwitterText::WeightedRange.new(range) } if config.key?(:ranges) && config[:ranges].is_a?(Array)
|
49
|
+
end
|
50
|
+
|
51
|
+
self.default_configuration = self.configuration_from_file(CONFIG_V2)
|
29
52
|
end
|
30
|
-
|
31
|
-
def self.parse_file(filename)
|
32
|
-
string = File.open(filename, 'rb') { |f| f.read }
|
33
|
-
parse_string(string)
|
34
|
-
end
|
35
|
-
|
36
|
-
def self.configuration_from_file(filename)
|
37
|
-
config = parse_file(filename)
|
38
|
-
config ? Twitter::Configuration.new(config) : nil
|
39
|
-
end
|
40
|
-
|
41
|
-
def initialize(config = {})
|
42
|
-
@version = config[:version]
|
43
|
-
@max_weighted_tweet_length = config[:maxWeightedTweetLength]
|
44
|
-
@scale = config[:scale]
|
45
|
-
@default_weight = config[:defaultWeight]
|
46
|
-
@transformed_url_length = config[:transformedURLLength]
|
47
|
-
@ranges = config[:ranges].map { |range| Twitter::WeightedRange.new(range) } if config.key?(:ranges) && config[:ranges].is_a?(Array)
|
48
|
-
end
|
49
|
-
|
50
|
-
self.default_configuration = Twitter::Configuration.configuration_from_file(Twitter::Configuration::CONFIG_V2)
|
51
53
|
end
|
52
54
|
end
|
53
|
-
|
@@ -1,14 +1,16 @@
|
|
1
1
|
module Twitter
|
2
|
-
module
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
2
|
+
module TwitterText
|
3
|
+
module Deprecation
|
4
|
+
def deprecate(method, new_method = nil)
|
5
|
+
deprecated_method = :"deprecated_#{method}"
|
6
|
+
message = "Deprecation: `#{method}` is deprecated."
|
7
|
+
message << " Please use `#{new_method}` instead." if new_method
|
7
8
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
alias_method(deprecated_method, method)
|
10
|
+
define_method method do |*args, &block|
|
11
|
+
warn message unless $TESTING
|
12
|
+
send(deprecated_method, *args, &block)
|
13
|
+
end
|
12
14
|
end
|
13
15
|
end
|
14
16
|
end
|
@@ -45,313 +45,315 @@ class MatchData
|
|
45
45
|
end
|
46
46
|
|
47
47
|
module Twitter
|
48
|
-
|
49
|
-
|
50
|
-
|
48
|
+
module TwitterText
|
49
|
+
# A module for including Tweet parsing in a class. This module provides functions for the extraction and processing
|
50
|
+
# of usernames, lists, URLs and hashtags.
|
51
|
+
module Extractor extend self
|
51
52
|
|
52
|
-
|
53
|
-
|
53
|
+
# Maximum URL length as defined by Twitter's backend.
|
54
|
+
MAX_URL_LENGTH = 4096
|
54
55
|
|
55
|
-
|
56
|
-
|
56
|
+
# The maximum t.co path length that the Twitter backend supports.
|
57
|
+
MAX_TCO_SLUG_LENGTH = 40
|
57
58
|
|
58
|
-
|
59
|
+
URL_PROTOCOL_LENGTH = "https://".length
|
59
60
|
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
61
|
+
# Remove overlapping entities.
|
62
|
+
# This returns a new array with no overlapping entities.
|
63
|
+
def remove_overlapping_entities(entities)
|
64
|
+
# sort by start index
|
65
|
+
entities = entities.sort_by{|entity| entity[:indices].first}
|
65
66
|
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
end
|
71
|
-
|
72
|
-
# Extracts all usernames, lists, hashtags and URLs in the Tweet <tt>text</tt>
|
73
|
-
# along with the indices for where the entity occurred
|
74
|
-
# If the <tt>text</tt> is <tt>nil</tt> or contains no entity an empty array
|
75
|
-
# will be returned.
|
76
|
-
#
|
77
|
-
# If a block is given then it will be called for each entity.
|
78
|
-
def extract_entities_with_indices(text, options = {}, &block)
|
79
|
-
# extract all entities
|
80
|
-
entities = extract_urls_with_indices(text, options) +
|
81
|
-
extract_hashtags_with_indices(text, :check_url_overlap => false) +
|
82
|
-
extract_mentions_or_lists_with_indices(text) +
|
83
|
-
extract_cashtags_with_indices(text)
|
84
|
-
|
85
|
-
return [] if entities.empty?
|
86
|
-
|
87
|
-
entities = remove_overlapping_entities(entities)
|
88
|
-
|
89
|
-
entities.each(&block) if block_given?
|
90
|
-
entities
|
91
|
-
end
|
92
|
-
|
93
|
-
# Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>. If the
|
94
|
-
# <tt>text</tt> is <tt>nil</tt> or contains no username mentions an empty array
|
95
|
-
# will be returned.
|
96
|
-
#
|
97
|
-
# If a block is given then it will be called for each username.
|
98
|
-
def extract_mentioned_screen_names(text, &block) # :yields: username
|
99
|
-
screen_names = extract_mentioned_screen_names_with_indices(text).map{|m| m[:screen_name]}
|
100
|
-
screen_names.each(&block) if block_given?
|
101
|
-
screen_names
|
102
|
-
end
|
103
|
-
|
104
|
-
# Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>
|
105
|
-
# along with the indices for where the mention occurred. If the
|
106
|
-
# <tt>text</tt> is nil or contains no username mentions, an empty array
|
107
|
-
# will be returned.
|
108
|
-
#
|
109
|
-
# If a block is given, then it will be called with each username, the start
|
110
|
-
# index, and the end index in the <tt>text</tt>.
|
111
|
-
def extract_mentioned_screen_names_with_indices(text) # :yields: username, start, end
|
112
|
-
return [] unless text
|
113
|
-
|
114
|
-
possible_screen_names = []
|
115
|
-
extract_mentions_or_lists_with_indices(text) do |screen_name, list_slug, start_position, end_position|
|
116
|
-
next unless list_slug.empty?
|
117
|
-
possible_screen_names << {
|
118
|
-
:screen_name => screen_name,
|
119
|
-
:indices => [start_position, end_position]
|
120
|
-
}
|
67
|
+
# remove duplicates
|
68
|
+
prev = nil
|
69
|
+
entities.reject!{|entity| (prev && prev[:indices].last > entity[:indices].first) || (prev = entity) && false}
|
70
|
+
entities
|
121
71
|
end
|
122
72
|
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
73
|
+
# Extracts all usernames, lists, hashtags and URLs in the Tweet <tt>text</tt>
|
74
|
+
# along with the indices for where the entity occurred
|
75
|
+
# If the <tt>text</tt> is <tt>nil</tt> or contains no entity an empty array
|
76
|
+
# will be returned.
|
77
|
+
#
|
78
|
+
# If a block is given then it will be called for each entity.
|
79
|
+
def extract_entities_with_indices(text, options = {}, &block)
|
80
|
+
# extract all entities
|
81
|
+
entities = extract_urls_with_indices(text, options) +
|
82
|
+
extract_hashtags_with_indices(text, :check_url_overlap => false) +
|
83
|
+
extract_mentions_or_lists_with_indices(text) +
|
84
|
+
extract_cashtags_with_indices(text)
|
85
|
+
|
86
|
+
return [] if entities.empty?
|
87
|
+
|
88
|
+
entities = remove_overlapping_entities(entities)
|
89
|
+
|
90
|
+
entities.each(&block) if block_given?
|
91
|
+
entities
|
127
92
|
end
|
128
93
|
|
129
|
-
|
130
|
-
|
94
|
+
# Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>. If the
|
95
|
+
# <tt>text</tt> is <tt>nil</tt> or contains no username mentions an empty array
|
96
|
+
# will be returned.
|
97
|
+
#
|
98
|
+
# If a block is given then it will be called for each username.
|
99
|
+
def extract_mentioned_screen_names(text, &block) # :yields: username
|
100
|
+
screen_names = extract_mentioned_screen_names_with_indices(text).map{|m| m[:screen_name]}
|
101
|
+
screen_names.each(&block) if block_given?
|
102
|
+
screen_names
|
103
|
+
end
|
131
104
|
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
after = $'
|
147
|
-
unless after =~ Twitter::Regex[:end_mention_match]
|
148
|
-
start_position = match_data.char_begin(3) - 1
|
149
|
-
end_position = match_data.char_end(list_slug.nil? ? 3 : 4)
|
150
|
-
possible_entries << {
|
105
|
+
# Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>
|
106
|
+
# along with the indices for where the mention occurred. If the
|
107
|
+
# <tt>text</tt> is nil or contains no username mentions, an empty array
|
108
|
+
# will be returned.
|
109
|
+
#
|
110
|
+
# If a block is given, then it will be called with each username, the start
|
111
|
+
# index, and the end index in the <tt>text</tt>.
|
112
|
+
def extract_mentioned_screen_names_with_indices(text) # :yields: username, start, end
|
113
|
+
return [] unless text
|
114
|
+
|
115
|
+
possible_screen_names = []
|
116
|
+
extract_mentions_or_lists_with_indices(text) do |screen_name, list_slug, start_position, end_position|
|
117
|
+
next unless list_slug.empty?
|
118
|
+
possible_screen_names << {
|
151
119
|
:screen_name => screen_name,
|
152
|
-
:list_slug => list_slug || "",
|
153
120
|
:indices => [start_position, end_position]
|
154
121
|
}
|
155
122
|
end
|
156
|
-
end
|
157
123
|
|
158
|
-
|
159
|
-
|
160
|
-
|
124
|
+
if block_given?
|
125
|
+
possible_screen_names.each do |mention|
|
126
|
+
yield mention[:screen_name], mention[:indices].first, mention[:indices].last
|
127
|
+
end
|
161
128
|
end
|
129
|
+
|
130
|
+
possible_screen_names
|
162
131
|
end
|
163
132
|
|
164
|
-
|
165
|
-
|
133
|
+
# Extracts a list of all usernames or lists mentioned in the Tweet <tt>text</tt>
|
134
|
+
# along with the indices for where the mention occurred. If the
|
135
|
+
# <tt>text</tt> is nil or contains no username or list mentions, an empty array
|
136
|
+
# will be returned.
|
137
|
+
#
|
138
|
+
# If a block is given, then it will be called with each username, list slug, the start
|
139
|
+
# index, and the end index in the <tt>text</tt>. The list_slug will be an empty string
|
140
|
+
# if this is a username mention.
|
141
|
+
def extract_mentions_or_lists_with_indices(text) # :yields: username, list_slug, start, end
|
142
|
+
return [] unless text =~ /[@＠]/
|
143
|
+
|
144
|
+
possible_entries = []
|
145
|
+
text.to_s.scan(Twitter::TwitterText::Regex[:valid_mention_or_list]) do |before, at, screen_name, list_slug|
|
146
|
+
match_data = $~
|
147
|
+
after = $'
|
148
|
+
unless after =~ Twitter::TwitterText::Regex[:end_mention_match]
|
149
|
+
start_position = match_data.char_begin(3) - 1
|
150
|
+
end_position = match_data.char_end(list_slug.nil? ? 3 : 4)
|
151
|
+
possible_entries << {
|
152
|
+
:screen_name => screen_name,
|
153
|
+
:list_slug => list_slug || "",
|
154
|
+
:indices => [start_position, end_position]
|
155
|
+
}
|
156
|
+
end
|
157
|
+
end
|
166
158
|
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
return nil unless text
|
173
|
-
|
174
|
-
possible_screen_name = text.match(Twitter::Regex[:valid_reply])
|
175
|
-
return unless possible_screen_name.respond_to?(:captures)
|
176
|
-
return if $' =~ Twitter::Regex[:end_mention_match]
|
177
|
-
screen_name = possible_screen_name.captures.first
|
178
|
-
yield screen_name if block_given?
|
179
|
-
screen_name
|
180
|
-
end
|
159
|
+
if block_given?
|
160
|
+
possible_entries.each do |mention|
|
161
|
+
yield mention[:screen_name], mention[:list_slug], mention[:indices].first, mention[:indices].last
|
162
|
+
end
|
163
|
+
end
|
181
164
|
|
182
|
-
|
183
|
-
|
184
|
-
# will be returned.
|
185
|
-
#
|
186
|
-
# If a block is given then it will be called for each URL.
|
187
|
-
def extract_urls(text, &block) # :yields: url
|
188
|
-
urls = extract_urls_with_indices(text).map{|u| u[:url]}
|
189
|
-
urls.each(&block) if block_given?
|
190
|
-
urls
|
191
|
-
end
|
165
|
+
possible_entries
|
166
|
+
end
|
192
167
|
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
168
|
+
# Extracts the username replied to in the Tweet <tt>text</tt>. If the
|
169
|
+
# <tt>text</tt> is <tt>nil</tt> or is not a reply, nil will be returned.
|
170
|
+
#
|
171
|
+
# If a block is given then it will be called with the username replied to (if any)
|
172
|
+
def extract_reply_screen_name(text) # :yields: username
|
173
|
+
return nil unless text
|
174
|
+
|
175
|
+
possible_screen_name = text.match(Twitter::TwitterText::Regex[:valid_reply])
|
176
|
+
return unless possible_screen_name.respond_to?(:captures)
|
177
|
+
return if $' =~ Twitter::TwitterText::Regex[:end_mention_match]
|
178
|
+
screen_name = possible_screen_name.captures.first
|
179
|
+
yield screen_name if block_given?
|
180
|
+
screen_name
|
181
|
+
end
|
182
|
+
|
183
|
+
# Extracts a list of all URLs included in the Tweet <tt>text</tt>. If the
|
184
|
+
# <tt>text</tt> is <tt>nil</tt> or contains no URLs an empty array
|
185
|
+
# will be returned.
|
186
|
+
#
|
187
|
+
# If a block is given then it will be called for each URL.
|
188
|
+
def extract_urls(text, &block) # :yields: url
|
189
|
+
urls = extract_urls_with_indices(text).map{|u| u[:url]}
|
190
|
+
urls.each(&block) if block_given?
|
191
|
+
urls
|
192
|
+
end
|
193
|
+
|
194
|
+
# Extracts a list of all URLs included in the Tweet <tt>text</tt> along
|
195
|
+
# with the indices. If the <tt>text</tt> is <tt>nil</tt> or contains no
|
196
|
+
# URLs an empty array will be returned.
|
197
|
+
#
|
198
|
+
# If a block is given then it will be called for each URL.
|
199
|
+
def extract_urls_with_indices(text, options = {:extract_url_without_protocol => true}) # :yields: url, start, end
|
200
|
+
return [] unless text && (options[:extract_url_without_protocol] ? text.index(".") : text.index(":"))
|
201
|
+
urls = []
|
202
|
+
|
203
|
+
text.to_s.scan(Twitter::TwitterText::Regex[:valid_url]) do |all, before, url, protocol, domain, port, path, query|
|
204
|
+
valid_url_match_data = $~
|
205
|
+
|
206
|
+
start_position = valid_url_match_data.char_begin(3)
|
207
|
+
end_position = valid_url_match_data.char_end(3)
|
208
|
+
|
209
|
+
# If protocol is missing and domain contains non-ASCII characters,
|
210
|
+
# extract ASCII-only domains.
|
211
|
+
if !protocol
|
212
|
+
next if !options[:extract_url_without_protocol] || before =~ Twitter::TwitterText::Regex[:invalid_url_without_protocol_preceding_chars]
|
213
|
+
last_url = nil
|
214
|
+
domain.scan(Twitter::TwitterText::Regex[:valid_ascii_domain]) do |ascii_domain|
|
215
|
+
next unless is_valid_domain(url.length, ascii_domain, protocol)
|
216
|
+
last_url = {
|
217
|
+
:url => ascii_domain,
|
218
|
+
:indices => [start_position + $~.char_begin(0),
|
219
|
+
start_position + $~.char_end(0)]
|
220
|
+
}
|
221
|
+
if path ||
|
222
|
+
ascii_domain =~ Twitter::TwitterText::Regex[:valid_special_short_domain] ||
|
223
|
+
ascii_domain !~ Twitter::TwitterText::Regex[:invalid_short_domain]
|
224
|
+
urls << last_url
|
225
|
+
end
|
224
226
|
end
|
225
|
-
end
|
226
227
|
|
227
|
-
|
228
|
-
|
228
|
+
# no ASCII-only domain found. Skip the entire URL
|
229
|
+
next unless last_url
|
229
230
|
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
231
|
+
# last_url only contains domain. Need to add path and query if they exist.
|
232
|
+
if path
|
233
|
+
# last_url was not added. Add it to urls here.
|
234
|
+
last_url[:url] = url.sub(domain, last_url[:url])
|
235
|
+
last_url[:indices][1] = end_position
|
236
|
+
end
|
237
|
+
else
|
238
|
+
# In the case of t.co URLs, don't allow additional path characters
|
239
|
+
if url =~ Twitter::TwitterText::Regex[:valid_tco_url]
|
240
|
+
next if $1 && $1.length > MAX_TCO_SLUG_LENGTH
|
241
|
+
url = $&
|
242
|
+
end_position = start_position + url.char_length
|
243
|
+
end
|
243
244
|
|
244
|
-
|
245
|
+
next unless is_valid_domain(url.length, domain, protocol)
|
245
246
|
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
247
|
+
urls << {
|
248
|
+
:url => url,
|
249
|
+
:indices => [start_position, end_position]
|
250
|
+
}
|
251
|
+
end
|
250
252
|
end
|
253
|
+
urls.each{|url| yield url[:url], url[:indices].first, url[:indices].last} if block_given?
|
254
|
+
urls
|
251
255
|
end
|
252
|
-
urls.each{|url| yield url[:url], url[:indices].first, url[:indices].last} if block_given?
|
253
|
-
urls
|
254
|
-
end
|
255
256
|
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
257
|
+
# Extracts a list of all hashtags included in the Tweet <tt>text</tt>. If the
|
258
|
+
# <tt>text</tt> is <tt>nil</tt> or contains no hashtags an empty array
|
259
|
+
# will be returned. The array returned will not include the leading <tt>#</tt>
|
260
|
+
# character.
|
261
|
+
#
|
262
|
+
# If a block is given then it will be called for each hashtag.
|
263
|
+
def extract_hashtags(text, &block) # :yields: hashtag_text
|
264
|
+
hashtags = extract_hashtags_with_indices(text).map{|h| h[:hashtag]}
|
265
|
+
hashtags.each(&block) if block_given?
|
266
|
+
hashtags
|
267
|
+
end
|
267
268
|
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
269
|
+
# Extracts a list of all hashtags included in the Tweet <tt>text</tt>. If the
|
270
|
+
# <tt>text</tt> is <tt>nil</tt> or contains no hashtags an empty array
|
271
|
+
# will be returned. The array returned will not include the leading <tt>#</tt>
|
272
|
+
# character.
|
273
|
+
#
|
274
|
+
# If a block is given then it will be called for each hashtag.
|
275
|
+
def extract_hashtags_with_indices(text, options = {:check_url_overlap => true}) # :yields: hashtag_text, start, end
|
276
|
+
return [] unless text =~ /[#＃]/
|
277
|
+
|
278
|
+
tags = []
|
279
|
+
text.scan(Twitter::TwitterText::Regex[:valid_hashtag]) do |before, hash, hash_text|
|
280
|
+
match_data = $~
|
281
|
+
start_position = match_data.char_begin(2)
|
282
|
+
end_position = match_data.char_end(3)
|
283
|
+
after = $'
|
284
|
+
unless after =~ Twitter::TwitterText::Regex[:end_hashtag_match]
|
285
|
+
tags << {
|
286
|
+
:hashtag => hash_text,
|
287
|
+
:indices => [start_position, end_position]
|
288
|
+
}
|
289
|
+
end
|
288
290
|
end
|
289
|
-
end
|
290
291
|
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
292
|
+
if options[:check_url_overlap]
|
293
|
+
# extract URLs
|
294
|
+
urls = extract_urls_with_indices(text)
|
295
|
+
unless urls.empty?
|
296
|
+
tags.concat(urls)
|
297
|
+
# remove duplicates
|
298
|
+
tags = remove_overlapping_entities(tags)
|
299
|
+
# remove URL entities
|
300
|
+
tags.reject!{|entity| !entity[:hashtag] }
|
301
|
+
end
|
300
302
|
end
|
303
|
+
|
304
|
+
tags.each{|tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last} if block_given?
|
305
|
+
tags
|
301
306
|
end
|
302
307
|
|
303
|
-
|
304
|
-
|
305
|
-
|
308
|
+
# Extracts a list of all cashtags included in the Tweet <tt>text</tt>. If the
|
309
|
+
# <tt>text</tt> is <tt>nil</tt> or contains no cashtags an empty array
|
310
|
+
# will be returned. The array returned will not include the leading <tt>$</tt>
|
311
|
+
# character.
|
312
|
+
#
|
313
|
+
# If a block is given then it will be called for each cashtag.
|
314
|
+
def extract_cashtags(text, &block) # :yields: cashtag_text
|
315
|
+
cashtags = extract_cashtags_with_indices(text).map{|h| h[:cashtag]}
|
316
|
+
cashtags.each(&block) if block_given?
|
317
|
+
cashtags
|
318
|
+
end
|
306
319
|
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
320
|
+
# Extracts a list of all cashtags included in the Tweet <tt>text</tt>. If the
|
321
|
+
# <tt>text</tt> is <tt>nil</tt> or contains no cashtags an empty array
|
322
|
+
# will be returned. The array returned will not include the leading <tt>$</tt>
|
323
|
+
# character.
|
324
|
+
#
|
325
|
+
# If a block is given then it will be called for each cashtag.
|
326
|
+
def extract_cashtags_with_indices(text) # :yields: cashtag_text, start, end
|
327
|
+
return [] unless text =~ /\$/
|
328
|
+
|
329
|
+
tags = []
|
330
|
+
text.scan(Twitter::TwitterText::Regex[:valid_cashtag]) do |before, dollar, cash_text|
|
331
|
+
match_data = $~
|
332
|
+
start_position = match_data.char_begin(2)
|
333
|
+
end_position = match_data.char_end(3)
|
334
|
+
tags << {
|
335
|
+
:cashtag => cash_text,
|
336
|
+
:indices => [start_position, end_position]
|
337
|
+
}
|
338
|
+
end
|
318
339
|
|
319
|
-
|
320
|
-
|
321
|
-
# will be returned. The array returned will not include the leading <tt>$</tt>
|
322
|
-
# character.
|
323
|
-
#
|
324
|
-
# If a block is given then it will be called for each cashtag.
|
325
|
-
def extract_cashtags_with_indices(text) # :yields: cashtag_text, start, end
|
326
|
-
return [] unless text =~ /\$/
|
327
|
-
|
328
|
-
tags = []
|
329
|
-
text.scan(Twitter::Regex[:valid_cashtag]) do |before, dollar, cash_text|
|
330
|
-
match_data = $~
|
331
|
-
start_position = match_data.char_begin(2)
|
332
|
-
end_position = match_data.char_end(3)
|
333
|
-
tags << {
|
334
|
-
:cashtag => cash_text,
|
335
|
-
:indices => [start_position, end_position]
|
336
|
-
}
|
340
|
+
tags.each{|tag| yield tag[:cashtag], tag[:indices].first, tag[:indices].last} if block_given?
|
341
|
+
tags
|
337
342
|
end
|
338
343
|
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
rescue Exception
|
353
|
-
# On error don't consider this a valid domain.
|
354
|
-
return false
|
344
|
+
def is_valid_domain(url_length, domain, protocol)
|
345
|
+
begin
|
346
|
+
raise ArgumentError.new("invalid empty domain") unless domain
|
347
|
+
original_domain_length = domain.length
|
348
|
+
encoded_domain = IDN::Idna.toASCII(domain)
|
349
|
+
updated_domain_length = encoded_domain.length
|
350
|
+
url_length += (updated_domain_length - original_domain_length) if (updated_domain_length > original_domain_length)
|
351
|
+
url_length += URL_PROTOCOL_LENGTH unless protocol
|
352
|
+
url_length <= MAX_URL_LENGTH
|
353
|
+
rescue Exception
|
354
|
+
# On error don't consider this a valid domain.
|
355
|
+
return false
|
356
|
+
end
|
355
357
|
end
|
356
358
|
end
|
357
359
|
end
|