twitter-text 2.0.2 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +25 -0
- data/README.md +5 -5
- data/lib/twitter-text/autolink.rb +386 -385
- data/lib/twitter-text/configuration.rb +48 -47
- data/lib/twitter-text/deprecation.rb +11 -9
- data/lib/twitter-text/extractor.rb +270 -268
- data/lib/twitter-text/hash_helper.rb +17 -15
- data/lib/twitter-text/hit_highlighter.rb +69 -67
- data/lib/twitter-text/regex.rb +342 -340
- data/lib/twitter-text/rewriter.rb +51 -49
- data/lib/twitter-text/unicode.rb +21 -20
- data/lib/twitter-text/validation.rb +185 -183
- data/lib/twitter-text/weighted_range.rb +12 -10
- data/spec/autolinking_spec.rb +2 -2
- data/spec/configuration_spec.rb +11 -11
- data/spec/extractor_spec.rb +6 -6
- data/spec/hithighlighter_spec.rb +2 -2
- data/spec/regex_spec.rb +3 -3
- data/spec/rewriter_spec.rb +7 -7
- data/spec/spec_helper.rb +2 -2
- data/spec/unicode_spec.rb +11 -11
- data/spec/validation_spec.rb +7 -7
- data/test/conformance_test.rb +4 -4
- data/twitter-text.gemspec +1 -1
- metadata +3 -2
@@ -1,53 +1,54 @@
|
|
1
1
|
# encoding: UTF-8
|
2
2
|
|
3
3
|
module Twitter
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
File.
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
File.
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
4
|
+
module TwitterText
|
5
|
+
class Configuration
|
6
|
+
require 'json'
|
7
|
+
|
8
|
+
PARSER_VERSION_CLASSIC = "v1"
|
9
|
+
PARSER_VERSION_DEFAULT = "v2"
|
10
|
+
|
11
|
+
class << self
|
12
|
+
attr_accessor :default_configuration
|
13
|
+
end
|
14
|
+
|
15
|
+
attr_reader :version, :max_weighted_tweet_length, :scale
|
16
|
+
attr_reader :default_weight, :transformed_url_length, :ranges
|
17
|
+
|
18
|
+
CONFIG_V1 = File.join(
|
19
|
+
File.expand_path('../../../config', __FILE__), # project root
|
20
|
+
"#{PARSER_VERSION_CLASSIC}.json"
|
21
|
+
)
|
22
|
+
|
23
|
+
CONFIG_V2 = File.join(
|
24
|
+
File.expand_path('../../../config', __FILE__), # project root
|
25
|
+
"#{PARSER_VERSION_DEFAULT}.json"
|
26
|
+
)
|
27
|
+
|
28
|
+
def self.parse_string(string, options = {})
|
29
|
+
JSON.parse(string, options.merge(symbolize_names: true))
|
30
|
+
end
|
31
|
+
|
32
|
+
def self.parse_file(filename)
|
33
|
+
string = File.open(filename, 'rb') { |f| f.read }
|
34
|
+
parse_string(string)
|
35
|
+
end
|
36
|
+
|
37
|
+
def self.configuration_from_file(filename)
|
38
|
+
config = parse_file(filename)
|
39
|
+
config ? self.new(config) : nil
|
40
|
+
end
|
41
|
+
|
42
|
+
def initialize(config = {})
|
43
|
+
@version = config[:version]
|
44
|
+
@max_weighted_tweet_length = config[:maxWeightedTweetLength]
|
45
|
+
@scale = config[:scale]
|
46
|
+
@default_weight = config[:defaultWeight]
|
47
|
+
@transformed_url_length = config[:transformedURLLength]
|
48
|
+
@ranges = config[:ranges].map { |range| Twitter::TwitterText::WeightedRange.new(range) } if config.key?(:ranges) && config[:ranges].is_a?(Array)
|
49
|
+
end
|
50
|
+
|
51
|
+
self.default_configuration = self.configuration_from_file(CONFIG_V2)
|
29
52
|
end
|
30
|
-
|
31
|
-
def self.parse_file(filename)
|
32
|
-
string = File.open(filename, 'rb') { |f| f.read }
|
33
|
-
parse_string(string)
|
34
|
-
end
|
35
|
-
|
36
|
-
def self.configuration_from_file(filename)
|
37
|
-
config = parse_file(filename)
|
38
|
-
config ? Twitter::Configuration.new(config) : nil
|
39
|
-
end
|
40
|
-
|
41
|
-
def initialize(config = {})
|
42
|
-
@version = config[:version]
|
43
|
-
@max_weighted_tweet_length = config[:maxWeightedTweetLength]
|
44
|
-
@scale = config[:scale]
|
45
|
-
@default_weight = config[:defaultWeight]
|
46
|
-
@transformed_url_length = config[:transformedURLLength]
|
47
|
-
@ranges = config[:ranges].map { |range| Twitter::WeightedRange.new(range) } if config.key?(:ranges) && config[:ranges].is_a?(Array)
|
48
|
-
end
|
49
|
-
|
50
|
-
self.default_configuration = Twitter::Configuration.configuration_from_file(Twitter::Configuration::CONFIG_V2)
|
51
53
|
end
|
52
54
|
end
|
53
|
-
|
@@ -1,14 +1,16 @@
|
|
1
1
|
module Twitter
|
2
|
-
module
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
2
|
+
module TwitterText
|
3
|
+
module Deprecation
|
4
|
+
def deprecate(method, new_method = nil)
|
5
|
+
deprecated_method = :"deprecated_#{method}"
|
6
|
+
message = "Deprecation: `#{method}` is deprecated."
|
7
|
+
message << " Please use `#{new_method}` instead." if new_method
|
7
8
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
alias_method(deprecated_method, method)
|
10
|
+
define_method method do |*args, &block|
|
11
|
+
warn message unless $TESTING
|
12
|
+
send(deprecated_method, *args, &block)
|
13
|
+
end
|
12
14
|
end
|
13
15
|
end
|
14
16
|
end
|
@@ -45,313 +45,315 @@ class MatchData
|
|
45
45
|
end
|
46
46
|
|
47
47
|
module Twitter
|
48
|
-
|
49
|
-
|
50
|
-
|
48
|
+
module TwitterText
|
49
|
+
# A module for including Tweet parsing in a class. This module provides functions for the extraction and processing
|
50
|
+
# of usernames, lists, URLs and hashtags.
|
51
|
+
module Extractor extend self
|
51
52
|
|
52
|
-
|
53
|
-
|
53
|
+
# Maximum URL length as defined by Twitter's backend.
|
54
|
+
MAX_URL_LENGTH = 4096
|
54
55
|
|
55
|
-
|
56
|
-
|
56
|
+
# The maximum t.co path length that the Twitter backend supports.
|
57
|
+
MAX_TCO_SLUG_LENGTH = 40
|
57
58
|
|
58
|
-
|
59
|
+
URL_PROTOCOL_LENGTH = "https://".length
|
59
60
|
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
61
|
+
# Remove overlapping entities.
|
62
|
+
# This returns a new array with no overlapping entities.
|
63
|
+
def remove_overlapping_entities(entities)
|
64
|
+
# sort by start index
|
65
|
+
entities = entities.sort_by{|entity| entity[:indices].first}
|
65
66
|
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
end
|
71
|
-
|
72
|
-
# Extracts all usernames, lists, hashtags and URLs in the Tweet <tt>text</tt>
|
73
|
-
# along with the indices for where the entity occurred
|
74
|
-
# If the <tt>text</tt> is <tt>nil</tt> or contains no entity an empty array
|
75
|
-
# will be returned.
|
76
|
-
#
|
77
|
-
# If a block is given then it will be called for each entity.
|
78
|
-
def extract_entities_with_indices(text, options = {}, &block)
|
79
|
-
# extract all entities
|
80
|
-
entities = extract_urls_with_indices(text, options) +
|
81
|
-
extract_hashtags_with_indices(text, :check_url_overlap => false) +
|
82
|
-
extract_mentions_or_lists_with_indices(text) +
|
83
|
-
extract_cashtags_with_indices(text)
|
84
|
-
|
85
|
-
return [] if entities.empty?
|
86
|
-
|
87
|
-
entities = remove_overlapping_entities(entities)
|
88
|
-
|
89
|
-
entities.each(&block) if block_given?
|
90
|
-
entities
|
91
|
-
end
|
92
|
-
|
93
|
-
# Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>. If the
|
94
|
-
# <tt>text</tt> is <tt>nil</tt> or contains no username mentions an empty array
|
95
|
-
# will be returned.
|
96
|
-
#
|
97
|
-
# If a block is given then it will be called for each username.
|
98
|
-
def extract_mentioned_screen_names(text, &block) # :yields: username
|
99
|
-
screen_names = extract_mentioned_screen_names_with_indices(text).map{|m| m[:screen_name]}
|
100
|
-
screen_names.each(&block) if block_given?
|
101
|
-
screen_names
|
102
|
-
end
|
103
|
-
|
104
|
-
# Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>
|
105
|
-
# along with the indices for where the mention occurred. If the
|
106
|
-
# <tt>text</tt> is nil or contains no username mentions, an empty array
|
107
|
-
# will be returned.
|
108
|
-
#
|
109
|
-
# If a block is given, then it will be called with each username, the start
|
110
|
-
# index, and the end index in the <tt>text</tt>.
|
111
|
-
def extract_mentioned_screen_names_with_indices(text) # :yields: username, start, end
|
112
|
-
return [] unless text
|
113
|
-
|
114
|
-
possible_screen_names = []
|
115
|
-
extract_mentions_or_lists_with_indices(text) do |screen_name, list_slug, start_position, end_position|
|
116
|
-
next unless list_slug.empty?
|
117
|
-
possible_screen_names << {
|
118
|
-
:screen_name => screen_name,
|
119
|
-
:indices => [start_position, end_position]
|
120
|
-
}
|
67
|
+
# remove duplicates
|
68
|
+
prev = nil
|
69
|
+
entities.reject!{|entity| (prev && prev[:indices].last > entity[:indices].first) || (prev = entity) && false}
|
70
|
+
entities
|
121
71
|
end
|
122
72
|
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
73
|
+
# Extracts all usernames, lists, hashtags and URLs in the Tweet <tt>text</tt>
|
74
|
+
# along with the indices for where the entity occurred
|
75
|
+
# If the <tt>text</tt> is <tt>nil</tt> or contains no entity an empty array
|
76
|
+
# will be returned.
|
77
|
+
#
|
78
|
+
# If a block is given then it will be called for each entity.
|
79
|
+
def extract_entities_with_indices(text, options = {}, &block)
|
80
|
+
# extract all entities
|
81
|
+
entities = extract_urls_with_indices(text, options) +
|
82
|
+
extract_hashtags_with_indices(text, :check_url_overlap => false) +
|
83
|
+
extract_mentions_or_lists_with_indices(text) +
|
84
|
+
extract_cashtags_with_indices(text)
|
85
|
+
|
86
|
+
return [] if entities.empty?
|
87
|
+
|
88
|
+
entities = remove_overlapping_entities(entities)
|
89
|
+
|
90
|
+
entities.each(&block) if block_given?
|
91
|
+
entities
|
127
92
|
end
|
128
93
|
|
129
|
-
|
130
|
-
|
94
|
+
# Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>. If the
|
95
|
+
# <tt>text</tt> is <tt>nil</tt> or contains no username mentions an empty array
|
96
|
+
# will be returned.
|
97
|
+
#
|
98
|
+
# If a block is given then it will be called for each username.
|
99
|
+
def extract_mentioned_screen_names(text, &block) # :yields: username
|
100
|
+
screen_names = extract_mentioned_screen_names_with_indices(text).map{|m| m[:screen_name]}
|
101
|
+
screen_names.each(&block) if block_given?
|
102
|
+
screen_names
|
103
|
+
end
|
131
104
|
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
after = $'
|
147
|
-
unless after =~ Twitter::Regex[:end_mention_match]
|
148
|
-
start_position = match_data.char_begin(3) - 1
|
149
|
-
end_position = match_data.char_end(list_slug.nil? ? 3 : 4)
|
150
|
-
possible_entries << {
|
105
|
+
# Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>
|
106
|
+
# along with the indices for where the mention occurred. If the
|
107
|
+
# <tt>text</tt> is nil or contains no username mentions, an empty array
|
108
|
+
# will be returned.
|
109
|
+
#
|
110
|
+
# If a block is given, then it will be called with each username, the start
|
111
|
+
# index, and the end index in the <tt>text</tt>.
|
112
|
+
def extract_mentioned_screen_names_with_indices(text) # :yields: username, start, end
|
113
|
+
return [] unless text
|
114
|
+
|
115
|
+
possible_screen_names = []
|
116
|
+
extract_mentions_or_lists_with_indices(text) do |screen_name, list_slug, start_position, end_position|
|
117
|
+
next unless list_slug.empty?
|
118
|
+
possible_screen_names << {
|
151
119
|
:screen_name => screen_name,
|
152
|
-
:list_slug => list_slug || "",
|
153
120
|
:indices => [start_position, end_position]
|
154
121
|
}
|
155
122
|
end
|
156
|
-
end
|
157
123
|
|
158
|
-
|
159
|
-
|
160
|
-
|
124
|
+
if block_given?
|
125
|
+
possible_screen_names.each do |mention|
|
126
|
+
yield mention[:screen_name], mention[:indices].first, mention[:indices].last
|
127
|
+
end
|
161
128
|
end
|
129
|
+
|
130
|
+
possible_screen_names
|
162
131
|
end
|
163
132
|
|
164
|
-
|
165
|
-
|
133
|
+
# Extracts a list of all usernames or lists mentioned in the Tweet <tt>text</tt>
|
134
|
+
# along with the indices for where the mention occurred. If the
|
135
|
+
# <tt>text</tt> is nil or contains no username or list mentions, an empty array
|
136
|
+
# will be returned.
|
137
|
+
#
|
138
|
+
# If a block is given, then it will be called with each username, list slug, the start
|
139
|
+
# index, and the end index in the <tt>text</tt>. The list_slug will be an empty string
|
140
|
+
# if this is a username mention.
|
141
|
+
def extract_mentions_or_lists_with_indices(text) # :yields: username, list_slug, start, end
|
142
|
+
return [] unless text =~ /[@@]/
|
143
|
+
|
144
|
+
possible_entries = []
|
145
|
+
text.to_s.scan(Twitter::TwitterText::Regex[:valid_mention_or_list]) do |before, at, screen_name, list_slug|
|
146
|
+
match_data = $~
|
147
|
+
after = $'
|
148
|
+
unless after =~ Twitter::TwitterText::Regex[:end_mention_match]
|
149
|
+
start_position = match_data.char_begin(3) - 1
|
150
|
+
end_position = match_data.char_end(list_slug.nil? ? 3 : 4)
|
151
|
+
possible_entries << {
|
152
|
+
:screen_name => screen_name,
|
153
|
+
:list_slug => list_slug || "",
|
154
|
+
:indices => [start_position, end_position]
|
155
|
+
}
|
156
|
+
end
|
157
|
+
end
|
166
158
|
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
return nil unless text
|
173
|
-
|
174
|
-
possible_screen_name = text.match(Twitter::Regex[:valid_reply])
|
175
|
-
return unless possible_screen_name.respond_to?(:captures)
|
176
|
-
return if $' =~ Twitter::Regex[:end_mention_match]
|
177
|
-
screen_name = possible_screen_name.captures.first
|
178
|
-
yield screen_name if block_given?
|
179
|
-
screen_name
|
180
|
-
end
|
159
|
+
if block_given?
|
160
|
+
possible_entries.each do |mention|
|
161
|
+
yield mention[:screen_name], mention[:list_slug], mention[:indices].first, mention[:indices].last
|
162
|
+
end
|
163
|
+
end
|
181
164
|
|
182
|
-
|
183
|
-
|
184
|
-
# will be returned.
|
185
|
-
#
|
186
|
-
# If a block is given then it will be called for each URL.
|
187
|
-
def extract_urls(text, &block) # :yields: url
|
188
|
-
urls = extract_urls_with_indices(text).map{|u| u[:url]}
|
189
|
-
urls.each(&block) if block_given?
|
190
|
-
urls
|
191
|
-
end
|
165
|
+
possible_entries
|
166
|
+
end
|
192
167
|
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
168
|
+
# Extracts the username replied to in the Tweet <tt>text</tt>. If the
|
169
|
+
# <tt>text</tt> is <tt>nil</tt> or is not a reply, nil will be returned.
|
170
|
+
#
|
171
|
+
# If a block is given then it will be called with the username replied to (if any)
|
172
|
+
def extract_reply_screen_name(text) # :yields: username
|
173
|
+
return nil unless text
|
174
|
+
|
175
|
+
possible_screen_name = text.match(Twitter::TwitterText::Regex[:valid_reply])
|
176
|
+
return unless possible_screen_name.respond_to?(:captures)
|
177
|
+
return if $' =~ Twitter::TwitterText::Regex[:end_mention_match]
|
178
|
+
screen_name = possible_screen_name.captures.first
|
179
|
+
yield screen_name if block_given?
|
180
|
+
screen_name
|
181
|
+
end
|
182
|
+
|
183
|
+
# Extracts a list of all URLs included in the Tweet <tt>text</tt>. If the
|
184
|
+
# <tt>text</tt> is <tt>nil</tt> or contains no URLs an empty array
|
185
|
+
# will be returned.
|
186
|
+
#
|
187
|
+
# If a block is given then it will be called for each URL.
|
188
|
+
def extract_urls(text, &block) # :yields: url
|
189
|
+
urls = extract_urls_with_indices(text).map{|u| u[:url]}
|
190
|
+
urls.each(&block) if block_given?
|
191
|
+
urls
|
192
|
+
end
|
193
|
+
|
194
|
+
# Extracts a list of all URLs included in the Tweet <tt>text</tt> along
|
195
|
+
# with the indices. If the <tt>text</tt> is <tt>nil</tt> or contains no
|
196
|
+
# URLs an empty array will be returned.
|
197
|
+
#
|
198
|
+
# If a block is given then it will be called for each URL.
|
199
|
+
def extract_urls_with_indices(text, options = {:extract_url_without_protocol => true}) # :yields: url, start, end
|
200
|
+
return [] unless text && (options[:extract_url_without_protocol] ? text.index(".") : text.index(":"))
|
201
|
+
urls = []
|
202
|
+
|
203
|
+
text.to_s.scan(Twitter::TwitterText::Regex[:valid_url]) do |all, before, url, protocol, domain, port, path, query|
|
204
|
+
valid_url_match_data = $~
|
205
|
+
|
206
|
+
start_position = valid_url_match_data.char_begin(3)
|
207
|
+
end_position = valid_url_match_data.char_end(3)
|
208
|
+
|
209
|
+
# If protocol is missing and domain contains non-ASCII characters,
|
210
|
+
# extract ASCII-only domains.
|
211
|
+
if !protocol
|
212
|
+
next if !options[:extract_url_without_protocol] || before =~ Twitter::TwitterText::Regex[:invalid_url_without_protocol_preceding_chars]
|
213
|
+
last_url = nil
|
214
|
+
domain.scan(Twitter::TwitterText::Regex[:valid_ascii_domain]) do |ascii_domain|
|
215
|
+
next unless is_valid_domain(url.length, ascii_domain, protocol)
|
216
|
+
last_url = {
|
217
|
+
:url => ascii_domain,
|
218
|
+
:indices => [start_position + $~.char_begin(0),
|
219
|
+
start_position + $~.char_end(0)]
|
220
|
+
}
|
221
|
+
if path ||
|
222
|
+
ascii_domain =~ Twitter::TwitterText::Regex[:valid_special_short_domain] ||
|
223
|
+
ascii_domain !~ Twitter::TwitterText::Regex[:invalid_short_domain]
|
224
|
+
urls << last_url
|
225
|
+
end
|
224
226
|
end
|
225
|
-
end
|
226
227
|
|
227
|
-
|
228
|
-
|
228
|
+
# no ASCII-only domain found. Skip the entire URL
|
229
|
+
next unless last_url
|
229
230
|
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
231
|
+
# last_url only contains domain. Need to add path and query if they exist.
|
232
|
+
if path
|
233
|
+
# last_url was not added. Add it to urls here.
|
234
|
+
last_url[:url] = url.sub(domain, last_url[:url])
|
235
|
+
last_url[:indices][1] = end_position
|
236
|
+
end
|
237
|
+
else
|
238
|
+
# In the case of t.co URLs, don't allow additional path characters
|
239
|
+
if url =~ Twitter::TwitterText::Regex[:valid_tco_url]
|
240
|
+
next if $1 && $1.length > MAX_TCO_SLUG_LENGTH
|
241
|
+
url = $&
|
242
|
+
end_position = start_position + url.char_length
|
243
|
+
end
|
243
244
|
|
244
|
-
|
245
|
+
next unless is_valid_domain(url.length, domain, protocol)
|
245
246
|
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
247
|
+
urls << {
|
248
|
+
:url => url,
|
249
|
+
:indices => [start_position, end_position]
|
250
|
+
}
|
251
|
+
end
|
250
252
|
end
|
253
|
+
urls.each{|url| yield url[:url], url[:indices].first, url[:indices].last} if block_given?
|
254
|
+
urls
|
251
255
|
end
|
252
|
-
urls.each{|url| yield url[:url], url[:indices].first, url[:indices].last} if block_given?
|
253
|
-
urls
|
254
|
-
end
|
255
256
|
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
257
|
+
# Extracts a list of all hashtags included in the Tweet <tt>text</tt>. If the
|
258
|
+
# <tt>text</tt> is <tt>nil</tt> or contains no hashtags an empty array
|
259
|
+
# will be returned. The array returned will not include the leading <tt>#</tt>
|
260
|
+
# character.
|
261
|
+
#
|
262
|
+
# If a block is given then it will be called for each hashtag.
|
263
|
+
def extract_hashtags(text, &block) # :yields: hashtag_text
|
264
|
+
hashtags = extract_hashtags_with_indices(text).map{|h| h[:hashtag]}
|
265
|
+
hashtags.each(&block) if block_given?
|
266
|
+
hashtags
|
267
|
+
end
|
267
268
|
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
269
|
+
# Extracts a list of all hashtags included in the Tweet <tt>text</tt> along with their indices. If the
|
270
|
+
# <tt>text</tt> is <tt>nil</tt> or contains no hashtags an empty array
|
271
|
+
# will be returned. The array returned will not include the leading <tt>#</tt>
|
272
|
+
# character.
|
273
|
+
#
|
274
|
+
# If a block is given then it will be called for each hashtag.
|
275
|
+
def extract_hashtags_with_indices(text, options = {:check_url_overlap => true}) # :yields: hashtag_text, start, end
|
276
|
+
return [] unless text =~ /[##]/
|
277
|
+
|
278
|
+
tags = []
|
279
|
+
text.scan(Twitter::TwitterText::Regex[:valid_hashtag]) do |before, hash, hash_text|
|
280
|
+
match_data = $~
|
281
|
+
start_position = match_data.char_begin(2)
|
282
|
+
end_position = match_data.char_end(3)
|
283
|
+
after = $'
|
284
|
+
unless after =~ Twitter::TwitterText::Regex[:end_hashtag_match]
|
285
|
+
tags << {
|
286
|
+
:hashtag => hash_text,
|
287
|
+
:indices => [start_position, end_position]
|
288
|
+
}
|
289
|
+
end
|
288
290
|
end
|
289
|
-
end
|
290
291
|
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
292
|
+
if options[:check_url_overlap]
|
293
|
+
# extract URLs
|
294
|
+
urls = extract_urls_with_indices(text)
|
295
|
+
unless urls.empty?
|
296
|
+
tags.concat(urls)
|
297
|
+
# remove duplicates
|
298
|
+
tags = remove_overlapping_entities(tags)
|
299
|
+
# remove URL entities
|
300
|
+
tags.reject!{|entity| !entity[:hashtag] }
|
301
|
+
end
|
300
302
|
end
|
303
|
+
|
304
|
+
tags.each{|tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last} if block_given?
|
305
|
+
tags
|
301
306
|
end
|
302
307
|
|
303
|
-
|
304
|
-
|
305
|
-
|
308
|
+
# Extracts a list of all cashtags included in the Tweet <tt>text</tt>. If the
|
309
|
+
# <tt>text</tt> is <tt>nil</tt> or contains no cashtags an empty array
|
310
|
+
# will be returned. The array returned will not include the leading <tt>$</tt>
|
311
|
+
# character.
|
312
|
+
#
|
313
|
+
# If a block is given then it will be called for each cashtag.
|
314
|
+
def extract_cashtags(text, &block) # :yields: cashtag_text
|
315
|
+
cashtags = extract_cashtags_with_indices(text).map{|h| h[:cashtag]}
|
316
|
+
cashtags.each(&block) if block_given?
|
317
|
+
cashtags
|
318
|
+
end
|
306
319
|
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
320
|
+
# Extracts a list of all cashtags included in the Tweet <tt>text</tt> along with their indices. If the
|
321
|
+
# <tt>text</tt> is <tt>nil</tt> or contains no cashtags an empty array
|
322
|
+
# will be returned. The array returned will not include the leading <tt>$</tt>
|
323
|
+
# character.
|
324
|
+
#
|
325
|
+
# If a block is given then it will be called for each cashtag.
|
326
|
+
def extract_cashtags_with_indices(text) # :yields: cashtag_text, start, end
|
327
|
+
return [] unless text =~ /\$/
|
328
|
+
|
329
|
+
tags = []
|
330
|
+
text.scan(Twitter::TwitterText::Regex[:valid_cashtag]) do |before, dollar, cash_text|
|
331
|
+
match_data = $~
|
332
|
+
start_position = match_data.char_begin(2)
|
333
|
+
end_position = match_data.char_end(3)
|
334
|
+
tags << {
|
335
|
+
:cashtag => cash_text,
|
336
|
+
:indices => [start_position, end_position]
|
337
|
+
}
|
338
|
+
end
|
318
339
|
|
319
|
-
|
320
|
-
|
321
|
-
# will be returned. The array returned will not include the leading <tt>$</tt>
|
322
|
-
# character.
|
323
|
-
#
|
324
|
-
# If a block is given then it will be called for each cashtag.
|
325
|
-
def extract_cashtags_with_indices(text) # :yields: cashtag_text, start, end
|
326
|
-
return [] unless text =~ /\$/
|
327
|
-
|
328
|
-
tags = []
|
329
|
-
text.scan(Twitter::Regex[:valid_cashtag]) do |before, dollar, cash_text|
|
330
|
-
match_data = $~
|
331
|
-
start_position = match_data.char_begin(2)
|
332
|
-
end_position = match_data.char_end(3)
|
333
|
-
tags << {
|
334
|
-
:cashtag => cash_text,
|
335
|
-
:indices => [start_position, end_position]
|
336
|
-
}
|
340
|
+
tags.each{|tag| yield tag[:cashtag], tag[:indices].first, tag[:indices].last} if block_given?
|
341
|
+
tags
|
337
342
|
end
|
338
343
|
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
rescue Exception
|
353
|
-
# On error don't consider this a valid domain.
|
354
|
-
return false
|
344
|
+
def is_valid_domain(url_length, domain, protocol)
|
345
|
+
begin
|
346
|
+
raise ArgumentError.new("invalid empty domain") unless domain
|
347
|
+
original_domain_length = domain.length
|
348
|
+
encoded_domain = IDN::Idna.toASCII(domain)
|
349
|
+
updated_domain_length = encoded_domain.length
|
350
|
+
url_length += (updated_domain_length - original_domain_length) if (updated_domain_length > original_domain_length)
|
351
|
+
url_length += URL_PROTOCOL_LENGTH unless protocol
|
352
|
+
url_length <= MAX_URL_LENGTH
|
353
|
+
rescue Exception
|
354
|
+
# On error don't consider this a valid domain.
|
355
|
+
return false
|
356
|
+
end
|
355
357
|
end
|
356
358
|
end
|
357
359
|
end
|