twitter-text 1.4.12 → 1.4.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/extractor.rb +35 -15
- data/lib/regex.rb +16 -15
- data/lib/rewriter.rb +6 -4
- data/twitter-text.gemspec +1 -1
- metadata +4 -4
data/lib/extractor.rb
CHANGED
|
@@ -68,8 +68,9 @@ module Twitter
|
|
|
68
68
|
return [] unless text
|
|
69
69
|
|
|
70
70
|
possible_screen_names = []
|
|
71
|
-
text.to_s.scan(Twitter::Regex[:extract_mentions]) do |before, sn, after|
|
|
71
|
+
text.to_s.scan(Twitter::Regex[:extract_mentions]) do |before, sn|
|
|
72
72
|
extract_mentions_match_data = $~
|
|
73
|
+
after = $'
|
|
73
74
|
unless after =~ Twitter::Regex[:end_screen_name_match]
|
|
74
75
|
start_position = extract_mentions_match_data.char_begin(2) - 1
|
|
75
76
|
end_position = extract_mentions_match_data.char_end(2)
|
|
@@ -99,8 +100,9 @@ module Twitter
|
|
|
99
100
|
return [] unless text
|
|
100
101
|
|
|
101
102
|
possible_entries = []
|
|
102
|
-
text.to_s.scan(Twitter::Regex[:extract_mentions_or_lists]) do |before, sn, list_slug, after|
|
|
103
|
+
text.to_s.scan(Twitter::Regex[:extract_mentions_or_lists]) do |before, sn, list_slug|
|
|
103
104
|
extract_mentions_match_data = $~
|
|
105
|
+
after = $'
|
|
104
106
|
unless after =~ Twitter::Regex[:end_screen_name_match]
|
|
105
107
|
start_position = extract_mentions_match_data.char_begin(2) - 1
|
|
106
108
|
end_position = extract_mentions_match_data.char_end(list_slug.nil? ? 2 : 3)
|
|
@@ -130,6 +132,7 @@ module Twitter
|
|
|
130
132
|
|
|
131
133
|
possible_screen_name = text.match(Twitter::Regex[:extract_reply])
|
|
132
134
|
return unless possible_screen_name.respond_to?(:captures)
|
|
135
|
+
return if $' =~ Twitter::Regex[:end_screen_name_match]
|
|
133
136
|
screen_name = possible_screen_name.captures.first
|
|
134
137
|
yield screen_name if block_given?
|
|
135
138
|
screen_name
|
|
@@ -161,18 +164,32 @@ module Twitter
|
|
|
161
164
|
start_position = valid_url_match_data.char_begin(3)
|
|
162
165
|
end_position = valid_url_match_data.char_end(3)
|
|
163
166
|
|
|
164
|
-
# If protocol is missing
|
|
167
|
+
# If protocol is missing and domain contains non-ASCII characters,
|
|
168
|
+
# extract ASCII-only domains.
|
|
165
169
|
if !protocol
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
+
last_url = nil
|
|
171
|
+
last_url_invalid_match = nil
|
|
172
|
+
domain.scan(Twitter::Regex[:valid_ascii_domain]) do |ascii_domain|
|
|
173
|
+
last_url = {
|
|
174
|
+
:url => ascii_domain,
|
|
175
|
+
:indices => [start_position + $~.char_begin(0),
|
|
176
|
+
start_position + $~.char_end(0)]
|
|
177
|
+
}
|
|
178
|
+
last_url_invalid_match = ascii_domain =~ Twitter::Regex[:invalid_short_domain]
|
|
179
|
+
urls << last_url unless last_url_invalid_match
|
|
170
180
|
end
|
|
171
|
-
end
|
|
172
181
|
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
182
|
+
# no ASCII-only domain found. Skip the entire URL
|
|
183
|
+
next unless last_url
|
|
184
|
+
|
|
185
|
+
# last_url only contains domain. Need to add path and query if they exist.
|
|
186
|
+
if path
|
|
187
|
+
# last_url was not added. Add it to urls here.
|
|
188
|
+
urls << last_url if last_url_invalid_match
|
|
189
|
+
last_url[:url] = url.sub(domain, last_url[:url])
|
|
190
|
+
last_url[:indices][1] = end_position
|
|
191
|
+
end
|
|
192
|
+
else
|
|
176
193
|
urls << {
|
|
177
194
|
:url => url,
|
|
178
195
|
:indices => [start_position, end_position]
|
|
@@ -208,10 +225,13 @@ module Twitter
|
|
|
208
225
|
text.scan(Twitter::Regex[:auto_link_hashtags]) do |before, hash, hash_text|
|
|
209
226
|
start_position = $~.char_begin(2)
|
|
210
227
|
end_position = $~.char_end(3)
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
228
|
+
after = $'
|
|
229
|
+
unless after =~ Twitter::Regex[:end_hashtag_match]
|
|
230
|
+
tags << {
|
|
231
|
+
:hashtag => hash_text,
|
|
232
|
+
:indices => [start_position, end_position]
|
|
233
|
+
}
|
|
234
|
+
end
|
|
215
235
|
end
|
|
216
236
|
tags.each{|tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last } if block_given?
|
|
217
237
|
tags
|
data/lib/regex.rb
CHANGED
|
@@ -50,11 +50,6 @@ module Twitter
|
|
|
50
50
|
].map{|cp| [cp].pack('U') }.freeze
|
|
51
51
|
REGEXEN[:invalid_control_characters] = /[#{INVALID_CHARACTERS.join('')}]/o
|
|
52
52
|
|
|
53
|
-
REGEXEN[:at_signs] = /[@＠]/
|
|
54
|
-
REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(?=(.|$))/o
|
|
55
|
-
REGEXEN[:extract_mentions_or_lists] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?(?=(.|$))/o
|
|
56
|
-
REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
|
|
57
|
-
|
|
58
53
|
major, minor, patch = RUBY_VERSION.split('.')
|
|
59
54
|
if major.to_i >= 2 || major.to_i == 1 && minor.to_i >= 9 || (defined?(RUBY_ENGINE) && ["jruby", "rbx"].include?(RUBY_ENGINE))
|
|
60
55
|
REGEXEN[:list_name] = /[a-zA-Z][a-zA-Z0-9_\-\u0080-\u00ff]{0,24}/
|
|
@@ -89,8 +84,6 @@ module Twitter
|
|
|
89
84
|
].join('').freeze
|
|
90
85
|
REGEXEN[:latin_accents] = /[#{LATIN_ACCENTS}]+/o
|
|
91
86
|
|
|
92
|
-
REGEXEN[:end_screen_name_match] = /^(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o
|
|
93
|
-
|
|
94
87
|
CJ_HASHTAG_CHARACTERS = [
|
|
95
88
|
regex_range(0x30A1, 0x30FA), regex_range(0x30FC, 0x30FE), # Katakana (full-width)
|
|
96
89
|
regex_range(0xFF66, 0xFF9F), # Katakana (half-width)
|
|
@@ -104,27 +97,35 @@ module Twitter
|
|
|
104
97
|
regex_range(0x2F800, 0x2FA1F), regex_range(0x3005), regex_range(0x303B) # Kanji (CJK supplement)
|
|
105
98
|
].join('').freeze
|
|
106
99
|
|
|
107
|
-
HASHTAG_BOUNDARY = /(?:\A|\z|#{REGEXEN[:spaces]}|[「」。、.,!！?？:;"'])/o
|
|
108
|
-
|
|
109
100
|
# A hashtag must contain latin characters, numbers and underscores, but not all numbers.
|
|
110
101
|
HASHTAG_ALPHA = /[a-z_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/io
|
|
111
102
|
HASHTAG_ALPHANUMERIC = /[a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/io
|
|
103
|
+
HASHTAG_BOUNDARY = /\A|\z|[^&\/a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/o
|
|
112
104
|
|
|
113
105
|
HASHTAG = /(#{HASHTAG_BOUNDARY})(#|＃)(#{HASHTAG_ALPHANUMERIC}*#{HASHTAG_ALPHA}#{HASHTAG_ALPHANUMERIC}*)/io
|
|
114
106
|
|
|
115
107
|
REGEXEN[:auto_link_hashtags] = /#{HASHTAG}/io
|
|
108
|
+
# Used in Extractor and Rewriter for final filtering
|
|
109
|
+
REGEXEN[:end_hashtag_match] = /^(?:[#＃]|:\/\/)/o
|
|
110
|
+
|
|
111
|
+
REGEXEN[:at_signs] = /[@＠]/
|
|
112
|
+
REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
|
|
113
|
+
REGEXEN[:extract_mentions_or_lists] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?/o
|
|
114
|
+
REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
|
|
115
|
+
# Used in Extractor and Rewriter for final filtering
|
|
116
|
+
REGEXEN[:end_screen_name_match] = /^(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o
|
|
116
117
|
|
|
117
118
|
REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_]|^|RT:?)([@＠]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?/o
|
|
118
119
|
REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\<\|:~\(|\}:o\{|:\-\[|\>o\<|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/
|
|
119
120
|
|
|
120
121
|
# URL related hash regex collection
|
|
121
|
-
REGEXEN[:valid_preceding_chars] = /(?:[^-\/"'!=A-Z0-9_
|
|
122
|
+
REGEXEN[:valid_preceding_chars] = /(?:[^-\/"'!=A-Z0-9_@＠#＃\.#{INVALID_CHARACTERS.join('')}]|^)/io
|
|
122
123
|
|
|
123
124
|
DOMAIN_VALID_CHARS = "[^[:punct:][:space:][:blank:][:cntrl:]#{INVALID_CHARACTERS.join('')}#{UNICODE_SPACES.join('')}]"
|
|
124
125
|
REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
|
|
125
126
|
REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
|
|
126
127
|
|
|
127
|
-
REGEXEN[:valid_gTLD] = /(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel)(?=[^
|
|
128
|
+
REGEXEN[:valid_gTLD] = /(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel)(?=[^a-z]|$))/i
|
|
128
129
|
REGEXEN[:valid_ccTLD] = %r{
|
|
129
130
|
(?:
|
|
130
131
|
(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|
|
|
@@ -133,7 +134,7 @@ module Twitter
|
|
|
133
134
|
lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|
|
|
134
135
|
pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf|tg|th|
|
|
135
136
|
tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)
|
|
136
|
-
(?=[^
|
|
137
|
+
(?=[^a-z]|$)
|
|
137
138
|
)
|
|
138
139
|
}ix
|
|
139
140
|
REGEXEN[:valid_punycode] = /(?:xn--[0-9a-z]+)/
|
|
@@ -145,12 +146,12 @@ module Twitter
|
|
|
145
146
|
|
|
146
147
|
# This is used in Extractor
|
|
147
148
|
REGEXEN[:valid_ascii_domain] = /
|
|
148
|
-
(?:(?:[
|
|
149
|
+
(?:(?:[A-Za-z0-9\-_]|#{REGEXEN[:latin_accents]})+\.)+
|
|
149
150
|
(?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
|
|
150
151
|
/iox
|
|
151
152
|
|
|
152
153
|
# This is used in Extractor to filter out unwanted URLs.
|
|
153
|
-
REGEXEN[:
|
|
154
|
+
REGEXEN[:invalid_short_domain] = /^#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_ccTLD]}$/io
|
|
154
155
|
|
|
155
156
|
REGEXEN[:valid_port_number] = /[0-9]+/
|
|
156
157
|
|
|
@@ -171,7 +172,7 @@ module Twitter
|
|
|
171
172
|
)|(?:@#{REGEXEN[:valid_general_url_path_chars]}+\/)
|
|
172
173
|
)/iox
|
|
173
174
|
|
|
174
|
-
REGEXEN[:valid_url_query_chars] = /[a-z0-9
|
|
175
|
+
REGEXEN[:valid_url_query_chars] = /[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|]/i
|
|
175
176
|
REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/]/i
|
|
176
177
|
REGEXEN[:valid_url] = %r{
|
|
177
178
|
( # $1 total match
|
data/lib/rewriter.rb
CHANGED
|
@@ -42,10 +42,12 @@ module Twitter
|
|
|
42
42
|
|
|
43
43
|
def rewrite_hashtags(text)
|
|
44
44
|
text.to_s.gsub(Twitter::Regex[:auto_link_hashtags]) do
|
|
45
|
-
before = $1
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
45
|
+
before, hash, hashtag, after = $1, $2, $3, $'
|
|
46
|
+
if after =~ Twitter::Regex[:end_hashtag_match]
|
|
47
|
+
"#{before}#{hash}#{hashtag}"
|
|
48
|
+
else
|
|
49
|
+
"#{before}#{yield(hash, hashtag)}"
|
|
50
|
+
end
|
|
49
51
|
end
|
|
50
52
|
end
|
|
51
53
|
|
data/twitter-text.gemspec
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
Gem::Specification.new do |s|
|
|
4
4
|
s.name = "twitter-text"
|
|
5
|
-
s.version = "1.4.12"
|
|
5
|
+
s.version = "1.4.13"
|
|
6
6
|
s.authors = ["Matt Sanford", "Patrick Ewing", "Ben Cherry", "Britt Selvitelle",
|
|
7
7
|
"Raffi Krikorian", "J.P. Cummins", "Yoshimasa Niwa", "Keita Fujii"]
|
|
8
8
|
s.email = ["matt@twitter.com", "patrick.henry.ewing@gmail.com", "bcherry@gmail.com", "bs@brittspace.com",
|
metadata
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: twitter-text
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
hash:
|
|
4
|
+
hash: 29
|
|
5
5
|
prerelease:
|
|
6
6
|
segments:
|
|
7
7
|
- 1
|
|
8
8
|
- 4
|
|
9
|
-
- 12
|
|
10
|
-
version: 1.4.12
|
|
9
|
+
- 13
|
|
10
|
+
version: 1.4.13
|
|
11
11
|
platform: ruby
|
|
12
12
|
authors:
|
|
13
13
|
- Matt Sanford
|
|
@@ -22,7 +22,7 @@ autorequire:
|
|
|
22
22
|
bindir: bin
|
|
23
23
|
cert_chain: []
|
|
24
24
|
|
|
25
|
-
date: 2011-
|
|
25
|
+
date: 2011-11-02 00:00:00 -07:00
|
|
26
26
|
default_executable:
|
|
27
27
|
dependencies:
|
|
28
28
|
- !ruby/object:Gem::Dependency
|