twitter-text 1.4.12 → 1.4.13
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/extractor.rb +35 -15
- data/lib/regex.rb +16 -15
- data/lib/rewriter.rb +6 -4
- data/twitter-text.gemspec +1 -1
- metadata +4 -4
data/lib/extractor.rb
CHANGED
@@ -68,8 +68,9 @@ module Twitter
|
|
68
68
|
return [] unless text
|
69
69
|
|
70
70
|
possible_screen_names = []
|
71
|
-
text.to_s.scan(Twitter::Regex[:extract_mentions]) do |before, sn, after|
|
71
|
+
text.to_s.scan(Twitter::Regex[:extract_mentions]) do |before, sn|
|
72
72
|
extract_mentions_match_data = $~
|
73
|
+
after = $'
|
73
74
|
unless after =~ Twitter::Regex[:end_screen_name_match]
|
74
75
|
start_position = extract_mentions_match_data.char_begin(2) - 1
|
75
76
|
end_position = extract_mentions_match_data.char_end(2)
|
@@ -99,8 +100,9 @@ module Twitter
|
|
99
100
|
return [] unless text
|
100
101
|
|
101
102
|
possible_entries = []
|
102
|
-
text.to_s.scan(Twitter::Regex[:extract_mentions_or_lists]) do |before, sn, list_slug, after|
|
103
|
+
text.to_s.scan(Twitter::Regex[:extract_mentions_or_lists]) do |before, sn, list_slug|
|
103
104
|
extract_mentions_match_data = $~
|
105
|
+
after = $'
|
104
106
|
unless after =~ Twitter::Regex[:end_screen_name_match]
|
105
107
|
start_position = extract_mentions_match_data.char_begin(2) - 1
|
106
108
|
end_position = extract_mentions_match_data.char_end(list_slug.nil? ? 2 : 3)
|
@@ -130,6 +132,7 @@ module Twitter
|
|
130
132
|
|
131
133
|
possible_screen_name = text.match(Twitter::Regex[:extract_reply])
|
132
134
|
return unless possible_screen_name.respond_to?(:captures)
|
135
|
+
return if $' =~ Twitter::Regex[:end_screen_name_match]
|
133
136
|
screen_name = possible_screen_name.captures.first
|
134
137
|
yield screen_name if block_given?
|
135
138
|
screen_name
|
@@ -161,18 +164,32 @@ module Twitter
|
|
161
164
|
start_position = valid_url_match_data.char_begin(3)
|
162
165
|
end_position = valid_url_match_data.char_end(3)
|
163
166
|
|
164
|
-
# If protocol is missing
|
167
|
+
# If protocol is missing and domain contains non-ASCII characters,
|
168
|
+
# extract ASCII-only domains.
|
165
169
|
if !protocol
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
+
last_url = nil
|
171
|
+
last_url_invalid_match = nil
|
172
|
+
domain.scan(Twitter::Regex[:valid_ascii_domain]) do |ascii_domain|
|
173
|
+
last_url = {
|
174
|
+
:url => ascii_domain,
|
175
|
+
:indices => [start_position + $~.char_begin(0),
|
176
|
+
start_position + $~.char_end(0)]
|
177
|
+
}
|
178
|
+
last_url_invalid_match = ascii_domain =~ Twitter::Regex[:invalid_short_domain]
|
179
|
+
urls << last_url unless last_url_invalid_match
|
170
180
|
end
|
171
|
-
end
|
172
181
|
|
173
|
-
|
174
|
-
|
175
|
-
|
182
|
+
# no ASCII-only domain found. Skip the entire URL
|
183
|
+
next unless last_url
|
184
|
+
|
185
|
+
# last_url only contains domain. Need to add path and query if they exist.
|
186
|
+
if path
|
187
|
+
# last_url was not added. Add it to urls here.
|
188
|
+
urls << last_url if last_url_invalid_match
|
189
|
+
last_url[:url] = url.sub(domain, last_url[:url])
|
190
|
+
last_url[:indices][1] = end_position
|
191
|
+
end
|
192
|
+
else
|
176
193
|
urls << {
|
177
194
|
:url => url,
|
178
195
|
:indices => [start_position, end_position]
|
@@ -208,10 +225,13 @@ module Twitter
|
|
208
225
|
text.scan(Twitter::Regex[:auto_link_hashtags]) do |before, hash, hash_text|
|
209
226
|
start_position = $~.char_begin(2)
|
210
227
|
end_position = $~.char_end(3)
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
228
|
+
after = $'
|
229
|
+
unless after =~ Twitter::Regex[:end_hashtag_match]
|
230
|
+
tags << {
|
231
|
+
:hashtag => hash_text,
|
232
|
+
:indices => [start_position, end_position]
|
233
|
+
}
|
234
|
+
end
|
215
235
|
end
|
216
236
|
tags.each{|tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last } if block_given?
|
217
237
|
tags
|
data/lib/regex.rb
CHANGED
@@ -50,11 +50,6 @@ module Twitter
|
|
50
50
|
].map{|cp| [cp].pack('U') }.freeze
|
51
51
|
REGEXEN[:invalid_control_characters] = /[#{INVALID_CHARACTERS.join('')}]/o
|
52
52
|
|
53
|
-
REGEXEN[:at_signs] = /[@@]/
|
54
|
-
REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(?=(.|$))/o
|
55
|
-
REGEXEN[:extract_mentions_or_lists] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?(?=(.|$))/o
|
56
|
-
REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
|
57
|
-
|
58
53
|
major, minor, patch = RUBY_VERSION.split('.')
|
59
54
|
if major.to_i >= 2 || major.to_i == 1 && minor.to_i >= 9 || (defined?(RUBY_ENGINE) && ["jruby", "rbx"].include?(RUBY_ENGINE))
|
60
55
|
REGEXEN[:list_name] = /[a-zA-Z][a-zA-Z0-9_\-\u0080-\u00ff]{0,24}/
|
@@ -89,8 +84,6 @@ module Twitter
|
|
89
84
|
].join('').freeze
|
90
85
|
REGEXEN[:latin_accents] = /[#{LATIN_ACCENTS}]+/o
|
91
86
|
|
92
|
-
REGEXEN[:end_screen_name_match] = /^(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o
|
93
|
-
|
94
87
|
CJ_HASHTAG_CHARACTERS = [
|
95
88
|
regex_range(0x30A1, 0x30FA), regex_range(0x30FC, 0x30FE), # Katakana (full-width)
|
96
89
|
regex_range(0xFF66, 0xFF9F), # Katakana (half-width)
|
@@ -104,27 +97,35 @@ module Twitter
|
|
104
97
|
regex_range(0x2F800, 0x2FA1F), regex_range(0x3005), regex_range(0x303B) # Kanji (CJK supplement)
|
105
98
|
].join('').freeze
|
106
99
|
|
107
|
-
HASHTAG_BOUNDARY = /(?:\A|\z|#{REGEXEN[:spaces]}|[「」。、.,!?!?:;"'])/o
|
108
|
-
|
109
100
|
# A hashtag must contain latin characters, numbers and underscores, but not all numbers.
|
110
101
|
HASHTAG_ALPHA = /[a-z_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/io
|
111
102
|
HASHTAG_ALPHANUMERIC = /[a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/io
|
103
|
+
HASHTAG_BOUNDARY = /\A|\z|[^&\/a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/o
|
112
104
|
|
113
105
|
HASHTAG = /(#{HASHTAG_BOUNDARY})(#|#)(#{HASHTAG_ALPHANUMERIC}*#{HASHTAG_ALPHA}#{HASHTAG_ALPHANUMERIC}*)/io
|
114
106
|
|
115
107
|
REGEXEN[:auto_link_hashtags] = /#{HASHTAG}/io
|
108
|
+
# Used in Extractor and Rewriter for final filtering
|
109
|
+
REGEXEN[:end_hashtag_match] = /^(?:[##]|:\/\/)/o
|
110
|
+
|
111
|
+
REGEXEN[:at_signs] = /[@@]/
|
112
|
+
REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
|
113
|
+
REGEXEN[:extract_mentions_or_lists] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?/o
|
114
|
+
REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
|
115
|
+
# Used in Extractor and Rewriter for final filtering
|
116
|
+
REGEXEN[:end_screen_name_match] = /^(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o
|
116
117
|
|
117
118
|
REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_]|^|RT:?)([@@]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?/o
|
118
119
|
REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\<\|:~\(|\}:o\{|:\-\[|\>o\<|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/
|
119
120
|
|
120
121
|
# URL related hash regex collection
|
121
|
-
REGEXEN[:valid_preceding_chars] = /(?:[^-\/"'!=A-Z0-9_
|
122
|
+
REGEXEN[:valid_preceding_chars] = /(?:[^-\/"'!=A-Z0-9_@@##\.#{INVALID_CHARACTERS.join('')}]|^)/io
|
122
123
|
|
123
124
|
DOMAIN_VALID_CHARS = "[^[:punct:][:space:][:blank:][:cntrl:]#{INVALID_CHARACTERS.join('')}#{UNICODE_SPACES.join('')}]"
|
124
125
|
REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
|
125
126
|
REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
|
126
127
|
|
127
|
-
REGEXEN[:valid_gTLD] = /(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel)(?=[^
|
128
|
+
REGEXEN[:valid_gTLD] = /(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel)(?=[^a-z]|$))/i
|
128
129
|
REGEXEN[:valid_ccTLD] = %r{
|
129
130
|
(?:
|
130
131
|
(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|
|
@@ -133,7 +134,7 @@ module Twitter
|
|
133
134
|
lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|
|
134
135
|
pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf|tg|th|
|
135
136
|
tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)
|
136
|
-
(?=[^
|
137
|
+
(?=[^a-z]|$)
|
137
138
|
)
|
138
139
|
}ix
|
139
140
|
REGEXEN[:valid_punycode] = /(?:xn--[0-9a-z]+)/
|
@@ -145,12 +146,12 @@ module Twitter
|
|
145
146
|
|
146
147
|
# This is used in Extractor
|
147
148
|
REGEXEN[:valid_ascii_domain] = /
|
148
|
-
(?:(?:[
|
149
|
+
(?:(?:[A-Za-z0-9\-_]|#{REGEXEN[:latin_accents]})+\.)+
|
149
150
|
(?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
|
150
151
|
/iox
|
151
152
|
|
152
153
|
# This is used in Extractor to filter out unwanted URLs.
|
153
|
-
REGEXEN[:
|
154
|
+
REGEXEN[:invalid_short_domain] = /^#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_ccTLD]}$/io
|
154
155
|
|
155
156
|
REGEXEN[:valid_port_number] = /[0-9]+/
|
156
157
|
|
@@ -171,7 +172,7 @@ module Twitter
|
|
171
172
|
)|(?:@#{REGEXEN[:valid_general_url_path_chars]}+\/)
|
172
173
|
)/iox
|
173
174
|
|
174
|
-
REGEXEN[:valid_url_query_chars] = /[a-z0-9
|
175
|
+
REGEXEN[:valid_url_query_chars] = /[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|]/i
|
175
176
|
REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/]/i
|
176
177
|
REGEXEN[:valid_url] = %r{
|
177
178
|
( # $1 total match
|
data/lib/rewriter.rb
CHANGED
@@ -42,10 +42,12 @@ module Twitter
|
|
42
42
|
|
43
43
|
def rewrite_hashtags(text)
|
44
44
|
text.to_s.gsub(Twitter::Regex[:auto_link_hashtags]) do
|
45
|
-
before = $1
|
46
|
-
|
47
|
-
|
48
|
-
|
45
|
+
before, hash, hashtag, after = $1, $2, $3, $'
|
46
|
+
if after =~ Twitter::Regex[:end_hashtag_match]
|
47
|
+
"#{before}#{hash}#{hashtag}"
|
48
|
+
else
|
49
|
+
"#{before}#{yield(hash, hashtag)}"
|
50
|
+
end
|
49
51
|
end
|
50
52
|
end
|
51
53
|
|
data/twitter-text.gemspec
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = "twitter-text"
|
5
|
-
s.version = "1.4.12"
|
5
|
+
s.version = "1.4.13"
|
6
6
|
s.authors = ["Matt Sanford", "Patrick Ewing", "Ben Cherry", "Britt Selvitelle",
|
7
7
|
"Raffi Krikorian", "J.P. Cummins", "Yoshimasa Niwa", "Keita Fujii"]
|
8
8
|
s.email = ["matt@twitter.com", "patrick.henry.ewing@gmail.com", "bcherry@gmail.com", "bs@brittspace.com",
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twitter-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 29
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
8
|
- 4
|
9
|
-
- 12
|
10
|
-
version: 1.4.12
|
9
|
+
- 13
|
10
|
+
version: 1.4.13
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Matt Sanford
|
@@ -22,7 +22,7 @@ autorequire:
|
|
22
22
|
bindir: bin
|
23
23
|
cert_chain: []
|
24
24
|
|
25
|
-
date: 2011-
|
25
|
+
date: 2011-11-02 00:00:00 -07:00
|
26
26
|
default_executable:
|
27
27
|
dependencies:
|
28
28
|
- !ruby/object:Gem::Dependency
|