twitter-text-kow 1.3.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gemtest +0 -0
- data/.gitignore +40 -0
- data/.gitmodules +3 -0
- data/.rspec +2 -0
- data/CHANGELOG.md +44 -0
- data/Gemfile +4 -0
- data/LICENSE +188 -0
- data/README.md +193 -0
- data/Rakefile +52 -0
- data/config/README.md +142 -0
- data/config/v1.json +8 -0
- data/config/v2.json +29 -0
- data/config/v3.json +30 -0
- data/lib/assets/tld_lib.yml +1577 -0
- data/lib/twitter-text/autolink.rb +455 -0
- data/lib/twitter-text/configuration.rb +68 -0
- data/lib/twitter-text/deprecation.rb +21 -0
- data/lib/twitter-text/emoji_regex.rb +27 -0
- data/lib/twitter-text/extractor.rb +388 -0
- data/lib/twitter-text/hash_helper.rb +27 -0
- data/lib/twitter-text/hit_highlighter.rb +92 -0
- data/lib/twitter-text/regex.rb +381 -0
- data/lib/twitter-text/rewriter.rb +69 -0
- data/lib/twitter-text/unicode.rb +31 -0
- data/lib/twitter-text/validation.rb +251 -0
- data/lib/twitter-text/weighted_range.rb +24 -0
- data/lib/twitter-text.rb +29 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/spec/autolinking_spec.rb +858 -0
- data/spec/configuration_spec.rb +136 -0
- data/spec/extractor_spec.rb +392 -0
- data/spec/hithighlighter_spec.rb +96 -0
- data/spec/regex_spec.rb +76 -0
- data/spec/rewriter_spec.rb +553 -0
- data/spec/spec_helper.rb +139 -0
- data/spec/test_urls.rb +90 -0
- data/spec/twitter_text_spec.rb +25 -0
- data/spec/unicode_spec.rb +35 -0
- data/spec/validation_spec.rb +87 -0
- data/test/conformance_test.rb +242 -0
- data/twitter-text.gemspec +35 -0
- metadata +228 -0
@@ -0,0 +1,251 @@
|
|
1
|
+
# Copyright 2018 Twitter, Inc.
|
2
|
+
# Licensed under the Apache License, Version 2.0
|
3
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
4
|
+
|
5
|
+
require 'unf'
|
6
|
+
|
7
|
+
module Twitter
|
8
|
+
module TwitterText
|
9
|
+
module Validation extend self
|
10
|
+
# Default option values; parse_tweet and tweet_length merge caller options over these.
DEFAULT_TCO_URL_LENGTHS = { short_url_length: 23 }
|
13
|
+
|
14
|
+
# :weighted_length the weighted length of tweet based on weights specified in the config
|
15
|
+
# :valid If tweet is valid
|
16
|
+
# :permillage permillage of the tweet over the max length specified in config
|
17
|
+
# :valid_range_start beginning of valid text
|
18
|
+
# :valid_range_end End index of valid part of the tweet text (inclusive)
|
19
|
+
# :display_range_start beginning index of display text
|
20
|
+
# :display_range_end end index of display text (inclusive)
|
21
|
+
class ParseResults < Hash
  # The only keys copied from the constructor argument; any other key is ignored.
  RESULT_PARAMS = [:weighted_length, :valid, :permillage, :valid_range_start, :valid_range_end, :display_range_start, :display_range_end]

  # Build a result with zero length, zero permillage, zero-width ranges and valid: true.
  def self.empty
    ParseResults.new(weighted_length: 0, permillage: 0, valid: true, display_range_start: 0, display_range_end: 0, valid_range_start: 0, valid_range_end: 0)
  end

  # params - Hash; the entries whose keys appear in RESULT_PARAMS are copied
  #          into the new object.
  def initialize(params = {})
    # Run Hash#initialize exactly once and without arguments. A bare `super`
    # inside the loop re-ran it for every key and forwarded `params`, which
    # installed `params` as the default value returned for missing keys.
    super()
    RESULT_PARAMS.each do |key|
      self[key] = params[key] if params.key?(key)
    end
  end
end
|
35
|
+
|
36
|
+
# Parse +text+ and return a ParseResults hash holding its weighted length,
# validity, permillage and the display/valid ranges.
#
# text    - the tweet text; it is NFC-normalized before being measured.
# options - optional Hash. options[:config] supplies the configuration; when
#           absent, Twitter::TwitterText::Configuration.default_configuration is used.
def parse_tweet(text, options = {})
  options = DEFAULT_TCO_URL_LENGTHS.merge(options)
  config = options[:config] || Twitter::TwitterText::Configuration.default_configuration
  normalized_text = text.to_nfc

  # NOTE(review): this value is built and then discarded (there is no `return`),
  # so empty text still flows through the code below and ends up with
  # valid: false and range ends of -1. Confirm whether a return was intended.
  ParseResults.empty() unless normalized_text.length > 0

  scale = config.scale
  max_weighted_tweet_length = config.max_weighted_tweet_length
  scaled_limit = max_weighted_tweet_length * scale
  url_weight = config.transformed_url_length * scale
  ranges = config.ranges

  extractor = Twitter::TwitterText::Extractor
  url_entities = extractor.extract_urls_with_indices(normalized_text)
  emoji_entities = config.emoji_parsing_enabled ? extractor.extract_emoji_with_indices(normalized_text) : []

  invalid_seen = false
  weighted_count = 0
  offset = 0
  display_offset = 0
  valid_offset = 0

  while offset < normalized_text.codepoint_length
    # The per-character weight starts from the configured default on every pass.
    char_weight = config.default_weight
    entity_length = 0

    # A URL starting at the current offset is charged the transformed URL weight.
    url_entity = url_entities.find { |entity| entity[:indices].first == offset }
    if url_entity
      entity_length = url_entity[:indices].last - url_entity[:indices].first
      weighted_count += url_weight
      offset += entity_length
      display_offset += entity_length
      valid_offset += entity_length if weighted_count <= scaled_limit
    end

    # An emoji starting at the (possibly just advanced) offset is charged the default weight.
    emoji_entity = emoji_entities.find { |entity| entity[:indices].first == offset }
    if emoji_entity
      entity_length = emoji_entity[:indices].last - emoji_entity[:indices].first
      weighted_count += char_weight
      offset += entity_length
      display_offset += entity_length
      valid_offset += entity_length if weighted_count <= scaled_limit
    end

    next if entity_length > 0
    next unless offset < normalized_text.codepoint_length

    code_point = normalized_text[offset]

    # The first configured range containing the code point overrides the default weight.
    matching_range = ranges.find { |range| range.contains?(code_point.unpack("U").first) }
    char_weight = matching_range.weight if matching_range

    weighted_count += char_weight

    # Once an invalid character has been seen the flag stays set.
    invalid_seen ||= contains_invalid?(code_point)
    step = code_point.codepoint_length
    offset += step
    display_offset += step

    valid_offset += step if !invalid_seen && (weighted_count <= scaled_limit)
  end

  # Difference in code point count between the raw and the normalized text.
  normalized_text_offset = text.codepoint_length - normalized_text.codepoint_length
  scaled_weighted_length = weighted_count / scale
  is_valid = !invalid_seen && (scaled_weighted_length <= max_weighted_tweet_length) && (scaled_weighted_length != 0)
  permillage = scaled_weighted_length * 1000 / max_weighted_tweet_length

  ParseResults.new(
    weighted_length: scaled_weighted_length,
    permillage: permillage,
    valid: is_valid,
    display_range_start: 0,
    display_range_end: (display_offset + normalized_text_offset - 1),
    valid_range_start: 0,
    valid_range_end: (valid_offset + normalized_text_offset - 1)
  )
end
|
126
|
+
|
127
|
+
# True when +text+ includes any entry of Regex::INVALID_CHARACTERS, or when the
# check raises ArgumentError. nil and empty input return false.
def contains_invalid?(text)
  return false unless text && !text.empty?

  begin
    Twitter::TwitterText::Regex::INVALID_CHARACTERS.any? { |forbidden| text.include?(forbidden) }
  rescue ArgumentError
    # non-Unicode value.
    true
  end
end
|
137
|
+
|
138
|
+
# True when +username+ yields exactly one extracted screen name and that name
# equals +username+ without its first character (the @ sign).
def valid_username?(username)
  return false unless username && !username.empty?

  screen_names = Twitter::TwitterText::Extractor.extract_mentioned_screen_names(username)
  return false unless screen_names.size == 1

  # The extractor returns the name minus the leading sign, hence [1..-1].
  screen_names.first == username[1..-1]
end
|
145
|
+
|
146
|
+
VALID_LIST_RE = /\A#{Twitter::TwitterText::Regex[:valid_mention_or_list]}\z/o
|
147
|
+
# True when +username_list+ matches VALID_LIST_RE from start to end, with an
# empty first capture and a non-empty fourth capture.
#
# Returns false for nil or empty input instead of raising NoMethodError, in
# line with valid_username?, valid_hashtag? and valid_url?.
def valid_list?(username_list)
  return false if !username_list || username_list.empty?

  match = username_list.match(VALID_LIST_RE)
  # Must have matched and had nothing before or after
  !!(match && match[1] == "" && match[4] && !match[4].empty?)
end
|
152
|
+
|
153
|
+
# True when +hashtag+ yields exactly one extracted hashtag and that value
# equals +hashtag+ without its first character (the # sign).
def valid_hashtag?(hashtag)
  return false unless hashtag && !hashtag.empty?

  tags = Twitter::TwitterText::Extractor.extract_hashtags(hashtag)
  return false unless tags.size == 1

  # The extractor returns the tag minus the leading sign, hence [1..-1].
  tags.first == hashtag[1..-1]
end
|
160
|
+
|
161
|
+
# Check +url+ piece by piece against the validation patterns in Regex.
#
# url              - String to check; nil or empty returns false.
# unicode_domains  - when truthy the authority is checked against
#                    :validate_url_unicode_authority, otherwise against
#                    :validate_url_authority.
# require_protocol - when truthy the scheme must match :validate_url_scheme
#                    and be http or https (case-insensitive).
def valid_url?(url, unicode_domains=true, require_protocol=true)
  return false if !url || url.empty?

  patterns = Twitter::TwitterText::Regex
  url_parts = url.match(patterns[:validate_url_unencoded])
  # The unencoded pattern has to consume the whole input.
  return false unless url_parts && url_parts.to_s == url

  scheme, authority, path, query, fragment = url_parts.captures

  if require_protocol
    return false unless valid_match?(scheme, patterns[:validate_url_scheme]) && scheme.match(/\Ahttps?\Z/i)
  end
  return false unless valid_match?(path, patterns[:validate_url_path])
  # Query and fragment are optional: nil passes, a present value must match fully.
  return false unless valid_match?(query, patterns[:validate_url_query], true)
  return false unless valid_match?(fragment, patterns[:validate_url_fragment], true)

  if unicode_domains
    # `|| false` keeps a failed match reported as false rather than nil.
    valid_match?(authority, patterns[:validate_url_unicode_authority]) || false
  else
    valid_match?(authority, patterns[:validate_url_authority])
  end
end
|
178
|
+
|
179
|
+
# These methods are deprecated, will be removed in future.
|
180
|
+
extend Deprecation
|
181
|
+
|
182
|
+
MAX_LENGTH_LEGACY = 140
|
183
|
+
|
184
|
+
# DEPRECATED: Please use parse_tweet instead.
|
185
|
+
#
|
186
|
+
# Returns the length of the string as it would be displayed. This is equivalent to the length of the Unicode NFC
|
187
|
+
# (See: http://www.unicode.org/reports/tr15). This is needed in order to consistently calculate the length of a
|
188
|
+
# string no matter which actual form was transmitted. For example:
|
189
|
+
#
|
190
|
+
# U+0065 Latin Small Letter E
|
191
|
+
# + U+0301 Combining Acute Accent
|
192
|
+
# ----------
|
193
|
+
# = 2 bytes, 2 characters, displayed as é (1 visual glyph)
|
194
|
+
# … The NFC of {U+0065, U+0301} is {U+00E9}, which is a single character and a +display_length+ of 1
|
195
|
+
#
|
196
|
+
# The string could also contain U+00E9 already, in which case the canonicalization will not change the value.
|
197
|
+
#
|
198
|
+
def tweet_length(text, options = {})
  settings = DEFAULT_TCO_URL_LENGTHS.merge(options)

  # Start from the number of code points in the NFC form of the text.
  total = text.to_nfc.unpack("U*").size

  # Swap each extracted URL's own span for the configured short URL length.
  Twitter::TwitterText::Extractor.extract_urls_with_indices(text) do |url, first_index, last_index|
    total -= (last_index - first_index)
    total += settings[:short_url_length] if url.length > 0
  end

  total
end
|
210
|
+
deprecate :tweet_length, :parse_tweet
|
211
|
+
|
212
|
+
# DEPRECATED: Please use parse_tweet instead.
|
213
|
+
#
|
214
|
+
# Check the <tt>text</tt> for any reason that it may not be valid as a Tweet. This is meant as a pre-validation
|
215
|
+
# before posting to api.twitter.com. There are several server-side reasons for Tweets to fail but this pre-validation
|
216
|
+
# will allow quicker feedback.
|
217
|
+
#
|
218
|
+
# Returns <tt>false</tt> if this <tt>text</tt> is valid. Otherwise one of the following Symbols will be returned:
|
219
|
+
#
|
220
|
+
# <tt>:too_long</tt>:: if the <tt>text</tt> is too long
|
221
|
+
# <tt>:empty</tt>:: if the <tt>text</tt> is nil or empty
|
222
|
+
# <tt>:invalid_characters</tt>:: if the <tt>text</tt> contains non-Unicode or any of the disallowed Unicode characters
|
223
|
+
def tweet_invalid?(text)
  return :empty unless text && !text.empty?

  begin
    if tweet_length(text) > MAX_LENGTH_LEGACY
      :too_long
    elsif Twitter::TwitterText::Regex::INVALID_CHARACTERS.any? { |forbidden| text.include?(forbidden) }
      :invalid_characters
    else
      false
    end
  rescue ArgumentError
    # non-Unicode value.
    :invalid_characters
  end
end
|
235
|
+
deprecate :tweet_invalid?, :parse_tweet
|
236
|
+
|
237
|
+
# Boolean inverse of tweet_invalid?: true when it reports no problem for +text+.
def valid_tweet_text?(text)
  problem = tweet_invalid?(text)
  problem ? false : true
end
|
240
|
+
deprecate :valid_tweet_text?, :parse_tweet
|
241
|
+
|
242
|
+
private
|
243
|
+
|
244
|
+
# Check that +regex+ matches +string+ and that the matched text is the whole string.
#
# string   - value to test; may be nil.
# regex    - pattern to match against.
# optional - when truthy a nil/false +string+ counts as valid and the result
#            is always true or false; otherwise a missing string or a failed
#            match yields a falsy value.
def valid_match?(string, regex, optional=false)
  unless optional
    return string && (found = string.match(regex)) && found.to_s == string
  end

  return true unless string

  found = string.match(regex)
  !found.nil? && found.to_s == string
end
|
249
|
+
end
|
250
|
+
end
|
251
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# Copyright 2018 Twitter, Inc.
|
2
|
+
# Licensed under the Apache License, Version 2.0
|
3
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
4
|
+
|
5
|
+
# encoding: UTF-8
|
6
|
+
|
7
|
+
module Twitter
|
8
|
+
module TwitterText
|
9
|
+
# An inclusive range of integer code points together with a weight.
class WeightedRange
  attr_reader :start, :end, :weight

  # range - Hash that must hold Integer values under :start, :end and :weight.
  #
  # Raises ArgumentError ("Invalid range") when a key is missing or its value
  # is not an Integer.
  def initialize(range = {})
    well_formed = [:start, :end, :weight].all? do |field|
      range.key?(field) && range[field].is_a?(Integer)
    end
    raise ArgumentError, "Invalid range" unless well_formed

    @start, @end, @weight = range.values_at(:start, :end, :weight)
  end

  # True when +code_point+ lies between start and end, both inclusive.
  def contains?(code_point)
    return false if code_point < @start

    code_point <= @end
  end
end
|
23
|
+
end
|
24
|
+
end
|
data/lib/twitter-text.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# Copyright 2018 Twitter, Inc.
|
2
|
+
# Licensed under the Apache License, Version 2.0
|
3
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
4
|
+
|
5
|
+
# Set $RUBY_1_9 to false on Ruby older than 1.9 and to true otherwise.
ruby_major, ruby_minor = RUBY_VERSION.split('.').first(2).map { |part| part.to_i }
legacy_ruby = ruby_major == 1 && ruby_minor < 9

if legacy_ruby
  # Ruby 1.8 KCODE check. Not needed on 1.9 and later.
  raise("twitter-text requires the $KCODE variable be set to 'UTF8' or 'u'") unless $KCODE[0].chr =~ /u/i
end

$RUBY_1_9 = !legacy_ruby
|
14
|
+
|
15
|
+
# Load the library's components in this fixed order.
[
  'deprecation',
  'emoji_regex',
  'regex',
  'rewriter',
  'autolink',
  'extractor',
  'unicode',
  'weighted_range',
  'configuration',
  'validation',
  'hit_highlighter'
].each do |component|
  require "twitter-text/#{component}"
end
|
data/script/destroy
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Runs RubiGen's Destroy script with the command-line arguments.
APP_ROOT = File.expand_path('..', File.dirname(__FILE__))

# Try rubigen directly; if that fails, load rubygems first and retry.
begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/destroy'

# Drop a leading help flag before handing the arguments over.
ARGV.shift if ['--help', '-h'].include?(ARGV.first)
RubiGen::Base.use_component_sources!([:newgem_simple, :test_unit])
RubiGen::Scripts::Destroy.new.run(ARGV)
|
data/script/generate
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Runs RubiGen's Generate script with the command-line arguments.
APP_ROOT = File.expand_path('..', File.dirname(__FILE__))

# Try rubigen directly; if that fails, load rubygems first and retry.
begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/generate'

# Drop a leading help flag before handing the arguments over.
ARGV.shift if ['--help', '-h'].include?(ARGV.first)
RubiGen::Base.use_component_sources!([:newgem_simple, :test_unit])
RubiGen::Scripts::Generate.new.run(ARGV)
|