twitter-text-kow 1.3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gemtest +0 -0
- data/.gitignore +40 -0
- data/.gitmodules +3 -0
- data/.rspec +2 -0
- data/CHANGELOG.md +44 -0
- data/Gemfile +4 -0
- data/LICENSE +188 -0
- data/README.md +193 -0
- data/Rakefile +52 -0
- data/config/README.md +142 -0
- data/config/v1.json +8 -0
- data/config/v2.json +29 -0
- data/config/v3.json +30 -0
- data/lib/assets/tld_lib.yml +1577 -0
- data/lib/twitter-text/autolink.rb +455 -0
- data/lib/twitter-text/configuration.rb +68 -0
- data/lib/twitter-text/deprecation.rb +21 -0
- data/lib/twitter-text/emoji_regex.rb +27 -0
- data/lib/twitter-text/extractor.rb +388 -0
- data/lib/twitter-text/hash_helper.rb +27 -0
- data/lib/twitter-text/hit_highlighter.rb +92 -0
- data/lib/twitter-text/regex.rb +381 -0
- data/lib/twitter-text/rewriter.rb +69 -0
- data/lib/twitter-text/unicode.rb +31 -0
- data/lib/twitter-text/validation.rb +251 -0
- data/lib/twitter-text/weighted_range.rb +24 -0
- data/lib/twitter-text.rb +29 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/spec/autolinking_spec.rb +858 -0
- data/spec/configuration_spec.rb +136 -0
- data/spec/extractor_spec.rb +392 -0
- data/spec/hithighlighter_spec.rb +96 -0
- data/spec/regex_spec.rb +76 -0
- data/spec/rewriter_spec.rb +553 -0
- data/spec/spec_helper.rb +139 -0
- data/spec/test_urls.rb +90 -0
- data/spec/twitter_text_spec.rb +25 -0
- data/spec/unicode_spec.rb +35 -0
- data/spec/validation_spec.rb +87 -0
- data/test/conformance_test.rb +242 -0
- data/twitter-text.gemspec +35 -0
- metadata +228 -0
@@ -0,0 +1,251 @@
|
|
1
|
+
# Copyright 2018 Twitter, Inc.
|
2
|
+
# Licensed under the Apache License, Version 2.0
|
3
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
4
|
+
|
5
|
+
require 'unf'
|
6
|
+
|
7
|
+
module Twitter
|
8
|
+
module TwitterText
|
9
|
+
module Validation extend self
|
10
|
+
DEFAULT_TCO_URL_LENGTHS = {
|
11
|
+
:short_url_length => 23,
|
12
|
+
}
|
13
|
+
|
14
|
+
# :weighted_length the weighted length of tweet based on weights specified in the config
|
15
|
+
# :valid If tweet is valid
|
16
|
+
# :permillage permillage of the tweet over the max length specified in config
|
17
|
+
# :valid_range_start beginning of valid text
|
18
|
+
# :valid_range_end End index of valid part of the tweet text (inclusive)
|
19
|
+
# :display_range_start beginning index of display text
|
20
|
+
# :display_range_end end index of display text (inclusive)
|
21
|
+
# Hash-backed container for the values computed by parse_tweet.
#
# Only the keys listed in RESULT_PARAMS are copied from the constructor
# argument; every other key is ignored. Keys that were not supplied are simply
# absent and read back as nil.
class ParseResults < Hash

  # The only keys the constructor will store.
  RESULT_PARAMS = [:weighted_length, :valid, :permillage, :valid_range_start, :valid_range_end, :display_range_start, :display_range_end].freeze

  # Returns a ParseResults with zero weighted length, zero permillage, both
  # ranges set to 0..0 and :valid set to true.
  def self.empty
    ParseResults.new(weighted_length: 0, permillage: 0, valid: true, display_range_start: 0, display_range_end: 0, valid_range_start: 0, valid_range_end: 0)
  end

  # params - Hash; its RESULT_PARAMS entries are copied into the new object.
  def initialize(params = {})
    # Explicit empty argument list: a bare `super` forwards +params+ to
    # Hash#initialize, which installs it as the hash's default value, so a
    # lookup of a missing key would return the whole params hash, not nil.
    super()

    RESULT_PARAMS.each do |key|
      self[key] = params[key] if params.key?(key)
    end
  end
end
|
35
|
+
|
36
|
+
# Parse input text and return hash with descriptive parameters populated.
|
37
|
+
# Walks the NFC-normalized text once, accumulating a weighted length, and
# reports the result as a ParseResults.
#
# text    - String to measure.
# options - Hash merged over DEFAULT_TCO_URL_LENGTHS; options[:config], when
#           present, replaces Configuration.default_configuration.
#
# URL entities count as config.transformed_url_length and emoji entities as
# config.default_weight; every other character takes the weight of the first
# config range containing it, falling back to config.default_weight.
def parse_tweet(text, options = {})
  options = DEFAULT_TCO_URL_LENGTHS.merge(options)
  config = options[:config] || Twitter::TwitterText::Configuration.default_configuration
  normalized_text = text.to_nfc

  # NOTE(review): the value built here is discarded, so empty input still
  # runs through the general path below. Confirm whether an early return was
  # intended before changing it: the general path reports valid: false.
  ParseResults.empty() if normalized_text.length <= 0

  scale = config.scale
  max_length = config.max_weighted_tweet_length
  scaled_max_length = max_length * scale
  url_weight = config.transformed_url_length * scale
  ranges = config.ranges

  url_entities = Twitter::TwitterText::Extractor.extract_urls_with_indices(normalized_text)
  emoji_entities =
    if config.emoji_parsing_enabled
      Twitter::TwitterText::Extractor.extract_emoji_with_indices(normalized_text)
    else
      []
    end

  has_invalid_chars = false
  weighted_count = 0
  offset = 0
  display_offset = 0
  valid_offset = 0

  while offset < normalized_text.codepoint_length
    # Reset the default char weight each pass through the loop
    char_weight = config.default_weight
    entity_length = 0

    # A URL starting at the current offset is consumed as one unit.
    url_hit = url_entities.find { |entity| entity[:indices].first == offset }
    if url_hit
      entity_length = url_hit[:indices].last - url_hit[:indices].first
      weighted_count += url_weight
      offset += entity_length
      display_offset += entity_length
      valid_offset += entity_length if weighted_count <= scaled_max_length
    end

    # Looked up after the URL step, so this uses the possibly advanced offset.
    emoji_hit = emoji_entities.find { |entity| entity[:indices].first == offset }
    if emoji_hit
      entity_length = emoji_hit[:indices].last - emoji_hit[:indices].first
      weighted_count += char_weight # the default weight
      offset += entity_length
      display_offset += entity_length
      valid_offset += entity_length if weighted_count <= scaled_max_length
    end

    next if entity_length > 0
    next unless offset < normalized_text.codepoint_length

    code_point = normalized_text[offset]

    weighted_range = ranges.find { |range| range.contains?(code_point.unpack("U").first) }
    char_weight = weighted_range.weight if weighted_range

    weighted_count += char_weight

    # Once an invalid character has been seen the flag stays set.
    has_invalid_chars = contains_invalid?(code_point) unless has_invalid_chars
    step = code_point.codepoint_length
    offset += step
    display_offset += step

    if !has_invalid_chars && (weighted_count <= scaled_max_length)
      valid_offset += step
    end
  end

  # Difference in length between the original and the normalized text,
  # added back onto the reported range ends.
  normalized_text_offset = text.codepoint_length - normalized_text.codepoint_length
  scaled_weighted_length = weighted_count / scale
  is_valid = !has_invalid_chars && (scaled_weighted_length <= max_length) && (scaled_weighted_length != 0)
  permillage = scaled_weighted_length * 1000 / max_length

  ParseResults.new(
    weighted_length: scaled_weighted_length,
    permillage: permillage,
    valid: is_valid,
    display_range_start: 0,
    display_range_end: (display_offset + normalized_text_offset - 1),
    valid_range_start: 0,
    valid_range_end: (valid_offset + normalized_text_offset - 1)
  )
end
|
126
|
+
|
127
|
+
# Reports whether +text+ includes any entry of Regex::INVALID_CHARACTERS.
#
# Returns false for nil or empty input. An ArgumentError raised while
# scanning (non-Unicode value) is reported as true.
def contains_invalid?(text)
  return false if !text || text.empty?

  begin
    Twitter::TwitterText::Regex::INVALID_CHARACTERS.any? { |forbidden| text.include?(forbidden) }
  rescue ArgumentError
    # non-Unicode value.
    true
  end
end
|
137
|
+
|
138
|
+
# True when +username+ (including its leading @) is extracted by
# Extractor.extract_mentioned_screen_names as exactly one screen name equal
# to the input without its first character. nil and "" are never valid.
def valid_username?(username)
  return false if !username || username.empty?

  screen_names = Twitter::TwitterText::Extractor.extract_mentioned_screen_names(username)
  return false unless screen_names.size == 1

  # Should extract the username minus the @ sign, hence the [1..-1]
  screen_names.first == username[1..-1]
end
|
145
|
+
|
146
|
+
VALID_LIST_RE = /\A#{Twitter::TwitterText::Regex[:valid_mention_or_list]}\z/o
|
147
|
+
# True when +username_list+ matches VALID_LIST_RE from start to end with an
# empty first capture and a non-empty fourth capture.
#
# nil and "" return false, like the other valid_* predicates in this module,
# instead of raising NoMethodError on nil.
#
# NOTE(review): presumably capture 1 is the text before the mention and
# capture 4 the list path; verify against Regex[:valid_mention_or_list].
def valid_list?(username_list)
  return false if !username_list || username_list.empty?

  match = username_list.match(VALID_LIST_RE)
  # Must have matched and had nothing before or after
  !!(match && match[1] == "" && match[4] && !match[4].empty?)
end
|
152
|
+
|
153
|
+
# True when +hashtag+ (including its leading #) is extracted by
# Extractor.extract_hashtags as exactly one hashtag equal to the input
# without its first character. nil and "" are never valid.
def valid_hashtag?(hashtag)
  return false if !hashtag || hashtag.empty?

  tags = Twitter::TwitterText::Extractor.extract_hashtags(hashtag)
  return false unless tags.size == 1

  # Should extract the hashtag minus the # sign, hence the [1..-1]
  tags.first == hashtag[1..-1]
end
|
160
|
+
|
161
|
+
# Validates +url+ piece by piece against the validate_url_* patterns.
#
# url              - String to check; nil and "" are never valid.
# unicode_domains  - when truthy the authority is checked against
#                    :validate_url_unicode_authority, otherwise against
#                    :validate_url_authority.
# require_protocol - when truthy the scheme must be present and be http or
#                    https (case-insensitive).
def valid_url?(url, unicode_domains=true, require_protocol=true)
  return false if !url || url.empty?

  url_parts = url.match(Twitter::TwitterText::Regex[:validate_url_unencoded])
  # The pattern has to consume the whole input, not just a prefix.
  return false unless url_parts && url_parts.to_s == url

  scheme, authority, path, query, fragment = url_parts.captures

  if require_protocol
    scheme_ok = valid_match?(scheme, Twitter::TwitterText::Regex[:validate_url_scheme]) &&
                scheme.match(/\Ahttps?\Z/i)
    return false unless scheme_ok
  end

  return false unless valid_match?(path, Twitter::TwitterText::Regex[:validate_url_path])
  # Query and fragment are optional: nil passes, a present value must match.
  return false unless valid_match?(query, Twitter::TwitterText::Regex[:validate_url_query], true)
  return false unless valid_match?(fragment, Twitter::TwitterText::Regex[:validate_url_fragment], true)

  if unicode_domains
    valid_match?(authority, Twitter::TwitterText::Regex[:validate_url_unicode_authority]) || false
  else
    valid_match?(authority, Twitter::TwitterText::Regex[:validate_url_authority])
  end
end
|
178
|
+
|
179
|
+
# These methods are deprecated, will be removed in future.
|
180
|
+
extend Deprecation
|
181
|
+
|
182
|
+
MAX_LENGTH_LEGACY = 140
|
183
|
+
|
184
|
+
# DEPRECATED: Please use parse_tweet instead.
|
185
|
+
#
|
186
|
+
# Returns the length of the string as it would be displayed. This is equivalent to the length of the Unicode NFC
|
187
|
+
# (See: http://www.unicode.org/reports/tr15). This is needed in order to consistently calculate the length of a
|
188
|
+
# string no matter which actual form was transmitted. For example:
|
189
|
+
#
|
190
|
+
# U+0065 Latin Small Letter E
|
191
|
+
# + U+0301 Combining Acute Accent
|
192
|
+
# ----------
|
193
|
+
# = 2 bytes, 2 characters, displayed as é (1 visual glyph)
|
194
|
+
# … The NFC of {U+0065, U+0301} is {U+00E9}, which is a single character and a +display_length+ of 1
|
195
|
+
#
|
196
|
+
# The string could also contain U+00E9 already, in which case the canonicalization will not change the value.
|
197
|
+
#
|
198
|
+
# Counts the code points of the NFC form of +text+, then for every URL found
# by Extractor.extract_urls_with_indices swaps the URL's own span for
# options[:short_url_length].
#
# options - Hash merged over DEFAULT_TCO_URL_LENGTHS.
def tweet_length(text, options = {})
  options = DEFAULT_TCO_URL_LENGTHS.merge(options)

  total = text.to_nfc.unpack("U*").size

  Twitter::TwitterText::Extractor.extract_urls_with_indices(text) do |url, url_start, url_end|
    # Take out the URL's span, then charge the shortened length instead.
    total -= url_end - url_start
    total += options[:short_url_length] unless url.length.zero?
  end

  total
end
|
210
|
+
deprecate :tweet_length, :parse_tweet
|
211
|
+
|
212
|
+
# DEPRECATED: Please use parse_tweet instead.
|
213
|
+
#
|
214
|
+
# Check the <tt>text</tt> for any reason that it may not be valid as a Tweet. This is meant as a pre-validation
|
215
|
+
# before posting to api.twitter.com. There are several server-side reasons for Tweets to fail but this pre-validation
|
216
|
+
# will allow quicker feedback.
|
217
|
+
#
|
218
|
+
# Returns <tt>false</tt> if this <tt>text</tt> is valid. Otherwise one of the following Symbols will be returned:
|
219
|
+
#
|
220
|
+
# <tt>:too_long</tt>:: if the <tt>text</tt> is too long
|
221
|
+
# <tt>:empty</tt>:: if the <tt>text</tt> is nil or empty
|
222
|
+
# <tt>:invalid_characters</tt>:: if the <tt>text</tt> contains non-Unicode or any of the disallowed Unicode characters
|
223
|
+
# Returns false when +text+ passes every check, otherwise the Symbol naming
# the first failed check: :empty, :too_long (tweet_length above
# MAX_LENGTH_LEGACY) or :invalid_characters. An ArgumentError raised during
# the checks is reported as :invalid_characters.
def tweet_invalid?(text)
  return :empty if !text || text.empty?

  begin
    if tweet_length(text) > MAX_LENGTH_LEGACY
      :too_long
    elsif Twitter::TwitterText::Regex::INVALID_CHARACTERS.any? { |forbidden| text.include?(forbidden) }
      :invalid_characters
    else
      false
    end
  rescue ArgumentError
    # non-Unicode value.
    :invalid_characters
  end
end
|
235
|
+
deprecate :tweet_invalid?, :parse_tweet
|
236
|
+
|
237
|
+
# Boolean inverse of tweet_invalid?: true only when it reports no problem.
def valid_tweet_text?(text)
  tweet_invalid?(text) ? false : true
end
|
240
|
+
deprecate :valid_tweet_text?, :parse_tweet
|
241
|
+
|
242
|
+
private
|
243
|
+
|
244
|
+
# Checks that +regex+ matches +string+ and that the match text equals the
# whole string.
#
# optional - when truthy a nil +string+ counts as valid and the result is
#            always true or false; when falsy a nil +string+ or a failed
#            match yields nil.
def valid_match?(string, regex, optional=false)
  if optional
    return true unless string

    found = string.match(regex)
    !found.nil? && found.to_s == string
  else
    found = string && string.match(regex)
    found && found.to_s == string
  end
end
|
249
|
+
end
|
250
|
+
end
|
251
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# Copyright 2018 Twitter, Inc.
|
2
|
+
# Licensed under the Apache License, Version 2.0
|
3
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
4
|
+
|
5
|
+
# encoding: UTF-8
|
6
|
+
|
7
|
+
module Twitter
|
8
|
+
module TwitterText
|
9
|
+
# An inclusive span of integers (start..end) paired with a weight.
class WeightedRange
  attr_reader :start, :end, :weight

  # range - Hash that must hold Integer values under :start, :end and :weight.
  #
  # Raises ArgumentError when a key is missing or its value is not an Integer.
  def initialize(range = {})
    well_formed = [:start, :end, :weight].all? do |key|
      range.key?(key) && range[key].is_a?(Integer)
    end
    raise ArgumentError, "Invalid range" unless well_formed

    @start, @end, @weight = range[:start], range[:end], range[:weight]
  end

  # True when +code_point+ lies between start and end, both bounds included.
  def contains?(code_point)
    code_point >= @start && code_point <= @end
  end
end
|
23
|
+
end
|
24
|
+
end
|
data/lib/twitter-text.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# Copyright 2018 Twitter, Inc.
|
2
|
+
# Licensed under the Apache License, Version 2.0
|
3
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
4
|
+
|
5
|
+
# Sets $RUBY_1_9 to false on interpreters older than 1.9 and to true
# everywhere else. On the older interpreters $KCODE must select UTF-8,
# otherwise loading aborts before the flag is assigned.
version_parts = RUBY_VERSION.split('.')
pre_1_9 = version_parts[0].to_i == 1 && version_parts[1].to_i < 9

# Ruby 1.8 KCODE check. Not needed on 1.9 and later.
if pre_1_9 && $KCODE[0].chr !~ /u/i
  raise("twitter-text requires the $KCODE variable be set to 'UTF8' or 'u'")
end

$RUBY_1_9 = !pre_1_9
|
14
|
+
|
15
|
+
# Load every library component, in this order.
[
  'deprecation',
  'emoji_regex',
  'regex',
  'rewriter',
  'autolink',
  'extractor',
  'unicode',
  'weighted_range',
  'configuration',
  'validation',
  'hit_highlighter'
].each { |component| require "twitter-text/#{component}" }
|
data/script/destroy
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# Project root: the parent of the directory holding this script.
APP_ROOT = File.expand_path('..', File.dirname(__FILE__))

# Load rubigen directly; if that fails, retry once after loading rubygems.
begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/destroy'

# A leading help flag is dropped before the arguments reach the runner.
ARGV.shift if ['--help', '-h'].include?(ARGV.first)
RubiGen::Base.use_component_sources!([:newgem_simple, :test_unit])
RubiGen::Scripts::Destroy.new.run(ARGV)
|
data/script/generate
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# Project root: the parent of the directory holding this script.
APP_ROOT = File.expand_path('..', File.dirname(__FILE__))

# Load rubigen directly; if that fails, retry once after loading rubygems.
begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/generate'

# A leading help flag is dropped before the arguments reach the runner.
ARGV.shift if ['--help', '-h'].include?(ARGV.first)
RubiGen::Base.use_component_sources!([:newgem_simple, :test_unit])
RubiGen::Scripts::Generate.new.run(ARGV)
|