twitter-text-kow 1.3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,251 @@
1
+ # Copyright 2018 Twitter, Inc.
2
+ # Licensed under the Apache License, Version 2.0
3
+ # http://www.apache.org/licenses/LICENSE-2.0
4
+
5
+ require 'unf'
6
+
7
+ module Twitter
8
+ module TwitterText
9
+ module Validation extend self
10
# Default option values merged into the caller's options by parse_tweet and tweet_length.
DEFAULT_TCO_URL_LENGTHS = { short_url_length: 23 }
13
+
14
# Hash holding the outcome of parsing a tweet. Only the keys listed in
# RESULT_PARAMS are ever stored:
#
# :weighted_length      the weighted length of tweet based on weights specified in the config
# :valid                whether the tweet is valid
# :permillage           permillage of the tweet over the max length specified in config
# :valid_range_start    beginning of valid text
# :valid_range_end      end index of valid part of the tweet text (inclusive)
# :display_range_start  beginning index of display text
# :display_range_end    end index of display text (inclusive)
class ParseResults < Hash
  RESULT_PARAMS = [:weighted_length, :valid, :permillage, :valid_range_start, :valid_range_end, :display_range_start, :display_range_end].freeze

  # Returns a ParseResults with every numeric field set to 0 and :valid set to true.
  def self.empty
    new(weighted_length: 0, permillage: 0, valid: true, display_range_start: 0, display_range_end: 0, valid_range_start: 0, valid_range_end: 0)
  end

  # params - Hash of initial values; keys outside RESULT_PARAMS are ignored.
  def initialize(params = {})
    # Explicit empty parentheses matter: a bare `super` would forward `params`
    # to Hash#initialize and make it the default value for missing keys.
    super()
    RESULT_PARAMS.each do |key|
      self[key] = params[key] if params.key?(key)
    end
  end
end
35
+
36
# Parse input text and return a ParseResults hash with descriptive parameters populated.
#
# text    - the tweet text; it is NFC-normalized (String#to_nfc) before counting.
# options - optional Hash; options[:config] supplies the configuration object,
#           otherwise Twitter::TwitterText::Configuration.default_configuration is used.
#
# Returns a ParseResults carrying the weighted length, permillage, validity flag
# and the inclusive display/valid range ends.
#
# Empty text is not special-cased: the loop below never runs, so the result has
# weighted_length 0, valid false and range ends of -1.
def parse_tweet(text, options = {})
  options = DEFAULT_TCO_URL_LENGTHS.merge(options)
  config = options[:config] || Twitter::TwitterText::Configuration.default_configuration
  normalized_text = text.to_nfc
  # The normalized text is never mutated below, so measure it once rather than
  # on every pass through the loop.
  normalized_length = normalized_text.codepoint_length

  scale = config.scale
  max_weighted_tweet_length = config.max_weighted_tweet_length
  scaled_max_weighted_tweet_length = max_weighted_tweet_length * scale
  transformed_url_length = config.transformed_url_length * scale
  ranges = config.ranges

  url_entities = Twitter::TwitterText::Extractor.extract_urls_with_indices(normalized_text)
  emoji_entities = config.emoji_parsing_enabled ? Twitter::TwitterText::Extractor.extract_emoji_with_indices(normalized_text) : []

  has_invalid_chars = false
  weighted_count = 0
  offset = 0
  display_offset = 0
  valid_offset = 0

  while offset < normalized_length
    # Reset the default char weight each pass through the loop
    char_weight = config.default_weight
    entity_length = 0

    # A URL starting at the current offset counts as one transformed URL,
    # whatever its real length.
    url_entity = url_entities.find { |entity| entity[:indices].first == offset }
    if url_entity
      entity_length = url_entity[:indices].last - url_entity[:indices].first
      weighted_count += transformed_url_length
      offset += entity_length
      display_offset += entity_length
      valid_offset += entity_length if weighted_count <= scaled_max_weighted_tweet_length
    end

    # An emoji starting at the (possibly advanced) offset counts with the default weight.
    emoji_entity = emoji_entities.find { |entity| entity[:indices].first == offset }
    if emoji_entity
      entity_length = emoji_entity[:indices].last - emoji_entity[:indices].first
      weighted_count += char_weight # the default weight
      offset += entity_length
      display_offset += entity_length
      valid_offset += entity_length if weighted_count <= scaled_max_weighted_tweet_length
    end

    next if entity_length > 0

    if offset < normalized_length
      code_point = normalized_text[offset]

      # The first configured range containing the code point decides its weight.
      matching_range = ranges.find { |range| range.contains?(code_point.unpack("U").first) }
      char_weight = matching_range.weight if matching_range

      weighted_count += char_weight

      # Once an invalid character has been seen the flag stays set.
      has_invalid_chars ||= contains_invalid?(code_point)
      codepoint_length = code_point.codepoint_length
      offset += codepoint_length
      display_offset += codepoint_length

      if !has_invalid_chars && (weighted_count <= scaled_max_weighted_tweet_length)
        valid_offset += codepoint_length
      end
    end
  end

  # Offsets were measured on the normalized text; shift the range ends by the
  # length difference between the original and the normalized text.
  normalized_text_offset = text.codepoint_length - normalized_length
  scaled_weighted_length = weighted_count / scale
  is_valid = !has_invalid_chars && (scaled_weighted_length <= max_weighted_tweet_length) && (scaled_weighted_length != 0)
  permillage = scaled_weighted_length * 1000 / max_weighted_tweet_length

  ParseResults.new(weighted_length: scaled_weighted_length, permillage: permillage, valid: is_valid, display_range_start: 0, display_range_end: (display_offset + normalized_text_offset - 1), valid_range_start: 0, valid_range_end: (valid_offset + normalized_text_offset - 1))
end
126
+
127
# Returns true when +text+ includes any entry of
# Twitter::TwitterText::Regex::INVALID_CHARACTERS, or when the check raises
# ArgumentError (treated as a non-Unicode value). nil and empty text return false.
def contains_invalid?(text)
  return false if !text || text.empty?

  forbidden = Twitter::TwitterText::Regex::INVALID_CHARACTERS
  begin
    forbidden.any? { |invalid_char| text.include?(invalid_char) }
  rescue ArgumentError
    # non-Unicode value.
    true
  end
end
137
+
138
# Returns true when the extractor finds exactly one screen name in +username+
# and that name equals the input minus its first character (the @ sign).
def valid_username?(username)
  return false if !username || username.empty?

  screen_names = Twitter::TwitterText::Extractor.extract_mentioned_screen_names(username)
  return false unless screen_names.size == 1

  # The extractor yields the name without the leading @, hence the [1..-1].
  screen_names.first == username[1..-1]
end
145
+
146
+ VALID_LIST_RE = /\A#{Twitter::TwitterText::Regex[:valid_mention_or_list]}\z/o
147
# Returns true when +username_list+ matches VALID_LIST_RE in full, with an empty
# first capture (nothing before the mention) and a non-empty fourth capture.
# nil and empty input return false, like the other validators in this module.
def valid_list?(username_list)
  return false if !username_list || username_list.empty?

  match = username_list.match(VALID_LIST_RE)
  # Must have matched and had nothing before or after
  !!(match && match[1] == "" && match[4] && !match[4].empty?)
end
152
+
153
# Returns true when the extractor finds exactly one hashtag in +hashtag+ and it
# equals the input minus its first character (the # sign).
def valid_hashtag?(hashtag)
  return false if !hashtag || hashtag.empty?

  tags = Twitter::TwitterText::Extractor.extract_hashtags(hashtag)
  return false unless tags.size == 1

  # The extractor yields the tag without the leading #, hence the [1..-1].
  tags.first == hashtag[1..-1]
end
160
+
161
# Validates +url+ piece by piece against the patterns in Twitter::TwitterText::Regex.
#
# url              - the candidate URL; nil or empty returns false.
# unicode_domains  - when truthy the authority is checked against
#                    :validate_url_unicode_authority, otherwise :validate_url_authority.
# require_protocol - when truthy the scheme must be present and be http or https.
def valid_url?(url, unicode_domains=true, require_protocol=true)
  return false if !url || url.empty?

  patterns = Twitter::TwitterText::Regex
  url_parts = url.match(patterns[:validate_url_unencoded])
  # The pattern has to consume the whole input.
  return false unless url_parts && url_parts.to_s == url

  scheme, authority, path, query, fragment = url_parts.captures

  if require_protocol
    return false unless valid_match?(scheme, patterns[:validate_url_scheme]) && scheme.match(/\Ahttps?\Z/i)
  end
  return false unless valid_match?(path, patterns[:validate_url_path])
  # Query and fragment are optional: a missing one passes.
  return false unless valid_match?(query, patterns[:validate_url_query], true)
  return false unless valid_match?(fragment, patterns[:validate_url_fragment], true)

  if unicode_domains
    valid_match?(authority, patterns[:validate_url_unicode_authority]) || false
  else
    valid_match?(authority, patterns[:validate_url_authority])
  end
end
178
+
179
+ # These methods are deprecated, will be removed in future.
180
+ extend Deprecation
181
+
182
+ MAX_LENGTH_LEGACY = 140
183
+
184
+ # DEPRECATED: Please use parse_tweet instead.
185
+ #
186
+ # Returns the length of the string as it would be displayed. This is equivalent to the length of the Unicode NFC
187
+ # (See: http://www.unicode.org/reports/tr15). This is needed in order to consistently calculate the length of a
188
+ # string no matter which actual form was transmitted. For example:
189
+ #
190
+ # U+0065 Latin Small Letter E
191
+ # + U+0301 Combining Acute Accent
192
+ # ----------
193
+ # = 2 bytes, 2 characters, displayed as é (1 visual glyph)
194
+ # … The NFC of {U+0065, U+0301} is {U+00E9}, which is a single character and a +display_length+ of 1
195
+ #
196
+ # The string could also contain U+00E9 already, in which case the canonicalization will not change the value.
197
+ #
198
def tweet_length(text, options = {})
  settings = DEFAULT_TCO_URL_LENGTHS.merge(options)

  # Start from the number of code points in the NFC form of the text.
  total = text.to_nfc.unpack("U*").length

  # Swap each URL's real length for the configured short URL length.
  Twitter::TwitterText::Extractor.extract_urls_with_indices(text) do |url, first_index, last_index|
    total += first_index - last_index
    total += settings[:short_url_length] unless url.length.zero?
  end

  total
end
210
+ deprecate :tweet_length, :parse_tweet
211
+
212
+ # DEPRECATED: Please use parse_tweet instead.
213
+ #
214
+ # Check the <tt>text</tt> for any reason that it may not be valid as a Tweet. This is meant as a pre-validation
215
+ # before posting to api.twitter.com. There are several server-side reasons for Tweets to fail but this pre-validation
216
+ # will allow quicker feedback.
217
+ #
218
+ # Returns <tt>false</tt> if this <tt>text</tt> is valid. Otherwise one of the following Symbols will be returned:
219
+ #
220
+ # <tt>:too_long</tt>:: if the <tt>text</tt> is too long
221
+ # <tt>:empty</tt>:: if the <tt>text</tt> is nil or empty
222
+ # <tt>:invalid_characters</tt>:: if the <tt>text</tt> contains non-Unicode or any of the disallowed Unicode characters
223
def tweet_invalid?(text)
  return :empty if !text || text.empty?

  begin
    if tweet_length(text) > MAX_LENGTH_LEGACY
      :too_long
    elsif Twitter::TwitterText::Regex::INVALID_CHARACTERS.any? { |invalid_char| text.include?(invalid_char) }
      :invalid_characters
    else
      false
    end
  rescue ArgumentError
    # non-Unicode value.
    :invalid_characters
  end
end
235
+ deprecate :tweet_invalid?, :parse_tweet
236
+
237
# Boolean inverse of tweet_invalid?: true only when no failure reason is reported.
def valid_tweet_text?(text)
  tweet_invalid?(text) ? false : true
end
240
+ deprecate :valid_tweet_text?, :parse_tweet
241
+
242
+ private
243
+
244
# Checks that +regex+ matches +string+ and that the match covers the whole string.
#
# With optional false the result is the raw && chain: nil/false for a missing
# string, nil when nothing matches, otherwise true/false for the full-match test.
# With optional true a missing string passes and the result is always true/false.
def valid_match?(string, regex, optional=false)
  if optional
    return true unless string

    found = string.match(regex)
    !found.nil? && found.to_s == string
  else
    string && (found = string.match(regex)) && found.to_s == string
  end
end
249
+ end
250
+ end
251
+ end
@@ -0,0 +1,24 @@
1
+ # Copyright 2018 Twitter, Inc.
2
+ # Licensed under the Apache License, Version 2.0
3
+ # http://www.apache.org/licenses/LICENSE-2.0
4
+
5
+ # encoding: UTF-8
6
+
7
+ module Twitter
8
+ module TwitterText
9
# An inclusive span of code points together with the weight assigned to it.
class WeightedRange
  REQUIRED_KEYS = [:start, :end, :weight]

  attr_reader :start, :end, :weight

  # range - Hash that must carry Integer values under :start, :end and :weight.
  #
  # Raises ArgumentError ("Invalid range") when a key is missing or not an Integer.
  def initialize(range = {})
    well_formed = REQUIRED_KEYS.all? do |key|
      range.key?(key) && range[key].is_a?(Integer)
    end
    raise ArgumentError, "Invalid range" unless well_formed

    @start, @end, @weight = range[:start], range[:end], range[:weight]
  end

  # Returns true when +code_point+ lies between start and end, both inclusive.
  def contains?(code_point)
    code_point.between?(@start, @end)
  end
end
23
+ end
24
+ end
@@ -0,0 +1,29 @@
1
+ # Copyright 2018 Twitter, Inc.
2
+ # Licensed under the Apache License, Version 2.0
3
+ # http://www.apache.org/licenses/LICENSE-2.0
4
+
5
# Work out whether the interpreter is older than 1.9 and record the answer in
# $RUBY_1_9 (true on 1.9 and later, false on anything older).
ruby_major, ruby_minor = RUBY_VERSION.split('.')
pre_1_9 = ruby_major.to_i == 1 && ruby_minor.to_i < 9

# Ruby 1.8 KCODE check. Not needed on 1.9 and later.
if pre_1_9 && $KCODE[0].chr !~ /u/i
  raise("twitter-text requires the $KCODE variable be set to 'UTF8' or 'u'")
end

$RUBY_1_9 = !pre_1_9
14
+
15
# Load every twitter-text component, one require per entry, in the order listed.
[
  'deprecation',
  'emoji_regex',
  'regex',
  'rewriter',
  'autolink',
  'extractor',
  'unicode',
  'weighted_range',
  'configuration',
  'validation',
  'hit_highlighter'
].each do |component|
  require File.join('twitter-text', component)
end
data/script/destroy ADDED
@@ -0,0 +1,14 @@
1
#!/usr/bin/env ruby
# Runs the RubiGen destroy script with the command-line arguments.

# Parent directory of the one holding this script.
APP_ROOT = File.expand_path('..', File.dirname(__FILE__))

# Try a plain require first; fall back to loading rubygems and retrying.
begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/destroy'

# Drop a leading help flag before handing the arguments over.
ARGV.shift if %w[--help -h].include?(ARGV[0])
RubiGen::Base.use_component_sources!([:newgem_simple, :test_unit])
RubiGen::Scripts::Destroy.new.run(ARGV)
data/script/generate ADDED
@@ -0,0 +1,14 @@
1
#!/usr/bin/env ruby
# Runs the RubiGen generate script with the command-line arguments.

# Parent directory of the one holding this script.
APP_ROOT = File.expand_path('..', File.dirname(__FILE__))

# Try a plain require first; fall back to loading rubygems and retrying.
begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/generate'

# Drop a leading help flag before handing the arguments over.
ARGV.shift if %w[--help -h].include?(ARGV[0])
RubiGen::Base.use_component_sources!([:newgem_simple, :test_unit])
RubiGen::Scripts::Generate.new.run(ARGV)