twitter-text-kow 1.3.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,251 @@
1
+ # Copyright 2018 Twitter, Inc.
2
+ # Licensed under the Apache License, Version 2.0
3
+ # http://www.apache.org/licenses/LICENSE-2.0
4
+
5
+ require 'unf'
6
+
7
+ module Twitter
8
+ module TwitterText
9
+ module Validation extend self
10
# Default option values merged into the caller-supplied options by
# parse_tweet and tweet_length; :short_url_length is the length that
# tweet_length charges for each extracted URL.
DEFAULT_TCO_URL_LENGTHS = { :short_url_length => 23 }
13
+
14
+ # :weighted_length the weighted length of tweet based on weights specified in the config
15
+ # :valid If tweet is valid
16
+ # :permillage permillage of the tweet over the max length specified in config
17
+ # :valid_range_start beginning of valid text
18
+ # :valid_range_end End index of valid part of the tweet text (inclusive)
19
+ # :display_range_start beginning index of display text
20
+ # :display_range_end end index of display text (inclusive)
21
# Hash subclass holding the outcome of parsing a tweet. Only the keys
# listed in RESULT_PARAMS are copied from the constructor argument; any
# other key is ignored, and a key that was never supplied reads as nil.
class ParseResults < Hash

  # The only keys a ParseResults instance will store.
  RESULT_PARAMS = [:weighted_length, :valid, :permillage, :valid_range_start, :valid_range_end, :display_range_start, :display_range_end]

  # Returns a ParseResults with every numeric field set to 0 and
  # :valid set to true.
  def self.empty
    ParseResults.new(weighted_length: 0, permillage: 0, valid: true, display_range_start: 0, display_range_end: 0, valid_range_start: 0, valid_range_end: 0)
  end

  # params - Hash whose RESULT_PARAMS entries are copied into the new object.
  def initialize(params = {})
    # Explicit empty parentheses matter: a bare `super` would forward
    # `params` to Hash#initialize, which treats its argument as the default
    # value returned for missing keys.
    super()
    RESULT_PARAMS.each do |key|
      self[key] = params[key] if params.key?(key)
    end
  end
end
35
+
36
+ # Parse input text and return hash with descriptive parameters populated.
37
# Measures +text+ against the weighting rules of a configuration object.
#
# text    - the tweet text; it is normalized with +to_nfc+ before measuring.
# options - Hash; +:config+ supplies the configuration, otherwise
#           Twitter::TwitterText::Configuration.default_configuration is used.
#
# Returns a ParseResults with :weighted_length, :permillage, :valid and the
# inclusive :display_range_* / :valid_range_* indices. Empty text is measured
# like any other input: it yields weighted_length 0, valid false and range
# ends of -1.
def parse_tweet(text, options = {})
  options = DEFAULT_TCO_URL_LENGTHS.merge(options)
  config = options[:config] || Twitter::TwitterText::Configuration.default_configuration
  normalized_text = text.to_nfc
  # NOTE(review): this method used to build ParseResults.empty here for empty
  # text without returning it, so the value was discarded. That dead statement
  # is removed; an early return was not added because ParseResults.empty
  # reports valid: true, which contradicts the zero-length check below.

  scale = config.scale
  max_weighted_tweet_length = config.max_weighted_tweet_length
  scaled_max_weighted_tweet_length = max_weighted_tweet_length * scale
  transformed_url_length = config.transformed_url_length * scale
  ranges = config.ranges

  url_entities = Twitter::TwitterText::Extractor.extract_urls_with_indices(normalized_text)
  emoji_entities = config.emoji_parsing_enabled ? Twitter::TwitterText::Extractor.extract_emoji_with_indices(normalized_text) : []

  # The text is never mutated below, so its length is computed once rather
  # than on every loop pass.
  text_length = normalized_text.codepoint_length

  has_invalid_chars = false
  weighted_count = 0
  offset = 0
  display_offset = 0
  valid_offset = 0

  while offset < text_length
    # Reset the default char weight each pass through the loop
    char_weight = config.default_weight
    entity_length = 0

    # A URL starting at the current offset is charged the transformed URL
    # length, whatever its real length, and is skipped as a unit.
    url_entity = url_entities.find { |entity| entity[:indices].first == offset }
    if url_entity
      entity_length = url_entity[:indices].last - url_entity[:indices].first
      weighted_count += transformed_url_length
      offset += entity_length
      display_offset += entity_length
      valid_offset += entity_length if weighted_count <= scaled_max_weighted_tweet_length
    end

    # An emoji starting at the (possibly advanced) offset is charged the
    # default weight once and is skipped as a unit.
    emoji_entity = emoji_entities.find { |entity| entity[:indices].first == offset }
    if emoji_entity
      entity_length = emoji_entity[:indices].last - emoji_entity[:indices].first
      weighted_count += char_weight # the default weight
      offset += entity_length
      display_offset += entity_length
      valid_offset += entity_length if weighted_count <= scaled_max_weighted_tweet_length
    end

    next if entity_length > 0

    if offset < text_length
      code_point = normalized_text[offset]

      # The first configured range containing the code point sets its weight.
      matching_range = ranges.find { |range| range.contains?(code_point.unpack("U").first) }
      char_weight = matching_range.weight if matching_range

      weighted_count += char_weight

      # Once an invalid character has been seen the flag stays set.
      has_invalid_chars = contains_invalid?(code_point) unless has_invalid_chars
      codepoint_length = code_point.codepoint_length
      offset += codepoint_length
      display_offset += codepoint_length

      if !has_invalid_chars && (weighted_count <= scaled_max_weighted_tweet_length)
        valid_offset += codepoint_length
      end
    end
  end

  # Difference in code point count between the original and normalized text;
  # it is added to both reported range ends.
  normalized_text_offset = text.codepoint_length - text_length
  scaled_weighted_length = weighted_count / scale
  is_valid = !has_invalid_chars && (scaled_weighted_length <= max_weighted_tweet_length) && (scaled_weighted_length != 0)
  permillage = scaled_weighted_length * 1000 / max_weighted_tweet_length

  ParseResults.new(
    weighted_length: scaled_weighted_length,
    permillage: permillage,
    valid: is_valid,
    display_range_start: 0,
    display_range_end: (display_offset + normalized_text_offset - 1),
    valid_range_start: 0,
    valid_range_end: (valid_offset + normalized_text_offset - 1)
  )
end
126
+
127
# Reports whether +text+ includes any entry of
# Twitter::TwitterText::Regex::INVALID_CHARACTERS.
#
# Returns false for nil or empty text, and true when the check itself
# raises ArgumentError (non-Unicode value).
def contains_invalid?(text)
  return false unless text && !text.empty?

  begin
    Twitter::TwitterText::Regex::INVALID_CHARACTERS.any? { |forbidden| text.include?(forbidden) }
  rescue ArgumentError
    # non-Unicode value.
    true
  end
end
137
+
138
# True when +username+ yields exactly one extracted screen name and that
# name equals the input with its first character (the @ sign) removed.
# Returns false for nil or empty input.
def valid_username?(username)
  return false unless username && !username.empty?

  mentions = Twitter::TwitterText::Extractor.extract_mentioned_screen_names(username)
  return false unless mentions.size == 1

  # The extractor returns the name without the @ sign, hence the [1..-1]
  mentions.first == username[1..-1]
end
145
+
146
+ VALID_LIST_RE = /\A#{Twitter::TwitterText::Regex[:valid_mention_or_list]}\z/o
147
# True when +username_list+ matches VALID_LIST_RE from start to end with
# nothing captured before the mention (group 1 empty) and a non-empty
# capture in group 4.
#
# Returns false for nil or empty input, like the other validators here,
# instead of raising NoMethodError on nil.
def valid_list?(username_list)
  return false if !username_list || username_list.empty?

  match = username_list.match(VALID_LIST_RE)
  # Must have matched and had nothing before or after
  !!(match && match[1] == "" && match[4] && !match[4].empty?)
end
152
+
153
# True when +hashtag+ yields exactly one extracted hashtag and that tag
# equals the input with its first character (the # sign) removed.
# Returns false for nil or empty input.
def valid_hashtag?(hashtag)
  return false unless hashtag && !hashtag.empty?

  tags = Twitter::TwitterText::Extractor.extract_hashtags(hashtag)
  return false unless tags.size == 1

  # The extractor returns the tag without the # sign, hence the [1..-1]
  tags.first == hashtag[1..-1]
end
160
+
161
# Validates +url+ piece by piece.
#
# url              - candidate URL string; nil or empty is rejected.
# unicode_domains  - when truthy the authority is checked against the
#                    :validate_url_unicode_authority pattern, otherwise
#                    against :validate_url_authority.
# require_protocol - when truthy the scheme must be present, well formed
#                    and be http or https (case-insensitive).
#
# The whole string must be consumed by :validate_url_unencoded; path is
# mandatory, query and fragment are checked only when present.
def valid_url?(url, unicode_domains=true, require_protocol=true)
  return false if !url || url.empty?

  parsed = url.match(Twitter::TwitterText::Regex[:validate_url_unencoded])
  return false unless parsed && parsed.to_s == url

  scheme, authority, path, query, fragment = parsed.captures

  if require_protocol
    scheme_ok = valid_match?(scheme, Twitter::TwitterText::Regex[:validate_url_scheme]) && scheme.match(/\Ahttps?\Z/i)
    return false unless scheme_ok
  end

  return false unless valid_match?(path, Twitter::TwitterText::Regex[:validate_url_path])
  return false unless valid_match?(query, Twitter::TwitterText::Regex[:validate_url_query], true)
  return false unless valid_match?(fragment, Twitter::TwitterText::Regex[:validate_url_fragment], true)

  if unicode_domains
    valid_match?(authority, Twitter::TwitterText::Regex[:validate_url_unicode_authority]) || false
  else
    valid_match?(authority, Twitter::TwitterText::Regex[:validate_url_authority])
  end
end
178
+
179
+ # These methods are deprecated, will be removed in future.
180
+ extend Deprecation
181
+
182
+ MAX_LENGTH_LEGACY = 140
183
+
184
+ # DEPRECATED: Please use parse_tweet instead.
185
+ #
186
+ # Returns the length of the string as it would be displayed. This is equivalent to the length of the Unicode NFC
187
+ # (See: http://www.unicode.org/reports/tr15). This is needed in order to consistently calculate the length of a
188
+ # string no matter which actual form was transmitted. For example:
189
+ #
190
+ # U+0065 Latin Small Letter E
191
+ # + U+0301 Combining Acute Accent
192
+ # ----------
193
+ # = 2 bytes, 2 characters, displayed as é (1 visual glyph)
194
+ # … The NFC of {U+0065, U+0301} is {U+00E9}, which is a single character and a +display_length+ of 1
195
+ #
196
+ # The string could also contain U+00E9 already, in which case the canonicalization will not change the value.
197
+ #
198
# Counts the code points of the NFC-normalized +text+, then, for every URL
# the extractor yields, swaps the URL's own span for
# options[:short_url_length] (defaulting to DEFAULT_TCO_URL_LENGTHS).
#
# Returns the adjusted length as an Integer.
def tweet_length(text, options = {})
  settings = DEFAULT_TCO_URL_LENGTHS.merge(options)

  total = text.to_nfc.unpack("U*").length

  Twitter::TwitterText::Extractor.extract_urls_with_indices(text) do |url, start_position, end_position|
    # Remove the URL's real span, then charge the shortened length.
    total += start_position - end_position
    total += settings[:short_url_length] if url.length > 0
  end

  total
end
210
+ deprecate :tweet_length, :parse_tweet
211
+
212
+ # DEPRECATED: Please use parse_tweet instead.
213
+ #
214
+ # Check the <tt>text</tt> for any reason that it may not be valid as a Tweet. This is meant as a pre-validation
215
+ # before posting to api.twitter.com. There are several server-side reasons for Tweets to fail but this pre-validation
216
+ # will allow quicker feedback.
217
+ #
218
+ # Returns <tt>false</tt> if this <tt>text</tt> is valid. Otherwise one of the following Symbols will be returned:
219
+ #
220
+ # <tt>:too_long</tt>:: if the <tt>text</tt> is too long
221
+ # <tt>:empty</tt>:: if the <tt>text</tt> is nil or empty
222
+ # <tt>:invalid_characters</tt>:: if the <tt>text</tt> contains non-Unicode or any of the disallowed Unicode characters
223
# Returns false when +text+ passes every check, otherwise a Symbol naming
# the first failure: :empty (nil or empty text), :too_long (tweet_length
# exceeds MAX_LENGTH_LEGACY) or :invalid_characters (a disallowed character
# is present, or the checks raised ArgumentError).
def tweet_invalid?(text)
  return :empty unless text && !text.empty?

  begin
    if tweet_length(text) > MAX_LENGTH_LEGACY
      :too_long
    elsif Twitter::TwitterText::Regex::INVALID_CHARACTERS.any? { |invalid_char| text.include?(invalid_char) }
      :invalid_characters
    else
      false
    end
  rescue ArgumentError
    # non-Unicode value.
    :invalid_characters
  end
end
235
+ deprecate :tweet_invalid?, :parse_tweet
236
+
237
# Boolean inverse of tweet_invalid?: true when no problem is reported for
# +text+, false when any failure Symbol comes back.
def valid_tweet_text?(text)
  problem = tweet_invalid?(text)
  problem ? false : true
end
240
+ deprecate :valid_tweet_text?, :parse_tweet
241
+
242
+ private
243
+
244
# Checks that +regex+ matches +string+ and that the match is the whole string.
#
# string   - value to test; may be nil.
# regex    - pattern applied with String#match.
# optional - when truthy, a missing string counts as valid and the result
#            is always true or false; when falsy, a missing string or a
#            failed match yields a falsy value (the string itself or nil).
def valid_match?(string, regex, optional=false)
  if optional
    return true unless string

    found = string.match(regex)
    return found ? found.to_s == string : false
  end

  return string unless string

  found = string.match(regex)
  found && found.to_s == string
end
249
+ end
250
+ end
251
+ end
@@ -0,0 +1,24 @@
1
+ # Copyright 2018 Twitter, Inc.
2
+ # Licensed under the Apache License, Version 2.0
3
+ # http://www.apache.org/licenses/LICENSE-2.0
4
+
5
+ # encoding: UTF-8
6
+
7
+ module Twitter
8
+ module TwitterText
9
# An inclusive range of code points paired with a weight.
class WeightedRange
  attr_reader :start, :end, :weight

  # range - Hash that must hold Integer values under :start, :end and :weight.
  #
  # Raises ArgumentError when a key is missing or its value is not an Integer.
  def initialize(range = {})
    well_formed = [:start, :end, :weight].all? do |field|
      range.key?(field) && range[field].is_a?(Integer)
    end
    raise ArgumentError.new("Invalid range") unless well_formed

    @start, @end, @weight = range[:start], range[:end], range[:weight]
  end

  # True when +code_point+ lies between start and end, both inclusive.
  def contains?(code_point)
    !(code_point < @start || code_point > @end)
  end
end
23
+ end
24
+ end
@@ -0,0 +1,29 @@
1
+ # Copyright 2018 Twitter, Inc.
2
+ # Licensed under the Apache License, Version 2.0
3
+ # http://www.apache.org/licenses/LICENSE-2.0
4
+
5
major, minor, _patch = RUBY_VERSION.split('.')

# True only on interpreters older than 1.9 (major 1, minor below 9).
pre_1_9 = major.to_i == 1 && minor.to_i < 9

# Ruby 1.8 KCODE check. Not needed on 1.9 and later.
if pre_1_9 && $KCODE[0].chr !~ /u/i
  raise("twitter-text requires the $KCODE variable be set to 'UTF8' or 'u'")
end

# Global flag: true on Ruby 1.9 and later, false on older interpreters.
$RUBY_1_9 = !pre_1_9
14
+
15
# Library components, required one by one in exactly the order listed.
components = %w(
  deprecation
  emoji_regex
  regex
  rewriter
  autolink
  extractor
  unicode
  weighted_range
  configuration
  validation
  hit_highlighter
)

components.each { |component| require "twitter-text/#{component}" }
data/script/destroy ADDED
@@ -0,0 +1,14 @@
1
#!/usr/bin/env ruby
# Runs RubiGen's destroy script with this project's component sources.

# APP_ROOT is the parent of the directory containing this script.
script_dir = File.dirname(__FILE__)
APP_ROOT = File.expand_path(File.join(script_dir, '..'))

# If rubigen cannot be required directly, load RubyGems and try again.
begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/destroy'

# A leading help flag is dropped before the arguments are handed over.
help_flags = ['--help', '-h']
ARGV.shift if help_flags.include?(ARGV[0])
RubiGen::Base.use_component_sources! [:newgem_simple, :test_unit]
RubiGen::Scripts::Destroy.new.run(ARGV)
data/script/generate ADDED
@@ -0,0 +1,14 @@
1
#!/usr/bin/env ruby
# Runs RubiGen's generate script with this project's component sources.

# APP_ROOT is the parent of the directory containing this script.
script_dir = File.dirname(__FILE__)
APP_ROOT = File.expand_path(File.join(script_dir, '..'))

# If rubigen cannot be required directly, load RubyGems and try again.
begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/generate'

# A leading help flag is dropped before the arguments are handed over.
help_flags = ['--help', '-h']
ARGV.shift if help_flags.include?(ARGV[0])
RubiGen::Base.use_component_sources! [:newgem_simple, :test_unit]
RubiGen::Scripts::Generate.new.run(ARGV)