twitter-text 1.14.7 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.rspec +1 -1
- data/README.md +104 -33
- data/lib/assets/tld_lib.yml +1 -0
- data/lib/twitter-text.rb +2 -0
- data/lib/twitter-text/autolink.rb +4 -4
- data/lib/twitter-text/configuration.rb +53 -0
- data/lib/twitter-text/deprecation.rb +1 -1
- data/lib/twitter-text/extractor.rb +31 -1
- data/lib/twitter-text/regex.rb +13 -13
- data/lib/twitter-text/validation.rb +155 -43
- data/lib/twitter-text/weighted_range.rb +18 -0
- data/spec/autolinking_spec.rb +161 -161
- data/spec/configuration_spec.rb +91 -0
- data/spec/extractor_spec.rb +92 -72
- data/spec/hithighlighter_spec.rb +15 -15
- data/spec/regex_spec.rb +7 -7
- data/spec/rewriter_spec.rb +110 -109
- data/spec/spec_helper.rb +13 -15
- data/spec/test_urls.rb +6 -4
- data/spec/twitter_text_spec.rb +2 -2
- data/spec/unicode_spec.rb +10 -10
- data/spec/validation_spec.rb +35 -11
- data/test/conformance_test.rb +14 -0
- data/twitter-text.gemspec +11 -9
- metadata +53 -32
- data/lib/assets/tld_lib.yml +0 -1565
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 92e1f709304c7902186bbe50ff5f7d215059d292a4e8730b9cdff12210dff1aa
|
4
|
+
data.tar.gz: fd50deede86bb5ba1a47ff214350f86a928ed59926438d3361475f3640ff8531
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 85f39c5bd4d9c58b863d5e9490618ee941a528ab8fd23a463857a206d53ba50a4235cb7e287245a0e3bb66bb78955b98cf6973f1ed5e2ec5741090ef34a77c52
|
7
|
+
data.tar.gz: 6a0133f3acd0a34742435777f4fc276df4639066adc80b39d3a4b84f77ec73eb0772fae0ff52f20e3af752b089ca976de3b215d83241944c2fd0ef8f9823ba85
|
data/.rspec
CHANGED
@@ -1,2 +1,2 @@
|
|
1
1
|
--color
|
2
|
-
--format=
|
2
|
+
--format=documentation
|
data/README.md
CHANGED
@@ -1,16 +1,82 @@
|
|
1
1
|
# twitter-text
|
2
2
|
|
3
|
-
![
|
3
|
+
![](https://img.shields.io/gem/v/twitter-text.svg)
|
4
4
|
|
5
|
-
|
6
|
-
|
7
|
-
|
5
|
+
This is the Ruby implementation of the twitter-text parsing
|
6
|
+
library. The library has methods to parse Tweets and calculate length,
|
7
|
+
validity, parse @mentions, #hashtags, URLs, and more.
|
8
8
|
|
9
|
-
##
|
9
|
+
## Setup
|
10
10
|
|
11
|
+
Installation uses bundler.
|
11
12
|
|
12
|
-
# Extraction
|
13
13
|
```
|
14
|
+
% gem install bundler
|
15
|
+
% bundle install
|
16
|
+
```
|
17
|
+
|
18
|
+
## Conformance tests
|
19
|
+
|
20
|
+
To run the Conformance test suite from the command line via rake:
|
21
|
+
|
22
|
+
```
|
23
|
+
% rake test:conformance:run
|
24
|
+
```
|
25
|
+
|
26
|
+
You can also run the rspec tests in the `spec` directory:
|
27
|
+
|
28
|
+
```
|
29
|
+
% rspec spec
|
30
|
+
```
|
31
|
+
|
32
|
+
# Length validation
|
33
|
+
|
34
|
+
twitter-text 2.0 introduces configuration files that define how Tweets
|
35
|
+
are parsed for length. This allows for backwards compatibility and
|
36
|
+
flexibility going forward. Old-style traditional 140-character parsing
|
37
|
+
is defined by the v1.json configuration file, whereas v2.json is
|
38
|
+
updated for "weighted" Tweets where ranges of Unicode code points can
|
39
|
+
have independent weights aside from the default weight. The sum of all
|
40
|
+
code points, each weighted appropriately, should not exceed the max
|
41
|
+
weighted length.
|
42
|
+
|
43
|
+
Some old methods from twitter-text 1.0 have been marked deprecated,
|
44
|
+
such as the `tweet_length()` method. The new API is based on the
|
45
|
+
following method, `parse_tweet()`
|
46
|
+
|
47
|
+
```ruby
|
48
|
+
def parse_tweet(text, options = {}) { ... }
|
49
|
+
```
|
50
|
+
|
51
|
+
This method takes a string as input and returns a results object that
|
52
|
+
contains information about the
|
53
|
+
string. The `Twitter::Validation::ParseResults` object includes:
|
54
|
+
|
55
|
+
* `:weighted_length`: the overall length of the tweet with code points
|
56
|
+
weighted per the ranges defined in the configuration file.
|
57
|
+
|
58
|
+
* `:permillage`: indicates the proportion (per thousand) of the weighted
|
59
|
+
length in comparison to the max weighted length. A value > 1000
|
60
|
+
indicates input text that is longer than the allowable maximum.
|
61
|
+
|
62
|
+
* `:valid`: indicates if input text length corresponds to a valid
|
63
|
+
result.
|
64
|
+
|
65
|
+
* `:display_range_start, :display_range_end`: An array of two unicode code point
|
66
|
+
indices identifying the inclusive start and inclusive end of the
|
67
|
+
displayable content of the Tweet. For more information, see
|
68
|
+
the description of `display_text_range` here:
|
69
|
+
[Tweet updates](https://developer.twitter.com/en/docs/tweets/tweet-updates)
|
70
|
+
|
71
|
+
* `:valid_range_start, :valid_range_end`: An array of two unicode code point
|
72
|
+
indices identifying the inclusive start and inclusive end of the valid
|
73
|
+
content of the Tweet. For more information on the extended Tweet
|
74
|
+
payload see [Tweet updates](https://developer.twitter.com/en/docs/tweets/tweet-updates)
|
75
|
+
|
76
|
+
## Extraction Examples
|
77
|
+
|
78
|
+
# Extraction
|
79
|
+
```ruby
|
14
80
|
class MyClass
|
15
81
|
include Twitter::Extractor
|
16
82
|
usernames = extract_mentioned_screen_names("Mentioning @twitter and @jack")
|
@@ -18,9 +84,9 @@ class MyClass
|
|
18
84
|
end
|
19
85
|
```
|
20
86
|
|
21
|
-
|
22
|
-
```ruby
|
87
|
+
### Extraction with a block argument
|
23
88
|
|
89
|
+
```ruby
|
24
90
|
class MyClass
|
25
91
|
include Twitter::Extractor
|
26
92
|
extract_reply_screen_name("@twitter are you hiring?").do |username|
|
@@ -31,8 +97,9 @@ end
|
|
31
97
|
|
32
98
|
## Auto-linking Examples
|
33
99
|
|
34
|
-
|
35
|
-
|
100
|
+
### Auto-link
|
101
|
+
|
102
|
+
```ruby
|
36
103
|
class MyClass
|
37
104
|
include Twitter::Autolink
|
38
105
|
|
@@ -40,14 +107,14 @@ class MyClass
|
|
40
107
|
end
|
41
108
|
```
|
42
109
|
|
43
|
-
|
44
|
-
```
|
110
|
+
### For Ruby on Rails you want to add this to app/helpers/application_helper.rb
|
111
|
+
```ruby
|
45
112
|
module ApplicationHelper
|
46
113
|
include Twitter::Autolink
|
47
114
|
end
|
48
115
|
```
|
49
116
|
|
50
|
-
|
117
|
+
### Now the auto_link function is available in every view. So in index.html.erb:
|
51
118
|
```ruby
|
52
119
|
<%= auto_link("link @user, please #request") %>
|
53
120
|
```
|
@@ -90,33 +157,37 @@ words should work equally well.
|
|
90
157
|
Use to provide emphasis around the "hits" returned from the Search API, built
|
91
158
|
to work against text that has been auto-linked already.
|
92
159
|
|
93
|
-
|
160
|
+
## Issues
|
94
161
|
|
95
|
-
|
96
|
-
patches. Patches courtesy of:
|
162
|
+
Have a bug? Please create an issue here on GitHub!
|
97
163
|
|
98
|
-
|
99
|
-
* Matt Sanford - http://github.com/mzsanford
|
100
|
-
* Raffi Krikorian - http://github.com/r
|
101
|
-
* Ben Cherry - http://github.com/bcherry
|
102
|
-
* Patrick Ewing - http://github.com/hoverbird
|
103
|
-
* Jeff Smick - http://github.com/sprsquish
|
104
|
-
* Kenneth Kufluk - https://github.com/kennethkufluk
|
105
|
-
* Keita Fujii - https://github.com/keitaf
|
106
|
-
* Yoshimasa Niwa - https://github.com/niw
|
164
|
+
<https://github.com/twitter/twitter-text/issues>
|
107
165
|
|
166
|
+
## Authors
|
108
167
|
|
109
|
-
|
110
|
-
* Jean-Philippe Bougie - http://github.com/jpbougie
|
111
|
-
* Erik Michaels-Ober - https://github.com/sferik
|
168
|
+
### V2.0
|
112
169
|
|
170
|
+
* David LaMacchia (<https://github.com/dlamacchia>)
|
171
|
+
* Yoshimasa Niwa (<https://github.com/niw>)
|
172
|
+
* Sudheer Guntupalli (<https://github.com/sudhee>)
|
173
|
+
* Kaushik Lakshmikanth (<https://github.com/kaushlakers>)
|
174
|
+
* Jose Antonio Marquez Russo (<https://github.com/joseeight>)
|
175
|
+
* Lee Adams (<https://github.com/leeaustinadams>)
|
113
176
|
|
114
|
-
|
177
|
+
### Previous authors
|
115
178
|
|
179
|
+
* Matt Sanford (<http://github.com/mzsanford>)
|
180
|
+
* Raffi Krikorian (<http://github.com/r>)
|
181
|
+
* Ben Cherry (<http://github.com/bcherry>)
|
182
|
+
* Patrick Ewing (<http://github.com/hoverbird>)
|
183
|
+
* Jeff Smick (<http://github.com/sprsquish>)
|
184
|
+
* Kenneth Kufluk (<https://github.com/kennethkufluk>)
|
185
|
+
* Keita Fujii (<https://github.com/keitaf>)
|
186
|
+
* Jean-Philippe Bougie (<http://github.com/jpbougie>)
|
187
|
+
* Erik Michaels-Ober (<https://github.com/sferik>)
|
116
188
|
|
117
|
-
|
189
|
+
## License
|
118
190
|
|
119
|
-
|
191
|
+
Copyright 2012-2017 Twitter, Inc and other contributors
|
120
192
|
|
121
|
-
Licensed under the Apache License, Version 2.0
|
122
|
-
http://www.apache.org/licenses/LICENSE-2.0
|
193
|
+
Licensed under the [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0)
|
@@ -0,0 +1 @@
|
|
1
|
+
lib/assets/../../../conformance/tld_lib.yml
|
data/lib/twitter-text.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# encoding:
|
1
|
+
# encoding: utf-8
|
2
2
|
|
3
3
|
require 'set'
|
4
4
|
require 'twitter-text/hash_helper'
|
@@ -21,9 +21,9 @@ module Twitter
|
|
21
21
|
# Default URL base for auto-linked lists
|
22
22
|
DEFAULT_LIST_URL_BASE = "https://twitter.com/".freeze
|
23
23
|
# Default URL base for auto-linked hashtags
|
24
|
-
DEFAULT_HASHTAG_URL_BASE = "https://twitter.com
|
24
|
+
DEFAULT_HASHTAG_URL_BASE = "https://twitter.com/search?q=%23".freeze
|
25
25
|
# Default URL base for auto-linked cashtags
|
26
|
-
DEFAULT_CASHTAG_URL_BASE = "https://twitter.com
|
26
|
+
DEFAULT_CASHTAG_URL_BASE = "https://twitter.com/search?q=%24".freeze
|
27
27
|
|
28
28
|
# Default attributes for invisible span tag
|
29
29
|
DEFAULT_INVISIBLE_TAG_ATTRS = "style='position:absolute;left:-9999px;'".freeze
|
@@ -286,7 +286,7 @@ module Twitter
|
|
286
286
|
# wrap the ellipses in a tco-ellipsis class and provide an onCopy handler that sets display:none on
|
287
287
|
# everything with the tco-ellipsis class.
|
288
288
|
#
|
289
|
-
# Exception: pic.twitter.com images, for which expandedUrl = "https://twitter.com
|
289
|
+
# Exception: pic.twitter.com images, for which expandedUrl = "https://twitter.com/username/status/1234/photo/1
|
290
290
|
# For those URLs, display_url is not a substring of expanded_url, so we don't do anything special to render the elided parts.
|
291
291
|
# For a pic.twitter.com URL, the only elided part will be the "https://", so this is fine.
|
292
292
|
display_url_sans_ellipses = display_url.gsub("…", "")
|
@@ -0,0 +1,53 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Twitter
|
4
|
+
class Configuration
|
5
|
+
require 'json'
|
6
|
+
|
7
|
+
PARSER_VERSION_CLASSIC = "v1"
|
8
|
+
PARSER_VERSION_DEFAULT = "v2"
|
9
|
+
|
10
|
+
class << self
|
11
|
+
attr_accessor :default_configuration
|
12
|
+
end
|
13
|
+
|
14
|
+
attr_reader :version, :max_weighted_tweet_length, :scale
|
15
|
+
attr_reader :default_weight, :transformed_url_length, :ranges
|
16
|
+
|
17
|
+
CONFIG_V1 = File.join(
|
18
|
+
File.expand_path('../../../../config', __FILE__), # project root
|
19
|
+
"#{PARSER_VERSION_CLASSIC}.json"
|
20
|
+
)
|
21
|
+
|
22
|
+
CONFIG_V2 = File.join(
|
23
|
+
File.expand_path('../../../../config', __FILE__), # project root
|
24
|
+
"#{PARSER_VERSION_DEFAULT}.json"
|
25
|
+
)
|
26
|
+
|
27
|
+
def self.parse_string(string, options = {})
|
28
|
+
JSON.parse(string, options.merge(symbolize_names: true))
|
29
|
+
end
|
30
|
+
|
31
|
+
def self.parse_file(filename)
|
32
|
+
string = File.open(filename, 'rb') { |f| f.read }
|
33
|
+
parse_string(string)
|
34
|
+
end
|
35
|
+
|
36
|
+
def self.configuration_from_file(filename)
|
37
|
+
config = parse_file(filename)
|
38
|
+
config ? Twitter::Configuration.new(config) : nil
|
39
|
+
end
|
40
|
+
|
41
|
+
def initialize(config = {})
|
42
|
+
@version = config[:version]
|
43
|
+
@max_weighted_tweet_length = config[:maxWeightedTweetLength]
|
44
|
+
@scale = config[:scale]
|
45
|
+
@default_weight = config[:defaultWeight]
|
46
|
+
@transformed_url_length = config[:transformedURLLength]
|
47
|
+
@ranges = config[:ranges].map { |range| Twitter::WeightedRange.new(range) } if config.key?(:ranges) && config[:ranges].is_a?(Array)
|
48
|
+
end
|
49
|
+
|
50
|
+
self.default_configuration = Twitter::Configuration.configuration_from_file(Twitter::Configuration::CONFIG_V2)
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
@@ -1,4 +1,5 @@
|
|
1
|
-
# encoding:
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'idn'
|
2
3
|
|
3
4
|
class String
|
4
5
|
# Helper function to count the character length by first converting to an
|
@@ -47,6 +48,15 @@ module Twitter
|
|
47
48
|
# A module for including Tweet parsing in a class. This module provides function for the extraction and processing
|
48
49
|
# of usernames, lists, URLs and hashtags.
|
49
50
|
module Extractor extend self
|
51
|
+
|
52
|
+
# Maximum URL length as defined by Twitter's backend.
|
53
|
+
MAX_URL_LENGTH = 4096
|
54
|
+
|
55
|
+
# The maximum t.co path length that the Twitter backend supports.
|
56
|
+
MAX_TCO_SLUG_LENGTH = 40
|
57
|
+
|
58
|
+
URL_PROTOCOL_LENGTH = "https://".length
|
59
|
+
|
50
60
|
# Remove overlapping entities.
|
51
61
|
# This returns a new array with no overlapping entities.
|
52
62
|
def remove_overlapping_entities(entities)
|
@@ -201,6 +211,7 @@ module Twitter
|
|
201
211
|
next if !options[:extract_url_without_protocol] || before =~ Twitter::Regex[:invalid_url_without_protocol_preceding_chars]
|
202
212
|
last_url = nil
|
203
213
|
domain.scan(Twitter::Regex[:valid_ascii_domain]) do |ascii_domain|
|
214
|
+
next unless is_valid_domain(url.length, ascii_domain, protocol)
|
204
215
|
last_url = {
|
205
216
|
:url => ascii_domain,
|
206
217
|
:indices => [start_position + $~.char_begin(0),
|
@@ -225,9 +236,13 @@ module Twitter
|
|
225
236
|
else
|
226
237
|
# In the case of t.co URLs, don't allow additional path characters
|
227
238
|
if url =~ Twitter::Regex[:valid_tco_url]
|
239
|
+
next if $1 && $1.length > MAX_TCO_SLUG_LENGTH
|
228
240
|
url = $&
|
229
241
|
end_position = start_position + url.char_length
|
230
242
|
end
|
243
|
+
|
244
|
+
next unless is_valid_domain(url.length, domain, protocol)
|
245
|
+
|
231
246
|
urls << {
|
232
247
|
:url => url,
|
233
248
|
:indices => [start_position, end_position]
|
@@ -324,5 +339,20 @@ module Twitter
|
|
324
339
|
tags.each{|tag| yield tag[:cashtag], tag[:indices].first, tag[:indices].last} if block_given?
|
325
340
|
tags
|
326
341
|
end
|
342
|
+
|
343
|
+
def is_valid_domain(url_length, domain, protocol)
|
344
|
+
begin
|
345
|
+
raise ArgumentError.new("invalid empty domain") unless domain
|
346
|
+
original_domain_length = domain.length
|
347
|
+
encoded_domain = IDN::Idna.toASCII(domain)
|
348
|
+
updated_domain_length = encoded_domain.length
|
349
|
+
url_length += (updated_domain_length - original_domain_length) if (updated_domain_length > original_domain_length)
|
350
|
+
url_length += URL_PROTOCOL_LENGTH unless protocol
|
351
|
+
url_length <= MAX_URL_LENGTH
|
352
|
+
rescue Exception
|
353
|
+
# On error don't consider this a valid domain.
|
354
|
+
return false
|
355
|
+
end
|
356
|
+
end
|
327
357
|
end
|
328
358
|
end
|
data/lib/twitter-text/regex.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# encoding:
|
1
|
+
# encoding: utf-8
|
2
2
|
|
3
3
|
module Twitter
|
4
4
|
# A collection of regular expressions for parsing Tweet text. The regular expression
|
@@ -62,10 +62,10 @@ module Twitter
|
|
62
62
|
|
63
63
|
major, minor, _patch = RUBY_VERSION.split('.')
|
64
64
|
if major.to_i >= 2 || major.to_i == 1 && minor.to_i >= 9 || (defined?(RUBY_ENGINE) && ["jruby", "rbx"].include?(RUBY_ENGINE))
|
65
|
-
REGEXEN[:list_name] = /[a-
|
65
|
+
REGEXEN[:list_name] = /[a-z][a-z0-9_\-\u0080-\u00ff]{0,24}/i
|
66
66
|
else
|
67
67
|
# This line barfs at compile time in Ruby 1.9, JRuby, or Rubinius.
|
68
|
-
REGEXEN[:list_name] = eval("/[a-
|
68
|
+
REGEXEN[:list_name] = eval("/[a-z][a-z0-9_\\-\x80-\xff]{0,24}/i")
|
69
69
|
end
|
70
70
|
|
71
71
|
# Latin accented characters
|
@@ -148,17 +148,17 @@ module Twitter
|
|
148
148
|
# Used in Extractor for final filtering
|
149
149
|
REGEXEN[:end_hashtag_match] = /\A(?:[##]|:\/\/)/o
|
150
150
|
|
151
|
-
REGEXEN[:valid_mention_preceding_chars] = /(?:[^a-
|
151
|
+
REGEXEN[:valid_mention_preceding_chars] = /(?:[^a-z0-9_!#\$%&*@@]|^|(?:^|[^a-z0-9_+~.-])[rR][tT]:?)/io
|
152
152
|
REGEXEN[:at_signs] = /[@@]/
|
153
153
|
REGEXEN[:valid_mention_or_list] = /
|
154
154
|
(#{REGEXEN[:valid_mention_preceding_chars]}) # $1: Preceeding character
|
155
155
|
(#{REGEXEN[:at_signs]}) # $2: At mark
|
156
|
-
([a-
|
157
|
-
(\/[a-
|
158
|
-
/
|
159
|
-
REGEXEN[:valid_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-
|
156
|
+
([a-z0-9_]{1,20}) # $3: Screen name
|
157
|
+
(\/[a-z][a-zA-Z0-9_\-]{0,24})? # $4: List (optional)
|
158
|
+
/iox
|
159
|
+
REGEXEN[:valid_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-z0-9_]{1,20})/io
|
160
160
|
# Used in Extractor for final filtering
|
161
|
-
REGEXEN[:end_mention_match] = /\A(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/
|
161
|
+
REGEXEN[:end_mention_match] = /\A(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/io
|
162
162
|
|
163
163
|
# URL related hash regex collection
|
164
164
|
REGEXEN[:valid_url_preceding_chars] = /(?:[^A-Z0-9@@$###{INVALID_CHARACTERS.join('')}]|^)/io
|
@@ -196,12 +196,12 @@ module Twitter
|
|
196
196
|
|
197
197
|
# This is used in Extractor
|
198
198
|
REGEXEN[:valid_ascii_domain] = /
|
199
|
-
(?:(?:[
|
199
|
+
(?:(?:[a-z0-9\-_]|#{REGEXEN[:latin_accents]})+\.)+
|
200
200
|
(?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
|
201
201
|
/iox
|
202
202
|
|
203
203
|
# This is used in Extractor for stricter t.co URL extraction
|
204
|
-
REGEXEN[:valid_tco_url] = /^https?:\/\/t\.co\/[a-z0-9]
|
204
|
+
REGEXEN[:valid_tco_url] = /^https?:\/\/t\.co\/([a-z0-9]+)/i
|
205
205
|
|
206
206
|
# This is used in Extractor to filter out unwanted URLs.
|
207
207
|
REGEXEN[:invalid_short_domain] = /\A#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_ccTLD]}\Z/io
|
@@ -209,7 +209,7 @@ module Twitter
|
|
209
209
|
|
210
210
|
REGEXEN[:valid_port_number] = /[0-9]+/
|
211
211
|
|
212
|
-
REGEXEN[:valid_general_url_path_chars] = /[a-z\p{Cyrillic}0-9!\*';:=\+\,\.\$\/%#\[\]
|
212
|
+
REGEXEN[:valid_general_url_path_chars] = /[a-z\p{Cyrillic}0-9!\*';:=\+\,\.\$\/%#\[\]\p{Pd}_~&\|@#{LATIN_ACCENTS}]/io
|
213
213
|
# Allow URL paths to contain up to two nested levels of balanced parens
|
214
214
|
# 1. Used in Wikipedia URLs like /Primer_(film)
|
215
215
|
# 2. Used in IIS sessions like /S(dfd346)/
|
@@ -260,7 +260,7 @@ module Twitter
|
|
260
260
|
REGEXEN[:valid_cashtag] = /(^|#{REGEXEN[:spaces]})(\$)(#{REGEXEN[:cashtag]})(?=$|\s|[#{PUNCTUATION_CHARS}])/i
|
261
261
|
|
262
262
|
# These URL validation pattern strings are based on the ABNF from RFC 3986
|
263
|
-
REGEXEN[:validate_url_unreserved] = /[a-z\p{Cyrillic}0-9
|
263
|
+
REGEXEN[:validate_url_unreserved] = /[a-z\p{Cyrillic}0-9\p{Pd}._~]/i
|
264
264
|
REGEXEN[:validate_url_pct_encoded] = /(?:%[0-9a-f]{2})/i
|
265
265
|
REGEXEN[:validate_url_sub_delims] = /[!$&'()*+,;=]/i
|
266
266
|
REGEXEN[:validate_url_pchar] = /(?:
|
@@ -2,65 +2,114 @@ require 'unf'
|
|
2
2
|
|
3
3
|
module Twitter
|
4
4
|
module Validation extend self
|
5
|
-
MAX_LENGTH = 140
|
6
|
-
|
7
5
|
DEFAULT_TCO_URL_LENGTHS = {
|
8
6
|
:short_url_length => 23,
|
9
|
-
|
10
|
-
:characters_reserved_per_media => 23
|
11
|
-
}.freeze
|
7
|
+
}
|
12
8
|
|
13
|
-
#
|
14
|
-
#
|
15
|
-
#
|
16
|
-
#
|
17
|
-
#
|
18
|
-
#
|
19
|
-
#
|
20
|
-
|
21
|
-
# … The NFC of {U+0065, U+0301} is {U+00E9}, which is a single chracter and a +display_length+ of 1
|
22
|
-
#
|
23
|
-
# The string could also contain U+00E9 already, in which case the canonicalization will not change the value.
|
24
|
-
#
|
25
|
-
def tweet_length(text, options = {})
|
26
|
-
options = DEFAULT_TCO_URL_LENGTHS.merge(options)
|
9
|
+
# :weighted_length the weighted length of tweet based on weights specified in the config
|
10
|
+
# :valid If tweet is valid
|
11
|
+
# :permillage permillage of the tweet over the max length specified in config
|
12
|
+
# :valid_range_start beginning of valid text
|
13
|
+
# :valid_range_end End index of valid part of the tweet text (inclusive)
|
14
|
+
# :display_range_start beginning index of display text
|
15
|
+
# :display_range_end end index of display text (inclusive)
|
16
|
+
class ParseResults < Hash
|
27
17
|
|
28
|
-
|
18
|
+
RESULT_PARAMS = [:weighted_length, :valid, :permillage, :valid_range_start, :valid_range_end, :display_range_start, :display_range_end]
|
29
19
|
|
30
|
-
|
31
|
-
|
32
|
-
length += url.downcase =~ /^https:\/\// ? options[:short_url_length_https] : options[:short_url_length]
|
20
|
+
def self.empty
|
21
|
+
return ParseResults.new(weighted_length: 0, permillage: 0, valid: true, display_range_start: 0, display_range_end: 0, valid_range_start: 0, valid_range_end: 0)
|
33
22
|
end
|
34
23
|
|
35
|
-
|
24
|
+
def initialize(params = {})
|
25
|
+
RESULT_PARAMS.each do |key|
|
26
|
+
super[key] = params[key] if params.key?(key)
|
27
|
+
end
|
28
|
+
end
|
36
29
|
end
|
37
30
|
|
38
|
-
#
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
31
|
+
# Parse input text and return hash with descriptive parameters populated.
|
32
|
+
def parse_tweet(text, options = {})
|
33
|
+
options = DEFAULT_TCO_URL_LENGTHS.merge(options)
|
34
|
+
config = options[:config] || Twitter::Configuration.default_configuration
|
35
|
+
normalized_text = text.to_nfc
|
36
|
+
normalized_text_length = normalized_text.char_length
|
37
|
+
unless (normalized_text_length > 0)
|
38
|
+
ParseResults.empty()
|
39
|
+
end
|
40
|
+
|
41
|
+
scale = config.scale
|
42
|
+
max_weighted_tweet_length = config.max_weighted_tweet_length
|
43
|
+
scaled_max_weighted_tweet_length = max_weighted_tweet_length * scale
|
44
|
+
transformed_url_length = config.transformed_url_length * scale
|
45
|
+
ranges = config.ranges
|
46
|
+
|
47
|
+
url_entities = Twitter::Extractor.extract_urls_with_indices(normalized_text)
|
48
|
+
|
49
|
+
has_invalid_chars = false
|
50
|
+
weighted_count = 0
|
51
|
+
offset = 0
|
52
|
+
display_offset = 0
|
53
|
+
valid_offset = 0
|
54
|
+
|
55
|
+
while offset < normalized_text_length
|
56
|
+
# Reset the default char weight each pass through the loop
|
57
|
+
char_weight = config.default_weight
|
58
|
+
url_entities.each do |url_entity|
|
59
|
+
if url_entity[:indices].first == offset
|
60
|
+
url_length = url_entity[:indices].last - url_entity[:indices].first
|
61
|
+
weighted_count += transformed_url_length
|
62
|
+
offset += url_length
|
63
|
+
display_offset += url_length
|
64
|
+
if weighted_count <= scaled_max_weighted_tweet_length
|
65
|
+
valid_offset += url_length
|
66
|
+
end
|
67
|
+
# Finding a match breaks the loop; order of ranges matters.
|
68
|
+
break
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
if offset < normalized_text_length
|
73
|
+
code_point = normalized_text[offset]
|
74
|
+
|
75
|
+
ranges.each do |range|
|
76
|
+
if range.contains?(code_point.unpack("U").first)
|
77
|
+
char_weight = range.weight
|
78
|
+
break
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
weighted_count += char_weight
|
83
|
+
|
84
|
+
has_invalid_chars = contains_invalid?(normalized_text[offset]) unless has_invalid_chars
|
85
|
+
char_count = code_point.char_length
|
86
|
+
offset += char_count
|
87
|
+
display_offset += char_count
|
88
|
+
|
89
|
+
if !has_invalid_chars && (weighted_count <= scaled_max_weighted_tweet_length)
|
90
|
+
valid_offset += char_count
|
91
|
+
end
|
92
|
+
end
|
93
|
+
end
|
94
|
+
normalized_text_offset = text.char_length - normalized_text.char_length
|
95
|
+
scaled_weighted_length = weighted_count / scale
|
96
|
+
is_valid = !has_invalid_chars && (scaled_weighted_length <= max_weighted_tweet_length)
|
97
|
+
permillage = scaled_weighted_length * 1000 / max_weighted_tweet_length
|
98
|
+
|
99
|
+
return ParseResults.new(weighted_length: scaled_weighted_length, permillage: permillage, valid: is_valid, display_range_start: 0, display_range_end: (display_offset + normalized_text_offset - 1), valid_range_start: 0, valid_range_end: (valid_offset + normalized_text_offset - 1))
|
100
|
+
end
|
101
|
+
|
102
|
+
def contains_invalid?(text)
|
103
|
+
return false if !text || text.empty?
|
49
104
|
begin
|
50
|
-
return
|
51
|
-
return :invalid_characters if Twitter::Regex::INVALID_CHARACTERS.any?{|invalid_char| text.include?(invalid_char) }
|
105
|
+
return true if Twitter::Regex::INVALID_CHARACTERS.any?{|invalid_char| text.include?(invalid_char) }
|
52
106
|
rescue ArgumentError
|
53
107
|
# non-Unicode value.
|
54
|
-
return
|
108
|
+
return true
|
55
109
|
end
|
56
|
-
|
57
110
|
return false
|
58
111
|
end
|
59
112
|
|
60
|
-
def valid_tweet_text?(text)
|
61
|
-
!tweet_invalid?(text)
|
62
|
-
end
|
63
|
-
|
64
113
|
def valid_username?(username)
|
65
114
|
return false if !username || username.empty?
|
66
115
|
|
@@ -102,6 +151,69 @@ module Twitter
|
|
102
151
|
(!unicode_domains && valid_match?(authority, Twitter::Regex[:validate_url_authority]))
|
103
152
|
end
|
104
153
|
|
154
|
+
# These methods are deprecated, will be removed in future.
|
155
|
+
extend Deprecation
|
156
|
+
|
157
|
+
MAX_LENGTH_LEGACY = 140
|
158
|
+
|
159
|
+
# DEPRECATED: Please use parse_tweet instead.
|
160
|
+
#
|
161
|
+
# Returns the length of the string as it would be displayed. This is equivalent to the length of the Unicode NFC
|
162
|
+
# (See: http://www.unicode.org/reports/tr15). This is needed in order to consistently calculate the length of a
|
163
|
+
# string no matter which actual form was transmitted. For example:
|
164
|
+
#
|
165
|
+
# U+0065 Latin Small Letter E
|
166
|
+
# + U+0301 Combining Acute Accent
|
167
|
+
# ----------
|
168
|
+
# = 2 bytes, 2 characters, displayed as é (1 visual glyph)
|
169
|
+
# … The NFC of {U+0065, U+0301} is {U+00E9}, which is a single character and a +display_length+ of 1
|
170
|
+
#
|
171
|
+
# The string could also contain U+00E9 already, in which case the canonicalization will not change the value.
|
172
|
+
#
|
173
|
+
def tweet_length(text, options = {})
|
174
|
+
options = DEFAULT_TCO_URL_LENGTHS.merge(options)
|
175
|
+
|
176
|
+
length = text.to_nfc.unpack("U*").length
|
177
|
+
|
178
|
+
Twitter::Extractor.extract_urls_with_indices(text) do |url, start_position, end_position|
|
179
|
+
length += start_position - end_position
|
180
|
+
length += options[:short_url_length] if url.length > 0
|
181
|
+
end
|
182
|
+
|
183
|
+
length
|
184
|
+
end
|
185
|
+
deprecate :tweet_length, :parse_tweet
|
186
|
+
|
187
|
+
# DEPRECATED: Please use parse_tweet instead.
|
188
|
+
#
|
189
|
+
# Check the <tt>text</tt> for any reason that it may not be valid as a Tweet. This is meant as a pre-validation
|
190
|
+
# before posting to api.twitter.com. There are several server-side reasons for Tweets to fail but this pre-validation
|
191
|
+
# will allow quicker feedback.
|
192
|
+
#
|
193
|
+
# Returns <tt>false</tt> if this <tt>text</tt> is valid. Otherwise one of the following Symbols will be returned:
|
194
|
+
#
|
195
|
+
# <tt>:too_long</tt>:: if the <tt>text</tt> is too long
|
196
|
+
# <tt>:empty</tt>:: if the <tt>text</tt> is nil or empty
|
197
|
+
# <tt>:invalid_characters</tt>:: if the <tt>text</tt> contains non-Unicode or any of the disallowed Unicode characters
|
198
|
+
def tweet_invalid?(text)
|
199
|
+
return :empty if !text || text.empty?
|
200
|
+
begin
|
201
|
+
return :too_long if tweet_length(text) > MAX_LENGTH_LEGACY
|
202
|
+
return :invalid_characters if Twitter::Regex::INVALID_CHARACTERS.any?{|invalid_char| text.include?(invalid_char) }
|
203
|
+
rescue ArgumentError
|
204
|
+
# non-Unicode value.
|
205
|
+
return :invalid_characters
|
206
|
+
end
|
207
|
+
|
208
|
+
return false
|
209
|
+
end
|
210
|
+
deprecate :tweet_invalid?, :parse_tweet
|
211
|
+
|
212
|
+
def valid_tweet_text?(text)
|
213
|
+
!tweet_invalid?(text)
|
214
|
+
end
|
215
|
+
deprecate :valid_tweet_text?, :parse_tweet
|
216
|
+
|
105
217
|
private
|
106
218
|
|
107
219
|
def valid_match?(string, regex, optional=false)
|