twitter-text 2.1.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/CHANGELOG.md +11 -1
- data/README.md +1 -1
- data/Rakefile +1 -1
- data/config/README.md +8 -0
- data/config/v3.json +30 -0
- data/lib/assets/tld_lib.yml +5 -0
- data/lib/twitter-text.rb +5 -0
- data/lib/twitter-text/autolink.rb +4 -0
- data/lib/twitter-text/configuration.rb +17 -3
- data/lib/twitter-text/deprecation.rb +4 -0
- data/lib/twitter-text/emoji_regex.rb +27 -0
- data/lib/twitter-text/extractor.rb +44 -16
- data/lib/twitter-text/hash_helper.rb +4 -0
- data/lib/twitter-text/hit_highlighter.rb +6 -2
- data/lib/twitter-text/regex.rb +28 -16
- data/lib/twitter-text/rewriter.rb +13 -9
- data/lib/twitter-text/unicode.rb +4 -0
- data/lib/twitter-text/validation.rb +39 -15
- data/lib/twitter-text/weighted_range.rb +4 -0
- data/spec/autolinking_spec.rb +4 -0
- data/spec/configuration_spec.rb +45 -0
- data/spec/extractor_spec.rb +4 -0
- data/spec/hithighlighter_spec.rb +4 -0
- data/spec/regex_spec.rb +38 -0
- data/spec/rewriter_spec.rb +4 -0
- data/spec/spec_helper.rb +13 -0
- data/spec/test_urls.rb +9 -5
- data/spec/twitter_text_spec.rb +4 -0
- data/spec/unicode_spec.rb +4 -0
- data/spec/validation_spec.rb +22 -2
- data/test/conformance_test.rb +20 -3
- data/twitter-text.gemspec +5 -2
- metadata +5 -17
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 5ddb91693d013ffff5407a5ba5a6f08985221611
|
4
|
+
data.tar.gz: 85e60ba0ce0ad24a92d570204408f8cd78fd62df
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 259ad390fb78ea8e090ab17b2843b0c7a6b15ac2baf837278ae12449d36ea65cd37403cf7451b5c4288b8bb743469ae6da316d0150269b5eb6bf3d233d78dcfb
|
7
|
+
data.tar.gz: 5ad46e00e0d79a31bfac2a491dc690b93ea727743b900e665d76dfdc4f4b1f07e829b69525e5461b0760a73cfd3eefb28042bf880fe759f603e996a948179f76
|
data/CHANGELOG.md
CHANGED
@@ -1,7 +1,17 @@
|
|
1
1
|
# Changelog
|
2
2
|
All notable changes to this project will be documented in this file.
|
3
3
|
|
4
|
-
## [
|
4
|
+
## [3.0.0]
|
5
|
+
### Added
|
6
|
+
- New v3.json config file with emojiParsingEnabled config option. When
|
7
|
+
true, twitter-text will parse and discount emoji supported by the
|
8
|
+
twemoji library (see https://github.com/twitter/twemoji). The length
|
9
|
+
of these emoji will be the default weight (200 or two characters) even
|
10
|
+
if they contain multiple code points combined by zero-width
|
11
|
+
joiners. This means that emoji with skin tone and gender modifiers no
|
12
|
+
longer count as more characters than those without such modifiers.
|
13
|
+
### Changed
|
14
|
+
- Updates known gTLDs to recognize recent additions by IANA (#261)
|
5
15
|
|
6
16
|
## [2.1] - 2017-12-20
|
7
17
|
### Added
|
data/README.md
CHANGED
@@ -188,6 +188,6 @@ Have a bug? Please create an issue here on GitHub!
|
|
188
188
|
|
189
189
|
## License
|
190
190
|
|
191
|
-
Copyright 2012-
|
191
|
+
Copyright 2012-2018 Twitter, Inc and other contributors
|
192
192
|
|
193
193
|
Licensed under the [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0)
|
data/Rakefile
CHANGED
data/config/README.md
CHANGED
@@ -13,6 +13,7 @@ The configuration format is a JSON string. The JSON can have the following prope
|
|
13
13
|
* `maxWeightedTweetLength` (required, integer, min value 0)
|
14
14
|
* `scale` (required, integer, min value 1)
|
15
15
|
* `defaultWeight` (required, integer, min value 0)
|
16
|
+
* `emojiParsingEnabled` (optional, boolean)
|
16
17
|
* `transformedURLLength` (integer, min value 0)
|
17
18
|
* `ranges` (array of range items)
|
18
19
|
|
@@ -48,6 +49,13 @@ The Tweet length is the (`weighted length` / `scale`).
|
|
48
49
|
The default weight applied to all code points. This is overridden in
|
49
50
|
one or more range items.
|
50
51
|
|
52
|
+
### emojiParsingEnabled
|
53
|
+
|
54
|
+
When set to true, the weighted Tweet length considers all emoji as a
|
55
|
+
single code point (with a default weight of 200), including longer
|
56
|
+
grapheme clusters combined by zero-width joiners. When set to false,
|
57
|
+
Tweet length is calculated by weighing individual Unicode code points.
|
58
|
+
|
51
59
|
### transformedURLLength
|
52
60
|
|
53
61
|
The length counted for URLs against the total weight of the Tweet. In
|
data/config/v3.json
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
{
|
2
|
+
"version": 3,
|
3
|
+
"maxWeightedTweetLength": 280,
|
4
|
+
"scale": 100,
|
5
|
+
"defaultWeight": 200,
|
6
|
+
"emojiParsingEnabled": true,
|
7
|
+
"transformedURLLength": 23,
|
8
|
+
"ranges": [
|
9
|
+
{
|
10
|
+
"start": 0,
|
11
|
+
"end": 4351,
|
12
|
+
"weight": 100
|
13
|
+
},
|
14
|
+
{
|
15
|
+
"start": 8192,
|
16
|
+
"end": 8205,
|
17
|
+
"weight": 100
|
18
|
+
},
|
19
|
+
{
|
20
|
+
"start": 8208,
|
21
|
+
"end": 8223,
|
22
|
+
"weight": 100
|
23
|
+
},
|
24
|
+
{
|
25
|
+
"start": 8242,
|
26
|
+
"end": 8247,
|
27
|
+
"weight": 100
|
28
|
+
}
|
29
|
+
]
|
30
|
+
}
|
data/lib/assets/tld_lib.yml
CHANGED
@@ -343,6 +343,7 @@ generic:
|
|
343
343
|
- 新闻
|
344
344
|
- 政府
|
345
345
|
- 政务
|
346
|
+
- 招聘
|
346
347
|
- 手表
|
347
348
|
- 手机
|
348
349
|
- 我爱你
|
@@ -598,6 +599,7 @@ generic:
|
|
598
599
|
- srl
|
599
600
|
- spreadbetting
|
600
601
|
- spot
|
602
|
+
- sport
|
601
603
|
- spiegel
|
602
604
|
- space
|
603
605
|
- soy
|
@@ -942,6 +944,7 @@ generic:
|
|
942
944
|
- locker
|
943
945
|
- loans
|
944
946
|
- loan
|
947
|
+
- llc
|
945
948
|
- lixil
|
946
949
|
- living
|
947
950
|
- live
|
@@ -1047,6 +1050,7 @@ generic:
|
|
1047
1050
|
- info
|
1048
1051
|
- infiniti
|
1049
1052
|
- industries
|
1053
|
+
- inc
|
1050
1054
|
- immobilien
|
1051
1055
|
- immo
|
1052
1056
|
- imdb
|
@@ -1364,6 +1368,7 @@ generic:
|
|
1364
1368
|
- cheap
|
1365
1369
|
- chat
|
1366
1370
|
- chase
|
1371
|
+
- charity
|
1367
1372
|
- channel
|
1368
1373
|
- chanel
|
1369
1374
|
- cfd
|
data/lib/twitter-text.rb
CHANGED
@@ -1,3 +1,7 @@
|
|
1
|
+
# Copyright 2018 Twitter, Inc.
|
2
|
+
# Licensed under the Apache License, Version 2.0
|
3
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
4
|
+
|
1
5
|
major, minor, _patch = RUBY_VERSION.split('.')
|
2
6
|
|
3
7
|
$RUBY_1_9 = if major.to_i == 1 && minor.to_i < 9
|
@@ -10,6 +14,7 @@ end
|
|
10
14
|
|
11
15
|
%w(
|
12
16
|
deprecation
|
17
|
+
emoji_regex
|
13
18
|
regex
|
14
19
|
rewriter
|
15
20
|
autolink
|
@@ -1,3 +1,7 @@
|
|
1
|
+
# Copyright 2018 Twitter, Inc.
|
2
|
+
# Licensed under the Apache License, Version 2.0
|
3
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
4
|
+
|
1
5
|
# encoding: UTF-8
|
2
6
|
|
3
7
|
module Twitter
|
@@ -6,7 +10,10 @@ module Twitter
|
|
6
10
|
require 'json'
|
7
11
|
|
8
12
|
PARSER_VERSION_CLASSIC = "v1"
|
9
|
-
|
13
|
+
PARSER_VERSION_WEIGHTED = "v2"
|
14
|
+
PARSER_VERSION_EMOJI_PARSING = "v3"
|
15
|
+
|
16
|
+
PARSER_VERSION_DEFAULT = PARSER_VERSION_WEIGHTED
|
10
17
|
|
11
18
|
class << self
|
12
19
|
attr_accessor :default_configuration
|
@@ -14,6 +21,7 @@ module Twitter
|
|
14
21
|
|
15
22
|
attr_reader :version, :max_weighted_tweet_length, :scale
|
16
23
|
attr_reader :default_weight, :transformed_url_length, :ranges
|
24
|
+
attr_reader :emoji_parsing_enabled
|
17
25
|
|
18
26
|
CONFIG_V1 = File.join(
|
19
27
|
File.expand_path('../../../config', __FILE__), # project root
|
@@ -22,7 +30,12 @@ module Twitter
|
|
22
30
|
|
23
31
|
CONFIG_V2 = File.join(
|
24
32
|
File.expand_path('../../../config', __FILE__), # project root
|
25
|
-
"#{
|
33
|
+
"#{PARSER_VERSION_WEIGHTED}.json"
|
34
|
+
)
|
35
|
+
|
36
|
+
CONFIG_V3 = File.join(
|
37
|
+
File.expand_path('../../../config', __FILE__), # project root
|
38
|
+
"#{PARSER_VERSION_EMOJI_PARSING}.json"
|
26
39
|
)
|
27
40
|
|
28
41
|
def self.parse_string(string, options = {})
|
@@ -45,10 +58,11 @@ module Twitter
|
|
45
58
|
@scale = config[:scale]
|
46
59
|
@default_weight = config[:defaultWeight]
|
47
60
|
@transformed_url_length = config[:transformedURLLength]
|
61
|
+
@emoji_parsing_enabled = config[:emojiParsingEnabled]
|
48
62
|
@ranges = config[:ranges].map { |range| Twitter::TwitterText::WeightedRange.new(range) } if config.key?(:ranges) && config[:ranges].is_a?(Array)
|
49
63
|
end
|
50
64
|
|
51
|
-
self.default_configuration = self.configuration_from_file(
|
65
|
+
self.default_configuration = self.configuration_from_file(CONFIG_V3)
|
52
66
|
end
|
53
67
|
end
|
54
68
|
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
#
|
2
|
+
# emoji_regex.rb
|
3
|
+
#
|
4
|
+
# Copyright © 2018 Twitter. All rights reserved.
|
5
|
+
#
|
6
|
+
# DO NOT MODIFY THIS FILE -- it is generated for twitter-text automatically
|
7
|
+
|
8
|
+
# encoding: utf-8
|
9
|
+
|
10
|
+
module Twitter
|
11
|
+
module TwitterText
|
12
|
+
class Regex
|
13
|
+
class Emoji
|
14
|
+
REGEXEN = {} # :nodoc:
|
15
|
+
|
16
|
+
# This regex pattern matches a single emoji
|
17
|
+
REGEXEN[:valid_emoji] = %r{
|
18
|
+
[\u{01f468}\u{01f469}][\u{01f3fb}-\u{01f3ff}]?\u200d(?:\u2695\ufe0f|\u2696\ufe0f|\u2708\ufe0f|[\u{01f33e}\u{01f373}\u{01f393}\u{01f3a4}\u{01f3a8}\u{01f3eb}\u{01f3ed}\u{01f4bb}\u{01f4bc}\u{01f527}\u{01f52c}\u{01f680}\u{01f692}\u{01f9b0}-\u{01f9b3}])|[\u26f9\u{01f3cb}\u{01f3cc}\u{01f574}\u{01f575}](?:[\ufe0f\u{01f3fb}-\u{01f3ff}]\u200d[\u2640\u2642]\ufe0f)|[\u{01f3c3}\u{01f3c4}\u{01f3ca}\u{01f46e}\u{01f471}\u{01f473}\u{01f477}\u{01f481}\u{01f482}\u{01f486}\u{01f487}\u{01f645}-\u{01f647}\u{01f64b}\u{01f64d}\u{01f64e}\u{01f6a3}\u{01f6b4}-\u{01f6b6}\u{01f926}\u{01f935}\u{01f937}-\u{01f939}\u{01f93d}\u{01f93e}\u{01f9b8}\u{01f9b9}\u{01f9d6}-\u{01f9dd}][\u{01f3fb}-\u{01f3ff}]?\u200d[\u2640\u2642]\ufe0f|(?:\u{01f468}\u200d\u2764\ufe0f\u200d\u{01f48b}\u200d\u{01f468}|\u{01f469}\u200d\u2764\ufe0f\u200d\u{01f48b}\u200d[\u{01f468}\u{01f469}]|\u{01f468}\u200d\u{01f468}\u200d\u{01f466}\u200d\u{01f466}|\u{01f468}\u200d\u{01f468}\u200d\u{01f467}\u200d[\u{01f466}\u{01f467}]|\u{01f468}\u200d\u{01f469}\u200d\u{01f466}\u200d\u{01f466}|\u{01f468}\u200d\u{01f469}\u200d\u{01f467}\u200d[\u{01f466}\u{01f467}]|\u{01f469}\u200d\u{01f469}\u200d\u{01f466}\u200d\u{01f466}|\u{01f469}\u200d\u{01f469}\u200d\u{01f467}\u200d[\u{01f466}\u{01f467}]|\u{01f468}\u200d\u2764\ufe0f\u200d\u{01f468}|\u{01f469}\u200d\u2764\ufe0f\u200d[\u{01f468}\u{01f469}]|\u{01f468}\u200d\u{01f466}\u200d\u{01f466}|\u{01f468}\u200d\u{01f467}\u200d[\u{01f466}\u{01f467}]|\u{01f468}\u200d\u{01f468}\u200d[\u{01f466}\u{01f467}]|\u{01f468}\u200d\u{01f469}\u200d[\u{01f466}\u{01f467}]|\u{01f469}\u200d\u{01f466}\u200d\u{01f466}|\u{01f469}\u200d\u{01f467}\u200d[\u{01f466}\u{01f467}]|\u{01f469}\u200d\u{01f469}\u200d[\u{01f466}\u{01f467}]|\u{01f3f3}\ufe0f\u200d\u{01f308}|\u{01f3f4}\u200d\u2620\ufe0f|\u{01f46f}\u200d\u2640\ufe0f|\u{01f46f}\u200d\u2642\ufe0f|\u{01f93c}\u200d\u2640\ufe0f|\u{01f93c}\u200d\u2642\ufe0f|\u{01f9de}\u200d\u2640\ufe0f|\u{01f9de}\u200d\u2642\ufe0f|\u{01f9df}\u200d\u2640\ufe0f|\u{01f9df}\u200d\u2642\ufe0f|\u{01f441}\u
200d\u{01f5e8}|\u{01f468}\u200d[\u{01f466}\u{01f467}]|\u{01f469}\u200d[\u{01f466}\u{01f467}])|[#*0-9]\ufe0f?\u20e3|(?:[©®\u2122\u265f]\ufe0f)|[\u203c\u2049\u2139\u2194-\u2199\u21a9\u21aa\u231a\u231b\u2328\u23cf\u23ed-\u23ef\u23f1\u23f2\u23f8-\u23fa\u24c2\u25aa\u25ab\u25b6\u25c0\u25fb-\u25fe\u2600-\u2604\u260e\u2611\u2614\u2615\u2618\u2620\u2622\u2623\u2626\u262a\u262e\u262f\u2638-\u263a\u2640\u2642\u2648-\u2653\u2660\u2663\u2665\u2666\u2668\u267b\u267f\u2692-\u2697\u2699\u269b\u269c\u26a0\u26a1\u26aa\u26ab\u26b0\u26b1\u26bd\u26be\u26c4\u26c5\u26c8\u26cf\u26d1\u26d3\u26d4\u26e9\u26ea\u26f0-\u26f5\u26f8\u26fa\u26fd\u2702\u2708\u2709\u270f\u2712\u2714\u2716\u271d\u2721\u2733\u2734\u2744\u2747\u2757\u2763\u2764\u27a1\u2934\u2935\u2b05-\u2b07\u2b1b\u2b1c\u2b50\u2b55\u3030\u303d\u3297\u3299\u{01f004}\u{01f170}\u{01f171}\u{01f17e}\u{01f17f}\u{01f202}\u{01f21a}\u{01f22f}\u{01f237}\u{01f321}\u{01f324}-\u{01f32c}\u{01f336}\u{01f37d}\u{01f396}\u{01f397}\u{01f399}-\u{01f39b}\u{01f39e}\u{01f39f}\u{01f3cd}\u{01f3ce}\u{01f3d4}-\u{01f3df}\u{01f3f3}\u{01f3f5}\u{01f3f7}\u{01f43f}\u{01f441}\u{01f4fd}\u{01f549}\u{01f54a}\u{01f56f}\u{01f570}\u{01f573}\u{01f576}-\u{01f579}\u{01f587}\u{01f58a}-\u{01f58d}\u{01f5a5}\u{01f5a8}\u{01f5b1}\u{01f5b2}\u{01f5bc}\u{01f5c2}-\u{01f5c4}\u{01f5d1}-\u{01f5d3}\u{01f5dc}-\u{01f5de}\u{01f5e1}\u{01f5e3}\u{01f5e8}\u{01f5ef}\u{01f5f3}\u{01f5fa}\u{01f6cb}\u{01f6cd}-\u{01f6cf}\u{01f6e0}-\u{01f6e5}\u{01f6e9}\u{01f6f0}\u{01f6f3}](?:\ufe0f|(?!\ufe0e))|(?:[\u261d\u26f7\u26f9\u270c\u270d\u{01f3cb}\u{01f3cc}\u{01f574}\u{01f575}\u{01f590}](?:\ufe0f|(?!\ufe0e))|[\u270a\u270b\u{01f385}\u{01f3c2}-\u{01f3c4}\u{01f3c7}\u{01f3ca}\u{01f442}\u{01f443}\u{01f446}-\u{01f450}\u{01f466}-\u{01f469}\u{01f46e}\u{01f470}-\u{01f478}\u{01f47c}\u{01f481}-\u{01f483}\u{01f485}-\u{01f487}\u{01f4aa}\u{01f57a}\u{01f595}\u{01f596}\u{01f645}-\u{01f647}\u{01f64b}-\u{01f64f}\u{01f6a3}\u{01f6b4}-\u{01f6b6}\u{01f6c0}\u{01f6cc}\u{01f918}-\u{01f91c}\u{01f91e}\u{01f91f}\u{01f926}\u{01f930}-\u{01f939}\
u{01f93d}\u{01f93e}\u{01f9b5}\u{01f9b6}\u{01f9b8}\u{01f9b9}\u{01f9d1}-\u{01f9dd}])[\u{01f3fb}-\u{01f3ff}]?|(?:\u{01f3f4}\u{0e0067}\u{0e0062}\u{0e0065}\u{0e006e}\u{0e0067}\u{0e007f}|\u{01f3f4}\u{0e0067}\u{0e0062}\u{0e0073}\u{0e0063}\u{0e0074}\u{0e007f}|\u{01f3f4}\u{0e0067}\u{0e0062}\u{0e0077}\u{0e006c}\u{0e0073}\u{0e007f}|\u{01f1e6}[\u{01f1e8}-\u{01f1ec}\u{01f1ee}\u{01f1f1}\u{01f1f2}\u{01f1f4}\u{01f1f6}-\u{01f1fa}\u{01f1fc}\u{01f1fd}\u{01f1ff}]|\u{01f1e7}[\u{01f1e6}\u{01f1e7}\u{01f1e9}-\u{01f1ef}\u{01f1f1}-\u{01f1f4}\u{01f1f6}-\u{01f1f9}\u{01f1fb}\u{01f1fc}\u{01f1fe}\u{01f1ff}]|\u{01f1e8}[\u{01f1e6}\u{01f1e8}\u{01f1e9}\u{01f1eb}-\u{01f1ee}\u{01f1f0}-\u{01f1f5}\u{01f1f7}\u{01f1fa}-\u{01f1ff}]|\u{01f1e9}[\u{01f1ea}\u{01f1ec}\u{01f1ef}\u{01f1f0}\u{01f1f2}\u{01f1f4}\u{01f1ff}]|\u{01f1ea}[\u{01f1e6}\u{01f1e8}\u{01f1ea}\u{01f1ec}\u{01f1ed}\u{01f1f7}-\u{01f1fa}]|\u{01f1eb}[\u{01f1ee}-\u{01f1f0}\u{01f1f2}\u{01f1f4}\u{01f1f7}]|\u{01f1ec}[\u{01f1e6}\u{01f1e7}\u{01f1e9}-\u{01f1ee}\u{01f1f1}-\u{01f1f3}\u{01f1f5}-\u{01f1fa}\u{01f1fc}\u{01f1fe}]|\u{01f1ed}[\u{01f1f0}\u{01f1f2}\u{01f1f3}\u{01f1f7}\u{01f1f9}\u{01f1fa}]|\u{01f1ee}[\u{01f1e8}-\u{01f1ea}\u{01f1f1}-\u{01f1f4}\u{01f1f6}-\u{01f1f9}]|\u{01f1ef}[\u{01f1ea}\u{01f1f2}\u{01f1f4}\u{01f1f5}]|\u{01f1f0}[\u{01f1ea}\u{01f1ec}-\u{01f1ee}\u{01f1f2}\u{01f1f3}\u{01f1f5}\u{01f1f7}\u{01f1fc}\u{01f1fe}\u{01f1ff}]|\u{01f1f1}[\u{01f1e6}-\u{01f1e8}\u{01f1ee}\u{01f1f0}\u{01f1f7}-\u{01f1fb}\u{01f1fe}]|\u{01f1f2}[\u{01f1e6}\u{01f1e8}-\u{01f1ed}\u{01f1f0}-\u{01f1ff}]|\u{01f1f3}[\u{01f1e6}\u{01f1e8}\u{01f1ea}-\u{01f1ec}\u{01f1ee}\u{01f1f1}\u{01f1f4}\u{01f1f5}\u{01f1f7}\u{01f1fa}\u{01f1ff}]|\u{01f1f4}\u{01f1f2}|\u{01f1f5}[\u{01f1e6}\u{01f1ea}-\u{01f1ed}\u{01f1f0}-\u{01f1f3}\u{01f1f7}-\u{01f1f9}\u{01f1fc}\u{01f1fe}]|\u{01f1f6}\u{01f1e6}|\u{01f1f7}[\u{01f1ea}\u{01f1f4}\u{01f1f8}\u{01f1fa}\u{01f1fc}]|\u{01f1f8}[\u{01f1e6}-\u{01f1ea}\u{01f1ec}-\u{01f1f4}\u{01f1f7}-\u{01f1f9}\u{01f1fb}\u{01f1fd}-\u{01f1ff}]|\u{01f1f9}[\u{01f1e6}\u{01f1e8}\u{01f1e9}\u{0
1f1eb}-\u{01f1ed}\u{01f1ef}-\u{01f1f4}\u{01f1f7}\u{01f1f9}\u{01f1fb}\u{01f1fc}\u{01f1ff}]|\u{01f1fa}[\u{01f1e6}\u{01f1ec}\u{01f1f2}\u{01f1f3}\u{01f1f8}\u{01f1fe}\u{01f1ff}]|\u{01f1fb}[\u{01f1e6}\u{01f1e8}\u{01f1ea}\u{01f1ec}\u{01f1ee}\u{01f1f3}\u{01f1fa}]|\u{01f1fc}[\u{01f1eb}\u{01f1f8}]|\u{01f1fd}\u{01f1f0}|\u{01f1fe}[\u{01f1ea}\u{01f1f9}]|\u{01f1ff}[\u{01f1e6}\u{01f1f2}\u{01f1fc}]|[\u23e9-\u23ec\u23f0\u23f3\u267e\u26ce\u2705\u2728\u274c\u274e\u2753-\u2755\u2795-\u2797\u27b0\u27bf\ue50a\u{01f0cf}\u{01f18e}\u{01f191}-\u{01f19a}\u{01f1e6}-\u{01f1ff}\u{01f201}\u{01f232}-\u{01f236}\u{01f238}-\u{01f23a}\u{01f250}\u{01f251}\u{01f300}-\u{01f320}\u{01f32d}-\u{01f335}\u{01f337}-\u{01f37c}\u{01f37e}-\u{01f384}\u{01f386}-\u{01f393}\u{01f3a0}-\u{01f3c1}\u{01f3c5}\u{01f3c6}\u{01f3c8}\u{01f3c9}\u{01f3cf}-\u{01f3d3}\u{01f3e0}-\u{01f3f0}\u{01f3f4}\u{01f3f8}-\u{01f43e}\u{01f440}\u{01f444}\u{01f445}\u{01f451}-\u{01f465}\u{01f46a}-\u{01f46d}\u{01f46f}\u{01f479}-\u{01f47b}\u{01f47d}-\u{01f480}\u{01f484}\u{01f488}-\u{01f4a9}\u{01f4ab}-\u{01f4fc}\u{01f4ff}-\u{01f53d}\u{01f54b}-\u{01f54e}\u{01f550}-\u{01f567}\u{01f5a4}\u{01f5fb}-\u{01f644}\u{01f648}-\u{01f64a}\u{01f680}-\u{01f6a2}\u{01f6a4}-\u{01f6b3}\u{01f6b7}-\u{01f6bf}\u{01f6c1}-\u{01f6c5}\u{01f6d0}-\u{01f6d2}\u{01f6eb}\u{01f6ec}\u{01f6f4}-\u{01f6f9}\u{01f910}-\u{01f917}\u{01f91d}\u{01f920}-\u{01f925}\u{01f927}-\u{01f92f}\u{01f93a}\u{01f93c}\u{01f940}-\u{01f945}\u{01f947}-\u{01f970}\u{01f973}-\u{01f976}\u{01f97a}\u{01f97c}-\u{01f9a2}\u{01f9b4}\u{01f9b7}\u{01f9c0}-\u{01f9c2}\u{01f9d0}\u{01f9de}-\u{01f9ff}])|\ufe0f
|
19
|
+
}iox
|
20
|
+
|
21
|
+
def self.[](key)
|
22
|
+
REGEXEN[key]
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
@@ -1,3 +1,7 @@
|
|
1
|
+
# Copyright 2018 Twitter, Inc.
|
2
|
+
# Licensed under the Apache License, Version 2.0
|
3
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
4
|
+
|
1
5
|
# encoding: utf-8
|
2
6
|
require 'idn'
|
3
7
|
|
@@ -5,7 +9,7 @@ class String
|
|
5
9
|
# Helper function to count the character length by first converting to an
|
6
10
|
# array. This is needed because with unicode strings, the return value
|
7
11
|
# of length may be incorrect
|
8
|
-
def
|
12
|
+
def codepoint_length
|
9
13
|
if respond_to? :codepoints
|
10
14
|
length
|
11
15
|
else
|
@@ -13,25 +17,25 @@ class String
|
|
13
17
|
end
|
14
18
|
end
|
15
19
|
|
16
|
-
# Helper function to convert this string into an array of unicode
|
17
|
-
def
|
18
|
-
@
|
20
|
+
# Helper function to convert this string into an array of unicode code points.
|
21
|
+
def to_codepoint_a
|
22
|
+
@to_codepoint_a ||= if chars.kind_of?(Enumerable)
|
19
23
|
chars.to_a
|
20
24
|
else
|
21
|
-
|
22
|
-
0.upto(
|
23
|
-
|
25
|
+
codepoint_array = []
|
26
|
+
0.upto(codepoint_length - 1) { |i| codepoint_array << [chars.slice(i)].pack('U') }
|
27
|
+
codepoint_array
|
24
28
|
end
|
25
29
|
end
|
26
30
|
end
|
27
31
|
|
28
|
-
# Helper functions to return
|
32
|
+
# Helper functions to return code point offsets instead of byte offsets.
|
29
33
|
class MatchData
|
30
34
|
def char_begin(n)
|
31
35
|
if string.respond_to? :codepoints
|
32
36
|
self.begin(n)
|
33
37
|
else
|
34
|
-
string[0, self.begin(n)].
|
38
|
+
string[0, self.begin(n)].codepoint_length
|
35
39
|
end
|
36
40
|
end
|
37
41
|
|
@@ -39,7 +43,7 @@ class MatchData
|
|
39
43
|
if string.respond_to? :codepoints
|
40
44
|
self.end(n)
|
41
45
|
else
|
42
|
-
string[0, self.end(n)].
|
46
|
+
string[0, self.end(n)].codepoint_length
|
43
47
|
end
|
44
48
|
end
|
45
49
|
end
|
@@ -77,11 +81,14 @@ module Twitter
|
|
77
81
|
#
|
78
82
|
# If a block is given then it will be called for each entity.
|
79
83
|
def extract_entities_with_indices(text, options = {}, &block)
|
84
|
+
config = options[:config] || Twitter::TwitterText::Configuration.default_configuration
|
85
|
+
|
80
86
|
# extract all entities
|
81
87
|
entities = extract_urls_with_indices(text, options) +
|
82
88
|
extract_hashtags_with_indices(text, :check_url_overlap => false) +
|
83
89
|
extract_mentions_or_lists_with_indices(text) +
|
84
90
|
extract_cashtags_with_indices(text)
|
91
|
+
entities += extract_emoji_with_indices(text) if config.emoji_parsing_enabled
|
85
92
|
|
86
93
|
return [] if entities.empty?
|
87
94
|
|
@@ -218,11 +225,7 @@ module Twitter
|
|
218
225
|
:indices => [start_position + $~.char_begin(0),
|
219
226
|
start_position + $~.char_end(0)]
|
220
227
|
}
|
221
|
-
|
222
|
-
ascii_domain =~ Twitter::TwitterText::Regex[:valid_special_short_domain] ||
|
223
|
-
ascii_domain !~ Twitter::TwitterText::Regex[:invalid_short_domain]
|
224
|
-
urls << last_url
|
225
|
-
end
|
228
|
+
urls << last_url
|
226
229
|
end
|
227
230
|
|
228
231
|
# no ASCII-only domain found. Skip the entire URL
|
@@ -239,7 +242,7 @@ module Twitter
|
|
239
242
|
if url =~ Twitter::TwitterText::Regex[:valid_tco_url]
|
240
243
|
next if $1 && $1.length > MAX_TCO_SLUG_LENGTH
|
241
244
|
url = $&
|
242
|
-
end_position = start_position + url.
|
245
|
+
end_position = start_position + url.codepoint_length
|
243
246
|
end
|
244
247
|
|
245
248
|
next unless is_valid_domain(url.length, domain, protocol)
|
@@ -341,6 +344,31 @@ module Twitter
|
|
341
344
|
tags
|
342
345
|
end
|
343
346
|
|
347
|
+
def extract_emoji_with_indices(text) # :yields: emoji, start, end
|
348
|
+
emoji = []
|
349
|
+
text.scan(Twitter::TwitterText::Regex[:valid_emoji]) do |emoji_text|
|
350
|
+
match_data = $~
|
351
|
+
start_position = match_data.char_begin(0)
|
352
|
+
end_position = match_data.char_end(0)
|
353
|
+
emoji << {
|
354
|
+
:emoji => emoji_text,
|
355
|
+
:indices => [start_position, end_position]
|
356
|
+
}
|
357
|
+
end
|
358
|
+
emoji
|
359
|
+
end
|
360
|
+
|
361
|
+
def is_valid_emoji(text)
|
362
|
+
begin
|
363
|
+
raise ArgumentError.new("invalid empty emoji") unless text
|
364
|
+
entities = extract_emoji_with_indices(text)
|
365
|
+
entities.count == 1 && entities[0][:emoji] == text
|
366
|
+
rescue Exception
|
367
|
+
# On error don't consider this a valid domain.
|
368
|
+
return false
|
369
|
+
end
|
370
|
+
end
|
371
|
+
|
344
372
|
def is_valid_domain(url_length, domain, protocol)
|
345
373
|
begin
|
346
374
|
raise ArgumentError.new("invalid empty domain") unless domain
|
@@ -1,3 +1,7 @@
|
|
1
|
+
# Copyright 2018 Twitter, Inc.
|
2
|
+
# Licensed under the Apache License, Version 2.0
|
3
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
4
|
+
|
1
5
|
module Twitter
|
2
6
|
module TwitterText
|
3
7
|
# Module for doing "hit highlighting" on tweets that have been auto-linked already.
|
@@ -26,7 +30,7 @@ module Twitter
|
|
26
30
|
|
27
31
|
result = []
|
28
32
|
chunk_index, chunk = 0, chunks[0]
|
29
|
-
chunk_chars = chunk.to_s.
|
33
|
+
chunk_chars = chunk.to_s.to_codepoint_a
|
30
34
|
prev_chunks_len = 0
|
31
35
|
chunk_cursor = 0
|
32
36
|
start_in_chunk = false
|
@@ -50,7 +54,7 @@ module Twitter
|
|
50
54
|
chunk_cursor = 0
|
51
55
|
chunk_index += 2
|
52
56
|
chunk = chunks[chunk_index]
|
53
|
-
chunk_chars = chunk.to_s.
|
57
|
+
chunk_chars = chunk.to_s.to_codepoint_a
|
54
58
|
start_in_chunk = false
|
55
59
|
end
|
56
60
|
|
data/lib/twitter-text/regex.rb
CHANGED
@@ -1,3 +1,7 @@
|
|
1
|
+
# Copyright 2018 Twitter, Inc.
|
2
|
+
# Licensed under the Apache License, Version 2.0
|
3
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
4
|
+
|
1
5
|
# encoding: utf-8
|
2
6
|
|
3
7
|
module Twitter
|
@@ -53,11 +57,26 @@ module Twitter
|
|
53
57
|
].flatten.map{|c| [c].pack('U*')}.freeze
|
54
58
|
REGEXEN[:spaces] = /[#{UNICODE_SPACES.join('')}]/o
|
55
59
|
|
60
|
+
DIRECTIONAL_CHARACTERS = [
|
61
|
+
0x061C, # ARABIC LETTER MARK (ALM)
|
62
|
+
0x200E, # LEFT-TO-RIGHT MARK (LRM)
|
63
|
+
0x200F, # RIGHT-TO-LEFT MARK (RLM)
|
64
|
+
0x202A, # LEFT-TO-RIGHT EMBEDDING (LRE)
|
65
|
+
0x202B, # RIGHT-TO-LEFT EMBEDDING (RLE)
|
66
|
+
0x202C, # POP DIRECTIONAL FORMATTING (PDF)
|
67
|
+
0x202D, # LEFT-TO-RIGHT OVERRIDE (LRO)
|
68
|
+
0x202E, # RIGHT-TO-LEFT OVERRIDE (RLO)
|
69
|
+
0x2066, # LEFT-TO-RIGHT ISOLATE (LRI)
|
70
|
+
0x2067, # RIGHT-TO-LEFT ISOLATE (RLI)
|
71
|
+
0x2068, # FIRST STRONG ISOLATE (FSI)
|
72
|
+
0x2069, # POP DIRECTIONAL ISOLATE (PDI)
|
73
|
+
].map{|cp| [cp].pack('U')}.freeze
|
74
|
+
REGEXEN[:directional_characters] = /[#{DIRECTIONAL_CHARACTERS.join('')}]/o
|
75
|
+
|
56
76
|
# Character not allowed in Tweets
|
57
77
|
INVALID_CHARACTERS = [
|
58
78
|
0xFFFE, 0xFEFF, # BOM
|
59
79
|
0xFFFF, # Special
|
60
|
-
0x202A, 0x202B, 0x202C, 0x202D, 0x202E # Directional change
|
61
80
|
].map{|cp| [cp].pack('U') }.freeze
|
62
81
|
REGEXEN[:invalid_control_characters] = /[#{INVALID_CHARACTERS.join('')}]/o
|
63
82
|
|
@@ -157,14 +176,16 @@ module Twitter
|
|
157
176
|
([a-z0-9_]{1,20}) # $3: Screen name
|
158
177
|
(\/[a-z][a-zA-Z0-9_\-]{0,24})? # $4: List (optional)
|
159
178
|
/iox
|
160
|
-
REGEXEN[:valid_reply] = /^(
|
179
|
+
REGEXEN[:valid_reply] = /^(?:[#{UNICODE_SPACES}#{DIRECTIONAL_CHARACTERS}])*#{REGEXEN[:at_signs]}([a-z0-9_]{1,20})/io
|
161
180
|
# Used in Extractor for final filtering
|
162
181
|
REGEXEN[:end_mention_match] = /\A(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/io
|
163
182
|
|
164
183
|
# URL related hash regex collection
|
165
|
-
REGEXEN[:valid_url_preceding_chars] = /(?:[^A-Z0-9@@$###{INVALID_CHARACTERS.join('')}]|^)/io
|
184
|
+
REGEXEN[:valid_url_preceding_chars] = /(?:[^A-Z0-9@@$###{INVALID_CHARACTERS.join('')}]|[#{DIRECTIONAL_CHARACTERS.join('')}]|^)/io
|
166
185
|
REGEXEN[:invalid_url_without_protocol_preceding_chars] = /[-_.\/]$/
|
167
|
-
|
186
|
+
|
187
|
+
DOMAIN_VALID_CHARS = "[^#{DIRECTIONAL_CHARACTERS.join('')}#{PUNCTUATION_CHARS}#{SPACE_CHARS}#{CTRL_CHARS}#{INVALID_CHARACTERS.join('')}#{UNICODE_SPACES.join('')}]"
|
188
|
+
# "[a-z0-9#{LATIN_ACCENTS}]"
|
168
189
|
REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
|
169
190
|
REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
|
170
191
|
|
@@ -183,13 +204,6 @@ module Twitter
|
|
183
204
|
}ix
|
184
205
|
REGEXEN[:valid_punycode] = /(?:xn--[0-9a-z]+)/i
|
185
206
|
|
186
|
-
REGEXEN[:valid_special_cctld] = %r{
|
187
|
-
(?:
|
188
|
-
(?:co|tv)
|
189
|
-
(?=[^0-9a-z@]|$)
|
190
|
-
)
|
191
|
-
}ix
|
192
|
-
|
193
207
|
REGEXEN[:valid_domain] = /(?:
|
194
208
|
#{REGEXEN[:valid_subdomain]}*#{REGEXEN[:valid_domain_name]}
|
195
209
|
(?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
|
@@ -204,10 +218,6 @@ module Twitter
|
|
204
218
|
# This is used in Extractor for stricter t.co URL extraction
|
205
219
|
REGEXEN[:valid_tco_url] = /^https?:\/\/t\.co\/([a-z0-9]+)/i
|
206
220
|
|
207
|
-
# This is used in Extractor to filter out unwanted URLs.
|
208
|
-
REGEXEN[:invalid_short_domain] = /\A#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_ccTLD]}\Z/io
|
209
|
-
REGEXEN[:valid_special_short_domain] = /\A#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_special_cctld]}\Z/io
|
210
|
-
|
211
221
|
REGEXEN[:valid_port_number] = /[0-9]+/
|
212
222
|
|
213
223
|
REGEXEN[:valid_general_url_path_chars] = /[a-z\p{Cyrillic}0-9!\*';:=\+\,\.\$\/%#\[\]\p{Pd}_~&\|@#{LATIN_ACCENTS}]/io
|
@@ -258,7 +268,7 @@ module Twitter
|
|
258
268
|
}iox
|
259
269
|
|
260
270
|
REGEXEN[:cashtag] = /[a-z]{1,6}(?:[._][a-z]{1,2})?/i
|
261
|
-
REGEXEN[:valid_cashtag] = /(
|
271
|
+
REGEXEN[:valid_cashtag] = /(^|[#{UNICODE_SPACES}#{DIRECTIONAL_CHARACTERS}])(\$)(#{REGEXEN[:cashtag]})(?=$|\s|[#{PUNCTUATION_CHARS}])/i
|
262
272
|
|
263
273
|
# These URL validation pattern strings are based on the ABNF from RFC 3986
|
264
274
|
REGEXEN[:validate_url_unreserved] = /[a-z\p{Cyrillic}0-9\p{Pd}._~]/i
|
@@ -339,6 +349,8 @@ module Twitter
|
|
339
349
|
REGEXEN[:validate_url_query] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
|
340
350
|
REGEXEN[:validate_url_fragment] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i
|
341
351
|
|
352
|
+
REGEXEN[:valid_emoji] = Twitter::TwitterText::Regex::Emoji[:valid_emoji]
|
353
|
+
|
342
354
|
# Modified version of RFC 3986 Appendix B
|
343
355
|
REGEXEN[:validate_url_unencoded] = %r{
|
344
356
|
\A # Full URL
|
@@ -1,9 +1,13 @@
|
|
1
|
+
# Copyright 2018 Twitter, Inc.
|
2
|
+
# Licensed under the Apache License, Version 2.0
|
3
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
4
|
+
|
1
5
|
module Twitter
|
2
6
|
module TwitterText
|
3
7
|
# A module provides base methods to rewrite usernames, lists, hashtags and URLs.
|
4
8
|
module Rewriter extend self
|
5
9
|
def rewrite_entities(text, entities)
|
6
|
-
|
10
|
+
codepoints = text.to_s.to_codepoint_a
|
7
11
|
|
8
12
|
# sort by start index
|
9
13
|
entities = entities.sort_by do |entity|
|
@@ -14,11 +18,11 @@ module Twitter
|
|
14
18
|
result = []
|
15
19
|
last_index = entities.inject(0) do |index, entity|
|
16
20
|
indices = entity.respond_to?(:indices) ? entity.indices : entity[:indices]
|
17
|
-
result <<
|
18
|
-
result << yield(entity,
|
21
|
+
result << codepoints[index...indices.first]
|
22
|
+
result << yield(entity, codepoints)
|
19
23
|
indices.last
|
20
24
|
end
|
21
|
-
result <<
|
25
|
+
result << codepoints[last_index..-1]
|
22
26
|
|
23
27
|
result.flatten.join
|
24
28
|
end
|
@@ -35,8 +39,8 @@ module Twitter
|
|
35
39
|
|
36
40
|
def rewrite_usernames_or_lists(text)
|
37
41
|
entities = Extractor.extract_mentions_or_lists_with_indices(text)
|
38
|
-
rewrite_entities(text, entities) do |entity,
|
39
|
-
at =
|
42
|
+
rewrite_entities(text, entities) do |entity, codepoints|
|
43
|
+
at = codepoints[entity[:indices].first]
|
40
44
|
list_slug = entity[:list_slug]
|
41
45
|
list_slug = nil if list_slug.empty?
|
42
46
|
yield(at, entity[:screen_name], list_slug)
|
@@ -46,8 +50,8 @@ module Twitter
|
|
46
50
|
|
47
51
|
def rewrite_hashtags(text)
|
48
52
|
entities = Extractor.extract_hashtags_with_indices(text)
|
49
|
-
rewrite_entities(text, entities) do |entity,
|
50
|
-
hash =
|
53
|
+
rewrite_entities(text, entities) do |entity, codepoints|
|
54
|
+
hash = codepoints[entity[:indices].first]
|
51
55
|
yield(hash, entity[:hashtag])
|
52
56
|
end
|
53
57
|
end
|
@@ -55,7 +59,7 @@ module Twitter
|
|
55
59
|
|
56
60
|
def rewrite_urls(text)
|
57
61
|
entities = Extractor.extract_urls_with_indices(text, :extract_url_without_protocol => false)
|
58
|
-
rewrite_entities(text, entities) do |entity,
|
62
|
+
rewrite_entities(text, entities) do |entity, codepoints|
|
59
63
|
yield(entity[:url])
|
60
64
|
end
|
61
65
|
end
|
data/lib/twitter-text/unicode.rb
CHANGED
@@ -1,3 +1,7 @@
|
|
1
|
+
# Copyright 2018 Twitter, Inc.
|
2
|
+
# Licensed under the Apache License, Version 2.0
|
3
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
4
|
+
|
1
5
|
require 'unf'
|
2
6
|
|
3
7
|
module Twitter
|
@@ -34,8 +38,7 @@ module Twitter
|
|
34
38
|
options = DEFAULT_TCO_URL_LENGTHS.merge(options)
|
35
39
|
config = options[:config] || Twitter::TwitterText::Configuration.default_configuration
|
36
40
|
normalized_text = text.to_nfc
|
37
|
-
|
38
|
-
unless (normalized_text_length > 0)
|
41
|
+
unless (normalized_text.length > 0)
|
39
42
|
ParseResults.empty()
|
40
43
|
end
|
41
44
|
|
@@ -46,6 +49,7 @@ module Twitter
|
|
46
49
|
ranges = config.ranges
|
47
50
|
|
48
51
|
url_entities = Twitter::TwitterText::Extractor.extract_urls_with_indices(normalized_text)
|
52
|
+
emoji_entities = config.emoji_parsing_enabled ? Twitter::TwitterText::Extractor.extract_emoji_with_indices(normalized_text) : []
|
49
53
|
|
50
54
|
has_invalid_chars = false
|
51
55
|
weighted_count = 0
|
@@ -53,24 +57,42 @@ module Twitter
|
|
53
57
|
display_offset = 0
|
54
58
|
valid_offset = 0
|
55
59
|
|
56
|
-
while offset <
|
60
|
+
while offset < normalized_text.codepoint_length
|
57
61
|
# Reset the default char weight each pass through the loop
|
58
62
|
char_weight = config.default_weight
|
63
|
+
entity_length = 0
|
64
|
+
|
59
65
|
url_entities.each do |url_entity|
|
60
66
|
if url_entity[:indices].first == offset
|
61
|
-
|
67
|
+
entity_length = url_entity[:indices].last - url_entity[:indices].first
|
62
68
|
weighted_count += transformed_url_length
|
63
|
-
offset +=
|
64
|
-
display_offset +=
|
69
|
+
offset += entity_length
|
70
|
+
display_offset += entity_length
|
71
|
+
if weighted_count <= scaled_max_weighted_tweet_length
|
72
|
+
valid_offset += entity_length
|
73
|
+
end
|
74
|
+
# Finding a match breaks the loop
|
75
|
+
break
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
emoji_entities.each do |emoji_entity|
|
80
|
+
if emoji_entity[:indices].first == offset
|
81
|
+
entity_length = emoji_entity[:indices].last - emoji_entity[:indices].first
|
82
|
+
weighted_count += char_weight # the default weight
|
83
|
+
offset += entity_length
|
84
|
+
display_offset += entity_length
|
65
85
|
if weighted_count <= scaled_max_weighted_tweet_length
|
66
|
-
valid_offset +=
|
86
|
+
valid_offset += entity_length
|
67
87
|
end
|
68
|
-
# Finding a match breaks the loop
|
88
|
+
# Finding a match breaks the loop
|
69
89
|
break
|
70
90
|
end
|
71
91
|
end
|
72
92
|
|
73
|
-
if
|
93
|
+
next if entity_length > 0
|
94
|
+
|
95
|
+
if offset < normalized_text.codepoint_length
|
74
96
|
code_point = normalized_text[offset]
|
75
97
|
|
76
98
|
ranges.each do |range|
|
@@ -82,17 +104,19 @@ module Twitter
|
|
82
104
|
|
83
105
|
weighted_count += char_weight
|
84
106
|
|
85
|
-
has_invalid_chars = contains_invalid?(
|
86
|
-
|
87
|
-
offset +=
|
88
|
-
display_offset +=
|
107
|
+
has_invalid_chars = contains_invalid?(code_point) unless has_invalid_chars
|
108
|
+
codepoint_length = code_point.codepoint_length
|
109
|
+
offset += codepoint_length
|
110
|
+
display_offset += codepoint_length
|
111
|
+
# index += codepoint_length
|
89
112
|
|
90
113
|
if !has_invalid_chars && (weighted_count <= scaled_max_weighted_tweet_length)
|
91
|
-
valid_offset +=
|
114
|
+
valid_offset += codepoint_length
|
92
115
|
end
|
93
116
|
end
|
94
117
|
end
|
95
|
-
|
118
|
+
|
119
|
+
normalized_text_offset = text.codepoint_length - normalized_text.codepoint_length
|
96
120
|
scaled_weighted_length = weighted_count / scale
|
97
121
|
is_valid = !has_invalid_chars && (scaled_weighted_length <= max_weighted_tweet_length)
|
98
122
|
permillage = scaled_weighted_length * 1000 / max_weighted_tweet_length
|
data/spec/autolinking_spec.rb
CHANGED
data/spec/configuration_spec.rb
CHANGED
@@ -1,3 +1,7 @@
|
|
1
|
+
# Copyright 2018 Twitter, Inc.
|
2
|
+
# Licensed under the Apache License, Version 2.0
|
3
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
4
|
+
|
1
5
|
# encoding: utf-8
|
2
6
|
require File.dirname(__FILE__) + '/spec_helper'
|
3
7
|
|
@@ -18,6 +22,7 @@ describe Twitter::TwitterText::Configuration do
|
|
18
22
|
it "should define version constants" do
|
19
23
|
expect(Twitter::TwitterText::Configuration.const_defined?(:CONFIG_V1)).to be true
|
20
24
|
expect(Twitter::TwitterText::Configuration.const_defined?(:CONFIG_V2)).to be true
|
25
|
+
expect(Twitter::TwitterText::Configuration.const_defined?(:CONFIG_V3)).to be true
|
21
26
|
end
|
22
27
|
|
23
28
|
it "should define a default configuration" do
|
@@ -87,5 +92,45 @@ describe Twitter::TwitterText::Configuration do
|
|
87
92
|
expect(weighted_range.weight).to be_kind_of(Integer)
|
88
93
|
end
|
89
94
|
end
|
95
|
+
|
96
|
+
context "with v3 configuration" do
|
97
|
+
before do
|
98
|
+
@config = Twitter::TwitterText::Configuration.configuration_from_file(Twitter::TwitterText::Configuration::CONFIG_V3)
|
99
|
+
end
|
100
|
+
|
101
|
+
it "should have a version" do
|
102
|
+
expect(@config.version).to eq(3)
|
103
|
+
end
|
104
|
+
|
105
|
+
it "should have a max_weighted_tweet_length" do
|
106
|
+
expect(@config.max_weighted_tweet_length).to eq(280)
|
107
|
+
end
|
108
|
+
|
109
|
+
it "should have a scale" do
|
110
|
+
expect(@config.scale).to eq(100)
|
111
|
+
end
|
112
|
+
|
113
|
+
it "should have a default_weight" do
|
114
|
+
expect(@config.default_weight).to eq(200)
|
115
|
+
end
|
116
|
+
|
117
|
+
it "should have a transformed_url_length" do
|
118
|
+
expect(@config.transformed_url_length).to eq(23)
|
119
|
+
end
|
120
|
+
|
121
|
+
it "should have a configured range" do
|
122
|
+
expect(@config.ranges).to be_kind_of(Array)
|
123
|
+
expect(@config.ranges.count).to be > 0
|
124
|
+
expect(@config.ranges[0]).to be_kind_of(Twitter::TwitterText::WeightedRange)
|
125
|
+
weighted_range = @config.ranges[0]
|
126
|
+
expect(weighted_range.start).to be_kind_of(Integer)
|
127
|
+
expect(weighted_range.end).to be_kind_of(Integer)
|
128
|
+
expect(weighted_range.weight).to be_kind_of(Integer)
|
129
|
+
end
|
130
|
+
|
131
|
+
it "should support discounting emoji" do
|
132
|
+
expect(@config.emoji_parsing_enabled).to be true
|
133
|
+
end
|
134
|
+
end
|
90
135
|
end
|
91
136
|
end
|
data/spec/extractor_spec.rb
CHANGED
data/spec/hithighlighter_spec.rb
CHANGED
data/spec/regex_spec.rb
CHANGED
@@ -1,3 +1,7 @@
|
|
1
|
+
# Copyright 2018 Twitter, Inc.
|
2
|
+
# Licensed under the Apache License, Version 2.0
|
3
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
4
|
+
|
1
5
|
# encoding: utf-8
|
2
6
|
require File.dirname(__FILE__) + '/spec_helper'
|
3
7
|
|
@@ -35,4 +39,38 @@ describe "Twitter::TwitterText::Regex regular expressions" do
|
|
35
39
|
end
|
36
40
|
|
37
41
|
end
|
42
|
+
|
43
|
+
describe "matching Unicode 10.0 emoji" do
|
44
|
+
it "should match new emoji" do
|
45
|
+
input = "Unicode 10.0; grinning face with one large and one small eye: 🤪; woman with headscarf: 🧕; (fitzpatrick) woman with headscarf + medium-dark skin tone: 🧕🏾; flag (England): 🏴"
|
46
|
+
expected = ["🤪", "🧕", "🧕🏾", "🏴"]
|
47
|
+
entities = Twitter::TwitterText::Extractor.extract_emoji_with_indices(input)
|
48
|
+
entities.each_with_index do |entity, i|
|
49
|
+
expect(entity[:emoji]).to be_kind_of(String)
|
50
|
+
expect(entity[:indices]).to be_kind_of(Array)
|
51
|
+
entity[:indices].each do |position|
|
52
|
+
expect(position).to be_kind_of(Integer)
|
53
|
+
end
|
54
|
+
expect(entity[:emoji]).to be == expected[i]
|
55
|
+
expect(Twitter::TwitterText::Extractor.is_valid_emoji(entity[:emoji])).to be true
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
describe "matching Unicode 9.0 emoji" do
|
61
|
+
it "should match new emoji" do
|
62
|
+
input = "Unicode 9.0; face with cowboy hat: 🤠; woman dancing: 💃, woman dancing + medium-dark skin tone: 💃🏾"
|
63
|
+
expected = ["🤠", "💃", "💃🏾"]
|
64
|
+
entities = Twitter::TwitterText::Extractor.extract_emoji_with_indices(input)
|
65
|
+
entities.each_with_index do |entity, i|
|
66
|
+
expect(entity[:emoji]).to be_kind_of(String)
|
67
|
+
expect(entity[:indices]).to be_kind_of(Array)
|
68
|
+
entity[:indices].each do |position|
|
69
|
+
expect(position).to be_kind_of(Integer)
|
70
|
+
end
|
71
|
+
expect(entity[:emoji]).to be == expected[i]
|
72
|
+
expect(Twitter::TwitterText::Extractor.is_valid_emoji(entity[:emoji])).to be true
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
38
76
|
end
|
data/spec/rewriter_spec.rb
CHANGED
data/spec/spec_helper.rb
CHANGED
@@ -1,3 +1,7 @@
|
|
1
|
+
# Copyright 2018 Twitter, Inc.
|
2
|
+
# Licensed under the Apache License, Version 2.0
|
3
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
4
|
+
|
1
5
|
$TESTING=true
|
2
6
|
|
3
7
|
# Ruby 1.8 encoding check
|
@@ -20,6 +24,15 @@ require File.expand_path('../test_urls', __FILE__)
|
|
20
24
|
|
21
25
|
RSpec.configure do |config|
|
22
26
|
config.include TestUrls
|
27
|
+
|
28
|
+
config.filter_run_excluding :ruby => lambda { |version|
|
29
|
+
case version.to_s
|
30
|
+
when /^> (.*)/
|
31
|
+
!(RUBY_VERSION.to_s > $1)
|
32
|
+
else
|
33
|
+
!(RUBY_VERSION.to_s =~ /^#{version.to_s}/)
|
34
|
+
end
|
35
|
+
}
|
23
36
|
end
|
24
37
|
|
25
38
|
RSpec::Matchers.define :match_autolink_expression do
|
data/spec/test_urls.rb
CHANGED
@@ -1,3 +1,7 @@
|
|
1
|
+
# Copyright 2018 Twitter, Inc.
|
2
|
+
# Licensed under the Apache License, Version 2.0
|
3
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
4
|
+
|
1
5
|
# encoding: utf-8
|
2
6
|
|
3
7
|
module TestUrls
|
@@ -53,11 +57,11 @@ module TestUrls
|
|
53
57
|
"http://no_underscores.com",
|
54
58
|
"http://test.c_o_m",
|
55
59
|
"http://test.c-o-m",
|
56
|
-
"http://twitt#{[0x202A].pack('U')}
|
57
|
-
"http://twitt#{[0x202B].pack('U')}
|
58
|
-
"http://twitt#{[0x202C].pack('U')}
|
59
|
-
"http://twitt#{[0x202D].pack('U')}
|
60
|
-
"http://twitt#{[0x202E].pack('U')}
|
60
|
+
"http://twitt#{[0x202A].pack('U')}.com",
|
61
|
+
"http://twitt#{[0x202B].pack('U')}.com",
|
62
|
+
"http://twitt#{[0x202C].pack('U')}.com",
|
63
|
+
"http://twitt#{[0x202D].pack('U')}.com",
|
64
|
+
"http://twitt#{[0x202E].pack('U')}.com",
|
61
65
|
"https://somesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesup
erlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurl.com/fo
o https://somesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesu
perlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurl.com/f
oo https://somesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomes
uperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurl.com/
foo"
|
62
66
|
] unless defined?(TestUrls::INVALID)
|
63
67
|
|
data/spec/twitter_text_spec.rb
CHANGED
data/spec/unicode_spec.rb
CHANGED
data/spec/validation_spec.rb
CHANGED
@@ -1,3 +1,7 @@
|
|
1
|
+
# Copyright 2018 Twitter, Inc.
|
2
|
+
# Licensed under the Apache License, Version 2.0
|
3
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
4
|
+
|
1
5
|
# encoding: utf-8
|
2
6
|
require File.dirname(__FILE__) + '/spec_helper'
|
3
7
|
|
@@ -16,9 +20,9 @@ describe Twitter::TwitterText::Validation do
|
|
16
20
|
expect(TestValidation.new.tweet_invalid?("Bom:#{Twitter::TwitterText::Unicode::UFFFF}")).to be == :invalid_characters
|
17
21
|
end
|
18
22
|
|
19
|
-
it "should
|
23
|
+
it "should allow direction change characters" do
|
20
24
|
[0x202A, 0x202B, 0x202C, 0x202D, 0x202E].map{|cp| [cp].pack('U') }.each do |char|
|
21
|
-
expect(TestValidation.new.tweet_invalid?("Invalid:#{char}")).to
|
25
|
+
expect(TestValidation.new.tweet_invalid?("Invalid:#{char}")).to be false
|
22
26
|
end
|
23
27
|
end
|
24
28
|
|
@@ -64,4 +68,20 @@ describe Twitter::TwitterText::Validation do
|
|
64
68
|
expect(results[:valid_range_end]).to eq(0)
|
65
69
|
end
|
66
70
|
end
|
71
|
+
|
72
|
+
context "when parsing tweet text" do
|
73
|
+
it "should properly parse ZWJ and ZWNJ when grapheme clusters are enabled", ruby: ">= 2.5.0" do
|
74
|
+
# Grapheme clustering of Devanagari script differs based on platform implementation
|
75
|
+
text = "ZWJ: क्ष -> क्\u200Dष; ZWNJ: क्ष -> क्\u200Cष"
|
76
|
+
config = Twitter::TwitterText::Configuration::configuration_from_file(Twitter::TwitterText::Configuration::CONFIG_V3)
|
77
|
+
results = Twitter::TwitterText::Validation::parse_tweet(text, config: config)
|
78
|
+
expect(results[:weighted_length]).to eq(29)
|
79
|
+
expect(results[:permillage]).to eq(103)
|
80
|
+
expect(results[:valid]).to be true
|
81
|
+
expect(results[:display_range_start]).to eq(0)
|
82
|
+
expect(results[:display_range_end]).to eq(34)
|
83
|
+
expect(results[:valid_range_start]).to eq(0)
|
84
|
+
expect(results[:valid_range_end]).to eq(34)
|
85
|
+
end
|
86
|
+
end
|
67
87
|
end
|
data/test/conformance_test.rb
CHANGED
@@ -1,3 +1,7 @@
|
|
1
|
+
# Copyright 2018 Twitter, Inc.
|
2
|
+
# Licensed under the Apache License, Version 2.0
|
3
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
4
|
+
|
1
5
|
require 'multi_json'
|
2
6
|
require 'nokogiri'
|
3
7
|
require 'test/unit'
|
@@ -130,6 +134,11 @@ class ConformanceTest < Test::Unit::TestCase
|
|
130
134
|
assert_equal e, extract_urls_with_indices(text), description
|
131
135
|
end
|
132
136
|
|
137
|
+
def_conformance_test("extract.yml", :urls_with_directional_markers) do
|
138
|
+
e = expected.map{|elem| elem.inject({}){|h, (k,v)| h[k.to_sym] = v; h} }
|
139
|
+
assert_equal e, extract_urls_with_indices(text), description
|
140
|
+
end
|
141
|
+
|
133
142
|
def_conformance_test("extract.yml", :hashtags) do
|
134
143
|
assert_equal expected, extract_hashtags(text), description
|
135
144
|
end
|
@@ -215,11 +224,19 @@ class ConformanceTest < Test::Unit::TestCase
|
|
215
224
|
assert_equal expected, valid_hashtag?(text), description
|
216
225
|
end
|
217
226
|
|
218
|
-
def_conformance_test("validate.yml", :
|
219
|
-
|
227
|
+
def_conformance_test("validate.yml", :WeightedTweetsCounterTest) do
|
228
|
+
# Force v2 configuration, basic weighted code point support
|
229
|
+
config = Twitter::TwitterText::Configuration::configuration_from_file(Twitter::TwitterText::Configuration::CONFIG_V2)
|
230
|
+
assert_equal_parse_results expected, parse_tweet(text, config: config), description
|
220
231
|
end
|
221
232
|
|
222
|
-
def_conformance_test("validate.yml", :
|
233
|
+
def_conformance_test("validate.yml", :WeightedTweetsWithDiscountedEmojiCounterTest) do
|
234
|
+
# Force v3 configuration, which supports discounting grapheme clusters that are emoji
|
235
|
+
config = Twitter::TwitterText::Configuration::configuration_from_file(Twitter::TwitterText::Configuration::CONFIG_V3)
|
236
|
+
assert_equal_parse_results expected, parse_tweet(text, config: config), description
|
237
|
+
end
|
238
|
+
|
239
|
+
def_conformance_test("validate.yml", :UnicodeDirectionalMarkerCounterTest) do
|
223
240
|
assert_equal_parse_results expected, parse_tweet(text), description
|
224
241
|
end
|
225
242
|
end
|
data/twitter-text.gemspec
CHANGED
@@ -1,8 +1,12 @@
|
|
1
|
+
# Copyright 2018 Twitter, Inc.
|
2
|
+
# Licensed under the Apache License, Version 2.0
|
3
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
4
|
+
|
1
5
|
# encoding: utf-8
|
2
6
|
|
3
7
|
Gem::Specification.new do |s|
|
4
8
|
s.name = "twitter-text"
|
5
|
-
s.version = "
|
9
|
+
s.version = "3.0.0"
|
6
10
|
s.authors = ["David LaMacchia", "Sudheer Guntupalli", "Kaushik Lakshmikanth", "Jose Antonio Marquez Russo", "Lee Adams",
|
7
11
|
"Yoshimasa Niwa"]
|
8
12
|
s.email = ["opensource@twitter.com"]
|
@@ -14,7 +18,6 @@ Gem::Specification.new do |s|
|
|
14
18
|
s.has_rdoc = true
|
15
19
|
s.summary = "Twitter text handling library"
|
16
20
|
|
17
|
-
s.add_development_dependency "pry"
|
18
21
|
s.add_development_dependency "test-unit"
|
19
22
|
s.add_development_dependency "multi_json", "~> 1.3"
|
20
23
|
s.add_development_dependency "nokogiri", "~> 1.8.0"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twitter-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 3.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David LaMacchia
|
@@ -13,22 +13,8 @@ authors:
|
|
13
13
|
autorequire:
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
|
-
date:
|
16
|
+
date: 2018-10-14 00:00:00.000000000 Z
|
17
17
|
dependencies:
|
18
|
-
- !ruby/object:Gem::Dependency
|
19
|
-
name: pry
|
20
|
-
requirement: !ruby/object:Gem::Requirement
|
21
|
-
requirements:
|
22
|
-
- - ">="
|
23
|
-
- !ruby/object:Gem::Version
|
24
|
-
version: '0'
|
25
|
-
type: :development
|
26
|
-
prerelease: false
|
27
|
-
version_requirements: !ruby/object:Gem::Requirement
|
28
|
-
requirements:
|
29
|
-
- - ">="
|
30
|
-
- !ruby/object:Gem::Version
|
31
|
-
version: '0'
|
32
18
|
- !ruby/object:Gem::Dependency
|
33
19
|
name: test-unit
|
34
20
|
requirement: !ruby/object:Gem::Requirement
|
@@ -174,11 +160,13 @@ files:
|
|
174
160
|
- config/README.md
|
175
161
|
- config/v1.json
|
176
162
|
- config/v2.json
|
163
|
+
- config/v3.json
|
177
164
|
- lib/assets/tld_lib.yml
|
178
165
|
- lib/twitter-text.rb
|
179
166
|
- lib/twitter-text/autolink.rb
|
180
167
|
- lib/twitter-text/configuration.rb
|
181
168
|
- lib/twitter-text/deprecation.rb
|
169
|
+
- lib/twitter-text/emoji_regex.rb
|
182
170
|
- lib/twitter-text/extractor.rb
|
183
171
|
- lib/twitter-text/hash_helper.rb
|
184
172
|
- lib/twitter-text/hit_highlighter.rb
|
@@ -222,7 +210,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
222
210
|
version: '0'
|
223
211
|
requirements: []
|
224
212
|
rubyforge_project:
|
225
|
-
rubygems_version: 2.
|
213
|
+
rubygems_version: 2.5.1
|
226
214
|
signing_key:
|
227
215
|
specification_version: 4
|
228
216
|
summary: Twitter text handling library
|