twitter-text-simpleidn 3.0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gemtest +0 -0
- data/.gitignore +40 -0
- data/.gitmodules +3 -0
- data/.rspec +2 -0
- data/CHANGELOG.md +35 -0
- data/Gemfile +4 -0
- data/LICENSE +188 -0
- data/README.md +193 -0
- data/Rakefile +52 -0
- data/config/README.md +142 -0
- data/config/v1.json +8 -0
- data/config/v2.json +29 -0
- data/config/v3.json +30 -0
- data/lib/assets/tld_lib.yml +1571 -0
- data/lib/twitter-text.rb +29 -0
- data/lib/twitter-text/autolink.rb +453 -0
- data/lib/twitter-text/configuration.rb +68 -0
- data/lib/twitter-text/deprecation.rb +21 -0
- data/lib/twitter-text/emoji_regex.rb +27 -0
- data/lib/twitter-text/extractor.rb +388 -0
- data/lib/twitter-text/hash_helper.rb +27 -0
- data/lib/twitter-text/hit_highlighter.rb +92 -0
- data/lib/twitter-text/regex.rb +381 -0
- data/lib/twitter-text/rewriter.rb +69 -0
- data/lib/twitter-text/unicode.rb +31 -0
- data/lib/twitter-text/validation.rb +251 -0
- data/lib/twitter-text/weighted_range.rb +24 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/spec/autolinking_spec.rb +848 -0
- data/spec/configuration_spec.rb +136 -0
- data/spec/extractor_spec.rb +392 -0
- data/spec/hithighlighter_spec.rb +96 -0
- data/spec/regex_spec.rb +76 -0
- data/spec/rewriter_spec.rb +553 -0
- data/spec/spec_helper.rb +139 -0
- data/spec/test_urls.rb +90 -0
- data/spec/twitter_text_spec.rb +25 -0
- data/spec/unicode_spec.rb +35 -0
- data/spec/validation_spec.rb +87 -0
- data/test/conformance_test.rb +242 -0
- data/twitter-text.gemspec +35 -0
- metadata +229 -0
@@ -0,0 +1,68 @@
|
|
1
|
+
# Copyright 2018 Twitter, Inc.
# Licensed under the Apache License, Version 2.0
# http://www.apache.org/licenses/LICENSE-2.0

# encoding: UTF-8

module Twitter
  module TwitterText
    # Holds one tweet-parsing configuration: parser version, weighted-length
    # limits, per-codepoint weight ranges, and the emoji-parsing flag.
    # Instances are normally built from the JSON files shipped in the gem's
    # config/ directory (v1.json, v2.json, v3.json).
    class Configuration
      require 'json'

      PARSER_VERSION_CLASSIC = "v1"
      PARSER_VERSION_WEIGHTED = "v2"
      PARSER_VERSION_EMOJI_PARSING = "v3"

      PARSER_VERSION_DEFAULT = PARSER_VERSION_WEIGHTED

      class << self
        # The Configuration used when callers don't pass one explicitly.
        attr_accessor :default_configuration
      end

      attr_reader :version, :max_weighted_tweet_length, :scale
      attr_reader :default_weight, :transformed_url_length, :ranges
      attr_reader :emoji_parsing_enabled

      CONFIG_V1 = File.join(
        File.expand_path('../../../config', __FILE__), # project root
        "#{PARSER_VERSION_CLASSIC}.json"
      )

      CONFIG_V2 = File.join(
        File.expand_path('../../../config', __FILE__), # project root
        "#{PARSER_VERSION_WEIGHTED}.json"
      )

      CONFIG_V3 = File.join(
        File.expand_path('../../../config', __FILE__), # project root
        "#{PARSER_VERSION_EMOJI_PARSING}.json"
      )

      # Parse a JSON configuration string into a Hash with symbol keys.
      # Extra JSON.parse options may be supplied via +options+.
      def self.parse_string(string, options = {})
        JSON.parse(string, options.merge(symbolize_names: true))
      end

      # Read and parse a JSON configuration file.
      # File.binread replaces the verbose File.open(..., 'rb') { |f| f.read }.
      def self.parse_file(filename)
        parse_string(File.binread(filename))
      end

      # Build a Configuration from a JSON file, or nil if parsing yields nothing.
      def self.configuration_from_file(filename)
        config = parse_file(filename)
        config ? self.new(config) : nil
      end

      # Build a Configuration from an already-parsed (symbol-keyed) Hash.
      # Missing keys simply leave the corresponding attribute nil.
      def initialize(config = {})
        @version = config[:version]
        @max_weighted_tweet_length = config[:maxWeightedTweetLength]
        @scale = config[:scale]
        @default_weight = config[:defaultWeight]
        @transformed_url_length = config[:transformedURLLength]
        @emoji_parsing_enabled = config[:emojiParsingEnabled]
        @ranges = config[:ranges].map { |range| Twitter::TwitterText::WeightedRange.new(range) } if config.key?(:ranges) && config[:ranges].is_a?(Array)
      end

      # Loaded eagerly at require time; v3 (emoji parsing) is the default.
      self.default_configuration = self.configuration_from_file(CONFIG_V3)
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# Copyright 2018 Twitter, Inc.
# Licensed under the Apache License, Version 2.0
# http://www.apache.org/licenses/LICENSE-2.0

module Twitter
  module TwitterText
    # Mixin (used via +extend+) that lets a class flag instance methods as
    # deprecated. The original implementation stays reachable under
    # +deprecated_<name>+ while the public name gains a warning.
    module Deprecation
      # Wrap +method+ so every call emits a deprecation warning (silenced when
      # the global $TESTING flag is set) and then delegates to the original.
      # When +new_method+ is given, the warning points callers at it.
      def deprecate(method, new_method = nil)
        preserved_name = :"deprecated_#{method}"
        notice = "Deprecation: `#{method}` is deprecated."
        notice << " Please use `#{new_method}` instead." if new_method

        alias_method(preserved_name, method)
        define_method method do |*args, &block|
          warn notice unless $TESTING
          send(preserved_name, *args, &block)
        end
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
#
# emoji_regex.rb
#
# Copyright © 2018 Twitter. All rights reserved.
#
# DO NOT MODIFY THIS FILE -- it is generated for twitter-text automatically

# encoding: utf-8

module Twitter
  module TwitterText
    class Regex
      class Emoji
        REGEXEN = {} # :nodoc:

        # Matches exactly one emoji, including multi-codepoint sequences
        # (ZWJ families/professions, skin-tone modifiers, keycaps, flags and
        # regional-indicator pairs, and variation selectors).
        REGEXEN[:valid_emoji] = %r{
          [\u{01f468}\u{01f469}][\u{01f3fb}-\u{01f3ff}]?\u200d(?:\u2695\ufe0f|\u2696\ufe0f|\u2708\ufe0f|[\u{01f33e}\u{01f373}\u{01f393}\u{01f3a4}\u{01f3a8}\u{01f3eb}\u{01f3ed}\u{01f4bb}\u{01f4bc}\u{01f527}\u{01f52c}\u{01f680}\u{01f692}\u{01f9b0}-\u{01f9b3}])|[\u26f9\u{01f3cb}\u{01f3cc}\u{01f574}\u{01f575}](?:[\ufe0f\u{01f3fb}-\u{01f3ff}]\u200d[\u2640\u2642]\ufe0f)|[\u{01f3c3}\u{01f3c4}\u{01f3ca}\u{01f46e}\u{01f471}\u{01f473}\u{01f477}\u{01f481}\u{01f482}\u{01f486}\u{01f487}\u{01f645}-\u{01f647}\u{01f64b}\u{01f64d}\u{01f64e}\u{01f6a3}\u{01f6b4}-\u{01f6b6}\u{01f926}\u{01f935}\u{01f937}-\u{01f939}\u{01f93d}\u{01f93e}\u{01f9b8}\u{01f9b9}\u{01f9d6}-\u{01f9dd}][\u{01f3fb}-\u{01f3ff}]?\u200d[\u2640\u2642]\ufe0f|(?:\u{01f468}\u200d\u2764\ufe0f\u200d\u{01f48b}\u200d\u{01f468}|\u{01f469}\u200d\u2764\ufe0f\u200d\u{01f48b}\u200d[\u{01f468}\u{01f469}]|\u{01f468}\u200d\u{01f468}\u200d\u{01f466}\u200d\u{01f466}|\u{01f468}\u200d\u{01f468}\u200d\u{01f467}\u200d[\u{01f466}\u{01f467}]|\u{01f468}\u200d\u{01f469}\u200d\u{01f466}\u200d\u{01f466}|\u{01f468}\u200d\u{01f469}\u200d\u{01f467}\u200d[\u{01f466}\u{01f467}]|\u{01f469}\u200d\u{01f469}\u200d\u{01f466}\u200d\u{01f466}|\u{01f469}\u200d\u{01f469}\u200d\u{01f467}\u200d[\u{01f466}\u{01f467}]|\u{01f468}\u200d\u2764\ufe0f\u200d\u{01f468}|\u{01f469}\u200d\u2764\ufe0f\u200d[\u{01f468}\u{01f469}]|\u{01f468}\u200d[\u{01f466}\u200d\u{01f466}]|\u{01f468}\u200d\u{01f466}\u200d\u{01f466}|\u{01f468}\u200d\u{01f467}\u200d[\u{01f466}\u{01f467}]|\u{01f468}\u200d\u{01f468}\u200d[\u{01f466}\u{01f467}]|\u{01f468}\u200d\u{01f469}\u200d[\u{01f466}\u{01f467}]|\u{01f469}\u200d\u{01f466}\u200d\u{01f466}|\u{01f469}\u200d\u{01f467}\u200d[\u{01f466}\u{01f467}]|\u{01f469}\u200d\u{01f469}\u200d[\u{01f466}\u{01f467}]|\u{01f3f3}\ufe0f\u200d\u{01f308}|\u{01f3f4}\u200d\u2620\ufe0f|\u{01f46f}\u200d\u2640\ufe0f|\u{01f46f}\u200d\u2642\ufe0f|\u{01f93c}\u200d\u2640\ufe0f|\u{01f93c}\u200d\u2642\ufe0f|\u{01f9de}\u200d\u2640\ufe0f|\u{01f9de}\u200d\u2642\ufe0f|\u{01f9df}\u200d\u2640\ufe0f|\u{01f9df}\u200d\u2642\ufe0f|\u{01f441}\u200d\u{01f5e8}|\u{01f468}\u200d[\u{01f466}\u{01f467}]|\u{01f469}\u200d[\u{01f466}\u{01f467}])|[#*0-9]\ufe0f?\u20e3|(?:[©®\u2122\u265f]\ufe0f)|[\u203c\u2049\u2139\u2194-\u2199\u21a9\u21aa\u231a\u231b\u2328\u23cf\u23ed-\u23ef\u23f1\u23f2\u23f8-\u23fa\u24c2\u25aa\u25ab\u25b6\u25c0\u25fb-\u25fe\u2600-\u2604\u260e\u2611\u2614\u2615\u2618\u2620\u2622\u2623\u2626\u262a\u262e\u262f\u2638-\u263a\u2640\u2642\u2648-\u2653\u2660\u2663\u2665\u2666\u2668\u267b\u267f\u2692-\u2697\u2699\u269b\u269c\u26a0\u26a1\u26aa\u26ab\u26b0\u26b1\u26bd\u26be\u26c4\u26c5\u26c8\u26cf\u26d1\u26d3\u26d4\u26e9\u26ea\u26f0-\u26f5\u26f8\u26fa\u26fd\u2702\u2708\u2709\u270f\u2712\u2714\u2716\u271d\u2721\u2733\u2734\u2744\u2747\u2757\u2763\u2764\u27a1\u2934\u2935\u2b05-\u2b07\u2b1b\u2b1c\u2b50\u2b55\u3030\u303d\u3297\u3299\u{01f004}\u{01f170}\u{01f171}\u{01f17e}\u{01f17f}\u{01f202}\u{01f21a}\u{01f22f}\u{01f237}\u{01f321}\u{01f324}-\u{01f32c}\u{01f336}\u{01f37d}\u{01f396}\u{01f397}\u{01f399}-\u{01f39b}\u{01f39e}\u{01f39f}\u{01f3cd}\u{01f3ce}\u{01f3d4}-\u{01f3df}\u{01f3f3}\u{01f3f5}\u{01f3f7}\u{01f43f}\u{01f441}\u{01f4fd}\u{01f549}\u{01f54a}\u{01f56f}\u{01f570}\u{01f573}\u{01f576}-\u{01f579}\u{01f587}\u{01f58a}-\u{01f58d}\u{01f5a5}\u{01f5a8}\u{01f5b1}\u{01f5b2}\u{01f5bc}\u{01f5c2}-\u{01f5c4}\u{01f5d1}-\u{01f5d3}\u{01f5dc}-\u{01f5de}\u{01f5e1}\u{01f5e3}\u{01f5e8}\u{01f5ef}\u{01f5f3}\u{01f5fa}\u{01f6cb}\u{01f6cd}-\u{01f6cf}\u{01f6e0}-\u{01f6e5}\u{01f6e9}\u{01f6f0}\u{01f6f3}](?:\ufe0f|(?!\ufe0e))|(?:[\u261d\u26f7\u26f9\u270c\u270d\u{01f3cb}\u{01f3cc}\u{01f574}\u{01f575}\u{01f590}](?:\ufe0f|(?!\ufe0e))|[\u270a\u270b\u{01f385}\u{01f3c2}-\u{01f3c4}\u{01f3c7}\u{01f3ca}\u{01f442}\u{01f443}\u{01f446}-\u{01f450}\u{01f466}-\u{01f469}\u{01f46e}\u{01f470}-\u{01f478}\u{01f47c}\u{01f481}-\u{01f483}\u{01f485}-\u{01f487}\u{01f4aa}\u{01f57a}\u{01f595}\u{01f596}\u{01f645}-\u{01f647}\u{01f64b}-\u{01f64f}\u{01f6a3}\u{01f6b4}-\u{01f6b6}\u{01f6c0}\u{01f6cc}\u{01f918}-\u{01f91c}\u{01f91e}\u{01f91f}\u{01f926}\u{01f930}-\u{01f939}\u{01f93d}\u{01f93e}\u{01f9b5}\u{01f9b6}\u{01f9b8}\u{01f9b9}\u{01f9d1}-\u{01f9dd}])[\u{01f3fb}-\u{01f3ff}]?|(?:\u{01f3f4}\u{0e0067}\u{0e0062}\u{0e0065}\u{0e006e}\u{0e0067}\u{0e007f}|\u{01f3f4}\u{0e0067}\u{0e0062}\u{0e0073}\u{0e0063}\u{0e0074}\u{0e007f}|\u{01f3f4}\u{0e0067}\u{0e0062}\u{0e0077}\u{0e006c}\u{0e0073}\u{0e007f}|\u{01f1e6}[\u{01f1e8}-\u{01f1ec}\u{01f1ee}\u{01f1f1}\u{01f1f2}\u{01f1f4}\u{01f1f6}-\u{01f1fa}\u{01f1fc}\u{01f1fd}\u{01f1ff}]|\u{01f1e7}[\u{01f1e6}\u{01f1e7}\u{01f1e9}-\u{01f1ef}\u{01f1f1}-\u{01f1f4}\u{01f1f6}-\u{01f1f9}\u{01f1fb}\u{01f1fc}\u{01f1fe}\u{01f1ff}]|\u{01f1e8}[\u{01f1e6}\u{01f1e8}\u{01f1e9}\u{01f1eb}-\u{01f1ee}\u{01f1f0}-\u{01f1f5}\u{01f1f7}\u{01f1fa}-\u{01f1ff}]|\u{01f1e9}[\u{01f1ea}\u{01f1ec}\u{01f1ef}\u{01f1f0}\u{01f1f2}\u{01f1f4}\u{01f1ff}]|\u{01f1ea}[\u{01f1e6}\u{01f1e8}\u{01f1ea}\u{01f1ec}\u{01f1ed}\u{01f1f7}-\u{01f1fa}]|\u{01f1eb}[\u{01f1ee}-\u{01f1f0}\u{01f1f2}\u{01f1f4}\u{01f1f7}]|\u{01f1ec}[\u{01f1e6}\u{01f1e7}\u{01f1e9}-\u{01f1ee}\u{01f1f1}-\u{01f1f3}\u{01f1f5}-\u{01f1fa}\u{01f1fc}\u{01f1fe}]|\u{01f1ed}[\u{01f1f0}\u{01f1f2}\u{01f1f3}\u{01f1f7}\u{01f1f9}\u{01f1fa}]|\u{01f1ee}[\u{01f1e8}-\u{01f1ea}\u{01f1f1}-\u{01f1f4}\u{01f1f6}-\u{01f1f9}]|\u{01f1ef}[\u{01f1ea}\u{01f1f2}\u{01f1f4}\u{01f1f5}]|\u{01f1f0}[\u{01f1ea}\u{01f1ec}-\u{01f1ee}\u{01f1f2}\u{01f1f3}\u{01f1f5}\u{01f1f7}\u{01f1fc}\u{01f1fe}\u{01f1ff}]|\u{01f1f1}[\u{01f1e6}-\u{01f1e8}\u{01f1ee}\u{01f1f0}\u{01f1f7}-\u{01f1fb}\u{01f1fe}]|\u{01f1f2}[\u{01f1e6}\u{01f1e8}-\u{01f1ed}\u{01f1f0}-\u{01f1ff}]|\u{01f1f3}[\u{01f1e6}\u{01f1e8}\u{01f1ea}-\u{01f1ec}\u{01f1ee}\u{01f1f1}\u{01f1f4}\u{01f1f5}\u{01f1f7}\u{01f1fa}\u{01f1ff}]|\u{01f1f4}\u{01f1f2}|\u{01f1f5}[\u{01f1e6}\u{01f1ea}-\u{01f1ed}\u{01f1f0}-\u{01f1f3}\u{01f1f7}-\u{01f1f9}\u{01f1fc}\u{01f1fe}]|\u{01f1f6}\u{01f1e6}|\u{01f1f7}[\u{01f1ea}\u{01f1f4}\u{01f1f8}\u{01f1fa}\u{01f1fc}]|\u{01f1f8}[\u{01f1e6}-\u{01f1ea}\u{01f1ec}-\u{01f1f4}\u{01f1f7}-\u{01f1f9}\u{01f1fb}\u{01f1fd}-\u{01f1ff}]|\u{01f1f9}[\u{01f1e6}\u{01f1e8}\u{01f1e9}\u{01f1eb}-\u{01f1ed}\u{01f1ef}-\u{01f1f4}\u{01f1f7}\u{01f1f9}\u{01f1fb}\u{01f1fc}\u{01f1ff}]|\u{01f1fa}[\u{01f1e6}\u{01f1ec}\u{01f1f2}\u{01f1f3}\u{01f1f8}\u{01f1fe}\u{01f1ff}]|\u{01f1fb}[\u{01f1e6}\u{01f1e8}\u{01f1ea}\u{01f1ec}\u{01f1ee}\u{01f1f3}\u{01f1fa}]|\u{01f1fc}[\u{01f1eb}\u{01f1f8}]|\u{01f1fd}\u{01f1f0}|\u{01f1fe}[\u{01f1ea}\u{01f1f9}]|\u{01f1ff}[\u{01f1e6}\u{01f1f2}\u{01f1fc}]|[\u23e9-\u23ec\u23f0\u23f3\u267e\u26ce\u2705\u2728\u274c\u274e\u2753-\u2755\u2795-\u2797\u27b0\u27bf\ue50a\u{01f0cf}\u{01f18e}\u{01f191}-\u{01f19a}\u{01f1e6}-\u{01f1ff}\u{01f201}\u{01f232}-\u{01f236}\u{01f238}-\u{01f23a}\u{01f250}\u{01f251}\u{01f300}-\u{01f320}\u{01f32d}-\u{01f335}\u{01f337}-\u{01f37c}\u{01f37e}-\u{01f384}\u{01f386}-\u{01f393}\u{01f3a0}-\u{01f3c1}\u{01f3c5}\u{01f3c6}\u{01f3c8}\u{01f3c9}\u{01f3cf}-\u{01f3d3}\u{01f3e0}-\u{01f3f0}\u{01f3f4}\u{01f3f8}-\u{01f43e}\u{01f440}\u{01f444}\u{01f445}\u{01f451}-\u{01f465}\u{01f46a}-\u{01f46d}\u{01f46f}\u{01f479}-\u{01f47b}\u{01f47d}-\u{01f480}\u{01f484}\u{01f488}-\u{01f4a9}\u{01f4ab}-\u{01f4fc}\u{01f4ff}-\u{01f53d}\u{01f54b}-\u{01f54e}\u{01f550}-\u{01f567}\u{01f5a4}\u{01f5fb}-\u{01f644}\u{01f648}-\u{01f64a}\u{01f680}-\u{01f6a2}\u{01f6a4}-\u{01f6b3}\u{01f6b7}-\u{01f6bf}\u{01f6c1}-\u{01f6c5}\u{01f6d0}-\u{01f6d2}\u{01f6eb}\u{01f6ec}\u{01f6f4}-\u{01f6f9}\u{01f910}-\u{01f917}\u{01f91d}\u{01f920}-\u{01f925}\u{01f927}-\u{01f92f}\u{01f93a}\u{01f93c}\u{01f940}-\u{01f945}\u{01f947}-\u{01f970}\u{01f973}-\u{01f976}\u{01f97a}\u{01f97c}-\u{01f9a2}\u{01f9b4}\u{01f9b7}\u{01f9c0}-\u{01f9c2}\u{01f9d0}\u{01f9de}-\u{01f9ff}])|\ufe0f
        }iox

        # Hash-style lookup, mirroring Twitter::TwitterText::Regex.[].
        def self.[](key)
          REGEXEN[key]
        end
      end
    end
  end
end
|
@@ -0,0 +1,388 @@
|
|
1
|
+
# Copyright 2018 Twitter, Inc.
|
2
|
+
# Licensed under the Apache License, Version 2.0
|
3
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
4
|
+
|
5
|
+
# encoding: utf-8
|
6
|
+
require 'simpleidn'
|
7
|
+
|
8
|
+
class String
  # Count of unicode code points in the string. On modern Rubies (which
  # expose #codepoints) this is simply #length; the fallback path exists for
  # legacy 1.8-era interpreters where #length counted bytes.
  def codepoint_length
    return length if respond_to? :codepoints
    chars.kind_of?(Enumerable) ? chars.to_a.size : chars.size
  end

  # The string as an array of single-character strings (code points), memoized
  # per instance in @to_codepoint_a.
  # NOTE(review): the memo goes stale if the string is mutated afterwards —
  # callers appear to treat extracted text as immutable, but verify.
  def to_codepoint_a
    @to_codepoint_a ||=
      if chars.kind_of?(Enumerable)
        chars.to_a
      else
        # Legacy path: rebuild each character from its code point.
        (0...codepoint_length).map { |i| [chars.slice(i)].pack('U') }
      end
  end
end
|
31
|
+
|
32
|
+
# Helper functions to return code point offsets instead of byte offsets.
class MatchData
  # Code-point offset of the start of capture group +n+. On modern Rubies
  # MatchData#begin already returns character offsets, so it is returned
  # directly; the fallback converts a byte offset via codepoint_length.
  def char_begin(n)
    return self.begin(n) if string.respond_to? :codepoints
    string[0, self.begin(n)].codepoint_length
  end

  # Code-point offset just past the end of capture group +n+; same strategy
  # as char_begin.
  def char_end(n)
    return self.end(n) if string.respond_to? :codepoints
    string[0, self.end(n)].codepoint_length
  end
end
|
50
|
+
|
51
|
+
module Twitter
  module TwitterText
    # A module for including Tweet parsing in a class. This module provides functions for the extraction and processing
    # of usernames, lists, URLs and hashtags.
    module Extractor extend self

      # Maximum URL length as defined by Twitter's backend.
      MAX_URL_LENGTH = 4096

      # The maximum t.co path length that the Twitter backend supports.
      MAX_TCO_SLUG_LENGTH = 40

      # Length of the scheme added when validating protocol-less URLs.
      URL_PROTOCOL_LENGTH = "https://".length

      # Remove overlapping entities.
      # This returns a new array with no overlapping entities.
      def remove_overlapping_entities(entities)
        # sort by start index
        entities = entities.sort_by{|entity| entity[:indices].first}

        # remove duplicates
        prev = nil
        entities.reject!{|entity| (prev && prev[:indices].last > entity[:indices].first) || (prev = entity) && false}
        entities
      end

      # Extracts all usernames, lists, hashtags and URLs in the Tweet <tt>text</tt>
      # along with the indices for where the entity occurred
      # If the <tt>text</tt> is <tt>nil</tt> or contains no entity an empty array
      # will be returned.
      #
      # If a block is given then it will be called for each entity.
      def extract_entities_with_indices(text, options = {}, &block)
        config = options[:config] || Twitter::TwitterText::Configuration.default_configuration

        # extract all entities
        entities = extract_urls_with_indices(text, options) +
                   extract_hashtags_with_indices(text, :check_url_overlap => false) +
                   extract_mentions_or_lists_with_indices(text) +
                   extract_cashtags_with_indices(text)
        entities += extract_emoji_with_indices(text) if config.emoji_parsing_enabled

        return [] if entities.empty?

        entities = remove_overlapping_entities(entities)

        entities.each(&block) if block_given?
        entities
      end

      # Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>. If the
      # <tt>text</tt> is <tt>nil</tt> or contains no username mentions an empty array
      # will be returned.
      #
      # If a block is given then it will be called for each username.
      def extract_mentioned_screen_names(text, &block) # :yields: username
        screen_names = extract_mentioned_screen_names_with_indices(text).map{|m| m[:screen_name]}
        screen_names.each(&block) if block_given?
        screen_names
      end

      # Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>
      # along with the indices for where the mention occurred. If the
      # <tt>text</tt> is nil or contains no username mentions, an empty array
      # will be returned.
      #
      # If a block is given, then it will be called with each username, the start
      # index, and the end index in the <tt>text</tt>.
      def extract_mentioned_screen_names_with_indices(text) # :yields: username, start, end
        return [] unless text

        possible_screen_names = []
        extract_mentions_or_lists_with_indices(text) do |screen_name, list_slug, start_position, end_position|
          # list mentions (@user/list) are reported by the helper; skip them here
          next unless list_slug.empty?
          possible_screen_names << {
            :screen_name => screen_name,
            :indices => [start_position, end_position]
          }
        end

        if block_given?
          possible_screen_names.each do |mention|
            yield mention[:screen_name], mention[:indices].first, mention[:indices].last
          end
        end

        possible_screen_names
      end

      # Extracts a list of all usernames or lists mentioned in the Tweet <tt>text</tt>
      # along with the indices for where the mention occurred. If the
      # <tt>text</tt> is nil or contains no username or list mentions, an empty array
      # will be returned.
      #
      # If a block is given, then it will be called with each username, list slug, the start
      # index, and the end index in the <tt>text</tt>. The list_slug will be an empty string
      # if this is a username mention.
      def extract_mentions_or_lists_with_indices(text) # :yields: username, list_slug, start, end
        # quick reject: no ASCII or fullwidth @ means no mentions
        return [] unless text =~ /[@＠]/

        possible_entries = []
        text.to_s.scan(Twitter::TwitterText::Regex[:valid_mention_or_list]) do |before, at, screen_name, list_slug|
          match_data = $~
          after = $'
          unless after =~ Twitter::TwitterText::Regex[:end_mention_match]
            # include the @ sign in the reported indices
            start_position = match_data.char_begin(3) - 1
            end_position = match_data.char_end(list_slug.nil? ? 3 : 4)
            possible_entries << {
              :screen_name => screen_name,
              :list_slug => list_slug || "",
              :indices => [start_position, end_position]
            }
          end
        end

        if block_given?
          possible_entries.each do |mention|
            yield mention[:screen_name], mention[:list_slug], mention[:indices].first, mention[:indices].last
          end
        end

        possible_entries
      end

      # Extracts the username username replied to in the Tweet <tt>text</tt>. If the
      # <tt>text</tt> is <tt>nil</tt> or is not a reply nil will be returned.
      #
      # If a block is given then it will be called with the username replied to (if any)
      def extract_reply_screen_name(text) # :yields: username
        return nil unless text

        possible_screen_name = text.match(Twitter::TwitterText::Regex[:valid_reply])
        return unless possible_screen_name.respond_to?(:captures)
        return if $' =~ Twitter::TwitterText::Regex[:end_mention_match]
        screen_name = possible_screen_name.captures.first
        yield screen_name if block_given?
        screen_name
      end

      # Extracts a list of all URLs included in the Tweet <tt>text</tt>. If the
      # <tt>text</tt> is <tt>nil</tt> or contains no URLs an empty array
      # will be returned.
      #
      # If a block is given then it will be called for each URL.
      def extract_urls(text, &block) # :yields: url
        urls = extract_urls_with_indices(text).map{|u| u[:url]}
        urls.each(&block) if block_given?
        urls
      end

      # Extracts a list of all URLs included in the Tweet <tt>text</tt> along
      # with the indices. If the <tt>text</tt> is <tt>nil</tt> or contains no
      # URLs an empty array will be returned.
      #
      # If a block is given then it will be called for each URL.
      def extract_urls_with_indices(text, options = {:extract_url_without_protocol => true}) # :yields: url, start, end
        return [] unless text && (options[:extract_url_without_protocol] ? text.index(".") : text.index(":"))
        urls = []

        text.to_s.scan(Twitter::TwitterText::Regex[:valid_url]) do |all, before, url, protocol, domain, port, path, query|
          valid_url_match_data = $~

          start_position = valid_url_match_data.char_begin(3)
          end_position = valid_url_match_data.char_end(3)

          # If protocol is missing and domain contains non-ASCII characters,
          # extract ASCII-only domains.
          if !protocol
            next if !options[:extract_url_without_protocol] || before =~ Twitter::TwitterText::Regex[:invalid_url_without_protocol_preceding_chars]
            last_url = nil
            domain.scan(Twitter::TwitterText::Regex[:valid_ascii_domain]) do |ascii_domain|
              next unless is_valid_domain(url.length, ascii_domain, protocol)
              last_url = {
                :url => ascii_domain,
                :indices => [start_position + $~.char_begin(0),
                             start_position + $~.char_end(0)]
              }
              urls << last_url
            end

            # no ASCII-only domain found. Skip the entire URL
            next unless last_url

            # last_url only contains domain. Need to add path and query if they exist.
            if path
              # extend the last entity to cover the path/query and move its end index
              last_url[:url] = url.sub(domain, last_url[:url])
              last_url[:indices][1] = end_position
            end
          else
            # In the case of t.co URLs, don't allow additional path characters
            if url =~ Twitter::TwitterText::Regex[:valid_tco_url]
              next if $1 && $1.length > MAX_TCO_SLUG_LENGTH
              url = $&
              end_position = start_position + url.codepoint_length
            end

            next unless is_valid_domain(url.length, domain, protocol)

            urls << {
              :url => url,
              :indices => [start_position, end_position]
            }
          end
        end
        urls.each{|url| yield url[:url], url[:indices].first, url[:indices].last} if block_given?
        urls
      end

      # Extracts a list of all hashtags included in the Tweet <tt>text</tt>. If the
      # <tt>text</tt> is <tt>nil</tt> or contains no hashtags an empty array
      # will be returned. The array returned will not include the leading <tt>#</tt>
      # character.
      #
      # If a block is given then it will be called for each hashtag.
      def extract_hashtags(text, &block) # :yields: hashtag_text
        hashtags = extract_hashtags_with_indices(text).map{|h| h[:hashtag]}
        hashtags.each(&block) if block_given?
        hashtags
      end

      # Extracts a list of all hashtags included in the Tweet <tt>text</tt>. If the
      # <tt>text</tt> is <tt>nil</tt> or contains no hashtags an empty array
      # will be returned. The array returned will not include the leading <tt>#</tt>
      # character.
      #
      # If a block is given then it will be called for each hashtag.
      def extract_hashtags_with_indices(text, options = {:check_url_overlap => true}) # :yields: hashtag_text, start, end
        # quick reject: no ASCII or fullwidth # means no hashtags
        return [] unless text =~ /[#＃]/

        tags = []
        text.scan(Twitter::TwitterText::Regex[:valid_hashtag]) do |before, hash, hash_text|
          match_data = $~
          start_position = match_data.char_begin(2)
          end_position = match_data.char_end(3)
          after = $'
          unless after =~ Twitter::TwitterText::Regex[:end_hashtag_match]
            tags << {
              :hashtag => hash_text,
              :indices => [start_position, end_position]
            }
          end
        end

        if options[:check_url_overlap]
          # extract URLs
          urls = extract_urls_with_indices(text)
          unless urls.empty?
            tags.concat(urls)
            # remove duplicates
            tags = remove_overlapping_entities(tags)
            # remove URL entities
            tags.reject!{|entity| !entity[:hashtag] }
          end
        end

        tags.each{|tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last} if block_given?
        tags
      end

      # Extracts a list of all cashtags included in the Tweet <tt>text</tt>. If the
      # <tt>text</tt> is <tt>nil</tt> or contains no cashtags an empty array
      # will be returned. The array returned will not include the leading <tt>$</tt>
      # character.
      #
      # If a block is given then it will be called for each cashtag.
      def extract_cashtags(text, &block) # :yields: cashtag_text
        cashtags = extract_cashtags_with_indices(text).map{|h| h[:cashtag]}
        cashtags.each(&block) if block_given?
        cashtags
      end

      # Extracts a list of all cashtags included in the Tweet <tt>text</tt>. If the
      # <tt>text</tt> is <tt>nil</tt> or contains no cashtags an empty array
      # will be returned. The array returned will not include the leading <tt>$</tt>
      # character.
      #
      # If a block is given then it will be called for each cashtag.
      def extract_cashtags_with_indices(text) # :yields: cashtag_text, start, end
        return [] unless text =~ /\$/

        tags = []
        text.scan(Twitter::TwitterText::Regex[:valid_cashtag]) do |before, dollar, cash_text|
          match_data = $~
          start_position = match_data.char_begin(2)
          end_position = match_data.char_end(3)
          tags << {
            :cashtag => cash_text,
            :indices => [start_position, end_position]
          }
        end

        tags.each{|tag| yield tag[:cashtag], tag[:indices].first, tag[:indices].last} if block_given?
        tags
      end

      # Extracts all emoji in the Tweet <tt>text</tt> along with their indices.
      def extract_emoji_with_indices(text) # :yields: emoji, start, end
        emoji = []
        text.scan(Twitter::TwitterText::Regex[:valid_emoji]) do |emoji_text|
          match_data = $~
          start_position = match_data.char_begin(0)
          end_position = match_data.char_end(0)
          emoji << {
            :emoji => emoji_text,
            :indices => [start_position, end_position]
          }
        end
        emoji
      end

      # Returns true when <tt>text</tt> consists of exactly one emoji.
      def is_valid_emoji(text)
        begin
          raise ArgumentError.new("invalid empty emoji") unless text
          entities = extract_emoji_with_indices(text)
          entities.count == 1 && entities[0][:emoji] == text
        rescue StandardError
          # On error don't consider this a valid emoji.
          # (rescue StandardError, not Exception — never swallow SignalException/SystemExit.)
          return false
        end
      end

      # Returns true when the (IDN-encoded) URL stays within MAX_URL_LENGTH.
      # +url_length+ is the length of the full candidate URL, +domain+ its
      # domain part; when +protocol+ is nil the "https://" length is added.
      def is_valid_domain(url_length, domain, protocol)
        begin
          raise ArgumentError.new("invalid empty domain") unless domain
          original_domain_length = domain.length
          encoded_domain = SimpleIDN.to_ascii(domain)
          updated_domain_length = encoded_domain.length
          url_length += (updated_domain_length - original_domain_length) if (updated_domain_length > original_domain_length)
          url_length += URL_PROTOCOL_LENGTH unless protocol
          url_length <= MAX_URL_LENGTH
        rescue StandardError
          # On error don't consider this a valid domain.
          # (SimpleIDN conversion errors derive from StandardError; rescuing
          # Exception here would also swallow SignalException/SystemExit.)
          return false
        end
      end
    end
  end
end
|