twitter-text-simpleidn 3.0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,68 @@
1
+ # Copyright 2018 Twitter, Inc.
2
+ # Licensed under the Apache License, Version 2.0
3
+ # http://www.apache.org/licenses/LICENSE-2.0
4
+
5
+ # encoding: UTF-8
6
+
7
module Twitter
  module TwitterText
    # Tweet-parsing settings, normally loaded from one of the JSON files in the
    # project's config directory. Evaluating this class reads the v3 file and
    # stores the result in +Configuration.default_configuration+.
    class Configuration
      require 'json'

      PARSER_VERSION_CLASSIC = "v1"
      PARSER_VERSION_WEIGHTED = "v2"
      PARSER_VERSION_EMOJI_PARSING = "v3"

      PARSER_VERSION_DEFAULT = PARSER_VERSION_WEIGHTED

      # Directory holding the bundled JSON files, three levels above this file.
      config_directory = File.expand_path('../../../config', __FILE__)

      CONFIG_V1 = File.join(config_directory, "#{PARSER_VERSION_CLASSIC}.json")
      CONFIG_V2 = File.join(config_directory, "#{PARSER_VERSION_WEIGHTED}.json")
      CONFIG_V3 = File.join(config_directory, "#{PARSER_VERSION_EMOJI_PARSING}.json")

      # Values copied from the configuration hash; any key that is absent
      # leaves the matching reader returning nil.
      attr_reader :version, :max_weighted_tweet_length, :scale
      attr_reader :default_weight, :transformed_url_length, :ranges
      attr_reader :emoji_parsing_enabled

      class << self
        # Configuration used by callers that do not supply their own.
        attr_accessor :default_configuration

        # Parses a JSON document; keys always come back as symbols because
        # +symbolize_names+ is merged over the caller's +options+.
        def parse_string(string, options = {})
          parser_options = options.merge(symbolize_names: true)
          JSON.parse(string, parser_options)
        end

        # Reads +filename+ in binary mode and parses its contents as JSON.
        def parse_file(filename)
          parse_string(File.binread(filename))
        end

        # Builds a Configuration from the JSON stored in +filename+.
        # Returns nil when the parsed document is nil or false.
        def configuration_from_file(filename)
          parsed = parse_file(filename)
          new(parsed) if parsed
        end
      end

      # +config+ is a hash with symbol keys spelled as in the JSON files
      # (:version, :maxWeightedTweetLength, :scale, :defaultWeight,
      # :transformedURLLength, :emojiParsingEnabled, :ranges).
      def initialize(config = {})
        @version = config[:version]
        @max_weighted_tweet_length = config[:maxWeightedTweetLength]
        @scale = config[:scale]
        @default_weight = config[:defaultWeight]
        @transformed_url_length = config[:transformedURLLength]
        @emoji_parsing_enabled = config[:emojiParsingEnabled]

        # @ranges stays unset unless :ranges is present and holds an Array.
        return unless config.key?(:ranges)

        raw_ranges = config[:ranges]
        return unless raw_ranges.is_a?(Array)

        @ranges = raw_ranges.map { |range| Twitter::TwitterText::WeightedRange.new(range) }
      end

      self.default_configuration = configuration_from_file(CONFIG_V3)
    end
  end
end
@@ -0,0 +1,21 @@
1
+ # Copyright 2018 Twitter, Inc.
2
+ # Licensed under the Apache License, Version 2.0
3
+ # http://www.apache.org/licenses/LICENSE-2.0
4
+
5
module Twitter
  module TwitterText
    # Class-level helper, meant to be mixed in with +extend+, that turns an
    # existing instance method into a wrapper printing a deprecation warning
    # before delegating to the original implementation.
    module Deprecation
      # Marks +method+ as deprecated.
      #
      # The original implementation is kept under <tt>deprecated_<method></tt>
      # and +method+ is redefined to warn (unless the global <tt>$TESTING</tt>
      # is truthy) and then call it with the same arguments and block.
      #
      # method::     name of the instance method to deprecate
      # new_method:: optional name of the replacement, mentioned in the warning
      #
      # Returns whatever +define_method+ returns for the wrapper.
      def deprecate(method, new_method = nil)
        deprecated_method = :"deprecated_#{method}"
        # Built with += so no string literal is mutated (safe under
        # frozen string literals).
        message = "Deprecation: `#{method}` is deprecated."
        message += " Please use `#{new_method}` instead." if new_method

        # define_method always produces a public method, so remember the
        # original visibility and restore it afterwards.
        visibility =
          if private_method_defined?(method)
            :private
          elsif protected_method_defined?(method)
            :protected
          else
            :public
          end

        alias_method(deprecated_method, method)
        wrapper = define_method(method) do |*args, &block|
          warn message unless $TESTING
          # __send__ cannot be shadowed by a host class that defines #send.
          __send__(deprecated_method, *args, &block)
        end

        # Keep keyword arguments flowing through the *args splat on Rubies
        # that separate keywords from positional arguments.
        ruby2_keywords(method) if respond_to?(:ruby2_keywords, true)
        send(visibility, method) unless visibility == :public
        wrapper
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
+ #
2
+ # emoji_regex.rb
3
+ #
4
+ # Copyright © 2018 Twitter. All rights reserved.
5
+ #
6
+ # DO NOT MODIFY THIS FILE -- it is generated for twitter-text automatically
7
+
8
+ # encoding: utf-8
9
+
10
+ module Twitter
11
+ module TwitterText
12
+ class Regex
13
+ class Emoji
14
+ REGEXEN = {} # :nodoc:
15
+
16
+ # This regex pattern matches a single emoji
17
+ REGEXEN[:valid_emoji] = %r{
18
+ [\u{01f468}\u{01f469}][\u{01f3fb}-\u{01f3ff}]?\u200d(?:\u2695\ufe0f|\u2696\ufe0f|\u2708\ufe0f|[\u{01f33e}\u{01f373}\u{01f393}\u{01f3a4}\u{01f3a8}\u{01f3eb}\u{01f3ed}\u{01f4bb}\u{01f4bc}\u{01f527}\u{01f52c}\u{01f680}\u{01f692}\u{01f9b0}-\u{01f9b3}])|[\u26f9\u{01f3cb}\u{01f3cc}\u{01f574}\u{01f575}](?:[\ufe0f\u{01f3fb}-\u{01f3ff}]\u200d[\u2640\u2642]\ufe0f)|[\u{01f3c3}\u{01f3c4}\u{01f3ca}\u{01f46e}\u{01f471}\u{01f473}\u{01f477}\u{01f481}\u{01f482}\u{01f486}\u{01f487}\u{01f645}-\u{01f647}\u{01f64b}\u{01f64d}\u{01f64e}\u{01f6a3}\u{01f6b4}-\u{01f6b6}\u{01f926}\u{01f935}\u{01f937}-\u{01f939}\u{01f93d}\u{01f93e}\u{01f9b8}\u{01f9b9}\u{01f9d6}-\u{01f9dd}][\u{01f3fb}-\u{01f3ff}]?\u200d[\u2640\u2642]\ufe0f|(?:\u{01f468}\u200d\u2764\ufe0f\u200d\u{01f48b}\u200d\u{01f468}|\u{01f469}\u200d\u2764\ufe0f\u200d\u{01f48b}\u200d[\u{01f468}\u{01f469}]|\u{01f468}\u200d\u{01f468}\u200d\u{01f466}\u200d\u{01f466}|\u{01f468}\u200d\u{01f468}\u200d\u{01f467}\u200d[\u{01f466}\u{01f467}]|\u{01f468}\u200d\u{01f469}\u200d\u{01f466}\u200d\u{01f466}|\u{01f468}\u200d\u{01f469}\u200d\u{01f467}\u200d[\u{01f466}\u{01f467}]|\u{01f469}\u200d\u{01f469}\u200d\u{01f466}\u200d\u{01f466}|\u{01f469}\u200d\u{01f469}\u200d\u{01f467}\u200d[\u{01f466}\u{01f467}]|\u{01f468}\u200d\u2764\ufe0f\u200d\u{01f468}|\u{01f469}\u200d\u2764\ufe0f\u200d[\u{01f468}\u{01f469}]|\u{01f468}\u200d\u{01f466}\u200d\u{01f466}|\u{01f468}\u200d\u{01f467}\u200d[\u{01f466}\u{01f467}]|\u{01f468}\u200d\u{01f468}\u200d[\u{01f466}\u{01f467}]|\u{01f468}\u200d\u{01f469}\u200d[\u{01f466}\u{01f467}]|\u{01f469}\u200d\u{01f466}\u200d\u{01f466}|\u{01f469}\u200d\u{01f467}\u200d[\u{01f466}\u{01f467}]|\u{01f469}\u200d\u{01f469}\u200d[\u{01f466}\u{01f467}]|\u{01f3f3}\ufe0f\u200d\u{01f308}|\u{01f3f4}\u200d\u2620\ufe0f|\u{01f46f}\u200d\u2640\ufe0f|\u{01f46f}\u200d\u2642\ufe0f|\u{01f93c}\u200d\u2640\ufe0f|\u{01f93c}\u200d\u2642\ufe0f|\u{01f9de}\u200d\u2640\ufe0f|\u{01f9de}\u200d\u2642\ufe0f|\u{01f9df}\u200d\u2640\ufe0f|\u{01f9df}\u200d\u2642\ufe0f|\u{01f441}
\u200d\u{01f5e8}|\u{01f468}\u200d[\u{01f466}\u{01f467}]|\u{01f469}\u200d[\u{01f466}\u{01f467}])|[#*0-9]\ufe0f?\u20e3|(?:[©®\u2122\u265f]\ufe0f)|[\u203c\u2049\u2139\u2194-\u2199\u21a9\u21aa\u231a\u231b\u2328\u23cf\u23ed-\u23ef\u23f1\u23f2\u23f8-\u23fa\u24c2\u25aa\u25ab\u25b6\u25c0\u25fb-\u25fe\u2600-\u2604\u260e\u2611\u2614\u2615\u2618\u2620\u2622\u2623\u2626\u262a\u262e\u262f\u2638-\u263a\u2640\u2642\u2648-\u2653\u2660\u2663\u2665\u2666\u2668\u267b\u267f\u2692-\u2697\u2699\u269b\u269c\u26a0\u26a1\u26aa\u26ab\u26b0\u26b1\u26bd\u26be\u26c4\u26c5\u26c8\u26cf\u26d1\u26d3\u26d4\u26e9\u26ea\u26f0-\u26f5\u26f8\u26fa\u26fd\u2702\u2708\u2709\u270f\u2712\u2714\u2716\u271d\u2721\u2733\u2734\u2744\u2747\u2757\u2763\u2764\u27a1\u2934\u2935\u2b05-\u2b07\u2b1b\u2b1c\u2b50\u2b55\u3030\u303d\u3297\u3299\u{01f004}\u{01f170}\u{01f171}\u{01f17e}\u{01f17f}\u{01f202}\u{01f21a}\u{01f22f}\u{01f237}\u{01f321}\u{01f324}-\u{01f32c}\u{01f336}\u{01f37d}\u{01f396}\u{01f397}\u{01f399}-\u{01f39b}\u{01f39e}\u{01f39f}\u{01f3cd}\u{01f3ce}\u{01f3d4}-\u{01f3df}\u{01f3f3}\u{01f3f5}\u{01f3f7}\u{01f43f}\u{01f441}\u{01f4fd}\u{01f549}\u{01f54a}\u{01f56f}\u{01f570}\u{01f573}\u{01f576}-\u{01f579}\u{01f587}\u{01f58a}-\u{01f58d}\u{01f5a5}\u{01f5a8}\u{01f5b1}\u{01f5b2}\u{01f5bc}\u{01f5c2}-\u{01f5c4}\u{01f5d1}-\u{01f5d3}\u{01f5dc}-\u{01f5de}\u{01f5e1}\u{01f5e3}\u{01f5e8}\u{01f5ef}\u{01f5f3}\u{01f5fa}\u{01f6cb}\u{01f6cd}-\u{01f6cf}\u{01f6e0}-\u{01f6e5}\u{01f6e9}\u{01f6f0}\u{01f6f3}](?:\ufe0f|(?!\ufe0e))|(?:[\u261d\u26f7\u26f9\u270c\u270d\u{01f3cb}\u{01f3cc}\u{01f574}\u{01f575}\u{01f590}](?:\ufe0f|(?!\ufe0e))|[\u270a\u270b\u{01f385}\u{01f3c2}-\u{01f3c4}\u{01f3c7}\u{01f3ca}\u{01f442}\u{01f443}\u{01f446}-\u{01f450}\u{01f466}-\u{01f469}\u{01f46e}\u{01f470}-\u{01f478}\u{01f47c}\u{01f481}-\u{01f483}\u{01f485}-\u{01f487}\u{01f4aa}\u{01f57a}\u{01f595}\u{01f596}\u{01f645}-\u{01f647}\u{01f64b}-\u{01f64f}\u{01f6a3}\u{01f6b4}-\u{01f6b6}\u{01f6c0}\u{01f6cc}\u{01f918}-\u{01f91c}\u{01f91e}\u{01f91f}\u{01f926}\u{01f930}-\u{01f939
}\u{01f93d}\u{01f93e}\u{01f9b5}\u{01f9b6}\u{01f9b8}\u{01f9b9}\u{01f9d1}-\u{01f9dd}])[\u{01f3fb}-\u{01f3ff}]?|(?:\u{01f3f4}\u{0e0067}\u{0e0062}\u{0e0065}\u{0e006e}\u{0e0067}\u{0e007f}|\u{01f3f4}\u{0e0067}\u{0e0062}\u{0e0073}\u{0e0063}\u{0e0074}\u{0e007f}|\u{01f3f4}\u{0e0067}\u{0e0062}\u{0e0077}\u{0e006c}\u{0e0073}\u{0e007f}|\u{01f1e6}[\u{01f1e8}-\u{01f1ec}\u{01f1ee}\u{01f1f1}\u{01f1f2}\u{01f1f4}\u{01f1f6}-\u{01f1fa}\u{01f1fc}\u{01f1fd}\u{01f1ff}]|\u{01f1e7}[\u{01f1e6}\u{01f1e7}\u{01f1e9}-\u{01f1ef}\u{01f1f1}-\u{01f1f4}\u{01f1f6}-\u{01f1f9}\u{01f1fb}\u{01f1fc}\u{01f1fe}\u{01f1ff}]|\u{01f1e8}[\u{01f1e6}\u{01f1e8}\u{01f1e9}\u{01f1eb}-\u{01f1ee}\u{01f1f0}-\u{01f1f5}\u{01f1f7}\u{01f1fa}-\u{01f1ff}]|\u{01f1e9}[\u{01f1ea}\u{01f1ec}\u{01f1ef}\u{01f1f0}\u{01f1f2}\u{01f1f4}\u{01f1ff}]|\u{01f1ea}[\u{01f1e6}\u{01f1e8}\u{01f1ea}\u{01f1ec}\u{01f1ed}\u{01f1f7}-\u{01f1fa}]|\u{01f1eb}[\u{01f1ee}-\u{01f1f0}\u{01f1f2}\u{01f1f4}\u{01f1f7}]|\u{01f1ec}[\u{01f1e6}\u{01f1e7}\u{01f1e9}-\u{01f1ee}\u{01f1f1}-\u{01f1f3}\u{01f1f5}-\u{01f1fa}\u{01f1fc}\u{01f1fe}]|\u{01f1ed}[\u{01f1f0}\u{01f1f2}\u{01f1f3}\u{01f1f7}\u{01f1f9}\u{01f1fa}]|\u{01f1ee}[\u{01f1e8}-\u{01f1ea}\u{01f1f1}-\u{01f1f4}\u{01f1f6}-\u{01f1f9}]|\u{01f1ef}[\u{01f1ea}\u{01f1f2}\u{01f1f4}\u{01f1f5}]|\u{01f1f0}[\u{01f1ea}\u{01f1ec}-\u{01f1ee}\u{01f1f2}\u{01f1f3}\u{01f1f5}\u{01f1f7}\u{01f1fc}\u{01f1fe}\u{01f1ff}]|\u{01f1f1}[\u{01f1e6}-\u{01f1e8}\u{01f1ee}\u{01f1f0}\u{01f1f7}-\u{01f1fb}\u{01f1fe}]|\u{01f1f2}[\u{01f1e6}\u{01f1e8}-\u{01f1ed}\u{01f1f0}-\u{01f1ff}]|\u{01f1f3}[\u{01f1e6}\u{01f1e8}\u{01f1ea}-\u{01f1ec}\u{01f1ee}\u{01f1f1}\u{01f1f4}\u{01f1f5}\u{01f1f7}\u{01f1fa}\u{01f1ff}]|\u{01f1f4}\u{01f1f2}|\u{01f1f5}[\u{01f1e6}\u{01f1ea}-\u{01f1ed}\u{01f1f0}-\u{01f1f3}\u{01f1f7}-\u{01f1f9}\u{01f1fc}\u{01f1fe}]|\u{01f1f6}\u{01f1e6}|\u{01f1f7}[\u{01f1ea}\u{01f1f4}\u{01f1f8}\u{01f1fa}\u{01f1fc}]|\u{01f1f8}[\u{01f1e6}-\u{01f1ea}\u{01f1ec}-\u{01f1f4}\u{01f1f7}-\u{01f1f9}\u{01f1fb}\u{01f1fd}-\u{01f1ff}]|\u{01f1f9}[\u{01f1e6}\u{01f1e8}\u{01f1e9}\u
{01f1eb}-\u{01f1ed}\u{01f1ef}-\u{01f1f4}\u{01f1f7}\u{01f1f9}\u{01f1fb}\u{01f1fc}\u{01f1ff}]|\u{01f1fa}[\u{01f1e6}\u{01f1ec}\u{01f1f2}\u{01f1f3}\u{01f1f8}\u{01f1fe}\u{01f1ff}]|\u{01f1fb}[\u{01f1e6}\u{01f1e8}\u{01f1ea}\u{01f1ec}\u{01f1ee}\u{01f1f3}\u{01f1fa}]|\u{01f1fc}[\u{01f1eb}\u{01f1f8}]|\u{01f1fd}\u{01f1f0}|\u{01f1fe}[\u{01f1ea}\u{01f1f9}]|\u{01f1ff}[\u{01f1e6}\u{01f1f2}\u{01f1fc}]|[\u23e9-\u23ec\u23f0\u23f3\u267e\u26ce\u2705\u2728\u274c\u274e\u2753-\u2755\u2795-\u2797\u27b0\u27bf\ue50a\u{01f0cf}\u{01f18e}\u{01f191}-\u{01f19a}\u{01f1e6}-\u{01f1ff}\u{01f201}\u{01f232}-\u{01f236}\u{01f238}-\u{01f23a}\u{01f250}\u{01f251}\u{01f300}-\u{01f320}\u{01f32d}-\u{01f335}\u{01f337}-\u{01f37c}\u{01f37e}-\u{01f384}\u{01f386}-\u{01f393}\u{01f3a0}-\u{01f3c1}\u{01f3c5}\u{01f3c6}\u{01f3c8}\u{01f3c9}\u{01f3cf}-\u{01f3d3}\u{01f3e0}-\u{01f3f0}\u{01f3f4}\u{01f3f8}-\u{01f43e}\u{01f440}\u{01f444}\u{01f445}\u{01f451}-\u{01f465}\u{01f46a}-\u{01f46d}\u{01f46f}\u{01f479}-\u{01f47b}\u{01f47d}-\u{01f480}\u{01f484}\u{01f488}-\u{01f4a9}\u{01f4ab}-\u{01f4fc}\u{01f4ff}-\u{01f53d}\u{01f54b}-\u{01f54e}\u{01f550}-\u{01f567}\u{01f5a4}\u{01f5fb}-\u{01f644}\u{01f648}-\u{01f64a}\u{01f680}-\u{01f6a2}\u{01f6a4}-\u{01f6b3}\u{01f6b7}-\u{01f6bf}\u{01f6c1}-\u{01f6c5}\u{01f6d0}-\u{01f6d2}\u{01f6eb}\u{01f6ec}\u{01f6f4}-\u{01f6f9}\u{01f910}-\u{01f917}\u{01f91d}\u{01f920}-\u{01f925}\u{01f927}-\u{01f92f}\u{01f93a}\u{01f93c}\u{01f940}-\u{01f945}\u{01f947}-\u{01f970}\u{01f973}-\u{01f976}\u{01f97a}\u{01f97c}-\u{01f9a2}\u{01f9b4}\u{01f9b7}\u{01f9c0}-\u{01f9c2}\u{01f9d0}\u{01f9de}-\u{01f9ff}])|\ufe0f
19
+ }iox
20
+
21
+ def self.[](key)
22
+ REGEXEN[key]
23
+ end
24
+ end
25
+ end
26
+ end
27
+ end
@@ -0,0 +1,388 @@
1
+ # Copyright 2018 Twitter, Inc.
2
+ # Licensed under the Apache License, Version 2.0
3
+ # http://www.apache.org/licenses/LICENSE-2.0
4
+
5
+ # encoding: utf-8
6
+ require 'simpleidn'
7
+
8
class String
  # Number of characters (code points) in the string.
  #
  # When the string responds to +codepoints+, +length+ is used directly;
  # otherwise the characters are counted through +chars+.
  def codepoint_length
    if respond_to? :codepoints
      length
    else
      chars.kind_of?(Enumerable) ? chars.to_a.size : chars.size
    end
  end

  # Returns the string split into an array of one-character strings.
  #
  # The result is computed on every call. Caching it in an instance variable
  # raised FrozenError on frozen strings and returned a stale array once the
  # string had been mutated.
  def to_codepoint_a
    characters = chars
    return characters.to_a if characters.kind_of?(Enumerable)

    # Fallback for +chars+ implementations that are not Enumerable.
    codepoint_array = []
    0.upto(codepoint_length - 1) { |i| codepoint_array << [characters.slice(i)].pack('U') }
    codepoint_array
  end
end
31
+
32
# Code point based counterparts of MatchData#begin and MatchData#end.
class MatchData
  # Start offset of group +n+, counted in code points.
  def char_begin(n)
    return self.begin(n) if string.respond_to?(:codepoints)

    # Strings without +codepoints+: measure the prefix before the match.
    string[0, self.begin(n)].codepoint_length
  end

  # End offset of group +n+, counted in code points.
  def char_end(n)
    return self.end(n) if string.respond_to?(:codepoints)

    # Strings without +codepoints+: measure the prefix up to the match end.
    string[0, self.end(n)].codepoint_length
  end
end
50
+
51
+ module Twitter
52
+ module TwitterText
53
+ # A module for including Tweet parsing in a class. This module provides functions for the extraction and processing
54
+ # of usernames, lists, URLs and hashtags.
55
+ module Extractor extend self
56
+
57
+ # Maximum URL length as defined by Twitter's backend.
58
+ MAX_URL_LENGTH = 4096
59
+
60
+ # The maximum t.co path length that the Twitter backend supports.
61
+ MAX_TCO_SLUG_LENGTH = 40
62
+
63
+ URL_PROTOCOL_LENGTH = "https://".length
64
+
65
# Removes overlapping entities.
#
# Entities are ordered by their start index; an entity is dropped when it
# starts before the end index of the most recently kept entity. A new array
# is returned and the argument is left untouched.
def remove_overlapping_entities(entities)
  ordered = entities.sort_by { |entity| entity[:indices].first }

  ordered.each_with_object([]) do |entity, kept|
    previous = kept.last
    next if previous && previous[:indices].last > entity[:indices].first

    kept << entity
  end
end
76
+
77
# Extracts all usernames, lists, hashtags, cashtags and URLs in the Tweet
# <tt>text</tt>, along with the indices where each entity occurred. Emoji are
# included as well when the active configuration has emoji parsing enabled.
# An empty array is returned when nothing is found.
#
# The configuration is <tt>options[:config]</tt> when given, otherwise
# <tt>Configuration.default_configuration</tt>. <tt>options</tt> is also
# passed on to the URL extractor.
#
# If a block is given then it will be called for each entity.
def extract_entities_with_indices(text, options = {}, &block)
  config = options[:config] || Twitter::TwitterText::Configuration.default_configuration

  # Hashtags are extracted without their own URL check because overlaps are
  # resolved once for all entity kinds below.
  found = [
    extract_urls_with_indices(text, options),
    extract_hashtags_with_indices(text, :check_url_overlap => false),
    extract_mentions_or_lists_with_indices(text),
    extract_cashtags_with_indices(text)
  ].flatten(1)
  found += extract_emoji_with_indices(text) if config.emoji_parsing_enabled

  return [] if found.empty?

  remove_overlapping_entities(found).tap do |entities|
    entities.each(&block) if block_given?
  end
end
100
+
101
# Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>. If the
# <tt>text</tt> is <tt>nil</tt> or contains no username mentions an empty array
# will be returned.
#
# If a block is given then it will be called for each username.
def extract_mentioned_screen_names(text, &block) # :yields: username
  mentions = extract_mentioned_screen_names_with_indices(text)

  mentions.map { |mention| mention[:screen_name] }.tap do |names|
    names.each(&block) if block_given?
  end
end
111
+
112
# Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>
# along with the indices for where the mention occurred. If the
# <tt>text</tt> is nil or contains no username mentions, an empty array
# will be returned. List mentions (entries with a non-empty list slug) are
# left out.
#
# If a block is given, then it will be called with each username, the start
# index, and the end index in the <tt>text</tt>.
def extract_mentioned_screen_names_with_indices(text) # :yields: username, start, end
  return [] unless text

  user_mentions = extract_mentions_or_lists_with_indices(text)
    .select { |entry| entry[:list_slug].empty? }
    .map do |entry|
      first_index, last_index = entry[:indices].first, entry[:indices].last
      { :screen_name => entry[:screen_name], :indices => [first_index, last_index] }
    end

  if block_given?
    user_mentions.each do |mention|
      yield mention[:screen_name], mention[:indices].first, mention[:indices].last
    end
  end

  user_mentions
end
139
+
140
# Extracts a list of all usernames or lists mentioned in the Tweet <tt>text</tt>
# along with the indices for where the mention occurred. If the
# <tt>text</tt> is nil or contains no username or list mentions, an empty array
# will be returned.
#
# Each entry is a hash with <tt>:screen_name</tt>, <tt>:list_slug</tt> (an
# empty string for a plain username mention) and <tt>:indices</tt>
# (<tt>[start, end]</tt>, where start points at the at-sign).
#
# If a block is given, then it will be called with each username, list slug, the start
# index, and the end index in the <tt>text</tt>.
def extract_mentions_or_lists_with_indices(text) # :yields: username, list_slug, start, end
  # Cheap pre-check before running the full mention regex. The character
  # class used to contain the ASCII "@" twice, so text whose only at-sign is
  # the fullwidth form (U+FF20) was rejected here.
  # NOTE(review): assumes Regex[:valid_mention_or_list] accepts U+FF20 too - confirm.
  return [] unless text =~ /[@\uFF20]/

  possible_entries = []
  text.to_s.scan(Twitter::TwitterText::Regex[:valid_mention_or_list]) do |_before, _at, screen_name, list_slug|
    # Capture the match before the next regex test overwrites it.
    match_data = Regexp.last_match
    # A mention directly followed by text matching end_mention_match is not a mention.
    next if match_data.post_match =~ Twitter::TwitterText::Regex[:end_mention_match]

    # Group 3 is the screen name; step back one character to include the at-sign.
    start_position = match_data.char_begin(3) - 1
    end_position = match_data.char_end(list_slug.nil? ? 3 : 4)
    possible_entries << {
      :screen_name => screen_name,
      :list_slug => list_slug || "",
      :indices => [start_position, end_position]
    }
  end

  if block_given?
    possible_entries.each do |mention|
      yield mention[:screen_name], mention[:list_slug], mention[:indices].first, mention[:indices].last
    end
  end

  possible_entries
end
174
+
175
# Extracts the username replied to in the Tweet <tt>text</tt>. If the
# <tt>text</tt> is <tt>nil</tt> or is not a reply, nil will be returned.
#
# If a block is given then it will be called with the username replied to (if any)
def extract_reply_screen_name(text) # :yields: username
  return nil unless text

  reply_match = text.match(Twitter::TwitterText::Regex[:valid_reply])
  return unless reply_match.respond_to?(:captures)
  # $' is the text following the reply match; when it matches
  # end_mention_match the text is not treated as a reply.
  return if $' =~ Twitter::TwitterText::Regex[:end_mention_match]

  reply_match.captures.first.tap do |screen_name|
    yield screen_name if block_given?
  end
end
189
+
190
# Extracts a list of all URLs included in the Tweet <tt>text</tt>. If the
# <tt>text</tt> is <tt>nil</tt> or contains no URLs an empty array
# will be returned.
#
# If a block is given then it will be called for each URL.
def extract_urls(text, &block) # :yields: url
  extract_urls_with_indices(text).map { |entity| entity[:url] }.tap do |urls|
    urls.each(&block) if block_given?
  end
end
200
+
201
# Extracts a list of all URLs included in the Tweet <tt>text</tt> along
# with the indices. If the <tt>text</tt> is <tt>nil</tt> or contains no
# URLs an empty array will be returned.
#
# URLs without a protocol are only extracted when
# <tt>options[:extract_url_without_protocol]</tt> is truthy. That is the
# default, but an options hash passed without the key turns it off.
#
# If a block is given then it will be called for each URL.
def extract_urls_with_indices(text, options = {:extract_url_without_protocol => true}) # :yields: url, start, end
  allow_bare_domains = options[:extract_url_without_protocol]
  # Cheap pre-check: a bare domain needs a ".", a URL with protocol needs a ":".
  return [] unless text && (allow_bare_domains ? text.index(".") : text.index(":"))

  found = []

  text.to_s.scan(Twitter::TwitterText::Regex[:valid_url]) do |_all, before, url, protocol, domain, _port, path, _query|
    url_match = Regexp.last_match

    start_position = url_match.char_begin(3)
    end_position = url_match.char_end(3)

    if protocol
      # In the case of t.co URLs, don't allow additional path characters
      if url =~ Twitter::TwitterText::Regex[:valid_tco_url]
        slug = Regexp.last_match(1)
        next if slug && slug.length > MAX_TCO_SLUG_LENGTH

        url = Regexp.last_match(0)
        end_position = start_position + url.codepoint_length
      end

      next unless is_valid_domain(url.length, domain, protocol)

      found << {
        :url => url,
        :indices => [start_position, end_position]
      }
    else
      # If protocol is missing and domain contains non-ASCII characters,
      # extract ASCII-only domains.
      next if !allow_bare_domains || before =~ Twitter::TwitterText::Regex[:invalid_url_without_protocol_preceding_chars]

      last_url = nil
      domain.scan(Twitter::TwitterText::Regex[:valid_ascii_domain]) do |ascii_domain|
        next unless is_valid_domain(url.length, ascii_domain, protocol)

        domain_match = Regexp.last_match
        last_url = {
          :url => ascii_domain,
          :indices => [start_position + domain_match.char_begin(0),
                       start_position + domain_match.char_end(0)]
        }
        found << last_url
      end

      # no ASCII-only domain found. Skip the entire URL
      next unless last_url

      # last_url only contains the domain. Extend the entry already stored in
      # the result with the path and query if they exist.
      if path
        last_url[:url] = url.sub(domain, last_url[:url])
        last_url[:indices][1] = end_position
      end
    end
  end

  if block_given?
    found.each { |entry| yield entry[:url], entry[:indices].first, entry[:indices].last }
  end
  found
end
259
+
260
# Extracts a list of all hashtags included in the Tweet <tt>text</tt>. If the
# <tt>text</tt> is <tt>nil</tt> or contains no hashtags an empty array
# will be returned. The array returned will not include the leading <tt>#</tt>
# character.
#
# If a block is given then it will be called for each hashtag.
def extract_hashtags(text, &block) # :yields: hashtag_text
  extract_hashtags_with_indices(text).map { |entity| entity[:hashtag] }.tap do |hashtags|
    hashtags.each(&block) if block_given?
  end
end
271
+
272
# Extracts a list of all hashtags included in the Tweet <tt>text</tt> along
# with the indices. If the <tt>text</tt> is <tt>nil</tt> or contains no
# hashtags an empty array will be returned. The <tt>:hashtag</tt> value of
# each entry does not include the leading <tt>#</tt> character, while the
# start index does point at it.
#
# When <tt>options[:check_url_overlap]</tt> is truthy (the default), hashtags
# that overlap an extracted URL are removed.
#
# If a block is given then it will be called for each hashtag.
def extract_hashtags_with_indices(text, options = {:check_url_overlap => true}) # :yields: hashtag_text, start, end
  # Cheap pre-check before running the full hashtag regex. The character
  # class used to contain the ASCII "#" twice, so text whose only hash sign
  # is the fullwidth form (U+FF03) was rejected here.
  # NOTE(review): assumes Regex[:valid_hashtag] accepts U+FF03 too - confirm.
  return [] unless text =~ /[#\uFF03]/

  tags = []
  text.scan(Twitter::TwitterText::Regex[:valid_hashtag]) do |_before, _hash, hash_text|
    # Capture the match before the next regex test overwrites it.
    match_data = Regexp.last_match
    # A hashtag directly followed by text matching end_hashtag_match is skipped.
    next if match_data.post_match =~ Twitter::TwitterText::Regex[:end_hashtag_match]

    # Group 2 is the hash sign, group 3 the hashtag text.
    tags << {
      :hashtag => hash_text,
      :indices => [match_data.char_begin(2), match_data.char_end(3)]
    }
  end

  if options[:check_url_overlap]
    urls = extract_urls_with_indices(text)
    unless urls.empty?
      # Resolve overlaps across hashtags and URLs together, then keep only
      # the hashtag entities.
      tags = remove_overlapping_entities(tags + urls).select { |entity| entity[:hashtag] }
    end
  end

  tags.each { |tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last } if block_given?
  tags
end
310
+
311
# Extracts a list of all cashtags included in the Tweet <tt>text</tt>. If the
# <tt>text</tt> is <tt>nil</tt> or contains no cashtags an empty array
# will be returned. The array returned will not include the leading <tt>$</tt>
# character.
#
# If a block is given then it will be called for each cashtag.
def extract_cashtags(text, &block) # :yields: cashtag_text
  extract_cashtags_with_indices(text).map { |entity| entity[:cashtag] }.tap do |cashtags|
    cashtags.each(&block) if block_given?
  end
end
322
+
323
# Extracts a list of all cashtags included in the Tweet <tt>text</tt> along
# with the indices. If the <tt>text</tt> is <tt>nil</tt> or contains no
# cashtags an empty array will be returned. The <tt>:cashtag</tt> value of
# each entry does not include the leading <tt>$</tt> character, while the
# start index does point at it.
#
# If a block is given then it will be called for each cashtag.
def extract_cashtags_with_indices(text) # :yields: cashtag_text, start, end
  # Cheap pre-check before running the full cashtag regex.
  return [] unless text =~ /\$/

  cashtags = []
  text.scan(Twitter::TwitterText::Regex[:valid_cashtag]) do |_before, _dollar, cash_text|
    hit = Regexp.last_match
    # Group 2 is the dollar sign, group 3 the cashtag text.
    cashtags << {
      :cashtag => cash_text,
      :indices => [hit.char_begin(2), hit.char_end(3)]
    }
  end

  if block_given?
    cashtags.each { |entry| yield entry[:cashtag], entry[:indices].first, entry[:indices].last }
  end
  cashtags
end
346
+
347
# Extracts a list of all emoji included in the Tweet <tt>text</tt> along with
# the indices. If the <tt>text</tt> is <tt>nil</tt> or contains no emoji an
# empty array will be returned, matching the other extractors.
#
# Each entry is a hash with <tt>:emoji</tt> (the value yielded by the scan
# over <tt>Regex[:valid_emoji]</tt>) and <tt>:indices</tt> (<tt>[start, end]</tt>
# of the whole match).
#
# If a block is given then it will be called with each emoji, the start
# index, and the end index.
def extract_emoji_with_indices(text) # :yields: emoji, start, end
  # Without this guard a nil text raised NoMethodError on +scan+.
  return [] unless text

  emoji = []
  text.scan(Twitter::TwitterText::Regex[:valid_emoji]) do |emoji_text|
    match_data = Regexp.last_match
    emoji << {
      :emoji => emoji_text,
      :indices => [match_data.char_begin(0), match_data.char_end(0)]
    }
  end

  if block_given?
    emoji.each { |entry| yield entry[:emoji], entry[:indices].first, entry[:indices].last }
  end
  emoji
end
360
+
361
# Returns true when <tt>text</tt> consists of exactly one emoji, i.e. emoji
# extraction finds a single entity whose <tt>:emoji</tt> value equals
# <tt>text</tt>. Returns false for <tt>nil</tt> and whenever extraction raises
# a StandardError.
def is_valid_emoji(text)
  return false unless text

  entities = extract_emoji_with_indices(text)
  entities.count == 1 && entities[0][:emoji] == text
rescue StandardError
  # On error don't consider this a valid emoji. Only StandardError is
  # rescued so that Interrupt, SystemExit and similar are not swallowed.
  false
end
371
+
372
# Checks that a URL stays within MAX_URL_LENGTH once its domain has been
# converted to ASCII with SimpleIDN.
#
# url_length:: length of the URL as written
# domain::     domain part of the URL; nil yields false
# protocol::   protocol part of the URL; when it is missing,
#              URL_PROTOCOL_LENGTH is added to the length
#
# Returns true or false. Any StandardError raised during the conversion
# yields false.
def is_valid_domain(url_length, domain, protocol)
  return false unless domain

  # Only growth counts: a conversion that shortens the domain is ignored.
  growth = SimpleIDN.to_ascii(domain).length - domain.length
  url_length += growth if growth > 0
  url_length += URL_PROTOCOL_LENGTH unless protocol
  url_length <= MAX_URL_LENGTH
rescue StandardError
  # On error don't consider this a valid domain. Only StandardError is
  # rescued so that Interrupt, SystemExit and similar are not swallowed.
  false
end
386
+ end
387
+ end
388
+ end