vader_sentiment_ruby 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +51 -0
- data/lib/vader_sentiment_ruby.rb +20 -0
- data/lib/vader_sentiment_ruby/checker.rb +13 -0
- data/lib/vader_sentiment_ruby/checker/but_word_negation_checker.rb +34 -0
- data/lib/vader_sentiment_ruby/checker/least_word_negation_checker.rb +38 -0
- data/lib/vader_sentiment_ruby/checker/negation_checker.rb +114 -0
- data/lib/vader_sentiment_ruby/checker/no_word_checker.rb +49 -0
- data/lib/vader_sentiment_ruby/checker/previous_words_influence_checker.rb +55 -0
- data/lib/vader_sentiment_ruby/checker/sentiment_laden_idioms_checker.rb +30 -0
- data/lib/vader_sentiment_ruby/checker/special_idioms_checker.rb +107 -0
- data/lib/vader_sentiment_ruby/constants.rb +135 -0
- data/lib/vader_sentiment_ruby/data/emoji_utf8_lexicon.txt +3570 -0
- data/lib/vader_sentiment_ruby/data/vader_lexicon.txt +7518 -0
- data/lib/vader_sentiment_ruby/emojis_describer.rb +39 -0
- data/lib/vader_sentiment_ruby/emojis_dictionary_creator.rb +21 -0
- data/lib/vader_sentiment_ruby/lexicon_dictionary_creator.rb +21 -0
- data/lib/vader_sentiment_ruby/punctuation_emphasis_amplifier.rb +36 -0
- data/lib/vader_sentiment_ruby/sentiment_intensity_analyzer.rb +105 -0
- data/lib/vader_sentiment_ruby/sentiment_properties_identifier.rb +48 -0
- data/lib/vader_sentiment_ruby/sentiment_scores_sifter.rb +27 -0
- data/lib/vader_sentiment_ruby/valence_score_calculator.rb +82 -0
- data/lib/vader_sentiment_ruby/version.rb +5 -0
- data/lib/vader_sentiment_ruby/word_helper.rb +93 -0
- metadata +156 -0
@@ -0,0 +1,39 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module VaderSentimentRuby
|
4
|
+
# Replaces emoji chars with their descriptions.
#
# A single space is inserted before a description unless the previous
# character already was a space (or the emoji is the very first character).
# Nothing is inserted after the description.
class EmojiDescriber
  # @param [String] text Text that may contain emoji characters
  # @param [Hash] emojis Mapping of emoji character => description
  def initialize(text, emojis)
    @text = text
    @emojis = emojis
  end

  # Builds the text with every known emoji replaced by its description.
  # The result is rebuilt from scratch on each invocation, so calling this
  # method more than once always yields the same string.
  # @return [String]
  def call
    described = +''
    prev_space = true

    @text.each_char do |chr|
      if @emojis.key?(chr)
        # Separate the description from the preceding word
        described << ' ' unless prev_space
        described << @emojis[chr]
        prev_space = false
      else
        described << chr
        prev_space = chr == ' '
      end
    end

    described
  end
end
|
39
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module VaderSentimentRuby
|
4
|
+
# Converts emoji lexicon file to a dictionary
class EmojisDictionaryCreator
  # Emoji lexicon file located next to this source file
  DEFAULT_LEXICON_PATH = "#{__dir__}/data/emoji_utf8_lexicon.txt".freeze

  # Reads a tab-separated lexicon: the first field of every line is the emoji,
  # the second one is its description; any further fields are ignored.
  # Blank lines and lines without a description are skipped.
  # The file is read line by line as UTF-8 and is closed when reading ends.
  # @param [String] path Lexicon file to read (defaults to the bundled one)
  # @return [Hash] emoji => description
  def call(path = DEFAULT_LEXICON_PATH)
    emoji_dict = {}

    File.foreach(path, encoding: 'UTF-8') do |line|
      emoji, description = line.strip.split("\t")
      next if emoji.nil? || description.nil?

      emoji_dict[emoji] = description
    end

    emoji_dict
  end
end
|
21
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module VaderSentimentRuby
|
4
|
+
# Converts lexicon file to a dictionary
class LexiconDictionaryCreator
  # Word lexicon file located next to this source file
  DEFAULT_LEXICON_PATH = "#{__dir__}/data/vader_lexicon.txt".freeze

  # Reads a tab-separated lexicon: the first field of every line is the word,
  # the second one is its measure (converted with String#to_f); any further
  # fields are ignored. Blank lines are skipped.
  # The file is read line by line as UTF-8 and is closed when reading ends.
  # @param [String] path Lexicon file to read (defaults to the bundled one)
  # @return [Hash] word => measure (Float)
  def call(path = DEFAULT_LEXICON_PATH)
    lex_dict = {}

    File.foreach(path, encoding: 'UTF-8') do |line|
      word, measure = line.strip.split("\t")
      next if word.nil?

      lex_dict[word] = measure.to_f
    end

    lex_dict
  end
end
|
21
|
+
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module VaderSentimentRuby
|
4
|
+
# Adds emphasis from exclamation points and question marks
class PunctuationEmphasisAmplifier
  # Exclamation points beyond this number add no further emphasis
  MAX_EXCLAMATION_POINTS = 4
  # Empirically derived mean sentiment intensity rating increase per exclamation point
  EXCLAMATION_POINT_INCREMENT = 0.292
  # Empirically derived mean sentiment intensity rating increase per question mark
  QUESTION_MARK_INCREMENT = 0.18
  # Emphasis used when the text has more than three question marks
  QUESTION_MARKS_CEILING = 0.96

  # @param [String] text
  def initialize(text)
    @text = text
  end

  # Total emphasis contributed by exclamation points and question marks
  # @return [Float]
  def call
    amplify_exclamation_points + amplify_question_marks
  end

  # Emphasis resulting from exclamation points (up to 4 of them are counted)
  # @return [Float]
  def amplify_exclamation_points
    counted = [@text.count('!'), MAX_EXCLAMATION_POINTS].min

    counted * EXCLAMATION_POINT_INCREMENT
  end

  # Emphasis resulting from question marks: none for fewer than two,
  # proportional for two or three, a fixed value for more than three
  # @return [Float]
  def amplify_question_marks
    qm_count = @text.count('?')

    return 0.0 unless qm_count > 1
    return qm_count * QUESTION_MARK_INCREMENT if qm_count <= 3

    QUESTION_MARKS_CEILING
  end
end
|
36
|
+
end
|
@@ -0,0 +1,105 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module VaderSentimentRuby
|
4
|
+
# Returns a sentiment intensity score for sentences.
class SentimentIntensityAnalyzer
  # Builds the word lexicon and the emoji dictionary once per analyzer
  def initialize
    @lexicon = LexiconDictionaryCreator.new.call
    @emojis = EmojisDictionaryCreator.new.call
  end

  # Computes the sentiment scores of the input text.
  # Emojis are replaced by their descriptions first, then every token gets
  # a valence and the valences are combined by ValenceScoreCalculator.
  # @param [String] text Text to analyze
  # @return [Hash] Hash of sentiments for analyzed text
  def polarity_scores(text)
    text = EmojiDescriber.new(text, @emojis).call
    senti_text = SentimentPropertiesIdentifier.new(text)
    words_and_emoticons = senti_text.words_and_emoticons

    sentiments = words_and_emoticons.each_with_index.map do |item, index|
      prepare_valence(item, index, words_and_emoticons, senti_text)
    end

    sentiments = Checker::ButWordNegationChecker.new(words_and_emoticons, sentiments).call

    ValenceScoreCalculator.new(sentiments, text).call
  end

  private

  # Valence of a single token; 0 for tokens that only act as modifiers
  def prepare_valence(item, index, words_and_emoticons, senti_text)
    valence = 0

    # Check for vader_lexicon words that may be used as modifiers or negations
    return valence if Constants::BOOSTER_DICT.key?(item.downcase)

    # "kind" directly followed by "of" carries no sentiment of its own
    if index < words_and_emoticons.size - 1 &&
       item.downcase == 'kind' &&
       words_and_emoticons[index + 1].downcase == 'of'
      return valence
    end

    sentiment_valence(valence, senti_text, item, index)
  end

  # Returns the given valence unchanged unless the token is in the lexicon
  def sentiment_valence(valence, senti_text, item, index)
    item_lowercase = item.downcase
    return valence unless @lexicon.key?(item_lowercase)

    calculate_valence_for_word_in_lexicon(item, item_lowercase, index, senti_text)
  end

  # Starts from the lexicon valence and applies the checkers in order
  def calculate_valence_for_word_in_lexicon(item, item_lowercase, index, senti_text)
    is_cap_diff = senti_text.is_cap_diff
    words_and_emoticons = senti_text.words_and_emoticons

    valence = @lexicon[item_lowercase] # get the sentiment valence
    valence = Checker::NoWordChecker.new(valence, item_lowercase, index, words_and_emoticons, @lexicon).call
    # Check if sentiment laden word is in ALL CAPS (while others aren't)
    valence = apply_intensity_rating(valence) if WordHelper.word_upcase?(item) && is_cap_diff
    valence = modify_valence_by_scalar(valence, index, words_and_emoticons, is_cap_diff)
    Checker::LeastWordNegationChecker.new(valence, words_and_emoticons, index, @lexicon).call
  end

  # Pushes the valence away from zero by Constants::C_INCR
  def apply_intensity_rating(valence)
    return valence + Constants::C_INCR if valence.positive?

    valence - Constants::C_INCR
  end

  # Dampen the scalar modifier of preceding words and emoticons
  # (excluding the ones that immediately precede the item) based
  # on their distance from the current item.
  # Up to three preceding tokens are inspected; tokens that are themselves
  # in the lexicon are skipped.
  def modify_valence_by_scalar(valence, index, words_and_emoticons, is_cap_diff)
    (0..2).each do |start_index|
      next unless index > start_index
      next if @lexicon.key?(words_and_emoticons[index - (start_index + 1)].downcase)

      valence = apply_scalar(valence, words_and_emoticons, index, start_index, is_cap_diff)
      valence = Checker::NegationChecker.new(valence, words_and_emoticons, start_index, index).call
      valence = Checker::SpecialIdiomsChecker.new(valence, words_and_emoticons, index).call if start_index == 2
    end

    valence
  end

  # Adds the (distance-dampened) influence of one preceding token
  def apply_scalar(valence, words_and_emoticons, index, start_index, is_cap_diff)
    previous_word = words_and_emoticons[index - (start_index + 1)]
    scalar = Checker::PreviousWordsInfluenceChecker.new(previous_word, valence, is_cap_diff).call
    valence + adjust_scalar(scalar, start_index)
  end

  # Scales the scalar by 0.95 for the second and by 0.9 for the third
  # preceding token; a zero scalar is returned as is
  def adjust_scalar(scalar, start_index)
    return scalar if scalar.zero?

    scalar *= 0.95 if start_index == 1
    scalar *= 0.9 if start_index == 2
    scalar
  end
end
|
105
|
+
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module VaderSentimentRuby
|
4
|
+
# Identify sentiment-relevant string-level properties of input text.
class SentimentPropertiesIdentifier
  attr_reader :is_cap_diff, :words_and_emoticons

  # Non-String input is converted with #to_s and encoded as UTF-8.
  # @param [String] text
  def initialize(text)
    text = text.to_s.encode('utf-8') unless text.is_a? String
    @text = text
    @words_and_emoticons = prepare_words_and_emoticons
    @is_cap_diff = all_cap_differential?(@words_and_emoticons)
  end

  private

  # Splits the text on whitespace and strips each token with
  # WordHelper.strip_punctuation:
  # Removes leading and trailing punctuation
  # Leaves contractions and most emoticons
  # Does not preserve punc-plus-letter emoticons (e.g. :D)
  # Doesn't separate words from adjacent punctuation (keeps emoticons & contractions)
  # @return [Array]
  def prepare_words_and_emoticons
    @text.split.map { |word| WordHelper.strip_punctuation(word) }
  end

  # Check whether just some words in the input are ALL CAPS.
  # Returns `true` if some but not all items in `words` are ALL CAPS
  # @param [Array] words
  # @return [Boolean]
  def all_cap_differential?(words)
    all_cap_words = words.count { |word| WordHelper.word_upcase?(word) }

    all_cap_words.positive? && all_cap_words < words.size
  end
end
|
48
|
+
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module VaderSentimentRuby
|
4
|
+
# Separates positive versus negative sentiment scores
class SentimentScoresSifter
  # @param [Array<Numeric>] sentiments Valence of every token
  def initialize(sentiments)
    @sentiments = sentiments
  end

  # Sums positive and negative scores separately and counts neutral ones.
  # Totals are computed from scratch on each invocation, so repeated calls
  # return the same values.
  # @return [Array] [positive sum (Float), negative sum (Float), neutral count (Integer)]
  def call
    pos_sum = 0.0
    neg_sum = 0.0
    neu_count = 0

    @sentiments.each do |sentiment_score|
      if sentiment_score.positive?
        # compensates for neutral words that are counted as 1
        pos_sum += sentiment_score.to_f + 1
      elsif sentiment_score.negative?
        # when used with .abs, compensates for neutrals
        neg_sum += sentiment_score.to_f - 1
      elsif sentiment_score.zero?
        neu_count += 1
      end
    end

    [pos_sum, neg_sum, neu_count]
  end
end
|
27
|
+
end
|
@@ -0,0 +1,82 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module VaderSentimentRuby
|
4
|
+
# Prepares response with semantic score
class ValenceScoreCalculator
  # Scores reported when there is nothing to analyze
  DEFAULT_RESPONSE = {
    negative: 0.0,
    neutral: 0.0,
    positive: 0.0,
    compound: 0.0
  }.freeze

  # @param [Array<Numeric>] sentiments Valence of every token
  # @param [String] text Text the punctuation emphasis is taken from
  def initialize(sentiments, text)
    @sentiments = sentiments
    @text = text
  end

  # Computes the negative / neutral / positive ratios (rounded to 3 digits)
  # and the normalized compound score (rounded to 4 digits).
  # Missing or empty sentiments yield a copy of DEFAULT_RESPONSE; without
  # this guard an empty list would divide zero by zero and report NaN ratios.
  # @return [Hash]
  def call
    return DEFAULT_RESPONSE.dup if !@sentiments || @sentiments.empty?

    sum_s = @sentiments.map(&:to_f).sum
    # compute and add emphasis from punctuation in text
    punct_emph_amplifier = PunctuationEmphasisAmplifier.new(@text).call
    compound = normalize(sum_s, punct_emph_amplifier)

    prepare_response(compound, punct_emph_amplifier)
  end

  private

  # Normalize the score to be between -1 and 1 using an alpha that
  # approximates the max expected value
  # TODO: move to the sentiment analyzer
  def normalize(score, punct_emph_amplifier, alpha = 15)
    score = add_punctuation_emphasis(score, punct_emph_amplifier)
    norm_score = score / Math.sqrt((score * score) + alpha).to_f

    return -1.0 if norm_score < -1.0
    return 1.0 if norm_score > 1.0

    norm_score
  end

  # Moves a non-zero sum further away from zero by the punctuation emphasis
  def add_punctuation_emphasis(sum_s, punct_emph_amplifier)
    if sum_s.positive?
      sum_s += punct_emph_amplifier
    elsif sum_s.negative?
      sum_s -= punct_emph_amplifier
    end

    sum_s
  end

  # Builds the response hash: each ratio is the share of its sum in the total
  # rubocop:disable Metrics/AbcSize
  def prepare_response(compound, punct_emph_amplifier)
    pos_sum, neg_sum, neu_count = scores(punct_emph_amplifier)
    total = (pos_sum + neg_sum.to_f.abs) + neu_count

    {
      negative: (neg_sum / total.to_f).abs.round(3),
      neutral: (neu_count / total.to_f).abs.round(3),
      positive: (pos_sum / total.to_f).abs.round(3),
      compound: compound.round(4)
    }
  end
  # rubocop:enable Metrics/AbcSize

  # Prepare scores sum for result calculation
  def scores(punct_emph_amplifier)
    # discriminate between positive, negative and neutral sentiment scores
    pos_sum, neg_sum, neu_count = SentimentScoresSifter.new(@sentiments).call

    # the punctuation emphasis strengthens whichever side dominates
    if pos_sum > neg_sum.to_f.abs
      pos_sum += punct_emph_amplifier
    elsif pos_sum < neg_sum.to_f.abs
      neg_sum -= punct_emph_amplifier
    end

    [pos_sum, neg_sum, neu_count]
  end
end
|
82
|
+
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module VaderSentimentRuby
|
4
|
+
# Helper module for word manipulations to simulate Python's methods behavior
# word_upcase?(word) is similar to Python's word.isupper()
# strip_punctuation(word) is similar to Python's word.strip(string.punctuation)
module WordHelper
  # Characters treated as punctuation by strip_punctuation
  PUNCTUATIONS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'.freeze

  class << self
    # Checks that string contains at least one (ASCII) letter and all letters are in upcase
    # @param [String] word
    # @return [Boolean]
    #
    # Example
    #   word_upcase?(':D') # => true
    #   word_upcase?(':)') # => false
    def word_upcase?(word)
      word == word.upcase && word.count('A-Za-z').positive?
    end

    # Removes all trailing and leading punctuation
    # If the resulting string has two or fewer characters,
    # then it was likely an emoticon, so return original string
    # (ie ':)' stripped would be '', so just return ':)'
    # Punctuation inside the token is never touched.
    # @param [String] token
    # @return [String]
    #
    # Example
    #   strip_punctuation("'test'") # => "test"
    #   strip_punctuation("'don't'") # => "don't"
    #   strip_punctuation(":)") # => ":)"
    def strip_punctuation(token)
      characters = token.chars.drop_while { |char| punctuation?(char) }
      characters.pop while !characters.empty? && punctuation?(characters.last)

      stripped = characters.join

      return token if stripped.size <= 2

      stripped
    end

    private

    # Tells whether a single character belongs to PUNCTUATIONS
    def punctuation?(char)
      PUNCTUATIONS.include?(char)
    end
  end
end
|
93
|
+
end
|