textmood 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +66 -45
- data/bin/textmood +3 -3
- data/lib/textmood.rb +5 -3
- data/test/test.rb +4 -4
- metadata +1 -1
data/README.md
CHANGED
@@ -1,14 +1,19 @@
|
|
1
1
|
## TextMood - Simple sentiment analyzer
|
2
|
-
*TextMood* is a simple sentiment analyzer, provided as a Ruby gem with
|
3
|
-
tool for simple interoperability with other processes. It takes text
|
4
|
-
returns a sentiment score.
|
5
|
-
considered negative.
|
2
|
+
*TextMood* is a simple but powerful sentiment analyzer, provided as a Ruby gem with
|
3
|
+
a command-line tool for simple interoperability with other processes. It takes text
|
4
|
+
as input and returns a sentiment score.
|
6
5
|
|
7
|
-
The
|
8
|
-
for
|
6
|
+
The sentiment analysis is relatively simple, and works by splitting the text into
|
7
|
+
tokens and comparing each token to a pre-selected sentiment score for that token.
|
8
|
+
The combined score for all tokens is then returned.
|
9
|
+
|
10
|
+
However, TextMood also supports doing multiple passes over the text, splitting
|
11
|
+
it into tokens of N words (N-grams) for each pass. By adding multi-word tokens to
|
12
|
+
the sentiment file and using this feature, you can achieve much greater accuracy
|
13
|
+
than with just single-word analysis.
|
9
14
|
|
10
15
|
### Installation
|
11
|
-
The easiest way to get the latest stable version is to
|
16
|
+
The easiest way to get the latest stable version is to install the gem:
|
12
17
|
|
13
18
|
gem install textmood
|
14
19
|
|
@@ -17,27 +22,50 @@ If you’d like to get the bleeding-edge version:
|
|
17
22
|
git clone https://github.com/stiang/textmood
|
18
23
|
|
19
24
|
### Usage
|
20
|
-
TextMood can be used as a
|
25
|
+
TextMood can be used as a Ruby library or as a standalone CLI tool.
|
21
26
|
|
22
27
|
#### Ruby library
|
23
|
-
You can use
|
28
|
+
You can use it in a Ruby program like this:
|
24
29
|
```ruby
|
25
30
|
require "textmood"
|
26
31
|
|
27
32
|
# The :lang parameter makes TextMood use one of the bundled language sentiment files
|
28
|
-
|
29
|
-
score =
|
33
|
+
tm = TextMood.new(lang: "en_US")
|
34
|
+
score = tm.analyze("some text")
|
30
35
|
#=> '1.121'
|
31
36
|
|
32
37
|
# The :files parameter makes TextMood ignore the bundled sentiment files and use the
|
33
38
|
# specified files instead. You can specify as many files as you want.
|
34
|
-
|
39
|
+
tm = TextMood.new(files: ["en_US-mod1.txt", "emoticons.txt"])
|
40
|
+
|
41
|
+
# Using :normalize_output, you can make TextMood return a normalized value:
|
42
|
+
# 1 for positive, 0 for neutral and -1 for negative
|
43
|
+
tm = TextMood.new(lang: "en_US", normalize_output: true)
|
44
|
+
score = tm.analyze("some text")
|
45
|
+
#=> '1'
|
46
|
+
|
47
|
+
# :normalize_score will try to normalize the score to an integer between +/- 100,
|
48
|
+
# based on how many tokens were scored, which can be useful when trying to compare
|
49
|
+
# scores for texts of different length
|
50
|
+
tm = TextMood.new(lang: "en_US", normalize_score: true)
|
51
|
+
score = tm.analyze("some text")
|
52
|
+
#=> '14'
|
53
|
+
|
54
|
+
# :min_threshold and :max_threshold let you customize the way :normalize_output
|
55
|
+
# treats different values. The options below will make all scores below 1 negative,
|
56
|
+
# 1-2 will be neutral, and above 2 will be positive.
|
57
|
+
tm = TextMood.new(lang: "en_US",
|
58
|
+
normalize_output: true,
|
59
|
+
min_threshold: 1,
|
60
|
+
max_threshold: 2)
|
61
|
+
score = tm.analyze("some text")
|
62
|
+
#=> '0'
|
35
63
|
|
36
64
|
# TextMood will by default make one pass over the text, checking every word, but it
|
37
65
|
# supports doing several passes for any range of word N-grams. Both the start and end
|
38
66
|
# N-gram can be specified using the :start_ngram and :end_ngram options
|
39
|
-
|
40
|
-
score =
|
67
|
+
tm = TextMood.new(lang: "en_US", debug: true, start_ngram: 2, end_ngram: 3)
|
68
|
+
score = tm.analyze("some long text with many words")
|
41
69
|
#(stdout): some long: 0.1
|
42
70
|
#(stdout): long text: 0.1
|
43
71
|
#(stdout): text with: -0.1
|
@@ -49,23 +77,10 @@ score = scorer.score_text("some long text with many words")
|
|
49
77
|
#(stdout): with many words: 0.1
|
50
78
|
#=> '0.1'
|
51
79
|
|
52
|
-
# Using :normalize, you can make TextMood return a normalized value: 1 for positive,
|
53
|
-
# 0 for neutral and -1 for negative
|
54
|
-
scorer = TextMood.new(lang: "en_US", normalize: true)
|
55
|
-
score = scorer.score_text("some text")
|
56
|
-
#=> '1'
|
57
|
-
|
58
|
-
# :min_threshold and :max_threshold lets you customize the way :normalize treats
|
59
|
-
# different values. The options below will make all scores below 1 negative,
|
60
|
-
# 1-2 will be neutral, and above 2 will be positive.
|
61
|
-
scorer = TextMood.new(lang: "en_US", normalize: true, min_threshold: 1, max_threshold: 2)
|
62
|
-
score = scorer.score_text("some text")
|
63
|
-
#=> '0'
|
64
|
-
|
65
80
|
# :debug prints out all tokens to stdout, along with their values (or 'nil' when the
|
66
81
|
# token was not found)
|
67
|
-
|
68
|
-
score =
|
82
|
+
tm = TextMood.new(lang: "en_US", debug: true)
|
83
|
+
score = tm.analyze("some text")
|
69
84
|
#(stdout): some: 0.1
|
70
85
|
#(stdout): text: 0.1
|
71
86
|
#(stdout): some text: -0.1
|
@@ -89,6 +104,8 @@ The cli tool has many useful options, mostly mirroring those of the library. Her
|
|
89
104
|
output from `textmood -h`:
|
90
105
|
```
|
91
106
|
Usage: textmood [options] "<text>"
|
107
|
+
OR
|
108
|
+
echo "<text>" | textmood [options]
|
92
109
|
|
93
110
|
Returns a floating-point sentiment score of the provided text.
|
94
111
|
Above 0 is considered positive, below is considered negative.
|
@@ -104,28 +121,32 @@ MANDATORY options:
|
|
104
121
|
files will be loaded if this option is used.
|
105
122
|
|
106
123
|
OPTIONAL options:
|
107
|
-
|
108
|
-
|
109
|
-
sentiment file has tokens of similar N-gram length
|
124
|
+
-o, --normalize-output Return 1 (positive), -1 (negative) or 0 (neutral)
|
125
|
+
instead of the actual score. See also --min and --max.
|
110
126
|
|
111
|
-
|
112
|
-
|
113
|
-
|
127
|
+
-s, --normalize-score Tries to normalize the score to an integer between +/- 100
|
128
|
+
according to the number of tokens that were scored, making
|
129
|
+
it more feasible to compare scores for texts of different
|
130
|
+
length
|
114
131
|
|
115
|
-
-
|
116
|
-
|
132
|
+
-i, --min-threshold FLOAT Scores lower than this are considered negative when
|
133
|
+
using --normalize-output (default 0.5). Note that the
|
134
|
+
threshold is compared to the normalized score, if applicable
|
117
135
|
|
118
|
-
|
119
|
-
|
136
|
+
-x, --max-threshold FLOAT Scores higher than this are considered positive when
|
137
|
+
using --normalize-output (default 0.5). Note that the
|
138
|
+
threshold is compared to the normalized score, if applicable
|
120
139
|
|
121
|
-
|
122
|
-
|
140
|
+
-b, --start-ngram INTEGER The lowest word N-gram number to split the text into
|
141
|
+
(default 1). Note that this only makes sense if the
|
142
|
+
sentiment file has tokens of similar N-gram length
|
123
143
|
|
124
|
-
|
125
|
-
|
144
|
+
-e, --end-ngram INTEGER The highest word N-gram number to split the text into
|
145
|
+
(default 1). Note that this only makes sense if the
|
146
|
+
sentiment file has tokens of similar N-gram length
|
126
147
|
|
127
|
-
-
|
128
|
-
|
148
|
+
-k, --skip-symbols Do not include symbols file (emoticons etc.). Only applies
|
149
|
+
when using -l/--language.
|
129
150
|
|
130
151
|
-d, --debug Prints out the score for each token in the provided text
|
131
152
|
or 'nil' if the token was not found in the sentiment file
|
data/bin/textmood
CHANGED
@@ -63,7 +63,7 @@ opts_parser = OptionParser.new do |opts|
|
|
63
63
|
opts.separator ""
|
64
64
|
opts.on("-s", "--normalize-score", "Tries to normalize the score to an integer between +/- 100",
|
65
65
|
"according to the number of tokens that were scored, making",
|
66
|
-
"it more feasible to compare scores
|
66
|
+
"it more feasible to compare scores for texts of different",
|
67
67
|
"length") do |ns|
|
68
68
|
options[:normalize_score] = true
|
69
69
|
end
|
@@ -111,8 +111,8 @@ end
|
|
111
111
|
opts_parser.parse!
|
112
112
|
|
113
113
|
def do_main(text, options)
|
114
|
-
|
115
|
-
puts
|
114
|
+
tm = TextMood.new(options)
|
115
|
+
puts tm.analyze(text)
|
116
116
|
end
|
117
117
|
|
118
118
|
if ARGV[0]
|
data/lib/textmood.rb
CHANGED
@@ -37,7 +37,7 @@ class TextMood
|
|
37
37
|
end
|
38
38
|
|
39
39
|
# analyzes the sentiment of the provided text.
|
40
|
-
def
|
40
|
+
def analyze(text)
|
41
41
|
sentiment_total = 0.0
|
42
42
|
|
43
43
|
scores_added = 0
|
@@ -67,6 +67,8 @@ class TextMood
|
|
67
67
|
end
|
68
68
|
end
|
69
69
|
|
70
|
+
alias_method :analyse, :analyze
|
71
|
+
|
70
72
|
private
|
71
73
|
|
72
74
|
def score_token(token)
|
@@ -115,8 +117,8 @@ class TextMood
|
|
115
117
|
end
|
116
118
|
|
117
119
|
def normalize_score(score, count)
|
118
|
-
factor = NORMALIZE_TO / count
|
119
|
-
(score * factor).
|
120
|
+
factor = NORMALIZE_TO.to_f / count.to_f
|
121
|
+
(score * factor).round
|
120
122
|
end
|
121
123
|
|
122
124
|
end
|
data/test/test.rb
CHANGED
@@ -17,14 +17,14 @@ include Test::Unit::Assertions
|
|
17
17
|
class TestScorer < Test::Unit::TestCase
|
18
18
|
|
19
19
|
def setup
|
20
|
-
@
|
20
|
+
@tm = TextMood.new({:lang => "en_US"})
|
21
21
|
end
|
22
22
|
|
23
23
|
def test_negative
|
24
24
|
max = -0.01
|
25
25
|
texts = ["This is just terrible"]
|
26
26
|
texts.each do |text|
|
27
|
-
actual_score = @
|
27
|
+
actual_score = @tm.analyze(text)
|
28
28
|
assert((actual_score < max), "actual: #{actual_score} >= max: #{max} for '#{text}'")
|
29
29
|
end
|
30
30
|
end
|
@@ -34,7 +34,7 @@ class TestScorer < Test::Unit::TestCase
|
|
34
34
|
max = 0.5
|
35
35
|
texts = ["This is neutral"]
|
36
36
|
texts.each do |text, test_score|
|
37
|
-
actual_score = @
|
37
|
+
actual_score = @tm.analyze(text)
|
38
38
|
assert((actual_score < max and actual_score > min), "min: #{min} <= actual: #{actual_score} >= max: #{max} for '#{text}'")
|
39
39
|
end
|
40
40
|
end
|
@@ -43,7 +43,7 @@ class TestScorer < Test::Unit::TestCase
|
|
43
43
|
min = 0.01
|
44
44
|
texts = ["This is amazing!"]
|
45
45
|
texts.each do |text, test_score|
|
46
|
-
actual_score = @
|
46
|
+
actual_score = @tm.analyze(text)
|
47
47
|
assert((actual_score >= min), "actual: #{actual_score} <= max: #{min} for '#{text}'")
|
48
48
|
end
|
49
49
|
end
|