textmood 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +50 -9
- data/bin/textmood +40 -7
- data/lib/textmood.rb +3 -3
- data/test/test.rb +1 -1
- metadata +1 -1
data/README.md
CHANGED
@@ -52,7 +52,7 @@ You can use it in a Ruby program like this:
|
|
52
52
|
require "textmood"
|
53
53
|
|
54
54
|
# The :lang parameter makes TextMood use one of the bundled language sentiment files
|
55
|
-
tm = TextMood.new(
|
55
|
+
tm = TextMood.new(language: "en")
|
56
56
|
score = tm.analyze("some text")
|
57
57
|
#=> '1.121'
|
58
58
|
|
@@ -62,18 +62,18 @@ tm = TextMood.new(files: ["en_US-mod1.txt", "emoticons.txt"])
|
|
62
62
|
|
63
63
|
# Use :alias_file to make TextMood look up the file to use for the given language tag
|
64
64
|
# in a JSON file containing a hash with {"language_tag": "path_to_file"} mappings
|
65
|
-
tm = TextMood.new(
|
65
|
+
tm = TextMood.new(language: "zw", alias_file: "my-custom-languages.json")
|
66
66
|
|
67
67
|
# :normalize_score will try to normalize the score to an integer between +/- 100,
|
68
68
|
# based on how many tokens were scored, which can be useful when trying to compare
|
69
69
|
# scores for texts of different length
|
70
|
-
tm = TextMood.new(
|
70
|
+
tm = TextMood.new(language: "en", normalize_score: true)
|
71
71
|
score = tm.analyze("some text")
|
72
72
|
#=> '14'
|
73
73
|
|
74
74
|
# :ternary_output will make TextMood return one of three fixed values:
|
75
75
|
# 1 for positive, 0 for neutral and -1 for negative
|
76
|
-
tm = TextMood.new(
|
76
|
+
tm = TextMood.new(language: "en", ternary_output: true)
|
77
77
|
score = tm.analyze("some text")
|
78
78
|
#=> '1'
|
79
79
|
|
@@ -81,7 +81,7 @@ score = tm.analyze("some text")
|
|
81
81
|
# treats different values. The options below will make all scores below 10 negative,
|
82
82
|
# 10-20 will be neutral, and above 20 will be positive. Note that these thresholds
|
83
83
|
# are compared to the normalized score, if applicable.
|
84
|
-
tm = TextMood.new(
|
84
|
+
tm = TextMood.new(language: "en",
|
85
85
|
ternary_output: true,
|
86
86
|
normalize_score: true,
|
87
87
|
min_threshold: 10,
|
@@ -92,7 +92,7 @@ score = tm.analyze("some text")
|
|
92
92
|
# TextMood will by default make one pass over the text, checking every word, but it
|
93
93
|
# supports doing several passes for any range of word N-grams. Both the start and end
|
94
94
|
# N-gram can be specified using the :start_ngram and :end_ngram options
|
95
|
-
tm = TextMood.new(
|
95
|
+
tm = TextMood.new(language: "en", debug: true, start_ngram: 2, end_ngram: 3)
|
96
96
|
score = tm.analyze("some long text with many words")
|
97
97
|
#(stdout): some long: 0.1
|
98
98
|
#(stdout): long text: 0.1
|
@@ -107,7 +107,7 @@ score = tm.analyze("some long text with many words")
|
|
107
107
|
|
108
108
|
# :debug prints out all tokens to stdout, along with their values (or 'nil' when the
|
109
109
|
# token was not found)
|
110
|
-
tm = TextMood.new(
|
110
|
+
tm = TextMood.new(language: "en", debug: true)
|
111
111
|
score = tm.analyze("some text")
|
112
112
|
#(stdout): some: 0.1
|
113
113
|
#(stdout): text: 0.1
|
@@ -140,7 +140,7 @@ Above 0 is considered positive, below is considered negative.
|
|
140
140
|
|
141
141
|
MANDATORY options:
|
142
142
|
-l, --language LANGUAGE The IETF language tag for the provided text.
|
143
|
-
Examples:
|
143
|
+
Examples: en_US, no_NB
|
144
144
|
|
145
145
|
OR
|
146
146
|
|
@@ -149,6 +149,10 @@ MANDATORY options:
|
|
149
149
|
files will be loaded if this option is used.
|
150
150
|
|
151
151
|
OPTIONAL options:
|
152
|
+
-a, --alias-file PATH TO FILE JSON file containing a hash that maps language codes to
|
153
|
+
sentiment score files. This lets you use the convenience of
|
154
|
+
language codes with custom sentiment score files.
|
155
|
+
|
152
156
|
-n, --normalize-score Tries to normalize the score to an integer between +/- 100
|
153
157
|
according to the number of tokens that were scored, making
|
154
158
|
it more feasible to compare scores for texts of different
|
@@ -159,7 +163,7 @@ OPTIONAL options:
|
|
159
163
|
and --max-threshold.
|
160
164
|
|
161
165
|
-i, --min-threshold FLOAT Scores lower than this are considered negative when
|
162
|
-
using --ternary-output (default
|
166
|
+
using --ternary-output (default 0.5). Note that the
|
163
167
|
threshold is compared to the normalized score, if applicable
|
164
168
|
|
165
169
|
-x, --max-threshold FLOAT Scores higher than this are considered positive when
|
@@ -183,6 +187,43 @@ OPTIONAL options:
|
|
183
187
|
-h, --help Show this message
|
184
188
|
```
|
185
189
|
|
190
|
+
### Configuration files for the CLI tool
|
191
|
+
The CLI tool will look for /etc/textmood.cfg and ~/.textmood unless the -c/--config option
|
192
|
+
is used, in which case only that file is used. The configuration files are basic, flat
|
193
|
+
YAML files that use the same keys as the library understands:
|
194
|
+
```yaml
|
195
|
+
# Assume that text is in this language, unless overridden on the command line.
|
196
|
+
# Do not use this in conjunction with the files setting.
|
197
|
+
language: en
|
198
|
+
|
199
|
+
# Load these sentiment score files instead of using any of the bundled ones
|
200
|
+
# Do not use this in conjunction with the language setting
|
201
|
+
files: [/path/to/file1, /path/to/file2]
|
202
|
+
|
203
|
+
# Use a global alias file to resolve language codes
|
204
|
+
alias_file: /home/john/textmood/aliases.json
|
205
|
+
|
206
|
+
# Always normalize the score
|
207
|
+
normalize_score: true
|
208
|
+
|
209
|
+
# Use ternary output
|
210
|
+
ternary_output: true
|
211
|
+
|
212
|
+
# Use these thresholds when using ternary output
|
213
|
+
max_threshold: 10
|
214
|
+
min_threshold: 5
|
215
|
+
|
216
|
+
# Do three passes, scoring unigrams, bigrams and trigrams
|
217
|
+
start_ngram: 1
|
218
|
+
end_ngram: 3
|
219
|
+
|
220
|
+
# Do not load the symbols file when using a bundled language
|
221
|
+
include_symbols: false
|
222
|
+
|
223
|
+
# Always print debug info
|
224
|
+
debug: true
|
225
|
+
```
|
226
|
+
|
186
227
|
## Sentiment files
|
187
228
|
The included sentiment files reside in the *lang* directory. I hope to add many
|
188
229
|
more baseline sentiment files in the future.
|
data/bin/textmood
CHANGED
@@ -12,8 +12,13 @@ $:.unshift File.join(File.dirname(__FILE__), *%w{ .. lib })
|
|
12
12
|
|
13
13
|
require "optparse"
|
14
14
|
require "textmood"
|
15
|
+
require "yaml"
|
15
16
|
|
16
|
-
usage =
|
17
|
+
usage = <<-eos
|
18
|
+
Usage: #{File.basename($0)} [options] "<text>"
|
19
|
+
OR
|
20
|
+
echo "<text>" | #{File.basename($0)} [options]"
|
21
|
+
eos
|
17
22
|
|
18
23
|
def mini_usage(usage, notext = false)
|
19
24
|
puts usage
|
@@ -22,7 +27,8 @@ def mini_usage(usage, notext = false)
|
|
22
27
|
puts "ERROR: Quoted text must be provided after the last option."
|
23
28
|
else
|
24
29
|
puts "ERROR: An IETF language tag must be provided using the -l/--language option,"
|
25
|
-
puts " or sentiment files must be provided with the -f/--file option."
|
30
|
+
puts " or sentiment files must be provided with the -f/--file option. These"
|
31
|
+
puts " values can also be set in /etc/textmood.cfg or ~/.textmood."
|
26
32
|
end
|
27
33
|
puts ""
|
28
34
|
puts "Use \"#{File.basename($0)} -h\" for full usage info."
|
@@ -30,21 +36,27 @@ def mini_usage(usage, notext = false)
|
|
30
36
|
exit 20
|
31
37
|
end
|
32
38
|
|
33
|
-
|
34
|
-
|
39
|
+
def parse_config_file(file, debug = false)
|
40
|
+
if File.file?(file)
|
41
|
+
puts "Using config: #{file}" if debug
|
42
|
+
YAML.load(File.read(file))
|
43
|
+
else
|
44
|
+
{}
|
45
|
+
end
|
35
46
|
end
|
36
47
|
|
37
48
|
options = {:files => []}
|
49
|
+
|
38
50
|
opts_parser = OptionParser.new do |opts|
|
39
51
|
opts.banner = usage
|
40
52
|
opts.separator ""
|
41
|
-
opts.separator "Returns a
|
42
|
-
opts.separator "
|
53
|
+
opts.separator "Returns a sentiment score of the provided text. Above 0 is usually"
|
54
|
+
opts.separator "considered positive, below is considered negative."
|
43
55
|
opts.separator ""
|
44
56
|
opts.separator "MANDATORY options:"
|
45
57
|
opts.on("-l", "--language LANGUAGE", "The IETF language tag for the provided text.",
|
46
58
|
"Examples: en_US, no_NB") do |l|
|
47
|
-
options[:
|
59
|
+
options[:language] = l
|
48
60
|
end
|
49
61
|
opts.separator ""
|
50
62
|
opts.separator " OR "
|
@@ -104,6 +116,12 @@ opts_parser = OptionParser.new do |opts|
|
|
104
116
|
options[:include_symbols] = false
|
105
117
|
end
|
106
118
|
opts.separator ""
|
119
|
+
opts.on("-c", "--config PATH TO FILE", "Use the specified config file. If not specified, textmood will look",
|
120
|
+
"for /etc/textmood.cfg and ~/.textmood. Settings in the user config",
|
121
|
+
"will override settings from the global file.") do |c|
|
122
|
+
options[:config] = c.to_s
|
123
|
+
end
|
124
|
+
opts.separator ""
|
107
125
|
opts.on("-d", "--debug", "Prints out the score for each token in the provided text",
|
108
126
|
"or 'nil' if the token was not found in the sentiment file") do |d|
|
109
127
|
options[:debug] = true
|
@@ -117,6 +135,21 @@ opts_parser = OptionParser.new do |opts|
|
|
117
135
|
end
|
118
136
|
opts_parser.parse!
|
119
137
|
|
138
|
+
if options[:config]
|
139
|
+
options.merge!(parse_config_file(options[:config], options[:debug]))
|
140
|
+
else
|
141
|
+
["/etc/textmood.cfg", File.expand_path("~/.textmood")].each do |file|
|
142
|
+
options.merge!(parse_config_file(file, options[:debug]))
|
143
|
+
end
|
144
|
+
end
|
145
|
+
|
146
|
+
options = Hash[options.map{ |k, v| [k.to_sym, v] }]
|
147
|
+
|
148
|
+
unless ((options[:language] or not options[:files].empty?) or (ARGV[0] and ARGV[1]))
|
149
|
+
mini_usage(usage)
|
150
|
+
exit 2
|
151
|
+
end
|
152
|
+
|
120
153
|
def do_main(text, options)
|
121
154
|
tm = TextMood.new(options)
|
122
155
|
puts tm.analyze(text)
|
data/lib/textmood.rb
CHANGED
@@ -19,11 +19,11 @@ class TextMood
|
|
19
19
|
options[:start_ngram] ||= 1
|
20
20
|
options[:end_ngram] ||= 1
|
21
21
|
@options = options
|
22
|
-
if options[:
|
22
|
+
if options[:language]
|
23
23
|
if options[:alias_file]
|
24
24
|
aliases = load_alias_file(options[:alias_file])
|
25
25
|
if aliases
|
26
|
-
file = aliases[options[:
|
26
|
+
file = aliases[options[:language]]
|
27
27
|
unless file
|
28
28
|
raise ArgumentError, "Language tag not found in alias file"
|
29
29
|
end
|
@@ -31,7 +31,7 @@ class TextMood
|
|
31
31
|
raise ArgumentError, "Alias file not found"
|
32
32
|
end
|
33
33
|
else
|
34
|
-
file = File.dirname(__FILE__) + "/../lang/#{options[:
|
34
|
+
file = File.dirname(__FILE__) + "/../lang/#{options[:language]}.txt"
|
35
35
|
end
|
36
36
|
@sentiment_values = load_sentiment_file(file)
|
37
37
|
unless options[:include_symbols] == false
|
data/test/test.rb
CHANGED