textmood 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +50 -9
- data/bin/textmood +40 -7
- data/lib/textmood.rb +3 -3
- data/test/test.rb +1 -1
- metadata +1 -1
data/README.md
CHANGED
@@ -52,7 +52,7 @@ You can use it in a Ruby program like this:
|
|
52
52
|
require "textmood"
|
53
53
|
|
54
54
|
# The :lang parameter makes TextMood use one of the bundled language sentiment files
|
55
|
-
tm = TextMood.new(
|
55
|
+
tm = TextMood.new(language: "en")
|
56
56
|
score = tm.analyze("some text")
|
57
57
|
#=> '1.121'
|
58
58
|
|
@@ -62,18 +62,18 @@ tm = TextMood.new(files: ["en_US-mod1.txt", "emoticons.txt"])
|
|
62
62
|
|
63
63
|
# Use :alias_file to make TextMood look up the file to use for the given language tag
|
64
64
|
# in a JSON file containing a hash with {"language_tag": "path_to_file"} mappings
|
65
|
-
tm = TextMood.new(
|
65
|
+
tm = TextMood.new(language: "zw", alias_file: "my-custom-languages.json")
|
66
66
|
|
67
67
|
# :normalize_score will try to normalize the score to an integer between +/- 100,
|
68
68
|
# based on how many tokens were scored, which can be useful when trying to compare
|
69
69
|
# scores for texts of different length
|
70
|
-
tm = TextMood.new(
|
70
|
+
tm = TextMood.new(language: "en", normalize_score: true)
|
71
71
|
score = tm.analyze("some text")
|
72
72
|
#=> '14'
|
73
73
|
|
74
74
|
# :ternary_output will make TextMood return one of three fixed values:
|
75
75
|
# 1 for positive, 0 for neutral and -1 for negative
|
76
|
-
tm = TextMood.new(
|
76
|
+
tm = TextMood.new(language: "en", ternary_output: true)
|
77
77
|
score = tm.analyze("some text")
|
78
78
|
#=> '1'
|
79
79
|
|
@@ -81,7 +81,7 @@ score = tm.analyze("some text")
|
|
81
81
|
# treats different values. The options below will make all scores below 10 negative,
|
82
82
|
# 10-20 will be neutral, and above 20 will be positive. Note that these thresholds
|
83
83
|
# are compared to the normalized score, if applicable.
|
84
|
-
tm = TextMood.new(
|
84
|
+
tm = TextMood.new(language: "en",
|
85
85
|
ternary_output: true,
|
86
86
|
normalize_score: true,
|
87
87
|
min_threshold: 10,
|
@@ -92,7 +92,7 @@ score = tm.analyze("some text")
|
|
92
92
|
# TextMood will by default make one pass over the text, checking every word, but it
|
93
93
|
# supports doing several passes for any range of word N-grams. Both the start and end
|
94
94
|
# N-gram can be specified using the :start_ngram and :end_ngram options
|
95
|
-
tm = TextMood.new(
|
95
|
+
tm = TextMood.new(language: "en", debug: true, start_ngram: 2, end_ngram: 3)
|
96
96
|
score = tm.analyze("some long text with many words")
|
97
97
|
#(stdout): some long: 0.1
|
98
98
|
#(stdout): long text: 0.1
|
@@ -107,7 +107,7 @@ score = tm.analyze("some long text with many words")
|
|
107
107
|
|
108
108
|
# :debug prints out all tokens to stdout, along with their values (or 'nil' when the
|
109
109
|
# token was not found)
|
110
|
-
tm = TextMood.new(
|
110
|
+
tm = TextMood.new(language: "en", debug: true)
|
111
111
|
score = tm.analyze("some text")
|
112
112
|
#(stdout): some: 0.1
|
113
113
|
#(stdout): text: 0.1
|
@@ -140,7 +140,7 @@ Above 0 is considered positive, below is considered negative.
|
|
140
140
|
|
141
141
|
MANDATORY options:
|
142
142
|
-l, --language LANGUAGE The IETF language tag for the provided text.
|
143
|
-
Examples:
|
143
|
+
Examples: en_US, no_NB
|
144
144
|
|
145
145
|
OR
|
146
146
|
|
@@ -149,6 +149,10 @@ MANDATORY options:
|
|
149
149
|
files will be loaded if this option is used.
|
150
150
|
|
151
151
|
OPTIONAL options:
|
152
|
+
-a, --alias-file PATH TO FILE JSON file containing a hash that maps language codes to
|
153
|
+
sentiment score files. This lets you use the convenience of
|
154
|
+
language codes with custom sentiment score files.
|
155
|
+
|
152
156
|
-n, --normalize-score Tries to normalize the score to an integer between +/- 100
|
153
157
|
according to the number of tokens that were scored, making
|
154
158
|
it more feasible to compare scores for texts of different
|
@@ -159,7 +163,7 @@ OPTIONAL options:
|
|
159
163
|
and --max-threshold.
|
160
164
|
|
161
165
|
-i, --min-threshold FLOAT Scores lower than this are considered negative when
|
162
|
-
using --ternary-output (default
|
166
|
+
using --ternary-output (default 0.5). Note that the
|
163
167
|
threshold is compared to the normalized score, if applicable
|
164
168
|
|
165
169
|
-x, --max-threshold FLOAT Scores higher than this are considered positive when
|
@@ -183,6 +187,43 @@ OPTIONAL options:
|
|
183
187
|
-h, --help Show this message
|
184
188
|
```
|
185
189
|
|
190
|
+
### Configuration files for the CLI tool
|
191
|
+
The CLI tool will look for /etc/textmood.cfg and ~/.textmood unless the -c/--config option
|
192
|
+
is used, in which case only that file is used. The configuration files are basic, flat
|
193
|
+
YAML files that use the same keys that the library understands:
|
194
|
+
```yaml
|
195
|
+
# Assume that text is in this language, unless overridden on the command line.
|
196
|
+
# Do not use this in conjunction with the files setting.
|
197
|
+
language: en
|
198
|
+
|
199
|
+
# Load these sentiment score files instead of using any of the bundled ones
|
200
|
+
# Do not use this in conjunction with the language setting
|
201
|
+
files: [/path/to/file1, /path/to/file2]
|
202
|
+
|
203
|
+
# Use a global alias file to resolve language codes
|
204
|
+
alias_file: /home/john/textmood/aliases.json
|
205
|
+
|
206
|
+
# Always normalize the score
|
207
|
+
normalize_score: true
|
208
|
+
|
209
|
+
# Use ternary output
|
210
|
+
ternary_output: true
|
211
|
+
|
212
|
+
# Use these thresholds when using ternary output
|
213
|
+
max_threshold: 10
|
214
|
+
min_threshold: 5
|
215
|
+
|
216
|
+
# Do three passes, scoring unigrams, bigrams and trigrams
|
217
|
+
start_ngram: 1
|
218
|
+
end_ngram: 3
|
219
|
+
|
220
|
+
# Do not load the symbols file when using a bundled language
|
221
|
+
skip_symbols: true
|
222
|
+
|
223
|
+
# Always print debug info
|
224
|
+
debug: true
|
225
|
+
```
|
226
|
+
|
186
227
|
## Sentiment files
|
187
228
|
The included sentiment files reside in the *lang* directory. I hope to add many
|
188
229
|
more baseline sentiment files in the future.
|
data/bin/textmood
CHANGED
@@ -12,8 +12,13 @@ $:.unshift File.join(File.dirname(__FILE__), *%w{ .. lib })
|
|
12
12
|
|
13
13
|
require "optparse"
|
14
14
|
require "textmood"
|
15
|
+
require "yaml"
|
15
16
|
|
16
|
-
usage =
|
17
|
+
usage = <<-eos
|
18
|
+
Usage: #{File.basename($0)} [options] "<text>"
|
19
|
+
OR
|
20
|
+
echo "<text>" | #{File.basename($0)} [options]"
|
21
|
+
eos
|
17
22
|
|
18
23
|
def mini_usage(usage, notext = false)
|
19
24
|
puts usage
|
@@ -22,7 +27,8 @@ def mini_usage(usage, notext = false)
|
|
22
27
|
puts "ERROR: Quoted text must be provided after the last option."
|
23
28
|
else
|
24
29
|
puts "ERROR: An IETF language tag must be provided using the -l/--language option,"
|
25
|
-
puts " or sentiment files must be provided with the -f/--file option."
|
30
|
+
puts " or sentiment files must be provided with the -f/--file option. These"
|
31
|
+
puts " values can also be set in /etc/textmood.cfg or ~/.textmood."
|
26
32
|
end
|
27
33
|
puts ""
|
28
34
|
puts "Use \"#{File.basename($0)} -h\" for full usage info."
|
@@ -30,21 +36,27 @@ def mini_usage(usage, notext = false)
|
|
30
36
|
exit 20
|
31
37
|
end
|
32
38
|
|
33
|
-
|
34
|
-
|
39
|
+
def parse_config_file(file, debug = false)
|
40
|
+
if File.file?(file)
|
41
|
+
puts "Using config: #{file}" if debug
|
42
|
+
YAML.load(File.read(file))
|
43
|
+
else
|
44
|
+
{}
|
45
|
+
end
|
35
46
|
end
|
36
47
|
|
37
48
|
options = {:files => []}
|
49
|
+
|
38
50
|
opts_parser = OptionParser.new do |opts|
|
39
51
|
opts.banner = usage
|
40
52
|
opts.separator ""
|
41
|
-
opts.separator "Returns a
|
42
|
-
opts.separator "
|
53
|
+
opts.separator "Returns a sentiment score of the provided text. Above 0 is usually"
|
54
|
+
opts.separator "considered positive, below is considered negative."
|
43
55
|
opts.separator ""
|
44
56
|
opts.separator "MANDATORY options:"
|
45
57
|
opts.on("-l", "--language LANGUAGE", "The IETF language tag for the provided text.",
|
46
58
|
"Examples: en_US, no_NB") do |l|
|
47
|
-
options[:
|
59
|
+
options[:language] = l
|
48
60
|
end
|
49
61
|
opts.separator ""
|
50
62
|
opts.separator " OR "
|
@@ -104,6 +116,12 @@ opts_parser = OptionParser.new do |opts|
|
|
104
116
|
options[:include_symbols] = false
|
105
117
|
end
|
106
118
|
opts.separator ""
|
119
|
+
opts.on("-c", "--config PATH TO FILE", "Use the specified config file. If not specified, textmood will look",
|
120
|
+
"for /etc/textmood.cfg and ~/.textmood. Settings in the user config",
|
121
|
+
"will override settings from the global file.") do |c|
|
122
|
+
options[:config] = c.to_s
|
123
|
+
end
|
124
|
+
opts.separator ""
|
107
125
|
opts.on("-d", "--debug", "Prints out the score for each token in the provided text",
|
108
126
|
"or 'nil' if the token was not found in the sentiment file") do |d|
|
109
127
|
options[:debug] = true
|
@@ -117,6 +135,21 @@ opts_parser = OptionParser.new do |opts|
|
|
117
135
|
end
|
118
136
|
opts_parser.parse!
|
119
137
|
|
138
|
+
if options[:config]
|
139
|
+
options.merge!(parse_config_file(options[:config], options[:debug]))
|
140
|
+
else
|
141
|
+
["/etc/textmood.cfg", File.expand_path("~/.textmood")].each do |file|
|
142
|
+
options.merge!(parse_config_file(file, options[:debug]))
|
143
|
+
end
|
144
|
+
end
|
145
|
+
|
146
|
+
options = Hash[options.map{ |k, v| [k.to_sym, v] }]
|
147
|
+
|
148
|
+
unless ((options[:language] or not options[:files].empty?) or (ARGV[0] and ARGV[1]))
|
149
|
+
mini_usage(usage)
|
150
|
+
exit 2
|
151
|
+
end
|
152
|
+
|
120
153
|
def do_main(text, options)
|
121
154
|
tm = TextMood.new(options)
|
122
155
|
puts tm.analyze(text)
|
data/lib/textmood.rb
CHANGED
@@ -19,11 +19,11 @@ class TextMood
|
|
19
19
|
options[:start_ngram] ||= 1
|
20
20
|
options[:end_ngram] ||= 1
|
21
21
|
@options = options
|
22
|
-
if options[:
|
22
|
+
if options[:language]
|
23
23
|
if options[:alias_file]
|
24
24
|
aliases = load_alias_file(options[:alias_file])
|
25
25
|
if aliases
|
26
|
-
file = aliases[options[:
|
26
|
+
file = aliases[options[:language]]
|
27
27
|
unless file
|
28
28
|
raise ArgumentError, "Language tag not found in alias file"
|
29
29
|
end
|
@@ -31,7 +31,7 @@ class TextMood
|
|
31
31
|
raise ArgumentError, "Alias file not found"
|
32
32
|
end
|
33
33
|
else
|
34
|
-
file = File.dirname(__FILE__) + "/../lang/#{options[:
|
34
|
+
file = File.dirname(__FILE__) + "/../lang/#{options[:language]}.txt"
|
35
35
|
end
|
36
36
|
@sentiment_values = load_sentiment_file(file)
|
37
37
|
unless options[:include_symbols] == false
|
data/test/test.rb
CHANGED