textmood 0.0.4 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +6 -3
- data/bin/textmood +28 -19
- data/lang/no_NB.txt +5 -29
- data/lib/textmood.rb +18 -3
- metadata +1 -1
data/README.md
CHANGED
@@ -112,7 +112,10 @@ OPTIONAL options:
|
|
112
112
|
(default 1). Note that this only makes sense if the
|
113
113
|
sentiment file has tokens of similar N-gram length
|
114
114
|
|
115
|
-
-n, --normalize
|
115
|
+
-o, --normalize-output Return 1 (positive), -1 (negative) or 0 (neutral)
|
116
|
+
instead of the actual score. See also --min and --max.
|
117
|
+
|
118
|
+
--normalize-score Return 1 (positive), -1 (negative) or 0 (neutral)
|
116
119
|
instead of the actual score. See also --min and --max.
|
117
120
|
|
118
121
|
--min-threshold FLOAT Scores lower than this are considered negative when
|
@@ -149,8 +152,8 @@ and contain one colon-separated line per token, like so:
|
|
149
152
|
0.875: well-situated
|
150
153
|
0.6: well suited
|
151
154
|
```
|
152
|
-
The score is to the left of the first ':',
|
153
|
-
(potentially multi-word) token.
|
155
|
+
The score, which must be between -1.0 and 1.0, is to the left of the first ':',
|
156
|
+
and everything to the right is the (potentially multi-word) token.
|
154
157
|
|
155
158
|
## Contribute
|
156
159
|
Including baseline word/N-gram scores for many different languages is one
|
data/bin/textmood
CHANGED
@@ -56,35 +56,44 @@ opts_parser = OptionParser.new do |opts|
|
|
56
56
|
end
|
57
57
|
opts.separator ""
|
58
58
|
opts.separator "OPTIONAL options:"
|
59
|
-
opts.on("
|
60
|
-
|
61
|
-
|
62
|
-
options[:start_ngram] = start_ngram.to_i
|
59
|
+
opts.on("-o", "--normalize-output", "Return 1 (positive), -1 (negative) or 0 (neutral)",
|
60
|
+
"instead of the actual score. See also --min and --max.") do |n|
|
61
|
+
options[:normalize_output] = true
|
63
62
|
end
|
64
63
|
opts.separator ""
|
65
|
-
opts.on("
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
opts.separator ""
|
71
|
-
opts.on("-n", "--normalize", "Return 1 (positive), -1 (negative) or 0 (neutral)",
|
72
|
-
"instead of the actual score. See also --min and --max.") do |n|
|
73
|
-
options[:normalize] = true
|
64
|
+
opts.on("-s", "--normalize-score", "Tries to normalize the score to an integer between +/- 100",
|
65
|
+
"according to the number of tokens that were scored, making",
|
66
|
+
"it more feasible to compare scores between texts of different",
|
67
|
+
"length") do |ns|
|
68
|
+
options[:normalize_score] = true
|
74
69
|
end
|
75
70
|
opts.separator ""
|
76
|
-
opts.on("--min-threshold FLOAT", "Scores lower than this are considered negative when",
|
77
|
-
"using --normalize (default
|
71
|
+
opts.on("-i", "--min-threshold FLOAT", "Scores lower than this are considered negative when",
|
72
|
+
"using --normalize-output (default 0.5). Note that the",
|
73
|
+
"threshold is compared to the normalized score, if applicable") do |min|
|
78
74
|
options[:min_threshold] = min.to_f
|
79
75
|
end
|
80
76
|
opts.separator ""
|
81
|
-
opts.on("--max-threshold FLOAT", "Scores higher than this are considered positive when",
|
82
|
-
"using --normalize (default 0.5)
|
77
|
+
opts.on("-x", "--max-threshold FLOAT", "Scores higher than this are considered positive when",
|
78
|
+
"using --normalize-output (default 0.5). Note that the",
|
79
|
+
"threshold is compared to the normalized score, if applicable") do |max|
|
83
80
|
options[:max_threshold] = max.to_f
|
84
81
|
end
|
85
82
|
opts.separator ""
|
86
|
-
opts.on("-
|
87
|
-
|
83
|
+
opts.on("-b", "--start-ngram INTEGER", "The lowest word N-gram number to split the text into",
|
84
|
+
"(default 1). Note that this only makes sense if the",
|
85
|
+
"sentiment file has tokens of similar N-gram length") do |start_ngram|
|
86
|
+
options[:start_ngram] = start_ngram.to_i
|
87
|
+
end
|
88
|
+
opts.separator ""
|
89
|
+
opts.on("-e", "--end-ngram INTEGER", "The highest word N-gram number to to split the text into",
|
90
|
+
"(default 1). Note that this only makes sense if the",
|
91
|
+
"sentiment file has tokens of similar N-gram length") do |end_ngram|
|
92
|
+
options[:end_ngram] = end_ngram.to_i
|
93
|
+
end
|
94
|
+
opts.separator ""
|
95
|
+
opts.on("-k", "--skip-symbols", "Do not include symbols file (emoticons etc.). Only applies",
|
96
|
+
"when using -l/--language.") do |s|
|
88
97
|
options[:include_symbols] = false
|
89
98
|
end
|
90
99
|
opts.separator ""
|
data/lang/no_NB.txt
CHANGED
@@ -8,21 +8,14 @@
|
|
8
8
|
0.87: godt fast
|
9
9
|
0.87: upretensiøs
|
10
10
|
0.87: undervurdert
|
11
|
-
0.87: top-hulls
|
12
11
|
0.87: terapeutisk
|
13
|
-
0.87: stirrer
|
14
|
-
0.87: utlagte masser
|
15
12
|
0.87: selvrespekt
|
16
|
-
0.87: self-respektfull
|
17
|
-
0.87: selvrespekt
|
18
|
-
0.87: self-hensyn
|
19
13
|
0.87: helsebringende
|
20
14
|
0.87: rosenrød i kinnene
|
21
15
|
0.87: utstråle
|
22
16
|
0.87: utstråling
|
23
17
|
0.87: velstand
|
24
18
|
0.87: prinsipiell
|
25
|
-
0.87: ut-og-ytre
|
26
19
|
0.87: moraliserende
|
27
20
|
0.87: mesterstykke
|
28
21
|
0.87: mesterlig
|
@@ -64,7 +57,6 @@
|
|
64
57
|
0.75: fagmessig
|
65
58
|
0.75: snedig
|
66
59
|
0.75: helhet
|
67
|
-
0.75: godt gjennomtenkt av
|
68
60
|
0.75: velproporsjonert
|
69
61
|
0.75: godt bevart
|
70
62
|
0.75: godt favoriserte
|
@@ -78,8 +70,6 @@
|
|
78
70
|
0.75: infisert
|
79
71
|
0.75: urettferdig
|
80
72
|
0.75: kle
|
81
|
-
0.75: revet ut av villfarelse
|
82
|
-
0.75: riv ut av villfarelse
|
83
73
|
0.75: pålitelighet
|
84
74
|
0.75: øverste nivå
|
85
75
|
0.75: brødrister
|
@@ -171,7 +161,6 @@
|
|
171
161
|
0.75: halvhjertede
|
172
162
|
0.75: skyldfri
|
173
163
|
0.75: pen
|
174
|
-
0.75: get-at-stand
|
175
164
|
0.75: famlende
|
176
165
|
0.75: influensa
|
177
166
|
0.75: finere
|
@@ -252,7 +241,6 @@
|
|
252
241
|
0.68: hedret
|
253
242
|
0.68: treffende
|
254
243
|
0.68: konstruktiv
|
255
|
-
0.68: kommer-at-stand
|
256
244
|
0.68: munter
|
257
245
|
0.66: produktiv
|
258
246
|
0.66: gledelig
|
@@ -262,11 +250,8 @@
|
|
262
250
|
0.64: verdig
|
263
251
|
0.63: god
|
264
252
|
0.62: ettergivende
|
265
|
-
0.62: lengtet-for
|
266
253
|
0.62: verdig
|
267
|
-
0.62: ord-splitting
|
268
254
|
0.62: lurer
|
269
|
-
0.62: ønsket-for
|
270
255
|
0.62: klokt
|
271
256
|
0.62: velvære
|
272
257
|
0.62: velprøvd
|
@@ -292,19 +277,16 @@
|
|
292
277
|
0.62: ubetenkelig
|
293
278
|
0.62: forståelig
|
294
279
|
0.62: kraftens
|
295
|
-
0.62: Ullr
|
296
280
|
0.62: sannferdig
|
297
281
|
0.62: tillitsfullt
|
298
282
|
0.62: klarert
|
299
283
|
0.62: overskride
|
300
284
|
0.62: ro
|
301
|
-
0.62: ro
|
302
285
|
0.62: trening
|
303
286
|
0.62: sporbar
|
304
287
|
0.62: sporbar
|
305
288
|
0.62: totaliteten
|
306
289
|
0.62: topper
|
307
|
-
0.62: fløtekaramell-nosed
|
308
290
|
0.62: forsagt
|
309
291
|
0.62: tre kvarter
|
310
292
|
0.62: vitnesbyrd
|
@@ -317,22 +299,18 @@
|
|
317
299
|
0.62: vellykket
|
318
300
|
0.62: stilistisk
|
319
301
|
0.62: iherdig
|
320
|
-
0.62: Stoppable
|
321
302
|
0.62: vekst
|
322
303
|
0.62: angitt
|
323
304
|
0.62: sfærisk
|
324
305
|
0.62: tale-endowed
|
325
|
-
0.62:
|
306
|
+
0.62: sparer
|
326
307
|
0.62: ønsket
|
327
308
|
0.62: soignée
|
328
|
-
0.62: SOIGNE
|
329
309
|
0.62: røykfritt
|
330
310
|
0.62: knusende
|
331
311
|
0.62: treghet
|
332
|
-
0.62: klapse-up
|
333
312
|
0.62: synde
|
334
313
|
0.62: forenklede
|
335
|
-
0.62: SID
|
336
314
|
0.62: sjokksikker
|
337
315
|
0.62: Shivaree
|
338
316
|
0.62: velskapt
|
@@ -373,7 +351,6 @@
|
|
373
351
|
0.62: oppladbart
|
374
352
|
0.62: betryggende
|
375
353
|
0.62: rimelig
|
376
|
-
0.62: fornyet stadfestelse
|
377
354
|
0.62: hev
|
378
355
|
0.62: kvantifiserbare
|
379
356
|
0.62: målrettet
|
@@ -411,7 +388,6 @@
|
|
411
388
|
0.62: overtrekk
|
412
389
|
0.62: overkompensere
|
413
390
|
0.62: rangeres foran
|
414
|
-
0.62: ut-og-ut
|
415
391
|
0.62: ortodoks
|
416
392
|
0.62: rikest
|
417
393
|
0.62: overdådig
|
@@ -430,7 +406,6 @@
|
|
430
406
|
0.62: danser
|
431
407
|
0.62: naboskap
|
432
408
|
0.62: naturalisering
|
433
|
-
0.62: naturalisering
|
434
409
|
0.62: narsissisme
|
435
410
|
0.62: naivt
|
436
411
|
0.62: foranderlig
|
@@ -674,6 +649,8 @@
|
|
674
649
|
0.62: ablativ
|
675
650
|
0.62: underdanig
|
676
651
|
0.62: abbed
|
652
|
+
0.62: lykke
|
653
|
+
0.62: lykkelig
|
677
654
|
0.58: håndgripelig
|
678
655
|
0.58: barmhjertig
|
679
656
|
0.58: verdighet
|
@@ -1046,7 +1023,7 @@
|
|
1046
1023
|
0.50: edel-mindedness
|
1047
1024
|
0.50: knuslete
|
1048
1025
|
0.50: pent
|
1049
|
-
0.50:
|
1026
|
+
0.50: nylaget
|
1050
1027
|
0.50: nøytralisert
|
1051
1028
|
0.50: nøytralisert
|
1052
1029
|
0.50: nevrobiologiske
|
@@ -4250,6 +4227,7 @@
|
|
4250
4227
|
0.20: ladet
|
4251
4228
|
0.20: godartet
|
4252
4229
|
0.20: våken
|
4230
|
+
0.10: lykkes
|
4253
4231
|
-0.20: stamme
|
4254
4232
|
-0.20: variasjon
|
4255
4233
|
-0.20: varians
|
@@ -4515,7 +4493,6 @@
|
|
4515
4493
|
-0.25: sukker-belagt
|
4516
4494
|
-0.25: kvelende
|
4517
4495
|
-0.25: kveles
|
4518
|
-
-0.25: lykkes
|
4519
4496
|
-0.25: underjordiske
|
4520
4497
|
-0.25: underjordiske
|
4521
4498
|
-0.25: innsynkning
|
@@ -8993,7 +8970,6 @@
|
|
8993
8970
|
-0.75: hode
|
8994
8971
|
-0.75: Hayseed
|
8995
8972
|
-0.75: Haredi
|
8996
|
-
-0.75: lykke
|
8997
8973
|
-0.75: Hamming
|
8998
8974
|
-0.75: skinke-handed
|
8999
8975
|
-0.75: skinke-fisted
|
data/lib/textmood.rb
CHANGED
@@ -7,6 +7,8 @@ else
|
|
7
7
|
Encoding.default_internal = Encoding::UTF_8
|
8
8
|
end
|
9
9
|
|
10
|
+
NORMALIZE_TO = 100
|
11
|
+
|
10
12
|
class TextMood
|
11
13
|
|
12
14
|
def initialize(options = {})
|
@@ -38,13 +40,21 @@ class TextMood
|
|
38
40
|
def score_text(text)
|
39
41
|
sentiment_total = 0.0
|
40
42
|
|
43
|
+
scores_added = 0
|
41
44
|
(@options[:start_ngram]..@options[:end_ngram]).each do |i|
|
42
45
|
ngrams(i, text.to_s).each do |token|
|
43
|
-
|
46
|
+
score = score_token(token)
|
47
|
+
unless score.nil?
|
48
|
+
sentiment_total += score
|
49
|
+
scores_added += 1
|
50
|
+
end
|
44
51
|
end
|
45
52
|
end
|
46
53
|
|
47
|
-
if @options[:
|
54
|
+
if @options[:normalize_score]
|
55
|
+
sentiment_total = normalize_score(sentiment_total, scores_added)
|
56
|
+
end
|
57
|
+
if @options[:normalize_output]
|
48
58
|
if sentiment_total > @options[:max_threshold]
|
49
59
|
1
|
50
60
|
elsif sentiment_total < @options[:min_threshold]
|
@@ -76,7 +86,7 @@ class TextMood
|
|
76
86
|
sentiment_value
|
77
87
|
else
|
78
88
|
puts "#{used_token}: nil" if @options[:debug]
|
79
|
-
|
89
|
+
nil
|
80
90
|
end
|
81
91
|
end
|
82
92
|
|
@@ -104,4 +114,9 @@ class TextMood
|
|
104
114
|
sentiment_values
|
105
115
|
end
|
106
116
|
|
117
|
+
def normalize_score(score, count)
|
118
|
+
factor = NORMALIZE_TO / count
|
119
|
+
(score * factor).to_i
|
120
|
+
end
|
121
|
+
|
107
122
|
end
|