textmood 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +6 -3
- data/bin/textmood +28 -19
- data/lang/no_NB.txt +5 -29
- data/lib/textmood.rb +18 -3
- metadata +1 -1
data/README.md
CHANGED
@@ -112,7 +112,10 @@ OPTIONAL options:
|
|
112
112
|
(default 1). Note that this only makes sense if the
|
113
113
|
sentiment file has tokens of similar N-gram length
|
114
114
|
|
115
|
-
-n, --normalize
|
115
|
+
-o, --normalize-output Return 1 (positive), -1 (negative) or 0 (neutral)
|
116
|
+
instead of the actual score. See also --min and --max.
|
117
|
+
|
118
|
+
-s, --normalize-score Tries to normalize the score to an integer between +/- 100
|
116
119
|
instead of the actual score. See also --min and --max.
|
117
120
|
|
118
121
|
--min-threshold FLOAT Scores lower than this are considered negative when
|
@@ -149,8 +152,8 @@ and contain one colon-separated line per token, like so:
|
|
149
152
|
0.875: well-situated
|
150
153
|
0.6: well suited
|
151
154
|
```
|
152
|
-
The score is to the left of the first ':',
|
153
|
-
(potentially multi-word) token.
|
155
|
+
The score, which must be between -1.0 and 1.0, is to the left of the first ':',
|
156
|
+
and everything to the right is the (potentially multi-word) token.
|
154
157
|
|
155
158
|
## Contribute
|
156
159
|
Including baseline word/N-gram scores for many different languages is one
|
data/bin/textmood
CHANGED
@@ -56,35 +56,44 @@ opts_parser = OptionParser.new do |opts|
|
|
56
56
|
end
|
57
57
|
opts.separator ""
|
58
58
|
opts.separator "OPTIONAL options:"
|
59
|
-
opts.on("
|
60
|
-
|
61
|
-
|
62
|
-
options[:start_ngram] = start_ngram.to_i
|
59
|
+
opts.on("-o", "--normalize-output", "Return 1 (positive), -1 (negative) or 0 (neutral)",
|
60
|
+
"instead of the actual score. See also --min and --max.") do |n|
|
61
|
+
options[:normalize_output] = true
|
63
62
|
end
|
64
63
|
opts.separator ""
|
65
|
-
opts.on("
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
opts.separator ""
|
71
|
-
opts.on("-n", "--normalize", "Return 1 (positive), -1 (negative) or 0 (neutral)",
|
72
|
-
"instead of the actual score. See also --min and --max.") do |n|
|
73
|
-
options[:normalize] = true
|
64
|
+
opts.on("-s", "--normalize-score", "Tries to normalize the score to an integer between +/- 100",
|
65
|
+
"according to the number of tokens that were scored, making",
|
66
|
+
"it more feasible to compare scores between texts of different",
|
67
|
+
"length") do |ns|
|
68
|
+
options[:normalize_score] = true
|
74
69
|
end
|
75
70
|
opts.separator ""
|
76
|
-
opts.on("--min-threshold FLOAT", "Scores lower than this are considered negative when",
|
77
|
-
"using --normalize (default
|
71
|
+
opts.on("-i", "--min-threshold FLOAT", "Scores lower than this are considered negative when",
|
72
|
+
"using --normalize-output (default 0.5). Note that the",
|
73
|
+
"threshold is compared to the normalized score, if applicable") do |min|
|
78
74
|
options[:min_threshold] = min.to_f
|
79
75
|
end
|
80
76
|
opts.separator ""
|
81
|
-
opts.on("--max-threshold FLOAT", "Scores higher than this are considered positive when",
|
82
|
-
"using --normalize (default 0.5)
|
77
|
+
opts.on("-x", "--max-threshold FLOAT", "Scores higher than this are considered positive when",
|
78
|
+
"using --normalize-output (default 0.5). Note that the",
|
79
|
+
"threshold is compared to the normalized score, if applicable") do |max|
|
83
80
|
options[:max_threshold] = max.to_f
|
84
81
|
end
|
85
82
|
opts.separator ""
|
86
|
-
opts.on("-
|
87
|
-
|
83
|
+
opts.on("-b", "--start-ngram INTEGER", "The lowest word N-gram number to split the text into",
|
84
|
+
"(default 1). Note that this only makes sense if the",
|
85
|
+
"sentiment file has tokens of similar N-gram length") do |start_ngram|
|
86
|
+
options[:start_ngram] = start_ngram.to_i
|
87
|
+
end
|
88
|
+
opts.separator ""
|
89
|
+
opts.on("-e", "--end-ngram INTEGER", "The highest word N-gram number to to split the text into",
|
90
|
+
"(default 1). Note that this only makes sense if the",
|
91
|
+
"sentiment file has tokens of similar N-gram length") do |end_ngram|
|
92
|
+
options[:end_ngram] = end_ngram.to_i
|
93
|
+
end
|
94
|
+
opts.separator ""
|
95
|
+
opts.on("-k", "--skip-symbols", "Do not include symbols file (emoticons etc.). Only applies",
|
96
|
+
"when using -l/--language.") do |s|
|
88
97
|
options[:include_symbols] = false
|
89
98
|
end
|
90
99
|
opts.separator ""
|
data/lang/no_NB.txt
CHANGED
@@ -8,21 +8,14 @@
|
|
8
8
|
0.87: godt fast
|
9
9
|
0.87: upretensiøs
|
10
10
|
0.87: undervurdert
|
11
|
-
0.87: top-hulls
|
12
11
|
0.87: terapeutisk
|
13
|
-
0.87: stirrer
|
14
|
-
0.87: utlagte masser
|
15
12
|
0.87: selvrespekt
|
16
|
-
0.87: self-respektfull
|
17
|
-
0.87: selvrespekt
|
18
|
-
0.87: self-hensyn
|
19
13
|
0.87: helsebringende
|
20
14
|
0.87: rosenrød i kinnene
|
21
15
|
0.87: utstråle
|
22
16
|
0.87: utstråling
|
23
17
|
0.87: velstand
|
24
18
|
0.87: prinsipiell
|
25
|
-
0.87: ut-og-ytre
|
26
19
|
0.87: moraliserende
|
27
20
|
0.87: mesterstykke
|
28
21
|
0.87: mesterlig
|
@@ -64,7 +57,6 @@
|
|
64
57
|
0.75: fagmessig
|
65
58
|
0.75: snedig
|
66
59
|
0.75: helhet
|
67
|
-
0.75: godt gjennomtenkt av
|
68
60
|
0.75: velproporsjonert
|
69
61
|
0.75: godt bevart
|
70
62
|
0.75: godt favoriserte
|
@@ -78,8 +70,6 @@
|
|
78
70
|
0.75: infisert
|
79
71
|
0.75: urettferdig
|
80
72
|
0.75: kle
|
81
|
-
0.75: revet ut av villfarelse
|
82
|
-
0.75: riv ut av villfarelse
|
83
73
|
0.75: pålitelighet
|
84
74
|
0.75: øverste nivå
|
85
75
|
0.75: brødrister
|
@@ -171,7 +161,6 @@
|
|
171
161
|
0.75: halvhjertede
|
172
162
|
0.75: skyldfri
|
173
163
|
0.75: pen
|
174
|
-
0.75: get-at-stand
|
175
164
|
0.75: famlende
|
176
165
|
0.75: influensa
|
177
166
|
0.75: finere
|
@@ -252,7 +241,6 @@
|
|
252
241
|
0.68: hedret
|
253
242
|
0.68: treffende
|
254
243
|
0.68: konstruktiv
|
255
|
-
0.68: kommer-at-stand
|
256
244
|
0.68: munter
|
257
245
|
0.66: produktiv
|
258
246
|
0.66: gledelig
|
@@ -262,11 +250,8 @@
|
|
262
250
|
0.64: verdig
|
263
251
|
0.63: god
|
264
252
|
0.62: ettergivende
|
265
|
-
0.62: lengtet-for
|
266
253
|
0.62: verdig
|
267
|
-
0.62: ord-splitting
|
268
254
|
0.62: lurer
|
269
|
-
0.62: ønsket-for
|
270
255
|
0.62: klokt
|
271
256
|
0.62: velvære
|
272
257
|
0.62: velprøvd
|
@@ -292,19 +277,16 @@
|
|
292
277
|
0.62: ubetenkelig
|
293
278
|
0.62: forståelig
|
294
279
|
0.62: kraftens
|
295
|
-
0.62: Ullr
|
296
280
|
0.62: sannferdig
|
297
281
|
0.62: tillitsfullt
|
298
282
|
0.62: klarert
|
299
283
|
0.62: overskride
|
300
284
|
0.62: ro
|
301
|
-
0.62: ro
|
302
285
|
0.62: trening
|
303
286
|
0.62: sporbar
|
304
287
|
0.62: sporbar
|
305
288
|
0.62: totaliteten
|
306
289
|
0.62: topper
|
307
|
-
0.62: fløtekaramell-nosed
|
308
290
|
0.62: forsagt
|
309
291
|
0.62: tre kvarter
|
310
292
|
0.62: vitnesbyrd
|
@@ -317,22 +299,18 @@
|
|
317
299
|
0.62: vellykket
|
318
300
|
0.62: stilistisk
|
319
301
|
0.62: iherdig
|
320
|
-
0.62: Stoppable
|
321
302
|
0.62: vekst
|
322
303
|
0.62: angitt
|
323
304
|
0.62: sfærisk
|
324
305
|
0.62: tale-endowed
|
325
|
-
0.62:
|
306
|
+
0.62: sparer
|
326
307
|
0.62: ønsket
|
327
308
|
0.62: soignée
|
328
|
-
0.62: SOIGNE
|
329
309
|
0.62: røykfritt
|
330
310
|
0.62: knusende
|
331
311
|
0.62: treghet
|
332
|
-
0.62: klapse-up
|
333
312
|
0.62: synde
|
334
313
|
0.62: forenklede
|
335
|
-
0.62: SID
|
336
314
|
0.62: sjokksikker
|
337
315
|
0.62: Shivaree
|
338
316
|
0.62: velskapt
|
@@ -373,7 +351,6 @@
|
|
373
351
|
0.62: oppladbart
|
374
352
|
0.62: betryggende
|
375
353
|
0.62: rimelig
|
376
|
-
0.62: fornyet stadfestelse
|
377
354
|
0.62: hev
|
378
355
|
0.62: kvantifiserbare
|
379
356
|
0.62: målrettet
|
@@ -411,7 +388,6 @@
|
|
411
388
|
0.62: overtrekk
|
412
389
|
0.62: overkompensere
|
413
390
|
0.62: rangeres foran
|
414
|
-
0.62: ut-og-ut
|
415
391
|
0.62: ortodoks
|
416
392
|
0.62: rikest
|
417
393
|
0.62: overdådig
|
@@ -430,7 +406,6 @@
|
|
430
406
|
0.62: danser
|
431
407
|
0.62: naboskap
|
432
408
|
0.62: naturalisering
|
433
|
-
0.62: naturalisering
|
434
409
|
0.62: narsissisme
|
435
410
|
0.62: naivt
|
436
411
|
0.62: foranderlig
|
@@ -674,6 +649,8 @@
|
|
674
649
|
0.62: ablativ
|
675
650
|
0.62: underdanig
|
676
651
|
0.62: abbed
|
652
|
+
0.62: lykke
|
653
|
+
0.62: lykkelig
|
677
654
|
0.58: håndgripelig
|
678
655
|
0.58: barmhjertig
|
679
656
|
0.58: verdighet
|
@@ -1046,7 +1023,7 @@
|
|
1046
1023
|
0.50: edel-mindedness
|
1047
1024
|
0.50: knuslete
|
1048
1025
|
0.50: pent
|
1049
|
-
0.50:
|
1026
|
+
0.50: nylaget
|
1050
1027
|
0.50: nøytralisert
|
1051
1028
|
0.50: nøytralisert
|
1052
1029
|
0.50: nevrobiologiske
|
@@ -4250,6 +4227,7 @@
|
|
4250
4227
|
0.20: ladet
|
4251
4228
|
0.20: godartet
|
4252
4229
|
0.20: våken
|
4230
|
+
0.10: lykkes
|
4253
4231
|
-0.20: stamme
|
4254
4232
|
-0.20: variasjon
|
4255
4233
|
-0.20: varians
|
@@ -4515,7 +4493,6 @@
|
|
4515
4493
|
-0.25: sukker-belagt
|
4516
4494
|
-0.25: kvelende
|
4517
4495
|
-0.25: kveles
|
4518
|
-
-0.25: lykkes
|
4519
4496
|
-0.25: underjordiske
|
4520
4497
|
-0.25: underjordiske
|
4521
4498
|
-0.25: innsynkning
|
@@ -8993,7 +8970,6 @@
|
|
8993
8970
|
-0.75: hode
|
8994
8971
|
-0.75: Hayseed
|
8995
8972
|
-0.75: Haredi
|
8996
|
-
-0.75: lykke
|
8997
8973
|
-0.75: Hamming
|
8998
8974
|
-0.75: skinke-handed
|
8999
8975
|
-0.75: skinke-fisted
|
data/lib/textmood.rb
CHANGED
@@ -7,6 +7,8 @@ else
|
|
7
7
|
Encoding.default_internal = Encoding::UTF_8
|
8
8
|
end
|
9
9
|
|
10
|
+
NORMALIZE_TO = 100
|
11
|
+
|
10
12
|
class TextMood
|
11
13
|
|
12
14
|
def initialize(options = {})
|
@@ -38,13 +40,21 @@ class TextMood
|
|
38
40
|
def score_text(text)
|
39
41
|
sentiment_total = 0.0
|
40
42
|
|
43
|
+
scores_added = 0
|
41
44
|
(@options[:start_ngram]..@options[:end_ngram]).each do |i|
|
42
45
|
ngrams(i, text.to_s).each do |token|
|
43
|
-
|
46
|
+
score = score_token(token)
|
47
|
+
unless score.nil?
|
48
|
+
sentiment_total += score
|
49
|
+
scores_added += 1
|
50
|
+
end
|
44
51
|
end
|
45
52
|
end
|
46
53
|
|
47
|
-
if @options[:
|
54
|
+
if @options[:normalize_score]
|
55
|
+
sentiment_total = normalize_score(sentiment_total, scores_added)
|
56
|
+
end
|
57
|
+
if @options[:normalize_output]
|
48
58
|
if sentiment_total > @options[:max_threshold]
|
49
59
|
1
|
50
60
|
elsif sentiment_total < @options[:min_threshold]
|
@@ -76,7 +86,7 @@ class TextMood
|
|
76
86
|
sentiment_value
|
77
87
|
else
|
78
88
|
puts "#{used_token}: nil" if @options[:debug]
|
79
|
-
|
89
|
+
nil
|
80
90
|
end
|
81
91
|
end
|
82
92
|
|
@@ -104,4 +114,9 @@ class TextMood
|
|
104
114
|
sentiment_values
|
105
115
|
end
|
106
116
|
|
117
|
+
# Scales a raw sentiment total by NORMALIZE_TO / count so that texts of
# different length yield comparable scores.
#
# The result stays within -NORMALIZE_TO..NORMALIZE_TO only if every token
# score lies within -1.0..1.0 (assumed from the sentiment file format;
# not enforced here).
#
# @param score [Numeric] sum of the individual token scores
# @param count [Integer] number of tokens that contributed a score
# @return [Integer] the scaled score, truncated toward zero; 0 when no
#   token was scored
def normalize_score(score, count)
  # No token matched: there is nothing to scale, and dividing by zero
  # would raise ZeroDivisionError.
  return 0 if count.zero?

  # Divide in floating point. Integer division would truncate the scaling
  # factor (100 / 3 => 33) and collapse it to 0 once count > NORMALIZE_TO.
  (score * NORMALIZE_TO / count.to_f).to_i
end
|
121
|
+
|
107
122
|
end
|