textmood 0.0.4 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (5) hide show
  1. data/README.md +6 -3
  2. data/bin/textmood +28 -19
  3. data/lang/no_NB.txt +5 -29
  4. data/lib/textmood.rb +18 -3
  5. metadata +1 -1
data/README.md CHANGED
@@ -112,7 +112,10 @@ OPTIONAL options:
112
112
  (default 1). Note that this only makes sense if the
113
113
  sentiment file has tokens of similar N-gram length
114
114
 
115
- -n, --normalize Return 1 (positive), -1 (negative) or 0 (neutral)
115
+ -o, --normalize-output Return 1 (positive), -1 (negative) or 0 (neutral)
116
+ instead of the actual score. See also --min and --max.
117
+
118
+ --normalize-score Return the score normalized to an integer between +/- 100
116
119
  instead of the actual score. See also --min and --max.
117
120
 
118
121
  --min-threshold FLOAT Scores lower than this are considered negative when
@@ -149,8 +152,8 @@ and contain one colon-separated line per token, like so:
149
152
  0.875: well-situated
150
153
  0.6: well suited
151
154
  ```
152
- The score is to the left of the first ':', and everything to the right is the
153
- (potentially multi-word) token.
155
+ The score, which must be between -1.0 and 1.0, is to the left of the first ':',
156
+ and everything to the right is the (potentially multi-word) token.
154
157
 
155
158
  ## Contribute
156
159
  Including baseline word/N-gram scores for many different languages is one
data/bin/textmood CHANGED
@@ -56,35 +56,44 @@ opts_parser = OptionParser.new do |opts|
56
56
  end
57
57
  opts.separator ""
58
58
  opts.separator "OPTIONAL options:"
59
- opts.on("--start-ngram INTEGER", "The lowest word N-gram number to split the text into",
60
- "(default 1). Note that this only makes sense if the",
61
- "sentiment file has tokens of similar N-gram length") do |start_ngram|
62
- options[:start_ngram] = start_ngram.to_i
59
+ opts.on("-o", "--normalize-output", "Return 1 (positive), -1 (negative) or 0 (neutral)",
60
+ "instead of the actual score. See also --min and --max.") do |n|
61
+ options[:normalize_output] = true
63
62
  end
64
63
  opts.separator ""
65
- opts.on("--end-ngram INTEGER", "The highest word N-gram number to to split the text into",
66
- "(default 1). Note that this only makes sense if the",
67
- "sentiment file has tokens of similar N-gram length") do |end_ngram|
68
- options[:end_ngram] = end_ngram.to_i
69
- end
70
- opts.separator ""
71
- opts.on("-n", "--normalize", "Return 1 (positive), -1 (negative) or 0 (neutral)",
72
- "instead of the actual score. See also --min and --max.") do |n|
73
- options[:normalize] = true
64
+ opts.on("-s", "--normalize-score", "Tries to normalize the score to an integer between +/- 100",
65
+ "according to the number of tokens that were scored, making",
66
+ "it more feasible to compare scores between texts of different",
67
+ "length") do |ns|
68
+ options[:normalize_score] = true
74
69
  end
75
70
  opts.separator ""
76
- opts.on("--min-threshold FLOAT", "Scores lower than this are considered negative when",
77
- "using --normalize (default -0.5)") do |min|
71
+ opts.on("-i", "--min-threshold FLOAT", "Scores lower than this are considered negative when",
72
+ "using --normalize-output (default -0.5). Note that the",
73
+ "threshold is compared to the normalized score, if applicable") do |min|
78
74
  options[:min_threshold] = min.to_f
79
75
  end
80
76
  opts.separator ""
81
- opts.on("--max-threshold FLOAT", "Scores higher than this are considered positive when",
82
- "using --normalize (default 0.5)") do |max|
77
+ opts.on("-x", "--max-threshold FLOAT", "Scores higher than this are considered positive when",
78
+ "using --normalize-output (default 0.5). Note that the",
79
+ "threshold is compared to the normalized score, if applicable") do |max|
83
80
  options[:max_threshold] = max.to_f
84
81
  end
85
82
  opts.separator ""
86
- opts.on("-s", "--skip-symbols", "Do not include symbols file (emoticons etc.).",
87
- "Only applies when using -l/--language.") do |s|
83
+ opts.on("-b", "--start-ngram INTEGER", "The lowest word N-gram number to split the text into",
84
+ "(default 1). Note that this only makes sense if the",
85
+ "sentiment file has tokens of similar N-gram length") do |start_ngram|
86
+ options[:start_ngram] = start_ngram.to_i
87
+ end
88
+ opts.separator ""
89
+ opts.on("-e", "--end-ngram INTEGER", "The highest word N-gram number to split the text into",
90
+ "(default 1). Note that this only makes sense if the",
91
+ "sentiment file has tokens of similar N-gram length") do |end_ngram|
92
+ options[:end_ngram] = end_ngram.to_i
93
+ end
94
+ opts.separator ""
95
+ opts.on("-k", "--skip-symbols", "Do not include symbols file (emoticons etc.). Only applies",
96
+ "when using -l/--language.") do |s|
88
97
  options[:include_symbols] = false
89
98
  end
90
99
  opts.separator ""
data/lang/no_NB.txt CHANGED
@@ -8,21 +8,14 @@
8
8
  0.87: godt fast
9
9
  0.87: upretensiøs
10
10
  0.87: undervurdert
11
- 0.87: top-hulls
12
11
  0.87: terapeutisk
13
- 0.87: stirrer
14
- 0.87: utlagte masser
15
12
  0.87: selvrespekt
16
- 0.87: self-respektfull
17
- 0.87: selvrespekt
18
- 0.87: self-hensyn
19
13
  0.87: helsebringende
20
14
  0.87: rosenrød i kinnene
21
15
  0.87: utstråle
22
16
  0.87: utstråling
23
17
  0.87: velstand
24
18
  0.87: prinsipiell
25
- 0.87: ut-og-ytre
26
19
  0.87: moraliserende
27
20
  0.87: mesterstykke
28
21
  0.87: mesterlig
@@ -64,7 +57,6 @@
64
57
  0.75: fagmessig
65
58
  0.75: snedig
66
59
  0.75: helhet
67
- 0.75: godt gjennomtenkt av
68
60
  0.75: velproporsjonert
69
61
  0.75: godt bevart
70
62
  0.75: godt favoriserte
@@ -78,8 +70,6 @@
78
70
  0.75: infisert
79
71
  0.75: urettferdig
80
72
  0.75: kle
81
- 0.75: revet ut av villfarelse
82
- 0.75: riv ut av villfarelse
83
73
  0.75: pålitelighet
84
74
  0.75: øverste nivå
85
75
  0.75: brødrister
@@ -171,7 +161,6 @@
171
161
  0.75: halvhjertede
172
162
  0.75: skyldfri
173
163
  0.75: pen
174
- 0.75: get-at-stand
175
164
  0.75: famlende
176
165
  0.75: influensa
177
166
  0.75: finere
@@ -252,7 +241,6 @@
252
241
  0.68: hedret
253
242
  0.68: treffende
254
243
  0.68: konstruktiv
255
- 0.68: kommer-at-stand
256
244
  0.68: munter
257
245
  0.66: produktiv
258
246
  0.66: gledelig
@@ -262,11 +250,8 @@
262
250
  0.64: verdig
263
251
  0.63: god
264
252
  0.62: ettergivende
265
- 0.62: lengtet-for
266
253
  0.62: verdig
267
- 0.62: ord-splitting
268
254
  0.62: lurer
269
- 0.62: ønsket-for
270
255
  0.62: klokt
271
256
  0.62: velvære
272
257
  0.62: velprøvd
@@ -292,19 +277,16 @@
292
277
  0.62: ubetenkelig
293
278
  0.62: forståelig
294
279
  0.62: kraftens
295
- 0.62: Ullr
296
280
  0.62: sannferdig
297
281
  0.62: tillitsfullt
298
282
  0.62: klarert
299
283
  0.62: overskride
300
284
  0.62: ro
301
- 0.62: ro
302
285
  0.62: trening
303
286
  0.62: sporbar
304
287
  0.62: sporbar
305
288
  0.62: totaliteten
306
289
  0.62: topper
307
- 0.62: fløtekaramell-nosed
308
290
  0.62: forsagt
309
291
  0.62: tre kvarter
310
292
  0.62: vitnesbyrd
@@ -317,22 +299,18 @@
317
299
  0.62: vellykket
318
300
  0.62: stilistisk
319
301
  0.62: iherdig
320
- 0.62: Stoppable
321
302
  0.62: vekst
322
303
  0.62: angitt
323
304
  0.62: sfærisk
324
305
  0.62: tale-endowed
325
- 0.62: Sparer
306
+ 0.62: sparer
326
307
  0.62: ønsket
327
308
  0.62: soignée
328
- 0.62: SOIGNE
329
309
  0.62: røykfritt
330
310
  0.62: knusende
331
311
  0.62: treghet
332
- 0.62: klapse-up
333
312
  0.62: synde
334
313
  0.62: forenklede
335
- 0.62: SID
336
314
  0.62: sjokksikker
337
315
  0.62: Shivaree
338
316
  0.62: velskapt
@@ -373,7 +351,6 @@
373
351
  0.62: oppladbart
374
352
  0.62: betryggende
375
353
  0.62: rimelig
376
- 0.62: fornyet stadfestelse
377
354
  0.62: hev
378
355
  0.62: kvantifiserbare
379
356
  0.62: målrettet
@@ -411,7 +388,6 @@
411
388
  0.62: overtrekk
412
389
  0.62: overkompensere
413
390
  0.62: rangeres foran
414
- 0.62: ut-og-ut
415
391
  0.62: ortodoks
416
392
  0.62: rikest
417
393
  0.62: overdådig
@@ -430,7 +406,6 @@
430
406
  0.62: danser
431
407
  0.62: naboskap
432
408
  0.62: naturalisering
433
- 0.62: naturalisering
434
409
  0.62: narsissisme
435
410
  0.62: naivt
436
411
  0.62: foranderlig
@@ -674,6 +649,8 @@
674
649
  0.62: ablativ
675
650
  0.62: underdanig
676
651
  0.62: abbed
652
+ 0.62: lykke
653
+ 0.62: lykkelig
677
654
  0.58: håndgripelig
678
655
  0.58: barmhjertig
679
656
  0.58: verdighet
@@ -1046,7 +1023,7 @@
1046
1023
  0.50: edel-mindedness
1047
1024
  0.50: knuslete
1048
1025
  0.50: pent
1049
- 0.50: ny-laget
1026
+ 0.50: nylaget
1050
1027
  0.50: nøytralisert
1051
1028
  0.50: nøytralisert
1052
1029
  0.50: nevrobiologiske
@@ -4250,6 +4227,7 @@
4250
4227
  0.20: ladet
4251
4228
  0.20: godartet
4252
4229
  0.20: våken
4230
+ 0.10: lykkes
4253
4231
  -0.20: stamme
4254
4232
  -0.20: variasjon
4255
4233
  -0.20: varians
@@ -4515,7 +4493,6 @@
4515
4493
  -0.25: sukker-belagt
4516
4494
  -0.25: kvelende
4517
4495
  -0.25: kveles
4518
- -0.25: lykkes
4519
4496
  -0.25: underjordiske
4520
4497
  -0.25: underjordiske
4521
4498
  -0.25: innsynkning
@@ -8993,7 +8970,6 @@
8993
8970
  -0.75: hode
8994
8971
  -0.75: Hayseed
8995
8972
  -0.75: Haredi
8996
- -0.75: lykke
8997
8973
  -0.75: Hamming
8998
8974
  -0.75: skinke-handed
8999
8975
  -0.75: skinke-fisted
data/lib/textmood.rb CHANGED
@@ -7,6 +7,8 @@ else
7
7
  Encoding.default_internal = Encoding::UTF_8
8
8
  end
9
9
 
10
+ NORMALIZE_TO = 100
11
+
10
12
  class TextMood
11
13
 
12
14
  def initialize(options = {})
@@ -38,13 +40,21 @@ class TextMood
38
40
  def score_text(text)
39
41
  sentiment_total = 0.0
40
42
 
43
+ scores_added = 0
41
44
  (@options[:start_ngram]..@options[:end_ngram]).each do |i|
42
45
  ngrams(i, text.to_s).each do |token|
43
- sentiment_total += score_token(token)
46
+ score = score_token(token)
47
+ unless score.nil?
48
+ sentiment_total += score
49
+ scores_added += 1
50
+ end
44
51
  end
45
52
  end
46
53
 
47
- if @options[:normalize]
54
+ if @options[:normalize_score]
55
+ sentiment_total = normalize_score(sentiment_total, scores_added)
56
+ end
57
+ if @options[:normalize_output]
48
58
  if sentiment_total > @options[:max_threshold]
49
59
  1
50
60
  elsif sentiment_total < @options[:min_threshold]
@@ -76,7 +86,7 @@ class TextMood
76
86
  sentiment_value
77
87
  else
78
88
  puts "#{used_token}: nil" if @options[:debug]
79
- 0.0
89
+ nil
80
90
  end
81
91
  end
82
92
 
@@ -104,4 +114,9 @@ class TextMood
104
114
  sentiment_values
105
115
  end
106
116
 
117
+ def normalize_score(score, count)
118
+ factor = NORMALIZE_TO / count
119
+ (score * factor).to_i
120
+ end
121
+
107
122
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textmood
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors: