textmood 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (5) hide show
  1. data/README.md +6 -3
  2. data/bin/textmood +28 -19
  3. data/lang/no_NB.txt +5 -29
  4. data/lib/textmood.rb +18 -3
  5. metadata +1 -1
data/README.md CHANGED
@@ -112,7 +112,10 @@ OPTIONAL options:
112
112
  (default 1). Note that this only makes sense if the
113
113
  sentiment file has tokens of similar N-gram length
114
114
 
115
- -n, --normalize Return 1 (positive), -1 (negative) or 0 (neutral)
115
+ -o, --normalize-output Return 1 (positive), -1 (negative) or 0 (neutral)
116
+ instead of the actual score. See also --min and --max.
117
+
118
+ -s, --normalize-score Return the score normalized by scored-token count to an integer between +/- 100
116
119
  instead of the actual score. See also --min and --max.
117
120
 
118
121
  --min-threshold FLOAT Scores lower than this are considered negative when
@@ -149,8 +152,8 @@ and contain one colon-separated line per token, like so:
149
152
  0.875: well-situated
150
153
  0.6: well suited
151
154
  ```
152
- The score is to the left of the first ':', and everything to the right is the
153
- (potentially multi-word) token.
155
+ The score, which must be between -1.0 and 1.0, is to the left of the first ':',
156
+ and everything to the right is the (potentially multi-word) token.
154
157
 
155
158
  ## Contribute
156
159
  Including baseline word/N-gram scores for many different languages is one
data/bin/textmood CHANGED
@@ -56,35 +56,44 @@ opts_parser = OptionParser.new do |opts|
56
56
  end
57
57
  opts.separator ""
58
58
  opts.separator "OPTIONAL options:"
59
- opts.on("--start-ngram INTEGER", "The lowest word N-gram number to split the text into",
60
- "(default 1). Note that this only makes sense if the",
61
- "sentiment file has tokens of similar N-gram length") do |start_ngram|
62
- options[:start_ngram] = start_ngram.to_i
59
+ opts.on("-o", "--normalize-output", "Return 1 (positive), -1 (negative) or 0 (neutral)",
60
+ "instead of the actual score. See also --min and --max.") do |n|
61
+ options[:normalize_output] = true
63
62
  end
64
63
  opts.separator ""
65
- opts.on("--end-ngram INTEGER", "The highest word N-gram number to to split the text into",
66
- "(default 1). Note that this only makes sense if the",
67
- "sentiment file has tokens of similar N-gram length") do |end_ngram|
68
- options[:end_ngram] = end_ngram.to_i
69
- end
70
- opts.separator ""
71
- opts.on("-n", "--normalize", "Return 1 (positive), -1 (negative) or 0 (neutral)",
72
- "instead of the actual score. See also --min and --max.") do |n|
73
- options[:normalize] = true
64
+ opts.on("-s", "--normalize-score", "Tries to normalize the score to an integer between +/- 100",
65
+ "according to the number of tokens that were scored, making",
66
+ "it more feasible to compare scores between texts of different",
67
+ "length") do |ns|
68
+ options[:normalize_score] = true
74
69
  end
75
70
  opts.separator ""
76
- opts.on("--min-threshold FLOAT", "Scores lower than this are considered negative when",
77
- "using --normalize (default -0.5)") do |min|
71
+ opts.on("-i", "--min-threshold FLOAT", "Scores lower than this are considered negative when",
72
+ "using --normalize-output (default -0.5). Note that the",
73
+ "threshold is compared to the normalized score, if applicable") do |min|
78
74
  options[:min_threshold] = min.to_f
79
75
  end
80
76
  opts.separator ""
81
- opts.on("--max-threshold FLOAT", "Scores higher than this are considered positive when",
82
- "using --normalize (default 0.5)") do |max|
77
+ opts.on("-x", "--max-threshold FLOAT", "Scores higher than this are considered positive when",
78
+ "using --normalize-output (default 0.5). Note that the",
79
+ "threshold is compared to the normalized score, if applicable") do |max|
83
80
  options[:max_threshold] = max.to_f
84
81
  end
85
82
  opts.separator ""
86
- opts.on("-s", "--skip-symbols", "Do not include symbols file (emoticons etc.).",
87
- "Only applies when using -l/--language.") do |s|
83
+ opts.on("-b", "--start-ngram INTEGER", "The lowest word N-gram number to split the text into",
84
+ "(default 1). Note that this only makes sense if the",
85
+ "sentiment file has tokens of similar N-gram length") do |start_ngram|
86
+ options[:start_ngram] = start_ngram.to_i
87
+ end
88
+ opts.separator ""
89
+ opts.on("-e", "--end-ngram INTEGER", "The highest word N-gram number to split the text into",
90
+ "(default 1). Note that this only makes sense if the",
91
+ "sentiment file has tokens of similar N-gram length") do |end_ngram|
92
+ options[:end_ngram] = end_ngram.to_i
93
+ end
94
+ opts.separator ""
95
+ opts.on("-k", "--skip-symbols", "Do not include symbols file (emoticons etc.). Only applies",
96
+ "when using -l/--language.") do |s|
88
97
  options[:include_symbols] = false
89
98
  end
90
99
  opts.separator ""
data/lang/no_NB.txt CHANGED
@@ -8,21 +8,14 @@
8
8
  0.87: godt fast
9
9
  0.87: upretensiøs
10
10
  0.87: undervurdert
11
- 0.87: top-hulls
12
11
  0.87: terapeutisk
13
- 0.87: stirrer
14
- 0.87: utlagte masser
15
12
  0.87: selvrespekt
16
- 0.87: self-respektfull
17
- 0.87: selvrespekt
18
- 0.87: self-hensyn
19
13
  0.87: helsebringende
20
14
  0.87: rosenrød i kinnene
21
15
  0.87: utstråle
22
16
  0.87: utstråling
23
17
  0.87: velstand
24
18
  0.87: prinsipiell
25
- 0.87: ut-og-ytre
26
19
  0.87: moraliserende
27
20
  0.87: mesterstykke
28
21
  0.87: mesterlig
@@ -64,7 +57,6 @@
64
57
  0.75: fagmessig
65
58
  0.75: snedig
66
59
  0.75: helhet
67
- 0.75: godt gjennomtenkt av
68
60
  0.75: velproporsjonert
69
61
  0.75: godt bevart
70
62
  0.75: godt favoriserte
@@ -78,8 +70,6 @@
78
70
  0.75: infisert
79
71
  0.75: urettferdig
80
72
  0.75: kle
81
- 0.75: revet ut av villfarelse
82
- 0.75: riv ut av villfarelse
83
73
  0.75: pålitelighet
84
74
  0.75: øverste nivå
85
75
  0.75: brødrister
@@ -171,7 +161,6 @@
171
161
  0.75: halvhjertede
172
162
  0.75: skyldfri
173
163
  0.75: pen
174
- 0.75: get-at-stand
175
164
  0.75: famlende
176
165
  0.75: influensa
177
166
  0.75: finere
@@ -252,7 +241,6 @@
252
241
  0.68: hedret
253
242
  0.68: treffende
254
243
  0.68: konstruktiv
255
- 0.68: kommer-at-stand
256
244
  0.68: munter
257
245
  0.66: produktiv
258
246
  0.66: gledelig
@@ -262,11 +250,8 @@
262
250
  0.64: verdig
263
251
  0.63: god
264
252
  0.62: ettergivende
265
- 0.62: lengtet-for
266
253
  0.62: verdig
267
- 0.62: ord-splitting
268
254
  0.62: lurer
269
- 0.62: ønsket-for
270
255
  0.62: klokt
271
256
  0.62: velvære
272
257
  0.62: velprøvd
@@ -292,19 +277,16 @@
292
277
  0.62: ubetenkelig
293
278
  0.62: forståelig
294
279
  0.62: kraftens
295
- 0.62: Ullr
296
280
  0.62: sannferdig
297
281
  0.62: tillitsfullt
298
282
  0.62: klarert
299
283
  0.62: overskride
300
284
  0.62: ro
301
- 0.62: ro
302
285
  0.62: trening
303
286
  0.62: sporbar
304
287
  0.62: sporbar
305
288
  0.62: totaliteten
306
289
  0.62: topper
307
- 0.62: fløtekaramell-nosed
308
290
  0.62: forsagt
309
291
  0.62: tre kvarter
310
292
  0.62: vitnesbyrd
@@ -317,22 +299,18 @@
317
299
  0.62: vellykket
318
300
  0.62: stilistisk
319
301
  0.62: iherdig
320
- 0.62: Stoppable
321
302
  0.62: vekst
322
303
  0.62: angitt
323
304
  0.62: sfærisk
324
305
  0.62: tale-endowed
325
- 0.62: Sparer
306
+ 0.62: sparer
326
307
  0.62: ønsket
327
308
  0.62: soignée
328
- 0.62: SOIGNE
329
309
  0.62: røykfritt
330
310
  0.62: knusende
331
311
  0.62: treghet
332
- 0.62: klapse-up
333
312
  0.62: synde
334
313
  0.62: forenklede
335
- 0.62: SID
336
314
  0.62: sjokksikker
337
315
  0.62: Shivaree
338
316
  0.62: velskapt
@@ -373,7 +351,6 @@
373
351
  0.62: oppladbart
374
352
  0.62: betryggende
375
353
  0.62: rimelig
376
- 0.62: fornyet stadfestelse
377
354
  0.62: hev
378
355
  0.62: kvantifiserbare
379
356
  0.62: målrettet
@@ -411,7 +388,6 @@
411
388
  0.62: overtrekk
412
389
  0.62: overkompensere
413
390
  0.62: rangeres foran
414
- 0.62: ut-og-ut
415
391
  0.62: ortodoks
416
392
  0.62: rikest
417
393
  0.62: overdådig
@@ -430,7 +406,6 @@
430
406
  0.62: danser
431
407
  0.62: naboskap
432
408
  0.62: naturalisering
433
- 0.62: naturalisering
434
409
  0.62: narsissisme
435
410
  0.62: naivt
436
411
  0.62: foranderlig
@@ -674,6 +649,8 @@
674
649
  0.62: ablativ
675
650
  0.62: underdanig
676
651
  0.62: abbed
652
+ 0.62: lykke
653
+ 0.62: lykkelig
677
654
  0.58: håndgripelig
678
655
  0.58: barmhjertig
679
656
  0.58: verdighet
@@ -1046,7 +1023,7 @@
1046
1023
  0.50: edel-mindedness
1047
1024
  0.50: knuslete
1048
1025
  0.50: pent
1049
- 0.50: ny-laget
1026
+ 0.50: nylaget
1050
1027
  0.50: nøytralisert
1051
1028
  0.50: nøytralisert
1052
1029
  0.50: nevrobiologiske
@@ -4250,6 +4227,7 @@
4250
4227
  0.20: ladet
4251
4228
  0.20: godartet
4252
4229
  0.20: våken
4230
+ 0.10: lykkes
4253
4231
  -0.20: stamme
4254
4232
  -0.20: variasjon
4255
4233
  -0.20: varians
@@ -4515,7 +4493,6 @@
4515
4493
  -0.25: sukker-belagt
4516
4494
  -0.25: kvelende
4517
4495
  -0.25: kveles
4518
- -0.25: lykkes
4519
4496
  -0.25: underjordiske
4520
4497
  -0.25: underjordiske
4521
4498
  -0.25: innsynkning
@@ -8993,7 +8970,6 @@
8993
8970
  -0.75: hode
8994
8971
  -0.75: Hayseed
8995
8972
  -0.75: Haredi
8996
- -0.75: lykke
8997
8973
  -0.75: Hamming
8998
8974
  -0.75: skinke-handed
8999
8975
  -0.75: skinke-fisted
data/lib/textmood.rb CHANGED
@@ -7,6 +7,8 @@ else
7
7
  Encoding.default_internal = Encoding::UTF_8
8
8
  end
9
9
 
10
+ NORMALIZE_TO = 100
11
+
10
12
  class TextMood
11
13
 
12
14
  def initialize(options = {})
@@ -38,13 +40,21 @@ class TextMood
38
40
  def score_text(text)
39
41
  sentiment_total = 0.0
40
42
 
43
+ scores_added = 0
41
44
  (@options[:start_ngram]..@options[:end_ngram]).each do |i|
42
45
  ngrams(i, text.to_s).each do |token|
43
- sentiment_total += score_token(token)
46
+ score = score_token(token)
47
+ unless score.nil?
48
+ sentiment_total += score
49
+ scores_added += 1
50
+ end
44
51
  end
45
52
  end
46
53
 
47
- if @options[:normalize]
54
+ if @options[:normalize_score]
55
+ sentiment_total = normalize_score(sentiment_total, scores_added)
56
+ end
57
+ if @options[:normalize_output]
48
58
  if sentiment_total > @options[:max_threshold]
49
59
  1
50
60
  elsif sentiment_total < @options[:min_threshold]
@@ -76,7 +86,7 @@ class TextMood
76
86
  sentiment_value
77
87
  else
78
88
  puts "#{used_token}: nil" if @options[:debug]
79
- 0.0
89
+ nil
80
90
  end
81
91
  end
82
92
 
@@ -104,4 +114,9 @@ class TextMood
104
114
  sentiment_values
105
115
  end
106
116
 
117
+ def normalize_score(score, count)
118
+ factor = NORMALIZE_TO / count
119
+ (score * factor).to_i
120
+ end
121
+
107
122
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textmood
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors: