RubyGems - textmood - Versions diffs - 0.0.4 → 0.0.5 - Mend

textmood 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

data/README.md CHANGED

@@ -112,7 +112,10 @@ OPTIONAL options:
                                      (default 1). Note that this only makes sense if the
                                      sentiment file has tokens of similar N-gram length
-    -n, --normalize                  Return 1 (positive), -1 (negative) or 0 (neutral)
+    -n, --normalize-output           Return 1 (positive), -1 (negative) or 0 (neutral)
+                                     instead of the actual score. See also --min and --max.
+        --normalize-score            Return 1 (positive), -1 (negative) or 0 (neutral)
                                      instead of the actual score. See also --min and --max.
         --min-threshold FLOAT        Scores lower than this are considered negative when
@@ -149,8 +152,8 @@ and contain one colon-separated line per token, like so:
 0.875: well-situated
 0.6: well suited
 ```
-The score is to the left of the first ':', and everything to the right is the
-(potentially multi-word) token.
+The score, which must be between -1.0 and 1.0, is to the left of the first ':',
+and everything to the right is the (potentially multi-word) token.
 ## Contribute
 Including baseline word/N-gram scores for many different languages is one

data/bin/textmood CHANGED

@@ -56,35 +56,44 @@ opts_parser = OptionParser.new do |opts|
   end
   opts.separator ""
   opts.separator "OPTIONAL options:"
-  opts.on("--start-ngram INTEGER", "The lowest word N-gram number to split the text into",
-                                   "(default 1). Note that this only makes sense if the",
-                                   "sentiment file has tokens of similar N-gram length") do |start_ngram|
-    options[:start_ngram] = start_ngram.to_i
+  opts.on("-o", "--normalize-output", "Return 1 (positive), -1 (negative) or 0 (neutral)",
+                                      "instead of the actual score. See also --min and --max.") do |n|
+    options[:normalize_output] = true
   end
   opts.separator ""
-  opts.on("--end-ngram INTEGER", "The highest word N-gram number to to split the text into",
-                                 "(default 1). Note that this only makes sense if the",
-                                 "sentiment file has tokens of similar N-gram length") do |end_ngram|
-    options[:end_ngram] = end_ngram.to_i
-  end
-  opts.separator ""
-  opts.on("-n", "--normalize", "Return 1 (positive), -1 (negative) or 0 (neutral)",
-                               "instead of the actual score. See also --min and --max.") do |n|
-    options[:normalize] = true
+  opts.on("-s", "--normalize-score", "Tries to normalize the score to an integer between +/- 100",
+                                     "according to the number of tokens that were scored, making",
+                                     "it more feasible to compare scores between texts of different",
+                                     "length") do |ns|
+    options[:normalize_score] = true
   end
   opts.separator ""
-  opts.on("--min-threshold FLOAT", "Scores lower than this are considered negative when",
-                                   "using --normalize (default -0.5)") do |min|
+  opts.on("-i", "--min-threshold FLOAT", "Scores lower than this are considered negative when",
+                                   "using --normalize-output (default 0.5). Note that the",
+                                   "threshold is compared to the normalized score, if applicable") do |min|
     options[:min_threshold] = min.to_f
   end
   opts.separator ""
-  opts.on("--max-threshold FLOAT", "Scores higher than this are considered positive when",
-                                   "using --normalize (default 0.5)") do |max|
+  opts.on("-x", "--max-threshold FLOAT", "Scores higher than this are considered positive when",
+                                   "using --normalize-output (default 0.5). Note that the",
+                                   "threshold is compared to the normalized score, if applicable") do |max|
     options[:max_threshold] = max.to_f
   end
   opts.separator ""
-  opts.on("-s", "--skip-symbols", "Do not include symbols file (emoticons etc.).",
-                                  "Only applies when using -l/--language.") do |s|
+  opts.on("-b", "--start-ngram INTEGER", "The lowest word N-gram number to split the text into",
+                                   "(default 1). Note that this only makes sense if the",
+                                   "sentiment file has tokens of similar N-gram length") do |start_ngram|
+    options[:start_ngram] = start_ngram.to_i
+  end
+  opts.separator ""
+  opts.on("-e", "--end-ngram INTEGER", "The highest word N-gram number to to split the text into",
+                                 "(default 1). Note that this only makes sense if the",
+                                 "sentiment file has tokens of similar N-gram length") do |end_ngram|
+    options[:end_ngram] = end_ngram.to_i
+  end
+  opts.separator ""
+  opts.on("-k", "--skip-symbols", "Do not include symbols file (emoticons etc.). Only applies",
+                            "when using -l/--language.") do |s|
     options[:include_symbols] = false
   end
   opts.separator ""

data/lang/no_NB.txt CHANGED

@@ -8,21 +8,14 @@
 0.87: godt fast
 0.87: upretensiøs
 0.87: undervurdert
-0.87: top-hulls
 0.87: terapeutisk
-0.87: stirrer
-0.87: utlagte masser
 0.87: selvrespekt
-0.87: self-respektfull
-0.87: selvrespekt
-0.87: self-hensyn
 0.87: helsebringende
 0.87: rosenrød i kinnene
 0.87: utstråle
 0.87: utstråling
 0.87: velstand
 0.87: prinsipiell
-0.87: ut-og-ytre
 0.87: moraliserende
 0.87: mesterstykke
 0.87: mesterlig
@@ -64,7 +57,6 @@
 0.75: fagmessig
 0.75: snedig
 0.75: helhet
-0.75: godt gjennomtenkt av
 0.75: velproporsjonert
 0.75: godt bevart
 0.75: godt favoriserte
@@ -78,8 +70,6 @@
 0.75: infisert
 0.75: urettferdig
 0.75: kle
-0.75: revet ut av villfarelse
-0.75: riv ut av villfarelse
 0.75: pålitelighet
 0.75: øverste nivå
 0.75: brødrister
@@ -171,7 +161,6 @@
 0.75: halvhjertede
 0.75: skyldfri
 0.75: pen
-0.75: get-at-stand
 0.75: famlende
 0.75: influensa
 0.75: finere
@@ -252,7 +241,6 @@
 0.68: hedret
 0.68: treffende
 0.68: konstruktiv
-0.68: kommer-at-stand
 0.68: munter
 0.66: produktiv
 0.66: gledelig
@@ -262,11 +250,8 @@
 0.64: verdig
 0.63: god
 0.62: ettergivende
-0.62: lengtet-for
 0.62: verdig
-0.62: ord-splitting
 0.62: lurer
-0.62: ønsket-for
 0.62: klokt
 0.62: velvære
 0.62: velprøvd
@@ -292,19 +277,16 @@
 0.62: ubetenkelig
 0.62: forståelig
 0.62: kraftens
-0.62: Ullr
 0.62: sannferdig
 0.62: tillitsfullt
 0.62: klarert
 0.62: overskride
 0.62: ro
-0.62: ro
 0.62: trening
 0.62: sporbar
 0.62: sporbar
 0.62: totaliteten
 0.62: topper
-0.62: fløtekaramell-nosed
 0.62: forsagt
 0.62: tre kvarter
 0.62: vitnesbyrd
@@ -317,22 +299,18 @@
 0.62: vellykket
 0.62: stilistisk
 0.62: iherdig
-0.62: Stoppable
 0.62: vekst
 0.62: angitt
 0.62: sfærisk
 0.62: tale-endowed
-0.62: Sparer
+0.62: sparer
 0.62: ønsket
 0.62: soignée
-0.62: SOIGNE
 0.62: røykfritt
 0.62: knusende
 0.62: treghet
-0.62: klapse-up
 0.62: synde
 0.62: forenklede
-0.62: SID
 0.62: sjokksikker
 0.62: Shivaree
 0.62: velskapt
@@ -373,7 +351,6 @@
 0.62: oppladbart
 0.62: betryggende
 0.62: rimelig
-0.62: fornyet stadfestelse
 0.62: hev
 0.62: kvantifiserbare
 0.62: målrettet
@@ -411,7 +388,6 @@
 0.62: overtrekk
 0.62: overkompensere
 0.62: rangeres foran
-0.62: ut-og-ut
 0.62: ortodoks
 0.62: rikest
 0.62: overdådig
@@ -430,7 +406,6 @@
 0.62: danser
 0.62: naboskap
 0.62: naturalisering
-0.62: naturalisering
 0.62: narsissisme
 0.62: naivt
 0.62: foranderlig
@@ -674,6 +649,8 @@
 0.62: ablativ
 0.62: underdanig
 0.62: abbed
+0.62: lykke
+0.62: lykkelig
 0.58: håndgripelig
 0.58: barmhjertig
 0.58: verdighet
@@ -1046,7 +1023,7 @@
 0.50: edel-mindedness
 0.50: knuslete
 0.50: pent
-0.50: ny-laget
+0.50: nylaget
 0.50: nøytralisert
 0.50: nøytralisert
 0.50: nevrobiologiske
@@ -4250,6 +4227,7 @@
 0.20: ladet
 0.20: godartet
 0.20: våken
+0.10: lykkes
 -0.20: stamme
 -0.20: variasjon
 -0.20: varians
@@ -4515,7 +4493,6 @@
 -0.25: sukker-belagt
 -0.25: kvelende
 -0.25: kveles
--0.25: lykkes
 -0.25: underjordiske
 -0.25: underjordiske
 -0.25: innsynkning
@@ -8993,7 +8970,6 @@
 -0.75: hode
 -0.75: Hayseed
 -0.75: Haredi
--0.75: lykke
 -0.75: Hamming
 -0.75: skinke-handed
 -0.75: skinke-fisted

data/lib/textmood.rb CHANGED

@@ -7,6 +7,8 @@ else
   Encoding.default_internal = Encoding::UTF_8
 end
+NORMALIZE_TO = 100
 class TextMood
   def initialize(options = {})
@@ -38,13 +40,21 @@ class TextMood
   def score_text(text)
     sentiment_total = 0.0
+    scores_added = 0
     (@options[:start_ngram]..@options[:end_ngram]).each do |i|
       ngrams(i, text.to_s).each do |token|
-        sentiment_total += score_token(token)
+        score = score_token(token)
+        unless score.nil?
+          sentiment_total += score
+          scores_added += 1
+        end
       end
     end
-    if @options[:normalize]
+    if @options[:normalize_score]
+      sentiment_total = normalize_score(sentiment_total, scores_added)
+    end
+    if @options[:normalize_output]
       if sentiment_total > @options[:max_threshold]
         1
       elsif sentiment_total < @options[:min_threshold]
@@ -76,7 +86,7 @@ class TextMood
       sentiment_value
     else
       puts "#{used_token}: nil" if @options[:debug]
-      0.0
+      nil
     end
   end
@@ -104,4 +114,9 @@ class TextMood
     sentiment_values
   end
+  def normalize_score(score, count)
+    factor = NORMALIZE_TO / count
+    (score * factor).to_i
+  end
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: textmood
 version: !ruby/object:Gem::Version
-  version: 0.0.4
+  version: 0.0.5
   prerelease:
 platform: ruby
 authors: