RubyGems - omnicat - Versions diffs - 0.1.1 → 0.1.2 - Mend

omnicat 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

checksums.yaml +7 -0
data/CHANGELOG.txt +3 -0
data/README.md +6 -3
data/lib/omnicat/classifiers/bayes.rb +27 -14
data/lib/omnicat/version.rb +1 -1
metadata +9 -15

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 5d04e29b0e2e16e019592553041b62eb9b759d3d
+  data.tar.gz: 1a4614e75aef06179e7c9589a73bf9c0b3552d20
+SHA512:
+  metadata.gz: d30317305905b877570cc2626665b74b56d3d4278422398ed05bb836d2316b2cb3fe4faee1cb0e26e1068121151fb0ba49b1eac61b79541fd2e44c23c6d19c03
+  data.tar.gz: bc998d7a815212af3881fb81bfa18bda74d1f68df1a273f9c17fdbf76340579de26953a384eb30edbdfac51409ddb7279e6195bac1e86d51f6e11bde48e5029e

data/CHANGELOG.txt CHANGED Viewed

@@ -1,3 +1,6 @@
+0.1.2
+# fix the bayes algorithm (so important changes!)
 0.1.1
 # fix Regexp error for ruby version < 2.0.0

data/README.md CHANGED Viewed

@@ -57,9 +57,9 @@ Train category with multiple documents.
 Classify a document.
     result = bayes.classify('I feel so good and happy')
-    => #<OmniCat::Result:0x007fe59b97b548 @category={:name=>"negative", :percentage=>99}, @scores={"positive"=>1.749909854122994e-07, "negative"=>0.014084507042253521}, @total_score=0.014084682033238934>
+    => #<OmniCat::Result:0x007fd20296aad8 @category={:name=>"positive", :percentage=>73}, @scores={"positive"=>5.4253472222222225e-09, "negative"=>1.9600796074572086e-09}, @total_score=7.385426829679431e-09>
     result.to_hash
-    => {:category=>{:name=>"negative", :percentage=>99}, :scores=>{"positive"=>1.749909854122994e-07, "negative"=>0.014084507042253521}, :total_score=>0.014084682033238934}
+    => {:category=>{:name=>"positive", :percentage=>73}, :scores=>{"positive"=>5.4253472222222225e-09, "negative"=>1.9600796074572086e-09}, :total_score=>7.385426829679431e-09}
 ### Classify batch
 Classify multiple documents at a time.
@@ -70,19 +70,22 @@ Classify multiple documents at a time.
         'a good piece of work'
       ]
     )
-    => [#<OmniCat::Result:0x007fe59b949d90 @category={:name=>"negative", :percentage=>75}, @scores={"positive"=>7.962089836259623e-06, "negative"=>2.5145916163515512e-05}, @total_score=3.3108005999775135e-05>, #<OmniCat::Result:0x007fe59c9d7d10 @category={:name=>"positive", :percentage=>100}, @scores={"positive"=>0.0005434126313247192, "negative"=>0}, @total_score=0.0005434126313247192>]
+    => [#<OmniCat::Result:0x007fd2029341b8 @category={:name=>"negative", :percentage=>78}, @scores={"positive"=>2.5521869888765736e-14, "negative"=>9.074442627116706e-14}, @total_score=1.162662961599328e-13>, #<OmniCat::Result:0x007fd20292e7e0 @category={:name=>"positive", :percentage=>80}, @scores={"positive"=>2.411265432098765e-07, "negative"=>5.880238822371627e-08}, @total_score=2.999289314335928e-07>]
 ### Convert to hash
 Convert full Bayes object to hash.
     # For storing, restoring modal data
     bayes_hash = bayes.to_hash
+    => {:categories=>{"positive"=>{:doc_count=>4, :tokens=>{"great"=>1, "if"=>1, "you"=>2, "are"=>2, "in"=>2, "slap"=>1, "happy"=>1, "mood"=>1, "feel-good"=>2, "picture"=>1, "the"=>2, "best"=>1, "sense"=>1, "of"=>2, "term"=>1, "it"=>1, "is"=>1, "movie"=>1, "about"=>1, "which"=>1, "can"=>1, "actually"=>1, "feel"=>1, "good"=>2, "love"=>1, "and"=>1, "money"=>1, "both"=>1, "them"=>1, "choises"=>1}, :token_count=>37}, "negative"=>{:doc_count=>4, :tokens=>{"bad"=>1, "tracking"=>1, "issue"=>1, "simplistic"=>1, "silly"=>1, "and"=>1, "tedious"=>1, "interesting"=>1, "but"=>2, "not"=>2, "compelling"=>2, "seems"=>1, "clever"=>1, "especially"=>1}, :token_count=>17}}, :category_count=>2, :doc_count=>8, :k_value=>1.0, :token_count=>54, :uniq_token_count=>43}
 ### Load from hash
 Load full Bayes object from hash.
     another_bayes_obj = OmniCat::Classifiers::Bayes.new(bayes_hash)
+    => #<OmniCat::Classifiers::Bayes:0x007fd20308cff0 @categories={"positive"=>#<OmniCat::Classifiers::BayesInternals::Category:0x007fd20308cf78 @doc_count=4, @tokens={"great"=>1, "if"=>1, "you"=>2, "are"=>2, "in"=>2, "slap"=>1, "happy"=>1, "mood"=>1, "feel-good"=>2, "picture"=>1, "the"=>2, "best"=>1, "sense"=>1, "of"=>2, "term"=>1, "it"=>1, "is"=>1, "movie"=>1, "about"=>1, "which"=>1, "can"=>1, "actually"=>1, "feel"=>1, "good"=>2, "love"=>1, "and"=>1, "money"=>1, "both"=>1, "them"=>1, "choises"=>1}, @token_count=37>, "negative"=>#<OmniCat::Classifiers::BayesInternals::Category:0x007fd20308cf00 @doc_count=4, @tokens={"bad"=>1, "tracking"=>1, "issue"=>1, "simplistic"=>1, "silly"=>1, "and"=>1, "tedious"=>1, "interesting"=>1, "but"=>2, "not"=>2, "compelling"=>2, "seems"=>1, "clever"=>1, "especially"=>1}, @token_count=17>}, @category_count=2, @doc_count=8, @k_value=1.0, @token_count=54, @uniq_token_count=43>
     another_bayes_obj.classify('best senses')
+    => #<OmniCat::Result:0x007fd203075008 @category={:name=>"positive", :percentage=>57}, @scores={"positive"=>0.0002314814814814815, "negative"=>0.00017146776406035664}, @total_score=0.00040294924554183816>
 ## Todo
 * Add more text classification modules such as Support Vector Machine (SVM).

data/lib/omnicat/classifiers/bayes.rb CHANGED Viewed

@@ -2,7 +2,7 @@ module OmniCat
   module Classifiers
     class Bayes < ::OmniCat::Classifiers::Base
-      attr_accessor :categories, :category_count, :doc_count, :token_count
+      attr_accessor :categories, :category_count, :doc_count, :token_count, :uniq_token_count
       attr_accessor :k_value # helper val for skipping some Bayes theorem errors
       def initialize(bayes_hash = {})
@@ -16,6 +16,7 @@ module OmniCat
         self.doc_count = bayes_hash[:doc_count].to_i
         self.k_value = bayes_hash[:k_value] || 1.0
         self.token_count = bayes_hash[:token_count].to_i
+        self.uniq_token_count = bayes_hash[:uniq_token_count].to_i
       end
       # Allows adding new classification category
@@ -53,18 +54,26 @@ module OmniCat
       #   bayes.train("positive", "good, very well")
       #   bayes.train("negative", "bad dog")
       #   bayes.train("neutral", "how is the management gui")
-      def train(category, doc)
-        if category_exists?(category)
+      def train(category_name, doc)
+        if category_exists?(category_name)
           self.doc_count += 1
-          categories[category].doc_count += 1
+          categories[category_name].doc_count += 1
           doc.tokenize_with_counts.each do |token, count|
+            uniq_token_addition = 0
+            categories.each do |name, category|
+               if category.tokens.has_key?(token)
+                 uniq_token_addition = 1
+                 break
+               end
+            end
+            self.uniq_token_count += 1 if uniq_token_addition == 0
             self.token_count += count
-            self.categories[category].tokens[token] = self.categories[category].tokens[token].to_i + count
-            self.categories[category].token_count += count
+            self.categories[category_name].tokens[token] = self.categories[category_name].tokens[token].to_i + count
+            self.categories[category_name].token_count += count
           end
         else
           raise StandardError,
-                "Category with name '#{category}' does not exist!"
+                "Category with name '#{category_name}' does not exist!"
         end
       end
@@ -94,14 +103,18 @@ module OmniCat
           prior = category.doc_count / doc_count.to_f
           result.scores[name] = k_value
           doc.tokenize_with_counts.each do |token, count|
-            result.scores[name] *= (
-              (category.tokens[token].to_i + k_value) /
-              (category.token_count + token_count)
-            ) if category.tokens.has_key?(token)
+            if category.tokens[token].to_i == 0
+              result.scores[name] *= k_value / token_count
+            else
+              result.scores[name] *= (
+                count * (
+                  (category.tokens[token].to_i + k_value) /
+                  (category.token_count + uniq_token_count)
+                )
+              )
+            end
           end
-          result.scores[name] = (
-            result.scores[name].to_f == 1.0 ? 0 : (prior * result.scores[name])
-          )
+          result.scores[name] = prior * result.scores[name]
           if result.scores[name] > score
             result.category[:name] = name;
             score = result.scores[name];

data/lib/omnicat/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module OmniCat
-  VERSION = "0.1.1"
+  VERSION = "0.1.2"
 end

metadata CHANGED Viewed

@@ -1,20 +1,18 @@
 --- !ruby/object:Gem::Specification
 name: omnicat
 version: !ruby/object:Gem::Version
-  version: 0.1.1
-  prerelease:
+  version: 0.1.2
 platform: ruby
 authors:
 - Mustafa Turan
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-06-16 00:00:00.000000000 Z
+date: 2013-06-18 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
@@ -22,7 +20,6 @@ dependencies:
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
@@ -30,17 +27,15 @@ dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - - ! '>='
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - - ! '>='
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
 description: A generalized framework for text classifications.
@@ -77,26 +72,25 @@ files:
 homepage: https://github.com/mustafaturan/omnicat
 licenses:
 - MIT
+metadata: {}
 post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
-  - - ! '>='
+  - - '>='
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
-  - - ! '>='
+  - - '>='
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 1.8.23
+rubygems_version: 2.0.3
 signing_key:
-specification_version: 3
+specification_version: 4
 summary: A generalized framework for text classifications.
 test_files: []