clusterer 0.1.0 → 0.1.9

Sign up to get free protection for your applications and to get access to all the features.
data/README CHANGED
@@ -1,8 +1,15 @@
1
- A ruby library which implements clustering algorithms for text
2
- mining.
1
+ A ruby library which implements clustering and classification
2
+ algorithms for text mining.
3
3
 
4
- Currently implemented algorithms are K-Means, and Hierarchical
5
- clustering.
4
+ Clustering algorithms currently implemented are - K-Means, and
5
+ Hierarchical clustering, LSI. Many variations of these algorithms are
6
+ also available, where you can change the similarity matrix, use
7
+ a refined version, i.e., hierarchical/bisecting clustering followed by
8
+ Kmeans.
9
+
10
+ LSI can also be used for clustering. In this, first an SVD transformation
11
+ is done, and then the documents in the new space are clustered (any of
12
+ KMeans, hierarchical/bisecting clustering can be used).
6
13
 
7
14
  Hierarchical gives better results, but complexity roughly O(n*n)
8
15
 
@@ -13,9 +20,24 @@ ysearch-rb from
13
20
 
14
21
  http://developer.yahoo.com/download/download.html
15
22
 
23
+ The multinomial, complement and weightnormalized complement Bayes
24
+ algorithms are also implemented.
25
+
26
+ LSI can also be used for classification.
16
27
 
17
- Hybrid clustering algorithms + more similarity metrics + semi-supervised
18
- clustering... coming soon ... (or submit pacthes/show keen interest, if
19
- you want faster results)
28
+ Thanks a lot to the several people who researched and worked on these ideas.
20
29
 
21
30
  Happy hacking......
31
+
32
+
33
+ ToDo:
34
+ Add more documentation, and explain the API.
35
+ Add more examples.
36
+ Incorporate the C version of Gorrell and Simon Funk's GHA and SVD algorithm, also
37
+ write a Ruby version.
38
+ Incorporate my own C version of Kernel SVD, and GHA.
39
+ Explore ways to improve and make a better tokenizer, and introduce other NLP techniques.
40
+ Add more classification algos: Decision Trees, and various extensions of it.
41
+ Ruby and C version of SVM and NN.
42
+ Feature Selection.
43
+
@@ -2,6 +2,8 @@
2
2
 
3
3
  #Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
4
4
 
5
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
6
+
5
7
  require 'soap/wsdlDriver'
6
8
  require 'clusterer'
7
9
 
@@ -9,7 +11,7 @@ require 'clusterer'
9
11
 
10
12
  WSDL_URL = "http://api.google.com/GoogleSearch.wsdl"
11
13
  driver = SOAP::WSDLDriverFactory.new(WSDL_URL).create_rpc_driver
12
- query = 'kreeti'
14
+ query = 'kolkata'
13
15
  key = ""
14
16
 
15
17
  results = driver.doGoogleSearch(key, query, 0, 10, true, "", 1, "lang_en", "", "")
@@ -17,28 +19,32 @@ count= results.resultElements.size
17
19
  max_count = results.estimatedTotalResultsCount.to_i
18
20
  results = results.resultElements
19
21
 
20
- while (count < 100 && count <= max_count)
22
+ while (count < 20 && count <= max_count)
21
23
  more_results = driver.doGoogleSearch(key, query, count, 10, true, "", 1, "lang_en", "", "")
22
24
  results.concat(more_results.resultElements)
23
25
  count += more_results.resultElements.size
24
26
  end
25
27
 
26
- clusters = Clusterer::Clustering.kmeans_clustering(results.collect {|r| r.title.to_s.gsub(/<\/?[^>]*>/, "") +
27
- " " + r.snippet.to_s.gsub(/<\/?[^>]*>/, "")})
28
+ clusters = Clusterer::Clustering.cluster(:kmeans, results, :no_stem => true, :tokenizer => :simple_ngram_tokenizer) {|r|
29
+ r.title.to_s.gsub(/<\/?[^>]*>/, "") + " " + r.snippet.to_s.gsub(/<\/?[^>]*>/, "")}
28
30
 
29
31
  #writing the output
30
32
  File.open("temp.html","w") do |f|
31
33
  f.write("<ul>")
32
34
  clusters.each do |clus|
33
35
  f.write("<li>")
36
+ f.write("<h4>")
37
+ clus.centroid.to_a.sort{|a,b| b[1] <=> a[1]}.slice(0,5).each {|w| f.write("#{w[0]} - #{format '%.2f',w[1]}, ")}
38
+ f.write("</h4>")
34
39
  f.write("<ul>")
35
- clus.each do |d|
40
+ clus.documents.each do |doc|
41
+ result = doc.object
36
42
  f.write("<li>")
37
43
  f.write("<span class='title'>")
38
- f.write(results[d].title)
44
+ f.write(result.title)
39
45
  f.write("</span>")
40
46
  f.write("<span class='snippet'>")
41
- f.write(results[d].snippet)
47
+ f.write(result.snippet)
42
48
  f.write("</span>")
43
49
  f.write("</li>")
44
50
  end
@@ -2,53 +2,40 @@
2
2
 
3
3
  #Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
4
4
 
5
- require 'clusterer'
6
- require 'ysearch-rb/lib/ysearch'
7
-
8
- ## try using HTML stripping to get better results
5
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
9
6
 
10
- # get the query parameter
11
- query = "kreeti"
12
-
13
- ##
14
- # create a web search object:
15
- # Arguments:
16
- # 1. App ID (You can get one at http://developer.yahoo.net)
17
- # 2. The query
18
- # 3. type can be one of: 'all', 'any' or 'phrase'
19
- # 4. The no. of results
20
- ##
21
- obj = WebSearch.new('YahooDemo', query, 'all', 100)
7
+ require 'clusterer'
8
+ require 'rubygems'
9
+ require 'yahoo/web_search'
22
10
 
23
- results = obj.parse_results
24
11
 
25
- # count= results.resultElements.size
26
- # max_count = results.estimatedTotalResultsCount.to_i
27
- # results = results.resultElements
12
+ ys = Yahoo::WebSearch.new "mUZGF4TV34F2H2aNPZat57sIgR7P2aKBwQm4aEq3TxOg1lYrlhRdMbbmdhmSwyYoRA4bOw--"
13
+ query = "kolkata"
14
+ results, = ys.search query, 10
28
15
 
29
- # while (count < 100 && count <= max_count)
30
- # more_results = driver.doGoogleSearch(key, query, count, 10, true, "", 1, "lang_en", "", "")
31
- # results.concat(more_results.resultElements)
32
- # count += more_results.resultElements.size
33
- # end
16
+ ## try using HTML stripping to get better results
34
17
 
35
- #kmeans_clustering
36
- clusters = Clusterer::Clustering.hierarchical_clustering(results.collect {|r| r['Title'].to_s.gsub(/<\/?[^>]*>/, "") +
37
- " " + r['Summary'].to_s.gsub(/<\/?[^>]*>/, "")})
18
+ #kmeans
19
+ clusters = Clusterer::Clustering.cluster(:hierarchical, results, :no_stem => true, :tokenizer => :simple_ngram_tokenizer){|r|
20
+ r.title.to_s.gsub(/<\/?[^>]*>/, "") + " " + r.summary.to_s.gsub(/<\/?[^>]*>/, "")}
38
21
 
39
22
  #writing the output
40
23
  File.open("temp.html","w") do |f|
41
24
  f.write("<ul>")
42
25
  clusters.each do |clus|
43
26
  f.write("<li>")
27
+ f.write("<h4>")
28
+ clus.centroid.to_a.sort{|a,b| b[1] <=> a[1]}.slice(0,5).each {|w| f.write("#{w[0]} - #{format '%.2f',w[1]}, ")}
29
+ f.write("</h4>")
44
30
  f.write("<ul>")
45
- clus.each do |d|
31
+ clus.documents.each do |doc|
32
+ result = doc.object
46
33
  f.write("<li>")
47
34
  f.write("<span class='title'>")
48
- f.write(results[d]['Title'])
35
+ f.write(result.title)
49
36
  f.write("</span>")
50
37
  f.write("<span class='snippet'>")
51
- f.write(results[d]['Summary'])
38
+ f.write(result.summary)
52
39
  f.write("</span>")
53
40
  f.write("</li>")
54
41
  end
@@ -1,100 +1,41 @@
1
- #The MIT License
2
-
1
+ #--
3
2
  ###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
4
22
 
5
- require 'word_hash'
6
- require 'similarity'
7
23
 
8
24
  module Clusterer
9
- class Clustering
10
- class << self
11
- #returns clusters containing index of the elements in doc
12
- def kmeans_clustering (docs, k = nil, max_iter = 10, &similarity_function)
13
- similarity_function = Proc.new {|*args| Similarity.vector_similarity(*args)} unless similarity_function
14
- k = Math.sqrt(docs.size) unless k
15
- docs_hash = Array.new(docs.size)
16
- clusters = Array.new(k)
17
- cluster_centers = Array.new(k)
18
- old_cluster_centers = Array.new(k)
19
- docs.each_with_index {|d,i| docs_hash[i] = d.clean_word_hash}
20
- 0.upto(k - 1) {|i| x = rand(docs.size); clusters[i], cluster_centers[i] = [x],docs_hash[x].clone }
21
- iter = 0
22
- while (!max_iter || iter < max_iter) && !convergence(cluster_centers,old_cluster_centers)
23
- puts "Iteration ....#{iter}...#{clusters.inspect}"
24
- 0.upto(k - 1) {|i| clusters[i] = []; old_cluster_centers[i] = cluster_centers[i]}
25
- docs_hash.each_with_index do |doc, i|
26
- max_value, max_index = 0, 0
27
- cluster_centers.each_with_index do |cen, j|
28
- sim = similarity_function.call(doc,cen)
29
- max_value, max_index = sim,j if sim >= max_value
30
- end
31
- clusters[max_index] << i
32
- end
33
- recalculate_centers(cluster_centers,clusters,docs_hash)
34
- iter += 1
35
- end
36
- clusters
37
- end
38
-
39
- def hierarchical_clustering (docs, k = nil, &similarity_function)
40
- similarity_function = Proc.new {|*args| Similarity.vector_similarity(*args)} unless similarity_function
41
- k = Math.sqrt(docs.size) unless k
42
- docs_hash = Array.new(docs.size)
43
- clusters = Array.new(docs.size)
44
- cluster_centers = Array.new(docs.size)
45
- docs.each_with_index do |d,i|
46
- cluster_centers[i] = d.clean_word_hash
47
- clusters[i] = [i]
48
- end
49
- iter = 0
50
- while (clusters.size > k)
51
- puts "Iteration ....#{iter}...#{clusters.inspect}"
52
- min_value, min_index = clusters.size[0], 0
53
- clusters.each_with_index {|a, i| (min_value, min_index = a.size, i) if a.size <= min_value}
54
- p = cluster_centers.delete_at(min_index)
55
- c = clusters.delete_at(min_index)
56
- max_value, max_index = 0, 0
57
- cluster_centers.each_with_index do |cen, j|
58
- sim = similarity_function.call(p,cen)
59
- max_value, max_index = sim,j if sim >= max_value
60
- end
61
- merge_clusters(clusters[max_index],cluster_centers[max_index],c,p)
62
- iter += 1
63
- end
64
- clusters
65
- end
66
-
67
- private
68
- #merge cluster 2 into cluster 1
69
- def merge_clusters(cluster1, cluster_center1, cluster2, cluster_center2)
70
- cluster_center1.each_key {|k| cluster_center1[k] *= cluster1.size}
71
- cluster_center2.each_key {|k| cluster_center2[k] *= cluster2.size}
72
- cluster_center1.merge!(cluster_center2) {|k,o,n| cluster_center1[k] = o + n}
73
- cluster1.concat(cluster2)
74
- cluster_center1.each_key {|k| cluster_center1[k] /= cluster1.size.to_f}
75
- cluster_center1.delete(:total)
76
- end
77
-
78
- def recalculate_centers(cluster_centers,clusters,docs_hash)
79
- clusters.each_with_index do |cluster,i|
80
- center = { }
81
- cluster.each {|d| docs_hash[d].each {|w,v| center[w] = (center[w] || 0) + v} }
82
- total = 0
83
- count = cluster.size
84
- count = 1 if count ==0
85
- center.each_key {|w| next unless w.class == String;
86
- center[w] /=count.to_f; total += center[w]**2}
87
- total = 1 if total == 0
88
- center[:total] = total
89
- cluster_centers[i] = center
90
- end
91
- end
92
-
93
-
94
- def convergence(new_centers,old_centers)
95
- new_centers.each_with_index {|c,i| return false unless c == old_centers[i]}
96
- return true
97
- end
98
- end
99
- end
25
+ VERSION = '0.2.0'
100
26
  end
27
+
28
+ require 'mathn'
29
+ require 'clusterer/stop_words'
30
+ require 'clusterer/similarity'
31
+ require 'clusterer/tokenizer'
32
+ require 'clusterer/document_base'
33
+ require 'clusterer/inverse_document_frequency'
34
+ require 'clusterer/document'
35
+ require 'clusterer/documents_centroid'
36
+ require 'clusterer/document_array'
37
+ require 'clusterer/cluster'
38
+ require 'clusterer/algorithms'
39
+ require 'clusterer/clustering'
40
+ require 'clusterer/lsi/lsi'
41
+ require 'clusterer/bayes'
@@ -0,0 +1,95 @@
1
+ #--
2
+ ###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+
23
+ module Clusterer
24
+ class Algorithms
25
+ class << self
26
+
27
+ private
28
+ def random_cluster_seeds(documents,k)
29
+ temp = []
30
+ (1..k).collect do
31
+ t= nil
32
+ while(!t || temp.include?(t))
33
+ t= Cluster.new([documents[rand(documents.size)]]);
34
+ end
35
+ temp << t
36
+ t
37
+ end
38
+ end
39
+
40
+ public
41
+ def kmeans(documents, k, options = { })
42
+ old_clusters = Array.new(k)
43
+ max_iter = options[:maximum_iterations] || 10
44
+ clusters = options[:seeds] || random_cluster_seeds(documents, k)
45
+ sim_fun = options[:similarity_function] || :cosine_similarity
46
+
47
+ iter = 0
48
+ while (!max_iter || iter < max_iter) && clusters != old_clusters
49
+ puts "Iteration ....#{iter}"
50
+ k.times {|i| old_clusters[i] = clusters[i]; clusters[i] = []}
51
+
52
+ documents.each do |document|
53
+ max_index = (0..k-1).max do |i,j|
54
+ document.send(sim_fun, old_clusters[i].centroid) <=> document.send(sim_fun, old_clusters[j].centroid)
55
+ end
56
+ clusters[max_index] << document
57
+ end
58
+
59
+ k.times {|i| clusters[i] = Cluster.new(clusters[i])}
60
+ iter += 1
61
+ end
62
+ return clusters
63
+ end
64
+
65
+ def bisecting_kmeans(documents, k, options = { })
66
+ clusters = [Cluster.new(documents)]
67
+ while clusters.size < k
68
+ lg_clus = clusters.max {|a, b| a.documents.size <=> b.documents.size} #largest cluster
69
+ clusters.delete(lg_clus)
70
+ clusters.concat(kmeans(lg_clus.documents,2))
71
+ end
72
+ options[:refined] ? clusters = kmeans(documents, k, options.merge(:seeds => clusters)) : clusters
73
+ end
74
+
75
+ def hierarchical(documents, k, options = { })
76
+ clusters = documents.collect {|d| Cluster.new([d])}
77
+ iter = 0
78
+ sim_fun = options[:similarity_function] || :upgma
79
+ options[:similarity_function] = nil
80
+ while clusters.size > k
81
+ puts "Iteration ....#{iter}"
82
+
83
+ pairs = []
84
+ clusters.each_with_index {|c,i| pairs.concat(clusters.slice(i+1,clusters.size).collect{|f| [c,f] })}
85
+ pair = pairs.max {|a,b| a[0].send(sim_fun, a[1]) <=> b[0].send(sim_fun, b[1]) }
86
+ clusters.delete(pair[1])
87
+ pair[0].merge!(pair[1])
88
+
89
+ iter += 1
90
+ end
91
+ options[:refined] ? clusters = kmeans(documents, k, options.merge(:seeds => clusters)) : clusters
92
+ end
93
+ end
94
+ end
95
+ end
@@ -0,0 +1,255 @@
1
+ #--
2
+ ###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+
23
+ module Clusterer
24
+ #The class Bayes is the base class for implementing different types of Naive
25
+ #Bayes classifier. The initialize method of this class is protected, so objects
26
+ #of this class cannot be instantiated.
27
+ #The Bayesian Formula is P(y|x) = P(x/y) * P(y) / P(x)
28
+ # posterior = likelihood * prior / evidence
29
+ #Given the evidence, we have to predict the posterior. The different Bayesian variants
30
+ #given below calculate likelihood using different methods.
31
+ #While calculating the posterior, since the evidence value is the same for all the categories, this
32
+ #value is not calculated. Also, the posterior distribution over all possible categories sums up to 1.
33
+ class Bayes
34
+ #an attribute for storing the different types of classes or categories
35
+ attr_accessor :categories
36
+
37
+ protected
38
+ #The first argument is an Array of categories. Currently no options are supported.
39
+ def initialize(categories, options = { })
40
+ @prior_count = Hash.new #stores the number of document of diffrent classes/categories.
41
+ @categories = categories.collect {|c| c.to_sym }
42
+ @likelihood_numer = Hash.new #hash of hash for storing the numerator in the likelihood value for each class
43
+ @likelihood_denom = Hash.new #hash of hash for storing the denominator in the likelihood value for each class
44
+ @documents_count = 0 #total number of documents
45
+ @categories.each {|cl| @likelihood_numer[cl] = Hash.new; @likelihood_denom[cl] = 0.0; @prior_count[cl] = 0}
46
+ end
47
+
48
+ #The first argument is the document, which will be used for training, and the
49
+ #second is the category.
50
+ def train(document, category)
51
+ check_class(category)
52
+ @prior_count[category] +=1
53
+ @documents_count += 1
54
+ end
55
+
56
+ #The first argument is the document, which should be removed, and the
57
+ #second is the category to which it belonged.
58
+ def untrain(document, category)
59
+ check_class(category)
60
+ raise StandardError, "There are no documents for this class.",caller if @prior_count[category] <= 0
61
+ @prior_count[category] -= 1
62
+ @documents_count -= 1
63
+ end
64
+
65
+ #For an input document returns the probability distribution of the different
66
+ #categories in the same order as the order in categories array.
67
+ def distribution
68
+ posterior = Array.new(@categories.size,0.0)
69
+ @categories.each_with_index do |cl,ind|
70
+ posterior[ind] = yield(cl,ind) + Math.log((@prior_count[cl] + 1)/(@documents_count + 1).to_f)
71
+ end
72
+ sum = 0
73
+ posterior.each_with_index {|v,i| posterior[i] = Math.exp(v); sum += posterior[i]}
74
+ posterior.each_with_index {|v,i| posterior[i] /= sum}
75
+ posterior
76
+ end
77
+
78
+ public
79
+ #For an input document returns the prediction in favor of class with the
80
+ #highest probability.
81
+ def classify(document, weight = nil)
82
+ posterior = distribution(document)
83
+ @categories[(0..(@categories.size - 1)).max {|i,j| posterior[i] <=> posterior[j]}]
84
+ end
85
+
86
+ #This method missing helps in having training and untraining method which have the
87
+ #category appended to their front. For example:
88
+ #
89
+ # train_good document
90
+ #
91
+ def method_missing (name, *args)
92
+ if name.to_s =~ /^(un)?train_/
93
+ category = name.to_s.gsub(/(un)?train_/, '').to_sym
94
+ send("#{$1}train",args[0],category)
95
+ else
96
+ super
97
+ end
98
+ end
99
+
100
+ private
101
+ def check_class(category)
102
+ raise ArgumentError,"Unknown class. It should be one of the following #{categories}.",caller unless categories.include?(category)
103
+ end
104
+ end
105
+
106
+ #Based on the description given in "Tackling the Poor Assumptions of Naive Bayes Text Classifiers"
107
+ #by Jason D. M. Rennie, Lawrence Shih, Jaime Teevan and David R. Karger, ICML - 2003
108
+ #
109
+ #The basic idea is that likelihood of a document for certain category is directly proportional to
110
+ #the number of other documents containing the same terms appearing while training for the
111
+ #particular class.
112
+ class MultinomialBayes < Bayes
113
+ def train(document, category)
114
+ category = category.to_sym
115
+ super
116
+ numer, sum = @likelihood_numer[category], 0.0
117
+ document.each do |term,freq|
118
+ numer[term] = (numer[term] || 0) + freq
119
+ sum += freq
120
+ end
121
+ @likelihood_denom[category] += sum
122
+ end
123
+
124
+ def untrain(document, category)
125
+ category = category.to_sym
126
+ super
127
+ numer, sum = @likelihood_numer[category], 0.0
128
+ document.each do |term,freq|
129
+ if numer[term]
130
+ numer[term] = [numer[term] - freq, 0].max
131
+ sum += freq
132
+ end
133
+ end
134
+ @likelihood_denom[category] = [@likelihood_denom[category] - sum, 0.0].max
135
+ end
136
+
137
+ def distribution(document)
138
+ super() do |cl,ind|
139
+ numer, denom, sum = @likelihood_numer[cl], (1 + @likelihood_denom[cl]), 0.0
140
+ document.each {|term,freq| sum += freq * Math.log((1 + (numer[term] || 0))/denom)}
141
+ sum
142
+ end
143
+ end
144
+ end
145
+
146
+ #Based on the description given in "Tackling the Poor Assumptions of Naive Bayes Text Classifiers"
147
+ #by Jason D. M. Rennie, Lawrence Shih, Jaime Teevan and David R. Karger, ICML - 2003
148
+ #
149
+ #The idea is that likelihood of a document for certain category is inversely proportional to
150
+ #the number of other documents containing the same terms appearing in other classes. Notice the
151
+ #difference with MultinomialBayes, and hence it is called complement.
152
+ #Though the authors claim that this performs better than MultinomialBayes, take the results
153
+ #with a pinch of salt; the performance of MultinomialBayes may be better with balanced datasets.
154
+ #If the dataset is skewed with the minority class being important, use ComplementBayes.
155
+ class ComplementBayes < Bayes
156
+ def train(document, category)
157
+ category = category.to_sym
158
+ super
159
+ (@categories - [category]).each_with_index do |cl,ind|
160
+ numer, sum = @likelihood_numer[cl], 0.0
161
+ document.each do |term,freq|
162
+ numer[term] = (numer[term] || 0) + freq
163
+ sum += freq
164
+ end
165
+ @likelihood_denom[cl] += sum
166
+ end
167
+ end
168
+
169
+ def untrain(document, category)
170
+ category = category.to_sym
171
+ super
172
+ (@categories - [category]).each_with_index do |cl,ind|
173
+ numer, sum = @likelihood_numer[category], 0.0
174
+ document.each do |term,freq|
175
+ if numer[term]
176
+ numer[term] = [numer[term] - freq, 0].max
177
+ sum += freq
178
+ end
179
+ end
180
+ @likelihood_denom[category] = [@likelihood_denom[category] - sum, 0.0].max
181
+ end
182
+ end
183
+
184
+ def distribution(document)
185
+ super() do |cl,ind|
186
+ numer, denom, sum = @likelihood_numer[cl], (1 + @likelihood_denom[cl]), 0.0
187
+ document.each {|term, freq| sum += freq * Math.log((1 + (numer[term] || 0))/denom)}
188
+ -sum
189
+ end
190
+ end
191
+ end
192
+
193
+ ##Module to help in implementing the weighted form of MultinomialBayes and ComplementBayes.
194
+ ##For performance reasons the normalized classifier weights are cached. These weights
195
+ #are calculated only when the classifier is first used for training or prediction.
196
+ #Training or Untraining an instance clears the cached normalized weights.
197
+ module WeightNormalized
198
+ def initialize(categories, options = { })
199
+ super
200
+ @weighted_likelihood = Hash.new
201
+ end
202
+
203
+ def train(document, category)
204
+ super
205
+ @weighted_likelihood.clear
206
+ end
207
+
208
+ def untrain(document, category)
209
+ super
210
+ @weighted_likelihood.clear
211
+ end
212
+
213
+ private
214
+ def weighted_likelihood(category)
215
+ @weighted_likelihood[category] ||= begin
216
+ sum, le, denom = 0.0, Hash.new, (1 + @likelihood_denom[category])
217
+ numer =
218
+ @likelihood_numer[category].each do |term,freq|
219
+ le[term] = Math.log((1 + freq)/denom)
220
+ sum += le[term]
221
+ end
222
+ le.each {|term, weight| le[term] = weight/sum }
223
+ end
224
+ end
225
+ end
226
+
227
+ #Based on the description given in "Tackling the Poor Assumptions of Naive Bayes Text Classifiers"
228
+ #by Jason D. M. Rennie, Lawrence Shih, Jaime Teevan and David R. Karger, ICML - 2003
229
+ #
230
+ #An improved complement bayes, the authors claim that this algorithm performs better, then the
231
+ #ComplementBayes. The weights are normalized, before using this algorithm.
232
+ class WeightNormalizedComplementBayes < ComplementBayes
233
+ include WeightNormalized
234
+ def distribution(document)
235
+ self.class.superclass.superclass.instance_method(:distribution).bind(self).call do |cl,ind|
236
+ we, sum = weighted_likelihood(cl), 0.0
237
+ document.each {|term,freq| sum += freq * (we[term] || 0)}
238
+ -sum
239
+ end
240
+ end
241
+ end
242
+
243
+ #Hopefully an improved MultinomialBayes, based on the same ideas as the WeightNormalizedComplementBayes
244
+ #only using MultinomialBayes as the base. The weights are normalized, before using this algorithm.
245
+ class WeightNormalizedMultinomialBayes < MultinomialBayes
246
+ include WeightNormalized
247
+ def distribution(document)
248
+ self.class.superclass.superclass.instance_method(:distribution).bind(self).call do |cl,ind|
249
+ we, sum = weighted_likelihood(cl), 0.0
250
+ document.each {|term,freq| sum += freq * (we[term] || 0)}
251
+ sum
252
+ end
253
+ end
254
+ end
255
+ end