RubyGems - classifier - Versions diffs - 1.3.4 → 1.4.0 - Mend

classifier 1.3.4 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21)

checksums.yaml +5 -5
data/LICENSE +2 -2
data/lib/classifier/bayes.rb +132 -124
data/lib/classifier/extensions/string.rb +1 -1
data/lib/classifier/extensions/vector.rb +72 -78
data/lib/classifier/extensions/vector_serialize.rb +8 -10
data/lib/classifier/extensions/word_hash.rb +114 -120
data/lib/classifier/lsi/content_node.rb +39 -37
data/lib/classifier/lsi/summary.rb +24 -24
data/lib/classifier/lsi/word_list.rb +7 -8
data/lib/classifier/lsi.rb +174 -151
data/lib/classifier.rb +2 -1
data/test/test_helper.rb +3 -2
metadata +60 -27
data/Gemfile +0 -5
data/Gemfile.lock +0 -26
data/README.markdown +0 -97
data/Rakefile +0 -84
data/test/bayes/bayesian_test.rb +0 -33
data/test/extensions/word_hash_test.rb +0 -35
data/test/lsi/lsi_test.rb +0 -123

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
-SHA1:
-  metadata.gz: 3c53668ddd328fb78862c67723b185df9c2aa717
-  data.tar.gz: 3655405d082fdd8a01e4ca893a70360ca9f62322
+SHA256:
+  metadata.gz: c20227e9f55f35fe93f2cbb6e5fc132127159973da74d7af68d1680a25021e73
+  data.tar.gz: ab8146db131a32b455b4e9b47dc3db7964e1e1a3bf9c69e953fdc6e6305eab89
 SHA512:
-  metadata.gz: 40b7395e2f04f56bdbabb49a4d0013dba36e9c1325ae66e5bff92451059c5b559677aaea30e50f8f2fbbae58e50bf0f084925ef38e0e3d3fb729e37e357469d4
-  data.tar.gz: 150f8f387706d870a37e86b0418c5e68ad386b82518294bdf21585ab3509fd98515648bc5e06dfb78b97f1e544099fe1da5ddcd69413826e0ccc39780d457940
+  metadata.gz: 2d024e1d46b3529f7e328110e8b59161e0f6fa0d738b3de0d973073974ad69fd7d3bfb529d5a1b827a444266c14f5dff37630af2e522522d07d74f18324064eb
+  data.tar.gz: 12fa59e6fc0f3e5ffb4fe1dc838b00ca529e630875729c78a3a0847bfdb6367856799c0f95831ecd75dbb36d0f4fad7fea2a6082083a965a5529d8bbf75a1542

data/LICENSE CHANGED

@@ -146,7 +146,7 @@ such a program is covered only if its contents constitute a work based
 on the Library (independent of the use of the Library in a tool for
 writing it).  Whether that is true depends on what the Library does
 and what the program that uses the Library does.
   1. You may copy and distribute verbatim copies of the Library's
 complete source code as you receive it, in any medium, provided that
 you conspicuously and appropriately publish on each copy an
@@ -426,4 +426,4 @@ the Free Software Foundation.
   14. If you wish to incorporate parts of the Library into other free
 programs whose distribution conditions are incompatible with these,
 write to the author to ask for permission.  For software which is
-copyrighted by
+copyrighted by

data/lib/classifier/bayes.rb CHANGED

@@ -3,133 +3,141 @@
 # License::   LGPL
 module Classifier
+  class Bayes
+    # The class can be created with one or more categories, each of which will be
+    # initialized and given a training method. E.g.,
+    #      b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
+    def initialize(*categories)
+      @categories = {}
+      categories.each { |category| @categories[category.prepare_category_name] = {} }
+      @total_words = 0
+      @category_counts = Hash.new(0)
+      @category_word_count = Hash.new(0)
+    end
-class Bayes
-  # The class can be created with one or more categories, each of which will be
-  # initialized and given a training method. E.g.,
-  #      b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
-	def initialize(*categories)
-		@categories = Hash.new
-		categories.each { |category| @categories[category.prepare_category_name] = Hash.new }
-		@total_words = 0
-                @category_counts = Hash.new(0)
-	end
+    #
+    # Provides a general training method for all categories specified in Bayes#new
+    # For example:
+    #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
+    #     b.train :this, "This text"
+    #     b.train "that", "That text"
+    #     b.train "The other", "The other text"
+    def train(category, text)
+      category = category.prepare_category_name
+      @category_counts[category] += 1
+      text.word_hash.each do |word, count|
+        @categories[category][word] ||= 0
+        @categories[category][word] += count
+        @total_words += count
+        @category_word_count[category] += count
+      end
+    end
-	#
-	# Provides a general training method for all categories specified in Bayes#new
-	# For example:
-	#     b = Classifier::Bayes.new 'This', 'That', 'the_other'
-	#     b.train :this, "This text"
-	#     b.train "that", "That text"
-	#     b.train "The other", "The other text"
-	def train(category, text)
-		category = category.prepare_category_name
-                @category_counts[category] += 1
-		text.word_hash.each do |word, count|
-			@categories[category][word]     ||=     0
-			@categories[category][word]      +=     count
-			@total_words += count
-		end
-	end
+    #
+    # Provides a untraining method for all categories specified in Bayes#new
+    # Be very careful with this method.
+    #
+    # For example:
+    #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
+    #     b.train :this, "This text"
+    #     b.untrain :this, "This text"
+    def untrain(category, text)
+      category = category.prepare_category_name
+      @category_counts[category] -= 1
+      text.word_hash.each do |word, count|
+        next unless @total_words >= 0
-	#
-	# Provides a untraining method for all categories specified in Bayes#new
-	# Be very careful with this method.
-	#
-	# For example:
-	#     b = Classifier::Bayes.new 'This', 'That', 'the_other'
-	#     b.train :this, "This text"
-	#     b.untrain :this, "This text"
-	def untrain(category, text)
-		category = category.prepare_category_name
-                @category_counts[category] -= 1
-		text.word_hash.each do |word, count|
-			if @total_words >= 0
-				orig = @categories[category][word]
-				@categories[category][word]     ||=     0
-				@categories[category][word]      -=     count
-				if @categories[category][word] <= 0
-					@categories[category].delete(word)
-					count = orig
-				end
-				@total_words -= count
-			end
-		end
-	end
-	#
-	# Returns the scores in each category the provided +text+. E.g.,
-	#    b.classifications "I hate bad words and you"
-	#    =>  {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
-	# The largest of these scores (the one closest to 0) is the one picked out by #classify
-	def classifications(text)
-		score = Hash.new
-                training_count = @category_counts.values.inject { |x,y| x+y }.to_f
-		@categories.each do |category, category_words|
-			score[category.to_s] = 0
-			total = category_words.values.inject(0) {|sum, element| sum+element}
-			text.word_hash.each do |word, count|
-				s = category_words.has_key?(word) ? category_words[word] : 0.1
-				score[category.to_s] += Math.log(s/total.to_f)
-			end
-                        # now add prior probability for the category
-                        s = @category_counts.has_key?(category) ? @category_counts[category] : 0.1
-                        score[category.to_s] += Math.log(s / training_count)
-		end
-		return score
-	end
+        orig = @categories[category][word] || 0
+        @categories[category][word] ||= 0
+        @categories[category][word] -= count
+        if @categories[category][word] <= 0
+          @categories[category].delete(word)
+          count = orig
+        end
+        @category_word_count[category] -= count if @category_word_count[category] >= count
+        @total_words -= count
+      end
+    end
-  #
-  # Returns the classification of the provided +text+, which is one of the
-  # categories given in the initializer. E.g.,
-  #    b.classify "I hate bad words and you"
-  #    =>  'Uninteresting'
-	def classify(text)
-		(classifications(text).sort_by { |a| -a[1] })[0][0]
-	end
-	#
-	# Provides training and untraining methods for the categories specified in Bayes#new
-	# For example:
-	#     b = Classifier::Bayes.new 'This', 'That', 'the_other'
-	#     b.train_this "This text"
-	#     b.train_that "That text"
-	#     b.untrain_that "That text"
-	#     b.train_the_other "The other text"
-	def method_missing(name, *args)
-		category = name.to_s.gsub(/(un)?train_([\w]+)/, '\2').prepare_category_name
-		if @categories.has_key? category
-			args.each { |text| eval("#{$1}train(category, text)") }
-		elsif name.to_s =~ /(un)?train_([\w]+)/
-			raise StandardError, "No such category: #{category}"
-		else
-	    super  #raise StandardError, "No such method: #{name}"
-		end
-	end
-	#
-	# Provides a list of category names
-	# For example:
-	#     b.categories
-	#     =>   ['This', 'That', 'the_other']
-	def categories # :nodoc:
-		@categories.keys.collect {|c| c.to_s}
-	end
-	#
-	# Allows you to add categories to the classifier.
-	# For example:
-	#     b.add_category "Not spam"
-	#
-	# WARNING: Adding categories to a trained classifier will
-	# result in an undertrained category that will tend to match
-	# more criteria than the trained selective categories. In short,
-	# try to initialize your categories at initialization.
-	def add_category(category)
-		@categories[category.prepare_category_name] = Hash.new
-	end
-	alias append_category add_category
-end
+    #
+    # Returns the scores in each category the provided +text+. E.g.,
+    #    b.classifications "I hate bad words and you"
+    #    =>  {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
+    # The largest of these scores (the one closest to 0) is the one picked out by #classify
+    def classifications(text)
+      score = {}
+      word_hash = text.word_hash
+      training_count = @category_counts.values.inject { |x, y| x + y }.to_f
+      @categories.each do |category, category_words|
+        score[category.to_s] = 0
+        total = (@category_word_count[category] || 1).to_f
+        word_hash.each_key do |word|
+          s = category_words.key?(word) ? category_words[word] : 0.1
+          score[category.to_s] += Math.log(s / total)
+        end
+        # now add prior probability for the category
+        s = @category_counts.key?(category) ? @category_counts[category] : 0.1
+        score[category.to_s] += Math.log(s / training_count)
+      end
+      score
+    end
+    #
+    # Returns the classification of the provided +text+, which is one of the
+    # categories given in the initializer. E.g.,
+    #    b.classify "I hate bad words and you"
+    #    =>  'Uninteresting'
+    def classify(text)
+      (classifications(text).sort_by { |a| -a[1] })[0][0]
+    end
+    #
+    # Provides training and untraining methods for the categories specified in Bayes#new
+    # For example:
+    #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
+    #     b.train_this "This text"
+    #     b.train_that "That text"
+    #     b.untrain_that "That text"
+    #     b.train_the_other "The other text"
+    def method_missing(name, *args)
+      category = name.to_s.gsub(/(un)?train_(\w+)/, '\2').prepare_category_name
+      if @categories.key?(category)
+        args.each do |text|
+          if name.to_s.start_with?('untrain_')
+            untrain(category, text)
+          else
+            train(category, text)
+          end
+        end
+      elsif name.to_s =~ /(un)?train_(\w+)/
+        raise StandardError, "No such category: #{category}"
+      else
+        super
+      end
+    end
+    #
+    # Provides a list of category names
+    # For example:
+    #     b.categories
+    #     =>   ['This', 'That', 'the_other']
+    def categories # :nodoc:
+      @categories.keys.collect(&:to_s)
+    end
+    #
+    # Allows you to add categories to the classifier.
+    # For example:
+    #     b.add_category "Not spam"
+    #
+    # WARNING: Adding categories to a trained classifier will
+    # result in an undertrained category that will tend to match
+    # more criteria than the trained selective categories. In short,
+    # try to initialize your categories at initialization.
+    def add_category(category)
+      @categories[category.prepare_category_name] = {}
+    end
+    alias append_category add_category
+  end
 end

data/lib/classifier/extensions/string.rb CHANGED

@@ -6,5 +6,5 @@ require 'fast_stemmer'
 require 'classifier/extensions/word_hash'
 class Object
-	def prepare_category_name; to_s.gsub("_"," ").capitalize.intern end
+  def prepare_category_name = to_s.gsub('_', ' ').capitalize.intern
 end

data/lib/classifier/extensions/vector.rb CHANGED

@@ -1,112 +1,106 @@
 # Author::    Ernest Ellingson
-# Copyright:: Copyright (c) 2005
+# Copyright:: Copyright (c) 2005
 # These are extensions to the std-lib 'matrix' to allow an all ruby SVD
 require 'matrix'
-require 'mathn'
 class Array
-  def sum(identity = 0, &block)
-    return identity unless size > 0
+  def sum_with_identity(identity = 0.0, &block)
+    return identity unless size.to_i.positive?
     if block_given?
-      map(&block).sum
+      map(&block).sum_with_identity(identity)
     else
-      reduce(:+)
+      compact.reduce(:+).to_f || identity.to_f
     end
   end
 end
-class Vector
+module VectorExtensions
   def magnitude
-    sumsqs = 0.0
-    self.size.times do |i|
-      sumsqs += self[i] ** 2.0
+    sum_of_squares = 0.to_r
+    size.times do |i|
+      sum_of_squares += self[i]**2.to_r
     end
-    Math.sqrt(sumsqs)
+    Math.sqrt(sum_of_squares.to_f)
   end
-  def normalize
-    nv = []
-    mag = self.magnitude
-    self.size.times do |i|
-      nv << (self[i] / mag)
+  def normalize
+    normalized_values = []
+    magnitude_value = magnitude.to_r
+    size.times do |i|
+      normalized_values << (self[i] / magnitude_value)
     end
-    Vector[*nv]
+    Vector[*normalized_values]
   end
 end
+class Vector
+  include VectorExtensions
+end
 class Matrix
-  def Matrix.diag(s)
-     Matrix.diagonal(*s)
+  def self.diag(diagonal_elements)
+    Matrix.diagonal(*diagonal_elements)
   end
-  alias :trans :transpose
-  def SV_decomp(maxSweeps = 20)
-    if self.row_size >= self.column_size
-      q = self.trans * self
-    else
-      q = self * self.trans
-    end
-    qrot    = q.dup
-    v       = Matrix.identity(q.row_size)
-    azrot   = nil
-    mzrot   = nil
-    cnt     = 0
-    s_old   = nil
-    mu      = nil
+  alias trans transpose
+  def SV_decomp(max_sweeps = 20)
+    q_matrix = if row_size >= column_size
+                 trans * self
+               else
+                 self * trans
+               end
-    while true do
-      cnt += 1
-      for row in (0...qrot.row_size-1) do
-        for col in (1..qrot.row_size-1) do
+    q_rotation_matrix = q_matrix.dup
+    v_matrix = Matrix.identity(q_matrix.row_size)
+    iteration_count = 0
+    previous_s_matrix = nil
+    loop do
+      iteration_count += 1
+      (0...q_rotation_matrix.row_size - 1).each do |row|
+        (1..q_rotation_matrix.row_size - 1).each do |col|
           next if row == col
-          h = Math.atan((2 * qrot[row,col])/(qrot[row,row]-qrot[col,col]))/2.0
-          hcos = Math.cos(h)
-          hsin = Math.sin(h)
-          mzrot = Matrix.identity(qrot.row_size)
-          mzrot[row,row] = hcos
-          mzrot[row,col] = -hsin
-          mzrot[col,row] = hsin
-          mzrot[col,col] = hcos
-          qrot = mzrot.trans * qrot * mzrot
-          v = v * mzrot
-        end
+          angle = Math.atan((2.to_r * q_rotation_matrix[row,
+                                                        col]) / (q_rotation_matrix[row,
+                                                                                   row] - q_rotation_matrix[col,
+                                                                                                            col])) / 2.0
+          cosine = Math.cos(angle)
+          sine = Math.sin(angle)
+          rotation_matrix = Matrix.identity(q_rotation_matrix.row_size)
+          rotation_matrix[row, row] = cosine
+          rotation_matrix[row, col] = -sine
+          rotation_matrix[col, row] = sine
+          rotation_matrix[col, col] = cosine
+          q_rotation_matrix = rotation_matrix.trans * q_rotation_matrix * rotation_matrix
+          v_matrix *= rotation_matrix
+        end
       end
-      s_old = qrot.dup if cnt == 1
-      sum_qrot = 0.0
-      if cnt > 1
-        qrot.row_size.times do |r|
-          sum_qrot += (qrot[r,r]-s_old[r,r]).abs if (qrot[r,r]-s_old[r,r]).abs > 0.001
+      previous_s_matrix = q_rotation_matrix.dup if iteration_count == 1
+      sum_of_differences = 0.to_r
+      if iteration_count > 1
+        q_rotation_matrix.row_size.times do |r|
+          difference = (q_rotation_matrix[r, r] - previous_s_matrix[r, r]).abs
+          sum_of_differences += difference.to_r if difference > 0.001
         end
-        s_old = qrot.dup
-      end
-      break if (sum_qrot <= 0.001 and cnt > 1) or cnt >= maxSweeps
-    end # of do while true
-    s = []
-    qrot.row_size.times do |r|
-      s << Math.sqrt(qrot[r,r])
+        previous_s_matrix = q_rotation_matrix.dup
+      end
+      break if (sum_of_differences <= 0.001 && iteration_count > 1) || iteration_count >= max_sweeps
     end
-    #puts "cnt = #{cnt}"
-    if self.row_size >= self.column_size
-      mu = self *  v * Matrix.diagonal(*s).inverse
-      return [mu, v, s]
-    else
-      puts v.row_size
-      puts v.column_size
-      puts self.row_size
-      puts self.column_size
-      puts s.size
-      mu = (self.trans * v *  Matrix.diagonal(*s).inverse)
-      return [mu, v, s]
+    singular_values = []
+    q_rotation_matrix.row_size.times do |r|
+      singular_values << Math.sqrt(q_rotation_matrix[r, r].to_f)
     end
+    u_matrix = (row_size >= column_size ? self : trans) * v_matrix * Matrix.diagonal(*singular_values).inverse
+    [u_matrix, v_matrix, singular_values]
   end
-  def []=(i,j,val)
-    @rows[i][j] = val
+  def []=(row_index, col_index, value)
+    @rows[row_index][col_index] = value
   end
 end

data/lib/classifier/extensions/vector_serialize.rb CHANGED

@@ -1,20 +1,18 @@
 module GSL
   class Vector
-    def _dump(v)
-      Marshal.dump( self.to_a )
+    def _dump(_v)
+      Marshal.dump(to_a)
     end
     def self._load(arr)
       arry = Marshal.load(arr)
-      return GSL::Vector.alloc(arry)
+      GSL::Vector.alloc(arry)
     end
   end
   class Matrix
-     class <<self
-        alias :diag :diagonal
-     end
+    class << self
+      alias diag diagonal
+    end
   end
 end