RubyGems - classifier - Versions diffs - 1.3.5 → 1.4.0 - Mend

classifier 1.3.5 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

checksums.yaml +4 -4
data/lib/classifier/bayes.rb +128 -120
data/lib/classifier/extensions/string.rb +1 -1
data/lib/classifier/extensions/vector.rb +66 -72
data/lib/classifier/extensions/vector_serialize.rb +6 -8
data/lib/classifier/extensions/word_hash.rb +108 -114
data/lib/classifier/lsi/content_node.rb +25 -23
data/lib/classifier/lsi/summary.rb +20 -20
data/lib/classifier/lsi/word_list.rb +1 -2
data/lib/classifier/lsi.rb +112 -89
data/lib/classifier.rb +1 -0
data/test/test_helper.rb +5 -0
metadata +7 -21

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 1d453b4ca9e0a0c44a2b8d3c9ef55db5b55a17efe5ff0cfbcab93f24965cd536
-  data.tar.gz: 26f5d9595ddd35c8d7c239f946afa8251a4b68f24f5b8bc41e30be13ded60547
+  metadata.gz: c20227e9f55f35fe93f2cbb6e5fc132127159973da74d7af68d1680a25021e73
+  data.tar.gz: ab8146db131a32b455b4e9b47dc3db7964e1e1a3bf9c69e953fdc6e6305eab89
 SHA512:
-  metadata.gz: c13c1666c7d2fe92d47ab7ced0a885fec7b13719aea25f283a013d5015744d6aea7d473706af54042972ba687b214c2a3a619f58d75560d90e57c3569d38f957
-  data.tar.gz: 23261ba6708307ecf6faac636d8572c50720fbf9a2e6db6ee736a07ce3de445daa431afb3bbf2c49dd4d2bd699327ca1c419029b4972236c52a9a5c1f00ab5a2
+  metadata.gz: 2d024e1d46b3529f7e328110e8b59161e0f6fa0d738b3de0d973073974ad69fd7d3bfb529d5a1b827a444266c14f5dff37630af2e522522d07d74f18324064eb
+  data.tar.gz: 12fa59e6fc0f3e5ffb4fe1dc838b00ca529e630875729c78a3a0847bfdb6367856799c0f95831ecd75dbb36d0f4fad7fea2a6082083a965a5529d8bbf75a1542

data/lib/classifier/bayes.rb CHANGED Viewed

@@ -3,133 +3,141 @@
 # License::   LGPL
 module Classifier
+  class Bayes
+    # The class can be created with one or more categories, each of which will be
+    # initialized and given a training method. E.g.,
+    #      b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
+    def initialize(*categories)
+      @categories = {}
+      categories.each { |category| @categories[category.prepare_category_name] = {} }
+      @total_words = 0
+      @category_counts = Hash.new(0)
+      @category_word_count = Hash.new(0)
+    end
-class Bayes
-  # The class can be created with one or more categories, each of which will be
-  # initialized and given a training method. E.g.,
-  #      b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
-	def initialize(*categories)
-		@categories = Hash.new
-		categories.each { |category| @categories[category.prepare_category_name] = Hash.new }
-		@total_words = 0
-                @category_counts = Hash.new(0)
-	end
+    #
+    # Provides a general training method for all categories specified in Bayes#new
+    # For example:
+    #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
+    #     b.train :this, "This text"
+    #     b.train "that", "That text"
+    #     b.train "The other", "The other text"
+    def train(category, text)
+      category = category.prepare_category_name
+      @category_counts[category] += 1
+      text.word_hash.each do |word, count|
+        @categories[category][word] ||= 0
+        @categories[category][word] += count
+        @total_words += count
+        @category_word_count[category] += count
+      end
+    end
-	#
-	# Provides a general training method for all categories specified in Bayes#new
-	# For example:
-	#     b = Classifier::Bayes.new 'This', 'That', 'the_other'
-	#     b.train :this, "This text"
-	#     b.train "that", "That text"
-	#     b.train "The other", "The other text"
-	def train(category, text)
-		category = category.prepare_category_name
-                @category_counts[category] += 1
-		text.word_hash.each do |word, count|
-			@categories[category][word]     ||=     0
-			@categories[category][word]      +=     count
-			@total_words += count
-		end
-	end
+    #
+    # Provides a untraining method for all categories specified in Bayes#new
+    # Be very careful with this method.
+    #
+    # For example:
+    #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
+    #     b.train :this, "This text"
+    #     b.untrain :this, "This text"
+    def untrain(category, text)
+      category = category.prepare_category_name
+      @category_counts[category] -= 1
+      text.word_hash.each do |word, count|
+        next unless @total_words >= 0
-	#
-	# Provides a untraining method for all categories specified in Bayes#new
-	# Be very careful with this method.
-	#
-	# For example:
-	#     b = Classifier::Bayes.new 'This', 'That', 'the_other'
-	#     b.train :this, "This text"
-	#     b.untrain :this, "This text"
-	def untrain(category, text)
-		category = category.prepare_category_name
-                @category_counts[category] -= 1
-		text.word_hash.each do |word, count|
-			if @total_words >= 0
-				orig = @categories[category][word]
-				@categories[category][word]     ||=     0
-				@categories[category][word]      -=     count
-				if @categories[category][word] <= 0
-					@categories[category].delete(word)
-					count = orig
-				end
-				@total_words -= count
-			end
-		end
-	end
+        orig = @categories[category][word] || 0
+        @categories[category][word] ||= 0
+        @categories[category][word] -= count
+        if @categories[category][word] <= 0
+          @categories[category].delete(word)
+          count = orig
+        end
+        @category_word_count[category] -= count if @category_word_count[category] >= count
+        @total_words -= count
+      end
+    end
-	#
-	# Returns the scores in each category the provided +text+. E.g.,
-	#    b.classifications "I hate bad words and you"
-	#    =>  {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
-	# The largest of these scores (the one closest to 0) is the one picked out by #classify
-	def classifications(text)
-		score = Hash.new
-                training_count = @category_counts.values.inject { |x,y| x+y }.to_f
-		@categories.each do |category, category_words|
-			score[category.to_s] = 0
-			total = category_words.values.inject(0) {|sum, element| sum+element}
-			text.word_hash.each do |word, count|
-				s = category_words.has_key?(word) ? category_words[word] : 0.1
-				score[category.to_s] += Math.log(s/total.to_f)
-			end
-                        # now add prior probability for the category
-                        s = @category_counts.has_key?(category) ? @category_counts[category] : 0.1
-                        score[category.to_s] += Math.log(s / training_count)
-		end
-		return score
-	end
+    #
+    # Returns the scores in each category the provided +text+. E.g.,
+    #    b.classifications "I hate bad words and you"
+    #    =>  {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
+    # The largest of these scores (the one closest to 0) is the one picked out by #classify
+    def classifications(text)
+      score = {}
+      word_hash = text.word_hash
+      training_count = @category_counts.values.inject { |x, y| x + y }.to_f
+      @categories.each do |category, category_words|
+        score[category.to_s] = 0
+        total = (@category_word_count[category] || 1).to_f
+        word_hash.each_key do |word|
+          s = category_words.key?(word) ? category_words[word] : 0.1
+          score[category.to_s] += Math.log(s / total)
+        end
+        # now add prior probability for the category
+        s = @category_counts.key?(category) ? @category_counts[category] : 0.1
+        score[category.to_s] += Math.log(s / training_count)
+      end
+      score
+    end
-  #
-  # Returns the classification of the provided +text+, which is one of the
-  # categories given in the initializer. E.g.,
-  #    b.classify "I hate bad words and you"
-  #    =>  'Uninteresting'
-	def classify(text)
-		(classifications(text).sort_by { |a| -a[1] })[0][0]
-	end
+    #
+    # Returns the classification of the provided +text+, which is one of the
+    # categories given in the initializer. E.g.,
+    #    b.classify "I hate bad words and you"
+    #    =>  'Uninteresting'
+    def classify(text)
+      (classifications(text).sort_by { |a| -a[1] })[0][0]
+    end
-	#
-	# Provides training and untraining methods for the categories specified in Bayes#new
-	# For example:
-	#     b = Classifier::Bayes.new 'This', 'That', 'the_other'
-	#     b.train_this "This text"
-	#     b.train_that "That text"
-	#     b.untrain_that "That text"
-	#     b.train_the_other "The other text"
-	def method_missing(name, *args)
-		category = name.to_s.gsub(/(un)?train_([\w]+)/, '\2').prepare_category_name
-		if @categories.has_key? category
-			args.each { |text| eval("#{$1}train(category, text)") }
-		elsif name.to_s =~ /(un)?train_([\w]+)/
-			raise StandardError, "No such category: #{category}"
-		else
-	    super  #raise StandardError, "No such method: #{name}"
-		end
-	end
+    #
+    # Provides training and untraining methods for the categories specified in Bayes#new
+    # For example:
+    #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
+    #     b.train_this "This text"
+    #     b.train_that "That text"
+    #     b.untrain_that "That text"
+    #     b.train_the_other "The other text"
+    def method_missing(name, *args)
+      category = name.to_s.gsub(/(un)?train_(\w+)/, '\2').prepare_category_name
+      if @categories.key?(category)
+        args.each do |text|
+          if name.to_s.start_with?('untrain_')
+            untrain(category, text)
+          else
+            train(category, text)
+          end
+        end
+      elsif name.to_s =~ /(un)?train_(\w+)/
+        raise StandardError, "No such category: #{category}"
+      else
+        super
+      end
+    end
-	#
-	# Provides a list of category names
-	# For example:
-	#     b.categories
-	#     =>   ['This', 'That', 'the_other']
-	def categories # :nodoc:
-		@categories.keys.collect {|c| c.to_s}
-	end
+    #
+    # Provides a list of category names
+    # For example:
+    #     b.categories
+    #     =>   ['This', 'That', 'the_other']
+    def categories # :nodoc:
+      @categories.keys.collect(&:to_s)
+    end
-	#
-	# Allows you to add categories to the classifier.
-	# For example:
-	#     b.add_category "Not spam"
-	#
-	# WARNING: Adding categories to a trained classifier will
-	# result in an undertrained category that will tend to match
-	# more criteria than the trained selective categories. In short,
-	# try to initialize your categories at initialization.
-	def add_category(category)
-		@categories[category.prepare_category_name] = Hash.new
-	end
-	alias append_category add_category
-end
+    #
+    # Allows you to add categories to the classifier.
+    # For example:
+    #     b.add_category "Not spam"
+    #
+    # WARNING: Adding categories to a trained classifier will
+    # result in an undertrained category that will tend to match
+    # more criteria than the trained selective categories. In short,
+    # try to initialize your categories at initialization.
+    def add_category(category)
+      @categories[category.prepare_category_name] = {}
+    end
+    alias append_category add_category
+  end
 end

data/lib/classifier/extensions/string.rb CHANGED Viewed

@@ -6,5 +6,5 @@ require 'fast_stemmer'
 require 'classifier/extensions/word_hash'
 class Object
-	def prepare_category_name; to_s.gsub("_"," ").capitalize.intern end
+  def prepare_category_name = to_s.gsub('_', ' ').capitalize.intern
 end

data/lib/classifier/extensions/vector.rb CHANGED Viewed

@@ -4,109 +4,103 @@
 # These are extensions to the std-lib 'matrix' to allow an all ruby SVD
 require 'matrix'
-require 'mathn'
 class Array
-  def sum(identity = 0, &block)
-    return identity unless size > 0
+  def sum_with_identity(identity = 0.0, &block)
+    return identity unless size.to_i.positive?
     if block_given?
-      map(&block).sum
+      map(&block).sum_with_identity(identity)
     else
-      reduce(:+)
+      compact.reduce(:+).to_f || identity.to_f
     end
   end
 end
-class Vector
+module VectorExtensions
   def magnitude
-    sumsqs = 0.0
-    self.size.times do |i|
-      sumsqs += self[i] ** 2.0
+    sum_of_squares = 0.to_r
+    size.times do |i|
+      sum_of_squares += self[i]**2.to_r
     end
-    Math.sqrt(sumsqs)
+    Math.sqrt(sum_of_squares.to_f)
   end
-  def normalize
-    nv = []
-    mag = self.magnitude
-    self.size.times do |i|
-      nv << (self[i] / mag)
+  def normalize
+    normalized_values = []
+    magnitude_value = magnitude.to_r
+    size.times do |i|
+      normalized_values << (self[i] / magnitude_value)
     end
-    Vector[*nv]
+    Vector[*normalized_values]
   end
 end
+class Vector
+  include VectorExtensions
+end
 class Matrix
-  def Matrix.diag(s)
-     Matrix.diagonal(*s)
+  def self.diag(diagonal_elements)
+    Matrix.diagonal(*diagonal_elements)
   end
-  alias :trans :transpose
+  alias trans transpose
-  def SV_decomp(maxSweeps = 20)
-    if self.row_size >= self.column_size
-      q = self.trans * self
-    else
-      q = self * self.trans
-    end
+  def SV_decomp(max_sweeps = 20)
+    q_matrix = if row_size >= column_size
+                 trans * self
+               else
+                 self * trans
+               end
-    qrot    = q.dup
-    v       = Matrix.identity(q.row_size)
-    azrot   = nil
-    mzrot   = nil
-    cnt     = 0
-    s_old   = nil
-    mu      = nil
+    q_rotation_matrix = q_matrix.dup
+    v_matrix = Matrix.identity(q_matrix.row_size)
+    iteration_count = 0
+    previous_s_matrix = nil
-    while true do
-      cnt += 1
-      for row in (0...qrot.row_size-1) do
-        for col in (1..qrot.row_size-1) do
+    loop do
+      iteration_count += 1
+      (0...q_rotation_matrix.row_size - 1).each do |row|
+        (1..q_rotation_matrix.row_size - 1).each do |col|
           next if row == col
-          h = Math.atan((2 * qrot[row,col])/(qrot[row,row]-qrot[col,col]))/2.0
-          hcos = Math.cos(h)
-          hsin = Math.sin(h)
-          mzrot = Matrix.identity(qrot.row_size)
-          mzrot[row,row] = hcos
-          mzrot[row,col] = -hsin
-          mzrot[col,row] = hsin
-          mzrot[col,col] = hcos
-          qrot = mzrot.trans * qrot * mzrot
-          v = v * mzrot
+          angle = Math.atan((2.to_r * q_rotation_matrix[row,
+                                                        col]) / (q_rotation_matrix[row,
+                                                                                   row] - q_rotation_matrix[col,
+                                                                                                            col])) / 2.0
+          cosine = Math.cos(angle)
+          sine = Math.sin(angle)
+          rotation_matrix = Matrix.identity(q_rotation_matrix.row_size)
+          rotation_matrix[row, row] = cosine
+          rotation_matrix[row, col] = -sine
+          rotation_matrix[col, row] = sine
+          rotation_matrix[col, col] = cosine
+          q_rotation_matrix = rotation_matrix.trans * q_rotation_matrix * rotation_matrix
+          v_matrix *= rotation_matrix
         end
       end
-      s_old = qrot.dup if cnt == 1
-      sum_qrot = 0.0
-      if cnt > 1
-        qrot.row_size.times do |r|
-          sum_qrot += (qrot[r,r]-s_old[r,r]).abs if (qrot[r,r]-s_old[r,r]).abs > 0.001
+      previous_s_matrix = q_rotation_matrix.dup if iteration_count == 1
+      sum_of_differences = 0.to_r
+      if iteration_count > 1
+        q_rotation_matrix.row_size.times do |r|
+          difference = (q_rotation_matrix[r, r] - previous_s_matrix[r, r]).abs
+          sum_of_differences += difference.to_r if difference > 0.001
         end
-        s_old = qrot.dup
+        previous_s_matrix = q_rotation_matrix.dup
       end
-      break if (sum_qrot <= 0.001 and cnt > 1) or cnt >= maxSweeps
-    end # of do while true
-    s = []
-    qrot.row_size.times do |r|
-      s << Math.sqrt(qrot[r,r])
+      break if (sum_of_differences <= 0.001 && iteration_count > 1) || iteration_count >= max_sweeps
     end
-    #puts "cnt = #{cnt}"
-    if self.row_size >= self.column_size
-      mu = self *  v * Matrix.diagonal(*s).inverse
-      return [mu, v, s]
-    else
-      puts v.row_size
-      puts v.column_size
-      puts self.row_size
-      puts self.column_size
-      puts s.size
-      mu = (self.trans * v *  Matrix.diagonal(*s).inverse)
-      return [mu, v, s]
+    singular_values = []
+    q_rotation_matrix.row_size.times do |r|
+      singular_values << Math.sqrt(q_rotation_matrix[r, r].to_f)
     end
+    u_matrix = (row_size >= column_size ? self : trans) * v_matrix * Matrix.diagonal(*singular_values).inverse
+    [u_matrix, v_matrix, singular_values]
   end
-  def []=(i,j,val)
-    @rows[i][j] = val
+  def []=(row_index, col_index, value)
+    @rows[row_index][col_index] = value
   end
 end

data/lib/classifier/extensions/vector_serialize.rb CHANGED Viewed

@@ -1,20 +1,18 @@
 module GSL
   class Vector
-    def _dump(v)
-      Marshal.dump( self.to_a )
+    def _dump(_v)
+      Marshal.dump(to_a)
     end
     def self._load(arr)
       arry = Marshal.load(arr)
-      return GSL::Vector.alloc(arry)
+      GSL::Vector.alloc(arry)
     end
   end
   class Matrix
-     class <<self
-        alias :diag :diagonal
-     end
+    class << self
+      alias diag diagonal
+    end
   end
 end