RubyGems - ai4r - Versions diffs - 1.8 → 1.9 - Mend

ai4r 1.8 → 1.9

Files changed (12) hide show

data/examples/{decision_trees/data_set.csv → classifiers/id3_data.csv} +0 -0
data/examples/{decision_trees → classifiers}/id3_example.rb +1 -1
data/examples/classifiers/naive_bayes_data.csv +11 -0
data/examples/classifiers/naive_bayes_example.rb +16 -0
data/examples/{decision_trees → classifiers}/results.txt +0 -0
data/examples/genetic_algorithm/genetic_algorithm_example.rb +1 -1
data/lib/ai4r.rb +1 -0
data/lib/ai4r/classifiers/naive_bayes.rb +259 -0
data/lib/ai4r/data/data_set.rb +63 -47
data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +3 -3
data/test/classifiers/naive_bayes_test.rb +43 -0
metadata +10 -6

data/examples/{decision_trees/data_set.csv → classifiers/id3_data.csv} RENAMED

File without changes

data/examples/{decision_trees → classifiers}/id3_example.rb RENAMED

@@ -10,7 +10,7 @@
 require File.dirname(__FILE__) + '/../../lib/ai4r/classifiers/id3'
 # Load data from id3_data.csv
-data_filename = "#{File.dirname(__FILE__)}/data_set.csv"
+data_filename = "#{File.dirname(__FILE__)}/id3_data.csv"
 data_set = Ai4r::Data::DataSet.new.load_csv_with_labels data_filename
 # Build ID3 tree

data/examples/classifiers/naive_bayes_data.csv ADDED

@@ -0,0 +1,11 @@
+"Color","Type","Origin","Stolen?"
+"Red","Sports","Domestic","Yes"
+"Red","Sports","Domestic","No"
+"Red","Sports","Domestic","Yes"
+"Yellow","Sports","Domestic","No"
+"Yellow","Sports","Imported","Yes"
+"Yellow","SUV","Imported","No"
+"Yellow","SUV","Imported","Yes"
+"Yellow","Sports","Domestic","No"
+"Red","SUV","Imported","No"
+"Red","Sports","Imported","Yes"

data/examples/classifiers/naive_bayes_example.rb ADDED

@@ -0,0 +1,16 @@
+require File.dirname(__FILE__) + '/../../lib/ai4r/classifiers/naive_bayes'
+require File.dirname(__FILE__) + '/../../lib/ai4r/data/data_set'
+require File.dirname(__FILE__) + '/../../lib/ai4r/classifiers/id3'
+require 'benchmark'
+include Ai4r::Classifiers
+include Ai4r::Data
+data_set = DataSet.new
+data_set.load_csv_with_labels File.dirname(__FILE__) + "/naive_bayes_data.csv"
+b = NaiveBayes.new.
+      set_parameters({:m=>3}).
+      build data_set
+p b.eval(["Red", "SUV", "Domestic"])
+p b.get_probability_map(["Red", "SUV", "Domestic"])

data/examples/{decision_trees → classifiers}/results.txt RENAMED

File without changes

data/examples/genetic_algorithm/genetic_algorithm_example.rb CHANGED

@@ -16,7 +16,7 @@ data_filename = "#{File.dirname(__FILE__)}/travel_cost.csv"
 data_set = Ai4r::Data::DataSet.new.load_csv_with_labels data_filename
 data_set.data_items.collect! {|column| column.collect {|element| element.to_f}}
-Ai4r::GeneticAlgorithm::Chromosome.set_cost_matrix(data_set)
+Ai4r::GeneticAlgorithm::Chromosome.set_cost_matrix(data_set.data_items)
 puts "Some random selected tours costs: "
 3.times do

data/lib/ai4r.rb CHANGED

@@ -22,6 +22,7 @@ require File.dirname(__FILE__) +  "/ai4r/classifiers/prism"
 require File.dirname(__FILE__) +  "/ai4r/classifiers/one_r"
 require File.dirname(__FILE__) +  "/ai4r/classifiers/zero_r"
 require File.dirname(__FILE__) +  "/ai4r/classifiers/hyperpipes"
+require File.dirname(__FILE__) +  "/ai4r/classifiers/naive_bayes"
 # Neural networks
 require File.dirname(__FILE__) +  "/ai4r/neural_network/backpropagation"
 require File.dirname(__FILE__) +  "/ai4r/neural_network/hopfield"

data/lib/ai4r/classifiers/naive_bayes.rb ADDED

@@ -0,0 +1,259 @@
+# Author::    Thomas Kern
+# License::   MPL 1.1
+# Project::   ai4r
+# Url::       http://ai4r.rubyforge.org/
+#
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1  as published by the
+# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
+require File.dirname(__FILE__) + '/../data/data_set'
+require File.dirname(__FILE__) + '/classifier'
+module Ai4r
+  module Classifiers
+    # = Introduction
+    #
+    # This is an implementation of a Naive Bayesian Classifier without any
+    # specialisation (ie. for text classification)
+    # Probabilities P(a_i | v_j) are estimated using m-estimates, hence the
+    # :m parameter, which is supplied through set_parameters.
+    # The estimation looks like this:
+    #   (n_c + m * p) / (n + m)
+    #
+    # the variables are:
+    # n = the number of training examples for which v = v_j
+    # n_c = number of examples for which v = v_j and a = a_i
+    # p = a priori estimate for P(a_i | v_j)
+    # m = the equivalent sample size
+    #
+    # stores the conditional probabilities in an array named @pcp and in this form:
+    # @pcp[attributes][values][classes]
+    #
+    # The m-estimate is useful when the training data set is relatively small.
+    # If the data set is big enough, set m to 0, which is also the default value.
+    #
+    #
+    # For further details regarding Bayes and Naive Bayes Classifier have a look at those websites:
+    # http://en.wikipedia.org/wiki/Naive_Bayesian_classification
+    # http://en.wikipedia.org/wiki/Bayes%27_theorem
+    #
+    #
+    # = Parameters
+    #
+    # * :m => Optional. Default value is set to 0. It may be set to a value greater than 0 when
+    # the size of the dataset is relatively small
+    #
+    # = How to use it
+    #
+    #   data = DataSet.new.load_csv_with_labels "bayes_data.csv"
+    #   b = NaiveBayes.new.
+    #     set_parameters({:m=>3}).
+    #     build data
+    #   b.eval(["Red", "SUV", "Domestic"])
+    #
+    class NaiveBayes < Classifier
+      parameters_info :m => "Default value is set to 0. It may be set to a value greater than " +
+        "0 when the size of the dataset is relatively small"
+      def initialize
+        @m = 0
+        @class_counts = []
+        @class_prob = [] # stores the probability of the classes
+        @pcc = [] # stores the number of instances divided into attribute/value/class
+        @pcp = [] # stores the conditional probabilities of the values of an attribute
+        @klass_index = {} # hashmap for quick lookup of all the used klasses and their indice
+        @values = {} # hashmap for quick lookup of all the values
+      end
+      # You can evaluate new data, predicting its category.
+      # e.g.
+      #   b.eval(["Red", "SUV", "Domestic"])
+      #     => 'No'
+      def eval(data)
+        prob = @class_prob.map {|cp| cp}
+        prob = calculate_class_probabilities_for_entry(data, prob)
+        index_to_klass(prob.index(prob.max))
+      end
+      # Calculates the probabilities for the given data entry.
+      # data has to be an array of the same dimension as the training data minus the
+      # class column.
+      # Returns a map containing all classes as keys:
+      # {Class_1 => probability, Class_2 => probability2 ... }
+      # Probability is <= 1 and of type Float.
+      # e.g.
+      #   b.get_probability_map(["Red", "SUV", "Domestic"])
+      #     => {"Yes"=>0.4166666666666667, "No"=>0.5833333333333334}
+      def get_probability_map(data)
+        prob = @class_prob.map {|cp| cp}
+        prob = calculate_class_probabilities_for_entry(data, prob)
+        prob = normalize_class_probability prob
+        probability_map = {}
+        prob.each_with_index { |p, i| probability_map[index_to_klass(i)] = p }
+        return probability_map
+      end
+      # Counts the values of the attribute instances and calculates the class
+      # probabilities and the conditional probabilities.
+      # Parameter data has to be a non-empty instance of DataSet.
+      # Returns self, so the call can be chained.
+      def build(data)
+        raise "Error instance must be passed" unless data.is_a?(DataSet)
+        raise "Data should not be empty" if data.data_items.length == 0
+        initialize_domain_data(data)
+        initialize_klass_index
+        initialize_pc
+        calculate_probabilities
+        return self
+      end
+      private
+      def initialize_domain_data(data)
+        @domains = data.build_domains
+        @data_items = data.data_items.map { |item| DataEntry.new(item[0...-1], item.last) }
+        @data_labels = data.data_labels[0...-1]
+        @klasses = @domains.last.to_a
+      end
+      # Calculates the (unnormalized) probability of every class for a data entry.
+      # For each class, the class probability passed in prob is multiplied by the
+      # conditional probability of every attribute value of the entry given that
+      # class. Attribute values that were not seen during training are skipped.
+      def calculate_class_probabilities_for_entry(data, prob)
+        prob.each_with_index do |prob_entry, prob_index|
+          data.each_with_index do |att, index|
+            next if value_index(att, index).nil?
+            prob[prob_index] *= @pcp[index][value_index(att, index)][prob_index]
+          end
+        end
+      end
+      # normalises the array of probabilities so the sum of the array equals 1
+      def normalize_class_probability(prob)
+        prob_sum = sum(prob)
+        prob_sum > 0 ?
+          prob.map {|prob_entry| prob_entry / prob_sum } :
+          prob
+      end
+      # sums an array up; returns a number of type Float
+      def sum(array)
+        array.inject(0.0){|b, i| b+i}
+      end
+      # returns the name of the class when the index is found
+      def index_to_klass(index)
+        @klass_index.has_value?(index) ? @klass_index.index(index) : nil
+      end
+      # initializes @values and @klass_index; maps each value to a unique index
+      def initialize_klass_index
+        @klasses.each_with_index do |dl, index|
+          @klass_index[dl] = index
+        end
+        @data_labels.each_with_index do |dl, index|
+          @values[index] = {}
+          @domains[index].each_with_index do |d, d_index|
+            @values[index][d] = d_index
+          end
+        end
+      end
+      # returns the index of a class
+      def klass_index(klass)
+        @klass_index[klass]
+      end
+      # returns the index of a value, depending on the attribute index
+      def value_index(value, dl_index)
+        @values[dl_index][value]
+      end
+      # builds an array of the form:
+      # array[attributes][values][classes]
+      def build_array(dl, index)
+        domains = Array.new(@domains[index].length)
+        domains.map do |p1|
+          pl = Array.new @klasses.length, 0
+        end
+      end
+      # initializes the two arrays for storing the counts and conditional probabilities of
+      # the attributes
+      def initialize_pc
+        @data_labels.each_with_index do |dl, index|
+          @pcc << build_array(dl, index)
+          @pcp << build_array(dl, index)
+        end
+      end
+      # calculates the occurrences of a class and the instances of a certain value of a
+      # certain attribute and the assigned class.
+      # In addition to that, it also calculates the conditional probabilities and values
+      def calculate_probabilities
+        @klasses.each {|dl| @class_counts[klass_index(dl)] = 0}
+        calculate_class_probabilities
+        count_instances
+        calculate_conditional_probabilities
+      end
+      def calculate_class_probabilities
+        @data_items.each do |entry|
+          @class_counts[klass_index(entry.klass)] += 1
+        end
+        @class_counts.each_with_index do |k, index|
+          @class_prob[index] = k.to_f / @data_items.length
+        end
+      end
+      # counts the instances of a certain value of a certain attribute and the assigned class
+      def count_instances
+        @data_items.each do |item|
+          @data_labels.each_with_index do |dl, dl_index|
+            @pcc[dl_index][value_index(item[dl_index], dl_index)][klass_index(item.klass)] += 1
+          end
+        end
+      end
+      # calculates the conditional probability and stores it in the @pcp-array
+      def calculate_conditional_probabilities
+        @pcc.each_with_index do |attributes, a_index|
+          attributes.each_with_index do |values, v_index|
+            values.each_with_index do |klass, k_index|
+              @pcp[a_index][v_index][k_index] = (klass.to_f + @m * @class_prob[k_index]) / (@class_counts[k_index] + @m).to_f
+            end
+          end
+        end
+      end
+      # DataEntry stores a single training instance.
+      # The attribute values are accessible via entries (or via []), and the
+      # value of the class column is stored separately in klass.
+      class DataEntry
+        attr_accessor :klass, :entries
+        def initialize(attributes, klass)
+          @klass = klass
+          @entries = attributes
+        end
+        # wrapper method for the access to @entries
+        def [](index)
+          @entries[index]
+        end
+      end
+    end
+  end
+end

data/lib/ai4r/data/data_set.rb CHANGED

@@ -13,17 +13,17 @@ require File.dirname(__FILE__) + '/statistics'
 module Ai4r
   module Data
     # A data set is a collection of N data items. Each data item is
     # described by a set of attributes, represented as an array.
     # Optionally, you can assign a label to the attributes, using
     # the data_labels property.
     class DataSet
       @@number_regex = /(((\b[0-9]+)?\.)?\b[0-9]+([eE][-+]?[0-9]+)?\b)/
-      attr_reader :data_labels, :data_items
+      attr_reader :data_labels, :data_items
       # Create a new DataSet. By default, empty.
       # Optionally, you can provide the initial data items and data labels.
       #
@@ -41,37 +41,52 @@ module Ai4r
       # Retrieve a new DataSet, with the item(s) selected by the provided
       # index. You can specify an index range, too.
       def [](index)
-        selected_items = (index.is_a?(Fixnum)) ?
-          [@data_items[index]] : @data_items[index]
-        return DataSet.new(:data_items => selected_items,
-          :data_labels =>@data_labels)
+        selected_items = (index.is_a?(Fixnum)) ?
+                [@data_items[index]] : @data_items[index]
+        return DataSet.new(:data_items => selected_items,
+                           :data_labels =>@data_labels)
       end
       # Load data items from csv file
       def load_csv(filepath)
         items = []
-        CSV::Reader.parse(File.open(filepath, 'r')) do |row|
-          items << row
+        open_csv_file(filepath) do |entry|
+          items << entry
         end
         set_data_items(items)
       end
+      # Opens a CSV file and reads it row by row.
+      # Each parsed row is passed to the given block.
+      # Works with both the Ruby 1.8 (CSV::Reader) and Ruby 1.9 (CSV.parse) APIs.
+      def open_csv_file(filepath, &block)
+        if CSV.const_defined? :Reader
+          CSV::Reader.parse(File.open(filepath, 'r')) do |row|
+            block.call row
+          end
+        else
+          CSV.parse(File.open(filepath, 'r')) do |row|
+            block.call row
+          end
+        end
+      end
       # Load data items from csv file. The first row is used as data labels.
       def load_csv_with_labels(filepath)
         load_csv(filepath)
         @data_labels = @data_items.shift
         return self
       end
       # Same as load_csv, but it will try to convert cell contents as numbers.
       def parse_csv(filepath)
         items = []
-        CSV::Reader.parse(File.open(filepath, 'r')) do |row|
+        open_csv_file(filepath) do |row|
           items << row.collect{|x| (x.match(@@number_regex)) ? x.to_f : x.data }
         end
         set_data_items(items)
       end
       # Set data labels.
       # Data labels must have the following format:
       #     [ 'city', 'age_range', 'gender', 'marketing_target'  ]
@@ -134,7 +149,7 @@ module Ai4r
       def build_domains
         @data_labels.collect {|attr_label| build_domain(attr_label) }
       end
       # Returns a Set instance containing all possible values for an attribute
       # The parameter can be an attribute label or index (0 based).
       # * Set instance containing all possible values for nominal attributes
@@ -156,12 +171,12 @@ module Ai4r
           return @data_items.inject(Set.new){|domain, x| domain << x[index]}
         end
       end
       # Returns attributes number, including class attribute
       def num_attributes
         return (@data_items.empty?) ? 0 : @data_items.first.size
       end
       # Returns the index of a given attribute (0-based).
       # For example, if "gender" is the third attribute, then:
       #   get_index("gender")
@@ -169,82 +184,83 @@ module Ai4r
       def get_index(attr)
         return (attr.is_a?(Fixnum) || attr.is_a?(Range)) ? attr : @data_labels.index(attr)
       end
       # Raise an exception if there is no data item.
       def check_not_empty
         if @data_items.empty?
-          raise ArgumentError,"Examples data set must not be empty."
+          raise ArgumentError, "Examples data set must not be empty."
         end
       end
       # Add a data item to the data set
       def << data_item
         if data_item.nil? || !data_item.is_a?(Enumerable) || data_item.empty?
-          raise ArgumentError,"Data must not be an non empty array."
+          raise ArgumentError, "Data must not be an non empty array."
         elsif @data_items.empty?
           set_data_items([data_item])
         elsif data_item.length != num_attributes
-          raise ArgumentError,"Number of attributes do not match. " +
-            "#{data_item.length} attributes provided, " +
-            "#{num_attributes} attributes expected."
-        else
+          raise ArgumentError, "Number of attributes do not match. " +
+                  "#{data_item.length} attributes provided, " +
+                  "#{num_attributes} attributes expected."
+        else
           @data_items << data_item
         end
       end
       # Returns an array with the mean value of numeric attributes, and
       # the most frequent value of non numeric attributes
       def get_mean_or_mode
         mean = []
-        num_attributes.times do |i|
-          mean[i] =
-            if @data_items.first[i].is_a?(Numeric)
-              Statistics.mean(self, i)
-            else
-              Statistics.mode(self, i)
-            end
+        num_attributes.times do |i|
+          mean[i] =
+                  if @data_items.first[i].is_a?(Numeric)
+                    Statistics.mean(self, i)
+                  else
+                    Statistics.mode(self, i)
+                  end
         end
         return mean
       end
       protected
       def check_data_items(data_items)
         if !data_items || data_items.empty?
-          raise ArgumentError,"Examples data set must not be empty."
+          raise ArgumentError, "Examples data set must not be empty."
         elsif !data_items.first.is_a?(Enumerable)
-          raise ArgumentError,"Unkown format for example data."
+          raise ArgumentError, "Unkown format for example data."
         end
         attributes_num = data_items.first.length
         data_items.each_index do |index|
           if data_items[index].length != attributes_num
             raise ArgumentError,
-                "Quantity of attributes is inconsistent. " +
-                "The first item has #{attributes_num} attributes "+
-                "and row #{index} has #{data_items[index].length} attributes"
+                  "Quantity of attributes is inconsistent. " +
+                          "The first item has #{attributes_num} attributes "+
+                          "and row #{index} has #{data_items[index].length} attributes"
           end
         end
       end
       def check_data_labels(labels)
         if !@data_items.empty?
           if labels.length != @data_items.first.length
             raise ArgumentError,
-              "Number of labels and attributes do not match. " +
-              "#{labels.length} labels and " +
-              "#{@data_items.first.length} attributes found."
+                  "Number of labels and attributes do not match. " +
+                          "#{labels.length} labels and " +
+                          "#{@data_items.first.length} attributes found."
           end
         end
       end
       def default_data_labels(data_items)
         data_labels = []
         data_items[0][0..-2].each_index do |i|
-          data_labels[i] = "attribute_#{i+1}"
+          data_labels[i] = "attribute_#{i+1}"
         end
         data_labels[data_labels.length]="class_value"
         return data_labels
       end
     end
   end
 end

data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb CHANGED

@@ -178,7 +178,7 @@ module Ai4r
         last_token = @data[0]
         cost = 0
         @data[1..-1].each do |token|
-          cost += @@costs.data_items[last_token][token]
+          cost += @@costs[last_token][token]
           last_token = token
         end
         @fitness = -1 * cost
@@ -220,7 +220,7 @@ module Ai4r
       # In this case, we have implemented edge recombination, which is the
       # most used reproduction algorithm for the Travelling salesman problem.
       def self.reproduce(a, b)
-        data_size = @@costs.data_items[0].length
+        data_size = @@costs[0].length
         available = []
         0.upto(data_size-1) { |n| available << n }
         token = a.data[0]
@@ -249,7 +249,7 @@ module Ai4r
       # use some problem domain knowledge, to generate a
       # (probably) better initial solution.
       def self.seed
-        data_size = @@costs.data_items[0].length
+        data_size = @@costs[0].length
         available = []
         0.upto(data_size-1) { |n| available << n }
         seed = []

data/test/classifiers/naive_bayes_test.rb ADDED

@@ -0,0 +1,43 @@
+require File.dirname(__FILE__) + '/../../lib/ai4r/classifiers/naive_bayes'
+require File.dirname(__FILE__) + '/../../lib/ai4r/data/data_set'
+require 'test/unit'
+include Ai4r::Classifiers
+include Ai4r::Data
+class NaiveBayesTest < Test::Unit::TestCase
+  @@data_labels = [ "Color","Type","Origin","Stolen?" ]
+  @@data_items = [
+              ["Red",   "Sports", "Domestic", "Yes"],
+              ["Red",   "Sports", "Domestic", "No"],
+              ["Red",   "Sports", "Domestic", "Yes"],
+              ["Yellow","Sports", "Domestic", "No"],
+              ["Yellow","Sports", "Imported", "Yes"],
+              ["Yellow","SUV",    "Imported", "No"],
+              ["Yellow","SUV",    "Imported", "Yes"],
+              ["Yellow","Sports", "Domestic", "No"],
+              ["Red",   "SUV",    "Imported", "No"],
+              ["Red",   "Sports", "Imported", "Yes"]
+            ]
+  def setup
+    @data_set = DataSet.new
+    @data_set = DataSet.new(:data_items => @@data_items, :data_labels => @@data_labels)
+    @b = NaiveBayes.new.set_parameters({:m=>3}).build @data_set
+  end
+  def test_eval
+    result = @b.eval(["Red", "SUV", "Domestic"])
+    assert_equal "No", result
+  end
+  def test_get_probability_map
+    map = @b.get_probability_map(["Red", "SUV", "Domestic"])
+    assert_equal 2, map.keys.length
+    assert_in_delta 0.42, map["Yes"], 0.1
+    assert_in_delta 0.58, map["No"], 0.1
+  end
+end

metadata CHANGED

@@ -3,8 +3,8 @@ rubygems_version: 0.9.4
 specification_version: 1
 name: ai4r
 version: !ruby/object:Gem::Version
-  version: "1.8"
-date: 2009-06-15 00:00:00 +01:00
+  version: "1.9"
+date: 2009-07-01 00:00:00 +01:00
 summary: Ruby implementations of algorithms covering several Artificial intelligence fields, including Genetic algorithms, Neural Networks, machine learning, and clustering.
 require_paths:
 - lib
@@ -29,11 +29,13 @@ post_install_message:
 authors:
 - Sergio Fierens
 files:
+- examples/classifiers
+- examples/classifiers/id3_data.csv
+- examples/classifiers/id3_example.rb
+- examples/classifiers/naive_bayes_data.csv
+- examples/classifiers/naive_bayes_example.rb
+- examples/classifiers/results.txt
 - examples/clusterers
-- examples/decision_trees
-- examples/decision_trees/data_set.csv
-- examples/decision_trees/id3_example.rb
-- examples/decision_trees/results.txt
 - examples/genetic_algorithm
 - examples/genetic_algorithm/genetic_algorithm_example.rb
 - examples/genetic_algorithm/travel_cost.csv
@@ -53,6 +55,7 @@ files:
 - lib/ai4r/classifiers/hyperpipes.rb
 - lib/ai4r/classifiers/id3.rb
 - lib/ai4r/classifiers/multilayer_perceptron.rb
+- lib/ai4r/classifiers/naive_bayes.rb
 - lib/ai4r/classifiers/one_r.rb
 - lib/ai4r/classifiers/prism.rb
 - lib/ai4r/classifiers/zero_r.rb
@@ -91,6 +94,7 @@ test_files:
 - test/classifiers/hyperpipes_test.rb
 - test/classifiers/id3_test.rb
 - test/classifiers/multilayer_perceptron_test.rb
+- test/classifiers/naive_bayes_test.rb
 - test/classifiers/one_r_test.rb
 - test/classifiers/prism_test.rb
 - test/classifiers/zero_r_test.rb