ai4r 1.8 → 1.9
Sign up to get free protection for your applications and to get access to all the features.
- data/examples/{decision_trees/data_set.csv → classifiers/id3_data.csv} +0 -0
- data/examples/{decision_trees → classifiers}/id3_example.rb +1 -1
- data/examples/classifiers/naive_bayes_data.csv +11 -0
- data/examples/classifiers/naive_bayes_example.rb +16 -0
- data/examples/{decision_trees → classifiers}/results.txt +0 -0
- data/examples/genetic_algorithm/genetic_algorithm_example.rb +1 -1
- data/lib/ai4r.rb +1 -0
- data/lib/ai4r/classifiers/naive_bayes.rb +259 -0
- data/lib/ai4r/data/data_set.rb +63 -47
- data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +3 -3
- data/test/classifiers/naive_bayes_test.rb +43 -0
- metadata +10 -6
File without changes
|
@@ -10,7 +10,7 @@
|
|
10
10
|
require File.dirname(__FILE__) + '/../../lib/ai4r/classifiers/id3'
|
11
11
|
|
12
12
|
# Load data from id3_data.csv
|
13
|
-
data_filename = "#{File.dirname(__FILE__)}/
|
13
|
+
data_filename = "#{File.dirname(__FILE__)}/id3_data.csv"
|
14
14
|
data_set = Ai4r::Data::DataSet.new.load_csv_with_labels data_filename
|
15
15
|
|
16
16
|
# Build ID3 tree
|
@@ -0,0 +1,11 @@
|
|
1
|
+
"Color","Type","Origin","Stolen?"
|
2
|
+
"Red","Sports","Domestic","Yes"
|
3
|
+
"Red","Sports","Domestic","No"
|
4
|
+
"Red","Sports","Domestic","Yes"
|
5
|
+
"Yellow","Sports","Domestic","No"
|
6
|
+
"Yellow","Sports","Imported","Yes"
|
7
|
+
"Yellow","SUV","Imported","No"
|
8
|
+
"Yellow","SUV","Imported","Yes"
|
9
|
+
"Yellow","Sports","Domestic","No"
|
10
|
+
"Red","SUV","Imported","No"
|
11
|
+
"Red","Sports","Imported","Yes"
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../../lib/ai4r/classifiers/naive_bayes'
|
2
|
+
require File.dirname(__FILE__) + '/../../lib/ai4r/data/data_set'
|
3
|
+
require File.dirname(__FILE__) + '/../../lib/ai4r/classifiers/id3'
|
4
|
+
require 'benchmark'
|
5
|
+
|
6
|
+
include Ai4r::Classifiers
|
7
|
+
include Ai4r::Data
|
8
|
+
|
9
|
+
data_set = DataSet.new
|
10
|
+
data_set.load_csv_with_labels File.dirname(__FILE__) + "/naive_bayes_data.csv"
|
11
|
+
|
12
|
+
b = NaiveBayes.new.
|
13
|
+
set_parameters({:m=>3}).
|
14
|
+
build data_set
|
15
|
+
p b.eval(["Red", "SUV", "Domestic"])
|
16
|
+
p b.get_probability_map(["Red", "SUV", "Domestic"])
|
File without changes
|
@@ -16,7 +16,7 @@ data_filename = "#{File.dirname(__FILE__)}/travel_cost.csv"
|
|
16
16
|
data_set = Ai4r::Data::DataSet.new.load_csv_with_labels data_filename
|
17
17
|
data_set.data_items.collect! {|column| column.collect {|element| element.to_f}}
|
18
18
|
|
19
|
-
Ai4r::GeneticAlgorithm::Chromosome.set_cost_matrix(data_set)
|
19
|
+
Ai4r::GeneticAlgorithm::Chromosome.set_cost_matrix(data_set.data_items)
|
20
20
|
|
21
21
|
puts "Some random selected tours costs: "
|
22
22
|
3.times do
|
data/lib/ai4r.rb
CHANGED
@@ -22,6 +22,7 @@ require File.dirname(__FILE__) + "/ai4r/classifiers/prism"
|
|
22
22
|
require File.dirname(__FILE__) + "/ai4r/classifiers/one_r"
|
23
23
|
require File.dirname(__FILE__) + "/ai4r/classifiers/zero_r"
|
24
24
|
require File.dirname(__FILE__) + "/ai4r/classifiers/hyperpipes"
|
25
|
+
require File.dirname(__FILE__) + "/ai4r/classifiers/naive_bayes"
|
25
26
|
# Neural networks
|
26
27
|
require File.dirname(__FILE__) + "/ai4r/neural_network/backpropagation"
|
27
28
|
require File.dirname(__FILE__) + "/ai4r/neural_network/hopfield"
|
@@ -0,0 +1,259 @@
|
|
1
|
+
# Author:: Thomas Kern
|
2
|
+
# License:: MPL 1.1
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
5
|
+
#
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
|
+
|
10
|
+
require File.dirname(__FILE__) + '/../data/data_set'
|
11
|
+
require File.dirname(__FILE__) + '/classifier'
|
12
|
+
|
13
|
+
module Ai4r
|
14
|
+
module Classifiers
|
15
|
+
|
16
|
+
|
17
|
+
# = Introduction
|
18
|
+
#
|
19
|
+
# This is an implementation of a Naive Bayesian Classifier without any
|
20
|
+
# specialisation (ie. for text classification)
|
21
|
+
# Probabilities P(a_i | v_j) are estimated using m-estimates, hence the
|
22
|
+
# m parameter, which is set through set_parameters.
|
23
|
+
# The estimation looks like this:
|
24
|
+
#(n_c + mp) / (n + m)
|
25
|
+
#
|
26
|
+
# the variables are:
|
27
|
+
# n = the number of training examples for which v = v_j
|
28
|
+
# n_c = number of examples for which v = v_j and a = a_i
|
29
|
+
# p = a priori estimate for P(a_i | v_j)
|
30
|
+
# m = the equivalent sample size
|
31
|
+
#
|
32
|
+
# stores the conditional probabilities in an array named @pcp and in this form:
|
33
|
+
# @pcp[attributes][values][classes]
|
34
|
+
#
|
35
|
+
# This kind of estimator is useful when the training data set is relatively small.
|
36
|
+
# If the data set is big enough, set m to 0, which is also the default value
|
37
|
+
#
|
38
|
+
#
|
39
|
+
# For further details regarding Bayes and Naive Bayes Classifier have a look at those websites:
|
40
|
+
# http://en.wikipedia.org/wiki/Naive_Bayesian_classification
|
41
|
+
# http://en.wikipedia.org/wiki/Bayes%27_theorem
|
42
|
+
#
|
43
|
+
#
|
44
|
+
# = Parameters
|
45
|
+
#
|
46
|
+
# * :m => Optional. Default value is set to 0. It may be set to a value greater than 0 when
|
47
|
+
# the size of the dataset is relatively small
|
48
|
+
#
|
49
|
+
# = How to use it
|
50
|
+
#
|
51
|
+
# data = DataSet.new.load_csv_with_labels "bayes_data.csv"
|
52
|
+
# b = NaiveBayes.new.
|
53
|
+
# set_parameters({:m=>3}).
|
54
|
+
# build data
|
55
|
+
# b.eval(["Red", "SUV", "Domestic"])
|
56
|
+
#
|
57
|
+
class NaiveBayes < Classifier
|
58
|
+
|
59
|
+
parameters_info :m => "Default value is set to 0. It may be set to a value greater than " +
|
60
|
+
"0 when the size of the dataset is relatively small"
|
61
|
+
|
62
|
+
def initialize
|
63
|
+
@m = 0
|
64
|
+
@class_counts = []
|
65
|
+
@class_prob = [] # stores the probability of the classes
|
66
|
+
@pcc = [] # stores the number of instances divided into attribute/value/class
|
67
|
+
@pcp = [] # stores the conditional probabilities of the values of an attribute
|
68
|
+
@klass_index = {} # hashmap for quick lookup of all the used klasses and their indice
|
69
|
+
@values = {} # hashmap for quick lookup of all the values
|
70
|
+
end
|
71
|
+
|
72
|
+
# You can evaluate new data, predicting its category.
|
73
|
+
# e.g.
|
74
|
+
# b.eval(["Red", "SUV", "Domestic"])
|
75
|
+
# => 'No'
|
76
|
+
def eval(data)
|
77
|
+
prob = @class_prob.map {|cp| cp}
|
78
|
+
prob = calculate_class_probabilities_for_entry(data, prob)
|
79
|
+
index_to_klass(prob.index(prob.max))
|
80
|
+
end
|
81
|
+
|
82
|
+
# Calculates the probabilities for the data entry Data.
|
83
|
+
# data has to be an array of the same dimension as the training data minus the
|
84
|
+
# class column.
|
85
|
+
# Returns a map containing all classes as keys:
|
86
|
+
# {Class_1 => probability, Class_2 => probability2 ... }
|
87
|
+
# Probability is <= 1 and of type Float.
|
88
|
+
# e.g.
|
89
|
+
# b.get_probability_map(["Red", "SUV", "Domestic"])
|
90
|
+
# => {"Yes"=>0.4166666666666667, "No"=>0.5833333333333334}
|
91
|
+
def get_probability_map(data)
|
92
|
+
prob = @class_prob.map {|cp| cp}
|
93
|
+
prob = calculate_class_probabilities_for_entry(data, prob)
|
94
|
+
prob = normalize_class_probability prob
|
95
|
+
probability_map = {}
|
96
|
+
prob.each_with_index { |p, i| probability_map[index_to_klass(i)] = p }
|
97
|
+
return probability_map
|
98
|
+
end
|
99
|
+
|
100
|
+
# counts values of the attribute instances and calculates the probability of the classes
|
101
|
+
# and the conditional probabilities
|
102
|
+
# Parameter data has to be an instance of DataSet
|
103
|
+
def build(data)
|
104
|
+
raise "Error instance must be passed" unless data.is_a?(DataSet)
|
105
|
+
raise "Data should not be empty" if data.data_items.length == 0
|
106
|
+
|
107
|
+
initialize_domain_data(data)
|
108
|
+
initialize_klass_index
|
109
|
+
initialize_pc
|
110
|
+
calculate_probabilities
|
111
|
+
|
112
|
+
return self
|
113
|
+
end
|
114
|
+
|
115
|
+
private
|
116
|
+
|
117
|
+
def initialize_domain_data(data)
|
118
|
+
@domains = data.build_domains
|
119
|
+
@data_items = data.data_items.map { |item| DataEntry.new(item[0...-1], item.last) }
|
120
|
+
@data_labels = data.data_labels[0...-1]
|
121
|
+
@klasses = @domains.last.to_a
|
122
|
+
end
|
123
|
+
|
124
|
+
|
125
|
+
# calculates the klass probability of a data entry
|
126
|
+
# as usual, the probability of the value is multiplied with every conditional
|
127
|
+
# probability of every attribute in condition to a specific class
|
128
|
+
# this is repeated for every class
|
129
|
+
def calculate_class_probabilities_for_entry(data, prob)
|
130
|
+
prob.each_with_index do |prob_entry, prob_index|
|
131
|
+
data.each_with_index do |att, index|
|
132
|
+
next if value_index(att, index).nil?
|
133
|
+
prob[prob_index] *= @pcp[index][value_index(att, index)][prob_index]
|
134
|
+
end
|
135
|
+
end
|
136
|
+
end
|
137
|
+
|
138
|
+
# normalises the array of probabilities so the sum of the array equals 1
|
139
|
+
def normalize_class_probability(prob)
|
140
|
+
prob_sum = sum(prob)
|
141
|
+
prob_sum > 0 ?
|
142
|
+
prob.map {|prob_entry| prob_entry / prob_sum } :
|
143
|
+
prob
|
144
|
+
end
|
145
|
+
|
146
|
+
# sums an array up; returns a number of type Float
|
147
|
+
def sum(array)
|
148
|
+
array.inject(0.0){|b, i| b+i}
|
149
|
+
end
|
150
|
+
|
151
|
+
# returns the name of the class when the index is found
|
152
|
+
def index_to_klass(index)
|
153
|
+
@klass_index.has_value?(index) ? @klass_index.index(index) : nil
|
154
|
+
end
|
155
|
+
|
156
|
+
# initializes @values and @klass_index; maps a certain value to a unique index
|
157
|
+
def initialize_klass_index
|
158
|
+
@klasses.each_with_index do |dl, index|
|
159
|
+
@klass_index[dl] = index
|
160
|
+
end
|
161
|
+
|
162
|
+
@data_labels.each_with_index do |dl, index|
|
163
|
+
@values[index] = {}
|
164
|
+
@domains[index].each_with_index do |d, d_index|
|
165
|
+
@values[index][d] = d_index
|
166
|
+
end
|
167
|
+
end
|
168
|
+
end
|
169
|
+
|
170
|
+
# returns the index of a class
|
171
|
+
def klass_index(klass)
|
172
|
+
@klass_index[klass]
|
173
|
+
end
|
174
|
+
|
175
|
+
# returns the index of a value, depending on the attribute index
|
176
|
+
def value_index(value, dl_index)
|
177
|
+
@values[dl_index][value]
|
178
|
+
end
|
179
|
+
|
180
|
+
# builds an array of the form:
|
181
|
+
# array[attributes][values][classes]
|
182
|
+
def build_array(dl, index)
|
183
|
+
domains = Array.new(@domains[index].length)
|
184
|
+
domains.map do |p1|
|
185
|
+
pl = Array.new @klasses.length, 0
|
186
|
+
end
|
187
|
+
end
|
188
|
+
|
189
|
+
# initializes the two arrays for storing the counts and conditional probabilities of
|
190
|
+
# the attributes
|
191
|
+
def initialize_pc
|
192
|
+
@data_labels.each_with_index do |dl, index|
|
193
|
+
@pcc << build_array(dl, index)
|
194
|
+
@pcp << build_array(dl, index)
|
195
|
+
end
|
196
|
+
end
|
197
|
+
|
198
|
+
# calculates the occurrences of a class and the instances of a certain value of a
|
199
|
+
# certain attribute and the assigned class.
|
200
|
+
# In addition to that, it also calculates the conditional probabilities and values
|
201
|
+
def calculate_probabilities
|
202
|
+
@klasses.each {|dl| @class_counts[klass_index(dl)] = 0}
|
203
|
+
|
204
|
+
calculate_class_probabilities
|
205
|
+
count_instances
|
206
|
+
calculate_conditional_probabilities
|
207
|
+
end
|
208
|
+
|
209
|
+
def calculate_class_probabilities
|
210
|
+
@data_items.each do |entry|
|
211
|
+
@class_counts[klass_index(entry.klass)] += 1
|
212
|
+
end
|
213
|
+
|
214
|
+
@class_counts.each_with_index do |k, index|
|
215
|
+
@class_prob[index] = k.to_f / @data_items.length
|
216
|
+
end
|
217
|
+
end
|
218
|
+
|
219
|
+
# counts the instances of a certain value of a certain attribute and the assigned class
|
220
|
+
def count_instances
|
221
|
+
@data_items.each do |item|
|
222
|
+
@data_labels.each_with_index do |dl, dl_index|
|
223
|
+
@pcc[dl_index][value_index(item[dl_index], dl_index)][klass_index(item.klass)] += 1
|
224
|
+
end
|
225
|
+
end
|
226
|
+
end
|
227
|
+
|
228
|
+
# calculates the conditional probability and stores it in the @pcp-array
|
229
|
+
def calculate_conditional_probabilities
|
230
|
+
@pcc.each_with_index do |attributes, a_index|
|
231
|
+
attributes.each_with_index do |values, v_index|
|
232
|
+
values.each_with_index do |klass, k_index|
|
233
|
+
@pcp[a_index][v_index][k_index] = (klass.to_f + @m * @class_prob[k_index]) / (@class_counts[k_index] + @m).to_f
|
234
|
+
end
|
235
|
+
end
|
236
|
+
end
|
237
|
+
end
|
238
|
+
|
239
|
+
#DataEntry stores the instance of the data entry
|
240
|
+
#the data is accessible via entries
|
241
|
+
#stores the class-column in the attribute klass and
|
242
|
+
#removes the column for the class-entry
|
243
|
+
class DataEntry
|
244
|
+
attr_accessor :klass, :entries
|
245
|
+
|
246
|
+
def initialize(attributes, klass)
|
247
|
+
@klass = klass
|
248
|
+
@entries = attributes
|
249
|
+
end
|
250
|
+
|
251
|
+
# wrapper method for the access to @entries
|
252
|
+
def [](index)
|
253
|
+
@entries[index]
|
254
|
+
end
|
255
|
+
end
|
256
|
+
|
257
|
+
end
|
258
|
+
end
|
259
|
+
end
|
data/lib/ai4r/data/data_set.rb
CHANGED
@@ -13,17 +13,17 @@ require File.dirname(__FILE__) + '/statistics'
|
|
13
13
|
|
14
14
|
module Ai4r
|
15
15
|
module Data
|
16
|
-
|
16
|
+
|
17
17
|
# A data set is a collection of N data items. Each data item is
|
18
18
|
# described by a set of attributes, represented as an array.
|
19
19
|
# Optionally, you can assign a label to the attributes, using
|
20
20
|
# the data_labels property.
|
21
21
|
class DataSet
|
22
|
-
|
22
|
+
|
23
23
|
@@number_regex = /(((\b[0-9]+)?\.)?\b[0-9]+([eE][-+]?[0-9]+)?\b)/
|
24
|
-
|
25
|
-
attr_reader :data_labels, :data_items
|
26
|
-
|
24
|
+
|
25
|
+
attr_reader :data_labels, :data_items
|
26
|
+
|
27
27
|
# Create a new DataSet. By default, empty.
|
28
28
|
# Optionally, you can provide the initial data items and data labels.
|
29
29
|
#
|
@@ -41,37 +41,52 @@ module Ai4r
|
|
41
41
|
# Retrieve a new DataSet, with the item(s) selected by the provided
|
42
42
|
# index. You can specify an index range, too.
|
43
43
|
def [](index)
|
44
|
-
selected_items = (index.is_a?(Fixnum)) ?
|
45
|
-
|
46
|
-
return DataSet.new(:data_items => selected_items,
|
47
|
-
|
44
|
+
selected_items = (index.is_a?(Fixnum)) ?
|
45
|
+
[@data_items[index]] : @data_items[index]
|
46
|
+
return DataSet.new(:data_items => selected_items,
|
47
|
+
:data_labels =>@data_labels)
|
48
48
|
end
|
49
|
-
|
49
|
+
|
50
50
|
# Load data items from csv file
|
51
51
|
def load_csv(filepath)
|
52
52
|
items = []
|
53
|
-
|
54
|
-
items <<
|
53
|
+
open_csv_file(filepath) do |entry|
|
54
|
+
items << entry
|
55
55
|
end
|
56
56
|
set_data_items(items)
|
57
57
|
end
|
58
|
-
|
58
|
+
|
59
|
+
# opens a csv-file and reads it line by line
|
60
|
+
# for each line, a block is called and the row is passed to the block
|
61
|
+
# ruby1.8 and 1.9 safe
|
62
|
+
def open_csv_file(filepath, &block)
|
63
|
+
if CSV.const_defined? :Reader
|
64
|
+
CSV::Reader.parse(File.open(filepath, 'r')) do |row|
|
65
|
+
block.call row
|
66
|
+
end
|
67
|
+
else
|
68
|
+
CSV.parse(File.open(filepath, 'r')) do |row|
|
69
|
+
block.call row
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
59
74
|
# Load data items from csv file. The first row is used as data labels.
|
60
75
|
def load_csv_with_labels(filepath)
|
61
76
|
load_csv(filepath)
|
62
77
|
@data_labels = @data_items.shift
|
63
78
|
return self
|
64
79
|
end
|
65
|
-
|
80
|
+
|
66
81
|
# Same as load_csv, but it will try to convert cell contents as numbers.
|
67
82
|
def parse_csv(filepath)
|
68
83
|
items = []
|
69
|
-
|
84
|
+
open_csv_file(filepath) do |row|
|
70
85
|
items << row.collect{|x| (x.match(@@number_regex)) ? x.to_f : x.data }
|
71
86
|
end
|
72
87
|
set_data_items(items)
|
73
88
|
end
|
74
|
-
|
89
|
+
|
75
90
|
# Set data labels.
|
76
91
|
# Data labels must have the following format:
|
77
92
|
# [ 'city', 'age_range', 'gender', 'marketing_target' ]
|
@@ -134,7 +149,7 @@ module Ai4r
|
|
134
149
|
def build_domains
|
135
150
|
@data_labels.collect {|attr_label| build_domain(attr_label) }
|
136
151
|
end
|
137
|
-
|
152
|
+
|
138
153
|
# Returns a Set instance containing all possible values for an attribute
|
139
154
|
# The parameter can be an attribute label or index (0 based).
|
140
155
|
# * Set instance containing all possible values for nominal attributes
|
@@ -156,12 +171,12 @@ module Ai4r
|
|
156
171
|
return @data_items.inject(Set.new){|domain, x| domain << x[index]}
|
157
172
|
end
|
158
173
|
end
|
159
|
-
|
174
|
+
|
160
175
|
# Returns attributes number, including class attribute
|
161
176
|
def num_attributes
|
162
177
|
return (@data_items.empty?) ? 0 : @data_items.first.size
|
163
178
|
end
|
164
|
-
|
179
|
+
|
165
180
|
# Returns the index of a given attribute (0-based).
|
166
181
|
# For example, if "gender" is the third attribute, then:
|
167
182
|
# get_index("gender")
|
@@ -169,82 +184,83 @@ module Ai4r
|
|
169
184
|
def get_index(attr)
|
170
185
|
return (attr.is_a?(Fixnum) || attr.is_a?(Range)) ? attr : @data_labels.index(attr)
|
171
186
|
end
|
172
|
-
|
187
|
+
|
173
188
|
# Raise an exception if there is no data item.
|
174
189
|
def check_not_empty
|
175
190
|
if @data_items.empty?
|
176
|
-
raise ArgumentError,"Examples data set must not be empty."
|
191
|
+
raise ArgumentError, "Examples data set must not be empty."
|
177
192
|
end
|
178
193
|
end
|
179
194
|
|
180
195
|
# Add a data item to the data set
|
181
196
|
def << data_item
|
182
197
|
if data_item.nil? || !data_item.is_a?(Enumerable) || data_item.empty?
|
183
|
-
raise ArgumentError,"Data must not be an non empty array."
|
198
|
+
raise ArgumentError, "Data must not be an non empty array."
|
184
199
|
elsif @data_items.empty?
|
185
200
|
set_data_items([data_item])
|
186
201
|
elsif data_item.length != num_attributes
|
187
|
-
raise ArgumentError,"Number of attributes do not match. " +
|
188
|
-
|
189
|
-
|
190
|
-
else
|
202
|
+
raise ArgumentError, "Number of attributes do not match. " +
|
203
|
+
"#{data_item.length} attributes provided, " +
|
204
|
+
"#{num_attributes} attributes expected."
|
205
|
+
else
|
191
206
|
@data_items << data_item
|
192
207
|
end
|
193
208
|
end
|
194
|
-
|
209
|
+
|
195
210
|
# Returns an array with the mean value of numeric attributes, and
|
196
211
|
# the most frequent value of non numeric attributes
|
197
212
|
def get_mean_or_mode
|
198
213
|
mean = []
|
199
|
-
num_attributes.times do |i|
|
200
|
-
mean[i] =
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
214
|
+
num_attributes.times do |i|
|
215
|
+
mean[i] =
|
216
|
+
if @data_items.first[i].is_a?(Numeric)
|
217
|
+
Statistics.mean(self, i)
|
218
|
+
else
|
219
|
+
Statistics.mode(self, i)
|
220
|
+
end
|
206
221
|
end
|
207
222
|
return mean
|
208
223
|
end
|
209
|
-
|
224
|
+
|
210
225
|
protected
|
226
|
+
|
211
227
|
def check_data_items(data_items)
|
212
228
|
if !data_items || data_items.empty?
|
213
|
-
raise ArgumentError,"Examples data set must not be empty."
|
229
|
+
raise ArgumentError, "Examples data set must not be empty."
|
214
230
|
elsif !data_items.first.is_a?(Enumerable)
|
215
|
-
raise ArgumentError,"Unkown format for example data."
|
231
|
+
raise ArgumentError, "Unkown format for example data."
|
216
232
|
end
|
217
233
|
attributes_num = data_items.first.length
|
218
234
|
data_items.each_index do |index|
|
219
235
|
if data_items[index].length != attributes_num
|
220
236
|
raise ArgumentError,
|
221
|
-
|
222
|
-
|
223
|
-
|
237
|
+
"Quantity of attributes is inconsistent. " +
|
238
|
+
"The first item has #{attributes_num} attributes "+
|
239
|
+
"and row #{index} has #{data_items[index].length} attributes"
|
224
240
|
end
|
225
241
|
end
|
226
242
|
end
|
227
|
-
|
243
|
+
|
228
244
|
def check_data_labels(labels)
|
229
245
|
if !@data_items.empty?
|
230
246
|
if labels.length != @data_items.first.length
|
231
247
|
raise ArgumentError,
|
232
|
-
|
233
|
-
|
234
|
-
|
248
|
+
"Number of labels and attributes do not match. " +
|
249
|
+
"#{labels.length} labels and " +
|
250
|
+
"#{@data_items.first.length} attributes found."
|
235
251
|
end
|
236
252
|
end
|
237
253
|
end
|
238
|
-
|
254
|
+
|
239
255
|
def default_data_labels(data_items)
|
240
256
|
data_labels = []
|
241
257
|
data_items[0][0..-2].each_index do |i|
|
242
|
-
data_labels[i] = "attribute_#{i+1}"
|
258
|
+
data_labels[i] = "attribute_#{i+1}"
|
243
259
|
end
|
244
260
|
data_labels[data_labels.length]="class_value"
|
245
261
|
return data_labels
|
246
262
|
end
|
247
|
-
|
263
|
+
|
248
264
|
end
|
249
265
|
end
|
250
266
|
end
|
@@ -178,7 +178,7 @@ module Ai4r
|
|
178
178
|
last_token = @data[0]
|
179
179
|
cost = 0
|
180
180
|
@data[1..-1].each do |token|
|
181
|
-
cost += @@costs
|
181
|
+
cost += @@costs[last_token][token]
|
182
182
|
last_token = token
|
183
183
|
end
|
184
184
|
@fitness = -1 * cost
|
@@ -220,7 +220,7 @@ module Ai4r
|
|
220
220
|
# In this case, we have implemented edge recombination, which is the
|
221
221
|
# most used reproduction algorithm for the Travelling salesman problem.
|
222
222
|
def self.reproduce(a, b)
|
223
|
-
data_size = @@costs
|
223
|
+
data_size = @@costs[0].length
|
224
224
|
available = []
|
225
225
|
0.upto(data_size-1) { |n| available << n }
|
226
226
|
token = a.data[0]
|
@@ -249,7 +249,7 @@ module Ai4r
|
|
249
249
|
# use some problem domain knowledge, to generate a
|
250
250
|
# (probably) better initial solution.
|
251
251
|
def self.seed
|
252
|
-
data_size = @@costs
|
252
|
+
data_size = @@costs[0].length
|
253
253
|
available = []
|
254
254
|
0.upto(data_size-1) { |n| available << n }
|
255
255
|
seed = []
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../../lib/ai4r/classifiers/naive_bayes'
|
2
|
+
require File.dirname(__FILE__) + '/../../lib/ai4r/data/data_set'
|
3
|
+
require 'test/unit'
|
4
|
+
|
5
|
+
include Ai4r::Classifiers
|
6
|
+
include Ai4r::Data
|
7
|
+
|
8
|
+
class NaiveBayesTest < Test::Unit::TestCase
|
9
|
+
|
10
|
+
@@data_labels = [ "Color","Type","Origin","Stolen?" ]
|
11
|
+
|
12
|
+
@@data_items = [
|
13
|
+
["Red", "Sports", "Domestic", "Yes"],
|
14
|
+
["Red", "Sports", "Domestic", "No"],
|
15
|
+
["Red", "Sports", "Domestic", "Yes"],
|
16
|
+
["Yellow","Sports", "Domestic", "No"],
|
17
|
+
["Yellow","Sports", "Imported", "Yes"],
|
18
|
+
["Yellow","SUV", "Imported", "No"],
|
19
|
+
["Yellow","SUV", "Imported", "Yes"],
|
20
|
+
["Yellow","Sports", "Domestic", "No"],
|
21
|
+
["Red", "SUV", "Imported", "No"],
|
22
|
+
["Red", "Sports", "Imported", "Yes"]
|
23
|
+
]
|
24
|
+
|
25
|
+
def setup
|
26
|
+
@data_set = DataSet.new
|
27
|
+
@data_set = DataSet.new(:data_items => @@data_items, :data_labels => @@data_labels)
|
28
|
+
@b = NaiveBayes.new.set_parameters({:m=>3}).build @data_set
|
29
|
+
end
|
30
|
+
|
31
|
+
def test_eval
|
32
|
+
result = @b.eval(["Red", "SUV", "Domestic"])
|
33
|
+
assert_equal "No", result
|
34
|
+
end
|
35
|
+
|
36
|
+
def test_get_probability_map
|
37
|
+
map = @b.get_probability_map(["Red", "SUV", "Domestic"])
|
38
|
+
assert_equal 2, map.keys.length
|
39
|
+
assert_in_delta 0.42, map["Yes"], 0.1
|
40
|
+
assert_in_delta 0.58, map["No"], 0.1
|
41
|
+
end
|
42
|
+
|
43
|
+
end
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.4
|
|
3
3
|
specification_version: 1
|
4
4
|
name: ai4r
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: "1.
|
7
|
-
date: 2009-
|
6
|
+
version: "1.9"
|
7
|
+
date: 2009-07-01 00:00:00 +01:00
|
8
8
|
summary: Ruby implementations of algorithms covering several Artificial intelligence fields, including Genetic algorithms, Neural Networks, machine learning, and clustering.
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -29,11 +29,13 @@ post_install_message:
|
|
29
29
|
authors:
|
30
30
|
- Sergio Fierens
|
31
31
|
files:
|
32
|
+
- examples/classifiers
|
33
|
+
- examples/classifiers/id3_data.csv
|
34
|
+
- examples/classifiers/id3_example.rb
|
35
|
+
- examples/classifiers/naive_bayes_data.csv
|
36
|
+
- examples/classifiers/naive_bayes_example.rb
|
37
|
+
- examples/classifiers/results.txt
|
32
38
|
- examples/clusterers
|
33
|
-
- examples/decision_trees
|
34
|
-
- examples/decision_trees/data_set.csv
|
35
|
-
- examples/decision_trees/id3_example.rb
|
36
|
-
- examples/decision_trees/results.txt
|
37
39
|
- examples/genetic_algorithm
|
38
40
|
- examples/genetic_algorithm/genetic_algorithm_example.rb
|
39
41
|
- examples/genetic_algorithm/travel_cost.csv
|
@@ -53,6 +55,7 @@ files:
|
|
53
55
|
- lib/ai4r/classifiers/hyperpipes.rb
|
54
56
|
- lib/ai4r/classifiers/id3.rb
|
55
57
|
- lib/ai4r/classifiers/multilayer_perceptron.rb
|
58
|
+
- lib/ai4r/classifiers/naive_bayes.rb
|
56
59
|
- lib/ai4r/classifiers/one_r.rb
|
57
60
|
- lib/ai4r/classifiers/prism.rb
|
58
61
|
- lib/ai4r/classifiers/zero_r.rb
|
@@ -91,6 +94,7 @@ test_files:
|
|
91
94
|
- test/classifiers/hyperpipes_test.rb
|
92
95
|
- test/classifiers/id3_test.rb
|
93
96
|
- test/classifiers/multilayer_perceptron_test.rb
|
97
|
+
- test/classifiers/naive_bayes_test.rb
|
94
98
|
- test/classifiers/one_r_test.rb
|
95
99
|
- test/classifiers/prism_test.rb
|
96
100
|
- test/classifiers/zero_r_test.rb
|