ai4r 1.8 → 1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/examples/{decision_trees/data_set.csv → classifiers/id3_data.csv} +0 -0
- data/examples/{decision_trees → classifiers}/id3_example.rb +1 -1
- data/examples/classifiers/naive_bayes_data.csv +11 -0
- data/examples/classifiers/naive_bayes_example.rb +16 -0
- data/examples/{decision_trees → classifiers}/results.txt +0 -0
- data/examples/genetic_algorithm/genetic_algorithm_example.rb +1 -1
- data/lib/ai4r.rb +1 -0
- data/lib/ai4r/classifiers/naive_bayes.rb +259 -0
- data/lib/ai4r/data/data_set.rb +63 -47
- data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +3 -3
- data/test/classifiers/naive_bayes_test.rb +43 -0
- metadata +10 -6
File without changes
|
@@ -10,7 +10,7 @@
|
|
10
10
|
require File.dirname(__FILE__) + '/../../lib/ai4r/classifiers/id3'
|
11
11
|
|
12
12
|
# Load data from data_set.csv
|
13
|
-
data_filename = "#{File.dirname(__FILE__)}/data_set.csv"
|
13
|
+
data_filename = "#{File.dirname(__FILE__)}/id3_data.csv"
|
14
14
|
data_set = Ai4r::Data::DataSet.new.load_csv_with_labels data_filename
|
15
15
|
|
16
16
|
# Build ID3 tree
|
@@ -0,0 +1,11 @@
|
|
1
|
+
"Color","Type","Origin","Stolen?"
|
2
|
+
"Red","Sports","Domestic","Yes"
|
3
|
+
"Red","Sports","Domestic","No"
|
4
|
+
"Red","Sports","Domestic","Yes"
|
5
|
+
"Yellow","Sports","Domestic","No"
|
6
|
+
"Yellow","Sports","Imported","Yes"
|
7
|
+
"Yellow","SUV","Imported","No"
|
8
|
+
"Yellow","SUV","Imported","Yes"
|
9
|
+
"Yellow","Sports","Domestic","No"
|
10
|
+
"Red","SUV","Imported","No"
|
11
|
+
"Red","Sports","Imported","Yes"
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../../lib/ai4r/classifiers/naive_bayes'
|
2
|
+
require File.dirname(__FILE__) + '/../../lib/ai4r/data/data_set'
|
3
|
+
require File.dirname(__FILE__) + '/../../lib/ai4r/classifiers/id3'
|
4
|
+
require 'benchmark'
|
5
|
+
|
6
|
+
include Ai4r::Classifiers
|
7
|
+
include Ai4r::Data
|
8
|
+
|
9
|
+
data_set = DataSet.new
|
10
|
+
data_set.load_csv_with_labels File.dirname(__FILE__) + "/naive_bayes_data.csv"
|
11
|
+
|
12
|
+
b = NaiveBayes.new.
|
13
|
+
set_parameters({:m=>3}).
|
14
|
+
build data_set
|
15
|
+
p b.eval(["Red", "SUV", "Domestic"])
|
16
|
+
p b.get_probability_map(["Red", "SUV", "Domestic"])
|
File without changes
|
@@ -16,7 +16,7 @@ data_filename = "#{File.dirname(__FILE__)}/travel_cost.csv"
|
|
16
16
|
data_set = Ai4r::Data::DataSet.new.load_csv_with_labels data_filename
|
17
17
|
data_set.data_items.collect! {|column| column.collect {|element| element.to_f}}
|
18
18
|
|
19
|
-
Ai4r::GeneticAlgorithm::Chromosome.set_cost_matrix(data_set)
|
19
|
+
Ai4r::GeneticAlgorithm::Chromosome.set_cost_matrix(data_set.data_items)
|
20
20
|
|
21
21
|
puts "Some random selected tours costs: "
|
22
22
|
3.times do
|
data/lib/ai4r.rb
CHANGED
@@ -22,6 +22,7 @@ require File.dirname(__FILE__) + "/ai4r/classifiers/prism"
|
|
22
22
|
require File.dirname(__FILE__) + "/ai4r/classifiers/one_r"
|
23
23
|
require File.dirname(__FILE__) + "/ai4r/classifiers/zero_r"
|
24
24
|
require File.dirname(__FILE__) + "/ai4r/classifiers/hyperpipes"
|
25
|
+
require File.dirname(__FILE__) + "/ai4r/classifiers/naive_bayes"
|
25
26
|
# Neural networks
|
26
27
|
require File.dirname(__FILE__) + "/ai4r/neural_network/backpropagation"
|
27
28
|
require File.dirname(__FILE__) + "/ai4r/neural_network/hopfield"
|
@@ -0,0 +1,259 @@
|
|
1
|
+
# Author:: Thomas Kern
|
2
|
+
# License:: MPL 1.1
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
5
|
+
#
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
|
+
|
10
|
+
require File.dirname(__FILE__) + '/../data/data_set'
|
11
|
+
require File.dirname(__FILE__) + '/classifier'
|
12
|
+
|
13
|
+
module Ai4r
|
14
|
+
module Classifiers
|
15
|
+
|
16
|
+
|
17
|
+
# = Introduction
|
18
|
+
#
|
19
|
+
# This is an implementation of a Naive Bayesian Classifier without any
|
20
|
+
# specialisation (ie. for text classification)
|
21
|
+
# Probabilities P(a_i | v_j) are estimated using m-estimates, hence the
|
22
|
+
# m parameter as second parameter when instantiating the class.
|
23
|
+
# The estimation looks like this:
|
24
|
+
#(n_c + mp) / (n + m)
|
25
|
+
#
|
26
|
+
# the variables are:
|
27
|
+
# n = the number of training examples for which v = v_j
|
28
|
+
# n_c = number of examples for which v = v_j and a = a_i
|
29
|
+
# p = a priori estimate for P(a_i | v_j)
|
30
|
+
# m = the equivalent sample size
|
31
|
+
#
|
32
|
+
# stores the conditional probabilities in an array named @pcp and in this form:
|
33
|
+
# @pcp[attributes][values][classes]
|
34
|
+
#
|
35
|
+
# This kind of estimator is useful when the training data set is relatively small.
|
36
|
+
# If the data set is big enough, set it to 0, which is also the default value
|
37
|
+
#
|
38
|
+
#
|
39
|
+
# For further details regarding Bayes and Naive Bayes Classifier have a look at those websites:
|
40
|
+
# http://en.wikipedia.org/wiki/Naive_Bayesian_classification
|
41
|
+
# http://en.wikipedia.org/wiki/Bayes%27_theorem
|
42
|
+
#
|
43
|
+
#
|
44
|
+
# = Parameters
|
45
|
+
#
|
46
|
+
# * :m => Optional. Default value is set to 0. It may be set to a value greater than 0 when
|
47
|
+
# the size of the dataset is relatively small
|
48
|
+
#
|
49
|
+
# = How to use it
|
50
|
+
#
|
51
|
+
# data = DataSet.new.load_csv_with_labels "bayes_data.csv"
|
52
|
+
# b = NaiveBayes.new.
|
53
|
+
# set_parameters({:m=>3}).
|
54
|
+
# build data
|
55
|
+
# b.eval(["Red", "SUV", "Domestic"])
|
56
|
+
#
|
57
|
+
class NaiveBayes < Classifier
|
58
|
+
|
59
|
+
parameters_info :m => "Default value is set to 0. It may be set to a value greater than " +
|
60
|
+
"0 when the size of the dataset is relatively small"
|
61
|
+
|
62
|
+
def initialize
|
63
|
+
@m = 0
|
64
|
+
@class_counts = []
|
65
|
+
@class_prob = [] # stores the probability of the classes
|
66
|
+
@pcc = [] # stores the number of instances divided into attribute/value/class
|
67
|
+
@pcp = [] # stores the conditional probabilities of the values of an attribute
|
68
|
+
@klass_index = {} # hashmap for quick lookup of all the used klasses and their indices
|
69
|
+
@values = {} # hashmap for quick lookup of all the values
|
70
|
+
end
|
71
|
+
|
72
|
+
# You can evaluate new data, predicting its category.
|
73
|
+
# e.g.
|
74
|
+
# b.eval(["Red", "SUV", "Domestic"])
|
75
|
+
# => 'No'
|
76
|
+
def eval(data)
|
77
|
+
prob = @class_prob.map {|cp| cp}
|
78
|
+
prob = calculate_class_probabilities_for_entry(data, prob)
|
79
|
+
index_to_klass(prob.index(prob.max))
|
80
|
+
end
|
81
|
+
|
82
|
+
# Calculates the probabilities for the data entry Data.
|
83
|
+
# data has to be an array of the same dimension as the training data minus the
|
84
|
+
# class column.
|
85
|
+
# Returns a map containing all classes as keys:
|
86
|
+
# {Class_1 => probability, Class_2 => probability2 ... }
|
87
|
+
# Probability is <= 1 and of type Float.
|
88
|
+
# e.g.
|
89
|
+
# b.get_probability_map(["Red", "SUV", "Domestic"])
|
90
|
+
# => {"Yes"=>0.4166666666666667, "No"=>0.5833333333333334}
|
91
|
+
def get_probability_map(data)
|
92
|
+
prob = @class_prob.map {|cp| cp}
|
93
|
+
prob = calculate_class_probabilities_for_entry(data, prob)
|
94
|
+
prob = normalize_class_probability prob
|
95
|
+
probability_map = {}
|
96
|
+
prob.each_with_index { |p, i| probability_map[index_to_klass(i)] = p }
|
97
|
+
return probability_map
|
98
|
+
end
|
99
|
+
|
100
|
+
# counts values of the attribute instances and calculates the probability of the classes
|
101
|
+
# and the conditional probabilities
|
102
|
+
# Parameter data has to be an instance of DataSet
|
103
|
+
def build(data)
|
104
|
+
raise "Error instance must be passed" unless data.is_a?(DataSet)
|
105
|
+
raise "Data should not be empty" if data.data_items.length == 0
|
106
|
+
|
107
|
+
initialize_domain_data(data)
|
108
|
+
initialize_klass_index
|
109
|
+
initialize_pc
|
110
|
+
calculate_probabilities
|
111
|
+
|
112
|
+
return self
|
113
|
+
end
|
114
|
+
|
115
|
+
private
|
116
|
+
|
117
|
+
def initialize_domain_data(data)
|
118
|
+
@domains = data.build_domains
|
119
|
+
@data_items = data.data_items.map { |item| DataEntry.new(item[0...-1], item.last) }
|
120
|
+
@data_labels = data.data_labels[0...-1]
|
121
|
+
@klasses = @domains.last.to_a
|
122
|
+
end
|
123
|
+
|
124
|
+
|
125
|
+
# calculates the klass probability of a data entry
|
126
|
+
# as usual, the probability of the value is multiplied with every conditional
|
127
|
+
# probability of every attribute in condition to a specific class
|
128
|
+
# this is repeated for every class
|
129
|
+
def calculate_class_probabilities_for_entry(data, prob)
|
130
|
+
prob.each_with_index do |prob_entry, prob_index|
|
131
|
+
data.each_with_index do |att, index|
|
132
|
+
next if value_index(att, index).nil?
|
133
|
+
prob[prob_index] *= @pcp[index][value_index(att, index)][prob_index]
|
134
|
+
end
|
135
|
+
end
|
136
|
+
end
|
137
|
+
|
138
|
+
# normalises the array of probabilities so the sum of the array equals 1
|
139
|
+
def normalize_class_probability(prob)
|
140
|
+
prob_sum = sum(prob)
|
141
|
+
prob_sum > 0 ?
|
142
|
+
prob.map {|prob_entry| prob_entry / prob_sum } :
|
143
|
+
prob
|
144
|
+
end
|
145
|
+
|
146
|
+
# sums an array up; returns a number of type Float
|
147
|
+
def sum(array)
|
148
|
+
array.inject(0.0){|b, i| b+i}
|
149
|
+
end
|
150
|
+
|
151
|
+
# returns the name of the class when the index is found
|
152
|
+
def index_to_klass(index)
|
153
|
+
@klass_index.has_value?(index) ? @klass_index.index(index) : nil
|
154
|
+
end
|
155
|
+
|
156
|
+
# initializes @values and @klass_index; maps a certain value to a unique index
|
157
|
+
def initialize_klass_index
|
158
|
+
@klasses.each_with_index do |dl, index|
|
159
|
+
@klass_index[dl] = index
|
160
|
+
end
|
161
|
+
|
162
|
+
@data_labels.each_with_index do |dl, index|
|
163
|
+
@values[index] = {}
|
164
|
+
@domains[index].each_with_index do |d, d_index|
|
165
|
+
@values[index][d] = d_index
|
166
|
+
end
|
167
|
+
end
|
168
|
+
end
|
169
|
+
|
170
|
+
# returns the index of a class
|
171
|
+
def klass_index(klass)
|
172
|
+
@klass_index[klass]
|
173
|
+
end
|
174
|
+
|
175
|
+
# returns the index of a value, depending on the attribute index
|
176
|
+
def value_index(value, dl_index)
|
177
|
+
@values[dl_index][value]
|
178
|
+
end
|
179
|
+
|
180
|
+
# builds an array of the form:
|
181
|
+
# array[attributes][values][classes]
|
182
|
+
def build_array(dl, index)
|
183
|
+
domains = Array.new(@domains[index].length)
|
184
|
+
domains.map do |p1|
|
185
|
+
pl = Array.new @klasses.length, 0
|
186
|
+
end
|
187
|
+
end
|
188
|
+
|
189
|
+
# initializes the two array for storing the count and conditional probabilities of
|
190
|
+
# the attributes
|
191
|
+
def initialize_pc
|
192
|
+
@data_labels.each_with_index do |dl, index|
|
193
|
+
@pcc << build_array(dl, index)
|
194
|
+
@pcp << build_array(dl, index)
|
195
|
+
end
|
196
|
+
end
|
197
|
+
|
198
|
+
# calculates the occurrences of a class and the instances of a certain value of a
|
199
|
+
# certain attribute and the assigned class.
|
200
|
+
# In addition to that, it also calculates the conditional probabilities and values
|
201
|
+
def calculate_probabilities
|
202
|
+
@klasses.each {|dl| @class_counts[klass_index(dl)] = 0}
|
203
|
+
|
204
|
+
calculate_class_probabilities
|
205
|
+
count_instances
|
206
|
+
calculate_conditional_probabilities
|
207
|
+
end
|
208
|
+
|
209
|
+
def calculate_class_probabilities
|
210
|
+
@data_items.each do |entry|
|
211
|
+
@class_counts[klass_index(entry.klass)] += 1
|
212
|
+
end
|
213
|
+
|
214
|
+
@class_counts.each_with_index do |k, index|
|
215
|
+
@class_prob[index] = k.to_f / @data_items.length
|
216
|
+
end
|
217
|
+
end
|
218
|
+
|
219
|
+
# counts the instances of a certain value of a certain attribute and the assigned class
|
220
|
+
def count_instances
|
221
|
+
@data_items.each do |item|
|
222
|
+
@data_labels.each_with_index do |dl, dl_index|
|
223
|
+
@pcc[dl_index][value_index(item[dl_index], dl_index)][klass_index(item.klass)] += 1
|
224
|
+
end
|
225
|
+
end
|
226
|
+
end
|
227
|
+
|
228
|
+
# calculates the conditional probability and stores it in the @pcp-array
|
229
|
+
def calculate_conditional_probabilities
|
230
|
+
@pcc.each_with_index do |attributes, a_index|
|
231
|
+
attributes.each_with_index do |values, v_index|
|
232
|
+
values.each_with_index do |klass, k_index|
|
233
|
+
@pcp[a_index][v_index][k_index] = (klass.to_f + @m * @class_prob[k_index]) / (@class_counts[k_index] + @m).to_f
|
234
|
+
end
|
235
|
+
end
|
236
|
+
end
|
237
|
+
end
|
238
|
+
|
239
|
+
#DataEntry stores the instance of the data entry
|
240
|
+
#the data is accessible via entries
|
241
|
+
#stores the class-column in the attribute klass and
|
242
|
+
#removes the column for the class-entry
|
243
|
+
class DataEntry
|
244
|
+
attr_accessor :klass, :entries
|
245
|
+
|
246
|
+
def initialize(attributes, klass)
|
247
|
+
@klass = klass
|
248
|
+
@entries = attributes
|
249
|
+
end
|
250
|
+
|
251
|
+
# wrapper method for the access to @entries
|
252
|
+
def [](index)
|
253
|
+
@entries[index]
|
254
|
+
end
|
255
|
+
end
|
256
|
+
|
257
|
+
end
|
258
|
+
end
|
259
|
+
end
|
data/lib/ai4r/data/data_set.rb
CHANGED
@@ -13,17 +13,17 @@ require File.dirname(__FILE__) + '/statistics'
|
|
13
13
|
|
14
14
|
module Ai4r
|
15
15
|
module Data
|
16
|
-
|
16
|
+
|
17
17
|
# A data set is a collection of N data items. Each data item is
|
18
18
|
# described by a set of attributes, represented as an array.
|
19
19
|
# Optionally, you can assign a label to the attributes, using
|
20
20
|
# the data_labels property.
|
21
21
|
class DataSet
|
22
|
-
|
22
|
+
|
23
23
|
@@number_regex = /(((\b[0-9]+)?\.)?\b[0-9]+([eE][-+]?[0-9]+)?\b)/
|
24
|
-
|
25
|
-
attr_reader :data_labels, :data_items
|
26
|
-
|
24
|
+
|
25
|
+
attr_reader :data_labels, :data_items
|
26
|
+
|
27
27
|
# Create a new DataSet. By default, empty.
|
28
28
|
# Optionally, you can provide the initial data items and data labels.
|
29
29
|
#
|
@@ -41,37 +41,52 @@ module Ai4r
|
|
41
41
|
# Retrieve a new DataSet, with the item(s) selected by the provided
|
42
42
|
# index. You can specify an index range, too.
|
43
43
|
def [](index)
|
44
|
-
selected_items = (index.is_a?(Fixnum)) ?
|
45
|
-
|
46
|
-
return DataSet.new(:data_items => selected_items,
|
47
|
-
|
44
|
+
selected_items = (index.is_a?(Fixnum)) ?
|
45
|
+
[@data_items[index]] : @data_items[index]
|
46
|
+
return DataSet.new(:data_items => selected_items,
|
47
|
+
:data_labels =>@data_labels)
|
48
48
|
end
|
49
|
-
|
49
|
+
|
50
50
|
# Load data items from csv file
|
51
51
|
def load_csv(filepath)
|
52
52
|
items = []
|
53
|
-
|
54
|
-
items <<
|
53
|
+
open_csv_file(filepath) do |entry|
|
54
|
+
items << entry
|
55
55
|
end
|
56
56
|
set_data_items(items)
|
57
57
|
end
|
58
|
-
|
58
|
+
|
59
|
+
# opens a csv-file and reads it line by line
|
60
|
+
# for each line, a block is called and the row is passed to the block
|
61
|
+
# ruby1.8 and 1.9 safe
|
62
|
+
def open_csv_file(filepath, &block)
|
63
|
+
if CSV.const_defined? :Reader
|
64
|
+
CSV::Reader.parse(File.open(filepath, 'r')) do |row|
|
65
|
+
block.call row
|
66
|
+
end
|
67
|
+
else
|
68
|
+
CSV.parse(File.open(filepath, 'r')) do |row|
|
69
|
+
block.call row
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
59
74
|
# Load data items from csv file. The first row is used as data labels.
|
60
75
|
def load_csv_with_labels(filepath)
|
61
76
|
load_csv(filepath)
|
62
77
|
@data_labels = @data_items.shift
|
63
78
|
return self
|
64
79
|
end
|
65
|
-
|
80
|
+
|
66
81
|
# Same as load_csv, but it will try to convert cell contents as numbers.
|
67
82
|
def parse_csv(filepath)
|
68
83
|
items = []
|
69
|
-
|
84
|
+
open_csv_file(filepath) do |row|
|
70
85
|
items << row.collect{|x| (x.match(@@number_regex)) ? x.to_f : x.data }
|
71
86
|
end
|
72
87
|
set_data_items(items)
|
73
88
|
end
|
74
|
-
|
89
|
+
|
75
90
|
# Set data labels.
|
76
91
|
# Data labels must have the following format:
|
77
92
|
# [ 'city', 'age_range', 'gender', 'marketing_target' ]
|
@@ -134,7 +149,7 @@ module Ai4r
|
|
134
149
|
def build_domains
|
135
150
|
@data_labels.collect {|attr_label| build_domain(attr_label) }
|
136
151
|
end
|
137
|
-
|
152
|
+
|
138
153
|
# Returns a Set instance containing all possible values for an attribute
|
139
154
|
# The parameter can be an attribute label or index (0 based).
|
140
155
|
# * Set instance containing all possible values for nominal attributes
|
@@ -156,12 +171,12 @@ module Ai4r
|
|
156
171
|
return @data_items.inject(Set.new){|domain, x| domain << x[index]}
|
157
172
|
end
|
158
173
|
end
|
159
|
-
|
174
|
+
|
160
175
|
# Returns attributes number, including class attribute
|
161
176
|
def num_attributes
|
162
177
|
return (@data_items.empty?) ? 0 : @data_items.first.size
|
163
178
|
end
|
164
|
-
|
179
|
+
|
165
180
|
# Returns the index of a given attribute (0-based).
|
166
181
|
# For example, if "gender" is the third attribute, then:
|
167
182
|
# get_index("gender")
|
@@ -169,82 +184,83 @@ module Ai4r
|
|
169
184
|
def get_index(attr)
|
170
185
|
return (attr.is_a?(Fixnum) || attr.is_a?(Range)) ? attr : @data_labels.index(attr)
|
171
186
|
end
|
172
|
-
|
187
|
+
|
173
188
|
# Raise an exception if there is no data item.
|
174
189
|
def check_not_empty
|
175
190
|
if @data_items.empty?
|
176
|
-
raise ArgumentError,"Examples data set must not be empty."
|
191
|
+
raise ArgumentError, "Examples data set must not be empty."
|
177
192
|
end
|
178
193
|
end
|
179
194
|
|
180
195
|
# Add a data item to the data set
|
181
196
|
def << data_item
|
182
197
|
if data_item.nil? || !data_item.is_a?(Enumerable) || data_item.empty?
|
183
|
-
raise ArgumentError,"Data must not be an non empty array."
|
198
|
+
raise ArgumentError, "Data must not be an non empty array."
|
184
199
|
elsif @data_items.empty?
|
185
200
|
set_data_items([data_item])
|
186
201
|
elsif data_item.length != num_attributes
|
187
|
-
raise ArgumentError,"Number of attributes do not match. " +
|
188
|
-
|
189
|
-
|
190
|
-
else
|
202
|
+
raise ArgumentError, "Number of attributes do not match. " +
|
203
|
+
"#{data_item.length} attributes provided, " +
|
204
|
+
"#{num_attributes} attributes expected."
|
205
|
+
else
|
191
206
|
@data_items << data_item
|
192
207
|
end
|
193
208
|
end
|
194
|
-
|
209
|
+
|
195
210
|
# Returns an array with the mean value of numeric attributes, and
|
196
211
|
# the most frequent value of non numeric attributes
|
197
212
|
def get_mean_or_mode
|
198
213
|
mean = []
|
199
|
-
num_attributes.times do |i|
|
200
|
-
mean[i] =
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
214
|
+
num_attributes.times do |i|
|
215
|
+
mean[i] =
|
216
|
+
if @data_items.first[i].is_a?(Numeric)
|
217
|
+
Statistics.mean(self, i)
|
218
|
+
else
|
219
|
+
Statistics.mode(self, i)
|
220
|
+
end
|
206
221
|
end
|
207
222
|
return mean
|
208
223
|
end
|
209
|
-
|
224
|
+
|
210
225
|
protected
|
226
|
+
|
211
227
|
def check_data_items(data_items)
|
212
228
|
if !data_items || data_items.empty?
|
213
|
-
raise ArgumentError,"Examples data set must not be empty."
|
229
|
+
raise ArgumentError, "Examples data set must not be empty."
|
214
230
|
elsif !data_items.first.is_a?(Enumerable)
|
215
|
-
raise ArgumentError,"Unkown format for example data."
|
231
|
+
raise ArgumentError, "Unkown format for example data."
|
216
232
|
end
|
217
233
|
attributes_num = data_items.first.length
|
218
234
|
data_items.each_index do |index|
|
219
235
|
if data_items[index].length != attributes_num
|
220
236
|
raise ArgumentError,
|
221
|
-
|
222
|
-
|
223
|
-
|
237
|
+
"Quantity of attributes is inconsistent. " +
|
238
|
+
"The first item has #{attributes_num} attributes "+
|
239
|
+
"and row #{index} has #{data_items[index].length} attributes"
|
224
240
|
end
|
225
241
|
end
|
226
242
|
end
|
227
|
-
|
243
|
+
|
228
244
|
def check_data_labels(labels)
|
229
245
|
if !@data_items.empty?
|
230
246
|
if labels.length != @data_items.first.length
|
231
247
|
raise ArgumentError,
|
232
|
-
|
233
|
-
|
234
|
-
|
248
|
+
"Number of labels and attributes do not match. " +
|
249
|
+
"#{labels.length} labels and " +
|
250
|
+
"#{@data_items.first.length} attributes found."
|
235
251
|
end
|
236
252
|
end
|
237
253
|
end
|
238
|
-
|
254
|
+
|
239
255
|
def default_data_labels(data_items)
|
240
256
|
data_labels = []
|
241
257
|
data_items[0][0..-2].each_index do |i|
|
242
|
-
data_labels[i] = "attribute_#{i+1}"
|
258
|
+
data_labels[i] = "attribute_#{i+1}"
|
243
259
|
end
|
244
260
|
data_labels[data_labels.length]="class_value"
|
245
261
|
return data_labels
|
246
262
|
end
|
247
|
-
|
263
|
+
|
248
264
|
end
|
249
265
|
end
|
250
266
|
end
|
@@ -178,7 +178,7 @@ module Ai4r
|
|
178
178
|
last_token = @data[0]
|
179
179
|
cost = 0
|
180
180
|
@data[1..-1].each do |token|
|
181
|
-
cost += @@costs
|
181
|
+
cost += @@costs[last_token][token]
|
182
182
|
last_token = token
|
183
183
|
end
|
184
184
|
@fitness = -1 * cost
|
@@ -220,7 +220,7 @@ module Ai4r
|
|
220
220
|
# In this case, we have implemented edge recombination, which is the
|
221
221
|
# most used reproduction algorithm for the Travelling salesman problem.
|
222
222
|
def self.reproduce(a, b)
|
223
|
-
data_size = @@costs
|
223
|
+
data_size = @@costs[0].length
|
224
224
|
available = []
|
225
225
|
0.upto(data_size-1) { |n| available << n }
|
226
226
|
token = a.data[0]
|
@@ -249,7 +249,7 @@ module Ai4r
|
|
249
249
|
# use some problem domain knowledge, to generate a
|
250
250
|
# (probably) better initial solution.
|
251
251
|
def self.seed
|
252
|
-
data_size = @@costs
|
252
|
+
data_size = @@costs[0].length
|
253
253
|
available = []
|
254
254
|
0.upto(data_size-1) { |n| available << n }
|
255
255
|
seed = []
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../../lib/ai4r/classifiers/naive_bayes'
|
2
|
+
require File.dirname(__FILE__) + '/../../lib/ai4r/data/data_set'
|
3
|
+
require 'test/unit'
|
4
|
+
|
5
|
+
include Ai4r::Classifiers
|
6
|
+
include Ai4r::Data
|
7
|
+
|
8
|
+
class NaiveBayesTest < Test::Unit::TestCase
|
9
|
+
|
10
|
+
@@data_labels = [ "Color","Type","Origin","Stolen?" ]
|
11
|
+
|
12
|
+
@@data_items = [
|
13
|
+
["Red", "Sports", "Domestic", "Yes"],
|
14
|
+
["Red", "Sports", "Domestic", "No"],
|
15
|
+
["Red", "Sports", "Domestic", "Yes"],
|
16
|
+
["Yellow","Sports", "Domestic", "No"],
|
17
|
+
["Yellow","Sports", "Imported", "Yes"],
|
18
|
+
["Yellow","SUV", "Imported", "No"],
|
19
|
+
["Yellow","SUV", "Imported", "Yes"],
|
20
|
+
["Yellow","Sports", "Domestic", "No"],
|
21
|
+
["Red", "SUV", "Imported", "No"],
|
22
|
+
["Red", "Sports", "Imported", "Yes"]
|
23
|
+
]
|
24
|
+
|
25
|
+
def setup
|
26
|
+
@data_set = DataSet.new
|
27
|
+
@data_set = DataSet.new(:data_items => @@data_items, :data_labels => @@data_labels)
|
28
|
+
@b = NaiveBayes.new.set_parameters({:m=>3}).build @data_set
|
29
|
+
end
|
30
|
+
|
31
|
+
def test_eval
|
32
|
+
result = @b.eval(["Red", "SUV", "Domestic"])
|
33
|
+
assert_equal "No", result
|
34
|
+
end
|
35
|
+
|
36
|
+
def test_get_probability_map
|
37
|
+
map = @b.get_probability_map(["Red", "SUV", "Domestic"])
|
38
|
+
assert_equal 2, map.keys.length
|
39
|
+
assert_in_delta 0.42, map["Yes"], 0.1
|
40
|
+
assert_in_delta 0.58, map["No"], 0.1
|
41
|
+
end
|
42
|
+
|
43
|
+
end
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.4
|
|
3
3
|
specification_version: 1
|
4
4
|
name: ai4r
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: "1.8"
|
7
|
-
date: 2009-
|
6
|
+
version: "1.9"
|
7
|
+
date: 2009-07-01 00:00:00 +01:00
|
8
8
|
summary: Ruby implementations of algorithms covering several Artificial intelligence fields, including Genetic algorithms, Neural Networks, machine learning, and clustering.
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -29,11 +29,13 @@ post_install_message:
|
|
29
29
|
authors:
|
30
30
|
- Sergio Fierens
|
31
31
|
files:
|
32
|
+
- examples/classifiers
|
33
|
+
- examples/classifiers/id3_data.csv
|
34
|
+
- examples/classifiers/id3_example.rb
|
35
|
+
- examples/classifiers/naive_bayes_data.csv
|
36
|
+
- examples/classifiers/naive_bayes_example.rb
|
37
|
+
- examples/classifiers/results.txt
|
32
38
|
- examples/clusterers
|
33
|
-
- examples/decision_trees
|
34
|
-
- examples/decision_trees/data_set.csv
|
35
|
-
- examples/decision_trees/id3_example.rb
|
36
|
-
- examples/decision_trees/results.txt
|
37
39
|
- examples/genetic_algorithm
|
38
40
|
- examples/genetic_algorithm/genetic_algorithm_example.rb
|
39
41
|
- examples/genetic_algorithm/travel_cost.csv
|
@@ -53,6 +55,7 @@ files:
|
|
53
55
|
- lib/ai4r/classifiers/hyperpipes.rb
|
54
56
|
- lib/ai4r/classifiers/id3.rb
|
55
57
|
- lib/ai4r/classifiers/multilayer_perceptron.rb
|
58
|
+
- lib/ai4r/classifiers/naive_bayes.rb
|
56
59
|
- lib/ai4r/classifiers/one_r.rb
|
57
60
|
- lib/ai4r/classifiers/prism.rb
|
58
61
|
- lib/ai4r/classifiers/zero_r.rb
|
@@ -91,6 +94,7 @@ test_files:
|
|
91
94
|
- test/classifiers/hyperpipes_test.rb
|
92
95
|
- test/classifiers/id3_test.rb
|
93
96
|
- test/classifiers/multilayer_perceptron_test.rb
|
97
|
+
- test/classifiers/naive_bayes_test.rb
|
94
98
|
- test/classifiers/one_r_test.rb
|
95
99
|
- test/classifiers/prism_test.rb
|
96
100
|
- test/classifiers/zero_r_test.rb
|