ai4ruby 1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +47 -0
- data/examples/classifiers/id3_data.csv +121 -0
- data/examples/classifiers/id3_example.rb +29 -0
- data/examples/classifiers/naive_bayes_data.csv +11 -0
- data/examples/classifiers/naive_bayes_example.rb +16 -0
- data/examples/classifiers/results.txt +31 -0
- data/examples/genetic_algorithm/genetic_algorithm_example.rb +37 -0
- data/examples/genetic_algorithm/travel_cost.csv +16 -0
- data/examples/neural_network/backpropagation_example.rb +67 -0
- data/examples/neural_network/patterns_with_base_noise.rb +68 -0
- data/examples/neural_network/patterns_with_noise.rb +66 -0
- data/examples/neural_network/training_patterns.rb +68 -0
- data/examples/neural_network/xor_example.rb +35 -0
- data/examples/som/som_data.rb +156 -0
- data/examples/som/som_multi_node_example.rb +22 -0
- data/examples/som/som_single_example.rb +24 -0
- data/lib/ai4r.rb +33 -0
- data/lib/ai4r/classifiers/classifier.rb +62 -0
- data/lib/ai4r/classifiers/hyperpipes.rb +118 -0
- data/lib/ai4r/classifiers/ib1.rb +121 -0
- data/lib/ai4r/classifiers/id3.rb +326 -0
- data/lib/ai4r/classifiers/multilayer_perceptron.rb +135 -0
- data/lib/ai4r/classifiers/naive_bayes.rb +259 -0
- data/lib/ai4r/classifiers/one_r.rb +110 -0
- data/lib/ai4r/classifiers/prism.rb +197 -0
- data/lib/ai4r/classifiers/zero_r.rb +73 -0
- data/lib/ai4r/clusterers/average_linkage.rb +59 -0
- data/lib/ai4r/clusterers/bisecting_k_means.rb +93 -0
- data/lib/ai4r/clusterers/centroid_linkage.rb +66 -0
- data/lib/ai4r/clusterers/clusterer.rb +61 -0
- data/lib/ai4r/clusterers/complete_linkage.rb +67 -0
- data/lib/ai4r/clusterers/diana.rb +139 -0
- data/lib/ai4r/clusterers/k_means.rb +126 -0
- data/lib/ai4r/clusterers/median_linkage.rb +61 -0
- data/lib/ai4r/clusterers/single_linkage.rb +194 -0
- data/lib/ai4r/clusterers/ward_linkage.rb +64 -0
- data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +31 -0
- data/lib/ai4r/clusterers/weighted_average_linkage.rb +61 -0
- data/lib/ai4r/data/data_set.rb +266 -0
- data/lib/ai4r/data/parameterizable.rb +64 -0
- data/lib/ai4r/data/proximity.rb +100 -0
- data/lib/ai4r/data/statistics.rb +77 -0
- data/lib/ai4r/experiment/classifier_evaluator.rb +95 -0
- data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +270 -0
- data/lib/ai4r/neural_network/backpropagation.rb +326 -0
- data/lib/ai4r/neural_network/hopfield.rb +149 -0
- data/lib/ai4r/som/layer.rb +68 -0
- data/lib/ai4r/som/node.rb +96 -0
- data/lib/ai4r/som/som.rb +155 -0
- data/lib/ai4r/som/two_phase_layer.rb +90 -0
- data/test/classifiers/hyperpipes_test.rb +84 -0
- data/test/classifiers/ib1_test.rb +78 -0
- data/test/classifiers/id3_test.rb +208 -0
- data/test/classifiers/multilayer_perceptron_test.rb +79 -0
- data/test/classifiers/naive_bayes_test.rb +43 -0
- data/test/classifiers/one_r_test.rb +62 -0
- data/test/classifiers/prism_test.rb +85 -0
- data/test/classifiers/zero_r_test.rb +49 -0
- data/test/clusterers/average_linkage_test.rb +51 -0
- data/test/clusterers/bisecting_k_means_test.rb +66 -0
- data/test/clusterers/centroid_linkage_test.rb +53 -0
- data/test/clusterers/complete_linkage_test.rb +57 -0
- data/test/clusterers/diana_test.rb +69 -0
- data/test/clusterers/k_means_test.rb +100 -0
- data/test/clusterers/median_linkage_test.rb +53 -0
- data/test/clusterers/single_linkage_test.rb +122 -0
- data/test/clusterers/ward_linkage_hierarchical_test.rb +61 -0
- data/test/clusterers/ward_linkage_test.rb +53 -0
- data/test/clusterers/weighted_average_linkage_test.rb +53 -0
- data/test/data/data_set_test.rb +96 -0
- data/test/data/proximity_test.rb +81 -0
- data/test/data/statistics_test.rb +65 -0
- data/test/experiment/classifier_evaluator_test.rb +76 -0
- data/test/genetic_algorithm/chromosome_test.rb +58 -0
- data/test/genetic_algorithm/genetic_algorithm_test.rb +81 -0
- data/test/neural_network/backpropagation_test.rb +82 -0
- data/test/neural_network/hopfield_test.rb +72 -0
- data/test/som/som_test.rb +97 -0
- metadata +168 -0
data/lib/ai4r/classifiers/multilayer_perceptron.rb
@@ -0,0 +1,135 @@
+# Author:: Sergio Fierens (Implementation only)
+# License:: MPL 1.1
+# Project:: ai4r
+# Url:: http://ai4r.rubyforge.org/
+#
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1 as published by the
+# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
+
+require File.dirname(__FILE__) + '/../data/data_set.rb'
+require File.dirname(__FILE__) + '/../classifiers/classifier'
+require File.dirname(__FILE__) + '/../neural_network/backpropagation'
+
+module Ai4r
+  module Classifiers
+
+    # = Introduction
+    #
+    # The idea behind the MultilayerPerceptron classifier is to
+    # train a Multilayer Perceptron neural network with the provided examples,
+    # and predict the class for new data items.
+    #
+    # = Parameters
+    #
+    # Use class method get_parameters_info to obtain details on the algorithm
+    # parameters. Use set_parameters to set values for these parameters.
+    # See Parameterizable module documentation.
+    #
+    # * :network_class => Neural network implementation class.
+    #   By default: Ai4r::NeuralNetwork::Backpropagation.
+    # * :network_parameters => Parameters to be forwarded to the back end
+    #   neural network.
+    # * :hidden_layers => Hidden layer structure. E.g. [8, 6] will generate
+    #   2 hidden layers with 8 and 6 neurons each. By default []
+    # * :training_iterations => How many times the training should be repeated.
+    #   By default: 1000.
+    # * :active_node_value => Default: 1
+    # * :inactive_node_value => Default: 0
+    class MultilayerPerceptron < Classifier
+
+      attr_reader :data_set, :class_value, :network, :domains
+
+      parameters_info :network_class => "Neural network implementation class."+
+          "By default: Ai4r::NeuralNetwork::Backpropagation.",
+        :network_parameters => "parameters to be forwarded to the back end " +
+          "neural network.",
+        :hidden_layers => "Hidden layer structure. E.g. [8, 6] will generate " +
+          "2 hidden layers with 8 and 6 neurons each. By default []",
+        :training_iterations => "How many times the training should be " +
+          "repeated. By default: 1000",
+        :active_node_value => "Default: 1",
+        :inactive_node_value => "Default: 0"
+
+      def initialize
+        @network_class = Ai4r::NeuralNetwork::Backpropagation
+        @hidden_layers = []
+        @training_iterations = 500
+        @network_parameters = {}
+        @active_node_value = 1
+        @inactive_node_value = 0
+      end
+
+      # Build a new MultilayerPerceptron classifier. You must provide a DataSet
+      # instance as parameter. The last attribute of each item is considered as
+      # the item class.
+      def build(data_set)
+        data_set.check_not_empty
+        @data_set = data_set
+        @domains = @data_set.build_domains.collect {|domain| domain.to_a}
+        @outputs = @domains.last.length
+        @inputs = 0
+        @domains[0...-1].each {|domain| @inputs += domain.length}
+        @structure = [@inputs] + @hidden_layers + [@outputs]
+        @network = @network_class.new @structure
+        @training_iterations.times do
+          data_set.data_items.each do |data_item|
+            input_values = data_to_input(data_item[0...-1])
+            output_values = data_to_output(data_item.last)
+            @network.train(input_values, output_values)
+          end
+        end
+        return self
+      end
+
+      # You can evaluate new data, predicting its class.
+      # e.g.
+      #   classifier.eval(['New York', '<30', 'F'])  # => 'Y'
+      def eval(data)
+        input_values = data_to_input(data)
+        output_values = @network.eval(input_values)
+        return @domains.last[get_max_index(output_values)]
+      end
+
+      # Multilayer Perceptron Classifiers cannot generate
+      # human-readable rules.
+      def get_rules
+        return "raise 'Neural networks classifiers do not generate human-readable rules.'"
+      end
+
+      protected
+
+      def data_to_input(data_item)
+        input_values = Array.new(@inputs, @inactive_node_value)
+        accum_index = 0
+        data_item.each_index do |att_index|
+          att_value = data_item[att_index]
+          domain_index = @domains[att_index].index(att_value)
+          input_values[domain_index + accum_index] = @active_node_value
+          accum_index = @domains[att_index].length
+        end
+        return input_values
+      end
+
+      def data_to_output(data_item)
+        output_values = Array.new(@outputs, @inactive_node_value)
+        output_values[@domains.last.index(data_item)] = @active_node_value
+        return output_values
+      end
+
+      def get_max_index(output_values)
+        max_value = @inactive_node_value
+        max_index = 0
+        output_values.each_index do |output_index|
+          if max_value < output_values[output_index]
+            max_value = output_values[output_index]
+            max_index = output_index
+          end
+        end
+        return max_index
+      end
+
+    end
+
+  end
+end
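Taken together, the comments above describe the expected workflow: build a DataSet whose last attribute is the class, call build, then eval new items. The snippet below is a minimal usage sketch and is not part of the released diff; the data labels, the data items and the predicted value are hypothetical, and the prediction may vary because backpropagation starts from random weights.

    require 'ai4r'

    # Hypothetical categorical data set; the last column is the class.
    data_set = Ai4r::Data::DataSet.new(
      :data_labels => ['city', 'age_range', 'gender', 'marketing_target'],
      :data_items  => [['New York', '<30',     'M', 'Y'],
                       ['Chicago',  '<30',     'F', 'Y'],
                       ['New York', '[30-50)', 'M', 'N'],
                       ['Chicago',  '[50-80]', 'F', 'N']])

    classifier = Ai4r::Classifiers::MultilayerPerceptron.new
    classifier.set_parameters(:hidden_layers => [4], :training_iterations => 200)
    classifier.build(data_set)
    puts classifier.eval(['New York', '<30', 'F'])   # => e.g. 'Y'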
data/lib/ai4r/classifiers/naive_bayes.rb
@@ -0,0 +1,259 @@
+# Author:: Thomas Kern
+# License:: MPL 1.1
+# Project:: ai4r
+# Url:: http://ai4r.rubyforge.org/
+#
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1 as published by the
+# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
+
+require File.dirname(__FILE__) + '/../data/data_set'
+require File.dirname(__FILE__) + '/classifier'
+
+module Ai4r
+  module Classifiers
+
+
+    # = Introduction
+    #
+    # This is an implementation of a Naive Bayesian Classifier without any
+    # specialisation (i.e. for text classification).
+    # Probabilities P(a_i | v_j) are estimated using m-estimates, hence the
+    # m parameter as second parameter when instantiating the class.
+    # The estimation looks like this:
+    #   (n_c + m*p) / (n + m)
+    #
+    # the variables are:
+    # n = the number of training examples for which v = v_j
+    # n_c = number of examples for which v = v_j and a = a_i
+    # p = a priori estimate for P(a_i | v_j)
+    # m = the equivalent sample size
+    #
+    # stores the conditional probabilities in an array named @pcp and in this form:
+    #   @pcp[attributes][values][classes]
+    #
+    # This kind of estimator is useful when the training data set is relatively small.
+    # If the data set is big enough, set it to 0, which is also the default value.
+    #
+    #
+    # For further details regarding Bayes and the Naive Bayes Classifier have a look at these websites:
+    # http://en.wikipedia.org/wiki/Naive_Bayesian_classification
+    # http://en.wikipedia.org/wiki/Bayes%27_theorem
+    #
+    #
+    # = Parameters
+    #
+    # * :m => Optional. Default value is set to 0. It may be set to a value greater than 0 when
+    #   the size of the dataset is relatively small
+    #
+    # = How to use it
+    #
+    #   data = DataSet.new.load_csv_with_labels "bayes_data.csv"
+    #   b = NaiveBayes.new.
+    #     set_parameters({:m=>3}).
+    #     build data
+    #   b.eval(["Red", "SUV", "Domestic"])
+    #
+    class NaiveBayes < Classifier
+
+      parameters_info :m => "Default value is set to 0. It may be set to a value greater than " +
+        "0 when the size of the dataset is relatively small"
+
+      def initialize
+        @m = 0
+        @class_counts = []
+        @class_prob = [] # stores the probability of the classes
+        @pcc = [] # stores the number of instances divided into attribute/value/class
+        @pcp = [] # stores the conditional probabilities of the values of an attribute
+        @klass_index = {} # hashmap for quick lookup of all the used klasses and their indices
+        @values = {} # hashmap for quick lookup of all the values
+      end
+
+      # You can evaluate new data, predicting its category.
+      # e.g.
+      #   b.eval(["Red", "SUV", "Domestic"])
+      #     # => 'No'
+      def eval(data)
+        prob = @class_prob.map {|cp| cp}
+        prob = calculate_class_probabilities_for_entry(data, prob)
+        index_to_klass(prob.index(prob.max))
+      end
+
+      # Calculates the probabilities for the data entry Data.
+      # data has to be an array of the same dimension as the training data minus the
+      # class column.
+      # Returns a map containing all classes as keys:
+      #   {Class_1 => probability, Class_2 => probability2 ... }
+      # Probability is <= 1 and of type Float.
+      # e.g.
+      #   b.get_probability_map(["Red", "SUV", "Domestic"])
+      #     # => {"Yes"=>0.4166666666666667, "No"=>0.5833333333333334}
+      def get_probability_map(data)
+        prob = @class_prob.map {|cp| cp}
+        prob = calculate_class_probabilities_for_entry(data, prob)
+        prob = normalize_class_probability prob
+        probability_map = {}
+        prob.each_with_index { |p, i| probability_map[index_to_klass(i)] = p }
+        return probability_map
+      end
+
+      # counts values of the attribute instances and calculates the probability of the classes
+      # and the conditional probabilities
+      # Parameter data has to be an instance of CsvDataSet
+      def build(data)
+        raise "Error instance must be passed" unless data.is_a?(DataSet)
+        raise "Data should not be empty" if data.data_items.length == 0
+
+        initialize_domain_data(data)
+        initialize_klass_index
+        initialize_pc
+        calculate_probabilities
+
+        return self
+      end
+
+      private
+
+      def initialize_domain_data(data)
+        @domains = data.build_domains
+        @data_items = data.data_items.map { |item| DataEntry.new(item[0...-1], item.last) }
+        @data_labels = data.data_labels[0...-1]
+        @klasses = @domains.last.to_a
+      end
+
+
+      # calculates the klass probability of a data entry
+      # as usual, the probability of the value is multiplied with every conditional
+      # probability of every attribute conditioned on a specific class
+      # this is repeated for every class
+      def calculate_class_probabilities_for_entry(data, prob)
+        prob.each_with_index do |prob_entry, prob_index|
+          data.each_with_index do |att, index|
+            next if value_index(att, index).nil?
+            prob[prob_index] *= @pcp[index][value_index(att, index)][prob_index]
+          end
+        end
+      end
+
+      # normalises the array of probabilities so the sum of the array equals 1
+      def normalize_class_probability(prob)
+        prob_sum = sum(prob)
+        prob_sum > 0 ?
+          prob.map {|prob_entry| prob_entry / prob_sum } :
+          prob
+      end
+
+      # sums an array up; returns a number of type Float
+      def sum(array)
+        array.inject(0.0){|b, i| b+i}
+      end
+
+      # returns the name of the class when the index is found
+      def index_to_klass(index)
+        @klass_index.has_value?(index) ? @klass_index.index(index) : nil
+      end
+
+      # initializes @values and @klass_index; maps a certain value to a unique index
+      def initialize_klass_index
+        @klasses.each_with_index do |dl, index|
+          @klass_index[dl] = index
+        end
+
+        @data_labels.each_with_index do |dl, index|
+          @values[index] = {}
+          @domains[index].each_with_index do |d, d_index|
+            @values[index][d] = d_index
+          end
+        end
+      end
+
+      # returns the index of a class
+      def klass_index(klass)
+        @klass_index[klass]
+      end
+
+      # returns the index of a value, depending on the attribute index
+      def value_index(value, dl_index)
+        @values[dl_index][value]
+      end
+
+      # builds an array of the form:
+      #   array[attributes][values][classes]
+      def build_array(dl, index)
+        domains = Array.new(@domains[index].length)
+        domains.map do |p1|
+          pl = Array.new @klasses.length, 0
+        end
+      end
+
+      # initializes the two arrays for storing the counts and conditional probabilities of
+      # the attributes
+      def initialize_pc
+        @data_labels.each_with_index do |dl, index|
+          @pcc << build_array(dl, index)
+          @pcp << build_array(dl, index)
+        end
+      end
+
+      # calculates the occurrences of a class and the instances of a certain value of a
+      # certain attribute and the assigned class.
+      # In addition to that, it also calculates the conditional probabilities and values
+      def calculate_probabilities
+        @klasses.each {|dl| @class_counts[klass_index(dl)] = 0}
+
+        calculate_class_probabilities
+        count_instances
+        calculate_conditional_probabilities
+      end
+
+      def calculate_class_probabilities
+        @data_items.each do |entry|
+          @class_counts[klass_index(entry.klass)] += 1
+        end
+
+        @class_counts.each_with_index do |k, index|
+          @class_prob[index] = k.to_f / @data_items.length
+        end
+      end
+
+      # counts the instances of a certain value of a certain attribute and the assigned class
+      def count_instances
+        @data_items.each do |item|
+          @data_labels.each_with_index do |dl, dl_index|
+            @pcc[dl_index][value_index(item[dl_index], dl_index)][klass_index(item.klass)] += 1
+          end
+        end
+      end
+
+      # calculates the conditional probability and stores it in the @pcp array
+      def calculate_conditional_probabilities
+        @pcc.each_with_index do |attributes, a_index|
+          attributes.each_with_index do |values, v_index|
+            values.each_with_index do |klass, k_index|
+              @pcp[a_index][v_index][k_index] = (klass.to_f + @m * @class_prob[k_index]) / (@class_counts[k_index] + @m).to_f
+            end
+          end
+        end
+      end
+
+      # DataEntry stores the instance of the data entry.
+      # The data is accessible via entries.
+      # Stores the class column in the attribute klass and
+      # removes the column for the class entry.
+      class DataEntry
+        attr_accessor :klass, :entries
+
+        def initialize(attributes, klass)
+          @klass = klass
+          @entries = attributes
+        end
+
+        # wrapper method for the access to @entries
+        def [](index)
+          @entries[index]
+        end
+      end
+
+    end
+  end
end
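To make the m-estimate and the "How to use it" comment above concrete, here is a small sketch. It is not part of the released diff: the counts used in the arithmetic, the bayes_data.csv file and the attribute values are hypothetical.

    # m-estimate from the comment: (n_c + m*p) / (n + m).
    # Hypothetical counts: 12 training rows, 6 of class 'No', so the prior
    # p = P('No') = 0.5 (this implementation uses the class prior as p),
    # n = 6 rows of class 'No', n_c = 3 of those with Color = 'Red', m = 3:
    #   (3 + 3 * 0.5) / (6 + 3)   # => 0.5
    require 'ai4r'
    include Ai4r::Classifiers
    include Ai4r::Data

    data = DataSet.new.load_csv_with_labels "bayes_data.csv"   # hypothetical file
    b = NaiveBayes.new.set_parameters({:m => 3}).build data
    puts b.eval(["Red", "SUV", "Domestic"])                    # => e.g. 'No'
    p b.get_probability_map(["Red", "SUV", "Domestic"])        # normalised class probabilities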
data/lib/ai4r/classifiers/one_r.rb
@@ -0,0 +1,110 @@
+# Author:: Sergio Fierens (Implementation only)
+# License:: MPL 1.1
+# Project:: ai4r
+# Url:: http://ai4r.rubyforge.org/
+#
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1 as published by the
+# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
+
+require 'set'
+require File.dirname(__FILE__) + '/../data/data_set'
+require File.dirname(__FILE__) + '/../classifiers/classifier'
+
+module Ai4r
+  module Classifiers
+
+    # = Introduction
+    #
+    # The idea of the OneR algorithm is to identify the single
+    # attribute to use to classify data that makes
+    # the fewest prediction errors.
+    # It generates rules based on a single attribute.
+    class OneR < Classifier
+
+      attr_reader :data_set, :rule
+
+      # Build a new OneR classifier. You must provide a DataSet instance
+      # as parameter. The last attribute of each item is considered as
+      # the item class.
+      def build(data_set)
+        data_set.check_not_empty
+        @data_set = data_set
+        if (data_set.num_attributes == 1)
+          @zero_r = ZeroR.new.build(data_set)
+          return self;
+        else
+          @zero_r = nil;
+        end
+        domains = @data_set.build_domains
+        @rule = nil
+        domains[1...-1].each_index do |attr_index|
+          rule = build_rule(@data_set.data_items, attr_index, domains)
+          @rule = rule if !@rule || rule[:correct] > @rule[:correct]
+        end
+        return self
+      end
+
+      # You can evaluate new data, predicting its class.
+      # e.g.
+      #   classifier.eval(['New York', '<30', 'F'])  # => 'Y'
+      def eval(data)
+        return @zero_r.eval(data) if @zero_r
+        attr_value = data[@rule[:attr_index]]
+        return @rule[:rule][attr_value]
+      end
+
+      # This method returns the generated rules in ruby code.
+      # e.g.
+      #
+      #   classifier.get_rules
+      #     # => if age_range == '<30' then marketing_target = 'Y'
+      #          elsif age_range == '[30-50)' then marketing_target = 'N'
+      #          elsif age_range == '[50-80]' then marketing_target = 'N'
+      #          end
+      #
+      # It is a nice way to inspect induction results, and also to execute them:
+      #   marketing_target = nil
+      #   eval classifier.get_rules
+      #   puts marketing_target
+      #     # => 'Y'
+      def get_rules
+        return @zero_r.get_rules if @zero_r
+        sentences = []
+        attr_label = @data_set.data_labels[@rule[:attr_index]]
+        class_label = @data_set.data_labels.last
+        @rule[:rule].each_pair do |attr_value, class_value|
+          sentences << "#{attr_label} == '#{attr_value}' then #{class_label} = '#{class_value}'"
+        end
+        return "if " + sentences.join("\nelsif ") + "\nend"
+      end
+
+      protected
+
+      def build_rule(data_examples, attr_index, domains)
+        domain = domains[attr_index]
+        value_freq = Hash.new
+        domain.each do |attr_value|
+          value_freq[attr_value] = Hash.new { |hash, key| hash[key] = 0 }
+        end
+        data_examples.each do |data|
+          value_freq[data[attr_index]][data.last] = value_freq[data[attr_index]][data.last] + 1
+        end
+        rule = {}
+        correct_instances = 0
+        value_freq.each_pair do |attr, class_freq_hash|
+          max_freq = 0
+          class_freq_hash.each_pair do |class_value, freq|
+            if max_freq < freq
+              rule[attr] = class_value
+              max_freq = freq
+            end
+          end
+          correct_instances += max_freq
+        end
+        return {:attr_index => attr_index, :rule => rule, :correct => correct_instances}
+      end
+
+    end
+  end
+end
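The comments above describe build, eval and get_rules; the sketch below exercises them end to end. It is illustrative only and not part of the released diff: the labels and rows are hypothetical, and the printed rule set is simply what this particular toy data produces, where age_range happens to make the fewest errors.

    require 'ai4r'

    # Hypothetical marketing data; the last column is the class.
    items  = [['New York', '<30',     'M', 'Y'],
              ['Chicago',  '<30',     'F', 'Y'],
              ['New York', '[30-50)', 'F', 'N'],
              ['Chicago',  '[50-80]', 'M', 'N']]
    labels = ['city', 'age_range', 'gender', 'marketing_target']
    data_set = Ai4r::Data::DataSet.new(:data_items => items, :data_labels => labels)

    classifier = Ai4r::Classifiers::OneR.new.build(data_set)
    puts classifier.eval(['New York', '<30', 'F'])   # => 'Y'
    puts classifier.get_rules
    # => if age_range == '<30' then marketing_target = 'Y'
    #    elsif age_range == '[30-50)' then marketing_target = 'N'
    #    elsif age_range == '[50-80]' then marketing_target = 'N'
    #    end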