ai4ruby 1.11
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +47 -0
- data/examples/classifiers/id3_data.csv +121 -0
- data/examples/classifiers/id3_example.rb +29 -0
- data/examples/classifiers/naive_bayes_data.csv +11 -0
- data/examples/classifiers/naive_bayes_example.rb +16 -0
- data/examples/classifiers/results.txt +31 -0
- data/examples/genetic_algorithm/genetic_algorithm_example.rb +37 -0
- data/examples/genetic_algorithm/travel_cost.csv +16 -0
- data/examples/neural_network/backpropagation_example.rb +67 -0
- data/examples/neural_network/patterns_with_base_noise.rb +68 -0
- data/examples/neural_network/patterns_with_noise.rb +66 -0
- data/examples/neural_network/training_patterns.rb +68 -0
- data/examples/neural_network/xor_example.rb +35 -0
- data/examples/som/som_data.rb +156 -0
- data/examples/som/som_multi_node_example.rb +22 -0
- data/examples/som/som_single_example.rb +24 -0
- data/lib/ai4r.rb +33 -0
- data/lib/ai4r/classifiers/classifier.rb +62 -0
- data/lib/ai4r/classifiers/hyperpipes.rb +118 -0
- data/lib/ai4r/classifiers/ib1.rb +121 -0
- data/lib/ai4r/classifiers/id3.rb +326 -0
- data/lib/ai4r/classifiers/multilayer_perceptron.rb +135 -0
- data/lib/ai4r/classifiers/naive_bayes.rb +259 -0
- data/lib/ai4r/classifiers/one_r.rb +110 -0
- data/lib/ai4r/classifiers/prism.rb +197 -0
- data/lib/ai4r/classifiers/zero_r.rb +73 -0
- data/lib/ai4r/clusterers/average_linkage.rb +59 -0
- data/lib/ai4r/clusterers/bisecting_k_means.rb +93 -0
- data/lib/ai4r/clusterers/centroid_linkage.rb +66 -0
- data/lib/ai4r/clusterers/clusterer.rb +61 -0
- data/lib/ai4r/clusterers/complete_linkage.rb +67 -0
- data/lib/ai4r/clusterers/diana.rb +139 -0
- data/lib/ai4r/clusterers/k_means.rb +126 -0
- data/lib/ai4r/clusterers/median_linkage.rb +61 -0
- data/lib/ai4r/clusterers/single_linkage.rb +194 -0
- data/lib/ai4r/clusterers/ward_linkage.rb +64 -0
- data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +31 -0
- data/lib/ai4r/clusterers/weighted_average_linkage.rb +61 -0
- data/lib/ai4r/data/data_set.rb +266 -0
- data/lib/ai4r/data/parameterizable.rb +64 -0
- data/lib/ai4r/data/proximity.rb +100 -0
- data/lib/ai4r/data/statistics.rb +77 -0
- data/lib/ai4r/experiment/classifier_evaluator.rb +95 -0
- data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +270 -0
- data/lib/ai4r/neural_network/backpropagation.rb +326 -0
- data/lib/ai4r/neural_network/hopfield.rb +149 -0
- data/lib/ai4r/som/layer.rb +68 -0
- data/lib/ai4r/som/node.rb +96 -0
- data/lib/ai4r/som/som.rb +155 -0
- data/lib/ai4r/som/two_phase_layer.rb +90 -0
- data/test/classifiers/hyperpipes_test.rb +84 -0
- data/test/classifiers/ib1_test.rb +78 -0
- data/test/classifiers/id3_test.rb +208 -0
- data/test/classifiers/multilayer_perceptron_test.rb +79 -0
- data/test/classifiers/naive_bayes_test.rb +43 -0
- data/test/classifiers/one_r_test.rb +62 -0
- data/test/classifiers/prism_test.rb +85 -0
- data/test/classifiers/zero_r_test.rb +49 -0
- data/test/clusterers/average_linkage_test.rb +51 -0
- data/test/clusterers/bisecting_k_means_test.rb +66 -0
- data/test/clusterers/centroid_linkage_test.rb +53 -0
- data/test/clusterers/complete_linkage_test.rb +57 -0
- data/test/clusterers/diana_test.rb +69 -0
- data/test/clusterers/k_means_test.rb +100 -0
- data/test/clusterers/median_linkage_test.rb +53 -0
- data/test/clusterers/single_linkage_test.rb +122 -0
- data/test/clusterers/ward_linkage_hierarchical_test.rb +61 -0
- data/test/clusterers/ward_linkage_test.rb +53 -0
- data/test/clusterers/weighted_average_linkage_test.rb +53 -0
- data/test/data/data_set_test.rb +96 -0
- data/test/data/proximity_test.rb +81 -0
- data/test/data/statistics_test.rb +65 -0
- data/test/experiment/classifier_evaluator_test.rb +76 -0
- data/test/genetic_algorithm/chromosome_test.rb +58 -0
- data/test/genetic_algorithm/genetic_algorithm_test.rb +81 -0
- data/test/neural_network/backpropagation_test.rb +82 -0
- data/test/neural_network/hopfield_test.rb +72 -0
- data/test/som/som_test.rb +97 -0
- metadata +168 -0
@@ -0,0 +1,135 @@
|
|
1
|
+
# Author:: Sergio Fierens (Implementation only)
|
2
|
+
# License:: MPL 1.1
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
5
|
+
#
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
|
+
|
10
|
+
require File.dirname(__FILE__) + '/../data/data_set.rb'
|
11
|
+
require File.dirname(__FILE__) + '/../classifiers/classifier'
|
12
|
+
require File.dirname(__FILE__) + '/../neural_network/backpropagation'
|
13
|
+
|
14
|
+
module Ai4r
|
15
|
+
module Classifiers
|
16
|
+
|
17
|
+
# = Introduction
|
18
|
+
#
|
19
|
+
# The idea behind the MultilayerPerceptron classifier is to
|
20
|
+
# train a Multilayer Perceptron neural network with the provided examples,
|
21
|
+
# and predict the class for new data items.
|
22
|
+
#
|
23
|
+
# = Parameters
|
24
|
+
#
|
25
|
+
# Use class method get_parameters_info to obtain details on the algorithm
|
26
|
+
# parameters. Use set_parameters to set values for this parameters.
|
27
|
+
# See Parameterizable module documentation.
|
28
|
+
#
|
29
|
+
# * :network_class => Neural network implementation class.
|
30
|
+
# By default: Ai4r::NeuralNetwork::Backpropagation.
|
31
|
+
# * :network_parameters => Parameters to be forwarded to the back end
|
32
|
+
# neural network.
|
33
|
+
# * :hidden_layers => Hidden layer structure. E.g. [8, 6] will generate
|
34
|
+
# 2 hidden layers with 8 and 6 neurons each. By default []
|
35
|
+
# * :training_iterations => How many times the training should be repeated.
|
36
|
+
# By default: 500.
|
37
|
+
# * :active_node_value => Value fed to an input/output neuron that is "on". Default: 1
|
38
|
+
# * :inactive_node_value => Value fed to an input/output neuron that is "off". Default: 0
|
39
|
+
class MultilayerPerceptron < Classifier

  attr_reader :data_set, :class_value, :network, :domains

  parameters_info :network_class => "Neural network implementation class."+
      "By default: Ai4r::NeuralNetwork::Backpropagation.",
    :network_parameters => "parameters to be forwarded to the back end " +
      "neural network.",
    :hidden_layers => "Hidden layer structure. E.g. [8, 6] will generate " +
      "2 hidden layers with 8 and 6 neurons each. By default []",
    :training_iterations => "How many times the training should be " +
      "repeated. By default: 500",
    :active_node_value => "Default: 1",
    :inactive_node_value => "Default: 0"

  # Sets the default value of every parameter listed in parameters_info.
  def initialize
    @network_class = Ai4r::NeuralNetwork::Backpropagation
    @hidden_layers = []
    @training_iterations = 500
    @network_parameters = {}
    @active_node_value = 1
    @inactive_node_value = 0
  end

  # Build a new MultilayerPerceptron classifier. You must provide a DataSet
  # instance as parameter. The last attribute of each item is considered as
  # the item class.
  #
  # Every distinct value of every non-class attribute gets its own input
  # neuron, and every distinct class value gets its own output neuron.
  # The whole data set is presented to the network @training_iterations
  # times. Returns self.
  def build(data_set)
    data_set.check_not_empty
    @data_set = data_set
    @domains = @data_set.build_domains.collect { |domain| domain.to_a }
    @outputs = @domains.last.length
    @inputs = @domains[0...-1].inject(0) { |total, domain| total + domain.length }
    @structure = [@inputs] + @hidden_layers + [@outputs]
    @network = @network_class.new @structure
    forward_network_parameters
    @training_iterations.times do
      @data_set.data_items.each do |data_item|
        input_values = data_to_input(data_item[0...-1])
        output_values = data_to_output(data_item.last)
        @network.train(input_values, output_values)
      end
    end
    self
  end

  # You can evaluate new data, predicting its class.
  # e.g.
  #   classifier.eval(['New York', '<30', 'F'])  # => 'Y'
  def eval(data)
    output_values = @network.eval(data_to_input(data))
    @domains.last[get_max_index(output_values)]
  end

  # Multilayer Perceptron Classifiers cannot generate
  # human-readable rules. The returned string is Ruby code that raises
  # when it is evaluated.
  def get_rules
    "raise 'Neural networks classifiers do not generate human-readable rules.'"
  end

  protected

  # Hands @network_parameters over to the freshly created network.
  # Nothing is done when no parameters were configured or when the network
  # implementation does not expose set_parameters.
  def forward_network_parameters
    return if @network_parameters.nil? || @network_parameters.empty?
    @network.set_parameters(@network_parameters) if @network.respond_to?(:set_parameters)
  end

  # One-hot encodes the attribute values of an item (without its class).
  # Each attribute owns a contiguous group of input slots, one per value of
  # its domain; the slot of the observed value is set to @active_node_value.
  # A value that is not in the attribute's domain leaves its whole group
  # inactive.
  def data_to_input(data_item)
    input_values = Array.new(@inputs, @inactive_node_value)
    offset = 0
    data_item.each_with_index do |att_value, att_index|
      domain = @domains[att_index]
      domain_index = domain.index(att_value)
      input_values[offset + domain_index] = @active_node_value unless domain_index.nil?
      # The offset must accumulate over all previous attributes; otherwise
      # groups from the third attribute onward overlap earlier ones.
      offset += domain.length
    end
    input_values
  end

  # One-hot encodes a class value over the output neurons.
  def data_to_output(data_item)
    output_values = Array.new(@outputs, @inactive_node_value)
    output_values[@domains.last.index(data_item)] = @active_node_value
    output_values
  end

  # Returns the index of the first output strictly greater than every
  # output before it and than @inactive_node_value; 0 when no output
  # exceeds @inactive_node_value.
  def get_max_index(output_values)
    max_value = @inactive_node_value
    max_index = 0
    output_values.each_with_index do |value, output_index|
      next unless max_value < value
      max_value = value
      max_index = output_index
    end
    max_index
  end

end
|
133
|
+
|
134
|
+
end
|
135
|
+
end
|
@@ -0,0 +1,259 @@
|
|
1
|
+
# Author:: Thomas Kern
|
2
|
+
# License:: MPL 1.1
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
5
|
+
#
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
|
+
|
10
|
+
require File.dirname(__FILE__) + '/../data/data_set'
|
11
|
+
require File.dirname(__FILE__) + '/classifier'
|
12
|
+
|
13
|
+
module Ai4r
|
14
|
+
module Classifiers
|
15
|
+
|
16
|
+
|
17
|
+
# = Introduction
|
18
|
+
#
|
19
|
+
# This is an implementation of a Naive Bayesian Classifier without any
|
20
|
+
# specialisation (ie. for text classification)
|
21
|
+
# Probabilities P(a_i | v_j) are estimated using m-estimates, hence the
|
22
|
+
# :m parameter, which is configured through set_parameters (see "How to use it" below).
|
23
|
+
# The estimation looks like this:
|
24
|
+
#(n_c + mp) / (n + m)
|
25
|
+
#
|
26
|
+
# the variables are:
|
27
|
+
# n = the number of training examples for which v = v_j
|
28
|
+
# n_c = number of examples for which v = v_j and a = a_i
|
29
|
+
# p = a priori estimate for P(a_i | v_j)
|
30
|
+
# m = the equivalent sample size
|
31
|
+
#
|
32
|
+
# stores the conditional probabilities in an array named @pcp and in this form:
|
33
|
+
# @pcp[attributes][values][classes]
|
34
|
+
#
|
35
|
+
# This kind of estimator is useful when the training data set is relatively small.
|
36
|
+
# If the data set is big enough, set m to 0, which is also the default value.
|
37
|
+
#
|
38
|
+
#
|
39
|
+
# For further details regarding Bayes and Naive Bayes Classifier have a look at those websites:
|
40
|
+
# http://en.wikipedia.org/wiki/Naive_Bayesian_classification
|
41
|
+
# http://en.wikipedia.org/wiki/Bayes%27_theorem
|
42
|
+
#
|
43
|
+
#
|
44
|
+
# = Parameters
|
45
|
+
#
|
46
|
+
# * :m => Optional. Default value is set to 0. It may be set to a value greater than 0 when
|
47
|
+
# the size of the dataset is relatively small
|
48
|
+
#
|
49
|
+
# = How to use it
|
50
|
+
#
|
51
|
+
# data = DataSet.new.load_csv_with_labels "bayes_data.csv"
|
52
|
+
# b = NaiveBayes.new.
|
53
|
+
# set_parameters({:m=>3}).
|
54
|
+
# build data
|
55
|
+
# b.eval(["Red", "SUV", "Domestic"])
|
56
|
+
#
|
57
|
+
class NaiveBayes < Classifier

  parameters_info :m => "Default value is set to 0. It may be set to a value greater than " +
    "0 when the size of the dataset is relatively small"

  def initialize
    @m = 0
    reset_model
  end

  # You can evaluate new data, predicting its category.
  # e.g.
  #   b.eval(["Red", "SUV", "Domestic"])
  #     => 'No'
  def eval(data)
    prob = calculate_class_probabilities_for_entry(data, @class_prob.dup)
    index_to_klass(prob.index(prob.max))
  end

  # Calculates the probabilities for the data entry data.
  # data has to be an array of the same dimension as the training data minus the
  # class column.
  # Returns a map containing all classes as keys:
  # {Class_1 => probability, Class_2 => probability2 ... }
  # Probability is <= 1 and of type Float.
  # e.g.
  #   b.get_probability_map(["Red", "SUV", "Domestic"])
  #     => {"Yes"=>0.4166666666666667, "No"=>0.5833333333333334}
  def get_probability_map(data)
    prob = calculate_class_probabilities_for_entry(data, @class_prob.dup)
    prob = normalize_class_probability(prob)
    probability_map = {}
    prob.each_with_index { |p, i| probability_map[index_to_klass(i)] = p }
    probability_map
  end

  # Counts values of the attribute instances and calculates the probability
  # of the classes and the conditional probabilities.
  # Parameter data has to be an instance of DataSet with at least one item;
  # a RuntimeError is raised otherwise.
  # Any model learnt by a previous call is discarded first, so the same
  # instance can be rebuilt. Returns self.
  def build(data)
    raise "Error instance must be passed" unless data.is_a?(DataSet)
    raise "Data should not be empty" if data.data_items.length == 0

    reset_model
    initialize_domain_data(data)
    initialize_klass_index
    initialize_pc
    calculate_probabilities

    self
  end

  private

  # Clears everything learnt from data; the :m parameter is left untouched.
  def reset_model
    @class_counts = []
    @class_prob = []    # stores the probability of the classes
    @pcc = []           # stores the number of instances divided into attribute/value/class
    @pcp = []           # stores the conditional probabilities of the values of an attribute
    @klass_index = {}   # hashmap for quick lookup of all the used klasses and their index
    @values = {}        # hashmap for quick lookup of all the values
  end

  # Extracts domains, items (split into attributes and class), attribute
  # labels and the list of classes from the data set.
  def initialize_domain_data(data)
    @domains = data.build_domains
    @data_items = data.data_items.map { |item| DataEntry.new(item[0...-1], item.last) }
    @data_labels = data.data_labels[0...-1]
    @klasses = @domains.last.to_a
  end

  # Calculates the klass probability of a data entry.
  # prob must hold the class priors; for every class it is multiplied in
  # place with the conditional probability of each attribute value given
  # that class. Attribute values never seen during training are skipped.
  # Returns prob.
  def calculate_class_probabilities_for_entry(data, prob)
    prob.each_index do |prob_index|
      data.each_with_index do |att, index|
        v_index = value_index(att, index)
        next if v_index.nil?
        prob[prob_index] *= @pcp[index][v_index][prob_index]
      end
    end
    prob
  end

  # Normalises the array of probabilities so the sum of the array equals 1.
  # The array is returned unchanged when its sum is not positive.
  def normalize_class_probability(prob)
    prob_sum = sum(prob)
    return prob unless prob_sum > 0
    prob.map { |prob_entry| prob_entry / prob_sum }
  end

  # Sums an array up; returns a number of type Float.
  def sum(array)
    array.inject(0.0) { |b, i| b + i }
  end

  # Returns the name of the class stored under index, or nil when no class
  # has that index. Hash#key is used because Hash#index no longer exists
  # in current Ruby versions.
  def index_to_klass(index)
    @klass_index.key(index)
  end

  # Initializes @values and @klass_index; maps a certain value to a uniq index.
  def initialize_klass_index
    @klasses.each_with_index do |dl, index|
      @klass_index[dl] = index
    end

    @data_labels.each_index do |index|
      @values[index] = {}
      @domains[index].each_with_index do |d, d_index|
        @values[index][d] = d_index
      end
    end
  end

  # Returns the index of a class.
  def klass_index(klass)
    @klass_index[klass]
  end

  # Returns the index of a value, depending on the attribute index.
  def value_index(value, dl_index)
    @values[dl_index][value]
  end

  # Builds, for the attribute at index, a zero-filled table of the form
  #   array[values][classes]
  def build_array(dl, index)
    Array.new(@domains[index].length) { Array.new(@klasses.length, 0) }
  end

  # Initializes the two arrays for storing the count and conditional
  # probabilities of the attributes.
  def initialize_pc
    @data_labels.each_with_index do |dl, index|
      @pcc << build_array(dl, index)
      @pcp << build_array(dl, index)
    end
  end

  # Calculates the occurrences of a class and the instances of a certain value of a
  # certain attribute and the assigned class.
  # In addition to that, it also calculates the conditional probabilities and values.
  def calculate_probabilities
    @klasses.each { |dl| @class_counts[klass_index(dl)] = 0 }

    calculate_class_probabilities
    count_instances
    calculate_conditional_probabilities
  end

  # Counts the items per class and derives the class priors from the counts.
  def calculate_class_probabilities
    @data_items.each do |entry|
      @class_counts[klass_index(entry.klass)] += 1
    end

    @class_counts.each_with_index do |k, index|
      @class_prob[index] = k.to_f / @data_items.length
    end
  end

  # Counts the instances of a certain value of a certain attribute and the assigned class.
  def count_instances
    @data_items.each do |item|
      @data_labels.each_index do |dl_index|
        @pcc[dl_index][value_index(item[dl_index], dl_index)][klass_index(item.klass)] += 1
      end
    end
  end

  # Calculates the conditional probability and stores it in the @pcp-array.
  # Uses the m-estimate (n_c + m * p) / (n + m), with the class prior as p.
  def calculate_conditional_probabilities
    @pcc.each_with_index do |attributes, a_index|
      attributes.each_with_index do |values, v_index|
        values.each_with_index do |klass, k_index|
          @pcp[a_index][v_index][k_index] = (klass.to_f + @m * @class_prob[k_index]) / (@class_counts[k_index] + @m).to_f
        end
      end
    end
  end

  # DataEntry stores the instance of the data entry.
  # The attribute values are accessible via entries (or []),
  # the class column is kept separately in klass.
  class DataEntry
    attr_accessor :klass, :entries

    def initialize(attributes, klass)
      @klass = klass
      @entries = attributes
    end

    # wrapper method for the access to @entries
    def [](index)
      @entries[index]
    end
  end

end
|
258
|
+
end
|
259
|
+
end
|
@@ -0,0 +1,110 @@
|
|
1
|
+
# Author:: Sergio Fierens (Implementation only)
|
2
|
+
# License:: MPL 1.1
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
5
|
+
#
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
|
+
|
10
|
+
require 'set'
|
11
|
+
require File.dirname(__FILE__) + '/../data/data_set'
|
12
|
+
require File.dirname(__FILE__) + '/../classifiers/classifier'
|
13
|
+
|
14
|
+
module Ai4r
|
15
|
+
module Classifiers
|
16
|
+
|
17
|
+
# = Introduction
|
18
|
+
#
|
19
|
+
# The idea of the OneR algorithm is to identify the single
|
20
|
+
# attribute to use to classify data that makes
|
21
|
+
# fewest prediction errors.
|
22
|
+
# It generates rules based on a single attribute.
|
23
|
+
class OneR < Classifier

  attr_reader :data_set, :rule

  # Build a new OneR classifier. You must provide a DataSet instance
  # as parameter. The last attribute of each item is considered as
  # the item class.
  #
  # When the data set only has the class column, a ZeroR classifier is
  # built and used instead. Otherwise a rule is built for every non-class
  # attribute and the one classifying most training items correctly is
  # kept (the earliest attribute wins ties). Returns self.
  def build(data_set)
    data_set.check_not_empty
    @data_set = data_set
    if data_set.num_attributes == 1
      @zero_r = ZeroR.new.build(data_set)
      return self
    end
    @zero_r = nil
    domains = @data_set.build_domains
    @rule = nil
    # Every domain but the last (the class) is a candidate attribute.
    # The slice has to start at 0: each_index enumerates positions of the
    # slice, so starting at 1 silently drops the last candidate.
    domains[0...-1].each_index do |attr_index|
      rule = build_rule(@data_set.data_items, attr_index, domains)
      @rule = rule if !@rule || rule[:correct] > @rule[:correct]
    end
    self
  end

  # You can evaluate new data, predicting its class.
  # e.g.
  #   classifier.eval(['New York', '<30', 'F'])  # => 'Y'
  #
  # Returns nil when the value of the selected attribute was not present
  # in the training data.
  def eval(data)
    return @zero_r.eval(data) if @zero_r
    @rule[:rule][data[@rule[:attr_index]]]
  end

  # This method returns the generated rules in ruby code.
  # e.g.
  #
  #   classifier.get_rules
  #     # =>  if age_range == '<30' then marketing_target = 'Y'
  #           elsif age_range == '[30-50)' then marketing_target = 'N'
  #           elsif age_range == '[50-80]' then marketing_target = 'N'
  #           end
  #
  # It is a nice way to inspect induction results, and also to execute them:
  #     marketing_target = nil
  #     eval classifier.get_rules
  #     puts marketing_target
  #       # =>  'Y'
  def get_rules
    return @zero_r.get_rules if @zero_r
    attr_label = @data_set.data_labels[@rule[:attr_index]]
    class_label = @data_set.data_labels.last
    sentences = @rule[:rule].map do |attr_value, class_value|
      "#{attr_label} == '#{attr_value}' then #{class_label} = '#{class_value}'"
    end
    "if " + sentences.join("\nelsif ") + "\nend"
  end

  protected

  # Builds the rule for one attribute: each value of the attribute is
  # mapped to the class it most frequently appears with in data_examples.
  # Returns a hash with the attribute index (:attr_index), the value to
  # class mapping (:rule) and the number of training items the mapping
  # classifies correctly (:correct).
  def build_rule(data_examples, attr_index, domains)
    # value_freq[attribute value][class value] => number of items
    value_freq = {}
    domains[attr_index].each do |attr_value|
      value_freq[attr_value] = Hash.new { |hash, key| hash[key] = 0 }
    end
    data_examples.each do |data|
      value_freq[data[attr_index]][data.last] += 1
    end
    rule = {}
    correct_instances = 0
    value_freq.each_pair do |attr, class_freq_hash|
      max_freq = 0
      class_freq_hash.each_pair do |class_value, freq|
        next unless max_freq < freq
        rule[attr] = class_value
        max_freq = freq
      end
      correct_instances += max_freq
    end
    {:attr_index => attr_index, :rule => rule, :correct => correct_instances}
  end

end
|
109
|
+
end
|
110
|
+
end
|