ai4ruby 1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +47 -0
- data/examples/classifiers/id3_data.csv +121 -0
- data/examples/classifiers/id3_example.rb +29 -0
- data/examples/classifiers/naive_bayes_data.csv +11 -0
- data/examples/classifiers/naive_bayes_example.rb +16 -0
- data/examples/classifiers/results.txt +31 -0
- data/examples/genetic_algorithm/genetic_algorithm_example.rb +37 -0
- data/examples/genetic_algorithm/travel_cost.csv +16 -0
- data/examples/neural_network/backpropagation_example.rb +67 -0
- data/examples/neural_network/patterns_with_base_noise.rb +68 -0
- data/examples/neural_network/patterns_with_noise.rb +66 -0
- data/examples/neural_network/training_patterns.rb +68 -0
- data/examples/neural_network/xor_example.rb +35 -0
- data/examples/som/som_data.rb +156 -0
- data/examples/som/som_multi_node_example.rb +22 -0
- data/examples/som/som_single_example.rb +24 -0
- data/lib/ai4r.rb +33 -0
- data/lib/ai4r/classifiers/classifier.rb +62 -0
- data/lib/ai4r/classifiers/hyperpipes.rb +118 -0
- data/lib/ai4r/classifiers/ib1.rb +121 -0
- data/lib/ai4r/classifiers/id3.rb +326 -0
- data/lib/ai4r/classifiers/multilayer_perceptron.rb +135 -0
- data/lib/ai4r/classifiers/naive_bayes.rb +259 -0
- data/lib/ai4r/classifiers/one_r.rb +110 -0
- data/lib/ai4r/classifiers/prism.rb +197 -0
- data/lib/ai4r/classifiers/zero_r.rb +73 -0
- data/lib/ai4r/clusterers/average_linkage.rb +59 -0
- data/lib/ai4r/clusterers/bisecting_k_means.rb +93 -0
- data/lib/ai4r/clusterers/centroid_linkage.rb +66 -0
- data/lib/ai4r/clusterers/clusterer.rb +61 -0
- data/lib/ai4r/clusterers/complete_linkage.rb +67 -0
- data/lib/ai4r/clusterers/diana.rb +139 -0
- data/lib/ai4r/clusterers/k_means.rb +126 -0
- data/lib/ai4r/clusterers/median_linkage.rb +61 -0
- data/lib/ai4r/clusterers/single_linkage.rb +194 -0
- data/lib/ai4r/clusterers/ward_linkage.rb +64 -0
- data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +31 -0
- data/lib/ai4r/clusterers/weighted_average_linkage.rb +61 -0
- data/lib/ai4r/data/data_set.rb +266 -0
- data/lib/ai4r/data/parameterizable.rb +64 -0
- data/lib/ai4r/data/proximity.rb +100 -0
- data/lib/ai4r/data/statistics.rb +77 -0
- data/lib/ai4r/experiment/classifier_evaluator.rb +95 -0
- data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +270 -0
- data/lib/ai4r/neural_network/backpropagation.rb +326 -0
- data/lib/ai4r/neural_network/hopfield.rb +149 -0
- data/lib/ai4r/som/layer.rb +68 -0
- data/lib/ai4r/som/node.rb +96 -0
- data/lib/ai4r/som/som.rb +155 -0
- data/lib/ai4r/som/two_phase_layer.rb +90 -0
- data/test/classifiers/hyperpipes_test.rb +84 -0
- data/test/classifiers/ib1_test.rb +78 -0
- data/test/classifiers/id3_test.rb +208 -0
- data/test/classifiers/multilayer_perceptron_test.rb +79 -0
- data/test/classifiers/naive_bayes_test.rb +43 -0
- data/test/classifiers/one_r_test.rb +62 -0
- data/test/classifiers/prism_test.rb +85 -0
- data/test/classifiers/zero_r_test.rb +49 -0
- data/test/clusterers/average_linkage_test.rb +51 -0
- data/test/clusterers/bisecting_k_means_test.rb +66 -0
- data/test/clusterers/centroid_linkage_test.rb +53 -0
- data/test/clusterers/complete_linkage_test.rb +57 -0
- data/test/clusterers/diana_test.rb +69 -0
- data/test/clusterers/k_means_test.rb +100 -0
- data/test/clusterers/median_linkage_test.rb +53 -0
- data/test/clusterers/single_linkage_test.rb +122 -0
- data/test/clusterers/ward_linkage_hierarchical_test.rb +61 -0
- data/test/clusterers/ward_linkage_test.rb +53 -0
- data/test/clusterers/weighted_average_linkage_test.rb +53 -0
- data/test/data/data_set_test.rb +96 -0
- data/test/data/proximity_test.rb +81 -0
- data/test/data/statistics_test.rb +65 -0
- data/test/experiment/classifier_evaluator_test.rb +76 -0
- data/test/genetic_algorithm/chromosome_test.rb +58 -0
- data/test/genetic_algorithm/genetic_algorithm_test.rb +81 -0
- data/test/neural_network/backpropagation_test.rb +82 -0
- data/test/neural_network/hopfield_test.rb +72 -0
- data/test/som/som_test.rb +97 -0
- metadata +168 -0
# Author:: Sergio Fierens (Implementation only)
# License:: MPL 1.1
# Project:: ai4r
# Url:: http://ai4r.rubyforge.org/
#
# You can redistribute it and/or modify it under the terms of
# the Mozilla Public License version 1.1 as published by the
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt

require 'set'
require File.dirname(__FILE__) + '/../data/data_set'
require File.dirname(__FILE__) + '/../classifiers/classifier'

module Ai4r
  module Classifiers

    include Ai4r::Data

    # = Introduction
    #
    # A fast classifier algorithm, created by Lucio de Souza Coelho
    # and Len Trigg. One "pipe" is kept per category: an array with, per
    # attribute, either a numeric [min, max] interval or the set of nominal
    # values seen for that category. Evaluation counts how many attributes
    # of an instance fall inside each category's pipe and picks the winner.
    class Hyperpipes < Classifier

      attr_reader :data_set, :pipes

      # Build a new Hyperpipes classifier. You must provide a DataSet instance
      # as parameter. The last attribute of each item is considered as
      # the item class.
      def build(data_set)
        data_set.check_not_empty
        @data_set = data_set
        @domains = data_set.build_domains

        # One fresh pipe per category, then widen each pipe with every
        # training item that belongs to that category.
        @pipes = @domains.last.each_with_object({}) do |category, acc|
          acc[category] = build_pipe(@data_set)
        end
        @data_set.data_items.each do |item|
          update_pipe(@pipes[item.last], item)
        end

        self
      end

      # You can evaluate new data, predicting its class.
      # e.g.
      #   classifier.eval(['New York', '<30', 'F'])  # => 'Y'
      def eval(data)
        votes = Hash.new { 0 }
        @pipes.each do |category, pipe|
          pipe.each_with_index do |bounds, index|
            value = data[index]
            hit = if value.is_a?(Numeric)
                    value >= bounds[:min] && value <= bounds[:max]
                  else
                    bounds[value]
                  end
            votes[category] += 1 if hit
          end
        end
        votes.max_by { |_category, count| count }.first
      end

      # This method returns the generated rules in ruby code.
      # e.g.
      #
      #   classifier.get_rules
      #   # =>  if age_range == '<30' then marketing_target = 'Y'
      #         elsif age_range == '[30-50)' then marketing_target = 'N'
      #         elsif age_range == '[50-80]' then marketing_target = 'N'
      #         end
      #
      # It is a nice way to inspect induction results, and also to execute them:
      #   marketing_target = nil
      #   eval classifier.get_rules
      #   puts marketing_target
      #   # => 'Y'
      def get_rules
        rules = ["votes = Hash.new {0}"]
        sample = @data_set.data_items.first
        labels = @data_set.data_labels.map { |label| label.to_s }
        @pipes.each do |category, pipe|
          pipe.each_with_index do |bounds, i|
            rule = "votes['#{category}'] += 1 "
            rule += if sample[i].is_a?(Numeric)
                      "if #{labels[i]} >= #{bounds[:min]} && #{labels[i]} <= #{bounds[:max]}"
                    else
                      "if #{bounds.inspect}[#{labels[i]}]"
                    end
            rules << rule
          end
        end
        rules << "#{labels.last} = votes.to_a.max {|x, y| x.last <=> y.last}.first"
        rules.join("\n")
      end

      protected

      # Fresh, empty pipe: an inverted [+inf, -inf] interval for numeric
      # attributes, and a set-like hash defaulting to false for nominal ones.
      # The class attribute (last column) is excluded.
      def build_pipe(data_set)
        data_set.data_items.first[0...-1].map do |attribute|
          if attribute.is_a?(Numeric)
            { :min => 1.0 / 0, :max => -1.0 / 0 }
          else
            Hash.new(false)
          end
        end
      end

      # Widen the given pipe so that it covers data_item (class excluded).
      def update_pipe(pipe, data_item)
        data_item[0...-1].each_with_index do |attribute, index|
          slot = pipe[index]
          if attribute.is_a?(Numeric)
            slot[:min] = attribute if attribute < slot[:min]
            slot[:max] = attribute if attribute > slot[:max]
          else
            slot[attribute] = true
          end
        end
      end

    end
  end
end
|
# Author:: Sergio Fierens (Implementation only)
# License:: MPL 1.1
# Project:: ai4r
# Url:: http://ai4r.rubyforge.org/
#
# You can redistribute it and/or modify it under the terms of
# the Mozilla Public License version 1.1 as published by the
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt

require 'set'
require File.dirname(__FILE__) + '/../data/data_set'
require File.dirname(__FILE__) + '/../classifiers/classifier'

module Ai4r
  module Classifiers

    # = Introduction
    #
    # IB1 algorithm implementation.
    # IB1 is the simplest instance-based learning (IBL) algorithm.
    #
    # D. Aha, D. Kibler (1991). Instance-based learning algorithms.
    # Machine Learning. 6:37-66.
    #
    # IB1 is identical to the nearest neighbor algorithm except that
    # it normalizes its attributes' ranges, processes instances
    # incrementally, and has a simple policy for tolerating missing values
    class IB1 < Classifier

      attr_reader :data_set

      # Build a new IB1 classifier. You must provide a DataSet instance
      # as parameter. The last attribute of each item is considered as
      # the item class.
      def build(data_set)
        data_set.check_not_empty
        @data_set = data_set
        # Per-attribute min/max observed so far (numeric attributes only),
        # used to normalize distances to the [0, 1] range.
        @min_values = Array.new(data_set.data_labels.length)
        @max_values = Array.new(data_set.data_labels.length)
        data_set.data_items.each { |data_item| update_min_max(data_item[0...-1]) }
        return self
      end

      # You can evaluate new data, predicting its class.
      # e.g.
      #   classifier.eval(['New York', '<30', 'F'])  # => 'Y'
      # Returns the class of the nearest training instance
      # (first one wins on ties).
      def eval(data)
        # Incremental learning: the new instance also updates the ranges.
        update_min_max(data)
        min_distance = 1.0 / 0
        klass = nil
        @data_set.data_items.each do |train_item|
          d = distance(data, train_item)
          if d < min_distance
            min_distance = d
            klass = train_item.last
          end
        end
        return klass
      end

      protected

      # We keep in the state the min and max value of each attribute,
      # to provide normalized distances between two values of a numeric
      # attribute. Nil and non-numeric attributes are ignored.
      def update_min_max(atts)
        atts.each_with_index do |att, i|
          if att && att.is_a?(Numeric)
            @min_values[i] = att if @min_values[i].nil? || @min_values[i] > att
            @max_values[i] = att if @max_values[i].nil? || @max_values[i] < att
          end
        end
      end

      # Normalized distance between 2 instances.
      #
      # Returns sum of the squares of:
      # * difference between normalized numeric att values
      # * 1 for nominal atts which differ or when exactly one is missing
      # * 1 if both atts are missing
      # * normalized numeric att value if other att value is missing and > 0.5
      # * 1.0-normalized numeric att value if other att value is missing and < 0.5
      def distance(a, b)
        d = 0
        a.each_with_index do |att_a, i|
          att_b = b[i]
          if att_a.nil?
            if att_b.is_a? Numeric
              diff = norm(att_b, i)
              diff = 1.0 - diff if diff < 0.5
            else
              diff = 1
            end
          elsif att_a.is_a? Numeric
            if att_b.is_a? Numeric
              diff = norm(att_a, i) - norm(att_b, i)
            else
              diff = norm(att_a, i)
              diff = 1.0 - diff if diff < 0.5
            end
          elsif att_a != att_b
            diff = 1
          else
            diff = 0
          end
          d += diff * diff
        end
        return d
      end

      # Returns the value of att normalized against the min/max observed so
      # far for the attribute at the given index.
      #
      # Returns 0 if no numeric value has been observed yet for this
      # attribute, or if all observed values are equal. The zero-range guard
      # fixes a division by zero in the original implementation
      # (ZeroDivisionError for Integer attributes, NaN for Floats) that
      # occurred whenever an attribute was constant across the data set.
      def norm(att, index)
        return 0 if @min_values[index].nil?
        range = @max_values[index] - @min_values[index]
        return 0 if range == 0
        return 1.0 * (att - @min_values[index]) / range
      end

    end
  end
end
# Author:: Sergio Fierens (Implementation, Quinlan is
# the creator of the algorithm)
# License:: MPL 1.1
# Project:: ai4r
# Url:: http://ai4r.rubyforge.org/
#
# You can redistribute it and/or modify it under the terms of
# the Mozilla Public License version 1.1 as published by the
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt

require File.dirname(__FILE__) + '/../data/data_set'
require File.dirname(__FILE__) + '/../classifiers/classifier'

module Ai4r

  module Classifiers

    # = Introduction
    # This is an implementation of the ID3 algorithm (Quinlan)
    # Given a set of preclassified examples, it builds a top-down
    # induction of decision tree, biased by the information gain and
    # entropy measure.
    #
    # * http://en.wikipedia.org/wiki/Decision_tree
    # * http://en.wikipedia.org/wiki/ID3_algorithm
    #
    # = How to use it
    #
    #   DATA_LABELS = [ 'city', 'age_range', 'gender', 'marketing_target' ]
    #
    #   DATA_ITEMS = [
    #     ['New York', '<30', 'M', 'Y'],
    #     ['Chicago', '<30', 'M', 'Y'],
    #     ['Chicago', '<30', 'F', 'Y'],
    #     ['New York', '<30', 'M', 'Y'],
    #     ['New York', '<30', 'M', 'Y'],
    #     ['Chicago', '[30-50)', 'M', 'Y'],
    #     ['New York', '[30-50)', 'F', 'N'],
    #     ['Chicago', '[30-50)', 'F', 'Y'],
    #     ['New York', '[30-50)', 'F', 'N'],
    #     ['Chicago', '[50-80]', 'M', 'N'],
    #     ['New York', '[50-80]', 'F', 'N'],
    #     ['New York', '[50-80]', 'M', 'N'],
    #     ['Chicago', '[50-80]', 'M', 'N'],
    #     ['New York', '[50-80]', 'F', 'N'],
    #     ['Chicago', '>80', 'F', 'Y']
    #   ]
    #
    #   data_set = DataSet.new(:data_items=>DATA_ITEMS, :data_labels=>DATA_LABELS)
    #   id3 = Ai4r::Classifiers::ID3.new.build(data_set)
    #
    #   id3.get_rules
    #   # =>  if age_range=='<30' then marketing_target='Y'
    #         elsif age_range=='[30-50)' and city=='Chicago' then marketing_target='Y'
    #         elsif age_range=='[30-50)' and city=='New York' then marketing_target='N'
    #         elsif age_range=='[50-80]' then marketing_target='N'
    #         elsif age_range=='>80' then marketing_target='Y'
    #         else raise 'There was not enough information during training to do a proper induction for this data element' end
    #
    #   id3.eval(['New York', '<30', 'M'])
    #   # => 'Y'
    #
    # = A better way to load the data
    #
    # In the real life you will use lot more data training examples, with more
    # attributes. Consider moving your data to an external CSV (comma separate
    # values) file.
    #
    #   data_file = "#{File.dirname(__FILE__)}/data_set.csv"
    #   data_set = DataSet.load_csv_with_labels data_file
    #   id3 = Ai4r::Classifiers::ID3.new.build(data_set)
    #
    # = A nice tip for data evaluation
    #
    #   id3 = Ai4r::Classifiers::ID3.new.build(data_set)
    #
    #   age_range = '<30'
    #   marketing_target = nil
    #   eval id3.get_rules
    #   puts marketing_target
    #   # => 'Y'
    #
    # = More about ID3 and decision trees
    #
    # * http://en.wikipedia.org/wiki/Decision_tree
    # * http://en.wikipedia.org/wiki/ID3_algorithm
    #
    # = About the project
    # Author::    Sergio Fierens
    # License::   MPL 1.1
    # Url::       http://ai4r.rubyforge.org/
    class ID3 < Classifier

      attr_reader :data_set

      # Create a new ID3 classifier. You must provide a DataSet instance
      # as parameter. The last attribute of each item is considered as the
      # item class.
      def build(data_set)
        data_set.check_not_empty
        @data_set = data_set
        preprocess_data(@data_set.data_items)
        return self
      end

      # You can evaluate new data, predicting its category.
      # e.g.
      #   id3.eval(['New York', '<30', 'F'])  # => 'Y'
      # If the instance reaches a branch not covered during training, an
      # ErrorNode raises a RuntimeError (see ErrorNode#value below).
      def eval(data)
        @tree.value(data) if @tree
      end

      # This method returns the generated rules in ruby code.
      # e.g.
      #
      #   id3.get_rules
      #   # =>  if age_range=='<30' then marketing_target='Y'
      #         elsif age_range=='[30-50)' and city=='Chicago' then marketing_target='Y'
      #         elsif age_range=='[30-50)' and city=='New York' then marketing_target='N'
      #         elsif age_range=='[50-80]' then marketing_target='N'
      #         elsif age_range=='>80' then marketing_target='Y'
      #         else raise 'There was not enough information during training to do a proper induction for this data element' end
      #
      # It is a nice way to inspect induction results, and also to execute them:
      #   age_range = '<30'
      #   marketing_target = nil
      #   eval id3.get_rules
      #   puts marketing_target
      #   # => 'Y'
      def get_rules
        #return "Empty ID3 tree" if !@tree
        rules = @tree.get_rules
        # Each rule is an array of condition strings ending with the
        # category assignment; join conditions with 'and'.
        rules = rules.collect do |rule|
          "#{rule[0..-2].join(' and ')} then #{rule.last}"
        end
        return "if #{rules.join("\nelsif ")}\nelse raise 'There was not enough information during training to do a proper induction for this data element' end"
      end

      private
      # Builds the decision tree from the training examples.
      def preprocess_data(data_examples)
        @tree = build_node(data_examples)
      end

      private
      # Recursively builds one node of the tree:
      # * no examples left -> ErrorNode
      # * only one category remains -> CategoryNode leaf
      # * chosen attribute yields a single partition -> leaf with the most
      #   frequent category
      # * otherwise -> EvaluationNode branching on the attribute with minimum
      #   entropy (max information gain), skipping attributes already used on
      #   this path (flag_att).
      # NOTE(review): flag_att is mutated and shared across sibling recursive
      # calls (same array object), so an attribute used in one subtree is also
      # excluded in its siblings -- confirm this is the intended behavior.
      def build_node(data_examples, flag_att = [])
        return ErrorNode.new if data_examples.length == 0
        domain = domain(data_examples)
        return CategoryNode.new(@data_set.data_labels.last, domain.last[0]) if domain.last.length == 1
        min_entropy_index = min_entropy_index(data_examples, domain, flag_att)
        flag_att << min_entropy_index
        split_data_examples = split_data_examples(data_examples, domain, min_entropy_index)
        return CategoryNode.new(@data_set.data_labels.last, most_freq(data_examples, domain)) if split_data_examples.length == 1
        nodes = split_data_examples.collect do |partial_data_examples|
          build_node(partial_data_examples, flag_att)
        end
        return EvaluationNode.new(@data_set.data_labels, min_entropy_index, domain[min_entropy_index], nodes)
      end

      private
      # Sum of an array of numeric values.
      # NOTE(review): `private` has no effect on methods defined with
      # `def self.`; ID3.sum is effectively public.
      def self.sum(values)
        values.inject( 0 ) { |sum,x| sum+x }
      end

      private
      # Base-2 logarithm, defined as 0.0 for z == 0 so that 0*log2(0)
      # terms vanish in the entropy sum.
      def self.log2(z)
        return 0.0 if z == 0
        Math.log(z)/LOG2
      end

      private
      # Returns the most frequent category among examples.
      # domain.last is the ordered list of category values.
      def most_freq(examples, domain)
        freqs = []
        domain.last.length.times { freqs << 0}
        examples.each do |example|
          cat_index = domain.last.index(example.last)
          freq = freqs[cat_index] + 1
          freqs[cat_index] = freq
        end
        max_freq = freqs.max
        max_freq_index = freqs.index(max_freq)
        domain.last[max_freq_index]
      end

      private
      # Partitions data_examples by their value of the attribute at
      # att_index. The result array is indexed by the position of each
      # attribute value within domain[att_index].
      def split_data_examples(data_examples, domain, att_index)
        data_examples_array = []
        att_value_examples = {}
        data_examples.each do |example|
          example_set = att_value_examples[example[att_index]]
          example_set = [] if !example_set
          example_set << example
          att_value_examples.store(example[att_index], example_set)
        end
        att_value_examples.each_pair do |att_value, example_set|
          att_value_index = domain[att_index].index(att_value)
          data_examples_array[att_value_index] = example_set
        end
        return data_examples_array
      end

      private
      # Index of the non-category attribute (domain[0..-2]) with minimum
      # entropy, skipping attributes listed in flag_att.
      # NOTE(review): if every attribute is flagged, this falls back to
      # index 0 even though it was flagged -- confirm intended.
      def min_entropy_index(data_examples, domain, flag_att=[])
        min_entropy = nil
        min_index = 0
        domain[0..-2].each_index do |index|
          freq_grid = freq_grid(index, data_examples, domain)
          entropy = entropy(freq_grid, data_examples.length)
          if (!min_entropy || entropy < min_entropy) && !flag_att.include?(index)
            min_entropy = entropy
            min_index = index
          end
        end
        return min_index
      end

      private
      # Builds, from the given examples only, one array of distinct observed
      # values per data label; the last array is the category domain.
      def domain(data_examples)
        #return build_domains(data_examples)
        domain = []
        @data_set.data_labels.length.times { domain << [] }
        data_examples.each do |data|
          data.each_index do |i|
            domain[i] << data[i] if i<domain.length && !domain[i].include?(data[i])
          end
        end
        return domain
      end

      private
      # Frequency grid for one attribute:
      # grid[attribute value index][category index] = count.
      def freq_grid(att_index, data_examples, domain)
        #Initialize empty grid (clone gives each row its own counters)
        grid_element = []
        domain.last.length.times { grid_element << 0}
        grid = []
        domain[att_index].length.times { grid << grid_element.clone }
        #Fill frequency grid
        data_examples.each do |example|
          att_val = example[att_index]
          att_val_index = domain[att_index].index(att_val)
          category = example.last
          category_index = domain.last.index(category)
          freq = grid[att_val_index][category_index] + 1
          grid[att_val_index][category_index] = freq
        end
        return grid
      end

      private
      # Expected information of splitting on the attribute behind freq_grid:
      # the category entropy of each attribute value, weighted by the
      # proportion of examples having that value.
      def entropy(freq_grid, total_examples)
        #Calc entropy of each element
        entropy = 0
        freq_grid.each do |att_freq|
          att_total_freq = ID3.sum(att_freq)
          partial_entropy = 0
          if att_total_freq != 0
            att_freq.each do |freq|
              prop = freq.to_f/att_total_freq
              partial_entropy += (-1*prop*ID3.log2(prop))
            end
          end
          entropy += (att_total_freq.to_f/total_examples) * partial_entropy
        end
        return entropy
      end

      private
      # Cached natural log of 2, used by ID3.log2.
      LOG2 = Math.log(2)
    end

    # Internal tree node that branches on one attribute value.
    class EvaluationNode #:nodoc: all

      attr_reader :index, :values, :nodes

      def initialize(data_labels, index, values, nodes)
        @index = index
        @values = values
        @nodes = nodes
        @data_labels = data_labels
      end

      # Routes data to the child node matching data[@index].
      # NOTE(review): rule_not_found is not defined in this file; an unseen
      # attribute value presumably raises NoMethodError unless a superclass
      # or mixin provides it -- verify.
      def value(data)
        value = data[@index]
        return rule_not_found if !@values.include?(value)
        return nodes[@values.index(value)].value(data)
      end

      # Returns an array of rules; each rule is an array of condition
      # strings ending with the category assignment string.
      def get_rules
        rule_set = []
        @nodes.each_index do |child_node_index|
          my_rule = "#{@data_labels[@index]}=='#{@values[child_node_index]}'"
          child_node = @nodes[child_node_index]
          child_node_rules = child_node.get_rules
          child_node_rules.each do |child_rule|
            child_rule.unshift(my_rule)
          end
          rule_set += child_node_rules
        end
        return rule_set
      end

    end

    # Leaf node holding a concrete category value.
    class CategoryNode #:nodoc: all
      def initialize(label, value)
        @label = label
        @value = value
      end
      # The predicted category, regardless of data.
      def value(data)
        return @value
      end
      def get_rules
        return [["#{@label}='#{@value}'"]]
      end
    end

    # Leaf reached when training data had no examples for a branch.
    class ErrorNode #:nodoc: all
      # Always raises: there is no induced category for this branch.
      def value(data)
        raise "There was not enough information during training to do a proper induction for this data element."
      end
      def get_rules
        return []
      end
    end

  end
end