nimbus 1.0.1 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +149 -0
- data/lib/nimbus.rb +15 -11
- data/lib/nimbus/application.rb +20 -23
- data/lib/nimbus/classification_tree.rb +111 -0
- data/lib/nimbus/configuration.rb +52 -37
- data/lib/nimbus/forest.rb +56 -20
- data/lib/nimbus/individual.rb +7 -7
- data/lib/nimbus/loss_functions.rb +44 -10
- data/lib/nimbus/regression_tree.rb +103 -0
- data/lib/nimbus/training_set.rb +4 -4
- data/lib/nimbus/tree.rb +20 -83
- data/lib/nimbus/version.rb +3 -0
- data/spec/classification_tree_spec.rb +132 -0
- data/spec/configuration_spec.rb +46 -19
- data/spec/fixtures/classification_config.yml +13 -0
- data/spec/fixtures/classification_random_forest.yml +922 -0
- data/spec/fixtures/classification_testing.data +500 -0
- data/spec/fixtures/classification_training.data +1000 -0
- data/spec/forest_spec.rb +109 -50
- data/spec/individual_spec.rb +2 -2
- data/spec/loss_functions_spec.rb +71 -0
- data/spec/nimbus_spec.rb +4 -4
- data/spec/regression_tree_spec.rb +129 -0
- data/spec/training_set_spec.rb +5 -5
- data/spec/tree_spec.rb +4 -115
- metadata +53 -45
- data/spec/fixtures/regression_snp_importances.txt +0 -200
- data/spec/fixtures/regression_testing_file_predictions.txt +0 -200
- data/spec/fixtures/regression_training_file_predictions.txt +0 -758
data/lib/nimbus/configuration.rb
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
module Nimbus
|
2
2
|
#####################################################################
|
3
|
-
# Nimbus configuration object.
|
4
|
-
#
|
3
|
+
# Nimbus configuration object.
|
4
|
+
#
|
5
5
|
# This class reads every user file.
|
6
|
-
# Once the user's config.yml file is loaded, a set of default and
|
6
|
+
# Once the user's config.yml file is loaded, a set of default and
|
7
7
|
# custom options is created and stored.
|
8
|
-
#
|
9
|
-
# Nimbus::Configuration also reads the testing files and the data
|
8
|
+
#
|
9
|
+
# Nimbus::Configuration also reads the testing files and the data
|
10
10
|
# to create the training set to be passed to the Nimbus::Forest random
|
11
11
|
# forest generator and the Nimbus::Tree classes in it.
|
12
12
|
#
|
@@ -15,6 +15,7 @@ module Nimbus
|
|
15
15
|
:training_file,
|
16
16
|
:testing_file,
|
17
17
|
:forest_file,
|
18
|
+
:classes,
|
18
19
|
:config_file,
|
19
20
|
:forest_size,
|
20
21
|
:tree_SNP_sample_size,
|
@@ -32,65 +33,66 @@ module Nimbus
|
|
32
33
|
:output_snp_importances_file,
|
33
34
|
:silent
|
34
35
|
)
|
35
|
-
|
36
|
+
|
36
37
|
DEFAULTS = {
|
37
38
|
:forest_size => 500,
|
38
39
|
:tree_SNP_sample_size => 60,
|
39
40
|
:tree_SNP_total_count => 200,
|
40
41
|
:tree_node_min_size => 5,
|
41
|
-
|
42
|
+
|
42
43
|
:loss_function_discrete => 'majority_class',
|
43
|
-
:loss_function_continuous => '
|
44
|
-
|
44
|
+
:loss_function_continuous => 'average',
|
45
|
+
|
45
46
|
:training_file => 'training.data',
|
46
47
|
:testing_file => 'testing.data',
|
47
48
|
:forest_file => 'forest.yml',
|
48
49
|
:config_file => 'config.yml',
|
49
|
-
|
50
|
+
|
50
51
|
:output_forest_file => 'random_forest.yml',
|
51
52
|
:output_training_file => 'training_file_predictions.txt',
|
52
53
|
:output_testing_file => 'testing_file_predictions.txt',
|
53
54
|
:output_tree_errors_file => 'generalization_errors.txt',
|
54
55
|
:output_snp_importances_file => 'snp_importances.txt',
|
55
|
-
|
56
|
+
|
56
57
|
:silent => false
|
57
58
|
}
|
58
|
-
|
59
|
+
|
59
60
|
# Initialize a Nimbus::Configuration object.
|
60
61
|
#
|
61
62
|
# Set all options to their default values.
|
62
63
|
def initialize
|
63
64
|
@do_training = false
|
64
65
|
@do_testing = false
|
65
|
-
|
66
|
+
|
66
67
|
@forest_size = DEFAULTS[:forest_size]
|
67
68
|
@tree_SNP_sample_size = DEFAULTS[:tree_SNP_sample_size]
|
68
69
|
@tree_SNP_total_count = DEFAULTS[:tree_SNP_total_count]
|
69
70
|
@tree_node_min_size = DEFAULTS[:tree_node_min_size]
|
70
71
|
@loss_function_discrete = DEFAULTS[:loss_function_discrete]
|
71
72
|
@loss_function_continuous = DEFAULTS[:loss_function_continuous]
|
72
|
-
|
73
|
+
|
73
74
|
@output_forest_file = File.expand_path(DEFAULTS[:output_forest_file], Dir.pwd)
|
74
75
|
@output_training_file = File.expand_path(DEFAULTS[:output_training_file], Dir.pwd)
|
75
76
|
@output_testing_file = File.expand_path(DEFAULTS[:output_testing_file], Dir.pwd)
|
76
77
|
@output_tree_errors_file = File.expand_path(DEFAULTS[:output_tree_errors_file], Dir.pwd)
|
77
78
|
@output_snp_importances_file = File.expand_path(DEFAULTS[:output_snp_importances_file], Dir.pwd)
|
78
|
-
|
79
|
+
|
79
80
|
@silent = ENV['nimbus_test'] == 'running_nimbus_tests' ? true : DEFAULTS[:silent]
|
80
81
|
end
|
81
|
-
|
82
|
+
|
82
83
|
# Accessor method for the tree-related subset of options.
|
83
84
|
def tree
|
84
|
-
{
|
85
|
+
{
|
85
86
|
:snp_sample_size => @tree_SNP_sample_size,
|
86
87
|
:snp_total_count => @tree_SNP_total_count,
|
87
|
-
:tree_node_min_size => @tree_node_min_size
|
88
|
+
:tree_node_min_size => @tree_node_min_size,
|
89
|
+
:classes => @classes
|
88
90
|
}
|
89
91
|
end
|
90
|
-
|
92
|
+
|
91
93
|
# This is the first method to be called on Configuration when a config.yml file
|
92
94
|
# exists with user input options for the forest.
|
93
|
-
#
|
95
|
+
#
|
94
96
|
# * The method will read the config file and change the default value of the selected options.
|
95
97
|
# * Then based on the options and the existence of training, testing and forest files, it will mark:
|
96
98
|
# - if training is needed,
|
@@ -110,24 +112,26 @@ module Nimbus
|
|
110
112
|
raise Nimbus::WrongFormatFileError, "It was not posible to parse the config file (#{config_file}): \r\n#{e.message} "
|
111
113
|
end
|
112
114
|
end
|
113
|
-
|
115
|
+
|
114
116
|
if user_config_params['input']
|
115
117
|
@training_file = File.expand_path(user_config_params['input']['training'], dirname) if user_config_params['input']['training']
|
116
118
|
@testing_file = File.expand_path(user_config_params['input']['testing' ], dirname) if user_config_params['input']['testing']
|
117
119
|
@forest_file = File.expand_path(user_config_params['input']['forest' ], dirname) if user_config_params['input']['forest']
|
120
|
+
@classes = user_config_params['input']['classes'] if user_config_params['input']['classes']
|
118
121
|
else
|
119
122
|
@training_file = File.expand_path(DEFAULTS[:training_file], Dir.pwd) if File.exists? File.expand_path(DEFAULTS[:training_file], Dir.pwd)
|
120
123
|
@testing_file = File.expand_path(DEFAULTS[:testing_file ], Dir.pwd) if File.exists? File.expand_path(DEFAULTS[:testing_file ], Dir.pwd)
|
121
124
|
@forest_file = File.expand_path(DEFAULTS[:forest_file ], Dir.pwd) if File.exists? File.expand_path(DEFAULTS[:forest_file ], Dir.pwd)
|
122
125
|
end
|
123
|
-
|
126
|
+
|
124
127
|
@do_training = true if @training_file
|
125
128
|
@do_testing = true if @testing_file
|
126
|
-
|
129
|
+
@classes = @classes.map{|c| c.to_s.strip} if @classes
|
130
|
+
|
127
131
|
if @do_testing && !@do_training && !@forest_file
|
128
132
|
raise Nimbus::InputFileError, "There is not random forest data (training file not defined, and forest file not found)."
|
129
133
|
end
|
130
|
-
|
134
|
+
|
131
135
|
if user_config_params['forest']
|
132
136
|
@forest_size = user_config_params['forest']['forest_size'].to_i if user_config_params['forest']['forest_size']
|
133
137
|
@tree_SNP_total_count = user_config_params['forest']['SNP_total_count'].to_i if user_config_params['forest']['SNP_total_count']
|
@@ -138,7 +142,7 @@ module Nimbus
|
|
138
142
|
check_configuration
|
139
143
|
log_configuration
|
140
144
|
end
|
141
|
-
|
145
|
+
|
142
146
|
# The method reads the training file, and if the data is valid, creates a Nimbus::TrainingSet
|
143
147
|
# containing every individual to be used as training sample for a random forest.
|
144
148
|
def load_training_data
|
@@ -150,12 +154,15 @@ module Nimbus
|
|
150
154
|
raise Nimbus::InputFileError, "Individual ##{data_id} from training set has no value for all #{@tree_SNP_total_count} SNPs" unless snp_list.size == @tree_SNP_total_count
|
151
155
|
raise Nimbus::InputFileError, "There are individuals with no ID, please check data in training file." unless (!data_id.nil? && data_id.strip != '')
|
152
156
|
raise Nimbus::InputFileError, "Individual ##{data_id} has no fenotype value, please check data in training file." unless (!data_feno.nil? && data_feno.strip != '')
|
153
|
-
|
154
|
-
|
157
|
+
raise Nimbus::InputFileError, "Individual ##{data_id} has invalid class (not in [#{classes*', '}]), please check data in training file." unless (@classes.nil? || @classes.include?(data_feno))
|
158
|
+
|
159
|
+
data_feno = (@classes ? data_feno.to_s : data_feno.to_f)
|
160
|
+
@training_set.individuals[data_id.to_i] = Nimbus::Individual.new(data_id.to_i, data_feno, snp_list.map{|snp| snp.to_i})
|
161
|
+
@training_set.ids_fenotypes[data_id.to_i] = data_feno
|
155
162
|
end
|
156
163
|
}
|
157
164
|
end
|
158
|
-
|
165
|
+
|
159
166
|
# Reads the testing file, and if the data is valid, yields one Nimbus::Individual at a time.
|
160
167
|
def read_testing_data
|
161
168
|
File.open(@testing_file) {|file|
|
@@ -169,7 +176,7 @@ module Nimbus
|
|
169
176
|
end
|
170
177
|
}
|
171
178
|
end
|
172
|
-
|
179
|
+
|
173
180
|
# Creates a Nimbus::Forest object from a user defined random forest data file.
|
174
181
|
#
|
175
182
|
# The format of the input file should be the same as the forest output data of a Nimbus Application.
|
@@ -186,14 +193,14 @@ module Nimbus
|
|
186
193
|
forest.trees = trees
|
187
194
|
forest
|
188
195
|
end
|
189
|
-
|
196
|
+
|
190
197
|
# Runs the validations that the information contained in the config file must pass.
|
191
198
|
#
|
192
199
|
# If some of the configuration data provided by the user is invalid, an error is raised and execution stops.
|
193
200
|
def check_configuration
|
194
201
|
raise Nimbus::ConfigurationError, "The mtry sample size must be smaller than the total SNPs count." if @tree_SNP_sample_size > @tree_SNP_total_count
|
195
202
|
end
|
196
|
-
|
203
|
+
|
197
204
|
# Prints the information stored in the Nimbus::Configuration object
|
198
205
|
#
|
199
206
|
# It could include errors on the configuration input data, training related info and/or testing related info.
|
@@ -209,31 +216,39 @@ module Nimbus
|
|
209
216
|
Nimbus.message "*" * 50
|
210
217
|
Nimbus.stop "Error: No input data. Nimbus finished."
|
211
218
|
end
|
212
|
-
|
219
|
+
|
213
220
|
Nimbus.message "*" * 50
|
214
|
-
Nimbus.message "* Nimbus
|
221
|
+
Nimbus.message "* Nimbus version #{::Nimbus::VERSION}"
|
222
|
+
Nimbus.message "* configured with the following parameters: "
|
215
223
|
Nimbus.message "* Forest size: #{@forest_size} trees"
|
216
224
|
Nimbus.message "* Total SNP count: #{@tree_SNP_total_count}"
|
217
225
|
Nimbus.message "* SNPs sample size (mtry): #{@tree_SNP_sample_size}"
|
218
226
|
Nimbus.message "* Minimun node size in tree: #{@tree_node_min_size}"
|
227
|
+
|
228
|
+
if @classes
|
229
|
+
Nimbus.message "* Mode: CLASSIFICATION"
|
230
|
+
Nimbus.message "* Classes: [#{@classes*', '}]"
|
231
|
+
else
|
232
|
+
Nimbus.message "* Mode: REGRESSION"
|
233
|
+
end
|
219
234
|
Nimbus.message "*" * 50
|
220
|
-
|
235
|
+
|
221
236
|
if @do_training
|
222
237
|
Nimbus.message "* Training data:"
|
223
238
|
Nimbus.message "* Training file: #{@training_file}"
|
224
239
|
Nimbus.message "*" * 50
|
225
240
|
end
|
226
|
-
|
241
|
+
|
227
242
|
if @do_testing
|
228
243
|
Nimbus.message "* Data to be tested:"
|
229
244
|
Nimbus.message "* Testing file: #{@testing_file}"
|
230
245
|
if @forest_file
|
231
|
-
Nimbus.message "* using the structure of the random forest stored in:"
|
246
|
+
Nimbus.message "* using the structure of the random forest stored in:"
|
232
247
|
Nimbus.message "* Random forest file: #{@forest_file}"
|
233
248
|
end
|
234
249
|
Nimbus.message "*" * 50
|
235
250
|
end
|
236
251
|
end
|
237
|
-
|
252
|
+
|
238
253
|
end
|
239
254
|
end
|
data/lib/nimbus/forest.rb
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
module Nimbus
|
2
2
|
|
3
3
|
#####################################################################
|
4
|
-
# Forest represents the Random forest being generated
|
4
|
+
# Forest represents the Random forest being generated
|
5
5
|
# (or used to test samples) by the application object.
|
6
6
|
#
|
7
7
|
class Forest
|
8
8
|
attr_accessor :size, :trees, :bag, :predictions, :tree_errors, :snp_importances
|
9
9
|
attr_accessor :options
|
10
|
-
|
10
|
+
|
11
11
|
# Initialize Forest object with options included in the Nimbus::Configuration object received.
|
12
12
|
def initialize(config)
|
13
13
|
@trees = []
|
@@ -20,7 +20,7 @@ module Nimbus
|
|
20
20
|
@tree_snp_importances = []
|
21
21
|
raise Nimbus::ForestError, "Forest size parameter (#{@size}) is invalid. You need at least one tree." if @size < 1
|
22
22
|
end
|
23
|
-
|
23
|
+
|
24
24
|
# Creates a random forest based on the TrainingSet included in the configuration, creating N random trees (size N defined in the configuration).
|
25
25
|
#
|
26
26
|
# This is the method called when the application's configuration flags training on.
|
@@ -35,10 +35,11 @@ module Nimbus
|
|
35
35
|
# Every tree of the forest is created with a different random sample of the individuals in the training set.
|
36
36
|
def grow
|
37
37
|
@size.times do |i|
|
38
|
-
Nimbus.write("
|
38
|
+
Nimbus.write("\rCreating trees: #{i+1}/#{@size} ")
|
39
39
|
tree_individuals_bag = individuals_random_sample
|
40
40
|
tree_out_of_bag = oob tree_individuals_bag
|
41
|
-
|
41
|
+
tree_class = (classification? ? ClassificationTree : RegressionTree)
|
42
|
+
tree = tree_class.new @options.tree
|
42
43
|
@trees << tree.seed(@options.training_set.individuals, tree_individuals_bag, @options.training_set.ids_fenotypes)
|
43
44
|
@tree_errors << tree.generalization_error_from_oob(tree_out_of_bag)
|
44
45
|
@tree_snp_importances << tree.estimate_importances(tree_out_of_bag)
|
@@ -46,13 +47,18 @@ module Nimbus
|
|
46
47
|
Nimbus.clear_line!
|
47
48
|
end
|
48
49
|
average_snp_importances
|
49
|
-
|
50
|
+
totalize_predictions
|
50
51
|
end
|
51
|
-
|
52
|
-
# Traverse a testing set through every tree of the forest
|
52
|
+
|
53
|
+
# Traverse a testing set through every tree of the forest.
|
53
54
|
#
|
54
55
|
# This is the method called when the application's configuration flags testing on.
|
55
56
|
def traverse
|
57
|
+
classification? ? traverse_classification_forest : traverse_regression_forest
|
58
|
+
end
|
59
|
+
|
60
|
+
# Traverse a testing set through every regression tree of the forest and get averaged predictions for every individual in the sample.
|
61
|
+
def traverse_regression_forest
|
56
62
|
@predictions = {}
|
57
63
|
prediction_count = trees.size
|
58
64
|
@options.read_testing_data{|individual|
|
@@ -63,44 +69,66 @@ module Nimbus
|
|
63
69
|
@predictions[individual.id] = (individual_prediction / prediction_count).round(5)
|
64
70
|
}
|
65
71
|
end
|
66
|
-
|
72
|
+
|
73
|
+
# Traverse a testing set through every classification tree of the forest and get majority class predictions for every individual in the sample.
|
74
|
+
def traverse_classification_forest
|
75
|
+
@predictions = {}
|
76
|
+
@options.read_testing_data{|individual|
|
77
|
+
individual_prediction = []
|
78
|
+
trees.each do |t|
|
79
|
+
individual_prediction << Nimbus::Tree.traverse(t, individual.snp_list)
|
80
|
+
end
|
81
|
+
@predictions[individual.id] = Nimbus::LossFunctions.majority_class_in_list(individual_prediction, @options.tree[:classes])
|
82
|
+
}
|
83
|
+
end
|
84
|
+
|
67
85
|
# The array containing every tree in the forest, to YAML format.
|
68
86
|
def to_yaml
|
69
87
|
@trees.to_yaml
|
70
88
|
end
|
71
|
-
|
89
|
+
|
72
90
|
private
|
73
|
-
|
91
|
+
|
74
92
|
# Draws the bootstrap sample of individual ids used to grow one tree.
#
# Picks as many ids as the bag holds, each one chosen at random from the
# bag independently of the others (so ids may repeat), and returns them
# sorted. An empty bag yields an empty sample.
def individuals_random_sample
  Array.new(bag.size) { bag.sample }.sort
end
|
77
|
-
|
95
|
+
|
78
96
|
def oob(in_bag=[])
|
79
97
|
bag - in_bag.uniq
|
80
98
|
end
|
81
|
-
|
99
|
+
|
82
100
|
def bag
|
83
101
|
@bag ||= @options.training_set.all_ids
|
84
102
|
end
|
85
|
-
|
103
|
+
|
86
104
|
def acumulate_predictions(preds)
|
87
105
|
preds.each_pair.each{|id, value|
|
88
106
|
if @predictions[id].nil?
|
89
|
-
@predictions[id] = value
|
107
|
+
@predictions[id] = (classification? ? [value] : value)
|
90
108
|
@times_predicted[id] = 1.0
|
91
109
|
else
|
92
|
-
@predictions[id] += value
|
110
|
+
classification? ? (@predictions[id] << value) : (@predictions[id] += value)
|
93
111
|
@times_predicted[id] += 1
|
94
112
|
end
|
95
113
|
}
|
96
114
|
end
|
97
|
-
|
115
|
+
|
116
|
+
def totalize_predictions
|
117
|
+
classification? ? majority_class_predicted : average_predictions
|
118
|
+
end
|
119
|
+
|
98
120
|
def average_predictions
|
99
121
|
@predictions.each_pair{|id, value|
|
100
122
|
@predictions[id] = (@predictions[id] / @times_predicted[id]).round(5)
|
101
123
|
}
|
102
124
|
end
|
103
|
-
|
125
|
+
|
126
|
+
def majority_class_predicted
|
127
|
+
@predictions.each_pair{|id, values|
|
128
|
+
@predictions[id] = Nimbus::LossFunctions.majority_class_in_list(values, @options.tree[:classes])
|
129
|
+
}
|
130
|
+
end
|
131
|
+
|
104
132
|
def average_snp_importances
|
105
133
|
1.upto(@options.tree_SNP_total_count) {|snp|
|
106
134
|
@snp_importances[snp] = 0.0
|
@@ -110,7 +138,15 @@ module Nimbus
|
|
110
138
|
@snp_importances[snp] = @snp_importances[snp] / @size
|
111
139
|
}
|
112
140
|
end
|
113
|
-
|
141
|
+
|
142
|
+
# Tells whether the forest works in classification mode.
#
# Returns true when the tree options carry a :classes entry and false when
# that entry is nil, so the result is always a strict boolean and the exact
# opposite of what regression? reports.
def classification?
  !@options.tree[:classes].nil?
end
|
145
|
+
|
146
|
+
def regression?
|
147
|
+
@options.tree[:classes].nil?
|
148
|
+
end
|
149
|
+
|
114
150
|
end
|
115
|
-
|
151
|
+
|
116
152
|
end
|
data/lib/nimbus/individual.rb
CHANGED
@@ -1,19 +1,19 @@
|
|
1
1
|
module Nimbus
|
2
2
|
#####################################################################
|
3
|
-
# Nimbus Individual object.
|
4
|
-
#
|
5
|
-
# It represents a single individual of a training or testing sample.
|
6
|
-
#
|
3
|
+
# Nimbus Individual object.
|
4
|
+
#
|
5
|
+
# It represents a single individual of a training or testing sample.
|
6
|
+
#
|
7
7
|
# This class stores information about an individual:
|
8
8
|
#
|
9
9
|
# * id,
|
10
10
|
# * values for all the SNPs of the individual,
|
11
|
-
# * fenotype if present,
|
11
|
+
# * fenotype if present,
|
12
12
|
# * the prediction if it exists.
|
13
13
|
#
|
14
14
|
class Individual
|
15
15
|
attr_accessor :id, :fenotype, :prediction, :snp_list
|
16
|
-
|
16
|
+
|
17
17
|
# Initialize individual with passed data.
|
18
18
|
def initialize(i, fen, snps=[])
|
19
19
|
self.id = i
|
@@ -21,5 +21,5 @@ module Nimbus
|
|
21
21
|
self.snp_list = snps
|
22
22
|
end
|
23
23
|
end
|
24
|
-
|
24
|
+
|
25
25
|
end
|
@@ -1,23 +1,24 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
module Nimbus
|
3
|
-
|
3
|
+
|
4
4
|
#####################################################################
|
5
5
|
# Math functions.
|
6
|
-
#
|
6
|
+
#
|
7
7
|
# The LossFunctions class provides handy mathematical functions as class methods
|
8
8
|
# to be used by Tree and Forest when estimating predictions, errors and loss functions
|
9
|
-
# for training and testing data.
|
9
|
+
# for training and testing data.
|
10
10
|
#
|
11
11
|
module LossFunctions
|
12
|
-
|
12
|
+
|
13
13
|
class << self
|
14
|
-
|
14
|
+
## REGRESSION
|
15
|
+
|
15
16
|
# Simple average: sum(n) / n
|
16
17
|
def average(ids, value_table)
|
17
18
|
ids.inject(0.0){|sum, i| sum + value_table[i]} / ids.size
|
18
19
|
end
|
19
20
|
|
20
|
-
# Mean squared error: sum (x-y)^2
|
21
|
+
# Mean squared error: sum (x-y)^2
|
21
22
|
def mean_squared_error(ids, value_table, mean = nil)
|
22
23
|
mean ||= self.average ids, value_table
|
23
24
|
ids.inject(0.0){|sum, i| sum + ((value_table[i] - mean)**2) }
|
@@ -29,14 +30,47 @@ module Nimbus
|
|
29
30
|
def quadratic_loss(ids, value_table, mean = nil)
|
30
31
|
self.mean_squared_error(ids, value_table, mean) / ids.size
|
31
32
|
end
|
32
|
-
|
33
|
+
|
33
34
|
# Difference between two values, squared. (x-y)^2
|
34
35
|
def squared_difference(x,y)
|
35
36
|
0.0 + (x-y)**2
|
36
37
|
end
|
37
|
-
|
38
|
+
|
39
|
+
## CLASSIFICATION
|
40
|
+
|
41
|
+
# Gini index of a list of classified individuals.
|
42
|
+
#
|
43
|
+
# If a dataset T contains examples from n classes, then:
|
44
|
+
# gini(T) = 1 - Sum (Pj)^2
|
45
|
+
# where Pj is the relative frequency of class j in T
|
46
|
+
def gini_index(ids, value_table, classes)
|
47
|
+
total_size = ids.size.to_f
|
48
|
+
gini = 1 - class_sizes(ids, value_table, classes).inject(0.0){|sum, size|
|
49
|
+
sum + (size/total_size)**2}
|
50
|
+
gini.round(5)
|
51
|
+
end
|
52
|
+
|
53
|
+
# Majority class of a list of classified individuals.
|
54
|
+
# If more than one class has the same number of individuals,
|
55
|
+
# one of the majority classes is selected randomly.
|
56
|
+
def majority_class(ids, value_table, classes)
|
57
|
+
sizes = class_sizes(ids, value_table, classes)
|
58
|
+
Hash[classes.zip sizes].keep_if{|k,v| v == sizes.max}.keys.sample
|
59
|
+
end
|
60
|
+
|
61
|
+
# Majority class of a list of classes.
|
62
|
+
# If more than one class has the same number of individuals,
|
63
|
+
# one of the majority classes is selected randomly.
|
64
|
+
# Majority class of a list of classes.
#
# list    - Array of class labels (one entry per vote).
# classes - Array with every valid class label. Entries of list that do not
#           appear in classes are not counted.
#
# Returns the class with the most occurrences in list. If more than one
# class has the same (maximum) number of occurrences, one of them is
# selected randomly. Returns nil when classes is empty.
def majority_class_in_list(list, classes)
  sizes = classes.map { |c| list.count(c) }
  # Computed once here instead of once per class inside the filter below.
  top_size = sizes.max
  Hash[classes.zip(sizes)].select { |_, size| size == top_size }.keys.sample
end
|
68
|
+
|
69
|
+
# Array with the list of sizes of each class in the given list of individuals.
|
70
|
+
def class_sizes(ids, value_table, classes)
|
71
|
+
classes.map{|c| ids.count{|i| value_table[i] == c}}
|
72
|
+
end
|
38
73
|
end
|
39
|
-
|
74
|
+
|
40
75
|
end
|
41
76
|
end
|
42
|
-
|