nimbus 1.0.1 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +149 -0
- data/lib/nimbus.rb +15 -11
- data/lib/nimbus/application.rb +20 -23
- data/lib/nimbus/classification_tree.rb +111 -0
- data/lib/nimbus/configuration.rb +52 -37
- data/lib/nimbus/forest.rb +56 -20
- data/lib/nimbus/individual.rb +7 -7
- data/lib/nimbus/loss_functions.rb +44 -10
- data/lib/nimbus/regression_tree.rb +103 -0
- data/lib/nimbus/training_set.rb +4 -4
- data/lib/nimbus/tree.rb +20 -83
- data/lib/nimbus/version.rb +3 -0
- data/spec/classification_tree_spec.rb +132 -0
- data/spec/configuration_spec.rb +46 -19
- data/spec/fixtures/classification_config.yml +13 -0
- data/spec/fixtures/classification_random_forest.yml +922 -0
- data/spec/fixtures/classification_testing.data +500 -0
- data/spec/fixtures/classification_training.data +1000 -0
- data/spec/forest_spec.rb +109 -50
- data/spec/individual_spec.rb +2 -2
- data/spec/loss_functions_spec.rb +71 -0
- data/spec/nimbus_spec.rb +4 -4
- data/spec/regression_tree_spec.rb +129 -0
- data/spec/training_set_spec.rb +5 -5
- data/spec/tree_spec.rb +4 -115
- metadata +53 -45
- data/spec/fixtures/regression_snp_importances.txt +0 -200
- data/spec/fixtures/regression_testing_file_predictions.txt +0 -200
- data/spec/fixtures/regression_training_file_predictions.txt +0 -758
data/lib/nimbus/configuration.rb
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
module Nimbus
|
2
2
|
#####################################################################
|
3
|
-
# Nimbus configuration object.
|
4
|
-
#
|
3
|
+
# Nimbus configuration object.
|
4
|
+
#
|
5
5
|
# This class reads every user file.
|
6
|
-
# Once the user's config.yml file is loaded, a set of default and
|
6
|
+
# Once the user's config.yml file is loaded, a set of default and
|
7
7
|
# custom options is created and stored.
|
8
|
-
#
|
9
|
-
# Nimbus::Configuration also reads the testing files and the data
|
8
|
+
#
|
9
|
+
# Nimbus::Configuration also reads the testing files and the data
|
10
10
|
# to create the training set to be passed to the Nimbus::Forest random
|
11
11
|
# forest generator and the Nimbus::Tree classes in it.
|
12
12
|
#
|
@@ -15,6 +15,7 @@ module Nimbus
|
|
15
15
|
:training_file,
|
16
16
|
:testing_file,
|
17
17
|
:forest_file,
|
18
|
+
:classes,
|
18
19
|
:config_file,
|
19
20
|
:forest_size,
|
20
21
|
:tree_SNP_sample_size,
|
@@ -32,65 +33,66 @@ module Nimbus
|
|
32
33
|
:output_snp_importances_file,
|
33
34
|
:silent
|
34
35
|
)
|
35
|
-
|
36
|
+
|
36
37
|
DEFAULTS = {
|
37
38
|
:forest_size => 500,
|
38
39
|
:tree_SNP_sample_size => 60,
|
39
40
|
:tree_SNP_total_count => 200,
|
40
41
|
:tree_node_min_size => 5,
|
41
|
-
|
42
|
+
|
42
43
|
:loss_function_discrete => 'majority_class',
|
43
|
-
:loss_function_continuous => '
|
44
|
-
|
44
|
+
:loss_function_continuous => 'average',
|
45
|
+
|
45
46
|
:training_file => 'training.data',
|
46
47
|
:testing_file => 'testing.data',
|
47
48
|
:forest_file => 'forest.yml',
|
48
49
|
:config_file => 'config.yml',
|
49
|
-
|
50
|
+
|
50
51
|
:output_forest_file => 'random_forest.yml',
|
51
52
|
:output_training_file => 'training_file_predictions.txt',
|
52
53
|
:output_testing_file => 'testing_file_predictions.txt',
|
53
54
|
:output_tree_errors_file => 'generalization_errors.txt',
|
54
55
|
:output_snp_importances_file => 'snp_importances.txt',
|
55
|
-
|
56
|
+
|
56
57
|
:silent => false
|
57
58
|
}
|
58
|
-
|
59
|
+
|
59
60
|
# Initialize a Nimbus::Configuration object.
|
60
61
|
#
|
61
62
|
# Set all options to their default values.
|
62
63
|
def initialize
|
63
64
|
@do_training = false
|
64
65
|
@do_testing = false
|
65
|
-
|
66
|
+
|
66
67
|
@forest_size = DEFAULTS[:forest_size]
|
67
68
|
@tree_SNP_sample_size = DEFAULTS[:tree_SNP_sample_size]
|
68
69
|
@tree_SNP_total_count = DEFAULTS[:tree_SNP_total_count]
|
69
70
|
@tree_node_min_size = DEFAULTS[:tree_node_min_size]
|
70
71
|
@loss_function_discrete = DEFAULTS[:loss_function_discrete]
|
71
72
|
@loss_function_continuous = DEFAULTS[:loss_function_continuous]
|
72
|
-
|
73
|
+
|
73
74
|
@output_forest_file = File.expand_path(DEFAULTS[:output_forest_file], Dir.pwd)
|
74
75
|
@output_training_file = File.expand_path(DEFAULTS[:output_training_file], Dir.pwd)
|
75
76
|
@output_testing_file = File.expand_path(DEFAULTS[:output_testing_file], Dir.pwd)
|
76
77
|
@output_tree_errors_file = File.expand_path(DEFAULTS[:output_tree_errors_file], Dir.pwd)
|
77
78
|
@output_snp_importances_file = File.expand_path(DEFAULTS[:output_snp_importances_file], Dir.pwd)
|
78
|
-
|
79
|
+
|
79
80
|
@silent = ENV['nimbus_test'] == 'running_nimbus_tests' ? true : DEFAULTS[:silent]
|
80
81
|
end
|
81
|
-
|
82
|
+
|
82
83
|
# Accessor method for the tree-related subset of options.
|
83
84
|
def tree
|
84
|
-
{
|
85
|
+
{
|
85
86
|
:snp_sample_size => @tree_SNP_sample_size,
|
86
87
|
:snp_total_count => @tree_SNP_total_count,
|
87
|
-
:tree_node_min_size => @tree_node_min_size
|
88
|
+
:tree_node_min_size => @tree_node_min_size,
|
89
|
+
:classes => @classes
|
88
90
|
}
|
89
91
|
end
|
90
|
-
|
92
|
+
|
91
93
|
# This is the first method to be called on Configuration when a config.yml file
|
92
94
|
# exists with user input options for the forest.
|
93
|
-
#
|
95
|
+
#
|
94
96
|
# * The method will read the config file and change the default value of the selected options.
|
95
97
|
# * Then based on the options and the existence of training, testing and forest files, it will mark:
|
96
98
|
# - if training is needed,
|
@@ -110,24 +112,26 @@ module Nimbus
|
|
110
112
|
raise Nimbus::WrongFormatFileError, "It was not posible to parse the config file (#{config_file}): \r\n#{e.message} "
|
111
113
|
end
|
112
114
|
end
|
113
|
-
|
115
|
+
|
114
116
|
if user_config_params['input']
|
115
117
|
@training_file = File.expand_path(user_config_params['input']['training'], dirname) if user_config_params['input']['training']
|
116
118
|
@testing_file = File.expand_path(user_config_params['input']['testing' ], dirname) if user_config_params['input']['testing']
|
117
119
|
@forest_file = File.expand_path(user_config_params['input']['forest' ], dirname) if user_config_params['input']['forest']
|
120
|
+
@classes = user_config_params['input']['classes'] if user_config_params['input']['classes']
|
118
121
|
else
|
119
122
|
@training_file = File.expand_path(DEFAULTS[:training_file], Dir.pwd) if File.exists? File.expand_path(DEFAULTS[:training_file], Dir.pwd)
|
120
123
|
@testing_file = File.expand_path(DEFAULTS[:testing_file ], Dir.pwd) if File.exists? File.expand_path(DEFAULTS[:testing_file ], Dir.pwd)
|
121
124
|
@forest_file = File.expand_path(DEFAULTS[:forest_file ], Dir.pwd) if File.exists? File.expand_path(DEFAULTS[:forest_file ], Dir.pwd)
|
122
125
|
end
|
123
|
-
|
126
|
+
|
124
127
|
@do_training = true if @training_file
|
125
128
|
@do_testing = true if @testing_file
|
126
|
-
|
129
|
+
@classes = @classes.map{|c| c.to_s.strip} if @classes
|
130
|
+
|
127
131
|
if @do_testing && !@do_training && !@forest_file
|
128
132
|
raise Nimbus::InputFileError, "There is not random forest data (training file not defined, and forest file not found)."
|
129
133
|
end
|
130
|
-
|
134
|
+
|
131
135
|
if user_config_params['forest']
|
132
136
|
@forest_size = user_config_params['forest']['forest_size'].to_i if user_config_params['forest']['forest_size']
|
133
137
|
@tree_SNP_total_count = user_config_params['forest']['SNP_total_count'].to_i if user_config_params['forest']['SNP_total_count']
|
@@ -138,7 +142,7 @@ module Nimbus
|
|
138
142
|
check_configuration
|
139
143
|
log_configuration
|
140
144
|
end
|
141
|
-
|
145
|
+
|
142
146
|
# The method reads the training file, and if the data is valid, creates a Nimbus::TrainingSet
|
143
147
|
# containing every individual to be used as training sample for a random forest.
|
144
148
|
def load_training_data
|
@@ -150,12 +154,15 @@ module Nimbus
|
|
150
154
|
raise Nimbus::InputFileError, "Individual ##{data_id} from training set has no value for all #{@tree_SNP_total_count} SNPs" unless snp_list.size == @tree_SNP_total_count
|
151
155
|
raise Nimbus::InputFileError, "There are individuals with no ID, please check data in training file." unless (!data_id.nil? && data_id.strip != '')
|
152
156
|
raise Nimbus::InputFileError, "Individual ##{data_id} has no fenotype value, please check data in training file." unless (!data_feno.nil? && data_feno.strip != '')
|
153
|
-
|
154
|
-
|
157
|
+
raise Nimbus::InputFileError, "Individual ##{data_id} has invalid class (not in [#{classes*', '}]), please check data in training file." unless (@classes.nil? || @classes.include?(data_feno))
|
158
|
+
|
159
|
+
data_feno = (@classes ? data_feno.to_s : data_feno.to_f)
|
160
|
+
@training_set.individuals[data_id.to_i] = Nimbus::Individual.new(data_id.to_i, data_feno, snp_list.map{|snp| snp.to_i})
|
161
|
+
@training_set.ids_fenotypes[data_id.to_i] = data_feno
|
155
162
|
end
|
156
163
|
}
|
157
164
|
end
|
158
|
-
|
165
|
+
|
159
166
|
# Reads the testing file, and if the data is valid, yields one Nimbus::Individual at a time.
|
160
167
|
def read_testing_data
|
161
168
|
File.open(@testing_file) {|file|
|
@@ -169,7 +176,7 @@ module Nimbus
|
|
169
176
|
end
|
170
177
|
}
|
171
178
|
end
|
172
|
-
|
179
|
+
|
173
180
|
# Creates a Nimbus::Forest object from a user defined random forest data file.
|
174
181
|
#
|
175
182
|
# The format of the input file should be the same as the forest output data of a Nimbus Application.
|
@@ -186,14 +193,14 @@ module Nimbus
|
|
186
193
|
forest.trees = trees
|
187
194
|
forest
|
188
195
|
end
|
189
|
-
|
196
|
+
|
190
197
|
# Include tests to be passed by the info contained in the config file.
|
191
198
|
#
|
192
199
|
# If some of the configuration data provided by the user is invalid, an error is raised and execution stops.
|
193
200
|
def check_configuration
|
194
201
|
raise Nimbus::ConfigurationError, "The mtry sample size must be smaller than the total SNPs count." if @tree_SNP_sample_size > @tree_SNP_total_count
|
195
202
|
end
|
196
|
-
|
203
|
+
|
197
204
|
# Prints the information stored in the Nimbus::Configuration object
|
198
205
|
#
|
199
206
|
# It could include errors on the configuration input data, training related info and/or testing related info.
|
@@ -209,31 +216,39 @@ module Nimbus
|
|
209
216
|
Nimbus.message "*" * 50
|
210
217
|
Nimbus.stop "Error: No input data. Nimbus finished."
|
211
218
|
end
|
212
|
-
|
219
|
+
|
213
220
|
Nimbus.message "*" * 50
|
214
|
-
Nimbus.message "* Nimbus
|
221
|
+
Nimbus.message "* Nimbus version #{::Nimbus::VERSION}"
|
222
|
+
Nimbus.message "* configured with the following parameters: "
|
215
223
|
Nimbus.message "* Forest size: #{@forest_size} trees"
|
216
224
|
Nimbus.message "* Total SNP count: #{@tree_SNP_total_count}"
|
217
225
|
Nimbus.message "* SNPs sample size (mtry): #{@tree_SNP_sample_size}"
|
218
226
|
Nimbus.message "* Minimun node size in tree: #{@tree_node_min_size}"
|
227
|
+
|
228
|
+
if @classes
|
229
|
+
Nimbus.message "* Mode: CLASSIFICATION"
|
230
|
+
Nimbus.message "* Classes: [#{@classes*', '}]"
|
231
|
+
else
|
232
|
+
Nimbus.message "* Mode: REGRESSION"
|
233
|
+
end
|
219
234
|
Nimbus.message "*" * 50
|
220
|
-
|
235
|
+
|
221
236
|
if @do_training
|
222
237
|
Nimbus.message "* Training data:"
|
223
238
|
Nimbus.message "* Training file: #{@training_file}"
|
224
239
|
Nimbus.message "*" * 50
|
225
240
|
end
|
226
|
-
|
241
|
+
|
227
242
|
if @do_testing
|
228
243
|
Nimbus.message "* Data to be tested:"
|
229
244
|
Nimbus.message "* Testing file: #{@testing_file}"
|
230
245
|
if @forest_file
|
231
|
-
Nimbus.message "* using the structure of the random forest stored in:"
|
246
|
+
Nimbus.message "* using the structure of the random forest stored in:"
|
232
247
|
Nimbus.message "* Random forest file: #{@forest_file}"
|
233
248
|
end
|
234
249
|
Nimbus.message "*" * 50
|
235
250
|
end
|
236
251
|
end
|
237
|
-
|
252
|
+
|
238
253
|
end
|
239
254
|
end
|
data/lib/nimbus/forest.rb
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
module Nimbus
|
2
2
|
|
3
3
|
#####################################################################
|
4
|
-
# Forest represents the Random forest being generated
|
4
|
+
# Forest represents the Random forest being generated
|
5
5
|
# (or used to test samples) by the application object.
|
6
6
|
#
|
7
7
|
class Forest
|
8
8
|
attr_accessor :size, :trees, :bag, :predictions, :tree_errors, :snp_importances
|
9
9
|
attr_accessor :options
|
10
|
-
|
10
|
+
|
11
11
|
# Initialize Forest object with options included in the Nimbus::Configuration object received.
|
12
12
|
def initialize(config)
|
13
13
|
@trees = []
|
@@ -20,7 +20,7 @@ module Nimbus
|
|
20
20
|
@tree_snp_importances = []
|
21
21
|
raise Nimbus::ForestError, "Forest size parameter (#{@size}) is invalid. You need at least one tree." if @size < 1
|
22
22
|
end
|
23
|
-
|
23
|
+
|
24
24
|
# Creates a random forest based on the TrainingSet included in the configuration, creating N random trees (size N defined in the configuration).
|
25
25
|
#
|
26
26
|
# This is the method called when the application's configuration flags training on.
|
@@ -35,10 +35,11 @@ module Nimbus
|
|
35
35
|
# Every tree of the forest is created with a different random sample of the individuals in the training set.
|
36
36
|
def grow
|
37
37
|
@size.times do |i|
|
38
|
-
Nimbus.write("
|
38
|
+
Nimbus.write("\rCreating trees: #{i+1}/#{@size} ")
|
39
39
|
tree_individuals_bag = individuals_random_sample
|
40
40
|
tree_out_of_bag = oob tree_individuals_bag
|
41
|
-
|
41
|
+
tree_class = (classification? ? ClassificationTree : RegressionTree)
|
42
|
+
tree = tree_class.new @options.tree
|
42
43
|
@trees << tree.seed(@options.training_set.individuals, tree_individuals_bag, @options.training_set.ids_fenotypes)
|
43
44
|
@tree_errors << tree.generalization_error_from_oob(tree_out_of_bag)
|
44
45
|
@tree_snp_importances << tree.estimate_importances(tree_out_of_bag)
|
@@ -46,13 +47,18 @@ module Nimbus
|
|
46
47
|
Nimbus.clear_line!
|
47
48
|
end
|
48
49
|
average_snp_importances
|
49
|
-
|
50
|
+
totalize_predictions
|
50
51
|
end
|
51
|
-
|
52
|
-
# Traverse a testing set through every tree of the forest
|
52
|
+
|
53
|
+
# Traverse a testing set through every tree of the forest.
|
53
54
|
#
|
54
55
|
# This is the method called when the application's configuration flags testing on.
|
55
56
|
def traverse
|
57
|
+
classification? ? traverse_classification_forest : traverse_regression_forest
|
58
|
+
end
|
59
|
+
|
60
|
+
# Traverse a testing set through every regression tree of the forest and get averaged predictions for every individual in the sample.
|
61
|
+
def traverse_regression_forest
|
56
62
|
@predictions = {}
|
57
63
|
prediction_count = trees.size
|
58
64
|
@options.read_testing_data{|individual|
|
@@ -63,44 +69,66 @@ module Nimbus
|
|
63
69
|
@predictions[individual.id] = (individual_prediction / prediction_count).round(5)
|
64
70
|
}
|
65
71
|
end
|
66
|
-
|
72
|
+
|
73
|
+
# Traverse a testing set through every classification tree of the forest and get majority class predictions for every individual in the sample.
|
74
|
+
def traverse_classification_forest
|
75
|
+
@predictions = {}
|
76
|
+
@options.read_testing_data{|individual|
|
77
|
+
individual_prediction = []
|
78
|
+
trees.each do |t|
|
79
|
+
individual_prediction << Nimbus::Tree.traverse(t, individual.snp_list)
|
80
|
+
end
|
81
|
+
@predictions[individual.id] = Nimbus::LossFunctions.majority_class_in_list(individual_prediction, @options.tree[:classes])
|
82
|
+
}
|
83
|
+
end
|
84
|
+
|
67
85
|
# The array containing every tree in the forest, to YAML format.
|
68
86
|
def to_yaml
|
69
87
|
@trees.to_yaml
|
70
88
|
end
|
71
|
-
|
89
|
+
|
72
90
|
private
|
73
|
-
|
91
|
+
|
74
92
|
def individuals_random_sample
|
75
93
|
individuals_sample = bag.inject([]){|items, i| items << bag.sample }.sort
|
76
94
|
end
|
77
|
-
|
95
|
+
|
78
96
|
def oob(in_bag=[])
|
79
97
|
bag - in_bag.uniq
|
80
98
|
end
|
81
|
-
|
99
|
+
|
82
100
|
def bag
|
83
101
|
@bag ||= @options.training_set.all_ids
|
84
102
|
end
|
85
|
-
|
103
|
+
|
86
104
|
def acumulate_predictions(preds)
|
87
105
|
preds.each_pair.each{|id, value|
|
88
106
|
if @predictions[id].nil?
|
89
|
-
@predictions[id] = value
|
107
|
+
@predictions[id] = (classification? ? [value] : value)
|
90
108
|
@times_predicted[id] = 1.0
|
91
109
|
else
|
92
|
-
@predictions[id] += value
|
110
|
+
classification? ? (@predictions[id] << value) : (@predictions[id] += value)
|
93
111
|
@times_predicted[id] += 1
|
94
112
|
end
|
95
113
|
}
|
96
114
|
end
|
97
|
-
|
115
|
+
|
116
|
+
def totalize_predictions
|
117
|
+
classification? ? majority_class_predicted : average_predictions
|
118
|
+
end
|
119
|
+
|
98
120
|
def average_predictions
|
99
121
|
@predictions.each_pair{|id, value|
|
100
122
|
@predictions[id] = (@predictions[id] / @times_predicted[id]).round(5)
|
101
123
|
}
|
102
124
|
end
|
103
|
-
|
125
|
+
|
126
|
+
def majority_class_predicted
|
127
|
+
@predictions.each_pair{|id, values|
|
128
|
+
@predictions[id] = Nimbus::LossFunctions.majority_class_in_list(values, @options.tree[:classes])
|
129
|
+
}
|
130
|
+
end
|
131
|
+
|
104
132
|
def average_snp_importances
|
105
133
|
1.upto(@options.tree_SNP_total_count) {|snp|
|
106
134
|
@snp_importances[snp] = 0.0
|
@@ -110,7 +138,15 @@ module Nimbus
|
|
110
138
|
@snp_importances[snp] = @snp_importances[snp] / @size
|
111
139
|
}
|
112
140
|
end
|
113
|
-
|
141
|
+
|
142
|
+
def classification?
|
143
|
+
@options.tree[:classes]
|
144
|
+
end
|
145
|
+
|
146
|
+
def regression?
|
147
|
+
@options.tree[:classes].nil?
|
148
|
+
end
|
149
|
+
|
114
150
|
end
|
115
|
-
|
151
|
+
|
116
152
|
end
|
data/lib/nimbus/individual.rb
CHANGED
@@ -1,19 +1,19 @@
|
|
1
1
|
module Nimbus
|
2
2
|
#####################################################################
|
3
|
-
# Nimbus Individual object.
|
4
|
-
#
|
5
|
-
# It represents a single individual of a training or testing sample.
|
6
|
-
#
|
3
|
+
# Nimbus Individual object.
|
4
|
+
#
|
5
|
+
# It represents a single individual of a training or testing sample.
|
6
|
+
#
|
7
7
|
# This class stores information about an individual:
|
8
8
|
#
|
9
9
|
# * id,
|
10
10
|
# * values for all the SNPs of the individual,
|
11
|
-
# * fenotype if present,
|
11
|
+
# * fenotype if present,
|
12
12
|
# * the prediction if it exists.
|
13
13
|
#
|
14
14
|
class Individual
|
15
15
|
attr_accessor :id, :fenotype, :prediction, :snp_list
|
16
|
-
|
16
|
+
|
17
17
|
# Initialize individual with passed data.
|
18
18
|
def initialize(i, fen, snps=[])
|
19
19
|
self.id = i
|
@@ -21,5 +21,5 @@ module Nimbus
|
|
21
21
|
self.snp_list = snps
|
22
22
|
end
|
23
23
|
end
|
24
|
-
|
24
|
+
|
25
25
|
end
|
@@ -1,23 +1,24 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
module Nimbus
|
3
|
-
|
3
|
+
|
4
4
|
#####################################################################
|
5
5
|
# Math functions.
|
6
|
-
#
|
6
|
+
#
|
7
7
|
# The LossFunctions class provides handy mathematical functions as class methods
|
8
8
|
# to be used by Tree and Forest when estimating predictions, errors and loss functions
|
9
|
-
# for training and testing data.
|
9
|
+
# for training and testing data.
|
10
10
|
#
|
11
11
|
module LossFunctions
|
12
|
-
|
12
|
+
|
13
13
|
class << self
|
14
|
-
|
14
|
+
## REGRESSION
|
15
|
+
|
15
16
|
# Simple average: sum(n) / n
|
16
17
|
def average(ids, value_table)
|
17
18
|
ids.inject(0.0){|sum, i| sum + value_table[i]} / ids.size
|
18
19
|
end
|
19
20
|
|
20
|
-
# Mean squared error: sum (x-y)^2
|
21
|
+
# Mean squared error: sum (x-y)^2
|
21
22
|
def mean_squared_error(ids, value_table, mean = nil)
|
22
23
|
mean ||= self.average ids, value_table
|
23
24
|
ids.inject(0.0){|sum, i| sum + ((value_table[i] - mean)**2) }
|
@@ -29,14 +30,47 @@ module Nimbus
|
|
29
30
|
def quadratic_loss(ids, value_table, mean = nil)
|
30
31
|
self.mean_squared_error(ids, value_table, mean) / ids.size
|
31
32
|
end
|
32
|
-
|
33
|
+
|
33
34
|
# Difference between two values, squared. (x-y)^2
|
34
35
|
def squared_difference(x,y)
|
35
36
|
0.0 + (x-y)**2
|
36
37
|
end
|
37
|
-
|
38
|
+
|
39
|
+
## CLASSIFICATION
|
40
|
+
|
41
|
+
# Gini index of a list of classified individuals.
|
42
|
+
#
|
43
|
+
# If a dataset T contains examples from n classes, then:
|
44
|
+
# gini(T) = 1 - Sum (Pj)^2
|
45
|
+
# where Pj is the relative frequency of class j in T
|
46
|
+
def gini_index(ids, value_table, classes)
|
47
|
+
total_size = ids.size.to_f
|
48
|
+
gini = 1 - class_sizes(ids, value_table, classes).inject(0.0){|sum, size|
|
49
|
+
sum + (size/total_size)**2}
|
50
|
+
gini.round(5)
|
51
|
+
end
|
52
|
+
|
53
|
+
# Majority class of a list of classified individuals.
|
54
|
+
# If more than one class has the same number of individuals,
|
55
|
+
# one of the majority classes is selected randomly.
|
56
|
+
def majority_class(ids, value_table, classes)
|
57
|
+
sizes = class_sizes(ids, value_table, classes)
|
58
|
+
Hash[classes.zip sizes].keep_if{|k,v| v == sizes.max}.keys.sample
|
59
|
+
end
|
60
|
+
|
61
|
+
# Majority class of a list of classes.
|
62
|
+
# If more than one class has the same number of individuals,
|
63
|
+
# one of the majority classes is selected randomly.
|
64
|
+
def majority_class_in_list(list, classes)
|
65
|
+
sizes = classes.map{|c| list.count{|i| i == c}}
|
66
|
+
Hash[classes.zip sizes].keep_if{|k,v| v == sizes.max}.keys.sample
|
67
|
+
end
|
68
|
+
|
69
|
+
# Array with the list of sizes of each class in the given list of individuals.
|
70
|
+
def class_sizes(ids, value_table, classes)
|
71
|
+
classes.map{|c| ids.count{|i| value_table[i] == c}}
|
72
|
+
end
|
38
73
|
end
|
39
|
-
|
74
|
+
|
40
75
|
end
|
41
76
|
end
|
42
|
-
|