nimbus 1.0.1 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,12 +1,12 @@
1
1
  module Nimbus
2
2
  #####################################################################
3
- # Nimbus configuration object.
4
- #
3
+ # Nimbus configuration object.
4
+ #
5
5
  # This class reads every user file.
6
- # Once the user's config.yml file is loaded, a set of default and
6
+ # Once the user's config.yml file is loaded, a set of default and
7
7
  # custom options is created and stored.
8
- #
9
- # Nimbus::Configuration also reads the testing files and the data
8
+ #
9
+ # Nimbus::Configuration also reads the testing files and the data
10
10
  # to create the training set to be passed to the Nimbus::Forest random
11
11
  # forest generator and the Nimbus::Tree classes in it.
12
12
  #
@@ -15,6 +15,7 @@ module Nimbus
15
15
  :training_file,
16
16
  :testing_file,
17
17
  :forest_file,
18
+ :classes,
18
19
  :config_file,
19
20
  :forest_size,
20
21
  :tree_SNP_sample_size,
@@ -32,65 +33,66 @@ module Nimbus
32
33
  :output_snp_importances_file,
33
34
  :silent
34
35
  )
35
-
36
+
36
37
  DEFAULTS = {
37
38
  :forest_size => 500,
38
39
  :tree_SNP_sample_size => 60,
39
40
  :tree_SNP_total_count => 200,
40
41
  :tree_node_min_size => 5,
41
-
42
+
42
43
  :loss_function_discrete => 'majority_class',
43
- :loss_function_continuous => 'mean',
44
-
44
+ :loss_function_continuous => 'average',
45
+
45
46
  :training_file => 'training.data',
46
47
  :testing_file => 'testing.data',
47
48
  :forest_file => 'forest.yml',
48
49
  :config_file => 'config.yml',
49
-
50
+
50
51
  :output_forest_file => 'random_forest.yml',
51
52
  :output_training_file => 'training_file_predictions.txt',
52
53
  :output_testing_file => 'testing_file_predictions.txt',
53
54
  :output_tree_errors_file => 'generalization_errors.txt',
54
55
  :output_snp_importances_file => 'snp_importances.txt',
55
-
56
+
56
57
  :silent => false
57
58
  }
58
-
59
+
59
60
  # Initialize a Nimbus::Configuration object.
60
61
  #
61
62
  # Set all options to their default values.
62
63
  def initialize
63
64
  @do_training = false
64
65
  @do_testing = false
65
-
66
+
66
67
  @forest_size = DEFAULTS[:forest_size]
67
68
  @tree_SNP_sample_size = DEFAULTS[:tree_SNP_sample_size]
68
69
  @tree_SNP_total_count = DEFAULTS[:tree_SNP_total_count]
69
70
  @tree_node_min_size = DEFAULTS[:tree_node_min_size]
70
71
  @loss_function_discrete = DEFAULTS[:loss_function_discrete]
71
72
  @loss_function_continuous = DEFAULTS[:loss_function_continuous]
72
-
73
+
73
74
  @output_forest_file = File.expand_path(DEFAULTS[:output_forest_file], Dir.pwd)
74
75
  @output_training_file = File.expand_path(DEFAULTS[:output_training_file], Dir.pwd)
75
76
  @output_testing_file = File.expand_path(DEFAULTS[:output_testing_file], Dir.pwd)
76
77
  @output_tree_errors_file = File.expand_path(DEFAULTS[:output_tree_errors_file], Dir.pwd)
77
78
  @output_snp_importances_file = File.expand_path(DEFAULTS[:output_snp_importances_file], Dir.pwd)
78
-
79
+
79
80
  @silent = ENV['nimbus_test'] == 'running_nimbus_tests' ? true : DEFAULTS[:silent]
80
81
  end
81
-
82
+
82
83
  # Accessor method for the tree-related subset of options.
83
84
  def tree
84
- {
85
+ {
85
86
  :snp_sample_size => @tree_SNP_sample_size,
86
87
  :snp_total_count => @tree_SNP_total_count,
87
- :tree_node_min_size => @tree_node_min_size
88
+ :tree_node_min_size => @tree_node_min_size,
89
+ :classes => @classes
88
90
  }
89
91
  end
90
-
92
+
91
93
  # This is the first method to be called on Configuration when a config.yml file
92
94
  # exists with user input options for the forest.
93
- #
95
+ #
94
96
  # * The method will read the config file and change the default value of the selected options.
95
97
  # * Then based on the options and the existence of training, testing and forest files, it will mark:
96
98
  # - if training is needed,
@@ -110,24 +112,26 @@ module Nimbus
110
112
  raise Nimbus::WrongFormatFileError, "It was not posible to parse the config file (#{config_file}): \r\n#{e.message} "
111
113
  end
112
114
  end
113
-
115
+
114
116
  if user_config_params['input']
115
117
  @training_file = File.expand_path(user_config_params['input']['training'], dirname) if user_config_params['input']['training']
116
118
  @testing_file = File.expand_path(user_config_params['input']['testing' ], dirname) if user_config_params['input']['testing']
117
119
  @forest_file = File.expand_path(user_config_params['input']['forest' ], dirname) if user_config_params['input']['forest']
120
+ @classes = user_config_params['input']['classes'] if user_config_params['input']['classes']
118
121
  else
119
122
  @training_file = File.expand_path(DEFAULTS[:training_file], Dir.pwd) if File.exists? File.expand_path(DEFAULTS[:training_file], Dir.pwd)
120
123
  @testing_file = File.expand_path(DEFAULTS[:testing_file ], Dir.pwd) if File.exists? File.expand_path(DEFAULTS[:testing_file ], Dir.pwd)
121
124
  @forest_file = File.expand_path(DEFAULTS[:forest_file ], Dir.pwd) if File.exists? File.expand_path(DEFAULTS[:forest_file ], Dir.pwd)
122
125
  end
123
-
126
+
124
127
  @do_training = true if @training_file
125
128
  @do_testing = true if @testing_file
126
-
129
+ @classes = @classes.map{|c| c.to_s.strip} if @classes
130
+
127
131
  if @do_testing && !@do_training && !@forest_file
128
132
  raise Nimbus::InputFileError, "There is not random forest data (training file not defined, and forest file not found)."
129
133
  end
130
-
134
+
131
135
  if user_config_params['forest']
132
136
  @forest_size = user_config_params['forest']['forest_size'].to_i if user_config_params['forest']['forest_size']
133
137
  @tree_SNP_total_count = user_config_params['forest']['SNP_total_count'].to_i if user_config_params['forest']['SNP_total_count']
@@ -138,7 +142,7 @@ module Nimbus
138
142
  check_configuration
139
143
  log_configuration
140
144
  end
141
-
145
+
142
146
  # The method reads the training file, and if the data is valid, creates a Nimbus::TrainingSet
143
147
  # containing every individual to be used as training sample for a random forest.
144
148
  def load_training_data
@@ -150,12 +154,15 @@ module Nimbus
150
154
  raise Nimbus::InputFileError, "Individual ##{data_id} from training set has no value for all #{@tree_SNP_total_count} SNPs" unless snp_list.size == @tree_SNP_total_count
151
155
  raise Nimbus::InputFileError, "There are individuals with no ID, please check data in training file." unless (!data_id.nil? && data_id.strip != '')
152
156
  raise Nimbus::InputFileError, "Individual ##{data_id} has no fenotype value, please check data in training file." unless (!data_feno.nil? && data_feno.strip != '')
153
- @training_set.individuals[data_id.to_i] = Nimbus::Individual.new(data_id.to_i, data_feno.to_f, snp_list.map{|snp| snp.to_i})
154
- @training_set.ids_fenotypes[data_id.to_i] = data_feno.to_f
157
+ raise Nimbus::InputFileError, "Individual ##{data_id} has invalid class (not in [#{classes*', '}]), please check data in training file." unless (@classes.nil? || @classes.include?(data_feno))
158
+
159
+ data_feno = (@classes ? data_feno.to_s : data_feno.to_f)
160
+ @training_set.individuals[data_id.to_i] = Nimbus::Individual.new(data_id.to_i, data_feno, snp_list.map{|snp| snp.to_i})
161
+ @training_set.ids_fenotypes[data_id.to_i] = data_feno
155
162
  end
156
163
  }
157
164
  end
158
-
165
+
159
166
  # Reads the testing file, and if the data is valid, yields one Nimbus::Individual at a time.
160
167
  def read_testing_data
161
168
  File.open(@testing_file) {|file|
@@ -169,7 +176,7 @@ module Nimbus
169
176
  end
170
177
  }
171
178
  end
172
-
179
+
173
180
  # Creates a Nimbus::Forest object from a user defined random forest data file.
174
181
  #
175
182
  # The format of the input file should be the same as the forest output data of a Nimbus Application.
@@ -186,14 +193,14 @@ module Nimbus
186
193
  forest.trees = trees
187
194
  forest
188
195
  end
189
-
196
+
190
197
  # Include tests to be passed by the info contained in the config file.
191
198
  #
192
199
  # If some of the configuration data provided by the user is invalid, an error is raised and execution stops.
193
200
  def check_configuration
194
201
  raise Nimbus::ConfigurationError, "The mtry sample size must be smaller than the total SNPs count." if @tree_SNP_sample_size > @tree_SNP_total_count
195
202
  end
196
-
203
+
197
204
  # Prints the information stored in the Nimbus::Configuration object
198
205
  #
199
206
  # It could include errors on the configuration input data, training related info and/or testing related info.
@@ -209,31 +216,39 @@ module Nimbus
209
216
  Nimbus.message "*" * 50
210
217
  Nimbus.stop "Error: No input data. Nimbus finished."
211
218
  end
212
-
219
+
213
220
  Nimbus.message "*" * 50
214
- Nimbus.message "* Nimbus configured with the following parameters: "
221
+ Nimbus.message "* Nimbus version #{::Nimbus::VERSION}"
222
+ Nimbus.message "* configured with the following parameters: "
215
223
  Nimbus.message "* Forest size: #{@forest_size} trees"
216
224
  Nimbus.message "* Total SNP count: #{@tree_SNP_total_count}"
217
225
  Nimbus.message "* SNPs sample size (mtry): #{@tree_SNP_sample_size}"
218
226
  Nimbus.message "* Minimun node size in tree: #{@tree_node_min_size}"
227
+
228
+ if @classes
229
+ Nimbus.message "* Mode: CLASSIFICATION"
230
+ Nimbus.message "* Classes: [#{@classes*', '}]"
231
+ else
232
+ Nimbus.message "* Mode: REGRESSION"
233
+ end
219
234
  Nimbus.message "*" * 50
220
-
235
+
221
236
  if @do_training
222
237
  Nimbus.message "* Training data:"
223
238
  Nimbus.message "* Training file: #{@training_file}"
224
239
  Nimbus.message "*" * 50
225
240
  end
226
-
241
+
227
242
  if @do_testing
228
243
  Nimbus.message "* Data to be tested:"
229
244
  Nimbus.message "* Testing file: #{@testing_file}"
230
245
  if @forest_file
231
- Nimbus.message "* using the structure of the random forest stored in:"
246
+ Nimbus.message "* using the structure of the random forest stored in:"
232
247
  Nimbus.message "* Random forest file: #{@forest_file}"
233
248
  end
234
249
  Nimbus.message "*" * 50
235
250
  end
236
251
  end
237
-
252
+
238
253
  end
239
254
  end
data/lib/nimbus/forest.rb CHANGED
@@ -1,13 +1,13 @@
1
1
  module Nimbus
2
2
 
3
3
  #####################################################################
4
- # Forest represents the Random forest being generated
4
+ # Forest represents the Random forest being generated
5
5
  # (or used to test samples) by the application object.
6
6
  #
7
7
  class Forest
8
8
  attr_accessor :size, :trees, :bag, :predictions, :tree_errors, :snp_importances
9
9
  attr_accessor :options
10
-
10
+
11
11
  # Initialize Forest object with options included in the Nimbus::Configuration object received.
12
12
  def initialize(config)
13
13
  @trees = []
@@ -20,7 +20,7 @@ module Nimbus
20
20
  @tree_snp_importances = []
21
21
  raise Nimbus::ForestError, "Forest size parameter (#{@size}) is invalid. You need at least one tree." if @size < 1
22
22
  end
23
-
23
+
24
24
  # Creates a random forest based on the TrainingSet included in the configuration, creating N random trees (size N defined in the configuration).
25
25
  #
26
26
  # This is the method called when the application's configuration flags training on.
@@ -35,10 +35,11 @@ module Nimbus
35
35
  # Every tree of the forest is created with a different random sample of the individuals in the training set.
36
36
  def grow
37
37
  @size.times do |i|
38
- Nimbus.write("Creating trees: #{i+1}/#{@size} ")
38
+ Nimbus.write("\rCreating trees: #{i+1}/#{@size} ")
39
39
  tree_individuals_bag = individuals_random_sample
40
40
  tree_out_of_bag = oob tree_individuals_bag
41
- tree = Tree.new @options.tree
41
+ tree_class = (classification? ? ClassificationTree : RegressionTree)
42
+ tree = tree_class.new @options.tree
42
43
  @trees << tree.seed(@options.training_set.individuals, tree_individuals_bag, @options.training_set.ids_fenotypes)
43
44
  @tree_errors << tree.generalization_error_from_oob(tree_out_of_bag)
44
45
  @tree_snp_importances << tree.estimate_importances(tree_out_of_bag)
@@ -46,13 +47,18 @@ module Nimbus
46
47
  Nimbus.clear_line!
47
48
  end
48
49
  average_snp_importances
49
- average_predictions
50
+ totalize_predictions
50
51
  end
51
-
52
- # Traverse a testing set through every tree of the forest and get averaged predictions for every individual in the sample.
52
+
53
+ # Traverse a testing set through every tree of the forest.
53
54
  #
54
55
  # This is the method called when the application's configuration flags testing on.
55
56
  def traverse
57
+ classification? ? traverse_classification_forest : traverse_regression_forest
58
+ end
59
+
60
+ # Traverse a testing set through every regression tree of the forest and get averaged predictions for every individual in the sample.
61
+ def traverse_regression_forest
56
62
  @predictions = {}
57
63
  prediction_count = trees.size
58
64
  @options.read_testing_data{|individual|
@@ -63,44 +69,66 @@ module Nimbus
63
69
  @predictions[individual.id] = (individual_prediction / prediction_count).round(5)
64
70
  }
65
71
  end
66
-
72
+
73
+ # Traverse a testing set through every classification tree of the forest and get majority class predictions for every individual in the sample.
74
+ def traverse_classification_forest
75
+ @predictions = {}
76
+ @options.read_testing_data{|individual|
77
+ individual_prediction = []
78
+ trees.each do |t|
79
+ individual_prediction << Nimbus::Tree.traverse(t, individual.snp_list)
80
+ end
81
+ @predictions[individual.id] = Nimbus::LossFunctions.majority_class_in_list(individual_prediction, @options.tree[:classes])
82
+ }
83
+ end
84
+
67
85
  # The array containing every tree in the forest, to YAML format.
68
86
  def to_yaml
69
87
  @trees.to_yaml
70
88
  end
71
-
89
+
72
90
  private
73
-
91
+
74
92
  def individuals_random_sample
75
93
  individuals_sample = bag.inject([]){|items, i| items << bag.sample }.sort
76
94
  end
77
-
95
+
78
96
  def oob(in_bag=[])
79
97
  bag - in_bag.uniq
80
98
  end
81
-
99
+
82
100
  def bag
83
101
  @bag ||= @options.training_set.all_ids
84
102
  end
85
-
103
+
86
104
  def acumulate_predictions(preds)
87
105
  preds.each_pair.each{|id, value|
88
106
  if @predictions[id].nil?
89
- @predictions[id] = value
107
+ @predictions[id] = (classification? ? [value] : value)
90
108
  @times_predicted[id] = 1.0
91
109
  else
92
- @predictions[id] += value
110
+ classification? ? (@predictions[id] << value) : (@predictions[id] += value)
93
111
  @times_predicted[id] += 1
94
112
  end
95
113
  }
96
114
  end
97
-
115
+
116
+ def totalize_predictions
117
+ classification? ? majority_class_predicted : average_predictions
118
+ end
119
+
98
120
  def average_predictions
99
121
  @predictions.each_pair{|id, value|
100
122
  @predictions[id] = (@predictions[id] / @times_predicted[id]).round(5)
101
123
  }
102
124
  end
103
-
125
+
126
+ def majority_class_predicted
127
+ @predictions.each_pair{|id, values|
128
+ @predictions[id] = Nimbus::LossFunctions.majority_class_in_list(values, @options.tree[:classes])
129
+ }
130
+ end
131
+
104
132
  def average_snp_importances
105
133
  1.upto(@options.tree_SNP_total_count) {|snp|
106
134
  @snp_importances[snp] = 0.0
@@ -110,7 +138,15 @@ module Nimbus
110
138
  @snp_importances[snp] = @snp_importances[snp] / @size
111
139
  }
112
140
  end
113
-
141
+
142
+ def classification?
143
+ @options.tree[:classes]
144
+ end
145
+
146
+ def regression?
147
+ @options.tree[:classes].nil?
148
+ end
149
+
114
150
  end
115
-
151
+
116
152
  end
@@ -1,19 +1,19 @@
1
1
  module Nimbus
2
2
  #####################################################################
3
- # Nimbus Individual object.
4
- #
5
- # It represents a single individual of a training or testing sample.
6
- #
3
+ # Nimbus Individual object.
4
+ #
5
+ # It represents a single individual of a training or testing sample.
6
+ #
7
7
  # This class stores information about an individual:
8
8
  #
9
9
  # * id,
10
10
  # * values for all the SNPs of the individual,
11
- # * fenotype if present,
11
+ # * fenotype if present,
12
12
  # * the prediction if it exists.
13
13
  #
14
14
  class Individual
15
15
  attr_accessor :id, :fenotype, :prediction, :snp_list
16
-
16
+
17
17
  # Initialize individual with passed data.
18
18
  def initialize(i, fen, snps=[])
19
19
  self.id = i
@@ -21,5 +21,5 @@ module Nimbus
21
21
  self.snp_list = snps
22
22
  end
23
23
  end
24
-
24
+
25
25
  end
@@ -1,23 +1,24 @@
1
1
  # encoding: utf-8
2
2
  module Nimbus
3
-
3
+
4
4
  #####################################################################
5
5
  # Math functions.
6
- #
6
+ #
7
7
  # The LossFunctions class provides handy mathematical functions as class methods
8
8
  # to be used by Tree and Forest when estimating predictions, errors and loss functions
9
- # for training and testing data.
9
+ # for training and testing data.
10
10
  #
11
11
  module LossFunctions
12
-
12
+
13
13
  class << self
14
-
14
+ ## REGRESSION
15
+
15
16
  # Simple average: sum(n) / n
16
17
  def average(ids, value_table)
17
18
  ids.inject(0.0){|sum, i| sum + value_table[i]} / ids.size
18
19
  end
19
20
 
20
- # Mean squared error: sum (x-y)^2
21
+ # Mean squared error: sum (x-y)^2
21
22
  def mean_squared_error(ids, value_table, mean = nil)
22
23
  mean ||= self.average ids, value_table
23
24
  ids.inject(0.0){|sum, i| sum + ((value_table[i] - mean)**2) }
@@ -29,14 +30,47 @@ module Nimbus
29
30
  def quadratic_loss(ids, value_table, mean = nil)
30
31
  self.mean_squared_error(ids, value_table, mean) / ids.size
31
32
  end
32
-
33
+
33
34
  # Difference between two values, squared. (x-y)^2
34
35
  def squared_difference(x,y)
35
36
  0.0 + (x-y)**2
36
37
  end
37
-
38
+
39
+ ## CLASSIFICATION
40
+
41
+ # Gini index of a list of classified individuals.
42
+ #
43
+ # If a dataset T contains examples from n classes, then:
44
+ # gini(T) = 1 - Sum (Pj)^2
45
+ # where Pj is the relative frequency of class j in T
46
+ def gini_index(ids, value_table, classes)
47
+ total_size = ids.size.to_f
48
+ gini = 1 - class_sizes(ids, value_table, classes).inject(0.0){|sum, size|
49
+ sum + (size/total_size)**2}
50
+ gini.round(5)
51
+ end
52
+
53
+ # Majority class of a list of classified individuals.
54
+ # If more than one class has the same number of individuals,
55
+ # one of the majority classes is selected randomly.
56
+ def majority_class(ids, value_table, classes)
57
+ sizes = class_sizes(ids, value_table, classes)
58
+ Hash[classes.zip sizes].keep_if{|k,v| v == sizes.max}.keys.sample
59
+ end
60
+
61
+ # Majority class of a list of classes.
62
+ # If more than one class has the same number of individuals,
63
+ # one of the majority classes is selected randomly.
64
+ def majority_class_in_list(list, classes)
65
+ sizes = classes.map{|c| list.count{|i| i == c}}
66
+ Hash[classes.zip sizes].keep_if{|k,v| v == sizes.max}.keys.sample
67
+ end
68
+
69
+ # Array with the list of sizes of each class in the given list of individuals.
70
+ def class_sizes(ids, value_table, classes)
71
+ classes.map{|c| ids.count{|i| value_table[i] == c}}
72
+ end
38
73
  end
39
-
74
+
40
75
  end
41
76
  end
42
-