nimbus 1.0.1 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,12 +1,12 @@
1
1
  module Nimbus
2
2
  #####################################################################
3
- # Nimbus configuration object.
4
- #
3
+ # Nimbus configuration object.
4
+ #
5
5
  # This class reads every user file.
6
- # Once the user's config.yml file is loaded, a set of default and
6
+ # Once the user's config.yml file is loaded, a set of default and
7
7
  # custom options is created and stored.
8
- #
9
- # Nimbus::Configuration also reads the testing files and the data
8
+ #
9
+ # Nimbus::Configuration also reads the testing files and the data
10
10
  # to create the training set to be passed to the Nimbus::Forest random
11
11
  # forest generator and the Nimbus::Tree classes in it.
12
12
  #
@@ -15,6 +15,7 @@ module Nimbus
15
15
  :training_file,
16
16
  :testing_file,
17
17
  :forest_file,
18
+ :classes,
18
19
  :config_file,
19
20
  :forest_size,
20
21
  :tree_SNP_sample_size,
@@ -32,65 +33,66 @@ module Nimbus
32
33
  :output_snp_importances_file,
33
34
  :silent
34
35
  )
35
-
36
+
36
37
  DEFAULTS = {
37
38
  :forest_size => 500,
38
39
  :tree_SNP_sample_size => 60,
39
40
  :tree_SNP_total_count => 200,
40
41
  :tree_node_min_size => 5,
41
-
42
+
42
43
  :loss_function_discrete => 'majority_class',
43
- :loss_function_continuous => 'mean',
44
-
44
+ :loss_function_continuous => 'average',
45
+
45
46
  :training_file => 'training.data',
46
47
  :testing_file => 'testing.data',
47
48
  :forest_file => 'forest.yml',
48
49
  :config_file => 'config.yml',
49
-
50
+
50
51
  :output_forest_file => 'random_forest.yml',
51
52
  :output_training_file => 'training_file_predictions.txt',
52
53
  :output_testing_file => 'testing_file_predictions.txt',
53
54
  :output_tree_errors_file => 'generalization_errors.txt',
54
55
  :output_snp_importances_file => 'snp_importances.txt',
55
-
56
+
56
57
  :silent => false
57
58
  }
58
-
59
+
59
60
  # Initialize a Nimbus::Configuration object.
60
61
  #
61
62
  # Set all options to their default values.
62
63
  def initialize
63
64
  @do_training = false
64
65
  @do_testing = false
65
-
66
+
66
67
  @forest_size = DEFAULTS[:forest_size]
67
68
  @tree_SNP_sample_size = DEFAULTS[:tree_SNP_sample_size]
68
69
  @tree_SNP_total_count = DEFAULTS[:tree_SNP_total_count]
69
70
  @tree_node_min_size = DEFAULTS[:tree_node_min_size]
70
71
  @loss_function_discrete = DEFAULTS[:loss_function_discrete]
71
72
  @loss_function_continuous = DEFAULTS[:loss_function_continuous]
72
-
73
+
73
74
  @output_forest_file = File.expand_path(DEFAULTS[:output_forest_file], Dir.pwd)
74
75
  @output_training_file = File.expand_path(DEFAULTS[:output_training_file], Dir.pwd)
75
76
  @output_testing_file = File.expand_path(DEFAULTS[:output_testing_file], Dir.pwd)
76
77
  @output_tree_errors_file = File.expand_path(DEFAULTS[:output_tree_errors_file], Dir.pwd)
77
78
  @output_snp_importances_file = File.expand_path(DEFAULTS[:output_snp_importances_file], Dir.pwd)
78
-
79
+
79
80
  @silent = ENV['nimbus_test'] == 'running_nimbus_tests' ? true : DEFAULTS[:silent]
80
81
  end
81
-
82
+
82
83
  # Accessor method for the tree-related subset of options.
83
84
  def tree
84
- {
85
+ {
85
86
  :snp_sample_size => @tree_SNP_sample_size,
86
87
  :snp_total_count => @tree_SNP_total_count,
87
- :tree_node_min_size => @tree_node_min_size
88
+ :tree_node_min_size => @tree_node_min_size,
89
+ :classes => @classes
88
90
  }
89
91
  end
90
-
92
+
91
93
  # This is the first method to be called on Configuration when a config.yml file
92
94
  # exists with user input options for the forest.
93
- #
95
+ #
94
96
  # * The method will read the config file and change the default value of the selected options.
95
97
  # * Then based on the options and the existence of training, testing and forest files, it will mark:
96
98
  # - if training is needed,
@@ -110,24 +112,26 @@ module Nimbus
110
112
  raise Nimbus::WrongFormatFileError, "It was not posible to parse the config file (#{config_file}): \r\n#{e.message} "
111
113
  end
112
114
  end
113
-
115
+
114
116
  if user_config_params['input']
115
117
  @training_file = File.expand_path(user_config_params['input']['training'], dirname) if user_config_params['input']['training']
116
118
  @testing_file = File.expand_path(user_config_params['input']['testing' ], dirname) if user_config_params['input']['testing']
117
119
  @forest_file = File.expand_path(user_config_params['input']['forest' ], dirname) if user_config_params['input']['forest']
120
+ @classes = user_config_params['input']['classes'] if user_config_params['input']['classes']
118
121
  else
119
122
  @training_file = File.expand_path(DEFAULTS[:training_file], Dir.pwd) if File.exists? File.expand_path(DEFAULTS[:training_file], Dir.pwd)
120
123
  @testing_file = File.expand_path(DEFAULTS[:testing_file ], Dir.pwd) if File.exists? File.expand_path(DEFAULTS[:testing_file ], Dir.pwd)
121
124
  @forest_file = File.expand_path(DEFAULTS[:forest_file ], Dir.pwd) if File.exists? File.expand_path(DEFAULTS[:forest_file ], Dir.pwd)
122
125
  end
123
-
126
+
124
127
  @do_training = true if @training_file
125
128
  @do_testing = true if @testing_file
126
-
129
+ @classes = @classes.map{|c| c.to_s.strip} if @classes
130
+
127
131
  if @do_testing && !@do_training && !@forest_file
128
132
  raise Nimbus::InputFileError, "There is not random forest data (training file not defined, and forest file not found)."
129
133
  end
130
-
134
+
131
135
  if user_config_params['forest']
132
136
  @forest_size = user_config_params['forest']['forest_size'].to_i if user_config_params['forest']['forest_size']
133
137
  @tree_SNP_total_count = user_config_params['forest']['SNP_total_count'].to_i if user_config_params['forest']['SNP_total_count']
@@ -138,7 +142,7 @@ module Nimbus
138
142
  check_configuration
139
143
  log_configuration
140
144
  end
141
-
145
+
142
146
  # The method reads the training file, and if the data is valid, creates a Nimbus::TrainingSet
143
147
  # containing every individual to be used as training sample for a random forest.
144
148
  def load_training_data
@@ -150,12 +154,15 @@ module Nimbus
150
154
  raise Nimbus::InputFileError, "Individual ##{data_id} from training set has no value for all #{@tree_SNP_total_count} SNPs" unless snp_list.size == @tree_SNP_total_count
151
155
  raise Nimbus::InputFileError, "There are individuals with no ID, please check data in training file." unless (!data_id.nil? && data_id.strip != '')
152
156
  raise Nimbus::InputFileError, "Individual ##{data_id} has no fenotype value, please check data in training file." unless (!data_feno.nil? && data_feno.strip != '')
153
- @training_set.individuals[data_id.to_i] = Nimbus::Individual.new(data_id.to_i, data_feno.to_f, snp_list.map{|snp| snp.to_i})
154
- @training_set.ids_fenotypes[data_id.to_i] = data_feno.to_f
157
+ raise Nimbus::InputFileError, "Individual ##{data_id} has invalid class (not in [#{classes*', '}]), please check data in training file." unless (@classes.nil? || @classes.include?(data_feno))
158
+
159
+ data_feno = (@classes ? data_feno.to_s : data_feno.to_f)
160
+ @training_set.individuals[data_id.to_i] = Nimbus::Individual.new(data_id.to_i, data_feno, snp_list.map{|snp| snp.to_i})
161
+ @training_set.ids_fenotypes[data_id.to_i] = data_feno
155
162
  end
156
163
  }
157
164
  end
158
-
165
+
159
166
  # Reads the testing file, and if the data is valid, yields one Nimbus::Individual at a time.
160
167
  def read_testing_data
161
168
  File.open(@testing_file) {|file|
@@ -169,7 +176,7 @@ module Nimbus
169
176
  end
170
177
  }
171
178
  end
172
-
179
+
173
180
  # Creates a Nimbus::Forest object from a user defined random forest data file.
174
181
  #
175
182
  # The format of the input file should be the same as the forest output data of a Nimbus Application.
@@ -186,14 +193,14 @@ module Nimbus
186
193
  forest.trees = trees
187
194
  forest
188
195
  end
189
-
196
+
190
197
  # Validates the information contained in the config file.
191
198
  #
192
199
  # If some of the configuration data provided by the user is invalid, an error is raised and execution stops.
193
200
  def check_configuration
194
201
  raise Nimbus::ConfigurationError, "The mtry sample size must be smaller than the total SNPs count." if @tree_SNP_sample_size > @tree_SNP_total_count
195
202
  end
196
-
203
+
197
204
  # Prints the information stored in the Nimbus::Configuration object
198
205
  #
199
206
  # It could include errors on the configuration input data, training related info and/or testing related info.
@@ -209,31 +216,39 @@ module Nimbus
209
216
  Nimbus.message "*" * 50
210
217
  Nimbus.stop "Error: No input data. Nimbus finished."
211
218
  end
212
-
219
+
213
220
  Nimbus.message "*" * 50
214
- Nimbus.message "* Nimbus configured with the following parameters: "
221
+ Nimbus.message "* Nimbus version #{::Nimbus::VERSION}"
222
+ Nimbus.message "* configured with the following parameters: "
215
223
  Nimbus.message "* Forest size: #{@forest_size} trees"
216
224
  Nimbus.message "* Total SNP count: #{@tree_SNP_total_count}"
217
225
  Nimbus.message "* SNPs sample size (mtry): #{@tree_SNP_sample_size}"
218
226
  Nimbus.message "* Minimun node size in tree: #{@tree_node_min_size}"
227
+
228
+ if @classes
229
+ Nimbus.message "* Mode: CLASSIFICATION"
230
+ Nimbus.message "* Classes: [#{@classes*', '}]"
231
+ else
232
+ Nimbus.message "* Mode: REGRESSION"
233
+ end
219
234
  Nimbus.message "*" * 50
220
-
235
+
221
236
  if @do_training
222
237
  Nimbus.message "* Training data:"
223
238
  Nimbus.message "* Training file: #{@training_file}"
224
239
  Nimbus.message "*" * 50
225
240
  end
226
-
241
+
227
242
  if @do_testing
228
243
  Nimbus.message "* Data to be tested:"
229
244
  Nimbus.message "* Testing file: #{@testing_file}"
230
245
  if @forest_file
231
- Nimbus.message "* using the structure of the random forest stored in:"
246
+ Nimbus.message "* using the structure of the random forest stored in:"
232
247
  Nimbus.message "* Random forest file: #{@forest_file}"
233
248
  end
234
249
  Nimbus.message "*" * 50
235
250
  end
236
251
  end
237
-
252
+
238
253
  end
239
254
  end
data/lib/nimbus/forest.rb CHANGED
@@ -1,13 +1,13 @@
1
1
  module Nimbus
2
2
 
3
3
  #####################################################################
4
- # Forest represents the Random forest being generated
4
+ # Forest represents the Random forest being generated
5
5
  # (or used to test samples) by the application object.
6
6
  #
7
7
  class Forest
8
8
  attr_accessor :size, :trees, :bag, :predictions, :tree_errors, :snp_importances
9
9
  attr_accessor :options
10
-
10
+
11
11
  # Initialize Forest object with options included in the Nimbus::Configuration object received.
12
12
  def initialize(config)
13
13
  @trees = []
@@ -20,7 +20,7 @@ module Nimbus
20
20
  @tree_snp_importances = []
21
21
  raise Nimbus::ForestError, "Forest size parameter (#{@size}) is invalid. You need at least one tree." if @size < 1
22
22
  end
23
-
23
+
24
24
  # Creates a random forest based on the TrainingSet included in the configuration, creating N random trees (size N defined in the configuration).
25
25
  #
26
26
  # This is the method called when the application's configuration flags training on.
@@ -35,10 +35,11 @@ module Nimbus
35
35
  # Every tree of the forest is created with a different random sample of the individuals in the training set.
36
36
  def grow
37
37
  @size.times do |i|
38
- Nimbus.write("Creating trees: #{i+1}/#{@size} ")
38
+ Nimbus.write("\rCreating trees: #{i+1}/#{@size} ")
39
39
  tree_individuals_bag = individuals_random_sample
40
40
  tree_out_of_bag = oob tree_individuals_bag
41
- tree = Tree.new @options.tree
41
+ tree_class = (classification? ? ClassificationTree : RegressionTree)
42
+ tree = tree_class.new @options.tree
42
43
  @trees << tree.seed(@options.training_set.individuals, tree_individuals_bag, @options.training_set.ids_fenotypes)
43
44
  @tree_errors << tree.generalization_error_from_oob(tree_out_of_bag)
44
45
  @tree_snp_importances << tree.estimate_importances(tree_out_of_bag)
@@ -46,13 +47,18 @@ module Nimbus
46
47
  Nimbus.clear_line!
47
48
  end
48
49
  average_snp_importances
49
- average_predictions
50
+ totalize_predictions
50
51
  end
51
-
52
- # Traverse a testing set through every tree of the forest and get averaged predictions for every individual in the sample.
52
+
53
+ # Traverse a testing set through every tree of the forest.
53
54
  #
54
55
  # This is the method called when the application's configuration flags testing on.
55
56
  def traverse
57
+ classification? ? traverse_classification_forest : traverse_regression_forest
58
+ end
59
+
60
+ # Traverse a testing set through every regression tree of the forest and get averaged predictions for every individual in the sample.
61
+ def traverse_regression_forest
56
62
  @predictions = {}
57
63
  prediction_count = trees.size
58
64
  @options.read_testing_data{|individual|
@@ -63,44 +69,66 @@ module Nimbus
63
69
  @predictions[individual.id] = (individual_prediction / prediction_count).round(5)
64
70
  }
65
71
  end
66
-
72
+
73
+ # Traverse a testing set through every classification tree of the forest and get majority class predictions for every individual in the sample.
74
+ def traverse_classification_forest
75
+ @predictions = {}
76
+ @options.read_testing_data{|individual|
77
+ individual_prediction = []
78
+ trees.each do |t|
79
+ individual_prediction << Nimbus::Tree.traverse(t, individual.snp_list)
80
+ end
81
+ @predictions[individual.id] = Nimbus::LossFunctions.majority_class_in_list(individual_prediction, @options.tree[:classes])
82
+ }
83
+ end
84
+
67
85
  # The array containing every tree in the forest, to YAML format.
68
86
  def to_yaml
69
87
  @trees.to_yaml
70
88
  end
71
-
89
+
72
90
  private
73
-
91
+
74
92
  def individuals_random_sample
75
93
  individuals_sample = bag.inject([]){|items, i| items << bag.sample }.sort
76
94
  end
77
-
95
+
78
96
  def oob(in_bag=[])
79
97
  bag - in_bag.uniq
80
98
  end
81
-
99
+
82
100
  def bag
83
101
  @bag ||= @options.training_set.all_ids
84
102
  end
85
-
103
+
86
104
  def acumulate_predictions(preds)
87
105
  preds.each_pair.each{|id, value|
88
106
  if @predictions[id].nil?
89
- @predictions[id] = value
107
+ @predictions[id] = (classification? ? [value] : value)
90
108
  @times_predicted[id] = 1.0
91
109
  else
92
- @predictions[id] += value
110
+ classification? ? (@predictions[id] << value) : (@predictions[id] += value)
93
111
  @times_predicted[id] += 1
94
112
  end
95
113
  }
96
114
  end
97
-
115
+
116
+ def totalize_predictions
117
+ classification? ? majority_class_predicted : average_predictions
118
+ end
119
+
98
120
  def average_predictions
99
121
  @predictions.each_pair{|id, value|
100
122
  @predictions[id] = (@predictions[id] / @times_predicted[id]).round(5)
101
123
  }
102
124
  end
103
-
125
+
126
+ def majority_class_predicted
127
+ @predictions.each_pair{|id, values|
128
+ @predictions[id] = Nimbus::LossFunctions.majority_class_in_list(values, @options.tree[:classes])
129
+ }
130
+ end
131
+
104
132
  def average_snp_importances
105
133
  1.upto(@options.tree_SNP_total_count) {|snp|
106
134
  @snp_importances[snp] = 0.0
@@ -110,7 +138,15 @@ module Nimbus
110
138
  @snp_importances[snp] = @snp_importances[snp] / @size
111
139
  }
112
140
  end
113
-
141
+
142
+ def classification?
143
+ @options.tree[:classes]
144
+ end
145
+
146
+ def regression?
147
+ @options.tree[:classes].nil?
148
+ end
149
+
114
150
  end
115
-
151
+
116
152
  end
@@ -1,19 +1,19 @@
1
1
  module Nimbus
2
2
  #####################################################################
3
- # Nimbus Individual object.
4
- #
5
- # It represents a single individual of a training or testing sample.
6
- #
3
+ # Nimbus Individual object.
4
+ #
5
+ # It represents a single individual of a training or testing sample.
6
+ #
7
7
  # This class stores information about an individual:
8
8
  #
9
9
  # * id,
10
10
  # * values for all the SNPs of the individual,
11
- # * fenotype if present,
11
+ # * fenotype if present,
12
12
  # * the prediction if it exists.
13
13
  #
14
14
  class Individual
15
15
  attr_accessor :id, :fenotype, :prediction, :snp_list
16
-
16
+
17
17
  # Initialize individual with passed data.
18
18
  def initialize(i, fen, snps=[])
19
19
  self.id = i
@@ -21,5 +21,5 @@ module Nimbus
21
21
  self.snp_list = snps
22
22
  end
23
23
  end
24
-
24
+
25
25
  end
@@ -1,23 +1,24 @@
1
1
  # encoding: utf-8
2
2
  module Nimbus
3
-
3
+
4
4
  #####################################################################
5
5
  # Math functions.
6
- #
6
+ #
7
7
  # The LossFunctions module provides handy mathematical functions as module-level methods
8
8
  # to be used by Tree and Forest when estimating predictions, errors and loss functions
9
- # for training and testing data.
9
+ # for training and testing data.
10
10
  #
11
11
  module LossFunctions
12
-
12
+
13
13
  class << self
14
-
14
+ ## REGRESSION
15
+
15
16
  # Simple average: sum(n) / n
16
17
  def average(ids, value_table)
17
18
  ids.inject(0.0){|sum, i| sum + value_table[i]} / ids.size
18
19
  end
19
20
 
20
- # Mean squared error: sum (x-y)^2
21
+ # Sum of squared errors around the mean: sum (x-mean)^2 (not divided by n; quadratic_loss does that)
21
22
  def mean_squared_error(ids, value_table, mean = nil)
22
23
  mean ||= self.average ids, value_table
23
24
  ids.inject(0.0){|sum, i| sum + ((value_table[i] - mean)**2) }
@@ -29,14 +30,47 @@ module Nimbus
29
30
  def quadratic_loss(ids, value_table, mean = nil)
30
31
  self.mean_squared_error(ids, value_table, mean) / ids.size
31
32
  end
32
-
33
+
33
34
  # Difference between two values, squared. (x-y)^2
34
35
  def squared_difference(x,y)
35
36
  0.0 + (x-y)**2
36
37
  end
37
-
38
+
39
+ ## CLASSIFICATION
40
+
41
+ # Gini index of a list of classified individuals.
42
+ #
43
+ # If a dataset T contains examples from n classes, then:
44
+ # gini(T) = 1 - Sum (Pj)^2
45
+ # where Pj is the relative frequency of class j in T
46
+ def gini_index(ids, value_table, classes)
47
+ total_size = ids.size.to_f
48
+ gini = 1 - class_sizes(ids, value_table, classes).inject(0.0){|sum, size|
49
+ sum + (size/total_size)**2}
50
+ gini.round(5)
51
+ end
52
+
53
+ # Majority class of a list of classified individuals.
54
+ # If more than one class has the same number of individuals,
55
+ # one of the majority classes is selected randomly.
56
+ def majority_class(ids, value_table, classes)
57
+ sizes = class_sizes(ids, value_table, classes)
58
+ Hash[classes.zip sizes].keep_if{|k,v| v == sizes.max}.keys.sample
59
+ end
60
+
61
+ # Majority class of a list of classes.
62
+ # If more than one class has the same number of occurrences in the list,
63
+ # one of the majority classes is selected randomly.
64
+ def majority_class_in_list(list, classes)
65
+ sizes = classes.map{|c| list.count{|i| i == c}}
66
+ Hash[classes.zip sizes].keep_if{|k,v| v == sizes.max}.keys.sample
67
+ end
68
+
69
+ # Array with the list of sizes of each class in the given list of individuals.
70
+ def class_sizes(ids, value_table, classes)
71
+ classes.map{|c| ids.count{|i| value_table[i] == c}}
72
+ end
38
73
  end
39
-
74
+
40
75
  end
41
76
  end
42
-