nimbus 0.9 → 0.10

Sign up to get free protection for your applications and to get access to all the features.
data/lib/nimbus.rb CHANGED
@@ -9,6 +9,11 @@ require 'nimbus/tree'
9
9
  require 'nimbus/forest'
10
10
  require 'nimbus/application'
11
11
 
12
+ #####################################################################
13
+ # Nimbus module.
14
+ # Used as a namespace containing all the Nimbus code.
15
+ # The module defines a Nimbus::Application and interacts with the user output console.
16
+ #
12
17
  module Nimbus
13
18
 
14
19
  STDERR = $stderr
@@ -1,15 +1,16 @@
1
1
  module Nimbus
2
2
 
3
3
  #####################################################################
4
- # Nimbus main application object. When invoking +nimbus+ from the
5
- # command line, a Nimbus::Application object is created and run.
4
+ # Nimbus main application object.
5
+ #
6
+ # When invoking +nimbus+ from the command line,
7
+ # a Nimbus::Application object is created and run.
6
8
  #
7
9
  class Application
8
10
  attr_accessor :config
9
11
 
10
12
  # Initialize a Nimbus::Application object.
11
13
  # Check and load the configuration options.
12
- #
13
14
  def initialize
14
15
  nimbus_exception_handling do
15
16
  config.load
@@ -18,10 +19,11 @@ module Nimbus
18
19
  end
19
20
 
20
21
  # Run the Nimbus application. The run method performs the following
21
- # two steps:
22
+ # three steps:
22
23
  #
23
- # * Creates a Nimbus::Forest object.
24
- # * Writes results to output files.
24
+ # * Create a Nimbus::Forest object.
25
+ # * Decide action to take: train a random forest and/or use the forest to predict values for a testing set.
26
+ # * Write results to output files.
25
27
  def run
26
28
  nimbus_exception_handling do
27
29
 
@@ -44,6 +46,8 @@ module Nimbus
44
46
  end
45
47
 
46
48
  # Creates an instance of Nimbus::Configuration if it does not exist.
49
+ # This config object contains every option to be used for the random forest
50
+ # including the user input set through the config.yml file.
47
51
  def config
48
52
  @config ||= ::Nimbus::Configuration.new
49
53
  end
@@ -1,4 +1,15 @@
1
1
  module Nimbus
2
+ #####################################################################
3
+ # Nimbus configuration object.
4
+ #
5
+ # This class reads every user file.
6
+ # Once the user's config.yml file is loaded, a set of default and
7
+ # custom options is created and stored.
8
+ #
9
+ # Nimbus::Configuration also reads the testing files and the data
10
+ # to create the training set to be passed to the Nimbus::Forest random
11
+ # forest generator and the Nimbus::Tree classes in it.
12
+ #
2
13
  class Configuration
3
14
  attr_accessor(
4
15
  :training_file,
@@ -42,7 +53,9 @@ module Nimbus
42
53
  :output_snp_importances_file => 'snp_importances.txt'
43
54
  }
44
55
 
45
-
56
+ # Initialize a Nimbus::Configuration object.
57
+ #
58
+ # Set all options to their default values.
46
59
  def initialize
47
60
  @do_training = false
48
61
  @do_testing = false
@@ -61,6 +74,7 @@ module Nimbus
61
74
  @output_snp_importances_file = File.expand_path(DEFAULTS[:output_snp_importances_file], Dir.pwd)
62
75
  end
63
76
 
77
+ # Accessor method for the tree-related subset of options.
64
78
  def tree
65
79
  {
66
80
  :snp_sample_size => @tree_SNP_sample_size,
@@ -69,6 +83,16 @@ module Nimbus
69
83
  }
70
84
  end
71
85
 
86
+ # This is the first method to be called on Configuration when a config.yml file
87
+ # exists with user input options for the forest.
88
+ #
89
+ # * The method will read the config file and change the default value of the selected options.
90
+ # * Then based on the options and the existence of training, testing and forest files, it will mark:
91
+ # - if training is needed,
92
+ # - if testing is needed,
93
+ # - which forest will be used for the testing.
94
+ # * Finally it will run basic checks on the input data trying to prevent future program errors.
95
+ #
72
96
  def load(config_file = DEFAULTS[:config_file])
73
97
  user_config_params = {}
74
98
  if File.exists?(File.expand_path(config_file, Dir.pwd))
@@ -107,6 +131,8 @@ module Nimbus
107
131
  log_configuration
108
132
  end
109
133
 
134
+ # The method reads the training file, and if the data is valid, creates a Nimbus::TrainingSet
135
+ # containing every individual to be used as training sample for a random forest.
110
136
  def load_training_data
111
137
  File.open(@training_file) {|file|
112
138
  @training_set = Nimbus::TrainingSet.new({}, {})
@@ -122,6 +148,7 @@ module Nimbus
122
148
  }
123
149
  end
124
150
 
151
+ # Reads the testing file, and if the data is valid, yields one Nimbus::Individual at a time.
125
152
  def read_testing_data
126
153
  File.open(@testing_file) {|file|
127
154
  file.each do |line|
@@ -135,6 +162,9 @@ module Nimbus
135
162
  }
136
163
  end
137
164
 
165
+ # Creates a Nimbus::Forest object from a user defined random forest data file.
166
+ #
167
+ # The format of the input file should be the same as the forest output data of a Nimbus Application.
138
168
  def load_forest
139
169
  trees = []
140
170
  if File.exists?(@forest_file)
@@ -149,10 +179,16 @@ module Nimbus
149
179
  forest
150
180
  end
151
181
 
182
+ # Include tests to be passed by the info contained in the config file.
183
+ #
184
+ # If some of the configuration data provided by the user is invalid, an error is raised and execution stops.
152
185
  def check_configuration
153
186
  raise Nimbus::ConfigurationError, "The mtry sample size must be smaller than the total SNPs count." if @tree_SNP_sample_size > @tree_SNP_total_count
154
187
  end
155
188
 
189
+ # Prints the information stored in the Nimbus::Configuration object
190
+ #
191
+ # It could include errors on the configuration input data, training related info and/or testing related info.
156
192
  def log_configuration
157
193
  if !@do_training && !@do_testing
158
194
  Nimbus.message "*" * 50
@@ -1,10 +1,18 @@
1
1
  module Nimbus
2
+ # Nimbus custom Error class.
2
3
  class Error < StandardError; end
4
+ # Error when a non existent or invalid option is used.
3
5
  class InvalidOptionError < Error; end
6
+ # Error in some of the input files.
4
7
  class InputFileError < Error; end
8
+ # Error if data from some input file are incorrectly formatted.
5
9
  class WrongFormatFileError < Error; end
10
+ # Error if configuration options are invalid.
6
11
  class ConfigurationError < Error; end
12
+ # Error handling a random Forest.
7
13
  class ForestError < Error; end
14
+ # Error handling a Tree object.
8
15
  class TreeError < Error; end
16
+ # Error in the data of an Individual object.
9
17
  class IndividualError < Error; end
10
18
  end
data/lib/nimbus/forest.rb CHANGED
@@ -1,9 +1,14 @@
1
1
  module Nimbus
2
-
2
+
3
+ #####################################################################
4
+ # Forest represents the Random forest being generated
5
+ # (or used to test samples) by the application object.
6
+ #
3
7
  class Forest
4
8
  attr_accessor :size, :trees, :bag, :predictions, :tree_errors, :snp_importances
5
9
  attr_accessor :options
6
10
 
11
+ # Initialize Forest object with options included in the Nimbus::Configuration object received.
7
12
  def initialize(config)
8
13
  @trees = []
9
14
  @tree_errors = []
@@ -16,6 +21,18 @@ module Nimbus
16
21
  raise Nimbus::ForestError, "Forest size parameter (#{@size}) is invalid. You need at least one tree." if @size < 1
17
22
  end
18
23
 
24
+ # Creates a random forest based on the TrainingSet included in the configuration, creating N random trees (size N defined in the configuration).
25
+ #
26
+ # This is the method called when the application's configuration flags training on.
27
+ #
28
+ # It performs these tasks:
29
+ #
30
+ # * grow the forest (all the N random trees)
31
+ # * store generalization errors for every tree
32
+ # * obtain averaged importances for all the SNPs
33
+ # * calculate averaged predictions for all individuals in the training sample
34
+ #
35
+ # Every tree of the forest is created with a different random sample of the individuals in the training set.
19
36
  def grow
20
37
  @size.times do |i|
21
38
  Nimbus.write("Creating trees: #{i+1}/#{@size} ")
@@ -32,6 +49,9 @@ module Nimbus
32
49
  average_predictions
33
50
  end
34
51
 
52
+ # Traverse a testing set through every tree of the forest and get averaged predictions for every individual in the sample.
53
+ #
54
+ # This is the method called when the application's configuration flags testing on.
35
55
  def traverse
36
56
  @predictions = {}
37
57
  prediction_count = trees.size
@@ -44,6 +64,7 @@ module Nimbus
44
64
  }
45
65
  end
46
66
 
67
+ # The array containing every tree in the forest, to YAML format.
47
68
  def to_yaml
48
69
  @trees.to_yaml
49
70
  end
@@ -1,8 +1,20 @@
1
1
  module Nimbus
2
-
2
+ #####################################################################
3
+ # Nimbus Individual object.
4
+ #
5
+ # It represents a single individual of a training or testing sample.
6
+ #
7
+ # This class stores information about an individual:
8
+ #
9
+ # * id,
10
+ # * values for all the SNPs of the individual,
11
+ # * fenotype if present,
12
+ # * the prediction if it exists.
13
+ #
3
14
  class Individual
4
15
  attr_accessor :id, :fenotype, :prediction, :snp_list
5
16
 
17
+ # Initialize individual with passed data.
6
18
  def initialize(i, fen, snps=[])
7
19
  self.id = i
8
20
  self.fenotype = fen
@@ -1,21 +1,36 @@
1
+ # encoding: utf-8
1
2
  module Nimbus
3
+
4
+ #####################################################################
5
+ # Math functions.
6
+ #
7
+ # The LossFunctions class provides handy mathematical functions as class methods
8
+ # to be used by Tree and Forest when estimating predictions, errors and loss functions
9
+ # for training and testing data.
10
+ #
2
11
  module LossFunctions
3
12
 
4
13
  class << self
5
-
14
+
15
+ # Simple average: sum(n) / n
6
16
  def average(ids, value_table)
7
17
  ids.inject(0.0){|sum, i| sum + value_table[i]} / ids.size
8
18
  end
9
19
 
20
+ # Sum of squared errors around the mean (not divided by n): sum (x-y)^2
10
21
  def mean_squared_error(ids, value_table, mean = nil)
11
22
  mean ||= self.average ids, value_table
12
23
  ids.inject(0.0){|sum, i| sum + ((value_table[i] - mean)**2) }
13
24
  end
14
25
 
26
+ # Quadratic loss: averaged mean squared error: sum (x-y)^2 / n
27
+ #
28
+ # Default loss function for regression forests.
15
29
  def quadratic_loss(ids, value_table, mean = nil)
16
30
  self.mean_squared_error(ids, value_table, mean) / ids.size
17
31
  end
18
32
 
33
+ # Difference between two values, squared. (x-y)^2
19
34
  def squared_difference(x,y)
20
35
  0.0 + (x-y)**2
21
36
  end
@@ -1,13 +1,19 @@
1
1
  module Nimbus
2
-
2
+ #####################################################################
3
+ # Set of individuals to be used as training sample for a random forest.
4
+ #
5
+ # The TrainingSet class stores an array of individuals, and a hash with the fenotypes of every individual indexed by id.
6
+ #
3
7
  class TrainingSet
4
8
  attr_accessor :individuals, :ids_fenotypes
5
9
 
10
+ # Initialize a new training set with the individuals and fenotype info received.
6
11
  def initialize(individuals, ids_fenotypes)
7
12
  @individuals = individuals
8
13
  @ids_fenotypes = ids_fenotypes
9
14
  end
10
15
 
16
+ # Array of all the ids of the individuals in this training sample.
11
17
  def all_ids
12
18
  @all_ids ||= @ids_fenotypes.keys
13
19
  @all_ids
data/lib/nimbus/tree.rb CHANGED
@@ -1,15 +1,34 @@
1
1
  module Nimbus
2
-
2
+
3
+ #####################################################################
4
+ # Tree object representing a random tree.
5
+ #
6
+ # A tree is generated following these steps:
7
+ #
8
+ # * 1: Calculate loss function for the individuals in the node (first node contains all the individuals).
9
+ # * 2: Take a random sample of the SNPs (size m << total count of SNPs)
10
+ # * 3: Compute the loss function for the split of the sample based on value of every SNP.
11
+ # * 4: If the SNP with minimum loss function also minimizes the general loss of the node, split the individuals sample in three nodes, based on value for that SNP [0, 1, or 2]
12
+ # * 5: Repeat from 1 for every node until:
13
+ # - a) The individuals count in that node is < minimum size OR
14
+ # - b) None of the SNP splits has a loss function smaller than the node loss function
15
+ # * 6: When a node stops, label the node with the average fenotype value of the individuals in the node.
16
+ #
3
17
  class Tree
4
18
  attr_accessor :snp_sample_size, :snp_total_count, :node_min_size, :used_snps, :structure, :generalization_error, :predictions, :importances
5
19
  attr_accessor :individuals, :id_to_fenotype
6
20
 
21
+ # Initialize Tree object with the configuration (as in Nimbus::Configuration.tree) options received.
7
22
  def initialize(options)
8
23
  @snp_total_count = options[:snp_total_count]
9
24
  @snp_sample_size = options[:snp_sample_size]
10
25
  @node_min_size = options[:tree_node_min_size]
11
26
  end
12
27
 
28
+ # Creates the structure of the tree, as a hash of SNP splits and values.
29
+ #
30
+ # It just initializes the needed variables and then defines the first node of the tree.
31
+ # The rest of the structure of the tree is computed recursively building every node calling +build_node+.
13
32
  def seed(all_individuals, individuals_sample, ids_fenotypes)
14
33
  @individuals = all_individuals
15
34
  @id_to_fenotype = ids_fenotypes
@@ -19,6 +38,11 @@ module Nimbus
19
38
  @structure = build_node individuals_sample, Nimbus::LossFunctions.average(individuals_sample, @id_to_fenotype)
20
39
  end
21
40
 
41
+ # Creates a node by taking a random sample of the SNPs and computing the loss function for every split by SNP of that sample.
42
+ #
43
+ # * If SNP_min is the SNP with the smallest loss function and it is < the loss function of the node, it splits the individuals sample in three:
44
+ # (those with value 0 for the SNP_min, those with value 1 for the SNP_min, and those with value 2 for the SNP_min) then it builds these 3 new nodes.
45
+ # * Otherwise every individual in the node gets labeled with the average of the fenotype values of all of them.
22
46
  def build_node(individuals_ids, y_hat)
23
47
  # General loss function value for the node
24
48
  individuals_count = individuals_ids.size
@@ -45,16 +69,13 @@ module Nimbus
45
69
  return build_branch(min_SNP, split, means, y_hat) if min_loss < node_loss_function
46
70
  return label_node(y_hat, individuals_ids)
47
71
  end
48
-
49
- def build_branch(snp, split, y_hats, parent_y_hat)
50
- node_0 = split[0].size == 0 ? label_node(parent_y_hat, []) : build_node(split[0], y_hats[0])
51
- node_1 = split[1].size == 0 ? label_node(parent_y_hat, []) : build_node(split[1], y_hats[1])
52
- node_2 = split[2].size == 0 ? label_node(parent_y_hat, []) : build_node(split[2], y_hats[2])
53
-
54
- split_by_snp(snp)
55
- return { snp => [node_0, node_1, node_2] }
56
- end
57
72
 
73
+ # Compute generalization error for the tree.
74
+ #
75
+ # Traversing the 'out of bag' (OOB) sample (those individuals of the training set not
76
+ # used in the building of this tree) through the tree, and comparing
77
+ # the prediction with the real fenotype of the individual (and then averaging), it is
78
+ # possible to calculate the unbiased generalization error for the tree.
58
79
  def generalization_error_from_oob(oob_ids)
59
80
  return nil if (@structure.nil? || @individuals.nil? || @id_to_fenotype.nil?)
60
81
  oob_errors = {}
@@ -65,6 +86,15 @@ module Nimbus
65
86
  @generalization_error = Nimbus::LossFunctions.average oob_ids, oob_errors
66
87
  end
67
88
 
89
+ # Estimation of importance for every SNP.
90
+ #
91
+ # The importance of any SNP in the tree is calculated using the OOB sample.
92
+ # For every SNP, every individual in the sample is pushed down the tree but with the
93
+ # value of that SNP permuted with another individual in the sample.
94
+ #
95
+ # That way the difference between the regular prediction and the prediction with the SNP value modified can be estimated for any given SNP.
96
+ #
97
+ # This method computes importance estimations for every SNP used in the tree (for any other SNP it would be 0).
68
98
  def estimate_importances(oob_ids)
69
99
  return nil if (@generalization_error.nil? && generalization_error_from_oob(oob_ids))
70
100
  oob_individuals_count = oob_ids.size
@@ -81,25 +111,30 @@ module Nimbus
81
111
  @importances
82
112
  end
83
113
 
114
+ # Class method to traverse a single individual through a tree structure.
115
+ #
116
+ # Returns the prediction for that individual (the label of the final node reached by the individual).
84
117
  def self.traverse(tree_structure, data)
85
118
  return tree_structure if tree_structure.is_a? Numeric
86
119
  raise Nimbus::TreeError, "Forest data has invalid structure. Please check your forest data (file)." if !(tree_structure.is_a?(Hash) && tree_structure.keys.size == 1)
87
120
  return self.traverse( tree_structure.values.first[ data[tree_structure.keys.first - 1].to_i], data)
88
121
  end
89
122
 
90
- def traverse_with_permutation(tree_structure, data, snp_to_permute, individual_to_permute)
91
- return tree_structure if tree_structure.is_a? Numeric
92
- individual_data = (tree_structure.keys.first == snp_to_permute ? individual_to_permute : data)
93
- return traverse_with_permutation( tree_structure.values.first[ individual_data[tree_structure.keys.first - 1].to_i], data, snp_to_permute, individual_to_permute)
94
- end
95
-
96
-
97
123
  private
98
124
 
99
125
  def snps_random_sample
100
126
  (1..@snp_total_count).to_a.sample(@snp_sample_size).sort
101
127
  end
102
128
 
129
+ def build_branch(snp, split, y_hats, parent_y_hat)
130
+ node_0 = split[0].size == 0 ? label_node(parent_y_hat, []) : build_node(split[0], y_hats[0])
131
+ node_1 = split[1].size == 0 ? label_node(parent_y_hat, []) : build_node(split[1], y_hats[1])
132
+ node_2 = split[2].size == 0 ? label_node(parent_y_hat, []) : build_node(split[2], y_hats[2])
133
+
134
+ split_by_snp(snp)
135
+ return { snp => [node_0, node_1, node_2] }
136
+ end
137
+
103
138
  def label_node(value, ids)
104
139
  label = value.round(5)
105
140
  ids.uniq.each{|i| @predictions[i] = label}
@@ -120,6 +155,12 @@ module Nimbus
120
155
  @used_snps << x
121
156
  end
122
157
 
158
+ def traverse_with_permutation(tree_structure, data, snp_to_permute, individual_to_permute)
159
+ return tree_structure if tree_structure.is_a? Numeric
160
+ individual_data = (tree_structure.keys.first == snp_to_permute ? individual_to_permute : data)
161
+ return traverse_with_permutation( tree_structure.values.first[ individual_data[tree_structure.keys.first - 1].to_i], data, snp_to_permute, individual_to_permute)
162
+ end
163
+
123
164
  end
124
165
 
125
166
  end
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: nimbus
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: "0.9"
5
+ version: "0.10"
6
6
  platform: ruby
7
7
  authors:
8
8
  - "Juanjo Baz\xC3\xA1n"