nimbus 0.9 → 0.10
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/nimbus.rb +5 -0
- data/lib/nimbus/application.rb +10 -6
- data/lib/nimbus/configuration.rb +37 -1
- data/lib/nimbus/exceptions.rb +8 -0
- data/lib/nimbus/forest.rb +22 -1
- data/lib/nimbus/individual.rb +13 -1
- data/lib/nimbus/loss_functions.rb +16 -1
- data/lib/nimbus/training_set.rb +7 -1
- data/lib/nimbus/tree.rb +58 -17
- metadata +1 -1
data/lib/nimbus.rb
CHANGED
@@ -9,6 +9,11 @@ require 'nimbus/tree'
|
|
9
9
|
require 'nimbus/forest'
|
10
10
|
require 'nimbus/application'
|
11
11
|
|
12
|
+
#####################################################################
|
13
|
+
# Nimbus module.
|
14
|
+
# Used as a namespace containing all the Nimbus code.
|
15
|
+
# The module defines a Nimbus::Application and interacts with the user output console.
|
16
|
+
#
|
12
17
|
module Nimbus
|
13
18
|
|
14
19
|
STDERR = $stderr
|
data/lib/nimbus/application.rb
CHANGED
@@ -1,15 +1,16 @@
|
|
1
1
|
module Nimbus
|
2
2
|
|
3
3
|
#####################################################################
|
4
|
-
# Nimbus main application object.
|
5
|
-
#
|
4
|
+
# Nimbus main application object.
|
5
|
+
#
|
6
|
+
# When invoking +nimbus+ from the command line,
|
7
|
+
# a Nimbus::Application object is created and run.
|
6
8
|
#
|
7
9
|
class Application
|
8
10
|
attr_accessor :config
|
9
11
|
|
10
12
|
# Initialize a Nimbus::Application object.
|
11
13
|
# Check and load the configuration options.
|
12
|
-
#
|
13
14
|
def initialize
|
14
15
|
nimbus_exception_handling do
|
15
16
|
config.load
|
@@ -18,10 +19,11 @@ module Nimbus
|
|
18
19
|
end
|
19
20
|
|
20
21
|
# Run the Nimbus application. The run method performs the following
|
21
|
-
#
|
22
|
+
# three steps:
|
22
23
|
#
|
23
|
-
# *
|
24
|
-
# *
|
24
|
+
# * Create a Nimbus::Forest object.
|
25
|
+
# * Decide action to take: training a random forest and/or using the forest to predict values for a testing set.
|
26
|
+
# * Write results to output files.
|
25
27
|
def run
|
26
28
|
nimbus_exception_handling do
|
27
29
|
|
@@ -44,6 +46,8 @@ module Nimbus
|
|
44
46
|
end
|
45
47
|
|
46
48
|
# Creates an instance of Nimbus::Configuration if it does not exist.
|
49
|
+
# This config object contains every option to be used for the random forest
|
50
|
+
# including the user input set through the config.yml file.
|
47
51
|
def config
|
48
52
|
@config ||= ::Nimbus::Configuration.new
|
49
53
|
end
|
data/lib/nimbus/configuration.rb
CHANGED
@@ -1,4 +1,15 @@
|
|
1
1
|
module Nimbus
|
2
|
+
#####################################################################
|
3
|
+
# Nimbus configuration object.
|
4
|
+
#
|
5
|
+
# This class reads every user file.
|
6
|
+
# Once the user's config.yml file is loaded, a set of default and
|
7
|
+
# custom options is created and stored.
|
8
|
+
#
|
9
|
+
# Nimbus::Configuration also reads the testing files and the data
|
10
|
+
# to create the training set to be passed to the Nimbus::Forest random
|
11
|
+
# forest generator and the Nimbus::Tree classes in it.
|
12
|
+
#
|
2
13
|
class Configuration
|
3
14
|
attr_accessor(
|
4
15
|
:training_file,
|
@@ -42,7 +53,9 @@ module Nimbus
|
|
42
53
|
:output_snp_importances_file => 'snp_importances.txt'
|
43
54
|
}
|
44
55
|
|
45
|
-
|
56
|
+
# Initialize a Nimbus::Configuration object.
|
57
|
+
#
|
58
|
+
# Set all options to their default values.
|
46
59
|
def initialize
|
47
60
|
@do_training = false
|
48
61
|
@do_testing = false
|
@@ -61,6 +74,7 @@ module Nimbus
|
|
61
74
|
@output_snp_importances_file = File.expand_path(DEFAULTS[:output_snp_importances_file], Dir.pwd)
|
62
75
|
end
|
63
76
|
|
77
|
+
# Accessor method for the tree-related subset of options.
|
64
78
|
def tree
|
65
79
|
{
|
66
80
|
:snp_sample_size => @tree_SNP_sample_size,
|
@@ -69,6 +83,16 @@ module Nimbus
|
|
69
83
|
}
|
70
84
|
end
|
71
85
|
|
86
|
+
# This is the first method to be called on Configuration when a config.yml file
|
87
|
+
# exists with user input options for the forest.
|
88
|
+
#
|
89
|
+
# * The method will read the config file and change the default value of the selected options.
|
90
|
+
# * Then based on the options and the existence of training, testing and forest files, it will mark:
|
91
|
+
# - if training is needed,
|
92
|
+
# - if testing is needed,
|
93
|
+
# - which forest will be used for the testing.
|
94
|
+
# * Finally it will run basic checks on the input data trying to prevent future program errors.
|
95
|
+
#
|
72
96
|
def load(config_file = DEFAULTS[:config_file])
|
73
97
|
user_config_params = {}
|
74
98
|
if File.exists?(File.expand_path(config_file, Dir.pwd))
|
@@ -107,6 +131,8 @@ module Nimbus
|
|
107
131
|
log_configuration
|
108
132
|
end
|
109
133
|
|
134
|
+
# The method reads the training file, and if the data is valid, creates a Nimbus::TrainingSet
|
135
|
+
# containing every individual to be used as training sample for a random forest.
|
110
136
|
def load_training_data
|
111
137
|
File.open(@training_file) {|file|
|
112
138
|
@training_set = Nimbus::TrainingSet.new({}, {})
|
@@ -122,6 +148,7 @@ module Nimbus
|
|
122
148
|
}
|
123
149
|
end
|
124
150
|
|
151
|
+
# Reads the testing file, and if the data is valid, yields one Nimbus::Individual at a time.
|
125
152
|
def read_testing_data
|
126
153
|
File.open(@testing_file) {|file|
|
127
154
|
file.each do |line|
|
@@ -135,6 +162,9 @@ module Nimbus
|
|
135
162
|
}
|
136
163
|
end
|
137
164
|
|
165
|
+
# Creates a Nimbus::Forest object from a user defined random forest data file.
|
166
|
+
#
|
167
|
+
# The format of the input file should be the same as the forest output data of a Nimbus Application.
|
138
168
|
def load_forest
|
139
169
|
trees = []
|
140
170
|
if File.exists?(@forest_file)
|
@@ -149,10 +179,16 @@ module Nimbus
|
|
149
179
|
forest
|
150
180
|
end
|
151
181
|
|
182
|
+
# Runs the checks that the info contained in the config file must pass.
|
183
|
+
#
|
184
|
+
# If some of the configuration data provided by the user is invalid, an error is raised and execution stops.
|
152
185
|
def check_configuration
|
153
186
|
raise Nimbus::ConfigurationError, "The mtry sample size must be smaller than the total SNPs count." if @tree_SNP_sample_size > @tree_SNP_total_count
|
154
187
|
end
|
155
188
|
|
189
|
+
# Prints the information stored in the Nimbus::Configuration object.
|
190
|
+
#
|
191
|
+
# It could include errors on the configuration input data, training related info and/or testing related info.
|
156
192
|
def log_configuration
|
157
193
|
if !@do_training && !@do_testing
|
158
194
|
Nimbus.message "*" * 50
|
data/lib/nimbus/exceptions.rb
CHANGED
@@ -1,10 +1,18 @@
|
|
1
1
|
module Nimbus
|
2
|
+
# Nimbus custom Error class.
|
2
3
|
class Error < StandardError; end
|
4
|
+
# Error when a non existent or invalid option is used.
|
3
5
|
class InvalidOptionError < Error; end
|
6
|
+
# Error in some of the input files.
|
4
7
|
class InputFileError < Error; end
|
8
|
+
# Error if data from some input file are incorrectly formatted.
|
5
9
|
class WrongFormatFileError < Error; end
|
10
|
+
# Error if configuration options are invalid.
|
6
11
|
class ConfigurationError < Error; end
|
12
|
+
# Error handling a random Forest.
|
7
13
|
class ForestError < Error; end
|
14
|
+
# Error handling a Tree object.
|
8
15
|
class TreeError < Error; end
|
16
|
+
# Error in the data of an Individual object.
|
9
17
|
class IndividualError < Error; end
|
10
18
|
end
|
data/lib/nimbus/forest.rb
CHANGED
@@ -1,9 +1,14 @@
|
|
1
1
|
module Nimbus
|
2
|
-
|
2
|
+
|
3
|
+
#####################################################################
|
4
|
+
# Forest represents the Random forest being generated
|
5
|
+
# (or used to test samples) by the application object.
|
6
|
+
#
|
3
7
|
class Forest
|
4
8
|
attr_accessor :size, :trees, :bag, :predictions, :tree_errors, :snp_importances
|
5
9
|
attr_accessor :options
|
6
10
|
|
11
|
+
# Initialize Forest object with options included in the Nimbus::Configuration object received.
|
7
12
|
def initialize(config)
|
8
13
|
@trees = []
|
9
14
|
@tree_errors = []
|
@@ -16,6 +21,18 @@ module Nimbus
|
|
16
21
|
raise Nimbus::ForestError, "Forest size parameter (#{@size}) is invalid. You need at least one tree." if @size < 1
|
17
22
|
end
|
18
23
|
|
24
|
+
# Creates a random forest based on the TrainingSet included in the configuration, creating N random trees (size N defined in the configuration).
|
25
|
+
#
|
26
|
+
# This is the method called when the application's configuration flags training on.
|
27
|
+
#
|
28
|
+
# It performs these tasks:
|
29
|
+
#
|
30
|
+
# * grow the forest (all the N random trees)
|
31
|
+
# * store generalization errors for every tree
|
32
|
+
# * obtain averaged importances for all the SNPs
|
33
|
+
# * calculate averaged predictions for all individuals in the training sample
|
34
|
+
#
|
35
|
+
# Every tree of the forest is created with a different random sample of the individuals in the training set.
|
19
36
|
def grow
|
20
37
|
@size.times do |i|
|
21
38
|
Nimbus.write("Creating trees: #{i+1}/#{@size} ")
|
@@ -32,6 +49,9 @@ module Nimbus
|
|
32
49
|
average_predictions
|
33
50
|
end
|
34
51
|
|
52
|
+
# Traverse a testing set through every tree of the forest and get averaged predictions for every individual in the sample.
|
53
|
+
#
|
54
|
+
# This is the method called when the application's configuration flags testing on.
|
35
55
|
def traverse
|
36
56
|
@predictions = {}
|
37
57
|
prediction_count = trees.size
|
@@ -44,6 +64,7 @@ module Nimbus
|
|
44
64
|
}
|
45
65
|
end
|
46
66
|
|
67
|
+
# Returns the array containing every tree in the forest, in YAML format.
|
47
68
|
def to_yaml
|
48
69
|
@trees.to_yaml
|
49
70
|
end
|
data/lib/nimbus/individual.rb
CHANGED
@@ -1,8 +1,20 @@
|
|
1
1
|
module Nimbus
|
2
|
-
|
2
|
+
#####################################################################
|
3
|
+
# Nimbus Individual object.
|
4
|
+
#
|
5
|
+
# It represents a single individual of a training or testing sample.
|
6
|
+
#
|
7
|
+
# This class stores information about an individual:
|
8
|
+
#
|
9
|
+
# * id,
|
10
|
+
# * values for all the SNPs of the individual,
|
11
|
+
# * fenotype if present,
|
12
|
+
# * the prediction if it exists.
|
13
|
+
#
|
3
14
|
class Individual
|
4
15
|
attr_accessor :id, :fenotype, :prediction, :snp_list
|
5
16
|
|
17
|
+
# Initialize individual with passed data.
|
6
18
|
def initialize(i, fen, snps=[])
|
7
19
|
self.id = i
|
8
20
|
self.fenotype = fen
|
@@ -1,21 +1,36 @@
|
|
1
|
+
# encoding: utf-8
|
1
2
|
module Nimbus
|
3
|
+
|
4
|
+
#####################################################################
|
5
|
+
# Math functions.
|
6
|
+
#
|
7
|
+
# The LossFunctions class provides handy mathematical functions as class methods
|
8
|
+
# to be used by Tree and Forest when estimating predictions, errors and loss functions
|
9
|
+
# for training and testing data.
|
10
|
+
#
|
2
11
|
module LossFunctions
|
3
12
|
|
4
13
|
class << self
|
5
|
-
|
14
|
+
|
15
|
+
# Simple average: sum(n) / n
|
6
16
|
def average(ids, value_table)
|
7
17
|
ids.inject(0.0){|sum, i| sum + value_table[i]} / ids.size
|
8
18
|
end
|
9
19
|
|
20
|
+
# Mean squared error (not averaged here, i.e. the sum of squared errors): sum (x-y)^2
|
10
21
|
def mean_squared_error(ids, value_table, mean = nil)
|
11
22
|
mean ||= self.average ids, value_table
|
12
23
|
ids.inject(0.0){|sum, i| sum + ((value_table[i] - mean)**2) }
|
13
24
|
end
|
14
25
|
|
26
|
+
# Quadratic loss: averaged mean squared error: sum (x-y)^2 / n
|
27
|
+
#
|
28
|
+
# Default loss function for regression forests.
|
15
29
|
def quadratic_loss(ids, value_table, mean = nil)
|
16
30
|
self.mean_squared_error(ids, value_table, mean) / ids.size
|
17
31
|
end
|
18
32
|
|
33
|
+
# Difference between two values, squared. (x-y)^2
|
19
34
|
def squared_difference(x,y)
|
20
35
|
0.0 + (x-y)**2
|
21
36
|
end
|
data/lib/nimbus/training_set.rb
CHANGED
@@ -1,13 +1,19 @@
|
|
1
1
|
module Nimbus
|
2
|
-
|
2
|
+
#####################################################################
|
3
|
+
# Set of individuals to be used as training sample for a random forest.
|
4
|
+
#
|
5
|
+
# The TrainingSet class stores an array of individuals, and a hash with the fenotypes of every individual indexed by id.
|
6
|
+
#
|
3
7
|
class TrainingSet
|
4
8
|
attr_accessor :individuals, :ids_fenotypes
|
5
9
|
|
10
|
+
# Initialize a new training set with the individuals and fenotype info received.
|
6
11
|
def initialize(individuals, ids_fenotypes)
|
7
12
|
@individuals = individuals
|
8
13
|
@ids_fenotypes = ids_fenotypes
|
9
14
|
end
|
10
15
|
|
16
|
+
# Array of all the ids of the individuals in this training sample.
|
11
17
|
def all_ids
|
12
18
|
@all_ids ||= @ids_fenotypes.keys
|
13
19
|
@all_ids
|
data/lib/nimbus/tree.rb
CHANGED
@@ -1,15 +1,34 @@
|
|
1
1
|
module Nimbus
|
2
|
-
|
2
|
+
|
3
|
+
#####################################################################
|
4
|
+
# Tree object representing a random tree.
|
5
|
+
#
|
6
|
+
# A tree is generated following these steps:
|
7
|
+
#
|
8
|
+
# * 1: Calculate loss function for the individuals in the node (first node contains all the individuals).
|
9
|
+
# * 2: Take a random sample of the SNPs (size m << total count of SNPs)
|
10
|
+
# * 3: Compute the loss function for the split of the sample based on value of every SNP.
|
11
|
+
# * 4: If the SNP with minimum loss function also minimizes the general loss of the node, split the individuals sample in three nodes, based on value for that SNP [0, 1, or 2]
|
12
|
+
# * 5: Repeat from 1 for every node until:
|
13
|
+
# - a) The individuals count in that node is < minimum size OR
|
14
|
+
# - b) None of the SNP splits has a loss function smaller than the node loss function
|
15
|
+
# * 6: When a node stops, label the node with the average fenotype value of the individuals in the node.
|
16
|
+
#
|
3
17
|
class Tree
|
4
18
|
attr_accessor :snp_sample_size, :snp_total_count, :node_min_size, :used_snps, :structure, :generalization_error, :predictions, :importances
|
5
19
|
attr_accessor :individuals, :id_to_fenotype
|
6
20
|
|
21
|
+
# Initialize Tree object with the configuration (as in Nimbus::Configuration.tree) options received.
|
7
22
|
def initialize(options)
|
8
23
|
@snp_total_count = options[:snp_total_count]
|
9
24
|
@snp_sample_size = options[:snp_sample_size]
|
10
25
|
@node_min_size = options[:tree_node_min_size]
|
11
26
|
end
|
12
27
|
|
28
|
+
# Creates the structure of the tree, as a hash of SNP splits and values.
|
29
|
+
#
|
30
|
+
# It just initializes the needed variables and then defines the first node of the tree.
|
31
|
+
# The rest of the structure of the tree is computed recursively building every node calling +build_node+.
|
13
32
|
def seed(all_individuals, individuals_sample, ids_fenotypes)
|
14
33
|
@individuals = all_individuals
|
15
34
|
@id_to_fenotype = ids_fenotypes
|
@@ -19,6 +38,11 @@ module Nimbus
|
|
19
38
|
@structure = build_node individuals_sample, Nimbus::LossFunctions.average(individuals_sample, @id_to_fenotype)
|
20
39
|
end
|
21
40
|
|
41
|
+
# Creates a node by taking a random sample of the SNPs and computing the loss function for every split by SNP of that sample.
|
42
|
+
#
|
43
|
+
# * If SNP_min is the SNP with the smallest loss function and it is < the loss function of the node, it splits the individuals sample in three:
|
44
|
+
# (those with value 0 for the SNP_min, those with value 1 for the SNP_min, and those with value 2 for the SNP_min) then it builds these 3 new nodes.
|
45
|
+
# * Otherwise every individual in the node gets labeled with the average of the fenotype values of all of them.
|
22
46
|
def build_node(individuals_ids, y_hat)
|
23
47
|
# General loss function value for the node
|
24
48
|
individuals_count = individuals_ids.size
|
@@ -45,16 +69,13 @@ module Nimbus
|
|
45
69
|
return build_branch(min_SNP, split, means, y_hat) if min_loss < node_loss_function
|
46
70
|
return label_node(y_hat, individuals_ids)
|
47
71
|
end
|
48
|
-
|
49
|
-
def build_branch(snp, split, y_hats, parent_y_hat)
|
50
|
-
node_0 = split[0].size == 0 ? label_node(parent_y_hat, []) : build_node(split[0], y_hats[0])
|
51
|
-
node_1 = split[1].size == 0 ? label_node(parent_y_hat, []) : build_node(split[1], y_hats[1])
|
52
|
-
node_2 = split[2].size == 0 ? label_node(parent_y_hat, []) : build_node(split[2], y_hats[2])
|
53
|
-
|
54
|
-
split_by_snp(snp)
|
55
|
-
return { snp => [node_0, node_1, node_2] }
|
56
|
-
end
|
57
72
|
|
73
|
+
# Compute generalization error for the tree.
|
74
|
+
#
|
75
|
+
# Traversing the 'out of bag' (OOB) sample (those individuals of the training set not
|
76
|
+
# used in the building of this tree) through the tree, and comparing
|
77
|
+
# the prediction with the real fenotype of the individual (and then averaging) makes it
|
78
|
+
# possible to calculate the unbiased generalization error for the tree.
|
58
79
|
def generalization_error_from_oob(oob_ids)
|
59
80
|
return nil if (@structure.nil? || @individuals.nil? || @id_to_fenotype.nil?)
|
60
81
|
oob_errors = {}
|
@@ -65,6 +86,15 @@ module Nimbus
|
|
65
86
|
@generalization_error = Nimbus::LossFunctions.average oob_ids, oob_errors
|
66
87
|
end
|
67
88
|
|
89
|
+
# Estimation of importance for every SNP.
|
90
|
+
#
|
91
|
+
# The importance of any SNP in the tree is calculated using the OOB sample.
|
92
|
+
# For every SNP, every individual in the sample is pushed down the tree but with the
|
93
|
+
# value of that SNP permuted with another individual in the sample.
|
94
|
+
#
|
95
|
+
# That way the difference between the regular prediction and the prediction with the SNP value modified can be estimated for any given SNP.
|
96
|
+
#
|
97
|
+
# This method computes importance estimations for every SNP used in the tree (for any other SNP it would be 0).
|
68
98
|
def estimate_importances(oob_ids)
|
69
99
|
return nil if (@generalization_error.nil? && generalization_error_from_oob(oob_ids))
|
70
100
|
oob_individuals_count = oob_ids.size
|
@@ -81,25 +111,30 @@ module Nimbus
|
|
81
111
|
@importances
|
82
112
|
end
|
83
113
|
|
114
|
+
# Class method to traverse a single individual through a tree structure.
|
115
|
+
#
|
116
|
+
# Returns the prediction for that individual (the label of the final node reached by the individual).
|
84
117
|
def self.traverse(tree_structure, data)
|
85
118
|
return tree_structure if tree_structure.is_a? Numeric
|
86
119
|
raise Nimbus::TreeError, "Forest data has invalid structure. Please check your forest data (file)." if !(tree_structure.is_a?(Hash) && tree_structure.keys.size == 1)
|
87
120
|
return self.traverse( tree_structure.values.first[ data[tree_structure.keys.first - 1].to_i], data)
|
88
121
|
end
|
89
122
|
|
90
|
-
def traverse_with_permutation(tree_structure, data, snp_to_permute, individual_to_permute)
|
91
|
-
return tree_structure if tree_structure.is_a? Numeric
|
92
|
-
individual_data = (tree_structure.keys.first == snp_to_permute ? individual_to_permute : data)
|
93
|
-
return traverse_with_permutation( tree_structure.values.first[ individual_data[tree_structure.keys.first - 1].to_i], data, snp_to_permute, individual_to_permute)
|
94
|
-
end
|
95
|
-
|
96
|
-
|
97
123
|
private
|
98
124
|
|
99
125
|
def snps_random_sample
|
100
126
|
(1..@snp_total_count).to_a.sample(@snp_sample_size).sort
|
101
127
|
end
|
102
128
|
|
129
|
+
def build_branch(snp, split, y_hats, parent_y_hat)
|
130
|
+
node_0 = split[0].size == 0 ? label_node(parent_y_hat, []) : build_node(split[0], y_hats[0])
|
131
|
+
node_1 = split[1].size == 0 ? label_node(parent_y_hat, []) : build_node(split[1], y_hats[1])
|
132
|
+
node_2 = split[2].size == 0 ? label_node(parent_y_hat, []) : build_node(split[2], y_hats[2])
|
133
|
+
|
134
|
+
split_by_snp(snp)
|
135
|
+
return { snp => [node_0, node_1, node_2] }
|
136
|
+
end
|
137
|
+
|
103
138
|
def label_node(value, ids)
|
104
139
|
label = value.round(5)
|
105
140
|
ids.uniq.each{|i| @predictions[i] = label}
|
@@ -120,6 +155,12 @@ module Nimbus
|
|
120
155
|
@used_snps << x
|
121
156
|
end
|
122
157
|
|
158
|
+
def traverse_with_permutation(tree_structure, data, snp_to_permute, individual_to_permute)
|
159
|
+
return tree_structure if tree_structure.is_a? Numeric
|
160
|
+
individual_data = (tree_structure.keys.first == snp_to_permute ? individual_to_permute : data)
|
161
|
+
return traverse_with_permutation( tree_structure.values.first[ individual_data[tree_structure.keys.first - 1].to_i], data, snp_to_permute, individual_to_permute)
|
162
|
+
end
|
163
|
+
|
123
164
|
end
|
124
165
|
|
125
166
|
end
|