nimbus 1.0.1 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +149 -0
- data/lib/nimbus.rb +15 -11
- data/lib/nimbus/application.rb +20 -23
- data/lib/nimbus/classification_tree.rb +111 -0
- data/lib/nimbus/configuration.rb +52 -37
- data/lib/nimbus/forest.rb +56 -20
- data/lib/nimbus/individual.rb +7 -7
- data/lib/nimbus/loss_functions.rb +44 -10
- data/lib/nimbus/regression_tree.rb +103 -0
- data/lib/nimbus/training_set.rb +4 -4
- data/lib/nimbus/tree.rb +20 -83
- data/lib/nimbus/version.rb +3 -0
- data/spec/classification_tree_spec.rb +132 -0
- data/spec/configuration_spec.rb +46 -19
- data/spec/fixtures/classification_config.yml +13 -0
- data/spec/fixtures/classification_random_forest.yml +922 -0
- data/spec/fixtures/classification_testing.data +500 -0
- data/spec/fixtures/classification_training.data +1000 -0
- data/spec/forest_spec.rb +109 -50
- data/spec/individual_spec.rb +2 -2
- data/spec/loss_functions_spec.rb +71 -0
- data/spec/nimbus_spec.rb +4 -4
- data/spec/regression_tree_spec.rb +129 -0
- data/spec/training_set_spec.rb +5 -5
- data/spec/tree_spec.rb +4 -115
- metadata +53 -45
- data/spec/fixtures/regression_snp_importances.txt +0 -200
- data/spec/fixtures/regression_testing_file_predictions.txt +0 -200
- data/spec/fixtures/regression_training_file_predictions.txt +0 -758
@@ -0,0 +1,103 @@
|
|
1
|
+
module Nimbus
|
2
|
+
|
3
|
+
#####################################################################
|
4
|
+
# Tree object representing a random regression tree.
|
5
|
+
#
|
6
|
+
# A tree is generated following these steps:
|
7
|
+
#
|
8
|
+
# * 1: Calculate loss function for the individuals in the node (first node contains all the individuals).
|
9
|
+
# * 2: Take a random sample of the SNPs (size m << total count of SNPs)
|
10
|
+
# * 3: Compute the loss function (quadratic loss) for the split of the sample based on value of every SNP.
|
11
|
+
# * 4: If the SNP with minimum loss function also minimizes the general loss of the node, split the individuals sample in three nodes, based on value for that SNP [0, 1, or 2]
|
12
|
+
# * 5: Repeat from 1 for every node until:
|
13
|
+
# - a) The individuals count in that node is < minimum size OR
|
14
|
+
# - b) None of the SNP splits has a loss function smaller than the node loss function
|
15
|
+
# * 6) When a node stops, label the node with the average fenotype value of the individuals in the node.
|
16
|
+
#
|
17
|
+
class RegressionTree < Tree
|
18
|
+
|
19
|
+
# Creates the structure of the tree, as a hash of SNP splits and values.
|
20
|
+
#
|
21
|
+
# It just initializes the needed variables and then defines the first node of the tree.
|
22
|
+
# The rest of the structure of the tree is computed recursively building every node calling +build_node+.
|
23
|
+
def seed(all_individuals, individuals_sample, ids_fenotypes)
|
24
|
+
super
|
25
|
+
@structure = build_node individuals_sample, Nimbus::LossFunctions.average(individuals_sample, @id_to_fenotype)
|
26
|
+
end
|
27
|
+
|
28
|
+
# Creates a node by taking a random sample of the SNPs and computing the loss function for every split by SNP of that sample.
|
29
|
+
#
|
30
|
+
# * If SNP_min is the SNP with the smallest loss function and it is < the loss function of the node, it splits the individuals sample in three:
|
31
|
+
# (those with value 0 for the SNP_min, those with value 1 for the SNP_min, and those with value 2 for the SNP_min) then it builds these 3 new nodes.
|
32
|
+
# * Otherwise every individual in the node gets labeled with the average of the fenotype values of all of them.
|
33
|
+
def build_node(individuals_ids, y_hat)
|
34
|
+
# General loss function value for the node
|
35
|
+
individuals_count = individuals_ids.size
|
36
|
+
return label_node(y_hat, individuals_ids) if individuals_count < @node_min_size
|
37
|
+
node_loss_function = Nimbus::LossFunctions.quadratic_loss individuals_ids, @id_to_fenotype, y_hat
|
38
|
+
|
39
|
+
# Finding the SNP that minimizes loss function
|
40
|
+
snps = snps_random_sample
|
41
|
+
min_loss, min_SNP, split, means = node_loss_function, nil, nil, nil
|
42
|
+
|
43
|
+
snps.each do |snp|
|
44
|
+
individuals_split_by_snp_value = split_by_snp_value individuals_ids, snp
|
45
|
+
mean_0 = Nimbus::LossFunctions.average individuals_split_by_snp_value[0], @id_to_fenotype
|
46
|
+
mean_1 = Nimbus::LossFunctions.average individuals_split_by_snp_value[1], @id_to_fenotype
|
47
|
+
mean_2 = Nimbus::LossFunctions.average individuals_split_by_snp_value[2], @id_to_fenotype
|
48
|
+
loss_0 = Nimbus::LossFunctions.mean_squared_error individuals_split_by_snp_value[0], @id_to_fenotype, mean_0
|
49
|
+
loss_1 = Nimbus::LossFunctions.mean_squared_error individuals_split_by_snp_value[1], @id_to_fenotype, mean_1
|
50
|
+
loss_2 = Nimbus::LossFunctions.mean_squared_error individuals_split_by_snp_value[2], @id_to_fenotype, mean_2
|
51
|
+
loss_snp = (loss_0 + loss_1 + loss_2) / individuals_count
|
52
|
+
|
53
|
+
min_loss, min_SNP, split, means = loss_snp, snp, individuals_split_by_snp_value, [mean_0, mean_1, mean_2] if loss_snp < min_loss
|
54
|
+
end
|
55
|
+
|
56
|
+
return build_branch(min_SNP, split, means, y_hat) if min_loss < node_loss_function
|
57
|
+
return label_node(y_hat, individuals_ids)
|
58
|
+
end
|
59
|
+
|
60
|
+
# Compute generalization error for the tree.
|
61
|
+
#
|
62
|
+
# Traversing the 'out of bag' (OOB) sample (those individuals of the training set not
|
63
|
+
# used in the building of this tree) through the tree, and comparing
|
64
|
+
# the prediction with the real fenotype of the individual (and then averaging), it is
|
65
|
+
# possible to calculate the unbiased generalization error for the tree.
|
66
|
+
def generalization_error_from_oob(oob_ids)
|
67
|
+
return nil if (@structure.nil? || @individuals.nil? || @id_to_fenotype.nil?)
|
68
|
+
oob_errors = {}
|
69
|
+
oob_ids.each do |oobi|
|
70
|
+
oob_prediction = Tree.traverse @structure, individuals[oobi].snp_list
|
71
|
+
oob_errors[oobi] = Nimbus::LossFunctions.squared_difference oob_prediction, @id_to_fenotype[oobi]
|
72
|
+
end
|
73
|
+
@generalization_error = Nimbus::LossFunctions.average oob_ids, oob_errors
|
74
|
+
end
|
75
|
+
|
76
|
+
# Estimation of importance for every SNP.
|
77
|
+
#
|
78
|
+
# The importance of any SNP in the tree is calculated using the OOB sample.
|
79
|
+
# For every SNP, every individual in the sample is pushed down the tree but with the
|
80
|
+
# value of that SNP permuted with another individual in the sample.
|
81
|
+
#
|
82
|
+
# That way the difference between the regular prediction and the prediction with the SNP value modified can be estimated for any given SNP.
|
83
|
+
#
|
84
|
+
# This method computes importance estimations for every SNP used in the tree (for any other SNP it would be 0).
|
85
|
+
def estimate_importances(oob_ids)
|
86
|
+
return nil if (@generalization_error.nil? && generalization_error_from_oob(oob_ids).nil?)
|
87
|
+
oob_individuals_count = oob_ids.size
|
88
|
+
@importances = {}
|
89
|
+
@used_snps.uniq.each do |current_snp|
|
90
|
+
shuffled_ids = oob_ids.shuffle
|
91
|
+
permutated_snp_error = 0.0
|
92
|
+
oob_ids.each_with_index {|oobi, index|
|
93
|
+
permutated_prediction = traverse_with_permutation @structure, individuals[oobi].snp_list, current_snp, individuals[shuffled_ids[index]].snp_list
|
94
|
+
permutated_snp_error += Nimbus::LossFunctions.squared_difference @id_to_fenotype[oobi], permutated_prediction
|
95
|
+
}
|
96
|
+
@importances[current_snp] = ((permutated_snp_error / oob_individuals_count) - @generalization_error).round(5)
|
97
|
+
end
|
98
|
+
@importances
|
99
|
+
end
|
100
|
+
|
101
|
+
end
|
102
|
+
|
103
|
+
end
|
data/lib/nimbus/training_set.rb
CHANGED
@@ -1,23 +1,23 @@
|
|
1
1
|
module Nimbus
|
2
2
|
#####################################################################
|
3
3
|
# Set of individuals to be used as training sample for a random forest.
|
4
|
-
#
|
4
|
+
#
|
5
5
|
# the TrainingSet class stores an array of individuals, and a hash with the fenotypes of every individual indexed by id.
|
6
6
|
#
|
7
7
|
class TrainingSet
|
8
8
|
attr_accessor :individuals, :ids_fenotypes
|
9
|
-
|
9
|
+
|
10
10
|
# Initialize a new training set with the individuals and fenotype info received.
|
11
11
|
def initialize(individuals, ids_fenotypes)
|
12
12
|
@individuals = individuals
|
13
13
|
@ids_fenotypes = ids_fenotypes
|
14
14
|
end
|
15
|
-
|
15
|
+
|
16
16
|
# Array of all the ids of the individuals in this training sample.
|
17
17
|
def all_ids
|
18
18
|
@all_ids ||= @ids_fenotypes.keys
|
19
19
|
@all_ids
|
20
20
|
end
|
21
21
|
end
|
22
|
-
|
22
|
+
|
23
23
|
end
|
data/lib/nimbus/tree.rb
CHANGED
@@ -6,25 +6,25 @@ module Nimbus
|
|
6
6
|
# A tree is generated following these steps:
|
7
7
|
#
|
8
8
|
# * 1: Calculate loss function for the individuals in the node (first node contains all the individuals).
|
9
|
-
# * 2: Take a random sample of the SNPs (size m << total count of SNPs)
|
9
|
+
# * 2: Take a random sample of the SNPs (size m << total count of SNPs)
|
10
10
|
# * 3: Compute the loss function for the split of the sample based on value of every SNP.
|
11
11
|
# * 4: If the SNP with minimum loss function also minimizes the general loss of the node, split the individuals sample in three nodes, based on value for that SNP [0, 1, or 2]
|
12
12
|
# * 5: Repeat from 1 for every node until:
|
13
13
|
# - a) The individuals count in that node is < minimum size OR
|
14
14
|
# - b) None of the SNP splits has a loss function smaller than the node loss function
|
15
|
-
# * 6) When a node stops, label the node with the average fenotype value of the individuals in the node.
|
15
|
+
# * 6) When a node stops, label the node with the average fenotype value (for regression problems) or the majority class (for classification problems) of the individuals in the node.
|
16
16
|
#
|
17
17
|
class Tree
|
18
18
|
attr_accessor :snp_sample_size, :snp_total_count, :node_min_size, :used_snps, :structure, :generalization_error, :predictions, :importances
|
19
19
|
attr_accessor :individuals, :id_to_fenotype
|
20
|
-
|
20
|
+
|
21
21
|
# Initialize Tree object with the configuration (as in Nimbus::Configuration.tree) options received.
|
22
22
|
def initialize(options)
|
23
23
|
@snp_total_count = options[:snp_total_count]
|
24
24
|
@snp_sample_size = options[:snp_sample_size]
|
25
25
|
@node_min_size = options[:tree_node_min_size]
|
26
26
|
end
|
27
|
-
|
27
|
+
|
28
28
|
# Creates the structure of the tree, as a hash of SNP splits and values.
|
29
29
|
#
|
30
30
|
# It just initializes the needed variables and then defines the first node of the tree.
|
@@ -34,113 +34,50 @@ module Nimbus
|
|
34
34
|
@id_to_fenotype = ids_fenotypes
|
35
35
|
@predictions = {}
|
36
36
|
@used_snps = []
|
37
|
-
|
38
|
-
@structure = build_node individuals_sample, Nimbus::LossFunctions.average(individuals_sample, @id_to_fenotype)
|
39
37
|
end
|
40
38
|
|
41
39
|
# Creates a node by taking a random sample of the SNPs and computing the loss function for every split by SNP of that sample.
|
42
|
-
#
|
43
|
-
# * If SNP_min is the SNP with smaller loss function and it is < the loss function of the node, it splits the individuals sample in three:
|
44
|
-
# (those with value 0 for the SNP_min, those with value 1 for the SNP_min, and those with value 2 for the SNP_min) then it builds these 3 new nodes.
|
45
|
-
# * Otherwise every individual in the node gets labeled with the average of the fenotype values of all of them.
|
46
40
|
def build_node(individuals_ids, y_hat)
|
47
|
-
# General loss function value for the node
|
48
|
-
individuals_count = individuals_ids.size
|
49
|
-
return label_node(y_hat, individuals_ids) if individuals_count < @node_min_size
|
50
|
-
node_loss_function = Nimbus::LossFunctions.quadratic_loss individuals_ids, @id_to_fenotype, y_hat
|
51
|
-
|
52
|
-
# Finding the SNP that minimizes loss function
|
53
|
-
snps = snps_random_sample
|
54
|
-
min_loss, min_SNP, split, means = node_loss_function, nil, nil, nil
|
55
|
-
|
56
|
-
snps.each do |snp|
|
57
|
-
individuals_split_by_snp_value = split_by_snp_value individuals_ids, snp
|
58
|
-
mean_0 = Nimbus::LossFunctions.average individuals_split_by_snp_value[0], @id_to_fenotype
|
59
|
-
mean_1 = Nimbus::LossFunctions.average individuals_split_by_snp_value[1], @id_to_fenotype
|
60
|
-
mean_2 = Nimbus::LossFunctions.average individuals_split_by_snp_value[2], @id_to_fenotype
|
61
|
-
loss_0 = Nimbus::LossFunctions.mean_squared_error individuals_split_by_snp_value[0], @id_to_fenotype, mean_0
|
62
|
-
loss_1 = Nimbus::LossFunctions.mean_squared_error individuals_split_by_snp_value[1], @id_to_fenotype, mean_1
|
63
|
-
loss_2 = Nimbus::LossFunctions.mean_squared_error individuals_split_by_snp_value[2], @id_to_fenotype, mean_2
|
64
|
-
loss_snp = (loss_0 + loss_1 + loss_2) / individuals_count
|
65
|
-
|
66
|
-
min_loss, min_SNP, split, means = loss_snp, snp, individuals_split_by_snp_value, [mean_0, mean_1, mean_2] if loss_snp < min_loss
|
67
|
-
end
|
68
|
-
|
69
|
-
return build_branch(min_SNP, split, means, y_hat) if min_loss < node_loss_function
|
70
|
-
return label_node(y_hat, individuals_ids)
|
71
41
|
end
|
72
42
|
|
73
43
|
# Compute generalization error for the tree.
|
74
|
-
#
|
75
|
-
# Traversing the 'out of bag' (OOB) sample (those individuals of the training set not
|
76
|
-
# used in the building of this tree) through the tree, and comparing
|
77
|
-
# the prediction with the real fenotype of the individual (and then averaging) is
|
78
|
-
# possible to calculate the unbiased generalization error for the tree.
|
79
44
|
def generalization_error_from_oob(oob_ids)
|
80
|
-
return nil if (@structure.nil? || @individuals.nil? || @id_to_fenotype.nil?)
|
81
|
-
oob_errors = {}
|
82
|
-
oob_ids.each do |oobi|
|
83
|
-
oob_prediction = Tree.traverse @structure, individuals[oobi].snp_list
|
84
|
-
oob_errors[oobi] = Nimbus::LossFunctions.squared_difference oob_prediction, @id_to_fenotype[oobi]
|
85
|
-
end
|
86
|
-
@generalization_error = Nimbus::LossFunctions.average oob_ids, oob_errors
|
87
45
|
end
|
88
46
|
|
89
47
|
# Estimation of importance for every SNP.
|
90
|
-
#
|
91
|
-
# The importance of any SNP in the tree is calculated using the OOB sample.
|
92
|
-
# For every SNP, every individual in the sample is pushed down the tree but with the
|
93
|
-
# value of that SNP permuted with other individual in the sample.
|
94
|
-
#
|
95
|
-
# That way the difference between the regular prediction and the prediction with the SNP value modified can be estimated for any given SNP.
|
96
|
-
#
|
97
|
-
# This method computes importance estimations for every SNPs used in the tree (for any other SNP it would be 0).
|
98
48
|
def estimate_importances(oob_ids)
|
99
|
-
return nil if (@generalization_error.nil? && generalization_error_from_oob(oob_ids).nil?)
|
100
|
-
oob_individuals_count = oob_ids.size
|
101
|
-
@importances = {}
|
102
|
-
@used_snps.uniq.each do |current_snp|
|
103
|
-
shuffled_ids = oob_ids.shuffle
|
104
|
-
permutated_snp_error = 0.0
|
105
|
-
oob_ids.each_with_index {|oobi, index|
|
106
|
-
permutated_prediction = traverse_with_permutation @structure, individuals[oobi].snp_list, current_snp, individuals[shuffled_ids[index]].snp_list
|
107
|
-
permutated_snp_error += Nimbus::LossFunctions.squared_difference @id_to_fenotype[oobi], permutated_prediction
|
108
|
-
}
|
109
|
-
@importances[current_snp] = ((permutated_snp_error / oob_individuals_count) - @generalization_error).round(5)
|
110
|
-
end
|
111
|
-
@importances
|
112
49
|
end
|
113
|
-
|
114
|
-
# Class method to traverse a single individual through a tree structure.
|
50
|
+
|
51
|
+
# Class method to traverse a single individual through a tree structure.
|
115
52
|
#
|
116
53
|
# Returns the prediction for that individual (the label of the final node reached by the individual).
|
117
54
|
def self.traverse(tree_structure, data)
|
118
|
-
return tree_structure if tree_structure.is_a?
|
55
|
+
return tree_structure if tree_structure.is_a?(Numeric) || tree_structure.is_a?(String)
|
119
56
|
raise Nimbus::TreeError, "Forest data has invalid structure. Please check your forest data (file)." if !(tree_structure.is_a?(Hash) && tree_structure.keys.size == 1)
|
120
57
|
return self.traverse( tree_structure.values.first[ data[tree_structure.keys.first - 1].to_i], data)
|
121
58
|
end
|
122
|
-
|
123
|
-
|
124
|
-
|
59
|
+
|
60
|
+
protected
|
61
|
+
|
125
62
|
def snps_random_sample
|
126
63
|
(1..@snp_total_count).to_a.sample(@snp_sample_size).sort
|
127
64
|
end
|
128
|
-
|
65
|
+
|
129
66
|
def build_branch(snp, split, y_hats, parent_y_hat)
|
130
67
|
node_0 = split[0].size == 0 ? label_node(parent_y_hat, []) : build_node(split[0], y_hats[0])
|
131
68
|
node_1 = split[1].size == 0 ? label_node(parent_y_hat, []) : build_node(split[1], y_hats[1])
|
132
69
|
node_2 = split[2].size == 0 ? label_node(parent_y_hat, []) : build_node(split[2], y_hats[2])
|
133
|
-
|
70
|
+
|
134
71
|
split_by_snp(snp)
|
135
72
|
return { snp => [node_0, node_1, node_2] }
|
136
73
|
end
|
137
|
-
|
74
|
+
|
138
75
|
def label_node(value, ids)
|
139
|
-
label = value.round(5)
|
76
|
+
label = value.is_a?(String) ? value : value.round(5)
|
140
77
|
ids.uniq.each{|i| @predictions[i] = label}
|
141
78
|
label
|
142
79
|
end
|
143
|
-
|
80
|
+
|
144
81
|
def split_by_snp_value(ids, snp)
|
145
82
|
split = [[], [], []]
|
146
83
|
ids.each do |i|
|
@@ -150,17 +87,17 @@ module Nimbus
|
|
150
87
|
rescue => ex
|
151
88
|
raise Nimbus::TreeError, "Values for SNPs columns must be in [0, 1, 2]"
|
152
89
|
end
|
153
|
-
|
90
|
+
|
154
91
|
def split_by_snp(x)
|
155
92
|
@used_snps << x
|
156
93
|
end
|
157
|
-
|
94
|
+
|
158
95
|
def traverse_with_permutation(tree_structure, data, snp_to_permute, individual_to_permute)
|
159
|
-
return tree_structure if tree_structure.is_a?
|
96
|
+
return tree_structure if tree_structure.is_a?(Numeric) || tree_structure.is_a?(String)
|
160
97
|
individual_data = (tree_structure.keys.first == snp_to_permute ? individual_to_permute : data)
|
161
98
|
return traverse_with_permutation( tree_structure.values.first[ individual_data[tree_structure.keys.first - 1].to_i], data, snp_to_permute, individual_to_permute)
|
162
99
|
end
|
163
|
-
|
100
|
+
|
164
101
|
end
|
165
|
-
|
102
|
+
|
166
103
|
end
|
@@ -0,0 +1,132 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
|
3
|
+
describe Nimbus::ClassificationTree do
|
4
|
+
|
5
|
+
before(:each) do
|
6
|
+
@config = Nimbus::Configuration.new
|
7
|
+
@config.load fixture_file('classification_config.yml')
|
8
|
+
|
9
|
+
@tree = Nimbus::ClassificationTree.new @config.tree
|
10
|
+
end
|
11
|
+
|
12
|
+
it "is initialized with tree config info" do
|
13
|
+
@tree.snp_total_count.should == 100
|
14
|
+
@tree.snp_sample_size.should == 33
|
15
|
+
@tree.node_min_size.should == 5
|
16
|
+
@tree.classes.size.should == 2
|
17
|
+
@tree.classes[0].should == '0'
|
18
|
+
@tree.classes[1].should == '1'
|
19
|
+
end
|
20
|
+
|
21
|
+
it "creates a tree structure when seeded with training data" do
|
22
|
+
@config.load_training_data
|
23
|
+
@tree.structure.should be_nil
|
24
|
+
@tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
25
|
+
@tree.structure.should_not be_nil
|
26
|
+
@tree.structure.should be_kind_of Hash
|
27
|
+
|
28
|
+
@tree.structure.keys.first.should == @tree.used_snps.last
|
29
|
+
@tree.used_snps.should_not be_empty
|
30
|
+
end
|
31
|
+
|
32
|
+
it "splits node in three when building a node and finds a suitable split" do
|
33
|
+
@config.load_training_data
|
34
|
+
@tree.stub!(:snps_random_sample).and_return((68..100).to_a) #97 is best split
|
35
|
+
|
36
|
+
@tree.individuals = @config.training_set.individuals
|
37
|
+
@tree.id_to_fenotype = @config.training_set.ids_fenotypes
|
38
|
+
@tree.used_snps = []
|
39
|
+
@tree.predictions = {}
|
40
|
+
|
41
|
+
branch = @tree.build_node @config.training_set.all_ids, Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
|
42
|
+
branch.keys.size.should == 1
|
43
|
+
branch.keys.first.should == 97
|
44
|
+
branch[97].size.should == 3
|
45
|
+
branch[97][0].should be_kind_of Hash
|
46
|
+
branch[97][1].should be_kind_of Hash
|
47
|
+
branch[97][2].should be_kind_of Hash
|
48
|
+
end
|
49
|
+
|
50
|
+
it "keeps track of all SNPs used for the tree" do
|
51
|
+
@config.load_training_data
|
52
|
+
snps = (33..65).to_a
|
53
|
+
@tree.stub!(:snps_random_sample).and_return(snps)
|
54
|
+
@tree.used_snps.should be_nil
|
55
|
+
@tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
56
|
+
@tree.used_snps.size.should > 4
|
57
|
+
@tree.used_snps.each{|snp|
|
58
|
+
snps.include?(snp).should be_true
|
59
|
+
}
|
60
|
+
end
|
61
|
+
|
62
|
+
it "labels node when building a node and there is not a suitable split" do
|
63
|
+
@config.load_training_data
|
64
|
+
@tree.stub!(:snps_random_sample).and_return([33])
|
65
|
+
|
66
|
+
@tree.individuals = @config.training_set.individuals
|
67
|
+
@tree.id_to_fenotype = @config.training_set.ids_fenotypes
|
68
|
+
@tree.used_snps = []
|
69
|
+
@tree.predictions = {}
|
70
|
+
|
71
|
+
branch = @tree.build_node @config.training_set.all_ids, Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
|
72
|
+
branch[33][0].should be_kind_of String
|
73
|
+
branch[33][1].should be_kind_of String
|
74
|
+
branch[33][2].should be_kind_of String
|
75
|
+
end
|
76
|
+
|
77
|
+
it "labels node when building a node with less individuals than the minimum node size" do
|
78
|
+
@config.load_training_data
|
79
|
+
|
80
|
+
@tree.individuals = @config.training_set.individuals
|
81
|
+
@tree.id_to_fenotype = @config.training_set.ids_fenotypes
|
82
|
+
@tree.used_snps = []
|
83
|
+
@tree.predictions = {}
|
84
|
+
|
85
|
+
label = @tree.build_node [1, 10, 33], Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
|
86
|
+
label.should be_kind_of String
|
87
|
+
|
88
|
+
label = @tree.build_node [2, 10], Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
|
89
|
+
label.should be_kind_of String
|
90
|
+
|
91
|
+
label = @tree.build_node [1, 10, 33], Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
|
92
|
+
label.should be_kind_of String
|
93
|
+
|
94
|
+
label = @tree.build_node [99, 22, 10, 33], Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
|
95
|
+
label.should be_kind_of String
|
96
|
+
end
|
97
|
+
|
98
|
+
it 'computes generalization error for the tree' do
|
99
|
+
@config.load_training_data
|
100
|
+
@tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
101
|
+
@tree.generalization_error.should be_nil
|
102
|
+
@tree.generalization_error_from_oob((3..300).to_a)
|
103
|
+
@tree.generalization_error.should be_kind_of Numeric
|
104
|
+
@tree.generalization_error.should > 0.0
|
105
|
+
@tree.generalization_error.should < 1.0
|
106
|
+
end
|
107
|
+
|
108
|
+
it 'estimates importance for all SNPs' do
|
109
|
+
@config.load_training_data
|
110
|
+
@tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
111
|
+
@tree.importances.should be_nil
|
112
|
+
@tree.estimate_importances((200..533).to_a)
|
113
|
+
@tree.importances.should be_kind_of Hash
|
114
|
+
@tree.importances.keys.should_not be_empty
|
115
|
+
(@tree.importances.keys - (1..100).to_a).should be_empty #all keys are snp indexes (100 snps in training file)
|
116
|
+
end
|
117
|
+
|
118
|
+
it 'get prediction for an individual pushing it down a tree structure' do
|
119
|
+
tree_structure = YAML.load(File.open fixture_file('classification_random_forest.yml')).first
|
120
|
+
individual_data = [0]*100
|
121
|
+
prediction = Nimbus::Tree.traverse tree_structure, individual_data
|
122
|
+
prediction.should == '1'
|
123
|
+
|
124
|
+
individual_data[26-1] = 1
|
125
|
+
individual_data[57-1] = 2
|
126
|
+
individual_data[98-1] = 2
|
127
|
+
individual_data[8-1] = 1
|
128
|
+
prediction = Nimbus::Tree.traverse tree_structure, individual_data
|
129
|
+
prediction.should == '0'
|
130
|
+
end
|
131
|
+
|
132
|
+
end
|
data/spec/configuration_spec.rb
CHANGED
@@ -2,30 +2,57 @@
|
|
2
2
|
require File.dirname(__FILE__) + '/spec_helper'
|
3
3
|
|
4
4
|
describe Nimbus::Configuration do
|
5
|
-
|
5
|
+
|
6
6
|
it "loads configuration options from config file" do
|
7
7
|
config = Nimbus::Configuration.new
|
8
8
|
config.load fixture_file('regression_config.yml')
|
9
|
-
|
9
|
+
|
10
10
|
config.training_file.should == fixture_file('regression_training.data')
|
11
11
|
config.testing_file.should == fixture_file('regression_testing.data')
|
12
12
|
config.forest_file.should == fixture_file('regression_random_forest.yml')
|
13
|
-
|
13
|
+
config.classes.should be_nil
|
14
|
+
|
14
15
|
config.forest_size.should == 3
|
15
16
|
config.tree_SNP_sample_size.should == 60
|
16
17
|
config.tree_SNP_total_count.should == 200
|
17
|
-
config.tree_node_min_size.should == 5
|
18
|
+
config.tree_node_min_size.should == 5
|
19
|
+
|
20
|
+
config = Nimbus::Configuration.new
|
21
|
+
config.load fixture_file('classification_config.yml')
|
22
|
+
|
23
|
+
config.training_file.should == fixture_file('classification_training.data')
|
24
|
+
config.testing_file.should == fixture_file('classification_testing.data')
|
25
|
+
config.forest_file.should == fixture_file('classification_random_forest.yml')
|
26
|
+
config.classes.should == ['0','1']
|
27
|
+
|
28
|
+
config.forest_size.should == 3
|
29
|
+
config.tree_SNP_sample_size.should == 33
|
30
|
+
config.tree_SNP_total_count.should == 100
|
31
|
+
config.tree_node_min_size.should == 5
|
32
|
+
end
|
33
|
+
|
34
|
+
it 'tree method return tree-related subset of options for regression trees' do
|
35
|
+
config = Nimbus::Configuration.new
|
36
|
+
config.load fixture_file('regression_config.yml')
|
37
|
+
tree_options = config.tree
|
38
|
+
|
39
|
+
tree_options[:snp_sample_size].should_not be_nil
|
40
|
+
tree_options[:snp_total_count].should_not be_nil
|
41
|
+
tree_options[:tree_node_min_size].should_not be_nil
|
42
|
+
tree_options[:classes].should be_nil
|
18
43
|
end
|
19
|
-
|
20
|
-
it 'tree method return tree-related subset of options' do
|
44
|
+
|
45
|
+
it 'tree method return tree-related subset of options for classification trees' do
|
21
46
|
config = Nimbus::Configuration.new
|
47
|
+
config.load fixture_file('classification_config.yml')
|
22
48
|
tree_options = config.tree
|
23
|
-
|
49
|
+
|
24
50
|
tree_options[:snp_sample_size].should_not be_nil
|
25
51
|
tree_options[:snp_total_count].should_not be_nil
|
26
52
|
tree_options[:tree_node_min_size].should_not be_nil
|
53
|
+
tree_options[:classes].should_not be_nil
|
27
54
|
end
|
28
|
-
|
55
|
+
|
29
56
|
it "creates a training set object from training data file" do
|
30
57
|
config = Nimbus::Configuration.new
|
31
58
|
config.load fixture_file('regression_config.yml')
|
@@ -33,30 +60,30 @@ describe Nimbus::Configuration do
|
|
33
60
|
config.load_training_data
|
34
61
|
config.training_set.should be_kind_of Nimbus::TrainingSet
|
35
62
|
config.training_set.all_ids.sort.should == (1..800).to_a
|
36
|
-
|
63
|
+
|
37
64
|
File.open(fixture_file('regression_training.data')) {|file|
|
38
65
|
feno1, id1, *snp_list_1 = file.readline.split
|
39
66
|
feno2, id2, *snp_list_2 = file.readline.split
|
40
67
|
feno3, id3, *snp_list_3 = file.readline.split
|
41
|
-
|
68
|
+
|
42
69
|
i1 = Nimbus::Individual.new(id1.to_i, feno1.to_f, snp_list_1.map{|snp| snp.to_i})
|
43
70
|
i2 = Nimbus::Individual.new(id2.to_i, feno2.to_f, snp_list_2.map{|snp| snp.to_i})
|
44
71
|
i3 = Nimbus::Individual.new(id3.to_i, feno3.to_f, snp_list_3.map{|snp| snp.to_i})
|
45
|
-
|
72
|
+
|
46
73
|
config.training_set.individuals[id1.to_i].id.should == i1.id
|
47
74
|
config.training_set.individuals[id2.to_i].fenotype.should == i2.fenotype
|
48
75
|
config.training_set.individuals[id3.to_i].snp_list.should == i3.snp_list
|
49
|
-
|
76
|
+
|
50
77
|
config.training_set.ids_fenotypes[id1.to_i] = feno1.to_f
|
51
78
|
config.training_set.ids_fenotypes[id2.to_i] = feno2.to_f
|
52
79
|
config.training_set.ids_fenotypes[id3.to_i] = feno3.to_f
|
53
80
|
}
|
54
81
|
end
|
55
|
-
|
82
|
+
|
56
83
|
it "reads testing data and yields one individual at a time" do
|
57
84
|
config = Nimbus::Configuration.new
|
58
85
|
config.load fixture_file('regression_config.yml')
|
59
|
-
|
86
|
+
|
60
87
|
test_individuals = []
|
61
88
|
File.open(fixture_file('regression_testing.data')) {|file|
|
62
89
|
file.each do |line|
|
@@ -73,20 +100,20 @@ describe Nimbus::Configuration do
|
|
73
100
|
individual.snp_list.should == test_individual.snp_list
|
74
101
|
}
|
75
102
|
end
|
76
|
-
|
103
|
+
|
77
104
|
it "creates a forest object loading data from a yaml file" do
|
78
105
|
config = Nimbus::Configuration.new
|
79
106
|
config.load fixture_file('regression_config.yml')
|
80
|
-
|
107
|
+
|
81
108
|
trees = YAML.load(File.open fixture_file('regression_random_forest.yml'))
|
82
109
|
trees.first.keys.first.should == 189
|
83
110
|
trees.size.should == 3
|
84
|
-
|
111
|
+
|
85
112
|
forest = config.load_forest
|
86
113
|
forest.should be_kind_of Nimbus::Forest
|
87
114
|
forest.trees[0].should == trees.first
|
88
115
|
forest.trees[1].should == trees[1]
|
89
|
-
forest.trees.last.should == trees[2]
|
116
|
+
forest.trees.last.should == trees[2]
|
90
117
|
end
|
91
|
-
|
118
|
+
|
92
119
|
end
|