nimbus 1.0.1 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +149 -0
- data/lib/nimbus.rb +15 -11
- data/lib/nimbus/application.rb +20 -23
- data/lib/nimbus/classification_tree.rb +111 -0
- data/lib/nimbus/configuration.rb +52 -37
- data/lib/nimbus/forest.rb +56 -20
- data/lib/nimbus/individual.rb +7 -7
- data/lib/nimbus/loss_functions.rb +44 -10
- data/lib/nimbus/regression_tree.rb +103 -0
- data/lib/nimbus/training_set.rb +4 -4
- data/lib/nimbus/tree.rb +20 -83
- data/lib/nimbus/version.rb +3 -0
- data/spec/classification_tree_spec.rb +132 -0
- data/spec/configuration_spec.rb +46 -19
- data/spec/fixtures/classification_config.yml +13 -0
- data/spec/fixtures/classification_random_forest.yml +922 -0
- data/spec/fixtures/classification_testing.data +500 -0
- data/spec/fixtures/classification_training.data +1000 -0
- data/spec/forest_spec.rb +109 -50
- data/spec/individual_spec.rb +2 -2
- data/spec/loss_functions_spec.rb +71 -0
- data/spec/nimbus_spec.rb +4 -4
- data/spec/regression_tree_spec.rb +129 -0
- data/spec/training_set_spec.rb +5 -5
- data/spec/tree_spec.rb +4 -115
- metadata +53 -45
- data/spec/fixtures/regression_snp_importances.txt +0 -200
- data/spec/fixtures/regression_testing_file_predictions.txt +0 -200
- data/spec/fixtures/regression_training_file_predictions.txt +0 -758
@@ -0,0 +1,103 @@
|
|
1
|
+
module Nimbus
|
2
|
+
|
3
|
+
#####################################################################
|
4
|
+
# Tree object representing a random regression tree.
|
5
|
+
#
|
6
|
+
# A tree is generated following these steps:
|
7
|
+
#
|
8
|
+
# * 1: Calculate loss function for the individuals in the node (first node contains all the individuals).
|
9
|
+
# * 2: Take a random sample of the SNPs (size m << total count of SNPs)
|
10
|
+
# * 3: Compute the loss function (quadratic loss) for the split of the sample based on value of every SNP.
|
11
|
+
# * 4: If the SNP with minimum loss function also minimizes the general loss of the node, split the individuals sample in three nodes, based on value for that SNP [0, 1, or 2]
|
12
|
+
# * 5: Repeat from 1 for every node until:
|
13
|
+
# - a) The individuals count in that node is < minimum size OR
|
14
|
+
# - b) None of the SNP splits has a loss function smaller than the node loss function
|
15
|
+
# * 6) When a node stops, label the node with the average fenotype value of the individuals in the node.
|
16
|
+
#
|
17
|
+
class RegressionTree < Tree
|
18
|
+
|
19
|
+
# Creates the structure of the tree, as a hash of SNP splits and values.
|
20
|
+
#
|
21
|
+
# It just initializes the needed variables and then defines the first node of the tree.
|
22
|
+
# The rest of the structure of the tree is computed recursively building every node calling +build_node+.
|
23
|
+
def seed(all_individuals, individuals_sample, ids_fenotypes)
|
24
|
+
super
|
25
|
+
@structure = build_node individuals_sample, Nimbus::LossFunctions.average(individuals_sample, @id_to_fenotype)
|
26
|
+
end
|
27
|
+
|
28
|
+
# Creates a node by taking a random sample of the SNPs and computing the loss function for every split by SNP of that sample.
|
29
|
+
#
|
30
|
+
# * If SNP_min is the SNP with the smallest loss function and it is < the loss function of the node, it splits the individuals sample in three:
|
31
|
+
# (those with value 0 for the SNP_min, those with value 1 for the SNP_min, and those with value 2 for the SNP_min) then it builds these 3 new nodes.
|
32
|
+
# * Otherwise every individual in the node gets labeled with the average of the fenotype values of all of them.
|
33
|
+
def build_node(individuals_ids, y_hat)
|
34
|
+
# General loss function value for the node
|
35
|
+
individuals_count = individuals_ids.size
|
36
|
+
return label_node(y_hat, individuals_ids) if individuals_count < @node_min_size
|
37
|
+
node_loss_function = Nimbus::LossFunctions.quadratic_loss individuals_ids, @id_to_fenotype, y_hat
|
38
|
+
|
39
|
+
# Finding the SNP that minimizes loss function
|
40
|
+
snps = snps_random_sample
|
41
|
+
min_loss, min_SNP, split, means = node_loss_function, nil, nil, nil
|
42
|
+
|
43
|
+
snps.each do |snp|
|
44
|
+
individuals_split_by_snp_value = split_by_snp_value individuals_ids, snp
|
45
|
+
mean_0 = Nimbus::LossFunctions.average individuals_split_by_snp_value[0], @id_to_fenotype
|
46
|
+
mean_1 = Nimbus::LossFunctions.average individuals_split_by_snp_value[1], @id_to_fenotype
|
47
|
+
mean_2 = Nimbus::LossFunctions.average individuals_split_by_snp_value[2], @id_to_fenotype
|
48
|
+
loss_0 = Nimbus::LossFunctions.mean_squared_error individuals_split_by_snp_value[0], @id_to_fenotype, mean_0
|
49
|
+
loss_1 = Nimbus::LossFunctions.mean_squared_error individuals_split_by_snp_value[1], @id_to_fenotype, mean_1
|
50
|
+
loss_2 = Nimbus::LossFunctions.mean_squared_error individuals_split_by_snp_value[2], @id_to_fenotype, mean_2
|
51
|
+
loss_snp = (loss_0 + loss_1 + loss_2) / individuals_count
|
52
|
+
|
53
|
+
min_loss, min_SNP, split, means = loss_snp, snp, individuals_split_by_snp_value, [mean_0, mean_1, mean_2] if loss_snp < min_loss
|
54
|
+
end
|
55
|
+
|
56
|
+
return build_branch(min_SNP, split, means, y_hat) if min_loss < node_loss_function
|
57
|
+
return label_node(y_hat, individuals_ids)
|
58
|
+
end
|
59
|
+
|
60
|
+
# Compute generalization error for the tree.
|
61
|
+
#
|
62
|
+
# By traversing the 'out of bag' (OOB) sample (those individuals of the training set not
|
63
|
+
# used in the building of this tree) through the tree, and comparing
|
64
|
+
# the prediction with the real fenotype of the individual (and then averaging), it is
|
65
|
+
# possible to calculate the unbiased generalization error for the tree.
|
66
|
+
def generalization_error_from_oob(oob_ids)
|
67
|
+
return nil if (@structure.nil? || @individuals.nil? || @id_to_fenotype.nil?)
|
68
|
+
oob_errors = {}
|
69
|
+
oob_ids.each do |oobi|
|
70
|
+
oob_prediction = Tree.traverse @structure, individuals[oobi].snp_list
|
71
|
+
oob_errors[oobi] = Nimbus::LossFunctions.squared_difference oob_prediction, @id_to_fenotype[oobi]
|
72
|
+
end
|
73
|
+
@generalization_error = Nimbus::LossFunctions.average oob_ids, oob_errors
|
74
|
+
end
|
75
|
+
|
76
|
+
# Estimation of importance for every SNP.
|
77
|
+
#
|
78
|
+
# The importance of any SNP in the tree is calculated using the OOB sample.
|
79
|
+
# For every SNP, every individual in the sample is pushed down the tree but with the
|
80
|
+
# value of that SNP permuted with that of another individual in the sample.
|
81
|
+
#
|
82
|
+
# That way the difference between the regular prediction and the prediction with the SNP value modified can be estimated for any given SNP.
|
83
|
+
#
|
84
|
+
# This method computes importance estimations for every SNP used in the tree (for any other SNP it would be 0).
|
85
|
+
def estimate_importances(oob_ids)
|
86
|
+
return nil if (@generalization_error.nil? && generalization_error_from_oob(oob_ids).nil?)
|
87
|
+
oob_individuals_count = oob_ids.size
|
88
|
+
@importances = {}
|
89
|
+
@used_snps.uniq.each do |current_snp|
|
90
|
+
shuffled_ids = oob_ids.shuffle
|
91
|
+
permutated_snp_error = 0.0
|
92
|
+
oob_ids.each_with_index {|oobi, index|
|
93
|
+
permutated_prediction = traverse_with_permutation @structure, individuals[oobi].snp_list, current_snp, individuals[shuffled_ids[index]].snp_list
|
94
|
+
permutated_snp_error += Nimbus::LossFunctions.squared_difference @id_to_fenotype[oobi], permutated_prediction
|
95
|
+
}
|
96
|
+
@importances[current_snp] = ((permutated_snp_error / oob_individuals_count) - @generalization_error).round(5)
|
97
|
+
end
|
98
|
+
@importances
|
99
|
+
end
|
100
|
+
|
101
|
+
end
|
102
|
+
|
103
|
+
end
|
data/lib/nimbus/training_set.rb
CHANGED
@@ -1,23 +1,23 @@
|
|
1
1
|
module Nimbus
|
2
2
|
#####################################################################
|
3
3
|
# Set of individuals to be used as training sample for a random forest.
|
4
|
-
#
|
4
|
+
#
|
5
5
|
# The TrainingSet class stores an array of individuals, and a hash with the fenotypes of every individual indexed by id.
|
6
6
|
#
|
7
7
|
class TrainingSet
|
8
8
|
attr_accessor :individuals, :ids_fenotypes
|
9
|
-
|
9
|
+
|
10
10
|
# Initialize a new training set with the individuals and fenotype info received.
|
11
11
|
def initialize(individuals, ids_fenotypes)
|
12
12
|
@individuals = individuals
|
13
13
|
@ids_fenotypes = ids_fenotypes
|
14
14
|
end
|
15
|
-
|
15
|
+
|
16
16
|
# Array of all the ids of the individuals in this training sample.
|
17
17
|
def all_ids
|
18
18
|
@all_ids ||= @ids_fenotypes.keys
|
19
19
|
@all_ids
|
20
20
|
end
|
21
21
|
end
|
22
|
-
|
22
|
+
|
23
23
|
end
|
data/lib/nimbus/tree.rb
CHANGED
@@ -6,25 +6,25 @@ module Nimbus
|
|
6
6
|
# A tree is generated following these steps:
|
7
7
|
#
|
8
8
|
# * 1: Calculate loss function for the individuals in the node (first node contains all the individuals).
|
9
|
-
# * 2: Take a random sample of the SNPs (size m << total count of SNPs)
|
9
|
+
# * 2: Take a random sample of the SNPs (size m << total count of SNPs)
|
10
10
|
# * 3: Compute the loss function for the split of the sample based on value of every SNP.
|
11
11
|
# * 4: If the SNP with minimum loss function also minimizes the general loss of the node, split the individuals sample in three nodes, based on value for that SNP [0, 1, or 2]
|
12
12
|
# * 5: Repeat from 1 for every node until:
|
13
13
|
# - a) The individuals count in that node is < minimum size OR
|
14
14
|
# - b) None of the SNP splits has a loss function smaller than the node loss function
|
15
|
-
# * 6) When a node stops, label the node with the average fenotype value of the individuals in the node.
|
15
|
+
# * 6) When a node stops, label the node with the average fenotype value (for regression problems) or the majority class (for classification problems) of the individuals in the node.
|
16
16
|
#
|
17
17
|
class Tree
|
18
18
|
attr_accessor :snp_sample_size, :snp_total_count, :node_min_size, :used_snps, :structure, :generalization_error, :predictions, :importances
|
19
19
|
attr_accessor :individuals, :id_to_fenotype
|
20
|
-
|
20
|
+
|
21
21
|
# Initialize Tree object with the configuration (as in Nimbus::Configuration.tree) options received.
|
22
22
|
def initialize(options)
|
23
23
|
@snp_total_count = options[:snp_total_count]
|
24
24
|
@snp_sample_size = options[:snp_sample_size]
|
25
25
|
@node_min_size = options[:tree_node_min_size]
|
26
26
|
end
|
27
|
-
|
27
|
+
|
28
28
|
# Creates the structure of the tree, as a hash of SNP splits and values.
|
29
29
|
#
|
30
30
|
# It just initializes the needed variables and then defines the first node of the tree.
|
@@ -34,113 +34,50 @@ module Nimbus
|
|
34
34
|
@id_to_fenotype = ids_fenotypes
|
35
35
|
@predictions = {}
|
36
36
|
@used_snps = []
|
37
|
-
|
38
|
-
@structure = build_node individuals_sample, Nimbus::LossFunctions.average(individuals_sample, @id_to_fenotype)
|
39
37
|
end
|
40
38
|
|
41
39
|
# Creates a node by taking a random sample of the SNPs and computing the loss function for every split by SNP of that sample.
|
42
|
-
#
|
43
|
-
# * If SNP_min is the SNP with smaller loss function and it is < the loss function of the node, it splits the individuals sample in three:
|
44
|
-
# (those with value 0 for the SNP_min, those with value 1 for the SNP_min, and those with value 2 for the SNP_min) then it builds these 3 new nodes.
|
45
|
-
# * Otherwise every individual in the node gets labeled with the average of the fenotype values of all of them.
|
46
40
|
def build_node(individuals_ids, y_hat)
|
47
|
-
# General loss function value for the node
|
48
|
-
individuals_count = individuals_ids.size
|
49
|
-
return label_node(y_hat, individuals_ids) if individuals_count < @node_min_size
|
50
|
-
node_loss_function = Nimbus::LossFunctions.quadratic_loss individuals_ids, @id_to_fenotype, y_hat
|
51
|
-
|
52
|
-
# Finding the SNP that minimizes loss function
|
53
|
-
snps = snps_random_sample
|
54
|
-
min_loss, min_SNP, split, means = node_loss_function, nil, nil, nil
|
55
|
-
|
56
|
-
snps.each do |snp|
|
57
|
-
individuals_split_by_snp_value = split_by_snp_value individuals_ids, snp
|
58
|
-
mean_0 = Nimbus::LossFunctions.average individuals_split_by_snp_value[0], @id_to_fenotype
|
59
|
-
mean_1 = Nimbus::LossFunctions.average individuals_split_by_snp_value[1], @id_to_fenotype
|
60
|
-
mean_2 = Nimbus::LossFunctions.average individuals_split_by_snp_value[2], @id_to_fenotype
|
61
|
-
loss_0 = Nimbus::LossFunctions.mean_squared_error individuals_split_by_snp_value[0], @id_to_fenotype, mean_0
|
62
|
-
loss_1 = Nimbus::LossFunctions.mean_squared_error individuals_split_by_snp_value[1], @id_to_fenotype, mean_1
|
63
|
-
loss_2 = Nimbus::LossFunctions.mean_squared_error individuals_split_by_snp_value[2], @id_to_fenotype, mean_2
|
64
|
-
loss_snp = (loss_0 + loss_1 + loss_2) / individuals_count
|
65
|
-
|
66
|
-
min_loss, min_SNP, split, means = loss_snp, snp, individuals_split_by_snp_value, [mean_0, mean_1, mean_2] if loss_snp < min_loss
|
67
|
-
end
|
68
|
-
|
69
|
-
return build_branch(min_SNP, split, means, y_hat) if min_loss < node_loss_function
|
70
|
-
return label_node(y_hat, individuals_ids)
|
71
41
|
end
|
72
42
|
|
73
43
|
# Compute generalization error for the tree.
|
74
|
-
#
|
75
|
-
# Traversing the 'out of bag' (OOB) sample (those individuals of the training set not
|
76
|
-
# used in the building of this tree) through the tree, and comparing
|
77
|
-
# the prediction with the real fenotype of the individual (and then averaging) is
|
78
|
-
# possible to calculate the unbiased generalization error for the tree.
|
79
44
|
def generalization_error_from_oob(oob_ids)
|
80
|
-
return nil if (@structure.nil? || @individuals.nil? || @id_to_fenotype.nil?)
|
81
|
-
oob_errors = {}
|
82
|
-
oob_ids.each do |oobi|
|
83
|
-
oob_prediction = Tree.traverse @structure, individuals[oobi].snp_list
|
84
|
-
oob_errors[oobi] = Nimbus::LossFunctions.squared_difference oob_prediction, @id_to_fenotype[oobi]
|
85
|
-
end
|
86
|
-
@generalization_error = Nimbus::LossFunctions.average oob_ids, oob_errors
|
87
45
|
end
|
88
46
|
|
89
47
|
# Estimation of importance for every SNP.
|
90
|
-
#
|
91
|
-
# The importance of any SNP in the tree is calculated using the OOB sample.
|
92
|
-
# For every SNP, every individual in the sample is pushed down the tree but with the
|
93
|
-
# value of that SNP permuted with other individual in the sample.
|
94
|
-
#
|
95
|
-
# That way the difference between the regular prediction and the prediction with the SNP value modified can be estimated for any given SNP.
|
96
|
-
#
|
97
|
-
# This method computes importance estimations for every SNPs used in the tree (for any other SNP it would be 0).
|
98
48
|
def estimate_importances(oob_ids)
|
99
|
-
return nil if (@generalization_error.nil? && generalization_error_from_oob(oob_ids).nil?)
|
100
|
-
oob_individuals_count = oob_ids.size
|
101
|
-
@importances = {}
|
102
|
-
@used_snps.uniq.each do |current_snp|
|
103
|
-
shuffled_ids = oob_ids.shuffle
|
104
|
-
permutated_snp_error = 0.0
|
105
|
-
oob_ids.each_with_index {|oobi, index|
|
106
|
-
permutated_prediction = traverse_with_permutation @structure, individuals[oobi].snp_list, current_snp, individuals[shuffled_ids[index]].snp_list
|
107
|
-
permutated_snp_error += Nimbus::LossFunctions.squared_difference @id_to_fenotype[oobi], permutated_prediction
|
108
|
-
}
|
109
|
-
@importances[current_snp] = ((permutated_snp_error / oob_individuals_count) - @generalization_error).round(5)
|
110
|
-
end
|
111
|
-
@importances
|
112
49
|
end
|
113
|
-
|
114
|
-
# Class method to traverse a single individual through a tree structure.
|
50
|
+
|
51
|
+
# Class method to traverse a single individual through a tree structure.
|
115
52
|
#
|
116
53
|
# Returns the prediction for that individual (the label of the final node reached by the individual).
|
117
54
|
def self.traverse(tree_structure, data)
|
118
|
-
return tree_structure if tree_structure.is_a?
|
55
|
+
return tree_structure if tree_structure.is_a?(Numeric) || tree_structure.is_a?(String)
|
119
56
|
raise Nimbus::TreeError, "Forest data has invalid structure. Please check your forest data (file)." if !(tree_structure.is_a?(Hash) && tree_structure.keys.size == 1)
|
120
57
|
return self.traverse( tree_structure.values.first[ data[tree_structure.keys.first - 1].to_i], data)
|
121
58
|
end
|
122
|
-
|
123
|
-
|
124
|
-
|
59
|
+
|
60
|
+
protected
|
61
|
+
|
125
62
|
def snps_random_sample
|
126
63
|
(1..@snp_total_count).to_a.sample(@snp_sample_size).sort
|
127
64
|
end
|
128
|
-
|
65
|
+
|
129
66
|
def build_branch(snp, split, y_hats, parent_y_hat)
|
130
67
|
node_0 = split[0].size == 0 ? label_node(parent_y_hat, []) : build_node(split[0], y_hats[0])
|
131
68
|
node_1 = split[1].size == 0 ? label_node(parent_y_hat, []) : build_node(split[1], y_hats[1])
|
132
69
|
node_2 = split[2].size == 0 ? label_node(parent_y_hat, []) : build_node(split[2], y_hats[2])
|
133
|
-
|
70
|
+
|
134
71
|
split_by_snp(snp)
|
135
72
|
return { snp => [node_0, node_1, node_2] }
|
136
73
|
end
|
137
|
-
|
74
|
+
|
138
75
|
def label_node(value, ids)
|
139
|
-
label = value.round(5)
|
76
|
+
label = value.is_a?(String) ? value : value.round(5)
|
140
77
|
ids.uniq.each{|i| @predictions[i] = label}
|
141
78
|
label
|
142
79
|
end
|
143
|
-
|
80
|
+
|
144
81
|
def split_by_snp_value(ids, snp)
|
145
82
|
split = [[], [], []]
|
146
83
|
ids.each do |i|
|
@@ -150,17 +87,17 @@ module Nimbus
|
|
150
87
|
rescue => ex
|
151
88
|
raise Nimbus::TreeError, "Values for SNPs columns must be in [0, 1, 2]"
|
152
89
|
end
|
153
|
-
|
90
|
+
|
154
91
|
def split_by_snp(x)
|
155
92
|
@used_snps << x
|
156
93
|
end
|
157
|
-
|
94
|
+
|
158
95
|
def traverse_with_permutation(tree_structure, data, snp_to_permute, individual_to_permute)
|
159
|
-
return tree_structure if tree_structure.is_a?
|
96
|
+
return tree_structure if tree_structure.is_a?(Numeric) || tree_structure.is_a?(String)
|
160
97
|
individual_data = (tree_structure.keys.first == snp_to_permute ? individual_to_permute : data)
|
161
98
|
return traverse_with_permutation( tree_structure.values.first[ individual_data[tree_structure.keys.first - 1].to_i], data, snp_to_permute, individual_to_permute)
|
162
99
|
end
|
163
|
-
|
100
|
+
|
164
101
|
end
|
165
|
-
|
102
|
+
|
166
103
|
end
|
@@ -0,0 +1,132 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
|
3
|
+
describe Nimbus::ClassificationTree do
|
4
|
+
|
5
|
+
before(:each) do
|
6
|
+
@config = Nimbus::Configuration.new
|
7
|
+
@config.load fixture_file('classification_config.yml')
|
8
|
+
|
9
|
+
@tree = Nimbus::ClassificationTree.new @config.tree
|
10
|
+
end
|
11
|
+
|
12
|
+
it "is initialized with tree config info" do
|
13
|
+
@tree.snp_total_count.should == 100
|
14
|
+
@tree.snp_sample_size.should == 33
|
15
|
+
@tree.node_min_size.should == 5
|
16
|
+
@tree.classes.size.should == 2
|
17
|
+
@tree.classes[0].should == '0'
|
18
|
+
@tree.classes[1].should == '1'
|
19
|
+
end
|
20
|
+
|
21
|
+
it "creates a tree structure when seeded with training data" do
|
22
|
+
@config.load_training_data
|
23
|
+
@tree.structure.should be_nil
|
24
|
+
@tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
25
|
+
@tree.structure.should_not be_nil
|
26
|
+
@tree.structure.should be_kind_of Hash
|
27
|
+
|
28
|
+
@tree.structure.keys.first.should == @tree.used_snps.last
|
29
|
+
@tree.used_snps.should_not be_empty
|
30
|
+
end
|
31
|
+
|
32
|
+
it "splits node in three when building a node and finds a suitable split" do
|
33
|
+
@config.load_training_data
|
34
|
+
@tree.stub!(:snps_random_sample).and_return((68..100).to_a) #97 is best split
|
35
|
+
|
36
|
+
@tree.individuals = @config.training_set.individuals
|
37
|
+
@tree.id_to_fenotype = @config.training_set.ids_fenotypes
|
38
|
+
@tree.used_snps = []
|
39
|
+
@tree.predictions = {}
|
40
|
+
|
41
|
+
branch = @tree.build_node @config.training_set.all_ids, Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
|
42
|
+
branch.keys.size.should == 1
|
43
|
+
branch.keys.first.should == 97
|
44
|
+
branch[97].size.should == 3
|
45
|
+
branch[97][0].should be_kind_of Hash
|
46
|
+
branch[97][1].should be_kind_of Hash
|
47
|
+
branch[97][2].should be_kind_of Hash
|
48
|
+
end
|
49
|
+
|
50
|
+
it "keeps track of all SNPs used for the tree" do
|
51
|
+
@config.load_training_data
|
52
|
+
snps = (33..65).to_a
|
53
|
+
@tree.stub!(:snps_random_sample).and_return(snps)
|
54
|
+
@tree.used_snps.should be_nil
|
55
|
+
@tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
56
|
+
@tree.used_snps.size.should > 4
|
57
|
+
@tree.used_snps.each{|snp|
|
58
|
+
snps.include?(snp).should be_true
|
59
|
+
}
|
60
|
+
end
|
61
|
+
|
62
|
+
it "labels node when building a node and there is not a suitable split" do
|
63
|
+
@config.load_training_data
|
64
|
+
@tree.stub!(:snps_random_sample).and_return([33])
|
65
|
+
|
66
|
+
@tree.individuals = @config.training_set.individuals
|
67
|
+
@tree.id_to_fenotype = @config.training_set.ids_fenotypes
|
68
|
+
@tree.used_snps = []
|
69
|
+
@tree.predictions = {}
|
70
|
+
|
71
|
+
branch = @tree.build_node @config.training_set.all_ids, Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
|
72
|
+
branch[33][0].should be_kind_of String
|
73
|
+
branch[33][1].should be_kind_of String
|
74
|
+
branch[33][2].should be_kind_of String
|
75
|
+
end
|
76
|
+
|
77
|
+
it "labels node when building a node with less individuals than the minimum node size" do
|
78
|
+
@config.load_training_data
|
79
|
+
|
80
|
+
@tree.individuals = @config.training_set.individuals
|
81
|
+
@tree.id_to_fenotype = @config.training_set.ids_fenotypes
|
82
|
+
@tree.used_snps = []
|
83
|
+
@tree.predictions = {}
|
84
|
+
|
85
|
+
label = @tree.build_node [1, 10, 33], Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
|
86
|
+
label.should be_kind_of String
|
87
|
+
|
88
|
+
label = @tree.build_node [2, 10], Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
|
89
|
+
label.should be_kind_of String
|
90
|
+
|
91
|
+
label = @tree.build_node [1, 10, 33], Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
|
92
|
+
label.should be_kind_of String
|
93
|
+
|
94
|
+
label = @tree.build_node [99, 22, 10, 33], Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
|
95
|
+
label.should be_kind_of String
|
96
|
+
end
|
97
|
+
|
98
|
+
it 'computes generalization error for the tree' do
|
99
|
+
@config.load_training_data
|
100
|
+
@tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
101
|
+
@tree.generalization_error.should be_nil
|
102
|
+
@tree.generalization_error_from_oob((3..300).to_a)
|
103
|
+
@tree.generalization_error.should be_kind_of Numeric
|
104
|
+
@tree.generalization_error.should > 0.0
|
105
|
+
@tree.generalization_error.should < 1.0
|
106
|
+
end
|
107
|
+
|
108
|
+
it 'estimates importance for all SNPs' do
|
109
|
+
@config.load_training_data
|
110
|
+
@tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
111
|
+
@tree.importances.should be_nil
|
112
|
+
@tree.estimate_importances((200..533).to_a)
|
113
|
+
@tree.importances.should be_kind_of Hash
|
114
|
+
@tree.importances.keys.should_not be_empty
|
115
|
+
(@tree.importances.keys - (1..100).to_a).should be_empty #all keys are snp indexes (100 snps in training file)
|
116
|
+
end
|
117
|
+
|
118
|
+
it 'get prediction for an individual pushing it down a tree structure' do
|
119
|
+
tree_structure = YAML.load(File.open fixture_file('classification_random_forest.yml')).first
|
120
|
+
individual_data = [0]*100
|
121
|
+
prediction = Nimbus::Tree.traverse tree_structure, individual_data
|
122
|
+
prediction.should == '1'
|
123
|
+
|
124
|
+
individual_data[26-1] = 1
|
125
|
+
individual_data[57-1] = 2
|
126
|
+
individual_data[98-1] = 2
|
127
|
+
individual_data[8-1] = 1
|
128
|
+
prediction = Nimbus::Tree.traverse tree_structure, individual_data
|
129
|
+
prediction.should == '0'
|
130
|
+
end
|
131
|
+
|
132
|
+
end
|
data/spec/configuration_spec.rb
CHANGED
@@ -2,30 +2,57 @@
|
|
2
2
|
require File.dirname(__FILE__) + '/spec_helper'
|
3
3
|
|
4
4
|
describe Nimbus::Configuration do
|
5
|
-
|
5
|
+
|
6
6
|
it "loads configuration options from config file" do
|
7
7
|
config = Nimbus::Configuration.new
|
8
8
|
config.load fixture_file('regression_config.yml')
|
9
|
-
|
9
|
+
|
10
10
|
config.training_file.should == fixture_file('regression_training.data')
|
11
11
|
config.testing_file.should == fixture_file('regression_testing.data')
|
12
12
|
config.forest_file.should == fixture_file('regression_random_forest.yml')
|
13
|
-
|
13
|
+
config.classes.should be_nil
|
14
|
+
|
14
15
|
config.forest_size.should == 3
|
15
16
|
config.tree_SNP_sample_size.should == 60
|
16
17
|
config.tree_SNP_total_count.should == 200
|
17
|
-
config.tree_node_min_size.should == 5
|
18
|
+
config.tree_node_min_size.should == 5
|
19
|
+
|
20
|
+
config = Nimbus::Configuration.new
|
21
|
+
config.load fixture_file('classification_config.yml')
|
22
|
+
|
23
|
+
config.training_file.should == fixture_file('classification_training.data')
|
24
|
+
config.testing_file.should == fixture_file('classification_testing.data')
|
25
|
+
config.forest_file.should == fixture_file('classification_random_forest.yml')
|
26
|
+
config.classes.should == ['0','1']
|
27
|
+
|
28
|
+
config.forest_size.should == 3
|
29
|
+
config.tree_SNP_sample_size.should == 33
|
30
|
+
config.tree_SNP_total_count.should == 100
|
31
|
+
config.tree_node_min_size.should == 5
|
32
|
+
end
|
33
|
+
|
34
|
+
it 'tree method return tree-related subset of options for regression trees' do
|
35
|
+
config = Nimbus::Configuration.new
|
36
|
+
config.load fixture_file('regression_config.yml')
|
37
|
+
tree_options = config.tree
|
38
|
+
|
39
|
+
tree_options[:snp_sample_size].should_not be_nil
|
40
|
+
tree_options[:snp_total_count].should_not be_nil
|
41
|
+
tree_options[:tree_node_min_size].should_not be_nil
|
42
|
+
tree_options[:classes].should be_nil
|
18
43
|
end
|
19
|
-
|
20
|
-
it 'tree method return tree-related subset of options' do
|
44
|
+
|
45
|
+
it 'tree method return tree-related subset of options for classification trees' do
|
21
46
|
config = Nimbus::Configuration.new
|
47
|
+
config.load fixture_file('classification_config.yml')
|
22
48
|
tree_options = config.tree
|
23
|
-
|
49
|
+
|
24
50
|
tree_options[:snp_sample_size].should_not be_nil
|
25
51
|
tree_options[:snp_total_count].should_not be_nil
|
26
52
|
tree_options[:tree_node_min_size].should_not be_nil
|
53
|
+
tree_options[:classes].should_not be_nil
|
27
54
|
end
|
28
|
-
|
55
|
+
|
29
56
|
it "creates a training set object from training data file" do
|
30
57
|
config = Nimbus::Configuration.new
|
31
58
|
config.load fixture_file('regression_config.yml')
|
@@ -33,30 +60,30 @@ describe Nimbus::Configuration do
|
|
33
60
|
config.load_training_data
|
34
61
|
config.training_set.should be_kind_of Nimbus::TrainingSet
|
35
62
|
config.training_set.all_ids.sort.should == (1..800).to_a
|
36
|
-
|
63
|
+
|
37
64
|
File.open(fixture_file('regression_training.data')) {|file|
|
38
65
|
feno1, id1, *snp_list_1 = file.readline.split
|
39
66
|
feno2, id2, *snp_list_2 = file.readline.split
|
40
67
|
feno3, id3, *snp_list_3 = file.readline.split
|
41
|
-
|
68
|
+
|
42
69
|
i1 = Nimbus::Individual.new(id1.to_i, feno1.to_f, snp_list_1.map{|snp| snp.to_i})
|
43
70
|
i2 = Nimbus::Individual.new(id2.to_i, feno2.to_f, snp_list_2.map{|snp| snp.to_i})
|
44
71
|
i3 = Nimbus::Individual.new(id3.to_i, feno3.to_f, snp_list_3.map{|snp| snp.to_i})
|
45
|
-
|
72
|
+
|
46
73
|
config.training_set.individuals[id1.to_i].id.should == i1.id
|
47
74
|
config.training_set.individuals[id2.to_i].fenotype.should == i2.fenotype
|
48
75
|
config.training_set.individuals[id3.to_i].snp_list.should == i3.snp_list
|
49
|
-
|
76
|
+
|
50
77
|
config.training_set.ids_fenotypes[id1.to_i] = feno1.to_f
|
51
78
|
config.training_set.ids_fenotypes[id2.to_i] = feno2.to_f
|
52
79
|
config.training_set.ids_fenotypes[id3.to_i] = feno3.to_f
|
53
80
|
}
|
54
81
|
end
|
55
|
-
|
82
|
+
|
56
83
|
it "reads testing data and yields one individual at a time" do
|
57
84
|
config = Nimbus::Configuration.new
|
58
85
|
config.load fixture_file('regression_config.yml')
|
59
|
-
|
86
|
+
|
60
87
|
test_individuals = []
|
61
88
|
File.open(fixture_file('regression_testing.data')) {|file|
|
62
89
|
file.each do |line|
|
@@ -73,20 +100,20 @@ describe Nimbus::Configuration do
|
|
73
100
|
individual.snp_list.should == test_individual.snp_list
|
74
101
|
}
|
75
102
|
end
|
76
|
-
|
103
|
+
|
77
104
|
it "creates a forest object loading data from a yaml file" do
|
78
105
|
config = Nimbus::Configuration.new
|
79
106
|
config.load fixture_file('regression_config.yml')
|
80
|
-
|
107
|
+
|
81
108
|
trees = YAML.load(File.open fixture_file('regression_random_forest.yml'))
|
82
109
|
trees.first.keys.first.should == 189
|
83
110
|
trees.size.should == 3
|
84
|
-
|
111
|
+
|
85
112
|
forest = config.load_forest
|
86
113
|
forest.should be_kind_of Nimbus::Forest
|
87
114
|
forest.trees[0].should == trees.first
|
88
115
|
forest.trees[1].should == trees[1]
|
89
|
-
forest.trees.last.should == trees[2]
|
116
|
+
forest.trees.last.should == trees[2]
|
90
117
|
end
|
91
|
-
|
118
|
+
|
92
119
|
end
|