nimbus 2.2.1 → 2.3.0
- checksums.yaml +7 -0
- data/CODE_OF_CONDUCT.md +7 -0
- data/CONTRIBUTING.md +46 -0
- data/MIT-LICENSE.txt +1 -1
- data/README.md +131 -21
- data/bin/nimbus +2 -2
- data/lib/nimbus.rb +2 -6
- data/lib/nimbus/classification_tree.rb +9 -12
- data/lib/nimbus/configuration.rb +22 -22
- data/lib/nimbus/forest.rb +8 -8
- data/lib/nimbus/loss_functions.rb +11 -0
- data/lib/nimbus/regression_tree.rb +8 -10
- data/lib/nimbus/tree.rb +54 -12
- data/lib/nimbus/version.rb +1 -1
- data/spec/classification_tree_spec.rb +47 -47
- data/spec/configuration_spec.rb +55 -55
- data/spec/fixtures/{classification_config.yml → classification/config.yml} +3 -3
- data/spec/fixtures/classification/random_forest.yml +1174 -0
- data/spec/fixtures/{classification_testing.data → classification/testing.data} +0 -0
- data/spec/fixtures/{classification_training.data → classification/training.data} +0 -0
- data/spec/fixtures/{regression_config.yml → regression/config.yml} +4 -4
- data/spec/fixtures/regression/random_forest.yml +2737 -0
- data/spec/fixtures/{regression_testing.data → regression/testing.data} +0 -0
- data/spec/fixtures/{regression_training.data → regression/training.data} +0 -0
- data/spec/forest_spec.rb +39 -39
- data/spec/individual_spec.rb +3 -3
- data/spec/loss_functions_spec.rb +31 -13
- data/spec/nimbus_spec.rb +2 -2
- data/spec/regression_tree_spec.rb +44 -44
- data/spec/training_set_spec.rb +3 -3
- data/spec/tree_spec.rb +4 -4
- metadata +37 -34
- data/spec/fixtures/classification_random_forest.yml +0 -922
- data/spec/fixtures/regression_random_forest.yml +0 -1741
data/lib/nimbus/forest.rb
CHANGED
@@ -88,6 +88,14 @@ module Nimbus
       @trees.to_yaml
     end
 
+    def classification?
+      @options.tree[:classes]
+    end
+
+    def regression?
+      @options.tree[:classes].nil?
+    end
+
     private
 
     def individuals_random_sample
@@ -140,14 +148,6 @@ module Nimbus
       }
     end
 
-    def classification?
-      @options.tree[:classes]
-    end
-
-    def regression?
-      @options.tree[:classes].nil?
-    end
-
   end
 
 end
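In 2.3.0 the `classification?` and `regression?` predicates simply move above the `private` keyword, so code holding a `Nimbus::Forest` can now ask which mode it runs in. A minimal usage sketch; the config file name and the `Forest.new(config)` call are assumptions for illustration, not a documented entry point:

    # Hedged sketch: query a forest's mode now that the predicates are public.
    require 'nimbus'

    config = Nimbus::Configuration.new
    config.load 'config.yml'            # assumed path to a nimbus config file
    forest = Nimbus::Forest.new(config) # assumed constructor, as the gem uses internally

    if forest.classification?
      puts 'forest predicts class labels'
    elsif forest.regression?
      puts 'forest predicts numeric fenotypes'
    end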
data/lib/nimbus/loss_functions.rb
CHANGED
@@ -35,6 +35,17 @@ module Nimbus
     def squared_difference(x,y)
       0.0 + (x-y)**2
     end
+
+    # Simplified Huber function
+    def pseudo_huber_error(ids, value_table, mean = nil)
+      mean ||= self.average ids, value_table
+      ids.inject(0.0){|sum, i| sum + (Math.log(Math.cosh(value_table[i] - mean))) }
+    end
+
+    # Simplified Huber loss function: PHE / n
+    def pseudo_huber_loss(ids, value_table, mean = nil)
+      self.pseudo_huber_error(ids, value_table, mean) / ids.size
+    end
 
     ## CLASSSIFICATION
 
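The new pseudo-Huber error sums log(cosh(residual)) over the node. That term behaves like residual²/2 near zero and grows roughly linearly (about |residual| - log 2) for large residuals, so a single outlier cannot dominate the node loss the way it does under squared error. A standalone sketch of the same formula, independent of the `LossFunctions` module; the sample values are made up:

    # Standalone sketch of the pseudo-Huber error/loss added above.
    def pseudo_huber_error(ids, value_table, mean = nil)
      mean ||= ids.sum { |i| value_table[i] } / ids.size.to_f
      ids.sum { |i| Math.log(Math.cosh(value_table[i] - mean)) }
    end

    def pseudo_huber_loss(ids, value_table, mean = nil)
      pseudo_huber_error(ids, value_table, mean) / ids.size
    end

    fenotypes = { 1 => 0.9, 2 => 1.1, 3 => 5.0 }      # id => fenotype; id 3 is an outlier
    puts pseudo_huber_loss(fenotypes.keys, fenotypes) # => ~1.13, vs ~3.6 for plain mean squared error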
data/lib/nimbus/regression_tree.rb
CHANGED
@@ -8,7 +8,7 @@ module Nimbus
   # * 1: Calculate loss function for the individuals in the node (first node contains all the individuals).
   # * 2: Take a random sample of the SNPs (size m << total count of SNPs)
   # * 3: Compute the loss function (quadratic loss) for the split of the sample based on value of every SNP.
-  # * 4: If the SNP with minimum loss function also minimizes the general loss of the node, split the individuals sample in
+  # * 4: If the SNP with minimum loss function also minimizes the general loss of the node, split the individuals sample in two nodes, based on average value for that SNP [0,1][2], or [0][1,2]
   # * 5: Repeat from 1 for every node until:
   #   - a) The individuals count in that node is < minimum size OR
   #   - b) None of the SNP splits has a loss function smaller than the node loss function
@@ -27,8 +27,8 @@ module Nimbus
 
     # Creates a node by taking a random sample of the SNPs and computing the loss function for every split by SNP of that sample.
     #
-    # * If SNP_min is the SNP with smaller loss function and it is < the loss function of the node, it splits the individuals sample in
-    # (
+    # * If SNP_min is the SNP with smaller loss function and it is < the loss function of the node, it splits the individuals sample in two:
+    # (the average of the 0,1,2 values for the SNP_min in the individuals is computed, and they are splitted in [<=avg], [>avg]) then it builds these 2 new nodes.
     # * Otherwise every individual in the node gets labeled with the average of the fenotype values of all of them.
     def build_node(individuals_ids, y_hat)
       # General loss function value for the node
@@ -38,22 +38,20 @@ module Nimbus
 
       # Finding the SNP that minimizes loss function
       snps = snps_random_sample
-      min_loss, min_SNP, split, means = node_loss_function, nil, nil, nil
+      min_loss, min_SNP, split, split_type, means = node_loss_function, nil, nil, nil, nil
 
       snps.each do |snp|
-        individuals_split_by_snp_value =
+        individuals_split_by_snp_value, node_split_type = split_by_snp_avegare_value individuals_ids, snp
         mean_0 = Nimbus::LossFunctions.average individuals_split_by_snp_value[0], @id_to_fenotype
         mean_1 = Nimbus::LossFunctions.average individuals_split_by_snp_value[1], @id_to_fenotype
-        mean_2 = Nimbus::LossFunctions.average individuals_split_by_snp_value[2], @id_to_fenotype
         loss_0 = Nimbus::LossFunctions.mean_squared_error individuals_split_by_snp_value[0], @id_to_fenotype, mean_0
         loss_1 = Nimbus::LossFunctions.mean_squared_error individuals_split_by_snp_value[1], @id_to_fenotype, mean_1
-
-        loss_snp = (loss_0 + loss_1 + loss_2) / individuals_count
+        loss_snp = (loss_0 + loss_1) / individuals_count
 
-        min_loss, min_SNP, split, means = loss_snp, snp, individuals_split_by_snp_value, [mean_0, mean_1
+        min_loss, min_SNP, split, split_type, means = loss_snp, snp, individuals_split_by_snp_value, node_split_type, [mean_0, mean_1] if loss_snp < min_loss
       end
 
-      return build_branch(min_SNP, split, means, y_hat) if min_loss < node_loss_function
+      return build_branch(min_SNP, split, split_type, means, y_hat) if min_loss < node_loss_function
       return label_node(y_hat, individuals_ids)
     end
 
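Because the heterozygous group is now merged into one side, `build_node` scores a candidate SNP as the sum of the two within-group squared errors divided by the node size, and splits only when that beats the node's own loss. A standalone sketch of that comparison under quadratic loss; the helper names and numbers are illustrative, not the gem's API:

    # Standalone sketch: score one candidate two-way split the way build_node now does.
    def mean(values)
      values.sum(0.0) / values.size
    end

    def squared_error(values)
      m = mean(values)
      values.sum(0.0) { |v| (v - m)**2 }
    end

    node_fenotypes = [1.0, 1.2, 3.1, 2.9, 1.1, 3.0]
    node_loss      = squared_error(node_fenotypes) / node_fenotypes.size            # ~0.91

    low, high = [1.0, 1.2, 1.1], [3.1, 2.9, 3.0]                                    # one SNP's two groups
    loss_snp  = (squared_error(low) + squared_error(high)) / node_fenotypes.size    # ~0.007

    puts loss_snp < node_loss  # => true, so this SNP would be used to split the node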
data/lib/nimbus/tree.rb
CHANGED
@@ -18,6 +18,9 @@ module Nimbus
     attr_accessor :snp_sample_size, :snp_total_count, :node_min_size, :used_snps, :structure, :generalization_error, :predictions, :importances
     attr_accessor :individuals, :id_to_fenotype
 
+    NODE_SPLIT_01_2 = "zero"
+    NODE_SPLIT_0_12 = "two"
+
     # Initialize Tree object with the configuration (as in Nimbus::Configuration.tree) options received.
     def initialize(options)
       @snp_total_count = options[:snp_total_count]
@@ -53,8 +56,14 @@ module Nimbus
     # Returns the prediction for that individual (the label of the final node reached by the individual).
     def self.traverse(tree_structure, data)
       return tree_structure if tree_structure.is_a?(Numeric) || tree_structure.is_a?(String)
+
       raise Nimbus::TreeError, "Forest data has invalid structure. Please check your forest data (file)." if !(tree_structure.is_a?(Hash) && tree_structure.keys.size == 1)
-
+
+      branch = tree_structure.values.first
+      split_type = branch[1].to_s
+      datum = data_traversing_value(data[tree_structure.keys.first - 1], split_type)
+
+      return self.traverse(branch[datum], data)
     end
 
     protected
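A node is now serialized as `{ snp_index => [low_node, split_type, high_node] }`: the genotype read for that SNP is mapped to array index 0 or 2, and the string in position 1 records which side the heterozygous value 1 belongs to. A hand-built sketch of how `Nimbus::Tree.traverse` walks such a structure; the tree itself is invented for illustration:

    require 'nimbus'

    # Invented two-level tree in the new [low_node, split_type, high_node] format.
    tree = {
      5 => [
        { 12 => ['0', 'zero', '1'] },  # "zero" split: genotype 1 is grouped with 0 ([0,1][2])
        'two',                         # "two" split: genotype 1 is grouped with 2 ([0][1,2])
        '1'
      ]
    }

    individual = [0] * 100
    individual[5 - 1] = 1                         # heterozygous at SNP 5; split "two" sends it right
    puts Nimbus::Tree.traverse(tree, individual)  # prints "1"

    individual[5 - 1]  = 0                        # homozygous 0 goes left to the node on SNP 12
    individual[12 - 1] = 1                        # split "zero" groups the 1 with the 0s
    puts Nimbus::Tree.traverse(tree, individual)  # prints "0"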
@@ -63,13 +72,12 @@ module Nimbus
       (1..@snp_total_count).to_a.sample(@snp_sample_size).sort
     end
 
-    def build_branch(snp, split, y_hats, parent_y_hat)
-
-
-      node_2 = split[2].size == 0 ? label_node(parent_y_hat, []) : build_node(split[2], y_hats[2])
+    def build_branch(snp, split, split_type, y_hats, parent_y_hat)
+      node_a = split[0].size == 0 ? label_node(parent_y_hat, []) : build_node(split[0], y_hats[0])
+      node_b = split[1].size == 0 ? label_node(parent_y_hat, []) : build_node(split[1], y_hats[1])
 
       split_by_snp(snp)
-      return { snp => [
+      return { snp => [node_a, split_type, node_b] }
     end
 
     def label_node(value, ids)
@@ -78,24 +86,58 @@ module Nimbus
       label
     end
 
-    def
-
+    def split_by_snp_avegare_value(ids, snp)
+      split_012 = [[], [], []]
       ids.each do |i|
-
+        split_012[ @individuals[i].snp_list[snp-1] ] << @individuals[i].id
       end
-      split
+      # we split by the average number of 0,1,2 values.
+      # So if there are less or equal 0s than 2s the split is [0,1][2]
+      # and if there are more 0s than 2s the average will be <1 so the split is [0][1,2]
+      split_type = (split_012[0].size <= split_012[2].size ? NODE_SPLIT_01_2 : NODE_SPLIT_0_12)
+      split_type == NODE_SPLIT_01_2 ? split_012[0] += split_012[1] : split_012[2] += split_012[1]
+      split = [split_012[0], split_012[2]]
+      [split, split_type]
     rescue => ex
       raise Nimbus::TreeError, "Values for SNPs columns must be in [0, 1, 2]"
     end
 
+    def split_by_value(ids, snp, value)
+      split = [[], []]
+      ids.each do |i|
+        @individuals[i].snp_list[snp-1] > value ? (split[1] << @individuals[i].id) : (split[0] << @individuals[i].id)
+      end
+      split
+    rescue => ex
+      raise Nimbus::TreeError, "Values for SNPs columns must be numeric"
+    end
+
     def split_by_snp(x)
       @used_snps << x
     end
 
     def traverse_with_permutation(tree_structure, data, snp_to_permute, individual_to_permute)
       return tree_structure if tree_structure.is_a?(Numeric) || tree_structure.is_a?(String)
-
-
+
+      key = tree_structure.keys.first
+      branch = tree_structure.values.first
+      individual_data = (key == snp_to_permute ? individual_to_permute : data)
+      split_type = branch[1]
+      datum = data_traversing_value(individual_data[key - 1].to_i, split_type)
+
+      return traverse_with_permutation branch[datum], data, snp_to_permute, individual_to_permute
+    end
+
+    def data_traversing_value(datum, split_type)
+      Nimbus::Tree.data_traversing_value(datum, split_type)
+    end
+
+    def self.data_traversing_value(datum, split_type)
+      if datum == 1
+        return 0 if split_type == NODE_SPLIT_01_2
+        return 2 if split_type == NODE_SPLIT_0_12
+      end
+      datum
    end
 
   end
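The comment inside `split_by_snp_avegare_value` leans on a small arithmetic fact: with genotype counts n0, n1 and n2, the average genotype is (n1 + 2·n2) / (n0 + n1 + n2), which is >= 1 exactly when n0 <= n2, so comparing the sizes of the 0 and 2 buckets is enough to decide whether the heterozygous individuals join the [0,1] side or the [1,2] side. A small standalone check of that equivalence (not the gem's code):

    # Standalone check of the "split by average genotype" rule used above.
    def split_type_for(genotypes)
      counts = [0, 1, 2].map { |g| genotypes.count(g) }
      avg    = genotypes.sum.to_f / genotypes.size
      by_average = avg >= 1 ? 'zero' : 'two'               # 'zero' => [0,1][2], 'two' => [0][1,2]
      by_counts  = counts[0] <= counts[2] ? 'zero' : 'two' # the shortcut the tree code uses
      [avg.round(2), by_average, by_counts]
    end

    p split_type_for([0, 0, 0, 1, 1, 2])  # => [0.67, "two", "two"]   more 0s than 2s
    p split_type_for([0, 1, 1, 2, 2, 2])  # => [1.33, "zero", "zero"] at least as many 2s as 0s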
data/lib/nimbus/version.rb
CHANGED
data/spec/classification_tree_spec.rb
CHANGED
@@ -4,34 +4,34 @@ describe Nimbus::ClassificationTree do
 
   before(:each) do
     @config = Nimbus::Configuration.new
-    @config.load fixture_file('
+    @config.load fixture_file('classification/config.yml')
 
     @tree = Nimbus::ClassificationTree.new @config.tree
   end
 
   it "is initialized with tree config info" do
-    @tree.snp_total_count.
-    @tree.snp_sample_size.
-    @tree.node_min_size.
-    @tree.classes.size.
-    @tree.classes[0].
-    @tree.classes[1].
+    expect(@tree.snp_total_count).to eq 100
+    expect(@tree.snp_sample_size).to eq 33
+    expect(@tree.node_min_size).to eq 5
+    expect(@tree.classes.size).to eq 2
+    expect(@tree.classes[0]).to eq '0'
+    expect(@tree.classes[1]).to eq '1'
  end
 
  it "creates a tree structure when seeded with training data" do
    @config.load_training_data
-    @tree.structure.
+    expect(@tree.structure).to be_nil
    @tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
-    @tree.structure.
-    @tree.structure.
+    expect(@tree.structure).to_not be_nil
+    expect(@tree.structure).to be_kind_of Hash
 
-    @tree.structure.keys.first.
-    @tree.used_snps.
+    expect(@tree.structure.keys.first).to eq @tree.used_snps.last
+    expect(@tree.used_snps).to_not be_empty
  end
 
-  it "splits node
+  it "splits node when building a node and finds a suitable split" do
    @config.load_training_data
-
+    allow_any_instance_of(Nimbus::ClassificationTree).to receive(:snps_random_sample).and_return((68..100).to_a) #97 is best split
 
    @tree.individuals = @config.training_set.individuals
    @tree.id_to_fenotype = @config.training_set.ids_fenotypes
@@ -39,29 +39,29 @@ describe Nimbus::ClassificationTree do
    @tree.predictions = {}
 
    branch = @tree.build_node @config.training_set.all_ids, Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
-    branch.keys.size.
-    branch.keys.first.
-    branch[97].size.
-    branch[97][0].
-    branch[97][1]
-    branch[97][2].
+    expect(branch.keys.size).to eq 1
+    expect(branch.keys.first).to eq 97
+    expect(branch[97].size).to eq 3
+    expect(branch[97][0]).to be_kind_of Hash
+    expect([Nimbus::Tree::NODE_SPLIT_01_2, Nimbus::Tree::NODE_SPLIT_0_12]).to include(branch[97][1])
+    expect(branch[97][2]).to be_kind_of Hash
  end
 
  it "keeps track of all SNPs used for the tree" do
    @config.load_training_data
    snps = (33..65).to_a
-
-    @tree.used_snps.
+    allow_any_instance_of(Nimbus::ClassificationTree).to receive(:snps_random_sample).and_return(snps)
+    expect(@tree.used_snps).to be_nil
    @tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
-    @tree.used_snps.size.
+    expect(@tree.used_snps.size).to be > 4
    @tree.used_snps.each{|snp|
-      snps.include?(snp).
+      expect(snps.include?(snp)).to be true
    }
  end
 
  it "labels node when building a node and there is not a suitable split" do
    @config.load_training_data
-
+    allow_any_instance_of(Nimbus::ClassificationTree).to receive(:snps_random_sample).and_return([11])
 
    @tree.individuals = @config.training_set.individuals
    @tree.id_to_fenotype = @config.training_set.ids_fenotypes
@@ -69,9 +69,9 @@ describe Nimbus::ClassificationTree do
    @tree.predictions = {}
 
    branch = @tree.build_node @config.training_set.all_ids, Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
-    branch[
-    branch[
-    branch[
+    expect(branch[11][0]).to be_kind_of String
+    expect(branch[11][1]).to be_kind_of String
+    expect(branch[11][2]).to be_kind_of String
  end
 
  it "labels node when building a node with less individuals than the minimum node size" do
@@ -83,50 +83,50 @@ describe Nimbus::ClassificationTree do
    @tree.predictions = {}
 
    label = @tree.build_node [1, 10, 33], Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
-    label.
+    expect(label).to be_kind_of String
 
    label = @tree.build_node [2, 10], Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
-    label.
+    expect(label).to be_kind_of String
 
    label = @tree.build_node [1, 10, 33], Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
-    label.
+    expect(label).to be_kind_of String
 
    label = @tree.build_node [99, 22, 10, 33], Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
-    label.
+    expect(label).to be_kind_of String
  end
 
  it 'computes generalization error for the tree' do
    @config.load_training_data
    @tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
-    @tree.generalization_error.
+    expect(@tree.generalization_error).to be_nil
    @tree.generalization_error_from_oob((3..300).to_a)
-    @tree.generalization_error.
-    @tree.generalization_error.
-    @tree.generalization_error.
+    expect(@tree.generalization_error).to be_kind_of Numeric
+    expect(@tree.generalization_error).to be > 0.0
+    expect(@tree.generalization_error).to be < 1.0
  end
 
  it 'estimates importance for all SNPs' do
    @config.load_training_data
    @tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
-    @tree.importances.
+    expect(@tree.importances).to be_nil
    @tree.estimate_importances((200..533).to_a)
-    @tree.importances.
-    @tree.importances.keys.
-    (@tree.importances.keys - (1..100).to_a).
+    expect(@tree.importances).to be_kind_of Hash
+    expect(@tree.importances.keys).to_not be_empty
+    expect((@tree.importances.keys - (1..100).to_a)).to be_empty #all keys are snp indexes (100 snps in training file)
  end
 
  it 'get prediction for an individual pushing it down a tree structure' do
-    tree_structure = Psych.load(File.open fixture_file('
+    tree_structure = Psych.load(File.open fixture_file('classification/random_forest.yml')).first
    individual_data = [0]*100
    prediction = Nimbus::Tree.traverse tree_structure, individual_data
-    prediction.
+    expect(prediction).to eq '0'
 
-    individual_data[
-    individual_data[
-    individual_data[
-    individual_data[
+    individual_data[8-1] = 2
+    individual_data[29-1] = 0
+    individual_data[1-1] = 1
+    individual_data[7-1] = 1
    prediction = Nimbus::Tree.traverse tree_structure, individual_data
-    prediction.
+    expect(prediction).to eq '1'
  end
 
 end
data/spec/configuration_spec.rb
CHANGED
@@ -5,65 +5,65 @@ describe Nimbus::Configuration do
 
  it "loads configuration options from config file" do
    config = Nimbus::Configuration.new
-    config.load fixture_file('
+    config.load fixture_file('regression/config.yml')
 
-    config.training_file.
-    config.testing_file.
-    config.forest_file.
-    config.classes.
-    config.do_importances.
+    expect(config.training_file).to eq fixture_file('regression/training.data')
+    expect(config.testing_file).to eq fixture_file('regression/testing.data')
+    expect(config.forest_file).to eq fixture_file('regression/random_forest.yml')
+    expect(config.classes).to be_nil
+    expect(config.do_importances).to be
 
-    config.forest_size.
-    config.tree_SNP_sample_size.
-    config.tree_SNP_total_count.
-    config.tree_node_min_size.
+    expect(config.forest_size).to eq 3
+    expect(config.tree_SNP_sample_size).to eq 60
+    expect(config.tree_SNP_total_count).to eq 200
+    expect(config.tree_node_min_size).to eq 5
 
    config = Nimbus::Configuration.new
-    config.load fixture_file('
-
-    config.training_file.
-    config.testing_file.
-    config.forest_file.
-    config.classes.
-    config.do_importances.
-
-    config.forest_size.
-    config.tree_SNP_sample_size.
-    config.tree_SNP_total_count.
-    config.tree_node_min_size.
+    config.load fixture_file('classification/config.yml')
+
+    expect(config.training_file).to eq fixture_file('classification/training.data')
+    expect(config.testing_file).to eq fixture_file('classification/testing.data')
+    expect(config.forest_file).to eq fixture_file('classification/random_forest.yml')
+    expect(config.classes).to eq ['0','1']
+    expect(config.do_importances).to_not be
+
+    expect(config.forest_size).to eq 3
+    expect(config.tree_SNP_sample_size).to eq 33
+    expect(config.tree_SNP_total_count).to eq 100
+    expect(config.tree_node_min_size).to eq 5
  end
 
  it 'tree method return tree-related subset of options for regression trees' do
    config = Nimbus::Configuration.new
-    config.load fixture_file('
+    config.load fixture_file('regression/config.yml')
    tree_options = config.tree
 
-    tree_options[:snp_sample_size].
-    tree_options[:snp_total_count].
-    tree_options[:tree_node_min_size].
-    tree_options[:classes].
+    expect(tree_options[:snp_sample_size]).to_not be_nil
+    expect(tree_options[:snp_total_count]).to_not be_nil
+    expect(tree_options[:tree_node_min_size]).to_not be_nil
+    expect(tree_options[:classes]).to be_nil
  end
 
  it 'tree method return tree-related subset of options for classification trees' do
    config = Nimbus::Configuration.new
-    config.load fixture_file('
+    config.load fixture_file('classification/config.yml')
    tree_options = config.tree
 
-    tree_options[:snp_sample_size].
-    tree_options[:snp_total_count].
-    tree_options[:tree_node_min_size].
-    tree_options[:classes].
+    expect(tree_options[:snp_sample_size]).to_not be_nil
+    expect(tree_options[:snp_total_count]).to_not be_nil
+    expect(tree_options[:tree_node_min_size]).to_not be_nil
+    expect(tree_options[:classes]).to_not be_nil
  end
 
  it "creates a training set object from training data file" do
    config = Nimbus::Configuration.new
-    config.load fixture_file('
-    config.training_set.
+    config.load fixture_file('regression/config.yml')
+    expect(config.training_set).to be_nil
    config.load_training_data
-    config.training_set.
-    config.training_set.all_ids.sort.
+    expect(config.training_set).to be_kind_of Nimbus::TrainingSet
+    expect(config.training_set.all_ids.sort).to eq (1..800).to_a
 
-    File.open(fixture_file('
+    File.open(fixture_file('regression/training.data')) {|file|
      feno1, id1, *snp_list_1 = file.readline.split
      feno2, id2, *snp_list_2 = file.readline.split
      feno3, id3, *snp_list_3 = file.readline.split
@@ -72,9 +72,9 @@ describe Nimbus::Configuration do
      i2 = Nimbus::Individual.new(id2.to_i, feno2.to_f, snp_list_2.map{|snp| snp.to_i})
      i3 = Nimbus::Individual.new(id3.to_i, feno3.to_f, snp_list_3.map{|snp| snp.to_i})
 
-      config.training_set.individuals[id1.to_i].id.
-      config.training_set.individuals[id2.to_i].fenotype.
-      config.training_set.individuals[id3.to_i].snp_list.
+      expect(config.training_set.individuals[id1.to_i].id).to eq i1.id
+      expect(config.training_set.individuals[id2.to_i].fenotype).to eq i2.fenotype
+      expect(config.training_set.individuals[id3.to_i].snp_list).to eq i3.snp_list
 
      config.training_set.ids_fenotypes[id1.to_i] = feno1.to_f
      config.training_set.ids_fenotypes[id2.to_i] = feno2.to_f
@@ -84,38 +84,38 @@ describe Nimbus::Configuration do
 
  it "reads testing data and yields one individual at a time" do
    config = Nimbus::Configuration.new
-    config.load fixture_file('
+    config.load fixture_file('regression/config.yml')
 
    test_individuals = []
-    File.open(fixture_file('
+    File.open(fixture_file('regression/testing.data')) {|file|
      file.each do |line|
        data_id, *snp_list = line.strip.split
        test_individuals << Nimbus::Individual.new(data_id.to_i, nil, snp_list.map{|snp| snp.to_i})
      end
    }
-    test_individuals.size.
+    expect(test_individuals.size).to eq 200
    config.read_testing_data{|individual|
      test_individual = test_individuals.shift
-      individual.id.
-      individual.id.
-      individual.snp_list.
-      individual.snp_list.
+      expect(individual.id).to_not be_nil
+      expect(individual.id).to eq test_individual.id
+      expect(individual.snp_list).to_not be_empty
+      expect(individual.snp_list).to eq test_individual.snp_list
    }
  end
 
  it "creates a forest object loading data from a yaml file" do
    config = Nimbus::Configuration.new
-    config.load fixture_file('
+    config.load fixture_file('regression/config.yml')
 
-    trees = Psych.load(File.open fixture_file('
-    trees.first.keys.first.
-    trees.size.
+    trees = Psych.load(File.open fixture_file('regression/random_forest.yml'))
+    expect(trees.first.keys.first).to eq 176
+    expect(trees.size).to eq 3
 
    forest = config.load_forest
-    forest.
-    forest.trees[0].
-    forest.trees[1].
-    forest.trees.last.
+    expect(forest).to be_kind_of Nimbus::Forest
+    expect(forest.trees[0]).to eq trees.first
+    expect(forest.trees[1]).to eq trees[1]
+    expect(forest.trees.last).to eq trees[2]
  end
 
 end