nimbus 2.2.1 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CODE_OF_CONDUCT.md +7 -0
- data/CONTRIBUTING.md +46 -0
- data/MIT-LICENSE.txt +1 -1
- data/README.md +131 -21
- data/bin/nimbus +2 -2
- data/lib/nimbus.rb +2 -6
- data/lib/nimbus/classification_tree.rb +9 -12
- data/lib/nimbus/configuration.rb +22 -22
- data/lib/nimbus/forest.rb +8 -8
- data/lib/nimbus/loss_functions.rb +11 -0
- data/lib/nimbus/regression_tree.rb +8 -10
- data/lib/nimbus/tree.rb +54 -12
- data/lib/nimbus/version.rb +1 -1
- data/spec/classification_tree_spec.rb +47 -47
- data/spec/configuration_spec.rb +55 -55
- data/spec/fixtures/{classification_config.yml → classification/config.yml} +3 -3
- data/spec/fixtures/classification/random_forest.yml +1174 -0
- data/spec/fixtures/{classification_testing.data → classification/testing.data} +0 -0
- data/spec/fixtures/{classification_training.data → classification/training.data} +0 -0
- data/spec/fixtures/{regression_config.yml → regression/config.yml} +4 -4
- data/spec/fixtures/regression/random_forest.yml +2737 -0
- data/spec/fixtures/{regression_testing.data → regression/testing.data} +0 -0
- data/spec/fixtures/{regression_training.data → regression/training.data} +0 -0
- data/spec/forest_spec.rb +39 -39
- data/spec/individual_spec.rb +3 -3
- data/spec/loss_functions_spec.rb +31 -13
- data/spec/nimbus_spec.rb +2 -2
- data/spec/regression_tree_spec.rb +44 -44
- data/spec/training_set_spec.rb +3 -3
- data/spec/tree_spec.rb +4 -4
- metadata +37 -34
- data/spec/fixtures/classification_random_forest.yml +0 -922
- data/spec/fixtures/regression_random_forest.yml +0 -1741
data/lib/nimbus/forest.rb
CHANGED
@@ -88,6 +88,14 @@ module Nimbus
       @trees.to_yaml
     end
 
+    def classification?
+      @options.tree[:classes]
+    end
+
+    def regression?
+      @options.tree[:classes].nil?
+    end
+
     private
 
     def individuals_random_sample
@@ -140,14 +148,6 @@ module Nimbus
       }
     end
 
-    def classification?
-      @options.tree[:classes]
-    end
-
-    def regression?
-      @options.tree[:classes].nil?
-    end
-
   end
 
 end
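With classification? and regression? moved above private, callers can now ask a loaded forest which mode its configuration selects. A minimal usage sketch, assuming the gem is installed; the config.yml path and the surrounding driver code are illustrative, not part of this diff:

require 'nimbus'

# Hypothetical driver: load a configuration, build a forest, then branch
# on the now-public mode predicates.
config = Nimbus::Configuration.new
config.load 'config.yml'        # illustrative path
config.load_training_data
forest = Nimbus::Forest.new(config)

if forest.classification?       # truthy when the config defines classes
  puts 'forest will predict class labels'
elsif forest.regression?        # true when no classes are configured
  puts 'forest will predict numeric phenotype values'
end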
data/lib/nimbus/loss_functions.rb
CHANGED
@@ -35,6 +35,17 @@ module Nimbus
       def squared_difference(x,y)
         0.0 + (x-y)**2
       end
+
+      # Simplified Huber function
+      def pseudo_huber_error(ids, value_table, mean = nil)
+        mean ||= self.average ids, value_table
+        ids.inject(0.0){|sum, i| sum + (Math.log(Math.cosh(value_table[i] - mean))) }
+      end
+
+      # Simplified Huber loss function: PHE / n
+      def pseudo_huber_loss(ids, value_table, mean = nil)
+        self.pseudo_huber_error(ids, value_table, mean) / ids.size
+      end
 
       ## CLASSSIFICATION
 
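The new helpers implement a simplified (pseudo-)Huber criterion: pseudo_huber_error sums log(cosh(y_i - mean)) over the ids in a node, and pseudo_huber_loss divides that sum by the number of ids. A small worked example, assuming the gem is installed and using made-up phenotype values:

require 'nimbus'

# Toy id => phenotype table (values invented for illustration).
values = { 1 => 0.5, 2 => 1.5, 3 => 2.0 }
ids    = values.keys

mean = Nimbus::LossFunctions.average(ids, values)            # ~1.333
phe  = Nimbus::LossFunctions.pseudo_huber_error(ids, values) # sum of log(cosh(v - mean))
phl  = Nimbus::LossFunctions.pseudo_huber_loss(ids, values)  # phe / 3

# Same quantity computed directly, for comparison:
manual = ids.sum { |i| Math.log(Math.cosh(values[i] - mean)) }
puts [phe, phl, manual].inspect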
data/lib/nimbus/regression_tree.rb
CHANGED
@@ -8,7 +8,7 @@ module Nimbus
   # * 1: Calculate loss function for the individuals in the node (first node contains all the individuals).
   # * 2: Take a random sample of the SNPs (size m << total count of SNPs)
   # * 3: Compute the loss function (quadratic loss) for the split of the sample based on value of every SNP.
-  # * 4: If the SNP with minimum loss function also minimizes the general loss of the node, split the individuals sample in
+  # * 4: If the SNP with minimum loss function also minimizes the general loss of the node, split the individuals sample in two nodes, based on average value for that SNP [0,1][2], or [0][1,2]
   # * 5: Repeat from 1 for every node until:
   #   - a) The individuals count in that node is < minimum size OR
   #   - b) None of the SNP splits has a loss function smaller than the node loss function
@@ -27,8 +27,8 @@ module Nimbus
 
     # Creates a node by taking a random sample of the SNPs and computing the loss function for every split by SNP of that sample.
     #
-    # * If SNP_min is the SNP with smaller loss function and it is < the loss function of the node, it splits the individuals sample in
-    # (
+    # * If SNP_min is the SNP with smaller loss function and it is < the loss function of the node, it splits the individuals sample in two:
+    # (the average of the 0,1,2 values for the SNP_min in the individuals is computed, and they are splitted in [<=avg], [>avg]) then it builds these 2 new nodes.
     # * Otherwise every individual in the node gets labeled with the average of the fenotype values of all of them.
     def build_node(individuals_ids, y_hat)
       # General loss function value for the node
@@ -38,22 +38,20 @@ module Nimbus
 
       # Finding the SNP that minimizes loss function
       snps = snps_random_sample
-      min_loss, min_SNP, split, means = node_loss_function, nil, nil, nil
+      min_loss, min_SNP, split, split_type, means = node_loss_function, nil, nil, nil, nil
 
       snps.each do |snp|
-        individuals_split_by_snp_value =
+        individuals_split_by_snp_value, node_split_type = split_by_snp_avegare_value individuals_ids, snp
         mean_0 = Nimbus::LossFunctions.average individuals_split_by_snp_value[0], @id_to_fenotype
         mean_1 = Nimbus::LossFunctions.average individuals_split_by_snp_value[1], @id_to_fenotype
-        mean_2 = Nimbus::LossFunctions.average individuals_split_by_snp_value[2], @id_to_fenotype
         loss_0 = Nimbus::LossFunctions.mean_squared_error individuals_split_by_snp_value[0], @id_to_fenotype, mean_0
         loss_1 = Nimbus::LossFunctions.mean_squared_error individuals_split_by_snp_value[1], @id_to_fenotype, mean_1
-
-        loss_snp = (loss_0 + loss_1 + loss_2) / individuals_count
+        loss_snp = (loss_0 + loss_1) / individuals_count
 
-        min_loss, min_SNP, split, means = loss_snp, snp, individuals_split_by_snp_value, [mean_0, mean_1
+        min_loss, min_SNP, split, split_type, means = loss_snp, snp, individuals_split_by_snp_value, node_split_type, [mean_0, mean_1] if loss_snp < min_loss
       end
 
-      return build_branch(min_SNP, split, means, y_hat) if min_loss < node_loss_function
+      return build_branch(min_SNP, split, split_type, means, y_hat) if min_loss < node_loss_function
       return label_node(y_hat, individuals_ids)
     end
 
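The node split changes from three branches ([0], [1], [2]) to two: heterozygotes (value 1) are merged into whichever homozygote bucket the average genotype points to, as the new comments in tree.rb describe. A standalone sketch of that rule, independent of the gem's classes (the method and variable names below are illustrative only):

# Genotypes are 0/1/2 per individual. If there are <= as many 0s as 2s,
# the average genotype is >= 1, so the split is [0,1] vs [2] ("zero");
# otherwise the average is < 1 and the split is [0] vs [1,2] ("two").
def split_two_ways(ids, genotype_for_id)
  buckets = { 0 => [], 1 => [], 2 => [] }
  ids.each { |i| buckets[genotype_for_id[i]] << i }

  if buckets[0].size <= buckets[2].size
    [[buckets[0] + buckets[1], buckets[2]], "zero"]   # like NODE_SPLIT_01_2
  else
    [[buckets[0], buckets[1] + buckets[2]], "two"]    # like NODE_SPLIT_0_12
  end
end

split, type = split_two_ways([1, 2, 3, 4], { 1 => 0, 2 => 1, 3 => 2, 4 => 2 })
# => split == [[1, 2], [3, 4]], type == "zero"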
data/lib/nimbus/tree.rb
CHANGED
@@ -18,6 +18,9 @@ module Nimbus
     attr_accessor :snp_sample_size, :snp_total_count, :node_min_size, :used_snps, :structure, :generalization_error, :predictions, :importances
     attr_accessor :individuals, :id_to_fenotype
 
+    NODE_SPLIT_01_2 = "zero"
+    NODE_SPLIT_0_12 = "two"
+
     # Initialize Tree object with the configuration (as in Nimbus::Configuration.tree) options received.
     def initialize(options)
       @snp_total_count = options[:snp_total_count]
@@ -53,8 +56,14 @@ module Nimbus
     # Returns the prediction for that individual (the label of the final node reached by the individual).
     def self.traverse(tree_structure, data)
       return tree_structure if tree_structure.is_a?(Numeric) || tree_structure.is_a?(String)
+
       raise Nimbus::TreeError, "Forest data has invalid structure. Please check your forest data (file)." if !(tree_structure.is_a?(Hash) && tree_structure.keys.size == 1)
-
+
+      branch = tree_structure.values.first
+      split_type = branch[1].to_s
+      datum = data_traversing_value(data[tree_structure.keys.first - 1], split_type)
+
+      return self.traverse(branch[datum], data)
     end
 
     protected
@@ -63,13 +72,12 @@ module Nimbus
       (1..@snp_total_count).to_a.sample(@snp_sample_size).sort
     end
 
-    def build_branch(snp, split, y_hats, parent_y_hat)
-
-
-      node_2 = split[2].size == 0 ? label_node(parent_y_hat, []) : build_node(split[2], y_hats[2])
+    def build_branch(snp, split, split_type, y_hats, parent_y_hat)
+      node_a = split[0].size == 0 ? label_node(parent_y_hat, []) : build_node(split[0], y_hats[0])
+      node_b = split[1].size == 0 ? label_node(parent_y_hat, []) : build_node(split[1], y_hats[1])
 
       split_by_snp(snp)
-      return { snp => [
+      return { snp => [node_a, split_type, node_b] }
     end
 
     def label_node(value, ids)
@@ -78,24 +86,58 @@ module Nimbus
       label
     end
 
-    def
-
+    def split_by_snp_avegare_value(ids, snp)
+      split_012 = [[], [], []]
       ids.each do |i|
-
+        split_012[ @individuals[i].snp_list[snp-1] ] << @individuals[i].id
       end
-      split
+      # we split by the average number of 0,1,2 values.
+      # So if there are less or equal 0s than 2s the split is [0,1][2]
+      # and if there are more 0s than 2s the average will be <1 so the split is [0][1,2]
+      split_type = (split_012[0].size <= split_012[2].size ? NODE_SPLIT_01_2 : NODE_SPLIT_0_12)
+      split_type == NODE_SPLIT_01_2 ? split_012[0] += split_012[1] : split_012[2] += split_012[1]
+      split = [split_012[0], split_012[2]]
+      [split, split_type]
     rescue => ex
       raise Nimbus::TreeError, "Values for SNPs columns must be in [0, 1, 2]"
     end
 
+    def split_by_value(ids, snp, value)
+      split = [[], []]
+      ids.each do |i|
+        @individuals[i].snp_list[snp-1] > value ? (split[1] << @individuals[i].id) : (split[0] << @individuals[i].id)
+      end
+      split
+    rescue => ex
+      raise Nimbus::TreeError, "Values for SNPs columns must be numeric"
+    end
+
     def split_by_snp(x)
       @used_snps << x
     end
 
     def traverse_with_permutation(tree_structure, data, snp_to_permute, individual_to_permute)
       return tree_structure if tree_structure.is_a?(Numeric) || tree_structure.is_a?(String)
-
-
+
+      key = tree_structure.keys.first
+      branch = tree_structure.values.first
+      individual_data = (key == snp_to_permute ? individual_to_permute : data)
+      split_type = branch[1]
+      datum = data_traversing_value(individual_data[key - 1].to_i, split_type)
+
+      return traverse_with_permutation branch[datum], data, snp_to_permute, individual_to_permute
+    end
+
+    def data_traversing_value(datum, split_type)
+      Nimbus::Tree.data_traversing_value(datum, split_type)
+    end
+
+    def self.data_traversing_value(datum, split_type)
+      if datum == 1
+        return 0 if split_type == NODE_SPLIT_01_2
+        return 2 if split_type == NODE_SPLIT_0_12
+      end
+      datum
     end
 
   end
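With the two-way split, each stored node takes the shape { snp => [node_a, split_type, node_b] }, and data_traversing_value decides which side a heterozygote (genotype 1) follows: index 0 for a "zero" ([0,1][2]) split, index 2 for a "two" ([0][1,2]) split. A hand-built traversal sketch, assuming the gem is installed; the node labels and SNP index are invented for illustration:

require 'nimbus'

# Minimal hand-made structure in the new node layout; real trees come
# from a trained forest file.
tree_structure = { 5 => ['low', Nimbus::Tree::NODE_SPLIT_01_2, 'high'] }

# Individual genotypes indexed by SNP number - 1; SNP 5 is heterozygous (1).
individual_data = [0, 0, 0, 0, 1, 0]

# A heterozygote is routed to index 0 under a "zero" split...
Nimbus::Tree.data_traversing_value(1, Nimbus::Tree::NODE_SPLIT_01_2)  # => 0
# ...so traversal lands on the left node's label.
puts Nimbus::Tree.traverse(tree_structure, individual_data)           # => low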
data/spec/classification_tree_spec.rb
CHANGED
@@ -4,34 +4,34 @@ describe Nimbus::ClassificationTree do
 
   before(:each) do
     @config = Nimbus::Configuration.new
-    @config.load fixture_file('
+    @config.load fixture_file('classification/config.yml')
 
     @tree = Nimbus::ClassificationTree.new @config.tree
   end
 
   it "is initialized with tree config info" do
-    @tree.snp_total_count.
-    @tree.snp_sample_size.
-    @tree.node_min_size.
-    @tree.classes.size.
-    @tree.classes[0].
-    @tree.classes[1].
+    expect(@tree.snp_total_count).to eq 100
+    expect(@tree.snp_sample_size).to eq 33
+    expect(@tree.node_min_size).to eq 5
+    expect(@tree.classes.size).to eq 2
+    expect(@tree.classes[0]).to eq '0'
+    expect(@tree.classes[1]).to eq '1'
   end
 
   it "creates a tree structure when seeded with training data" do
     @config.load_training_data
-    @tree.structure.
+    expect(@tree.structure).to be_nil
     @tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
-    @tree.structure.
-    @tree.structure.
+    expect(@tree.structure).to_not be_nil
+    expect(@tree.structure).to be_kind_of Hash
 
-    @tree.structure.keys.first.
-    @tree.used_snps.
+    expect(@tree.structure.keys.first).to eq @tree.used_snps.last
+    expect(@tree.used_snps).to_not be_empty
   end
 
-  it "splits node
+  it "splits node when building a node and finds a suitable split" do
     @config.load_training_data
-
+    allow_any_instance_of(Nimbus::ClassificationTree).to receive(:snps_random_sample).and_return((68..100).to_a) #97 is best split
 
     @tree.individuals = @config.training_set.individuals
     @tree.id_to_fenotype = @config.training_set.ids_fenotypes
@@ -39,29 +39,29 @@ describe Nimbus::ClassificationTree do
     @tree.predictions = {}
 
     branch = @tree.build_node @config.training_set.all_ids, Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
-    branch.keys.size.
-    branch.keys.first.
-    branch[97].size.
-    branch[97][0].
-    branch[97][1]
-    branch[97][2].
+    expect(branch.keys.size).to eq 1
+    expect(branch.keys.first).to eq 97
+    expect(branch[97].size).to eq 3
+    expect(branch[97][0]).to be_kind_of Hash
+    expect([Nimbus::Tree::NODE_SPLIT_01_2, Nimbus::Tree::NODE_SPLIT_0_12]).to include(branch[97][1])
+    expect(branch[97][2]).to be_kind_of Hash
   end
 
   it "keeps track of all SNPs used for the tree" do
     @config.load_training_data
     snps = (33..65).to_a
-
-    @tree.used_snps.
+    allow_any_instance_of(Nimbus::ClassificationTree).to receive(:snps_random_sample).and_return(snps)
+    expect(@tree.used_snps).to be_nil
     @tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
-    @tree.used_snps.size.
+    expect(@tree.used_snps.size).to be > 4
     @tree.used_snps.each{|snp|
-      snps.include?(snp).
+      expect(snps.include?(snp)).to be true
     }
   end
 
   it "labels node when building a node and there is not a suitable split" do
     @config.load_training_data
-
+    allow_any_instance_of(Nimbus::ClassificationTree).to receive(:snps_random_sample).and_return([11])
 
     @tree.individuals = @config.training_set.individuals
     @tree.id_to_fenotype = @config.training_set.ids_fenotypes
@@ -69,9 +69,9 @@ describe Nimbus::ClassificationTree do
     @tree.predictions = {}
 
     branch = @tree.build_node @config.training_set.all_ids, Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
-    branch[
-    branch[
-    branch[
+    expect(branch[11][0]).to be_kind_of String
+    expect(branch[11][1]).to be_kind_of String
+    expect(branch[11][2]).to be_kind_of String
   end
 
   it "labels node when building a node with less individuals than the minimum node size" do
@@ -83,50 +83,50 @@ describe Nimbus::ClassificationTree do
     @tree.predictions = {}
 
     label = @tree.build_node [1, 10, 33], Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
-    label.
+    expect(label).to be_kind_of String
 
     label = @tree.build_node [2, 10], Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
-    label.
+    expect(label).to be_kind_of String
 
     label = @tree.build_node [1, 10, 33], Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
-    label.
+    expect(label).to be_kind_of String
 
     label = @tree.build_node [99, 22, 10, 33], Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
-    label.
+    expect(label).to be_kind_of String
   end
 
   it 'computes generalization error for the tree' do
     @config.load_training_data
    @tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
-    @tree.generalization_error.
+    expect(@tree.generalization_error).to be_nil
     @tree.generalization_error_from_oob((3..300).to_a)
-    @tree.generalization_error.
-    @tree.generalization_error.
-    @tree.generalization_error.
+    expect(@tree.generalization_error).to be_kind_of Numeric
+    expect(@tree.generalization_error).to be > 0.0
+    expect(@tree.generalization_error).to be < 1.0
   end
 
   it 'estimates importance for all SNPs' do
     @config.load_training_data
     @tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
-    @tree.importances.
+    expect(@tree.importances).to be_nil
     @tree.estimate_importances((200..533).to_a)
-    @tree.importances.
-    @tree.importances.keys.
-    (@tree.importances.keys - (1..100).to_a).
+    expect(@tree.importances).to be_kind_of Hash
+    expect(@tree.importances.keys).to_not be_empty
+    expect((@tree.importances.keys - (1..100).to_a)).to be_empty #all keys are snp indexes (100 snps in training file)
  end
 
   it 'get prediction for an individual pushing it down a tree structure' do
-    tree_structure = Psych.load(File.open fixture_file('
+    tree_structure = Psych.load(File.open fixture_file('classification/random_forest.yml')).first
     individual_data = [0]*100
     prediction = Nimbus::Tree.traverse tree_structure, individual_data
-    prediction.
+    expect(prediction).to eq '0'
 
-    individual_data[
-    individual_data[
-    individual_data[
-    individual_data[
+    individual_data[8-1] = 2
+    individual_data[29-1] = 0
+    individual_data[1-1] = 1
+    individual_data[7-1] = 1
     prediction = Nimbus::Tree.traverse tree_structure, individual_data
-    prediction.
+    expect(prediction).to eq '1'
   end
 
 end
data/spec/configuration_spec.rb
CHANGED
@@ -5,65 +5,65 @@ describe Nimbus::Configuration do
 
   it "loads configuration options from config file" do
     config = Nimbus::Configuration.new
-    config.load fixture_file('
+    config.load fixture_file('regression/config.yml')
 
-    config.training_file.
-    config.testing_file.
-    config.forest_file.
-    config.classes.
-    config.do_importances.
+    expect(config.training_file).to eq fixture_file('regression/training.data')
+    expect(config.testing_file).to eq fixture_file('regression/testing.data')
+    expect(config.forest_file).to eq fixture_file('regression/random_forest.yml')
+    expect(config.classes).to be_nil
+    expect(config.do_importances).to be
 
-    config.forest_size.
-    config.tree_SNP_sample_size.
-    config.tree_SNP_total_count.
-    config.tree_node_min_size.
+    expect(config.forest_size).to eq 3
+    expect(config.tree_SNP_sample_size).to eq 60
+    expect(config.tree_SNP_total_count).to eq 200
+    expect(config.tree_node_min_size).to eq 5
 
     config = Nimbus::Configuration.new
-    config.load fixture_file('
-
-    config.training_file.
-    config.testing_file.
-    config.forest_file.
-    config.classes.
-    config.do_importances.
-
-    config.forest_size.
-    config.tree_SNP_sample_size.
-    config.tree_SNP_total_count.
-    config.tree_node_min_size.
+    config.load fixture_file('classification/config.yml')
+
+    expect(config.training_file).to eq fixture_file('classification/training.data')
+    expect(config.testing_file).to eq fixture_file('classification/testing.data')
+    expect(config.forest_file).to eq fixture_file('classification/random_forest.yml')
+    expect(config.classes).to eq ['0','1']
+    expect(config.do_importances).to_not be
+
+    expect(config.forest_size).to eq 3
+    expect(config.tree_SNP_sample_size).to eq 33
+    expect(config.tree_SNP_total_count).to eq 100
+    expect(config.tree_node_min_size).to eq 5
   end
 
   it 'tree method return tree-related subset of options for regression trees' do
     config = Nimbus::Configuration.new
-    config.load fixture_file('
+    config.load fixture_file('regression/config.yml')
     tree_options = config.tree
 
-    tree_options[:snp_sample_size].
-    tree_options[:snp_total_count].
-    tree_options[:tree_node_min_size].
-    tree_options[:classes].
+    expect(tree_options[:snp_sample_size]).to_not be_nil
+    expect(tree_options[:snp_total_count]).to_not be_nil
+    expect(tree_options[:tree_node_min_size]).to_not be_nil
+    expect(tree_options[:classes]).to be_nil
   end
 
   it 'tree method return tree-related subset of options for classification trees' do
     config = Nimbus::Configuration.new
-    config.load fixture_file('
+    config.load fixture_file('classification/config.yml')
     tree_options = config.tree
 
-    tree_options[:snp_sample_size].
-    tree_options[:snp_total_count].
-    tree_options[:tree_node_min_size].
-    tree_options[:classes].
+    expect(tree_options[:snp_sample_size]).to_not be_nil
+    expect(tree_options[:snp_total_count]).to_not be_nil
+    expect(tree_options[:tree_node_min_size]).to_not be_nil
+    expect(tree_options[:classes]).to_not be_nil
   end
 
   it "creates a training set object from training data file" do
     config = Nimbus::Configuration.new
-    config.load fixture_file('
-    config.training_set.
+    config.load fixture_file('regression/config.yml')
+    expect(config.training_set).to be_nil
     config.load_training_data
-    config.training_set.
-    config.training_set.all_ids.sort.
+    expect(config.training_set).to be_kind_of Nimbus::TrainingSet
+    expect(config.training_set.all_ids.sort).to eq (1..800).to_a
 
-    File.open(fixture_file('
+    File.open(fixture_file('regression/training.data')) {|file|
       feno1, id1, *snp_list_1 = file.readline.split
       feno2, id2, *snp_list_2 = file.readline.split
       feno3, id3, *snp_list_3 = file.readline.split
@@ -72,9 +72,9 @@ describe Nimbus::Configuration do
       i2 = Nimbus::Individual.new(id2.to_i, feno2.to_f, snp_list_2.map{|snp| snp.to_i})
       i3 = Nimbus::Individual.new(id3.to_i, feno3.to_f, snp_list_3.map{|snp| snp.to_i})
 
-      config.training_set.individuals[id1.to_i].id.
-      config.training_set.individuals[id2.to_i].fenotype.
-      config.training_set.individuals[id3.to_i].snp_list.
+      expect(config.training_set.individuals[id1.to_i].id).to eq i1.id
+      expect(config.training_set.individuals[id2.to_i].fenotype).to eq i2.fenotype
+      expect(config.training_set.individuals[id3.to_i].snp_list).to eq i3.snp_list
 
       config.training_set.ids_fenotypes[id1.to_i] = feno1.to_f
       config.training_set.ids_fenotypes[id2.to_i] = feno2.to_f
@@ -84,38 +84,38 @@ describe Nimbus::Configuration do
 
   it "reads testing data and yields one individual at a time" do
     config = Nimbus::Configuration.new
-    config.load fixture_file('
+    config.load fixture_file('regression/config.yml')
 
     test_individuals = []
-    File.open(fixture_file('
+    File.open(fixture_file('regression/testing.data')) {|file|
       file.each do |line|
        data_id, *snp_list = line.strip.split
        test_individuals << Nimbus::Individual.new(data_id.to_i, nil, snp_list.map{|snp| snp.to_i})
      end
    }
-    test_individuals.size.
+    expect(test_individuals.size).to eq 200
    config.read_testing_data{|individual|
      test_individual = test_individuals.shift
-      individual.id.
-      individual.id.
-      individual.snp_list.
-      individual.snp_list.
+      expect(individual.id).to_not be_nil
+      expect(individual.id).to eq test_individual.id
+      expect(individual.snp_list).to_not be_empty
+      expect(individual.snp_list).to eq test_individual.snp_list
    }
  end
 
   it "creates a forest object loading data from a yaml file" do
     config = Nimbus::Configuration.new
-    config.load fixture_file('
+    config.load fixture_file('regression/config.yml')
 
-    trees = Psych.load(File.open fixture_file('
-    trees.first.keys.first.
-    trees.size.
+    trees = Psych.load(File.open fixture_file('regression/random_forest.yml'))
+    expect(trees.first.keys.first).to eq 176
+    expect(trees.size).to eq 3
 
     forest = config.load_forest
-    forest.
-    forest.trees[0].
-    forest.trees[1].
-    forest.trees.last.
+    expect(forest).to be_kind_of Nimbus::Forest
+    expect(forest.trees[0]).to eq trees.first
+    expect(forest.trees[1]).to eq trees[1]
+    expect(forest.trees.last).to eq trees[2]
   end
 
 end