nimbus 1.0.1 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +149 -0
- data/lib/nimbus.rb +15 -11
- data/lib/nimbus/application.rb +20 -23
- data/lib/nimbus/classification_tree.rb +111 -0
- data/lib/nimbus/configuration.rb +52 -37
- data/lib/nimbus/forest.rb +56 -20
- data/lib/nimbus/individual.rb +7 -7
- data/lib/nimbus/loss_functions.rb +44 -10
- data/lib/nimbus/regression_tree.rb +103 -0
- data/lib/nimbus/training_set.rb +4 -4
- data/lib/nimbus/tree.rb +20 -83
- data/lib/nimbus/version.rb +3 -0
- data/spec/classification_tree_spec.rb +132 -0
- data/spec/configuration_spec.rb +46 -19
- data/spec/fixtures/classification_config.yml +13 -0
- data/spec/fixtures/classification_random_forest.yml +922 -0
- data/spec/fixtures/classification_testing.data +500 -0
- data/spec/fixtures/classification_training.data +1000 -0
- data/spec/forest_spec.rb +109 -50
- data/spec/individual_spec.rb +2 -2
- data/spec/loss_functions_spec.rb +71 -0
- data/spec/nimbus_spec.rb +4 -4
- data/spec/regression_tree_spec.rb +129 -0
- data/spec/training_set_spec.rb +5 -5
- data/spec/tree_spec.rb +4 -115
- metadata +53 -45
- data/spec/fixtures/regression_snp_importances.txt +0 -200
- data/spec/fixtures/regression_testing_file_predictions.txt +0 -200
- data/spec/fixtures/regression_training_file_predictions.txt +0 -758
data/spec/forest_spec.rb
CHANGED
@@ -2,56 +2,115 @@
|
|
2
2
|
require File.dirname(__FILE__) + '/spec_helper'
|
3
3
|
|
4
4
|
describe Nimbus::Forest do
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
5
|
+
describe "Regression" do
|
6
|
+
before(:each) do
|
7
|
+
@config = Nimbus::Configuration.new
|
8
|
+
@config.load fixture_file('regression_config.yml')
|
9
|
+
@config.load_training_data
|
10
|
+
@forest = ::Nimbus::Forest.new @config
|
11
|
+
end
|
12
|
+
|
13
|
+
it 'grows a regression forest of N trees' do
|
14
|
+
@forest.trees.should == []
|
15
|
+
@config.forest_size.should == 3
|
16
|
+
@forest.should_not be_classification
|
17
|
+
@forest.should be_regression
|
18
|
+
@forest.grow
|
19
|
+
@forest.trees.size.should == @config.forest_size
|
20
|
+
@forest.trees.each{|t| t.should be_kind_of Hash}
|
21
|
+
end
|
22
|
+
|
23
|
+
it 'creates averaged predictions for individuals in the training set' do
|
24
|
+
@forest.predictions.should == {}
|
25
|
+
@forest.grow
|
26
|
+
(@forest.predictions.keys - (1..800).to_a ).should == [] # 800 individuals in the training file
|
27
|
+
@forest.predictions.values.each{|v| v.should be_kind_of Numeric}
|
28
|
+
end
|
29
|
+
|
30
|
+
it 'computes averaged SNP importances for every SNP' do
|
31
|
+
@forest.snp_importances.should == {}
|
32
|
+
@forest.grow
|
33
|
+
@forest.snp_importances.keys.sort.should == (1..200).to_a # 200 snps in the training file
|
34
|
+
@forest.snp_importances.values.each{|v| v.should be_kind_of Numeric}
|
35
|
+
end
|
36
|
+
|
37
|
+
it 'traverses a set of testing individuals through every tree in the forest and returns predictions' do
|
38
|
+
@forest = @config.load_forest
|
39
|
+
@forest.predictions.should == {}
|
40
|
+
|
41
|
+
tree_structure = YAML.load(File.open fixture_file('regression_random_forest.yml'))
|
42
|
+
expected_predictions = {}
|
43
|
+
@config.read_testing_data{|individual|
|
44
|
+
individual_prediction = 0.0
|
45
|
+
tree_structure.each do |t|
|
46
|
+
individual_prediction = (individual_prediction + Nimbus::Tree.traverse(t, individual.snp_list)).round(5)
|
47
|
+
end
|
48
|
+
expected_predictions[individual.id] = (individual_prediction / 3).round(5)
|
49
|
+
}
|
50
|
+
|
51
|
+
@forest.traverse
|
52
|
+
@forest.predictions.should == expected_predictions
|
53
|
+
end
|
54
|
+
|
55
|
+
it 'can output forest structure in YAML format' do
|
56
|
+
@forest = @config.load_forest
|
57
|
+
YAML.load(File.open fixture_file('regression_random_forest.yml')).should == YAML.load(@forest.to_yaml) # assert the round-trip; a bare == is evaluated and discarded, so the example could never fail
|
58
|
+
end
|
50
59
|
end
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
60
|
+
|
61
|
+
describe "Classification" do
|
62
|
+
before(:each) do
|
63
|
+
@config = Nimbus::Configuration.new
|
64
|
+
@config.load fixture_file('classification_config.yml')
|
65
|
+
@config.load_training_data
|
66
|
+
@forest = ::Nimbus::Forest.new @config
|
67
|
+
end
|
68
|
+
|
69
|
+
it 'grows a classification forest of N trees' do
|
70
|
+
@forest.trees.should == []
|
71
|
+
@config.forest_size.should == 3
|
72
|
+
@forest.should be_classification
|
73
|
+
@forest.should_not be_regression
|
74
|
+
@forest.grow
|
75
|
+
@forest.trees.size.should == @config.forest_size
|
76
|
+
@forest.trees.each{|t| t.should be_kind_of Hash}
|
77
|
+
end
|
78
|
+
|
79
|
+
it 'creates predictions for individuals in the training set' do
|
80
|
+
@forest.predictions.should == {}
|
81
|
+
@forest.grow
|
82
|
+
(@forest.predictions.keys - (1..1000).to_a ).should == [] # 1000 individuals in the training file
|
83
|
+
@forest.predictions.values.each{|v| v.should be_kind_of String}
|
84
|
+
end
|
85
|
+
|
86
|
+
it 'computes averaged SNP importances for every SNP' do
|
87
|
+
@forest.snp_importances.should == {}
|
88
|
+
@forest.grow
|
89
|
+
@forest.snp_importances.keys.sort.should == (1..100).to_a # 100 snps in the training file
|
90
|
+
@forest.snp_importances.values.each{|v| v.should be_kind_of Numeric}
|
91
|
+
end
|
92
|
+
|
93
|
+
it 'traverses a set of testing individuals through every tree in the forest and returns predictions' do
|
94
|
+
@forest = @config.load_forest
|
95
|
+
@forest.predictions.should == {}
|
96
|
+
|
97
|
+
tree_structure = YAML.load(File.open fixture_file('classification_random_forest.yml'))
|
98
|
+
expected_predictions = {}
|
99
|
+
@config.read_testing_data{|individual|
|
100
|
+
individual_prediction = []
|
101
|
+
tree_structure.each do |t|
|
102
|
+
individual_prediction << Nimbus::Tree.traverse(t, individual.snp_list)
|
103
|
+
end
|
104
|
+
expected_predictions[individual.id] = Nimbus::LossFunctions.majority_class_in_list(individual_prediction, @config.tree[:classes])
|
105
|
+
}
|
106
|
+
|
107
|
+
@forest.traverse
|
108
|
+
@forest.predictions.should == expected_predictions
|
109
|
+
end
|
110
|
+
|
111
|
+
it 'can output forest structure in YAML format' do
|
112
|
+
@forest = @config.load_forest
|
113
|
+
YAML.load(File.open fixture_file('classification_random_forest.yml')).should == YAML.load(@forest.to_yaml) # assert the round-trip; a bare == is evaluated and discarded, so the example could never fail
|
114
|
+
end
|
55
115
|
end
|
56
|
-
|
57
116
|
end
|
data/spec/individual_spec.rb
CHANGED
@@ -2,12 +2,12 @@
|
|
2
2
|
require File.dirname(__FILE__) + '/spec_helper'
|
3
3
|
|
4
4
|
describe Nimbus::Individual do
|
5
|
-
|
5
|
+
|
6
6
|
it "stores id, fenotype and SNPs information for an individual" do
|
7
7
|
@individual = Nimbus::Individual.new(11, 33.275, [1,0,2,1])
|
8
8
|
@individual.id.should == 11
|
9
9
|
@individual.fenotype.should == 33.275
|
10
10
|
@individual.snp_list.should == [1,0,2,1]
|
11
11
|
end
|
12
|
-
|
12
|
+
|
13
13
|
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
3
|
+
|
4
|
+
describe Nimbus::LossFunctions do
|
5
|
+
|
6
|
+
it "method for average" do
|
7
|
+
ids = [1,3,5,7]
|
8
|
+
values = {1 => 10, 2 => 5, 3 => 21, 4 => 8, 5 => 31, 7 => 11, 85 => 22}
|
9
|
+
|
10
|
+
Nimbus::LossFunctions.average(ids, values).should == 18.25 # (10 + 21 + 31 + 11 = 73)/4
|
11
|
+
end
|
12
|
+
|
13
|
+
it "method for mean squared error" do
|
14
|
+
ids = [3,7,85]
|
15
|
+
values = {1 => 10, 2 => 5, 3 => 21, 4 => 8, 5 => 31, 7 => 11, 85 => 22}
|
16
|
+
|
17
|
+
Nimbus::LossFunctions.mean_squared_error(ids, values).should == 74.0 # avg(21, 11, 22) = 18; sum of (x - 18)^2 = 9 + 49 + 16 = 74
|
18
|
+
end
|
19
|
+
|
20
|
+
it "method for quadratic_loss" do
|
21
|
+
ids = [1,4]
|
22
|
+
values = {1 => 10, 2 => 5, 3 => 21, 4 => 8, 5 => 31, 7 => 11, 85 => 22}
|
23
|
+
|
24
|
+
Nimbus::LossFunctions.quadratic_loss(ids, values).round(5).should == 1
|
25
|
+
end
|
26
|
+
|
27
|
+
it "quadratic loss is mean squared error averaged" do
|
28
|
+
ids = [1,2,3,4,5,7,85]
|
29
|
+
values = {1 => 10, 2 => 5, 3 => 21, 4 => 8, 5 => 31, 7 => 11, 85 => 22}
|
30
|
+
Nimbus::LossFunctions.quadratic_loss(ids, values).round(5).should == (Nimbus::LossFunctions.mean_squared_error(ids, values)/7 ).round(5)
|
31
|
+
end
|
32
|
+
|
33
|
+
it "method for squared difference" do
|
34
|
+
Nimbus::LossFunctions.squared_difference(50, 40).should == 100.0
|
35
|
+
Nimbus::LossFunctions.squared_difference(22, 10).should == 144.0
|
36
|
+
end
|
37
|
+
|
38
|
+
it "method for majority class" do
|
39
|
+
ids = [1,2,3,4,5,7,85]
|
40
|
+
values = {1 => 'B', 2 => 'C', 3 => 'A', 4 => 'A', 5 => 'C', 7 => 'B', 85 => 'C'} #3C, 2A, 2B
|
41
|
+
classes = ['A', 'B', 'C']
|
42
|
+
Nimbus::LossFunctions.majority_class(ids, values, classes).should == 'C'
|
43
|
+
end
|
44
|
+
|
45
|
+
it "majority class method selects randomly if more than one majority class" do
|
46
|
+
ids = [1,2,3,4,5,7,85,99]
|
47
|
+
values = {1 => 'B', 2 => 'C', 3 => 'A', 4 => 'A', 5 => 'C', 7 => 'B', 85 => 'C', 99 => 'A'} #3C, 3A, 2B
|
48
|
+
classes = ['A', 'B', 'C']
|
49
|
+
results = []
|
50
|
+
20.times do
|
51
|
+
results << Nimbus::LossFunctions.majority_class(ids, values, classes)
|
52
|
+
end
|
53
|
+
results.should include('A')
|
54
|
+
results.should include('C')
|
55
|
+
end
|
56
|
+
|
57
|
+
it "method for majority class in list" do
|
58
|
+
list = %w(A A A B B B C A B C A B A)
|
59
|
+
classes = ['A', 'B', 'C']
|
60
|
+
Nimbus::LossFunctions.majority_class_in_list(list, classes).should == 'A'
|
61
|
+
end
|
62
|
+
|
63
|
+
it "Gini index" do
|
64
|
+
ids = [1,2,3,4,5,7]
|
65
|
+
values = {1 => 'B', 2 => 'C', 3 => 'A', 4 => 'A', 5 => 'C', 7 => 'C'} #3C, 2A, 1B
|
66
|
+
classes = ['A', 'B', 'C']
|
67
|
+
# Gini = 1 - ( (3/6)^2 + (2/6)^2 + (1/6)^2 ) = 0.61111
|
68
|
+
Nimbus::LossFunctions.gini_index(ids, values, classes).should == 0.61111
|
69
|
+
end
|
70
|
+
|
71
|
+
end
|
data/spec/nimbus_spec.rb
CHANGED
@@ -3,16 +3,16 @@ require File.dirname(__FILE__) + '/spec_helper'
|
|
3
3
|
|
4
4
|
|
5
5
|
describe 'Nimbus module' do
|
6
|
-
|
6
|
+
|
7
7
|
it "manages a Nimbus::Application object" do
|
8
8
|
app = Nimbus.application
|
9
9
|
app.should be_kind_of Nimbus::Application
|
10
10
|
end
|
11
|
-
|
11
|
+
|
12
12
|
it "accepts setting an external Nimbus::Application" do
|
13
|
-
app = Nimbus::Application.new
|
13
|
+
app = Nimbus::Application.new
|
14
14
|
Nimbus.application = app
|
15
15
|
Nimbus.application.should == app
|
16
16
|
end
|
17
|
-
|
17
|
+
|
18
18
|
end
|
@@ -0,0 +1,129 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
|
3
|
+
describe Nimbus::RegressionTree do
|
4
|
+
|
5
|
+
before(:each) do
|
6
|
+
@config = Nimbus::Configuration.new
|
7
|
+
@config.load fixture_file('regression_config.yml')
|
8
|
+
|
9
|
+
@tree = Nimbus::RegressionTree.new @config.tree
|
10
|
+
end
|
11
|
+
|
12
|
+
it "is initialized with tree config info" do
|
13
|
+
@tree.snp_total_count.should == 200
|
14
|
+
@tree.snp_sample_size.should == 60
|
15
|
+
@tree.node_min_size.should == 5
|
16
|
+
end
|
17
|
+
|
18
|
+
it "creates a tree structure when seeded with training data" do
|
19
|
+
@config.load_training_data
|
20
|
+
@tree.structure.should be_nil
|
21
|
+
@tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
22
|
+
@tree.structure.should_not be_nil
|
23
|
+
@tree.structure.should be_kind_of Hash
|
24
|
+
|
25
|
+
@tree.structure.keys.first.should == @tree.used_snps.last
|
26
|
+
@tree.used_snps.should_not be_empty
|
27
|
+
end
|
28
|
+
|
29
|
+
it "split node in three when building a node and finds a suitable split" do
|
30
|
+
@config.load_training_data
|
31
|
+
@tree.stub!(:snps_random_sample).and_return((141..200).to_a) #189 is best split
|
32
|
+
|
33
|
+
@tree.individuals = @config.training_set.individuals
|
34
|
+
@tree.id_to_fenotype = @config.training_set.ids_fenotypes
|
35
|
+
@tree.used_snps = []
|
36
|
+
@tree.predictions = {}
|
37
|
+
|
38
|
+
branch = @tree.build_node @config.training_set.all_ids, Nimbus::LossFunctions.average(@config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
39
|
+
branch.keys.size.should == 1
|
40
|
+
branch.keys.first.should == 189
|
41
|
+
branch[189].size.should == 3
|
42
|
+
branch[189][0].should be_kind_of Hash
|
43
|
+
branch[189][1].should be_kind_of Hash
|
44
|
+
branch[189][2].should be_kind_of Hash
|
45
|
+
end
|
46
|
+
|
47
|
+
it "keeps track of all SNPs used for the tree" do
|
48
|
+
@config.load_training_data
|
49
|
+
snps = (131..190).to_a
|
50
|
+
@tree.stub!(:snps_random_sample).and_return(snps)
|
51
|
+
@tree.used_snps.should be_nil
|
52
|
+
@tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
53
|
+
@tree.used_snps.size.should > 4
|
54
|
+
@tree.used_snps.each{|snp|
|
55
|
+
snps.include?(snp).should be_true
|
56
|
+
}
|
57
|
+
end
|
58
|
+
|
59
|
+
it "labels node when building a node and there is not a suitable split" do
|
60
|
+
@config.load_training_data
|
61
|
+
@tree.stub!(:snps_random_sample).and_return([33])
|
62
|
+
|
63
|
+
@tree.individuals = @config.training_set.individuals
|
64
|
+
@tree.id_to_fenotype = @config.training_set.ids_fenotypes
|
65
|
+
@tree.used_snps = []
|
66
|
+
@tree.predictions = {}
|
67
|
+
|
68
|
+
branch = @tree.build_node @config.training_set.all_ids, Nimbus::LossFunctions.average(@config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
69
|
+
branch[33][0].should be_kind_of Numeric
|
70
|
+
branch[33][1].should be_kind_of Numeric
|
71
|
+
branch[33][2].should be_kind_of Numeric
|
72
|
+
end
|
73
|
+
|
74
|
+
it "labels node when building a node with less individuals than the minimum node size" do
|
75
|
+
@config.load_training_data
|
76
|
+
|
77
|
+
@tree.individuals = @config.training_set.individuals
|
78
|
+
@tree.id_to_fenotype = @config.training_set.ids_fenotypes
|
79
|
+
@tree.used_snps = []
|
80
|
+
@tree.predictions = {}
|
81
|
+
|
82
|
+
label = @tree.build_node [1, 10, 33], Nimbus::LossFunctions.average(@config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
83
|
+
label.should be_kind_of Numeric
|
84
|
+
|
85
|
+
label = @tree.build_node [2, 10], Nimbus::LossFunctions.average(@config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
86
|
+
label.should be_kind_of Numeric
|
87
|
+
|
88
|
+
label = @tree.build_node [1, 10, 33], Nimbus::LossFunctions.average(@config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
89
|
+
label.should be_kind_of Numeric
|
90
|
+
|
91
|
+
label = @tree.build_node [108, 22, 10, 33], Nimbus::LossFunctions.average(@config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
92
|
+
label.should be_kind_of Numeric
|
93
|
+
end
|
94
|
+
|
95
|
+
it 'computes generalization error for the tree' do
|
96
|
+
@config.load_training_data
|
97
|
+
@tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
98
|
+
@tree.generalization_error.should be_nil
|
99
|
+
@tree.generalization_error_from_oob((2..200).to_a)
|
100
|
+
@tree.generalization_error.should be_kind_of Numeric
|
101
|
+
@tree.generalization_error.should > 0.0
|
102
|
+
@tree.generalization_error.should < 1.0
|
103
|
+
end
|
104
|
+
|
105
|
+
it 'estimates importance for all SNPs' do
|
106
|
+
@config.load_training_data
|
107
|
+
@tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
108
|
+
@tree.importances.should be_nil
|
109
|
+
@tree.estimate_importances((300..533).to_a)
|
110
|
+
@tree.importances.should be_kind_of Hash
|
111
|
+
@tree.importances.keys.should_not be_empty
|
112
|
+
(@tree.importances.keys - (1..200).to_a).should be_empty #all keys are snp indexes (200 snps in training file)
|
113
|
+
end
|
114
|
+
|
115
|
+
it 'get prediction for an individual pushing it down a tree structure' do
|
116
|
+
tree_structure = YAML.load(File.open fixture_file('regression_random_forest.yml')).first
|
117
|
+
individual_data = [0]*200
|
118
|
+
prediction = Nimbus::Tree.traverse tree_structure, individual_data
|
119
|
+
prediction.should == 0.25043
|
120
|
+
|
121
|
+
individual_data[189-1] = 1
|
122
|
+
individual_data[4-1] = 1
|
123
|
+
individual_data[62-1] = 2
|
124
|
+
individual_data[146-1] = 2
|
125
|
+
prediction = Nimbus::Tree.traverse tree_structure, individual_data
|
126
|
+
prediction.should == -0.9854
|
127
|
+
end
|
128
|
+
|
129
|
+
end
|
data/spec/training_set_spec.rb
CHANGED
@@ -2,24 +2,24 @@
|
|
2
2
|
require File.dirname(__FILE__) + '/spec_helper'
|
3
3
|
|
4
4
|
describe Nimbus::TrainingSet do
|
5
|
-
|
5
|
+
|
6
6
|
it "stores individuals list and fenotype data for them" do
|
7
7
|
i1 = Nimbus::Individual.new 1, 11.0, [1,0,2,1]
|
8
8
|
i2 = Nimbus::Individual.new 2, 22.0, [2,1,2,2]
|
9
9
|
i3 = Nimbus::Individual.new 3, 33.0, [0,2,1,0]
|
10
10
|
@training_set = Nimbus::TrainingSet.new [i1, i3], {i1.id => 11.0, i3.id => 33.0}
|
11
|
-
|
11
|
+
|
12
12
|
@training_set.individuals.should == [i1, i3]
|
13
13
|
@training_set.ids_fenotypes.should == {i1.id => 11.0, i3.id => 33.0}
|
14
14
|
end
|
15
|
-
|
15
|
+
|
16
16
|
it "keeps track of ids of all individuals in the training set" do
|
17
17
|
i1 = Nimbus::Individual.new 1, 11.0, [1,0,2,1]
|
18
18
|
i2 = Nimbus::Individual.new 2, 22.0, [2,1,2,2]
|
19
19
|
i3 = Nimbus::Individual.new 3, 33.0, [0,2,1,0]
|
20
20
|
@training_set = Nimbus::TrainingSet.new [i1, i3], {i1.id => 11.0, i3.id => 33.0}
|
21
|
-
|
21
|
+
|
22
22
|
@training_set.all_ids.should == [1,3]
|
23
23
|
end
|
24
|
-
|
24
|
+
|
25
25
|
end
|
data/spec/tree_spec.rb
CHANGED
@@ -2,129 +2,18 @@
|
|
2
2
|
require File.dirname(__FILE__) + '/spec_helper'
|
3
3
|
|
4
4
|
describe Nimbus::Tree do
|
5
|
-
|
5
|
+
|
6
6
|
before(:each) do
|
7
7
|
@config = Nimbus::Configuration.new
|
8
8
|
@config.load fixture_file('regression_config.yml')
|
9
|
-
|
9
|
+
|
10
10
|
@tree = Nimbus::Tree.new @config.tree
|
11
11
|
end
|
12
|
-
|
12
|
+
|
13
13
|
it "is initialized with tree config info" do
|
14
14
|
@tree.snp_total_count.should == 200
|
15
15
|
@tree.snp_sample_size.should == 60
|
16
16
|
@tree.node_min_size.should == 5
|
17
17
|
end
|
18
|
-
|
19
|
-
it "creates a tree structure when seeded with training data" do
|
20
|
-
@config.load_training_data
|
21
|
-
@tree.structure.should be_nil
|
22
|
-
@tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
23
|
-
@tree.structure.should_not be_nil
|
24
|
-
@tree.structure.should be_kind_of Hash
|
25
|
-
|
26
|
-
@tree.structure.keys.first.should == @tree.used_snps.last
|
27
|
-
@tree.used_snps.should_not be_empty
|
28
|
-
end
|
29
|
-
|
30
|
-
it "split node in three when building a node and finds a suitable split" do
|
31
|
-
@config.load_training_data
|
32
|
-
@tree.stub!(:snps_random_sample).and_return((141..200).to_a) #189 is best split
|
33
|
-
|
34
|
-
@tree.individuals = @config.training_set.individuals
|
35
|
-
@tree.id_to_fenotype = @config.training_set.ids_fenotypes
|
36
|
-
@tree.used_snps = []
|
37
|
-
@tree.predictions = {}
|
38
|
-
|
39
|
-
branch = @tree.build_node @config.training_set.all_ids, Nimbus::LossFunctions.average(@config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
40
|
-
branch.keys.size.should == 1
|
41
|
-
branch.keys.first.should == 189
|
42
|
-
branch[189].size.should == 3
|
43
|
-
branch[189][0].should be_kind_of Hash
|
44
|
-
branch[189][1].should be_kind_of Hash
|
45
|
-
branch[189][2].should be_kind_of Hash
|
46
|
-
end
|
47
|
-
|
48
|
-
it "keeps track of all SNPs used for the tree" do
|
49
|
-
@config.load_training_data
|
50
|
-
snps = (131..190).to_a
|
51
|
-
@tree.stub!(:snps_random_sample).and_return(snps)
|
52
|
-
@tree.used_snps.should be_nil
|
53
|
-
@tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
54
|
-
@tree.used_snps.size.should > 4
|
55
|
-
@tree.used_snps.each{|snp|
|
56
|
-
snps.include?(snp).should be_true
|
57
|
-
}
|
58
|
-
end
|
59
|
-
|
60
|
-
it "labels node when building a node and there is not a suitable split" do
|
61
|
-
@config.load_training_data
|
62
|
-
@tree.stub!(:snps_random_sample).and_return([33])
|
63
|
-
|
64
|
-
@tree.individuals = @config.training_set.individuals
|
65
|
-
@tree.id_to_fenotype = @config.training_set.ids_fenotypes
|
66
|
-
@tree.used_snps = []
|
67
|
-
@tree.predictions = {}
|
68
|
-
|
69
|
-
branch = @tree.build_node @config.training_set.all_ids, Nimbus::LossFunctions.average(@config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
70
|
-
branch[33][0].should be_kind_of Numeric
|
71
|
-
branch[33][1].should be_kind_of Numeric
|
72
|
-
branch[33][2].should be_kind_of Numeric
|
73
|
-
end
|
74
|
-
|
75
|
-
it "labels node when building a node with less individuals than the minimum node size" do
|
76
|
-
@config.load_training_data
|
77
|
-
|
78
|
-
@tree.individuals = @config.training_set.individuals
|
79
|
-
@tree.id_to_fenotype = @config.training_set.ids_fenotypes
|
80
|
-
@tree.used_snps = []
|
81
|
-
@tree.predictions = {}
|
82
|
-
|
83
|
-
label = @tree.build_node [1, 10, 33], Nimbus::LossFunctions.average(@config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
84
|
-
label.should be_kind_of Numeric
|
85
|
-
|
86
|
-
label = @tree.build_node [2, 10], Nimbus::LossFunctions.average(@config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
87
|
-
label.should be_kind_of Numeric
|
88
|
-
|
89
|
-
label = @tree.build_node [1, 10, 33], Nimbus::LossFunctions.average(@config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
90
|
-
label.should be_kind_of Numeric
|
91
|
-
|
92
|
-
label = @tree.build_node [108, 22, 10, 33], Nimbus::LossFunctions.average(@config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
93
|
-
label.should be_kind_of Numeric
|
94
|
-
end
|
95
|
-
|
96
|
-
it 'computes generalization error for the tree' do
|
97
|
-
@config.load_training_data
|
98
|
-
@tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
99
|
-
@tree.generalization_error.should be_nil
|
100
|
-
@tree.generalization_error_from_oob((2..200).to_a)
|
101
|
-
@tree.generalization_error.should be_kind_of Numeric
|
102
|
-
@tree.generalization_error.should > 0.0
|
103
|
-
@tree.generalization_error.should < 100.0
|
104
|
-
end
|
105
|
-
|
106
|
-
it 'estimates importance for all SNPs' do
|
107
|
-
@config.load_training_data
|
108
|
-
@tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
109
|
-
@tree.importances.should be_nil
|
110
|
-
@tree.estimate_importances((300..533).to_a)
|
111
|
-
@tree.importances.should be_kind_of Hash
|
112
|
-
@tree.importances.keys.should_not be_empty
|
113
|
-
(@tree.importances.keys - (1..200).to_a).should be_empty
|
114
|
-
end
|
115
|
-
|
116
|
-
it 'get prediction for an individual pushing it down a tree structure' do
|
117
|
-
tree_structure = YAML.load(File.open fixture_file('regression_random_forest.yml')).first
|
118
|
-
individual_data = [0]*200
|
119
|
-
prediction = Nimbus::Tree.traverse tree_structure, individual_data
|
120
|
-
prediction.should == 0.25043
|
121
|
-
|
122
|
-
individual_data[189-1] = 1
|
123
|
-
individual_data[4-1] = 1
|
124
|
-
individual_data[62-1] = 2
|
125
|
-
individual_data[146-1] = 2
|
126
|
-
prediction = Nimbus::Tree.traverse tree_structure, individual_data
|
127
|
-
prediction.should == -0.9854
|
128
|
-
end
|
129
|
-
|
18
|
+
|
130
19
|
end
|