nimbus 1.0.1 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +149 -0
- data/lib/nimbus.rb +15 -11
- data/lib/nimbus/application.rb +20 -23
- data/lib/nimbus/classification_tree.rb +111 -0
- data/lib/nimbus/configuration.rb +52 -37
- data/lib/nimbus/forest.rb +56 -20
- data/lib/nimbus/individual.rb +7 -7
- data/lib/nimbus/loss_functions.rb +44 -10
- data/lib/nimbus/regression_tree.rb +103 -0
- data/lib/nimbus/training_set.rb +4 -4
- data/lib/nimbus/tree.rb +20 -83
- data/lib/nimbus/version.rb +3 -0
- data/spec/classification_tree_spec.rb +132 -0
- data/spec/configuration_spec.rb +46 -19
- data/spec/fixtures/classification_config.yml +13 -0
- data/spec/fixtures/classification_random_forest.yml +922 -0
- data/spec/fixtures/classification_testing.data +500 -0
- data/spec/fixtures/classification_training.data +1000 -0
- data/spec/forest_spec.rb +109 -50
- data/spec/individual_spec.rb +2 -2
- data/spec/loss_functions_spec.rb +71 -0
- data/spec/nimbus_spec.rb +4 -4
- data/spec/regression_tree_spec.rb +129 -0
- data/spec/training_set_spec.rb +5 -5
- data/spec/tree_spec.rb +4 -115
- metadata +53 -45
- data/spec/fixtures/regression_snp_importances.txt +0 -200
- data/spec/fixtures/regression_testing_file_predictions.txt +0 -200
- data/spec/fixtures/regression_training_file_predictions.txt +0 -758
data/spec/forest_spec.rb
CHANGED
@@ -2,56 +2,115 @@
|
|
2
2
|
require File.dirname(__FILE__) + '/spec_helper'
|
3
3
|
|
4
4
|
describe Nimbus::Forest do
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
5
|
+
describe "Regression" do
|
6
|
+
before(:each) do
|
7
|
+
@config = Nimbus::Configuration.new
|
8
|
+
@config.load fixture_file('regression_config.yml')
|
9
|
+
@config.load_training_data
|
10
|
+
@forest = ::Nimbus::Forest.new @config
|
11
|
+
end
|
12
|
+
|
13
|
+
it 'grows a regression forest of N trees' do
|
14
|
+
@forest.trees.should == []
|
15
|
+
@config.forest_size.should == 3
|
16
|
+
@forest.should_not be_classification
|
17
|
+
@forest.should be_regression
|
18
|
+
@forest.grow
|
19
|
+
@forest.trees.size.should == @config.forest_size
|
20
|
+
@forest.trees.each{|t| t.should be_kind_of Hash}
|
21
|
+
end
|
22
|
+
|
23
|
+
it 'creates averaged predictions for individuals in the training set' do
|
24
|
+
@forest.predictions.should == {}
|
25
|
+
@forest.grow
|
26
|
+
(@forest.predictions.keys - (1..800).to_a ).should == [] # 800 individuals in the training file
|
27
|
+
@forest.predictions.values.each{|v| v.should be_kind_of Numeric}
|
28
|
+
end
|
29
|
+
|
30
|
+
it 'computes averaged SNP importances for every SNP' do
|
31
|
+
@forest.snp_importances.should == {}
|
32
|
+
@forest.grow
|
33
|
+
@forest.snp_importances.keys.sort.should == (1..200).to_a # 200 snps in the training file
|
34
|
+
@forest.snp_importances.values.each{|v| v.should be_kind_of Numeric}
|
35
|
+
end
|
36
|
+
|
37
|
+
it 'traverses a set of testing individuals through every tree in the forest and returns predictions' do
|
38
|
+
@forest = @config.load_forest
|
39
|
+
@forest.predictions.should == {}
|
40
|
+
|
41
|
+
tree_structure = YAML.load(File.open fixture_file('regression_random_forest.yml'))
|
42
|
+
expected_predictions = {}
|
43
|
+
@config.read_testing_data{|individual|
|
44
|
+
individual_prediction = 0.0
|
45
|
+
tree_structure.each do |t|
|
46
|
+
individual_prediction = (individual_prediction + Nimbus::Tree.traverse(t, individual.snp_list)).round(5)
|
47
|
+
end
|
48
|
+
expected_predictions[individual.id] = (individual_prediction / 3).round(5)
|
49
|
+
}
|
50
|
+
|
51
|
+
@forest.traverse
|
52
|
+
@forest.predictions.should == expected_predictions
|
53
|
+
end
|
54
|
+
|
55
|
+
it 'can output forest structure in YAML format' do
|
56
|
+
@forest = @config.load_forest
|
57
|
+
# Assert (rather than merely evaluate) that the forest's YAML dump parses to the same structure as the fixture file.
YAML.load(File.open fixture_file('regression_random_forest.yml')).should == YAML.load(@forest.to_yaml)
|
58
|
+
end
|
50
59
|
end
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
60
|
+
|
61
|
+
describe "Classification" do
|
62
|
+
before(:each) do
|
63
|
+
@config = Nimbus::Configuration.new
|
64
|
+
@config.load fixture_file('classification_config.yml')
|
65
|
+
@config.load_training_data
|
66
|
+
@forest = ::Nimbus::Forest.new @config
|
67
|
+
end
|
68
|
+
|
69
|
+
it 'grows a classification forest of N trees' do
|
70
|
+
@forest.trees.should == []
|
71
|
+
@config.forest_size.should == 3
|
72
|
+
@forest.should be_classification
|
73
|
+
@forest.should_not be_regression
|
74
|
+
@forest.grow
|
75
|
+
@forest.trees.size.should == @config.forest_size
|
76
|
+
@forest.trees.each{|t| t.should be_kind_of Hash}
|
77
|
+
end
|
78
|
+
|
79
|
+
it 'creates predictions for individuals in the training set' do
|
80
|
+
@forest.predictions.should == {}
|
81
|
+
@forest.grow
|
82
|
+
(@forest.predictions.keys - (1..1000).to_a ).should == [] # 1000 individuals in the training file
|
83
|
+
@forest.predictions.values.each{|v| v.should be_kind_of String}
|
84
|
+
end
|
85
|
+
|
86
|
+
it 'computes averaged SNP importances for every SNP' do
|
87
|
+
@forest.snp_importances.should == {}
|
88
|
+
@forest.grow
|
89
|
+
@forest.snp_importances.keys.sort.should == (1..100).to_a # 100 snps in the training file
|
90
|
+
@forest.snp_importances.values.each{|v| v.should be_kind_of Numeric}
|
91
|
+
end
|
92
|
+
|
93
|
+
it 'traverses a set of testing individuals through every tree in the forest and returns predictions' do
|
94
|
+
@forest = @config.load_forest
|
95
|
+
@forest.predictions.should == {}
|
96
|
+
|
97
|
+
tree_structure = YAML.load(File.open fixture_file('classification_random_forest.yml'))
|
98
|
+
expected_predictions = {}
|
99
|
+
@config.read_testing_data{|individual|
|
100
|
+
individual_prediction = []
|
101
|
+
tree_structure.each do |t|
|
102
|
+
individual_prediction << Nimbus::Tree.traverse(t, individual.snp_list)
|
103
|
+
end
|
104
|
+
expected_predictions[individual.id] = Nimbus::LossFunctions.majority_class_in_list(individual_prediction, @config.tree[:classes])
|
105
|
+
}
|
106
|
+
|
107
|
+
@forest.traverse
|
108
|
+
@forest.predictions.should == expected_predictions
|
109
|
+
end
|
110
|
+
|
111
|
+
it 'can output forest structure in YAML format' do
|
112
|
+
@forest = @config.load_forest
|
113
|
+
# Assert (rather than merely evaluate) that the forest's YAML dump parses to the same structure as the fixture file.
YAML.load(File.open fixture_file('classification_random_forest.yml')).should == YAML.load(@forest.to_yaml)
|
114
|
+
end
|
55
115
|
end
|
56
|
-
|
57
116
|
end
|
data/spec/individual_spec.rb
CHANGED
@@ -2,12 +2,12 @@
|
|
2
2
|
require File.dirname(__FILE__) + '/spec_helper'
|
3
3
|
|
4
4
|
describe Nimbus::Individual do
|
5
|
-
|
5
|
+
|
6
6
|
it "stores id, fenotype and SNPs information for an individual" do
|
7
7
|
@individual = Nimbus::Individual.new(11, 33.275, [1,0,2,1])
|
8
8
|
@individual.id.should == 11
|
9
9
|
@individual.fenotype.should == 33.275
|
10
10
|
@individual.snp_list.should == [1,0,2,1]
|
11
11
|
end
|
12
|
-
|
12
|
+
|
13
13
|
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
3
|
+
|
4
|
+
describe Nimbus::LossFunctions do
|
5
|
+
|
6
|
+
it "method for average" do
|
7
|
+
ids = [1,3,5,7]
|
8
|
+
values = {1 => 10, 2 => 5, 3 => 21, 4 => 8, 5 => 31, 7 => 11, 85 => 22}
|
9
|
+
|
10
|
+
Nimbus::LossFunctions.average(ids, values).should == 18.25 # (10 + 21 + 31 + 11 = 73)/4
|
11
|
+
end
|
12
|
+
|
13
|
+
it "method for mean squared error" do
|
14
|
+
ids = [3,7,85]
|
15
|
+
values = {1 => 10, 2 => 5, 3 => 21, 4 => 8, 5 => 31, 7 => 11, 85 => 22}
|
16
|
+
|
17
|
+
Nimbus::LossFunctions.mean_squared_error(ids, values).should == 74.0 # avg(21, 11, 22) = 18; sum of (x - 18)^2 = 9 + 49 + 16 = 74
|
18
|
+
end
|
19
|
+
|
20
|
+
it "method for quadratic_loss" do
|
21
|
+
ids = [1,4]
|
22
|
+
values = {1 => 10, 2 => 5, 3 => 21, 4 => 8, 5 => 31, 7 => 11, 85 => 22}
|
23
|
+
|
24
|
+
Nimbus::LossFunctions.quadratic_loss(ids, values).round(5).should == 1
|
25
|
+
end
|
26
|
+
|
27
|
+
it "quadratic loss is mean squared error averaged" do
|
28
|
+
ids = [1,2,3,4,5,7,85]
|
29
|
+
values = {1 => 10, 2 => 5, 3 => 21, 4 => 8, 5 => 31, 7 => 11, 85 => 22}
|
30
|
+
Nimbus::LossFunctions.quadratic_loss(ids, values).round(5).should == (Nimbus::LossFunctions.mean_squared_error(ids, values)/7 ).round(5)
|
31
|
+
end
|
32
|
+
|
33
|
+
it "method for squared difference" do
|
34
|
+
Nimbus::LossFunctions.squared_difference(50, 40).should == 100.0
|
35
|
+
Nimbus::LossFunctions.squared_difference(22, 10).should == 144.0
|
36
|
+
end
|
37
|
+
|
38
|
+
it "method for majority class" do
|
39
|
+
ids = [1,2,3,4,5,7,85]
|
40
|
+
values = {1 => 'B', 2 => 'C', 3 => 'A', 4 => 'A', 5 => 'C', 7 => 'B', 85 => 'C'} #3C, 2A, 2B
|
41
|
+
classes = ['A', 'B', 'C']
|
42
|
+
Nimbus::LossFunctions.majority_class(ids, values, classes).should == 'C'
|
43
|
+
end
|
44
|
+
|
45
|
+
it "majority class method selects randomly if more than one majority class" do
|
46
|
+
ids = [1,2,3,4,5,7,85,99]
|
47
|
+
values = {1 => 'B', 2 => 'C', 3 => 'A', 4 => 'A', 5 => 'C', 7 => 'B', 85 => 'C', 99 => 'A'} #3C, 3A, 2B
|
48
|
+
classes = ['A', 'B', 'C']
|
49
|
+
results = []
|
50
|
+
20.times do
|
51
|
+
results << Nimbus::LossFunctions.majority_class(ids, values, classes)
|
52
|
+
end
|
53
|
+
results.should include('A')
|
54
|
+
results.should include('C')
|
55
|
+
end
|
56
|
+
|
57
|
+
it "method for majority class in list" do
|
58
|
+
list = %w(A A A B B B C A B C A B A)
|
59
|
+
classes = ['A', 'B', 'C']
|
60
|
+
Nimbus::LossFunctions.majority_class_in_list(list, classes).should == 'A'
|
61
|
+
end
|
62
|
+
|
63
|
+
it "Gini index" do
|
64
|
+
ids = [1,2,3,4,5,7]
|
65
|
+
values = {1 => 'B', 2 => 'C', 3 => 'A', 4 => 'A', 5 => 'C', 7 => 'C'} #3C, 2A, 1B
|
66
|
+
classes = ['A', 'B', 'C']
|
67
|
+
# Gini = 1 - ( (3/6)^2 + (2/6)^2 + (1/6)^2 ) = 0.61111
|
68
|
+
Nimbus::LossFunctions.gini_index(ids, values, classes).should == 0.61111
|
69
|
+
end
|
70
|
+
|
71
|
+
end
|
data/spec/nimbus_spec.rb
CHANGED
@@ -3,16 +3,16 @@ require File.dirname(__FILE__) + '/spec_helper'
|
|
3
3
|
|
4
4
|
|
5
5
|
describe 'Nimbus module' do
|
6
|
-
|
6
|
+
|
7
7
|
it "manages a Nimbus::Application object" do
|
8
8
|
app = Nimbus.application
|
9
9
|
app.should be_kind_of Nimbus::Application
|
10
10
|
end
|
11
|
-
|
11
|
+
|
12
12
|
it "accepts setting an external Nimbus::Application" do
|
13
|
-
app = Nimbus::Application.new
|
13
|
+
app = Nimbus::Application.new
|
14
14
|
Nimbus.application = app
|
15
15
|
Nimbus.application.should == app
|
16
16
|
end
|
17
|
-
|
17
|
+
|
18
18
|
end
|
@@ -0,0 +1,129 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
|
3
|
+
describe Nimbus::RegressionTree do
|
4
|
+
|
5
|
+
before(:each) do
|
6
|
+
@config = Nimbus::Configuration.new
|
7
|
+
@config.load fixture_file('regression_config.yml')
|
8
|
+
|
9
|
+
@tree = Nimbus::RegressionTree.new @config.tree
|
10
|
+
end
|
11
|
+
|
12
|
+
it "is initialized with tree config info" do
|
13
|
+
@tree.snp_total_count.should == 200
|
14
|
+
@tree.snp_sample_size.should == 60
|
15
|
+
@tree.node_min_size.should == 5
|
16
|
+
end
|
17
|
+
|
18
|
+
it "creates a tree structure when seeded with training data" do
|
19
|
+
@config.load_training_data
|
20
|
+
@tree.structure.should be_nil
|
21
|
+
@tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
22
|
+
@tree.structure.should_not be_nil
|
23
|
+
@tree.structure.should be_kind_of Hash
|
24
|
+
|
25
|
+
@tree.structure.keys.first.should == @tree.used_snps.last
|
26
|
+
@tree.used_snps.should_not be_empty
|
27
|
+
end
|
28
|
+
|
29
|
+
it "split node in three when building a node and finds a suitable split" do
|
30
|
+
@config.load_training_data
|
31
|
+
@tree.stub!(:snps_random_sample).and_return((141..200).to_a) #189 is best split
|
32
|
+
|
33
|
+
@tree.individuals = @config.training_set.individuals
|
34
|
+
@tree.id_to_fenotype = @config.training_set.ids_fenotypes
|
35
|
+
@tree.used_snps = []
|
36
|
+
@tree.predictions = {}
|
37
|
+
|
38
|
+
branch = @tree.build_node @config.training_set.all_ids, Nimbus::LossFunctions.average(@config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
39
|
+
branch.keys.size.should == 1
|
40
|
+
branch.keys.first.should == 189
|
41
|
+
branch[189].size.should == 3
|
42
|
+
branch[189][0].should be_kind_of Hash
|
43
|
+
branch[189][1].should be_kind_of Hash
|
44
|
+
branch[189][2].should be_kind_of Hash
|
45
|
+
end
|
46
|
+
|
47
|
+
it "keeps track of all SNPs used for the tree" do
|
48
|
+
@config.load_training_data
|
49
|
+
snps = (131..190).to_a
|
50
|
+
@tree.stub!(:snps_random_sample).and_return(snps)
|
51
|
+
@tree.used_snps.should be_nil
|
52
|
+
@tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
53
|
+
@tree.used_snps.size.should > 4
|
54
|
+
@tree.used_snps.each{|snp|
|
55
|
+
snps.include?(snp).should be_true
|
56
|
+
}
|
57
|
+
end
|
58
|
+
|
59
|
+
it "labels node when building a node and there is not a suitable split" do
|
60
|
+
@config.load_training_data
|
61
|
+
@tree.stub!(:snps_random_sample).and_return([33])
|
62
|
+
|
63
|
+
@tree.individuals = @config.training_set.individuals
|
64
|
+
@tree.id_to_fenotype = @config.training_set.ids_fenotypes
|
65
|
+
@tree.used_snps = []
|
66
|
+
@tree.predictions = {}
|
67
|
+
|
68
|
+
branch = @tree.build_node @config.training_set.all_ids, Nimbus::LossFunctions.average(@config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
69
|
+
branch[33][0].should be_kind_of Numeric
|
70
|
+
branch[33][1].should be_kind_of Numeric
|
71
|
+
branch[33][2].should be_kind_of Numeric
|
72
|
+
end
|
73
|
+
|
74
|
+
it "labels node when building a node with less individuals than the minimum node size" do
|
75
|
+
@config.load_training_data
|
76
|
+
|
77
|
+
@tree.individuals = @config.training_set.individuals
|
78
|
+
@tree.id_to_fenotype = @config.training_set.ids_fenotypes
|
79
|
+
@tree.used_snps = []
|
80
|
+
@tree.predictions = {}
|
81
|
+
|
82
|
+
label = @tree.build_node [1, 10, 33], Nimbus::LossFunctions.average(@config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
83
|
+
label.should be_kind_of Numeric
|
84
|
+
|
85
|
+
label = @tree.build_node [2, 10], Nimbus::LossFunctions.average(@config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
86
|
+
label.should be_kind_of Numeric
|
87
|
+
|
88
|
+
label = @tree.build_node [1, 10, 33], Nimbus::LossFunctions.average(@config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
89
|
+
label.should be_kind_of Numeric
|
90
|
+
|
91
|
+
label = @tree.build_node [108, 22, 10, 33], Nimbus::LossFunctions.average(@config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
92
|
+
label.should be_kind_of Numeric
|
93
|
+
end
|
94
|
+
|
95
|
+
it 'computes generalization error for the tree' do
|
96
|
+
@config.load_training_data
|
97
|
+
@tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
98
|
+
@tree.generalization_error.should be_nil
|
99
|
+
@tree.generalization_error_from_oob((2..200).to_a)
|
100
|
+
@tree.generalization_error.should be_kind_of Numeric
|
101
|
+
@tree.generalization_error.should > 0.0
|
102
|
+
@tree.generalization_error.should < 1.0
|
103
|
+
end
|
104
|
+
|
105
|
+
it 'estimates importance for all SNPs' do
|
106
|
+
@config.load_training_data
|
107
|
+
@tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
108
|
+
@tree.importances.should be_nil
|
109
|
+
@tree.estimate_importances((300..533).to_a)
|
110
|
+
@tree.importances.should be_kind_of Hash
|
111
|
+
@tree.importances.keys.should_not be_empty
|
112
|
+
(@tree.importances.keys - (1..200).to_a).should be_empty #all keys are snp indexes (200 snps in training file)
|
113
|
+
end
|
114
|
+
|
115
|
+
it 'get prediction for an individual pushing it down a tree structure' do
|
116
|
+
tree_structure = YAML.load(File.open fixture_file('regression_random_forest.yml')).first
|
117
|
+
individual_data = [0]*200
|
118
|
+
prediction = Nimbus::Tree.traverse tree_structure, individual_data
|
119
|
+
prediction.should == 0.25043
|
120
|
+
|
121
|
+
individual_data[189-1] = 1
|
122
|
+
individual_data[4-1] = 1
|
123
|
+
individual_data[62-1] = 2
|
124
|
+
individual_data[146-1] = 2
|
125
|
+
prediction = Nimbus::Tree.traverse tree_structure, individual_data
|
126
|
+
prediction.should == -0.9854
|
127
|
+
end
|
128
|
+
|
129
|
+
end
|
data/spec/training_set_spec.rb
CHANGED
@@ -2,24 +2,24 @@
|
|
2
2
|
require File.dirname(__FILE__) + '/spec_helper'
|
3
3
|
|
4
4
|
describe Nimbus::TrainingSet do
|
5
|
-
|
5
|
+
|
6
6
|
it "stores individuals list and fenotype data for them" do
|
7
7
|
i1 = Nimbus::Individual.new 1, 11.0, [1,0,2,1]
|
8
8
|
i2 = Nimbus::Individual.new 2, 22.0, [2,1,2,2]
|
9
9
|
i3 = Nimbus::Individual.new 3, 33.0, [0,2,1,0]
|
10
10
|
@training_set = Nimbus::TrainingSet.new [i1, i3], {i1.id => 11.0, i3.id => 33.0}
|
11
|
-
|
11
|
+
|
12
12
|
@training_set.individuals.should == [i1, i3]
|
13
13
|
@training_set.ids_fenotypes.should == {i1.id => 11.0, i3.id => 33.0}
|
14
14
|
end
|
15
|
-
|
15
|
+
|
16
16
|
it "keeps track of ids of all individuals in the training set" do
|
17
17
|
i1 = Nimbus::Individual.new 1, 11.0, [1,0,2,1]
|
18
18
|
i2 = Nimbus::Individual.new 2, 22.0, [2,1,2,2]
|
19
19
|
i3 = Nimbus::Individual.new 3, 33.0, [0,2,1,0]
|
20
20
|
@training_set = Nimbus::TrainingSet.new [i1, i3], {i1.id => 11.0, i3.id => 33.0}
|
21
|
-
|
21
|
+
|
22
22
|
@training_set.all_ids.should == [1,3]
|
23
23
|
end
|
24
|
-
|
24
|
+
|
25
25
|
end
|
data/spec/tree_spec.rb
CHANGED
@@ -2,129 +2,18 @@
|
|
2
2
|
require File.dirname(__FILE__) + '/spec_helper'
|
3
3
|
|
4
4
|
describe Nimbus::Tree do
|
5
|
-
|
5
|
+
|
6
6
|
before(:each) do
|
7
7
|
@config = Nimbus::Configuration.new
|
8
8
|
@config.load fixture_file('regression_config.yml')
|
9
|
-
|
9
|
+
|
10
10
|
@tree = Nimbus::Tree.new @config.tree
|
11
11
|
end
|
12
|
-
|
12
|
+
|
13
13
|
it "is initialized with tree config info" do
|
14
14
|
@tree.snp_total_count.should == 200
|
15
15
|
@tree.snp_sample_size.should == 60
|
16
16
|
@tree.node_min_size.should == 5
|
17
17
|
end
|
18
|
-
|
19
|
-
it "creates a tree structure when seeded with training data" do
|
20
|
-
@config.load_training_data
|
21
|
-
@tree.structure.should be_nil
|
22
|
-
@tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
23
|
-
@tree.structure.should_not be_nil
|
24
|
-
@tree.structure.should be_kind_of Hash
|
25
|
-
|
26
|
-
@tree.structure.keys.first.should == @tree.used_snps.last
|
27
|
-
@tree.used_snps.should_not be_empty
|
28
|
-
end
|
29
|
-
|
30
|
-
it "split node in three when building a node and finds a suitable split" do
|
31
|
-
@config.load_training_data
|
32
|
-
@tree.stub!(:snps_random_sample).and_return((141..200).to_a) #189 is best split
|
33
|
-
|
34
|
-
@tree.individuals = @config.training_set.individuals
|
35
|
-
@tree.id_to_fenotype = @config.training_set.ids_fenotypes
|
36
|
-
@tree.used_snps = []
|
37
|
-
@tree.predictions = {}
|
38
|
-
|
39
|
-
branch = @tree.build_node @config.training_set.all_ids, Nimbus::LossFunctions.average(@config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
40
|
-
branch.keys.size.should == 1
|
41
|
-
branch.keys.first.should == 189
|
42
|
-
branch[189].size.should == 3
|
43
|
-
branch[189][0].should be_kind_of Hash
|
44
|
-
branch[189][1].should be_kind_of Hash
|
45
|
-
branch[189][2].should be_kind_of Hash
|
46
|
-
end
|
47
|
-
|
48
|
-
it "keeps track of all SNPs used for the tree" do
|
49
|
-
@config.load_training_data
|
50
|
-
snps = (131..190).to_a
|
51
|
-
@tree.stub!(:snps_random_sample).and_return(snps)
|
52
|
-
@tree.used_snps.should be_nil
|
53
|
-
@tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
54
|
-
@tree.used_snps.size.should > 4
|
55
|
-
@tree.used_snps.each{|snp|
|
56
|
-
snps.include?(snp).should be_true
|
57
|
-
}
|
58
|
-
end
|
59
|
-
|
60
|
-
it "labels node when building a node and there is not a suitable split" do
|
61
|
-
@config.load_training_data
|
62
|
-
@tree.stub!(:snps_random_sample).and_return([33])
|
63
|
-
|
64
|
-
@tree.individuals = @config.training_set.individuals
|
65
|
-
@tree.id_to_fenotype = @config.training_set.ids_fenotypes
|
66
|
-
@tree.used_snps = []
|
67
|
-
@tree.predictions = {}
|
68
|
-
|
69
|
-
branch = @tree.build_node @config.training_set.all_ids, Nimbus::LossFunctions.average(@config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
70
|
-
branch[33][0].should be_kind_of Numeric
|
71
|
-
branch[33][1].should be_kind_of Numeric
|
72
|
-
branch[33][2].should be_kind_of Numeric
|
73
|
-
end
|
74
|
-
|
75
|
-
it "labels node when building a node with less individuals than the minimum node size" do
|
76
|
-
@config.load_training_data
|
77
|
-
|
78
|
-
@tree.individuals = @config.training_set.individuals
|
79
|
-
@tree.id_to_fenotype = @config.training_set.ids_fenotypes
|
80
|
-
@tree.used_snps = []
|
81
|
-
@tree.predictions = {}
|
82
|
-
|
83
|
-
label = @tree.build_node [1, 10, 33], Nimbus::LossFunctions.average(@config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
84
|
-
label.should be_kind_of Numeric
|
85
|
-
|
86
|
-
label = @tree.build_node [2, 10], Nimbus::LossFunctions.average(@config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
87
|
-
label.should be_kind_of Numeric
|
88
|
-
|
89
|
-
label = @tree.build_node [1, 10, 33], Nimbus::LossFunctions.average(@config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
90
|
-
label.should be_kind_of Numeric
|
91
|
-
|
92
|
-
label = @tree.build_node [108, 22, 10, 33], Nimbus::LossFunctions.average(@config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
93
|
-
label.should be_kind_of Numeric
|
94
|
-
end
|
95
|
-
|
96
|
-
it 'computes generalization error for the tree' do
|
97
|
-
@config.load_training_data
|
98
|
-
@tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
99
|
-
@tree.generalization_error.should be_nil
|
100
|
-
@tree.generalization_error_from_oob((2..200).to_a)
|
101
|
-
@tree.generalization_error.should be_kind_of Numeric
|
102
|
-
@tree.generalization_error.should > 0.0
|
103
|
-
@tree.generalization_error.should < 100.0
|
104
|
-
end
|
105
|
-
|
106
|
-
it 'estimates importance for all SNPs' do
|
107
|
-
@config.load_training_data
|
108
|
-
@tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
|
109
|
-
@tree.importances.should be_nil
|
110
|
-
@tree.estimate_importances((300..533).to_a)
|
111
|
-
@tree.importances.should be_kind_of Hash
|
112
|
-
@tree.importances.keys.should_not be_empty
|
113
|
-
(@tree.importances.keys - (1..200).to_a).should be_empty
|
114
|
-
end
|
115
|
-
|
116
|
-
it 'get prediction for an individual pushing it down a tree structure' do
|
117
|
-
tree_structure = YAML.load(File.open fixture_file('regression_random_forest.yml')).first
|
118
|
-
individual_data = [0]*200
|
119
|
-
prediction = Nimbus::Tree.traverse tree_structure, individual_data
|
120
|
-
prediction.should == 0.25043
|
121
|
-
|
122
|
-
individual_data[189-1] = 1
|
123
|
-
individual_data[4-1] = 1
|
124
|
-
individual_data[62-1] = 2
|
125
|
-
individual_data[146-1] = 2
|
126
|
-
prediction = Nimbus::Tree.traverse tree_structure, individual_data
|
127
|
-
prediction.should == -0.9854
|
128
|
-
end
|
129
|
-
|
18
|
+
|
130
19
|
end
|