nimbus 1.0.1 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,103 @@
1
+ module Nimbus
2
+
3
+ #####################################################################
4
+ # Tree object representing a random regression tree.
5
+ #
6
+ # A tree is generated following these steps:
7
+ #
8
+ # * 1: Calculate loss function for the individuals in the node (first node contains all the individuals).
9
+ # * 2: Take a random sample of the SNPs (size m << total count of SNPs)
10
+ # * 3: Compute the loss function (quadratic loss) for the split of the sample based on value of every SNP.
11
+ # * 4: If the SNP with minimum loss function also minimizes the general loss of the node, split the individuals sample in three nodes, based on value for that SNP [0, 1, or 2]
12
+ # * 5: Repeat from 1 for every node until:
13
+ # - a) The individuals count in that node is < minimum size OR
14
+ # - b) None of the SNP splits has a loss function smaller than the node loss function
15
+ # * 6) When a node stops, label the node with the average fenotype value of the individuals in the node.
16
+ #
17
+ class RegressionTree < Tree
18
+
19
+ # Creates the structure of the tree, as a hash of SNP splits and values.
20
+ #
21
+ # It just initializes the needed variables and then defines the first node of the tree.
22
+ # The rest of the structure of the tree is computed recursively building every node calling +build_node+.
23
+ def seed(all_individuals, individuals_sample, ids_fenotypes)
24
+ super
25
+ @structure = build_node individuals_sample, Nimbus::LossFunctions.average(individuals_sample, @id_to_fenotype)
26
+ end
27
+
28
+ # Creates a node by taking a random sample of the SNPs and computing the loss function for every split by SNP of that sample.
29
+ #
30
+ # * If SNP_min is the SNP with the smallest loss function and it is < the loss function of the node, it splits the individuals sample in three:
31
+ # (those with value 0 for the SNP_min, those with value 1 for the SNP_min, and those with value 2 for the SNP_min) then it builds these 3 new nodes.
32
+ # * Otherwise every individual in the node gets labeled with the average of the fenotype values of all of them.
33
+ def build_node(individuals_ids, y_hat)
34
+ # General loss function value for the node
35
+ individuals_count = individuals_ids.size
36
+ return label_node(y_hat, individuals_ids) if individuals_count < @node_min_size
37
+ node_loss_function = Nimbus::LossFunctions.quadratic_loss individuals_ids, @id_to_fenotype, y_hat
38
+
39
+ # Finding the SNP that minimizes loss function
40
+ snps = snps_random_sample
41
+ min_loss, min_SNP, split, means = node_loss_function, nil, nil, nil
42
+
43
+ snps.each do |snp|
44
+ individuals_split_by_snp_value = split_by_snp_value individuals_ids, snp
45
+ mean_0 = Nimbus::LossFunctions.average individuals_split_by_snp_value[0], @id_to_fenotype
46
+ mean_1 = Nimbus::LossFunctions.average individuals_split_by_snp_value[1], @id_to_fenotype
47
+ mean_2 = Nimbus::LossFunctions.average individuals_split_by_snp_value[2], @id_to_fenotype
48
+ loss_0 = Nimbus::LossFunctions.mean_squared_error individuals_split_by_snp_value[0], @id_to_fenotype, mean_0
49
+ loss_1 = Nimbus::LossFunctions.mean_squared_error individuals_split_by_snp_value[1], @id_to_fenotype, mean_1
50
+ loss_2 = Nimbus::LossFunctions.mean_squared_error individuals_split_by_snp_value[2], @id_to_fenotype, mean_2
51
+ loss_snp = (loss_0 + loss_1 + loss_2) / individuals_count
52
+
53
+ min_loss, min_SNP, split, means = loss_snp, snp, individuals_split_by_snp_value, [mean_0, mean_1, mean_2] if loss_snp < min_loss
54
+ end
55
+
56
+ return build_branch(min_SNP, split, means, y_hat) if min_loss < node_loss_function
57
+ return label_node(y_hat, individuals_ids)
58
+ end
59
+
60
+ # Compute generalization error for the tree.
61
+ #
62
+ # Traversing the 'out of bag' (OOB) sample (those individuals of the training set not
63
+ # used in the building of this tree) through the tree, and comparing
64
+ # the prediction with the real fenotype of the individual (and then averaging) it is
65
+ # possible to calculate the unbiased generalization error for the tree.
66
+ def generalization_error_from_oob(oob_ids)
67
+ return nil if (@structure.nil? || @individuals.nil? || @id_to_fenotype.nil?)
68
+ oob_errors = {}
69
+ oob_ids.each do |oobi|
70
+ oob_prediction = Tree.traverse @structure, individuals[oobi].snp_list
71
+ oob_errors[oobi] = Nimbus::LossFunctions.squared_difference oob_prediction, @id_to_fenotype[oobi]
72
+ end
73
+ @generalization_error = Nimbus::LossFunctions.average oob_ids, oob_errors
74
+ end
75
+
76
+ # Estimation of importance for every SNP.
77
+ #
78
+ # The importance of any SNP in the tree is calculated using the OOB sample.
79
+ # For every SNP, every individual in the sample is pushed down the tree but with the
80
+ # value of that SNP permuted with that of another individual in the sample.
81
+ #
82
+ # That way the difference between the regular prediction and the prediction with the SNP value modified can be estimated for any given SNP.
83
+ #
84
+ # This method computes importance estimations for every SNP used in the tree (for any other SNP it would be 0).
85
+ def estimate_importances(oob_ids)
86
+ return nil if (@generalization_error.nil? && generalization_error_from_oob(oob_ids).nil?)
87
+ oob_individuals_count = oob_ids.size
88
+ @importances = {}
89
+ @used_snps.uniq.each do |current_snp|
90
+ shuffled_ids = oob_ids.shuffle
91
+ permutated_snp_error = 0.0
92
+ oob_ids.each_with_index {|oobi, index|
93
+ permutated_prediction = traverse_with_permutation @structure, individuals[oobi].snp_list, current_snp, individuals[shuffled_ids[index]].snp_list
94
+ permutated_snp_error += Nimbus::LossFunctions.squared_difference @id_to_fenotype[oobi], permutated_prediction
95
+ }
96
+ @importances[current_snp] = ((permutated_snp_error / oob_individuals_count) - @generalization_error).round(5)
97
+ end
98
+ @importances
99
+ end
100
+
101
+ end
102
+
103
+ end
@@ -1,23 +1,23 @@
1
1
  module Nimbus
2
2
  #####################################################################
3
3
  # Set of individuals to be used as training sample for a random forest.
4
- #
4
+ #
5
5
  # The TrainingSet class stores an array of individuals, and a hash with the fenotypes of every individual indexed by id.
6
6
  #
7
7
  class TrainingSet
8
8
  attr_accessor :individuals, :ids_fenotypes
9
-
9
+
10
10
  # Initialize a new training set with the individuals and fenotype info received.
11
11
  def initialize(individuals, ids_fenotypes)
12
12
  @individuals = individuals
13
13
  @ids_fenotypes = ids_fenotypes
14
14
  end
15
-
15
+
16
16
  # Array of all the ids of the individuals in this training sample.
17
17
  def all_ids
18
18
  @all_ids ||= @ids_fenotypes.keys
19
19
  @all_ids
20
20
  end
21
21
  end
22
-
22
+
23
23
  end
data/lib/nimbus/tree.rb CHANGED
@@ -6,25 +6,25 @@ module Nimbus
6
6
  # A tree is generated following these steps:
7
7
  #
8
8
  # * 1: Calculate loss function for the individuals in the node (first node contains all the individuals).
9
- # * 2: Take a random sample of the SNPs (size m << total count of SNPs)
9
+ # * 2: Take a random sample of the SNPs (size m << total count of SNPs)
10
10
  # * 3: Compute the loss function for the split of the sample based on value of every SNP.
11
11
  # * 4: If the SNP with minimum loss function also minimizes the general loss of the node, split the individuals sample in three nodes, based on value for that SNP [0, 1, or 2]
12
12
  # * 5: Repeat from 1 for every node until:
13
13
  # - a) The individuals count in that node is < minimum size OR
14
14
  # - b) None of the SNP splits has a loss function smaller than the node loss function
15
- # * 6) When a node stops, label the node with the average fenotype value of the individuals in the node.
15
+ # * 6) When a node stops, label the node with the average fenotype value (for regression problems) or the majority class (for classification problems) of the individuals in the node.
16
16
  #
17
17
  class Tree
18
18
  attr_accessor :snp_sample_size, :snp_total_count, :node_min_size, :used_snps, :structure, :generalization_error, :predictions, :importances
19
19
  attr_accessor :individuals, :id_to_fenotype
20
-
20
+
21
21
  # Initialize Tree object with the configuration (as in Nimbus::Configuration.tree) options received.
22
22
  def initialize(options)
23
23
  @snp_total_count = options[:snp_total_count]
24
24
  @snp_sample_size = options[:snp_sample_size]
25
25
  @node_min_size = options[:tree_node_min_size]
26
26
  end
27
-
27
+
28
28
  # Creates the structure of the tree, as a hash of SNP splits and values.
29
29
  #
30
30
  # It just initializes the needed variables and then defines the first node of the tree.
@@ -34,113 +34,50 @@ module Nimbus
34
34
  @id_to_fenotype = ids_fenotypes
35
35
  @predictions = {}
36
36
  @used_snps = []
37
-
38
- @structure = build_node individuals_sample, Nimbus::LossFunctions.average(individuals_sample, @id_to_fenotype)
39
37
  end
40
38
 
41
39
  # Creates a node by taking a random sample of the SNPs and computing the loss function for every split by SNP of that sample.
42
- #
43
- # * If SNP_min is the SNP with smaller loss function and it is < the loss function of the node, it splits the individuals sample in three:
44
- # (those with value 0 for the SNP_min, those with value 1 for the SNP_min, and those with value 2 for the SNP_min) then it builds these 3 new nodes.
45
- # * Otherwise every individual in the node gets labeled with the average of the fenotype values of all of them.
46
40
  def build_node(individuals_ids, y_hat)
47
- # General loss function value for the node
48
- individuals_count = individuals_ids.size
49
- return label_node(y_hat, individuals_ids) if individuals_count < @node_min_size
50
- node_loss_function = Nimbus::LossFunctions.quadratic_loss individuals_ids, @id_to_fenotype, y_hat
51
-
52
- # Finding the SNP that minimizes loss function
53
- snps = snps_random_sample
54
- min_loss, min_SNP, split, means = node_loss_function, nil, nil, nil
55
-
56
- snps.each do |snp|
57
- individuals_split_by_snp_value = split_by_snp_value individuals_ids, snp
58
- mean_0 = Nimbus::LossFunctions.average individuals_split_by_snp_value[0], @id_to_fenotype
59
- mean_1 = Nimbus::LossFunctions.average individuals_split_by_snp_value[1], @id_to_fenotype
60
- mean_2 = Nimbus::LossFunctions.average individuals_split_by_snp_value[2], @id_to_fenotype
61
- loss_0 = Nimbus::LossFunctions.mean_squared_error individuals_split_by_snp_value[0], @id_to_fenotype, mean_0
62
- loss_1 = Nimbus::LossFunctions.mean_squared_error individuals_split_by_snp_value[1], @id_to_fenotype, mean_1
63
- loss_2 = Nimbus::LossFunctions.mean_squared_error individuals_split_by_snp_value[2], @id_to_fenotype, mean_2
64
- loss_snp = (loss_0 + loss_1 + loss_2) / individuals_count
65
-
66
- min_loss, min_SNP, split, means = loss_snp, snp, individuals_split_by_snp_value, [mean_0, mean_1, mean_2] if loss_snp < min_loss
67
- end
68
-
69
- return build_branch(min_SNP, split, means, y_hat) if min_loss < node_loss_function
70
- return label_node(y_hat, individuals_ids)
71
41
  end
72
42
 
73
43
  # Compute generalization error for the tree.
74
- #
75
- # Traversing the 'out of bag' (OOB) sample (those individuals of the training set not
76
- # used in the building of this tree) through the tree, and comparing
77
- # the prediction with the real fenotype of the individual (and then averaging) is
78
- # possible to calculate the unbiased generalization error for the tree.
79
44
  def generalization_error_from_oob(oob_ids)
80
- return nil if (@structure.nil? || @individuals.nil? || @id_to_fenotype.nil?)
81
- oob_errors = {}
82
- oob_ids.each do |oobi|
83
- oob_prediction = Tree.traverse @structure, individuals[oobi].snp_list
84
- oob_errors[oobi] = Nimbus::LossFunctions.squared_difference oob_prediction, @id_to_fenotype[oobi]
85
- end
86
- @generalization_error = Nimbus::LossFunctions.average oob_ids, oob_errors
87
45
  end
88
46
 
89
47
  # Estimation of importance for every SNP.
90
- #
91
- # The importance of any SNP in the tree is calculated using the OOB sample.
92
- # For every SNP, every individual in the sample is pushed down the tree but with the
93
- # value of that SNP permuted with other individual in the sample.
94
- #
95
- # That way the difference between the regular prediction and the prediction with the SNP value modified can be estimated for any given SNP.
96
- #
97
- # This method computes importance estimations for every SNPs used in the tree (for any other SNP it would be 0).
98
48
  def estimate_importances(oob_ids)
99
- return nil if (@generalization_error.nil? && generalization_error_from_oob(oob_ids).nil?)
100
- oob_individuals_count = oob_ids.size
101
- @importances = {}
102
- @used_snps.uniq.each do |current_snp|
103
- shuffled_ids = oob_ids.shuffle
104
- permutated_snp_error = 0.0
105
- oob_ids.each_with_index {|oobi, index|
106
- permutated_prediction = traverse_with_permutation @structure, individuals[oobi].snp_list, current_snp, individuals[shuffled_ids[index]].snp_list
107
- permutated_snp_error += Nimbus::LossFunctions.squared_difference @id_to_fenotype[oobi], permutated_prediction
108
- }
109
- @importances[current_snp] = ((permutated_snp_error / oob_individuals_count) - @generalization_error).round(5)
110
- end
111
- @importances
112
49
  end
113
-
114
- # Class method to traverse a single individual through a tree structure.
50
+
51
+ # Class method to traverse a single individual through a tree structure.
115
52
  #
116
53
  # Returns the prediction for that individual (the label of the final node reached by the individual).
117
54
  def self.traverse(tree_structure, data)
118
- return tree_structure if tree_structure.is_a? Numeric
55
+ return tree_structure if tree_structure.is_a?(Numeric) || tree_structure.is_a?(String)
119
56
  raise Nimbus::TreeError, "Forest data has invalid structure. Please check your forest data (file)." if !(tree_structure.is_a?(Hash) && tree_structure.keys.size == 1)
120
57
  return self.traverse( tree_structure.values.first[ data[tree_structure.keys.first - 1].to_i], data)
121
58
  end
122
-
123
- private
124
-
59
+
60
+ protected
61
+
125
62
  def snps_random_sample
126
63
  (1..@snp_total_count).to_a.sample(@snp_sample_size).sort
127
64
  end
128
-
65
+
129
66
  def build_branch(snp, split, y_hats, parent_y_hat)
130
67
  node_0 = split[0].size == 0 ? label_node(parent_y_hat, []) : build_node(split[0], y_hats[0])
131
68
  node_1 = split[1].size == 0 ? label_node(parent_y_hat, []) : build_node(split[1], y_hats[1])
132
69
  node_2 = split[2].size == 0 ? label_node(parent_y_hat, []) : build_node(split[2], y_hats[2])
133
-
70
+
134
71
  split_by_snp(snp)
135
72
  return { snp => [node_0, node_1, node_2] }
136
73
  end
137
-
74
+
138
75
  def label_node(value, ids)
139
- label = value.round(5)
76
+ label = value.is_a?(String) ? value : value.round(5)
140
77
  ids.uniq.each{|i| @predictions[i] = label}
141
78
  label
142
79
  end
143
-
80
+
144
81
  def split_by_snp_value(ids, snp)
145
82
  split = [[], [], []]
146
83
  ids.each do |i|
@@ -150,17 +87,17 @@ module Nimbus
150
87
  rescue => ex
151
88
  raise Nimbus::TreeError, "Values for SNPs columns must be in [0, 1, 2]"
152
89
  end
153
-
90
+
154
91
  def split_by_snp(x)
155
92
  @used_snps << x
156
93
  end
157
-
94
+
158
95
  def traverse_with_permutation(tree_structure, data, snp_to_permute, individual_to_permute)
159
- return tree_structure if tree_structure.is_a? Numeric
96
+ return tree_structure if tree_structure.is_a?(Numeric) || tree_structure.is_a?(String)
160
97
  individual_data = (tree_structure.keys.first == snp_to_permute ? individual_to_permute : data)
161
98
  return traverse_with_permutation( tree_structure.values.first[ individual_data[tree_structure.keys.first - 1].to_i], data, snp_to_permute, individual_to_permute)
162
99
  end
163
-
100
+
164
101
  end
165
-
102
+
166
103
  end
@@ -0,0 +1,3 @@
1
+ module Nimbus
2
+ VERSION = "2.0.0"
3
+ end
@@ -0,0 +1,132 @@
1
+ require File.dirname(__FILE__) + '/spec_helper'
2
+
3
+ describe Nimbus::ClassificationTree do
4
+
5
+ before(:each) do
6
+ @config = Nimbus::Configuration.new
7
+ @config.load fixture_file('classification_config.yml')
8
+
9
+ @tree = Nimbus::ClassificationTree.new @config.tree
10
+ end
11
+
12
+ it "is initialized with tree config info" do
13
+ @tree.snp_total_count.should == 100
14
+ @tree.snp_sample_size.should == 33
15
+ @tree.node_min_size.should == 5
16
+ @tree.classes.size.should == 2
17
+ @tree.classes[0].should == '0'
18
+ @tree.classes[1].should == '1'
19
+ end
20
+
21
+ it "creates a tree structure when seeded with training data" do
22
+ @config.load_training_data
23
+ @tree.structure.should be_nil
24
+ @tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
25
+ @tree.structure.should_not be_nil
26
+ @tree.structure.should be_kind_of Hash
27
+
28
+ @tree.structure.keys.first.should == @tree.used_snps.last
29
+ @tree.used_snps.should_not be_empty
30
+ end
31
+
32
+ it "splits node in three when building a node and finds a suitable split" do
33
+ @config.load_training_data
34
+ @tree.stub!(:snps_random_sample).and_return((68..100).to_a) #97 is best split
35
+
36
+ @tree.individuals = @config.training_set.individuals
37
+ @tree.id_to_fenotype = @config.training_set.ids_fenotypes
38
+ @tree.used_snps = []
39
+ @tree.predictions = {}
40
+
41
+ branch = @tree.build_node @config.training_set.all_ids, Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
42
+ branch.keys.size.should == 1
43
+ branch.keys.first.should == 97
44
+ branch[97].size.should == 3
45
+ branch[97][0].should be_kind_of Hash
46
+ branch[97][1].should be_kind_of Hash
47
+ branch[97][2].should be_kind_of Hash
48
+ end
49
+
50
+ it "keeps track of all SNPs used for the tree" do
51
+ @config.load_training_data
52
+ snps = (33..65).to_a
53
+ @tree.stub!(:snps_random_sample).and_return(snps)
54
+ @tree.used_snps.should be_nil
55
+ @tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
56
+ @tree.used_snps.size.should > 4
57
+ @tree.used_snps.each{|snp|
58
+ snps.include?(snp).should be_true
59
+ }
60
+ end
61
+
62
+ it "labels node when building a node and there is not a suitable split" do
63
+ @config.load_training_data
64
+ @tree.stub!(:snps_random_sample).and_return([33])
65
+
66
+ @tree.individuals = @config.training_set.individuals
67
+ @tree.id_to_fenotype = @config.training_set.ids_fenotypes
68
+ @tree.used_snps = []
69
+ @tree.predictions = {}
70
+
71
+ branch = @tree.build_node @config.training_set.all_ids, Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
72
+ branch[33][0].should be_kind_of String
73
+ branch[33][1].should be_kind_of String
74
+ branch[33][2].should be_kind_of String
75
+ end
76
+
77
+ it "labels node when building a node with less individuals than the minimum node size" do
78
+ @config.load_training_data
79
+
80
+ @tree.individuals = @config.training_set.individuals
81
+ @tree.id_to_fenotype = @config.training_set.ids_fenotypes
82
+ @tree.used_snps = []
83
+ @tree.predictions = {}
84
+
85
+ label = @tree.build_node [1, 10, 33], Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
86
+ label.should be_kind_of String
87
+
88
+ label = @tree.build_node [2, 10], Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
89
+ label.should be_kind_of String
90
+
91
+ label = @tree.build_node [1, 10, 33], Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
92
+ label.should be_kind_of String
93
+
94
+ label = @tree.build_node [99, 22, 10, 33], Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
95
+ label.should be_kind_of String
96
+ end
97
+
98
+ it 'computes generalization error for the tree' do
99
+ @config.load_training_data
100
+ @tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
101
+ @tree.generalization_error.should be_nil
102
+ @tree.generalization_error_from_oob((3..300).to_a)
103
+ @tree.generalization_error.should be_kind_of Numeric
104
+ @tree.generalization_error.should > 0.0
105
+ @tree.generalization_error.should < 1.0
106
+ end
107
+
108
+ it 'estimates importance for all SNPs' do
109
+ @config.load_training_data
110
+ @tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
111
+ @tree.importances.should be_nil
112
+ @tree.estimate_importances((200..533).to_a)
113
+ @tree.importances.should be_kind_of Hash
114
+ @tree.importances.keys.should_not be_empty
115
+ (@tree.importances.keys - (1..100).to_a).should be_empty #all keys are snp indexes (100 snps in training file)
116
+ end
117
+
118
+ it 'get prediction for an individual pushing it down a tree structure' do
119
+ tree_structure = YAML.load(File.open fixture_file('classification_random_forest.yml')).first
120
+ individual_data = [0]*100
121
+ prediction = Nimbus::Tree.traverse tree_structure, individual_data
122
+ prediction.should == '1'
123
+
124
+ individual_data[26-1] = 1
125
+ individual_data[57-1] = 2
126
+ individual_data[98-1] = 2
127
+ individual_data[8-1] = 1
128
+ prediction = Nimbus::Tree.traverse tree_structure, individual_data
129
+ prediction.should == '0'
130
+ end
131
+
132
+ end
@@ -2,30 +2,57 @@
2
2
  require File.dirname(__FILE__) + '/spec_helper'
3
3
 
4
4
  describe Nimbus::Configuration do
5
-
5
+
6
6
  it "loads configuration options from config file" do
7
7
  config = Nimbus::Configuration.new
8
8
  config.load fixture_file('regression_config.yml')
9
-
9
+
10
10
  config.training_file.should == fixture_file('regression_training.data')
11
11
  config.testing_file.should == fixture_file('regression_testing.data')
12
12
  config.forest_file.should == fixture_file('regression_random_forest.yml')
13
-
13
+ config.classes.should be_nil
14
+
14
15
  config.forest_size.should == 3
15
16
  config.tree_SNP_sample_size.should == 60
16
17
  config.tree_SNP_total_count.should == 200
17
- config.tree_node_min_size.should == 5
18
+ config.tree_node_min_size.should == 5
19
+
20
+ config = Nimbus::Configuration.new
21
+ config.load fixture_file('classification_config.yml')
22
+
23
+ config.training_file.should == fixture_file('classification_training.data')
24
+ config.testing_file.should == fixture_file('classification_testing.data')
25
+ config.forest_file.should == fixture_file('classification_random_forest.yml')
26
+ config.classes.should == ['0','1']
27
+
28
+ config.forest_size.should == 3
29
+ config.tree_SNP_sample_size.should == 33
30
+ config.tree_SNP_total_count.should == 100
31
+ config.tree_node_min_size.should == 5
32
+ end
33
+
34
+ it 'tree method return tree-related subset of options for regression trees' do
35
+ config = Nimbus::Configuration.new
36
+ config.load fixture_file('regression_config.yml')
37
+ tree_options = config.tree
38
+
39
+ tree_options[:snp_sample_size].should_not be_nil
40
+ tree_options[:snp_total_count].should_not be_nil
41
+ tree_options[:tree_node_min_size].should_not be_nil
42
+ tree_options[:classes].should be_nil
18
43
  end
19
-
20
- it 'tree method return tree-related subset of options' do
44
+
45
+ it 'tree method return tree-related subset of options for classification trees' do
21
46
  config = Nimbus::Configuration.new
47
+ config.load fixture_file('classification_config.yml')
22
48
  tree_options = config.tree
23
-
49
+
24
50
  tree_options[:snp_sample_size].should_not be_nil
25
51
  tree_options[:snp_total_count].should_not be_nil
26
52
  tree_options[:tree_node_min_size].should_not be_nil
53
+ tree_options[:classes].should_not be_nil
27
54
  end
28
-
55
+
29
56
  it "creates a training set object from training data file" do
30
57
  config = Nimbus::Configuration.new
31
58
  config.load fixture_file('regression_config.yml')
@@ -33,30 +60,30 @@ describe Nimbus::Configuration do
33
60
  config.load_training_data
34
61
  config.training_set.should be_kind_of Nimbus::TrainingSet
35
62
  config.training_set.all_ids.sort.should == (1..800).to_a
36
-
63
+
37
64
  File.open(fixture_file('regression_training.data')) {|file|
38
65
  feno1, id1, *snp_list_1 = file.readline.split
39
66
  feno2, id2, *snp_list_2 = file.readline.split
40
67
  feno3, id3, *snp_list_3 = file.readline.split
41
-
68
+
42
69
  i1 = Nimbus::Individual.new(id1.to_i, feno1.to_f, snp_list_1.map{|snp| snp.to_i})
43
70
  i2 = Nimbus::Individual.new(id2.to_i, feno2.to_f, snp_list_2.map{|snp| snp.to_i})
44
71
  i3 = Nimbus::Individual.new(id3.to_i, feno3.to_f, snp_list_3.map{|snp| snp.to_i})
45
-
72
+
46
73
  config.training_set.individuals[id1.to_i].id.should == i1.id
47
74
  config.training_set.individuals[id2.to_i].fenotype.should == i2.fenotype
48
75
  config.training_set.individuals[id3.to_i].snp_list.should == i3.snp_list
49
-
76
+
50
77
  config.training_set.ids_fenotypes[id1.to_i] = feno1.to_f
51
78
  config.training_set.ids_fenotypes[id2.to_i] = feno2.to_f
52
79
  config.training_set.ids_fenotypes[id3.to_i] = feno3.to_f
53
80
  }
54
81
  end
55
-
82
+
56
83
  it "reads testing data and yields one individual at a time" do
57
84
  config = Nimbus::Configuration.new
58
85
  config.load fixture_file('regression_config.yml')
59
-
86
+
60
87
  test_individuals = []
61
88
  File.open(fixture_file('regression_testing.data')) {|file|
62
89
  file.each do |line|
@@ -73,20 +100,20 @@ describe Nimbus::Configuration do
73
100
  individual.snp_list.should == test_individual.snp_list
74
101
  }
75
102
  end
76
-
103
+
77
104
  it "creates a forest object loading data from a yaml file" do
78
105
  config = Nimbus::Configuration.new
79
106
  config.load fixture_file('regression_config.yml')
80
-
107
+
81
108
  trees = YAML.load(File.open fixture_file('regression_random_forest.yml'))
82
109
  trees.first.keys.first.should == 189
83
110
  trees.size.should == 3
84
-
111
+
85
112
  forest = config.load_forest
86
113
  forest.should be_kind_of Nimbus::Forest
87
114
  forest.trees[0].should == trees.first
88
115
  forest.trees[1].should == trees[1]
89
- forest.trees.last.should == trees[2]
116
+ forest.trees.last.should == trees[2]
90
117
  end
91
-
118
+
92
119
  end