nimbus 1.0.1 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,103 @@
1
+ module Nimbus
2
+
3
+ #####################################################################
4
+ # Tree object representing a random regression tree.
5
+ #
6
+ # A tree is generated following these steps:
7
+ #
8
+ # * 1: Calculate loss function for the individuals in the node (first node contains all the individuals).
9
+ # * 2: Take a random sample of the SNPs (size m << total count of SNPs)
10
+ # * 3: Compute the loss function (quadratic loss) for the split of the sample based on value of every SNP.
11
+ # * 4: If the SNP with minimum loss function also minimizes the general loss of the node, split the individuals sample in three nodes, based on value for that SNP [0, 1, or 2]
12
+ # * 5: Repeat from 1 for every node until:
13
+ # - a) The individuals count in that node is < minimum size OR
14
+ # - b) None of the SNP splits has a loss function smaller than the node loss function
15
+ # * 6) When a node stops, label the node with the average fenotype value of the individuals in the node.
16
+ #
17
+ class RegressionTree < Tree
18
+
19
+ # Creates the structure of the tree, as a hash of SNP splits and values.
20
+ #
21
+ # It just initializes the needed variables and then defines the first node of the tree.
22
+ # The rest of the structure of the tree is computed recursively building every node calling +build_node+.
23
+ def seed(all_individuals, individuals_sample, ids_fenotypes)
24
+ super
25
+ @structure = build_node individuals_sample, Nimbus::LossFunctions.average(individuals_sample, @id_to_fenotype)
26
+ end
27
+
28
+ # Creates a node by taking a random sample of the SNPs and computing the loss function for every split by SNP of that sample.
29
+ #
30
+ # * If SNP_min is the SNP with smaller loss function and it is < the loss function of the node, it splits the individuals sample in three:
31
+ # (those with value 0 for the SNP_min, those with value 1 for the SNP_min, and those with value 2 for the SNP_min) then it builds these 3 new nodes.
32
+ # * Otherwise every individual in the node gets labeled with the average of the fenotype values of all of them.
33
+ def build_node(individuals_ids, y_hat)
34
+ # General loss function value for the node
35
+ individuals_count = individuals_ids.size
36
+ return label_node(y_hat, individuals_ids) if individuals_count < @node_min_size
37
+ node_loss_function = Nimbus::LossFunctions.quadratic_loss individuals_ids, @id_to_fenotype, y_hat
38
+
39
+ # Finding the SNP that minimizes loss function
40
+ snps = snps_random_sample
41
+ min_loss, min_SNP, split, means = node_loss_function, nil, nil, nil
42
+
43
+ snps.each do |snp|
44
+ individuals_split_by_snp_value = split_by_snp_value individuals_ids, snp
45
+ mean_0 = Nimbus::LossFunctions.average individuals_split_by_snp_value[0], @id_to_fenotype
46
+ mean_1 = Nimbus::LossFunctions.average individuals_split_by_snp_value[1], @id_to_fenotype
47
+ mean_2 = Nimbus::LossFunctions.average individuals_split_by_snp_value[2], @id_to_fenotype
48
+ loss_0 = Nimbus::LossFunctions.mean_squared_error individuals_split_by_snp_value[0], @id_to_fenotype, mean_0
49
+ loss_1 = Nimbus::LossFunctions.mean_squared_error individuals_split_by_snp_value[1], @id_to_fenotype, mean_1
50
+ loss_2 = Nimbus::LossFunctions.mean_squared_error individuals_split_by_snp_value[2], @id_to_fenotype, mean_2
51
+ loss_snp = (loss_0 + loss_1 + loss_2) / individuals_count
52
+
53
+ min_loss, min_SNP, split, means = loss_snp, snp, individuals_split_by_snp_value, [mean_0, mean_1, mean_2] if loss_snp < min_loss
54
+ end
55
+
56
+ return build_branch(min_SNP, split, means, y_hat) if min_loss < node_loss_function
57
+ return label_node(y_hat, individuals_ids)
58
+ end
59
+
60
+ # Compute generalization error for the tree.
61
+ #
62
+ # Traversing the 'out of bag' (OOB) sample (those individuals of the training set not
63
+ # used in the building of this tree) through the tree, and comparing
64
+ # the prediction with the real fenotype of the individual (and then averaging) it is
65
+ # possible to calculate the unbiased generalization error for the tree.
66
+ def generalization_error_from_oob(oob_ids)
67
+ return nil if (@structure.nil? || @individuals.nil? || @id_to_fenotype.nil?)
68
+ oob_errors = {}
69
+ oob_ids.each do |oobi|
70
+ oob_prediction = Tree.traverse @structure, individuals[oobi].snp_list
71
+ oob_errors[oobi] = Nimbus::LossFunctions.squared_difference oob_prediction, @id_to_fenotype[oobi]
72
+ end
73
+ @generalization_error = Nimbus::LossFunctions.average oob_ids, oob_errors
74
+ end
75
+
76
+ # Estimation of importance for every SNP.
77
+ #
78
+ # The importance of any SNP in the tree is calculated using the OOB sample.
79
+ # For every SNP, every individual in the sample is pushed down the tree but with the
80
+ # value of that SNP permuted with another individual in the sample.
81
+ #
82
+ # That way the difference between the regular prediction and the prediction with the SNP value modified can be estimated for any given SNP.
83
+ #
84
+ # This method computes importance estimations for every SNP used in the tree (for any other SNP it would be 0).
85
+ def estimate_importances(oob_ids)
86
+ return nil if (@generalization_error.nil? && generalization_error_from_oob(oob_ids).nil?)
87
+ oob_individuals_count = oob_ids.size
88
+ @importances = {}
89
+ @used_snps.uniq.each do |current_snp|
90
+ shuffled_ids = oob_ids.shuffle
91
+ permutated_snp_error = 0.0
92
+ oob_ids.each_with_index {|oobi, index|
93
+ permutated_prediction = traverse_with_permutation @structure, individuals[oobi].snp_list, current_snp, individuals[shuffled_ids[index]].snp_list
94
+ permutated_snp_error += Nimbus::LossFunctions.squared_difference @id_to_fenotype[oobi], permutated_prediction
95
+ }
96
+ @importances[current_snp] = ((permutated_snp_error / oob_individuals_count) - @generalization_error).round(5)
97
+ end
98
+ @importances
99
+ end
100
+
101
+ end
102
+
103
+ end
@@ -1,23 +1,23 @@
1
1
  module Nimbus
2
2
  #####################################################################
3
3
  # Set of individuals to be used as training sample for a random forest.
4
- #
4
+ #
5
5
  # the TrainingSet class stores an array of individuals, and a hash with the fenotypes of every individual indexed by id.
6
6
  #
7
7
  class TrainingSet
8
8
  attr_accessor :individuals, :ids_fenotypes
9
-
9
+
10
10
  # Initialize a new training set with the individuals and fenotype info received.
11
11
  def initialize(individuals, ids_fenotypes)
12
12
  @individuals = individuals
13
13
  @ids_fenotypes = ids_fenotypes
14
14
  end
15
-
15
+
16
16
  # Array of all the ids of the individuals in this training sample.
17
17
  def all_ids
18
18
  @all_ids ||= @ids_fenotypes.keys
19
19
  @all_ids
20
20
  end
21
21
  end
22
-
22
+
23
23
  end
data/lib/nimbus/tree.rb CHANGED
@@ -6,25 +6,25 @@ module Nimbus
6
6
  # A tree is generated following these steps:
7
7
  #
8
8
  # * 1: Calculate loss function for the individuals in the node (first node contains all the individuals).
9
- # * 2: Take a random sample of the SNPs (size m << total count of SNPs)
9
+ # * 2: Take a random sample of the SNPs (size m << total count of SNPs)
10
10
  # * 3: Compute the loss function for the split of the sample based on value of every SNP.
11
11
  # * 4: If the SNP with minimum loss function also minimizes the general loss of the node, split the individuals sample in three nodes, based on value for that SNP [0, 1, or 2]
12
12
  # * 5: Repeat from 1 for every node until:
13
13
  # - a) The individuals count in that node is < minimum size OR
14
14
  # - b) None of the SNP splits has a loss function smaller than the node loss function
15
- # * 6) When a node stops, label the node with the average fenotype value of the individuals in the node.
15
+ # * 6) When a node stops, label the node with the average fenotype value (for regression problems) or the majority class (for classification problems) of the individuals in the node.
16
16
  #
17
17
  class Tree
18
18
  attr_accessor :snp_sample_size, :snp_total_count, :node_min_size, :used_snps, :structure, :generalization_error, :predictions, :importances
19
19
  attr_accessor :individuals, :id_to_fenotype
20
-
20
+
21
21
  # Initialize Tree object with the configuration (as in Nimbus::Configuration.tree) options received.
22
22
  def initialize(options)
23
23
  @snp_total_count = options[:snp_total_count]
24
24
  @snp_sample_size = options[:snp_sample_size]
25
25
  @node_min_size = options[:tree_node_min_size]
26
26
  end
27
-
27
+
28
28
  # Creates the structure of the tree, as a hash of SNP splits and values.
29
29
  #
30
30
  # It just initializes the needed variables and then defines the first node of the tree.
@@ -34,113 +34,50 @@ module Nimbus
34
34
  @id_to_fenotype = ids_fenotypes
35
35
  @predictions = {}
36
36
  @used_snps = []
37
-
38
- @structure = build_node individuals_sample, Nimbus::LossFunctions.average(individuals_sample, @id_to_fenotype)
39
37
  end
40
38
 
41
39
  # Creates a node by taking a random sample of the SNPs and computing the loss function for every split by SNP of that sample.
42
- #
43
- # * If SNP_min is the SNP with smaller loss function and it is < the loss function of the node, it splits the individuals sample in three:
44
- # (those with value 0 for the SNP_min, those with value 1 for the SNP_min, and those with value 2 for the SNP_min) then it builds these 3 new nodes.
45
- # * Otherwise every individual in the node gets labeled with the average of the fenotype values of all of them.
46
40
  def build_node(individuals_ids, y_hat)
47
- # General loss function value for the node
48
- individuals_count = individuals_ids.size
49
- return label_node(y_hat, individuals_ids) if individuals_count < @node_min_size
50
- node_loss_function = Nimbus::LossFunctions.quadratic_loss individuals_ids, @id_to_fenotype, y_hat
51
-
52
- # Finding the SNP that minimizes loss function
53
- snps = snps_random_sample
54
- min_loss, min_SNP, split, means = node_loss_function, nil, nil, nil
55
-
56
- snps.each do |snp|
57
- individuals_split_by_snp_value = split_by_snp_value individuals_ids, snp
58
- mean_0 = Nimbus::LossFunctions.average individuals_split_by_snp_value[0], @id_to_fenotype
59
- mean_1 = Nimbus::LossFunctions.average individuals_split_by_snp_value[1], @id_to_fenotype
60
- mean_2 = Nimbus::LossFunctions.average individuals_split_by_snp_value[2], @id_to_fenotype
61
- loss_0 = Nimbus::LossFunctions.mean_squared_error individuals_split_by_snp_value[0], @id_to_fenotype, mean_0
62
- loss_1 = Nimbus::LossFunctions.mean_squared_error individuals_split_by_snp_value[1], @id_to_fenotype, mean_1
63
- loss_2 = Nimbus::LossFunctions.mean_squared_error individuals_split_by_snp_value[2], @id_to_fenotype, mean_2
64
- loss_snp = (loss_0 + loss_1 + loss_2) / individuals_count
65
-
66
- min_loss, min_SNP, split, means = loss_snp, snp, individuals_split_by_snp_value, [mean_0, mean_1, mean_2] if loss_snp < min_loss
67
- end
68
-
69
- return build_branch(min_SNP, split, means, y_hat) if min_loss < node_loss_function
70
- return label_node(y_hat, individuals_ids)
71
41
  end
72
42
 
73
43
  # Compute generalization error for the tree.
74
- #
75
- # Traversing the 'out of bag' (OOB) sample (those individuals of the training set not
76
- # used in the building of this tree) through the tree, and comparing
77
- # the prediction with the real fenotype of the individual (and then averaging) is
78
- # possible to calculate the unbiased generalization error for the tree.
79
44
  def generalization_error_from_oob(oob_ids)
80
- return nil if (@structure.nil? || @individuals.nil? || @id_to_fenotype.nil?)
81
- oob_errors = {}
82
- oob_ids.each do |oobi|
83
- oob_prediction = Tree.traverse @structure, individuals[oobi].snp_list
84
- oob_errors[oobi] = Nimbus::LossFunctions.squared_difference oob_prediction, @id_to_fenotype[oobi]
85
- end
86
- @generalization_error = Nimbus::LossFunctions.average oob_ids, oob_errors
87
45
  end
88
46
 
89
47
  # Estimation of importance for every SNP.
90
- #
91
- # The importance of any SNP in the tree is calculated using the OOB sample.
92
- # For every SNP, every individual in the sample is pushed down the tree but with the
93
- # value of that SNP permuted with other individual in the sample.
94
- #
95
- # That way the difference between the regular prediction and the prediction with the SNP value modified can be estimated for any given SNP.
96
- #
97
- # This method computes importance estimations for every SNPs used in the tree (for any other SNP it would be 0).
98
48
  def estimate_importances(oob_ids)
99
- return nil if (@generalization_error.nil? && generalization_error_from_oob(oob_ids).nil?)
100
- oob_individuals_count = oob_ids.size
101
- @importances = {}
102
- @used_snps.uniq.each do |current_snp|
103
- shuffled_ids = oob_ids.shuffle
104
- permutated_snp_error = 0.0
105
- oob_ids.each_with_index {|oobi, index|
106
- permutated_prediction = traverse_with_permutation @structure, individuals[oobi].snp_list, current_snp, individuals[shuffled_ids[index]].snp_list
107
- permutated_snp_error += Nimbus::LossFunctions.squared_difference @id_to_fenotype[oobi], permutated_prediction
108
- }
109
- @importances[current_snp] = ((permutated_snp_error / oob_individuals_count) - @generalization_error).round(5)
110
- end
111
- @importances
112
49
  end
113
-
114
- # Class method to traverse a single individual through a tree structure.
50
+
51
+ # Class method to traverse a single individual through a tree structure.
115
52
  #
116
53
  # Returns the prediction for that individual (the label of the final node reached by the individual).
117
54
  def self.traverse(tree_structure, data)
118
- return tree_structure if tree_structure.is_a? Numeric
55
+ return tree_structure if tree_structure.is_a?(Numeric) || tree_structure.is_a?(String)
119
56
  raise Nimbus::TreeError, "Forest data has invalid structure. Please check your forest data (file)." if !(tree_structure.is_a?(Hash) && tree_structure.keys.size == 1)
120
57
  return self.traverse( tree_structure.values.first[ data[tree_structure.keys.first - 1].to_i], data)
121
58
  end
122
-
123
- private
124
-
59
+
60
+ protected
61
+
125
62
  def snps_random_sample
126
63
  (1..@snp_total_count).to_a.sample(@snp_sample_size).sort
127
64
  end
128
-
65
+
129
66
  def build_branch(snp, split, y_hats, parent_y_hat)
130
67
  node_0 = split[0].size == 0 ? label_node(parent_y_hat, []) : build_node(split[0], y_hats[0])
131
68
  node_1 = split[1].size == 0 ? label_node(parent_y_hat, []) : build_node(split[1], y_hats[1])
132
69
  node_2 = split[2].size == 0 ? label_node(parent_y_hat, []) : build_node(split[2], y_hats[2])
133
-
70
+
134
71
  split_by_snp(snp)
135
72
  return { snp => [node_0, node_1, node_2] }
136
73
  end
137
-
74
+
138
75
  def label_node(value, ids)
139
- label = value.round(5)
76
+ label = value.is_a?(String) ? value : value.round(5)
140
77
  ids.uniq.each{|i| @predictions[i] = label}
141
78
  label
142
79
  end
143
-
80
+
144
81
  def split_by_snp_value(ids, snp)
145
82
  split = [[], [], []]
146
83
  ids.each do |i|
@@ -150,17 +87,17 @@ module Nimbus
150
87
  rescue => ex
151
88
  raise Nimbus::TreeError, "Values for SNPs columns must be in [0, 1, 2]"
152
89
  end
153
-
90
+
154
91
  def split_by_snp(x)
155
92
  @used_snps << x
156
93
  end
157
-
94
+
158
95
  def traverse_with_permutation(tree_structure, data, snp_to_permute, individual_to_permute)
159
- return tree_structure if tree_structure.is_a? Numeric
96
+ return tree_structure if tree_structure.is_a?(Numeric) || tree_structure.is_a?(String)
160
97
  individual_data = (tree_structure.keys.first == snp_to_permute ? individual_to_permute : data)
161
98
  return traverse_with_permutation( tree_structure.values.first[ individual_data[tree_structure.keys.first - 1].to_i], data, snp_to_permute, individual_to_permute)
162
99
  end
163
-
100
+
164
101
  end
165
-
102
+
166
103
  end
@@ -0,0 +1,3 @@
1
+ module Nimbus
2
+ VERSION = "2.0.0"
3
+ end
@@ -0,0 +1,132 @@
1
+ require File.dirname(__FILE__) + '/spec_helper'
2
+
3
+ describe Nimbus::ClassificationTree do
4
+
5
+ before(:each) do
6
+ @config = Nimbus::Configuration.new
7
+ @config.load fixture_file('classification_config.yml')
8
+
9
+ @tree = Nimbus::ClassificationTree.new @config.tree
10
+ end
11
+
12
+ it "is initialized with tree config info" do
13
+ @tree.snp_total_count.should == 100
14
+ @tree.snp_sample_size.should == 33
15
+ @tree.node_min_size.should == 5
16
+ @tree.classes.size.should == 2
17
+ @tree.classes[0].should == '0'
18
+ @tree.classes[1].should == '1'
19
+ end
20
+
21
+ it "creates a tree structure when seeded with training data" do
22
+ @config.load_training_data
23
+ @tree.structure.should be_nil
24
+ @tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
25
+ @tree.structure.should_not be_nil
26
+ @tree.structure.should be_kind_of Hash
27
+
28
+ @tree.structure.keys.first.should == @tree.used_snps.last
29
+ @tree.used_snps.should_not be_empty
30
+ end
31
+
32
+ it "splits node in three when building a node and finds a suitable split" do
33
+ @config.load_training_data
34
+ @tree.stub!(:snps_random_sample).and_return((68..100).to_a) #97 is best split
35
+
36
+ @tree.individuals = @config.training_set.individuals
37
+ @tree.id_to_fenotype = @config.training_set.ids_fenotypes
38
+ @tree.used_snps = []
39
+ @tree.predictions = {}
40
+
41
+ branch = @tree.build_node @config.training_set.all_ids, Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
42
+ branch.keys.size.should == 1
43
+ branch.keys.first.should == 97
44
+ branch[97].size.should == 3
45
+ branch[97][0].should be_kind_of Hash
46
+ branch[97][1].should be_kind_of Hash
47
+ branch[97][2].should be_kind_of Hash
48
+ end
49
+
50
+ it "keeps track of all SNPs used for the tree" do
51
+ @config.load_training_data
52
+ snps = (33..65).to_a
53
+ @tree.stub!(:snps_random_sample).and_return(snps)
54
+ @tree.used_snps.should be_nil
55
+ @tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
56
+ @tree.used_snps.size.should > 4
57
+ @tree.used_snps.each{|snp|
58
+ snps.include?(snp).should be_true
59
+ }
60
+ end
61
+
62
+ it "labels node when building a node and there is not a suitable split" do
63
+ @config.load_training_data
64
+ @tree.stub!(:snps_random_sample).and_return([33])
65
+
66
+ @tree.individuals = @config.training_set.individuals
67
+ @tree.id_to_fenotype = @config.training_set.ids_fenotypes
68
+ @tree.used_snps = []
69
+ @tree.predictions = {}
70
+
71
+ branch = @tree.build_node @config.training_set.all_ids, Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
72
+ branch[33][0].should be_kind_of String
73
+ branch[33][1].should be_kind_of String
74
+ branch[33][2].should be_kind_of String
75
+ end
76
+
77
+ it "labels node when building a node with less individuals than the minimum node size" do
78
+ @config.load_training_data
79
+
80
+ @tree.individuals = @config.training_set.individuals
81
+ @tree.id_to_fenotype = @config.training_set.ids_fenotypes
82
+ @tree.used_snps = []
83
+ @tree.predictions = {}
84
+
85
+ label = @tree.build_node [1, 10, 33], Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
86
+ label.should be_kind_of String
87
+
88
+ label = @tree.build_node [2, 10], Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
89
+ label.should be_kind_of String
90
+
91
+ label = @tree.build_node [1, 10, 33], Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
92
+ label.should be_kind_of String
93
+
94
+ label = @tree.build_node [99, 22, 10, 33], Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
95
+ label.should be_kind_of String
96
+ end
97
+
98
+ it 'computes generalization error for the tree' do
99
+ @config.load_training_data
100
+ @tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
101
+ @tree.generalization_error.should be_nil
102
+ @tree.generalization_error_from_oob((3..300).to_a)
103
+ @tree.generalization_error.should be_kind_of Numeric
104
+ @tree.generalization_error.should > 0.0
105
+ @tree.generalization_error.should < 1.0
106
+ end
107
+
108
+ it 'estimates importance for all SNPs' do
109
+ @config.load_training_data
110
+ @tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
111
+ @tree.importances.should be_nil
112
+ @tree.estimate_importances((200..533).to_a)
113
+ @tree.importances.should be_kind_of Hash
114
+ @tree.importances.keys.should_not be_empty
115
+ (@tree.importances.keys - (1..100).to_a).should be_empty #all keys are snp indexes (100 snps in training file)
116
+ end
117
+
118
+ it 'get prediction for an individual pushing it down a tree structure' do
119
+ tree_structure = YAML.load(File.open fixture_file('classification_random_forest.yml')).first
120
+ individual_data = [0]*100
121
+ prediction = Nimbus::Tree.traverse tree_structure, individual_data
122
+ prediction.should == '1'
123
+
124
+ individual_data[26-1] = 1
125
+ individual_data[57-1] = 2
126
+ individual_data[98-1] = 2
127
+ individual_data[8-1] = 1
128
+ prediction = Nimbus::Tree.traverse tree_structure, individual_data
129
+ prediction.should == '0'
130
+ end
131
+
132
+ end
@@ -2,30 +2,57 @@
2
2
  require File.dirname(__FILE__) + '/spec_helper'
3
3
 
4
4
  describe Nimbus::Configuration do
5
-
5
+
6
6
  it "loads configuration options from config file" do
7
7
  config = Nimbus::Configuration.new
8
8
  config.load fixture_file('regression_config.yml')
9
-
9
+
10
10
  config.training_file.should == fixture_file('regression_training.data')
11
11
  config.testing_file.should == fixture_file('regression_testing.data')
12
12
  config.forest_file.should == fixture_file('regression_random_forest.yml')
13
-
13
+ config.classes.should be_nil
14
+
14
15
  config.forest_size.should == 3
15
16
  config.tree_SNP_sample_size.should == 60
16
17
  config.tree_SNP_total_count.should == 200
17
- config.tree_node_min_size.should == 5
18
+ config.tree_node_min_size.should == 5
19
+
20
+ config = Nimbus::Configuration.new
21
+ config.load fixture_file('classification_config.yml')
22
+
23
+ config.training_file.should == fixture_file('classification_training.data')
24
+ config.testing_file.should == fixture_file('classification_testing.data')
25
+ config.forest_file.should == fixture_file('classification_random_forest.yml')
26
+ config.classes.should == ['0','1']
27
+
28
+ config.forest_size.should == 3
29
+ config.tree_SNP_sample_size.should == 33
30
+ config.tree_SNP_total_count.should == 100
31
+ config.tree_node_min_size.should == 5
32
+ end
33
+
34
+ it 'tree method return tree-related subset of options for regression trees' do
35
+ config = Nimbus::Configuration.new
36
+ config.load fixture_file('regression_config.yml')
37
+ tree_options = config.tree
38
+
39
+ tree_options[:snp_sample_size].should_not be_nil
40
+ tree_options[:snp_total_count].should_not be_nil
41
+ tree_options[:tree_node_min_size].should_not be_nil
42
+ tree_options[:classes].should be_nil
18
43
  end
19
-
20
- it 'tree method return tree-related subset of options' do
44
+
45
+ it 'tree method return tree-related subset of options for classification trees' do
21
46
  config = Nimbus::Configuration.new
47
+ config.load fixture_file('classification_config.yml')
22
48
  tree_options = config.tree
23
-
49
+
24
50
  tree_options[:snp_sample_size].should_not be_nil
25
51
  tree_options[:snp_total_count].should_not be_nil
26
52
  tree_options[:tree_node_min_size].should_not be_nil
53
+ tree_options[:classes].should_not be_nil
27
54
  end
28
-
55
+
29
56
  it "creates a training set object from training data file" do
30
57
  config = Nimbus::Configuration.new
31
58
  config.load fixture_file('regression_config.yml')
@@ -33,30 +60,30 @@ describe Nimbus::Configuration do
33
60
  config.load_training_data
34
61
  config.training_set.should be_kind_of Nimbus::TrainingSet
35
62
  config.training_set.all_ids.sort.should == (1..800).to_a
36
-
63
+
37
64
  File.open(fixture_file('regression_training.data')) {|file|
38
65
  feno1, id1, *snp_list_1 = file.readline.split
39
66
  feno2, id2, *snp_list_2 = file.readline.split
40
67
  feno3, id3, *snp_list_3 = file.readline.split
41
-
68
+
42
69
  i1 = Nimbus::Individual.new(id1.to_i, feno1.to_f, snp_list_1.map{|snp| snp.to_i})
43
70
  i2 = Nimbus::Individual.new(id2.to_i, feno2.to_f, snp_list_2.map{|snp| snp.to_i})
44
71
  i3 = Nimbus::Individual.new(id3.to_i, feno3.to_f, snp_list_3.map{|snp| snp.to_i})
45
-
72
+
46
73
  config.training_set.individuals[id1.to_i].id.should == i1.id
47
74
  config.training_set.individuals[id2.to_i].fenotype.should == i2.fenotype
48
75
  config.training_set.individuals[id3.to_i].snp_list.should == i3.snp_list
49
-
76
+
50
77
  config.training_set.ids_fenotypes[id1.to_i] = feno1.to_f
51
78
  config.training_set.ids_fenotypes[id2.to_i] = feno2.to_f
52
79
  config.training_set.ids_fenotypes[id3.to_i] = feno3.to_f
53
80
  }
54
81
  end
55
-
82
+
56
83
  it "reads testing data and yields one individual at a time" do
57
84
  config = Nimbus::Configuration.new
58
85
  config.load fixture_file('regression_config.yml')
59
-
86
+
60
87
  test_individuals = []
61
88
  File.open(fixture_file('regression_testing.data')) {|file|
62
89
  file.each do |line|
@@ -73,20 +100,20 @@ describe Nimbus::Configuration do
73
100
  individual.snp_list.should == test_individual.snp_list
74
101
  }
75
102
  end
76
-
103
+
77
104
  it "creates a forest object loading data from a yaml file" do
78
105
  config = Nimbus::Configuration.new
79
106
  config.load fixture_file('regression_config.yml')
80
-
107
+
81
108
  trees = YAML.load(File.open fixture_file('regression_random_forest.yml'))
82
109
  trees.first.keys.first.should == 189
83
110
  trees.size.should == 3
84
-
111
+
85
112
  forest = config.load_forest
86
113
  forest.should be_kind_of Nimbus::Forest
87
114
  forest.trees[0].should == trees.first
88
115
  forest.trees[1].should == trees[1]
89
- forest.trees.last.should == trees[2]
116
+ forest.trees.last.should == trees[2]
90
117
  end
91
-
118
+
92
119
  end