RubyGems - nimbus - Versions diffs - 1.0.1 → 2.0.0 - Mend

nimbus 1.0.1 → 2.0.0

Files changed (29) hide show

data/README.md +149 -0
data/lib/nimbus.rb +15 -11
data/lib/nimbus/application.rb +20 -23
data/lib/nimbus/classification_tree.rb +111 -0
data/lib/nimbus/configuration.rb +52 -37
data/lib/nimbus/forest.rb +56 -20
data/lib/nimbus/individual.rb +7 -7
data/lib/nimbus/loss_functions.rb +44 -10
data/lib/nimbus/regression_tree.rb +103 -0
data/lib/nimbus/training_set.rb +4 -4
data/lib/nimbus/tree.rb +20 -83
data/lib/nimbus/version.rb +3 -0
data/spec/classification_tree_spec.rb +132 -0
data/spec/configuration_spec.rb +46 -19
data/spec/fixtures/classification_config.yml +13 -0
data/spec/fixtures/classification_random_forest.yml +922 -0
data/spec/fixtures/classification_testing.data +500 -0
data/spec/fixtures/classification_training.data +1000 -0
data/spec/forest_spec.rb +109 -50
data/spec/individual_spec.rb +2 -2
data/spec/loss_functions_spec.rb +71 -0
data/spec/nimbus_spec.rb +4 -4
data/spec/regression_tree_spec.rb +129 -0
data/spec/training_set_spec.rb +5 -5
data/spec/tree_spec.rb +4 -115
metadata +53 -45
data/spec/fixtures/regression_snp_importances.txt +0 -200
data/spec/fixtures/regression_testing_file_predictions.txt +0 -200
data/spec/fixtures/regression_training_file_predictions.txt +0 -758

data/lib/nimbus/regression_tree.rb ADDED Viewed

@@ -0,0 +1,103 @@
+module Nimbus
+  #####################################################################
+  # Tree object representing a random regression tree.
+  #
+  # A tree is generated following these steps:
+  #
+  # * 1: Calculate loss function for the individuals in the node (first node contains all the individuals).
+  # * 2: Take a random sample of the SNPs (size m << total count of SNPs)
+  # * 3: Compute the loss function (quadratic loss) for the split of the sample based on value of every SNP.
+  # * 4: If the SNP with minimum loss function also minimizes the general loss of the node, split the individuals sample in three nodes, based on value for that SNP [0, 1, or 2]
+  # * 5: Repeat from 1 for every node until:
+  #   - a) The individuals count in that node is < minimum size OR
+  #   - b) None of the SNP splits has a loss function smaller than the node loss function
+  # * 6) When a node stops, label the node with the average fenotype value of the individuals in the node.
+  #
+  class RegressionTree < Tree
+    # Creates the structure of the tree, as a hash of SNP splits and values.
+    #
+    # It just initializes the needed variables and then defines the first node of the tree.
+    # The rest of the structure of the tree is computed recursively building every node calling +build_node+.
+    def seed(all_individuals, individuals_sample, ids_fenotypes)
+      super
+      @structure = build_node individuals_sample, Nimbus::LossFunctions.average(individuals_sample, @id_to_fenotype)
+    end
+    # Creates a node by taking a random sample of the SNPs and computing the loss function for every split by SNP of that sample.
+    #
+    # * If SNP_min is the SNP with the smallest loss function and that loss is < the loss function of the node, it splits the individuals sample in three:
+    # (those with value 0 for the SNP_min, those with value 1 for the SNP_min, and those with value 2 for the SNP_min) then it builds these 3 new nodes.
+    # * Otherwise every individual in the node gets labeled with the average of the fenotype values of all of them.
+    def build_node(individuals_ids, y_hat)
+      # General loss function value for the node
+      individuals_count = individuals_ids.size
+      return label_node(y_hat, individuals_ids) if individuals_count < @node_min_size
+      node_loss_function = Nimbus::LossFunctions.quadratic_loss individuals_ids, @id_to_fenotype, y_hat
+      # Finding the SNP that minimizes loss function
+      snps = snps_random_sample
+      min_loss, min_SNP, split, means = node_loss_function, nil, nil, nil
+      snps.each do |snp|
+        individuals_split_by_snp_value = split_by_snp_value individuals_ids, snp
+        mean_0 = Nimbus::LossFunctions.average individuals_split_by_snp_value[0], @id_to_fenotype
+        mean_1 = Nimbus::LossFunctions.average individuals_split_by_snp_value[1], @id_to_fenotype
+        mean_2 = Nimbus::LossFunctions.average individuals_split_by_snp_value[2], @id_to_fenotype
+        loss_0 = Nimbus::LossFunctions.mean_squared_error individuals_split_by_snp_value[0], @id_to_fenotype, mean_0
+        loss_1 = Nimbus::LossFunctions.mean_squared_error individuals_split_by_snp_value[1], @id_to_fenotype, mean_1
+        loss_2 = Nimbus::LossFunctions.mean_squared_error individuals_split_by_snp_value[2], @id_to_fenotype, mean_2
+        loss_snp = (loss_0 + loss_1 + loss_2) / individuals_count
+        min_loss, min_SNP, split, means = loss_snp, snp, individuals_split_by_snp_value, [mean_0, mean_1, mean_2] if loss_snp < min_loss
+      end
+      return build_branch(min_SNP, split, means, y_hat) if min_loss < node_loss_function
+      return label_node(y_hat, individuals_ids)
+    end
+    # Compute generalization error for the tree.
+    #
+    # By traversing the 'out of bag' (OOB) sample (those individuals of the training set not
+    # used in the building of this tree) through the tree, and comparing
+    # the prediction with the real fenotype of each individual (and then averaging), it is
+    # possible to calculate the unbiased generalization error for the tree.
+    def generalization_error_from_oob(oob_ids)
+      return nil if (@structure.nil? || @individuals.nil? || @id_to_fenotype.nil?)
+      oob_errors = {}
+      oob_ids.each do |oobi|
+        oob_prediction = Tree.traverse @structure, individuals[oobi].snp_list
+        oob_errors[oobi] = Nimbus::LossFunctions.squared_difference oob_prediction, @id_to_fenotype[oobi]
+      end
+      @generalization_error = Nimbus::LossFunctions.average oob_ids, oob_errors
+    end
+    # Estimation of importance for every SNP.
+    #
+    # The importance of any SNP in the tree is calculated using the OOB sample.
+    # For every SNP, every individual in the sample is pushed down the tree but with the
+    # value of that SNP permuted with that of another individual in the sample.
+    #
+    # That way the difference between the regular prediction and the prediction with the SNP value modified can be estimated for any given SNP.
+    #
+    # This method computes importance estimations for every SNP used in the tree (for any other SNP it would be 0).
+    def estimate_importances(oob_ids)
+      return nil if (@generalization_error.nil? && generalization_error_from_oob(oob_ids).nil?)
+      oob_individuals_count = oob_ids.size
+      @importances = {}
+      @used_snps.uniq.each do |current_snp|
+        shuffled_ids = oob_ids.shuffle
+        permutated_snp_error = 0.0
+        oob_ids.each_with_index {|oobi, index|
+          permutated_prediction = traverse_with_permutation @structure, individuals[oobi].snp_list, current_snp, individuals[shuffled_ids[index]].snp_list
+          permutated_snp_error += Nimbus::LossFunctions.squared_difference @id_to_fenotype[oobi], permutated_prediction
+        }
+        @importances[current_snp] = ((permutated_snp_error / oob_individuals_count) - @generalization_error).round(5)
+      end
+      @importances
+    end
+  end
+end

data/lib/nimbus/training_set.rb CHANGED Viewed

@@ -1,23 +1,23 @@
 module Nimbus
   #####################################################################
   # Set of individuals to be used as training sample for a random forest.
-  #
+  #
   # the TrainingSet class stores an array of individuals, and a hash with the fenotypes of every individual indexed by id.
   #
   class TrainingSet
     attr_accessor :individuals, :ids_fenotypes
     # Initialize a new training set with the individuals and fenotype info received.
     def initialize(individuals, ids_fenotypes)
       @individuals   = individuals
       @ids_fenotypes = ids_fenotypes
     end
     # Array of all the ids of the individuals in this training sample.
     def all_ids
       @all_ids ||= @ids_fenotypes.keys
       @all_ids
     end
   end
 end

data/lib/nimbus/tree.rb CHANGED Viewed

@@ -6,25 +6,25 @@ module Nimbus
   # A tree is generated following these steps:
   #
   # * 1: Calculate loss function for the individuals in the node (first node contains all the individuals).
-  # * 2: Take a random sample of the SNPs (size m << total count of SNPs)
+  # * 2: Take a random sample of the SNPs (size m << total count of SNPs)
   # * 3: Compute the loss function for the split of the sample based on value of every SNP.
   # * 4: If the SNP with minimum loss function also minimizes the general loss of the node, split the individuals sample in three nodes, based on value for that SNP [0, 1, or 2]
   # * 5: Repeat from 1 for every node until:
   #   - a) The individuals count in that node is < minimum size OR
   #   - b) None of the SNP splits has a loss function smaller than the node loss function
-  # * 6) When a node stops, label the node with the average fenotype value of the individuals in the node.
+  # * 6) When a node stops, label the node with the average fenotype value (for regression problems) or the majority class (for classification problems) of the individuals in the node.
   #
   class Tree
     attr_accessor :snp_sample_size, :snp_total_count, :node_min_size, :used_snps, :structure, :generalization_error, :predictions, :importances
     attr_accessor :individuals, :id_to_fenotype
     # Initialize Tree object with the configuration (as in Nimbus::Configuration.tree) options received.
     def initialize(options)
       @snp_total_count = options[:snp_total_count]
       @snp_sample_size = options[:snp_sample_size]
       @node_min_size = options[:tree_node_min_size]
     end
     # Creates the structure of the tree, as a hash of SNP splits and values.
     #
     # It just initializes the needed variables and then defines the first node of the tree.
@@ -34,113 +34,50 @@ module Nimbus
       @id_to_fenotype = ids_fenotypes
       @predictions = {}
       @used_snps = []
-      @structure = build_node individuals_sample, Nimbus::LossFunctions.average(individuals_sample, @id_to_fenotype)
     end
     # Creates a node by taking a random sample of the SNPs and computing the loss function for every split by SNP of that sample.
-    #
-    # * If SNP_min is the SNP with smaller loss function and it is < the loss function of the node, it splits the individuals sample in three:
-    # (those with value 0 for the SNP_min, those with value 1 for the SNP_min, and those with value 2 for the SNP_min) then it builds these 3 new nodes.
-    # * Otherwise every individual in the node gets labeled with the average of the fenotype values of all of them.
     def build_node(individuals_ids, y_hat)
-      # General loss function value for the node
-      individuals_count = individuals_ids.size
-      return label_node(y_hat, individuals_ids) if individuals_count < @node_min_size
-      node_loss_function = Nimbus::LossFunctions.quadratic_loss individuals_ids, @id_to_fenotype, y_hat
-      # Finding the SNP that minimizes loss function
-      snps = snps_random_sample
-      min_loss, min_SNP, split, means = node_loss_function, nil, nil, nil
-      snps.each do |snp|
-        individuals_split_by_snp_value = split_by_snp_value individuals_ids, snp
-        mean_0 = Nimbus::LossFunctions.average individuals_split_by_snp_value[0], @id_to_fenotype
-        mean_1 = Nimbus::LossFunctions.average individuals_split_by_snp_value[1], @id_to_fenotype
-        mean_2 = Nimbus::LossFunctions.average individuals_split_by_snp_value[2], @id_to_fenotype
-        loss_0 = Nimbus::LossFunctions.mean_squared_error individuals_split_by_snp_value[0], @id_to_fenotype, mean_0
-        loss_1 = Nimbus::LossFunctions.mean_squared_error individuals_split_by_snp_value[1], @id_to_fenotype, mean_1
-        loss_2 = Nimbus::LossFunctions.mean_squared_error individuals_split_by_snp_value[2], @id_to_fenotype, mean_2
-        loss_snp = (loss_0 + loss_1 + loss_2) / individuals_count
-        min_loss, min_SNP, split, means = loss_snp, snp, individuals_split_by_snp_value, [mean_0, mean_1, mean_2] if loss_snp < min_loss
-      end
-      return build_branch(min_SNP, split, means, y_hat) if min_loss < node_loss_function
-      return label_node(y_hat, individuals_ids)
     end
     # Compute generalization error for the tree.
-    #
-    # Traversing the 'out of bag' (OOB) sample (those individuals of the training set not
-    # used in the building of this tree) through the tree, and comparing
-    # the prediction with the real fenotype of the individual (and then averaging) is
-    # possible to calculate the unbiased generalization error for the tree.
     def generalization_error_from_oob(oob_ids)
-      return nil if (@structure.nil? || @individuals.nil? || @id_to_fenotype.nil?)
-      oob_errors = {}
-      oob_ids.each do |oobi|
-        oob_prediction = Tree.traverse @structure, individuals[oobi].snp_list
-        oob_errors[oobi] = Nimbus::LossFunctions.squared_difference oob_prediction, @id_to_fenotype[oobi]
-      end
-      @generalization_error = Nimbus::LossFunctions.average oob_ids, oob_errors
     end
     # Estimation of importance for every SNP.
-    #
-    # The importance of any SNP in the tree is calculated using the OOB sample.
-    # For every SNP, every individual in the sample is pushed down the tree but with the
-    # value of that SNP permuted with other individual in the sample.
-    #
-    # That way the difference between the regular prediction and the prediction with the SNP value modified can be estimated for any given SNP.
-    #
-    # This method computes importance estimations for every SNPs used in the tree (for any other SNP it would be 0).
     def estimate_importances(oob_ids)
-      return nil if (@generalization_error.nil? && generalization_error_from_oob(oob_ids).nil?)
-      oob_individuals_count = oob_ids.size
-      @importances = {}
-      @used_snps.uniq.each do |current_snp|
-        shuffled_ids = oob_ids.shuffle
-        permutated_snp_error = 0.0
-        oob_ids.each_with_index {|oobi, index|
-          permutated_prediction = traverse_with_permutation @structure, individuals[oobi].snp_list, current_snp, individuals[shuffled_ids[index]].snp_list
-          permutated_snp_error += Nimbus::LossFunctions.squared_difference @id_to_fenotype[oobi], permutated_prediction
-        }
-        @importances[current_snp] = ((permutated_snp_error / oob_individuals_count) - @generalization_error).round(5)
-      end
-      @importances
     end
-    # Class method to traverse a single individual through a tree structure.
+    # Class method to traverse a single individual through a tree structure.
     #
     # Returns the prediction for that individual (the label of the final node reached by the individual).
     def self.traverse(tree_structure, data)
-      return tree_structure if tree_structure.is_a? Numeric
+      return tree_structure if tree_structure.is_a?(Numeric) || tree_structure.is_a?(String)
       raise Nimbus::TreeError, "Forest data has invalid structure. Please check your forest data (file)." if !(tree_structure.is_a?(Hash) && tree_structure.keys.size == 1)
       return self.traverse( tree_structure.values.first[ data[tree_structure.keys.first - 1].to_i], data)
     end
-    private
+    protected
     def snps_random_sample
       (1..@snp_total_count).to_a.sample(@snp_sample_size).sort
     end
     def build_branch(snp, split, y_hats, parent_y_hat)
       node_0 = split[0].size == 0 ? label_node(parent_y_hat, []) : build_node(split[0], y_hats[0])
       node_1 = split[1].size == 0 ? label_node(parent_y_hat, []) : build_node(split[1], y_hats[1])
       node_2 = split[2].size == 0 ? label_node(parent_y_hat, []) : build_node(split[2], y_hats[2])
       split_by_snp(snp)
       return { snp => [node_0, node_1, node_2] }
     end
     def label_node(value, ids)
-      label = value.round(5)
+      label = value.is_a?(String) ? value : value.round(5)
       ids.uniq.each{|i| @predictions[i] = label}
       label
     end
     def split_by_snp_value(ids, snp)
       split = [[], [], []]
       ids.each do |i|
@@ -150,17 +87,17 @@ module Nimbus
     rescue => ex
       raise Nimbus::TreeError, "Values for SNPs columns must be in [0, 1, 2]"
     end
     def split_by_snp(x)
       @used_snps << x
     end
     def traverse_with_permutation(tree_structure, data, snp_to_permute, individual_to_permute)
-      return tree_structure if tree_structure.is_a? Numeric
+      return tree_structure if tree_structure.is_a?(Numeric) || tree_structure.is_a?(String)
       individual_data = (tree_structure.keys.first == snp_to_permute ? individual_to_permute : data)
       return traverse_with_permutation( tree_structure.values.first[ individual_data[tree_structure.keys.first - 1].to_i], data, snp_to_permute, individual_to_permute)
     end
   end
 end

data/lib/nimbus/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module Nimbus
+  VERSION = "2.0.0"
+end

data/spec/classification_tree_spec.rb ADDED Viewed

@@ -0,0 +1,132 @@
+require File.dirname(__FILE__) + '/spec_helper'
+describe Nimbus::ClassificationTree do
+  before(:each) do
+    @config = Nimbus::Configuration.new
+    @config.load fixture_file('classification_config.yml')
+    @tree = Nimbus::ClassificationTree.new @config.tree
+  end
+  it "is initialized with tree config info" do
+    @tree.snp_total_count.should == 100
+    @tree.snp_sample_size.should == 33
+    @tree.node_min_size.should   == 5
+    @tree.classes.size.should    == 2
+    @tree.classes[0].should      == '0'
+    @tree.classes[1].should      == '1'
+  end
+  it "creates a tree structure when seeded with training data" do
+    @config.load_training_data
+    @tree.structure.should be_nil
+    @tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
+    @tree.structure.should_not be_nil
+    @tree.structure.should be_kind_of Hash
+    @tree.structure.keys.first.should == @tree.used_snps.last
+    @tree.used_snps.should_not be_empty
+  end
+  it "splits node in three when building a node and finds a suitable split" do
+    @config.load_training_data
+    @tree.stub!(:snps_random_sample).and_return((68..100).to_a) #97 is best split
+    @tree.individuals = @config.training_set.individuals
+    @tree.id_to_fenotype = @config.training_set.ids_fenotypes
+    @tree.used_snps = []
+    @tree.predictions = {}
+    branch = @tree.build_node @config.training_set.all_ids, Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
+    branch.keys.size.should == 1
+    branch.keys.first.should == 97
+    branch[97].size.should == 3
+    branch[97][0].should be_kind_of Hash
+    branch[97][1].should be_kind_of Hash
+    branch[97][2].should be_kind_of Hash
+  end
+  it "keeps track of all SNPs used for the tree" do
+    @config.load_training_data
+    snps = (33..65).to_a
+    @tree.stub!(:snps_random_sample).and_return(snps)
+    @tree.used_snps.should be_nil
+    @tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
+    @tree.used_snps.size.should > 4
+    @tree.used_snps.each{|snp|
+      snps.include?(snp).should be_true
+    }
+  end
+  it "labels node when building a node and there is not a suitable split" do
+    @config.load_training_data
+    @tree.stub!(:snps_random_sample).and_return([33])
+    @tree.individuals = @config.training_set.individuals
+    @tree.id_to_fenotype = @config.training_set.ids_fenotypes
+    @tree.used_snps = []
+    @tree.predictions = {}
+    branch = @tree.build_node @config.training_set.all_ids, Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
+    branch[33][0].should be_kind_of String
+    branch[33][1].should be_kind_of String
+    branch[33][2].should be_kind_of String
+  end
+  it "labels node when building a node with less individuals than the minimum node size" do
+    @config.load_training_data
+    @tree.individuals = @config.training_set.individuals
+    @tree.id_to_fenotype = @config.training_set.ids_fenotypes
+    @tree.used_snps = []
+    @tree.predictions = {}
+    label = @tree.build_node [1, 10, 33], Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
+    label.should be_kind_of String
+    label = @tree.build_node [2, 10], Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
+    label.should be_kind_of String
+    label = @tree.build_node [1, 10, 33], Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
+    label.should be_kind_of String
+    label = @tree.build_node [99, 22, 10, 33], Nimbus::LossFunctions.majority_class(@config.training_set.all_ids, @config.training_set.ids_fenotypes, @config.classes)
+    label.should be_kind_of String
+  end
+  it 'computes generalization error for the tree' do
+    @config.load_training_data
+    @tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
+    @tree.generalization_error.should be_nil
+    @tree.generalization_error_from_oob((3..300).to_a)
+    @tree.generalization_error.should be_kind_of Numeric
+    @tree.generalization_error.should > 0.0
+    @tree.generalization_error.should < 1.0
+  end
+  it 'estimates importance for all SNPs' do
+    @config.load_training_data
+    @tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
+    @tree.importances.should be_nil
+    @tree.estimate_importances((200..533).to_a)
+    @tree.importances.should be_kind_of Hash
+    @tree.importances.keys.should_not be_empty
+    (@tree.importances.keys - (1..100).to_a).should be_empty #all keys are snp indexes (100 snps in training file)
+  end
+  it 'get prediction for an individual pushing it down a tree structure' do
+    tree_structure = YAML.load(File.open fixture_file('classification_random_forest.yml')).first
+    individual_data = [0]*100
+    prediction = Nimbus::Tree.traverse tree_structure, individual_data
+    prediction.should == '1'
+    individual_data[26-1] = 1
+    individual_data[57-1] = 2
+    individual_data[98-1] = 2
+    individual_data[8-1]  = 1
+    prediction = Nimbus::Tree.traverse tree_structure, individual_data
+    prediction.should == '0'
+  end
+end

data/spec/configuration_spec.rb CHANGED Viewed

@@ -2,30 +2,57 @@
 require File.dirname(__FILE__) + '/spec_helper'
 describe Nimbus::Configuration do
   it "loads configuration options from config file" do
     config = Nimbus::Configuration.new
     config.load fixture_file('regression_config.yml')
     config.training_file.should == fixture_file('regression_training.data')
     config.testing_file.should == fixture_file('regression_testing.data')
     config.forest_file.should == fixture_file('regression_random_forest.yml')
+    config.classes.should be_nil
     config.forest_size.should == 3
     config.tree_SNP_sample_size.should == 60
     config.tree_SNP_total_count.should == 200
-    config.tree_node_min_size.should == 5
+    config.tree_node_min_size.should == 5
+    config = Nimbus::Configuration.new
+    config.load fixture_file('classification_config.yml')
+    config.training_file.should == fixture_file('classification_training.data')
+    config.testing_file.should == fixture_file('classification_testing.data')
+    config.forest_file.should == fixture_file('classification_random_forest.yml')
+    config.classes.should == ['0','1']
+    config.forest_size.should == 3
+    config.tree_SNP_sample_size.should == 33
+    config.tree_SNP_total_count.should == 100
+    config.tree_node_min_size.should == 5
+  end
+  it 'tree method return tree-related subset of options for regression trees' do
+    config = Nimbus::Configuration.new
+    config.load fixture_file('regression_config.yml')
+    tree_options = config.tree
+    tree_options[:snp_sample_size].should_not be_nil
+    tree_options[:snp_total_count].should_not be_nil
+    tree_options[:tree_node_min_size].should_not be_nil
+    tree_options[:classes].should be_nil
   end
-  it 'tree method return tree-related subset of options' do
+  it 'tree method return tree-related subset of options for classification trees' do
     config = Nimbus::Configuration.new
+    config.load fixture_file('classification_config.yml')
     tree_options = config.tree
     tree_options[:snp_sample_size].should_not be_nil
     tree_options[:snp_total_count].should_not be_nil
     tree_options[:tree_node_min_size].should_not be_nil
+    tree_options[:classes].should_not be_nil
   end
   it "creates a training set object from training data file" do
     config = Nimbus::Configuration.new
     config.load fixture_file('regression_config.yml')
@@ -33,30 +60,30 @@ describe Nimbus::Configuration do
     config.load_training_data
     config.training_set.should be_kind_of Nimbus::TrainingSet
     config.training_set.all_ids.sort.should == (1..800).to_a
     File.open(fixture_file('regression_training.data')) {|file|
       feno1, id1, *snp_list_1 = file.readline.split
       feno2, id2, *snp_list_2 = file.readline.split
       feno3, id3, *snp_list_3 = file.readline.split
       i1 = Nimbus::Individual.new(id1.to_i, feno1.to_f, snp_list_1.map{|snp| snp.to_i})
       i2 = Nimbus::Individual.new(id2.to_i, feno2.to_f, snp_list_2.map{|snp| snp.to_i})
       i3 = Nimbus::Individual.new(id3.to_i, feno3.to_f, snp_list_3.map{|snp| snp.to_i})
       config.training_set.individuals[id1.to_i].id.should == i1.id
       config.training_set.individuals[id2.to_i].fenotype.should == i2.fenotype
       config.training_set.individuals[id3.to_i].snp_list.should == i3.snp_list
       config.training_set.ids_fenotypes[id1.to_i] = feno1.to_f
       config.training_set.ids_fenotypes[id2.to_i] = feno2.to_f
       config.training_set.ids_fenotypes[id3.to_i] = feno3.to_f
     }
   end
   it "reads testing data and yields one individual at a time" do
     config = Nimbus::Configuration.new
     config.load fixture_file('regression_config.yml')
     test_individuals = []
     File.open(fixture_file('regression_testing.data')) {|file|
       file.each do |line|
@@ -73,20 +100,20 @@ describe Nimbus::Configuration do
       individual.snp_list.should == test_individual.snp_list
     }
   end
   it "creates a forest object loading data from a yaml file" do
     config = Nimbus::Configuration.new
     config.load fixture_file('regression_config.yml')
     trees = YAML.load(File.open fixture_file('regression_random_forest.yml'))
     trees.first.keys.first.should == 189
     trees.size.should == 3
     forest = config.load_forest
     forest.should be_kind_of Nimbus::Forest
     forest.trees[0].should == trees.first
     forest.trees[1].should == trees[1]
-    forest.trees.last.should == trees[2]
+    forest.trees.last.should == trees[2]
   end
 end