nimbus 2.2.1 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CODE_OF_CONDUCT.md +7 -0
- data/CONTRIBUTING.md +46 -0
- data/MIT-LICENSE.txt +1 -1
- data/README.md +131 -21
- data/bin/nimbus +2 -2
- data/lib/nimbus.rb +2 -6
- data/lib/nimbus/classification_tree.rb +9 -12
- data/lib/nimbus/configuration.rb +22 -22
- data/lib/nimbus/forest.rb +8 -8
- data/lib/nimbus/loss_functions.rb +11 -0
- data/lib/nimbus/regression_tree.rb +8 -10
- data/lib/nimbus/tree.rb +54 -12
- data/lib/nimbus/version.rb +1 -1
- data/spec/classification_tree_spec.rb +47 -47
- data/spec/configuration_spec.rb +55 -55
- data/spec/fixtures/{classification_config.yml → classification/config.yml} +3 -3
- data/spec/fixtures/classification/random_forest.yml +1174 -0
- data/spec/fixtures/{classification_testing.data → classification/testing.data} +0 -0
- data/spec/fixtures/{classification_training.data → classification/training.data} +0 -0
- data/spec/fixtures/{regression_config.yml → regression/config.yml} +4 -4
- data/spec/fixtures/regression/random_forest.yml +2737 -0
- data/spec/fixtures/{regression_testing.data → regression/testing.data} +0 -0
- data/spec/fixtures/{regression_training.data → regression/training.data} +0 -0
- data/spec/forest_spec.rb +39 -39
- data/spec/individual_spec.rb +3 -3
- data/spec/loss_functions_spec.rb +31 -13
- data/spec/nimbus_spec.rb +2 -2
- data/spec/regression_tree_spec.rb +44 -44
- data/spec/training_set_spec.rb +3 -3
- data/spec/tree_spec.rb +4 -4
- metadata +37 -34
- data/spec/fixtures/classification_random_forest.yml +0 -922
- data/spec/fixtures/regression_random_forest.yml +0 -1741
data/spec/forest_spec.rb CHANGED

@@ -5,47 +5,47 @@ describe Nimbus::Forest do
   describe "Regression" do
     before(:each) do
       @config = Nimbus::Configuration.new
-      @config.load fixture_file('
-      @config.load_training_data
+      @config.load fixture_file('regression/config.yml')
+      @config.load_training_data if @config.do_training
       @forest = ::Nimbus::Forest.new @config
     end

     it 'grows a regression forest of N trees' do
-      @forest.trees.
-      @config.forest_size.
-      @forest.
-      @forest.
+      expect(@forest.trees).to eq []
+      expect(@config.forest_size).to eq 3
+      expect(@forest).to_not be_classification
+      expect(@forest).to be_regression
       @forest.grow
-      @forest.trees.size.
-      @forest.trees.each{|t| t.
+      expect(@forest.trees.size).to eq @config.forest_size
+      @forest.trees.each{|t| expect(t).to be_kind_of Hash}
     end

     it 'creates averaged predictions for individuals in the training set' do
-      @forest.predictions.
+      expect(@forest.predictions).to eq({})
       @forest.grow
-      (@forest.predictions.keys - (1..800).to_a ).
-      @forest.predictions.values.each{|v| v.
+      expect((@forest.predictions.keys - (1..800).to_a )).to eq [] # 800 individuals in the training file
+      @forest.predictions.values.each{|v| expect(v).to be_kind_of Numeric}
     end

     it 'computes averaged SNP importances for every SNP' do
-      @forest.snp_importances.
+      expect(@forest.snp_importances).to eq({})
       @forest.grow
-      @forest.snp_importances.keys.sort.
-      @forest.snp_importances.values.each{|v| v.
+      expect(@forest.snp_importances.keys.sort).to eq (1..200).to_a # 200 snps in the training file
+      @forest.snp_importances.values.each{|v| expect(v).to be_kind_of Numeric}
     end

     it 'does not compute SNP importances if config set to false' do
-      @forest.snp_importances.
+      expect(@forest.snp_importances).to eq({})
       @forest.options.do_importances = false
       @forest.grow
-      @forest.snp_importances.
+      expect(@forest.snp_importances).to eq({})
     end

     it 'traverses a set of testing individuals through every tree in the forest and returns predictions' do
       @forest = @config.load_forest
-      @forest.predictions.
+      expect(@forest.predictions).to eq({})

-      tree_structure = Psych.load(File.open fixture_file('
+      tree_structure = Psych.load(File.open fixture_file('regression/random_forest.yml'))
       expected_predictions = {}
       @config.read_testing_data{|individual|
         individual_prediction = 0.0

@@ -56,60 +56,60 @@ describe Nimbus::Forest do
       }

       @forest.traverse
-      @forest.predictions.
+      expect(@forest.predictions).to eq expected_predictions
     end

     it 'can output forest structure in YAML format' do
       @forest = @config.load_forest
-      Psych.load(File.open fixture_file('
+      Psych.load(File.open fixture_file('regression/random_forest.yml')) == Psych.load(@forest.to_yaml)
     end
   end

   describe "Classification" do
     before(:each) do
       @config = Nimbus::Configuration.new
-      @config.load fixture_file('
+      @config.load fixture_file('classification/config.yml')
       @config.load_training_data
       @forest = ::Nimbus::Forest.new @config
     end

     it 'grows a classification forest of N trees' do
-      @forest.trees.
-      @config.forest_size.
-      @forest.
-      @forest.
+      expect(@forest.trees).to eq []
+      expect(@config.forest_size).to eq 3
+      expect(@forest).to be_classification
+      expect(@forest).to_not be_regression
       @forest.grow
-      @forest.trees.size.
-      @forest.trees.each{|t| t.
+      expect(@forest.trees.size).to eq @config.forest_size
+      @forest.trees.each{|t| expect(t).to be_kind_of Hash}
     end

     it 'creates predictions for individuals in the training set' do
-      @forest.predictions.
+      expect(@forest.predictions).to eq({})
       @forest.grow
-      (@forest.predictions.keys - (1..1000).to_a ).
-      @forest.predictions.values.each{|v| v.
+      expect((@forest.predictions.keys - (1..1000).to_a )).to eq [] # 1000 individuals in the training file
+      @forest.predictions.values.each{|v| expect(v).to be_kind_of String}
     end

     it 'computes averaged SNP importances for every SNP' do
-      @forest.snp_importances.
+      expect(@forest.snp_importances).to eq({})
       @forest.options.do_importances = true
       @forest.grow
-      @forest.snp_importances.keys.sort.
-      @forest.snp_importances.values.each{|v| v.
+      expect(@forest.snp_importances.keys.sort).to eq (1..100).to_a # 100 snps in the training file
+      @forest.snp_importances.values.each{|v| expect(v).to be_kind_of Numeric}
     end

     it 'does not compute SNP importances if config set to false' do
-      @forest.snp_importances.
+      expect(@forest.snp_importances).to eq({})
       @forest.options.do_importances = false
       @forest.grow
-      @forest.snp_importances.
+      expect(@forest.snp_importances).to eq({})
     end

     it 'traverses a set of testing individuals through every tree in the forest and returns predictions' do
       @forest = @config.load_forest
-      @forest.predictions.
+      expect(@forest.predictions).to eq({})

-      tree_structure = Psych.load(File.open fixture_file('
+      tree_structure = Psych.load(File.open fixture_file('classification/random_forest.yml'))
       expected_predictions = {}
       @config.read_testing_data{|individual|
         individual_prediction = []

@@ -121,12 +121,12 @@ describe Nimbus::Forest do
       }

       @forest.traverse
-      @forest.predictions.
+      expect(@forest.predictions).to eq expected_predictions
     end

     it 'can output forest structure in YAML format' do
       @forest = @config.load_forest
-      Psych.load(File.open fixture_file('
+      Psych.load(File.open fixture_file('classification/random_forest.yml')) == Psych.load(@forest.to_yaml)
     end
   end
 end
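Taken together, the forest expectations above trace the workflow these specs exercise: build a Nimbus::Configuration, load a config file and training data, grow a Nimbus::Forest, then read its predictions, SNP importances and YAML dump. The sketch below is reconstructed only from the calls visible in this spec; the config path is the renamed spec fixture and the output filename is illustrative, so treat it as a rough sketch rather than documented usage.

require 'nimbus'

config = Nimbus::Configuration.new
config.load 'regression/config.yml'              # fixture layout renamed in 2.3.0
config.load_training_data if config.do_training  # guard added in this release

forest = Nimbus::Forest.new(config)
forest.grow                                      # builds config.forest_size trees (Hashes)
forest.predictions                               # {individual_id => prediction}
forest.snp_importances                           # {} unless importances are enabled
File.write('random_forest.yml', forest.to_yaml)  # same YAML structure the specs load back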
data/spec/individual_spec.rb CHANGED

@@ -5,9 +5,9 @@ describe Nimbus::Individual do

   it "stores id, fenotype and SNPs information for an individual" do
     @individual = Nimbus::Individual.new(11, 33.275, [1,0,2,1])
-    @individual.id.
-    @individual.fenotype.
-    @individual.snp_list.
+    expect(@individual.id).to eq 11
+    expect(@individual.fenotype).to eq 33.275
+    expect(@individual.snp_list).to eq [1,0,2,1]
   end

 end
data/spec/loss_functions_spec.rb CHANGED

@@ -7,39 +7,57 @@ describe Nimbus::LossFunctions do
     ids = [1,3,5,7]
     values = {1 => 10, 2 => 5, 3 => 21, 4 => 8, 5 => 31, 7 => 11, 85 => 22}

-    Nimbus::LossFunctions.average(ids, values).
+    expect(Nimbus::LossFunctions.average(ids, values)).to eq 18.25 # (10 + 21 + 31 + 11 = 73)/4
   end

   it "method for mean squared error" do
     ids = [3,7,85]
     values = {1 => 10, 2 => 5, 3 => 21, 4 => 8, 5 => 31, 7 => 11, 85 => 22}

-    Nimbus::LossFunctions.mean_squared_error(ids, values).
+    expect(Nimbus::LossFunctions.mean_squared_error(ids, values)).to eq 74.0 # (avg(21 + 11 + 22) = 18: sum (x-18)^2
   end

   it "method for quadratic_loss" do
     ids = [1,4]
     values = {1 => 10, 2 => 5, 3 => 21, 4 => 8, 5 => 31, 7 => 11, 85 => 22}

-    Nimbus::LossFunctions.quadratic_loss(ids, values).round(5).
+    expect(Nimbus::LossFunctions.quadratic_loss(ids, values).round(5)).to eq 1
   end

   it "quadratic loss is mean squared error averaged" do
     ids = [1,2,3,4,5,7,85]
     values = {1 => 10, 2 => 5, 3 => 21, 4 => 8, 5 => 31, 7 => 11, 85 => 22}
-    Nimbus::LossFunctions.quadratic_loss(ids, values).round(5).
+    expect(Nimbus::LossFunctions.quadratic_loss(ids, values).round(5)).to eq (Nimbus::LossFunctions.mean_squared_error(ids, values)/7 ).round(5)
+  end
+
+  it "method for pseudo Huber error" do
+    ids = [3,7,85]
+    values = {1 => 10, 2 => 5, 3 => 21, 4 => 8, 5 => 31, 7 => 11, 85 => 22}
+    expect(Nimbus::LossFunctions.pseudo_huber_error(ids, values).round(5)).to eq 11.92337 # (avg(21 + 11 + 22) = 18: log(cosh(x-18))
+  end
+
+  it "method for pseudo Huber loss function" do
+    ids = [1,4]
+    values = {1 => 10, 2 => 5, 3 => 21, 4 => 8, 5 => 31, 7 => 11, 85 => 22}
+    expect(Nimbus::LossFunctions.pseudo_huber_loss(ids, values).round(5)).to eq 0.43378
+  end
+
+  it "pseudo Huber loss is pseudo Huber error averaged" do
+    ids = [1,2,3,4,5,7,85]
+    values = {1 => 10, 2 => 5, 3 => 21, 4 => 8, 5 => 31, 7 => 11, 85 => 22}
+    expect(Nimbus::LossFunctions.pseudo_huber_loss(ids, values).round(5)).to eq (Nimbus::LossFunctions.pseudo_huber_error(ids, values)/7 ).round(5)
   end

   it "method for squared difference" do
-    Nimbus::LossFunctions.squared_difference(50, 40).
-    Nimbus::LossFunctions.squared_difference(22, 10).
+    expect(Nimbus::LossFunctions.squared_difference(50, 40)).to eq 100.0
+    expect(Nimbus::LossFunctions.squared_difference(22, 10)).to eq 144.0
   end

   it "method for majority class" do
     ids = [1,2,3,4,5,7,85]
     values = {1 => 'B', 2 => 'C', 3 => 'A', 4 => 'A', 5 => 'C', 7 => 'B', 85 => 'C'} #3C, 2A, 2B
     classes = ['A', 'B', 'C']
-    Nimbus::LossFunctions.majority_class(ids, values, classes).
+    expect(Nimbus::LossFunctions.majority_class(ids, values, classes)).to eq 'C'
   end

   it "majority class method selects randomly if more than one majority class" do

@@ -50,27 +68,27 @@ describe Nimbus::LossFunctions do
     20.times do
       results << Nimbus::LossFunctions.majority_class(ids, values, classes)
     end
-    results.
-    results.
+    expect(results).to include('A')
+    expect(results).to include('C')
   end

   it "method for majority class in list" do
     list = %w(A A A B B B C A B C A B A)
     classes = ['A', 'B', 'C']
-    Nimbus::LossFunctions.majority_class_in_list(list, classes).
+    expect(Nimbus::LossFunctions.majority_class_in_list(list, classes)).to eq 'A'
   end

   it "method for class sizes" do
     ids = [1,2,3,4,5,7,85]
     values = {1 => 'B', 2 => 'C', 3 => 'A', 4 => 'A', 5 => 'C', 7 => 'B', 85 => 'C'} #2A, 2B, 3C
     classes = ['A', 'B', 'C']
-    Nimbus::LossFunctions.class_sizes(ids, values, classes).
+    expect(Nimbus::LossFunctions.class_sizes(ids, values, classes)).to eq [2, 2, 3]
   end

   it "method for class sizes in list" do
     list = %w(A A A B B B C A B C A B A) # 6A, 5B, 2C
     classes = ['A', 'B', 'C']
-    Nimbus::LossFunctions.class_sizes_in_list(list, classes).
+    expect(Nimbus::LossFunctions.class_sizes_in_list(list, classes)).to eq [6, 5, 2]
   end

   it "Gini index" do

@@ -78,7 +96,7 @@ describe Nimbus::LossFunctions do
     values = {1 => 'B', 2 => 'C', 3 => 'A', 4 => 'A', 5 => 'C', 7 => 'C'} #3C, 2A, 1B
     classes = ['A', 'B', 'C']
     # Gini = 1 - ( (3/6)^2 + (2/6)^2 + (1/6)^2 ) = 0.61111
-    Nimbus::LossFunctions.gini_index(ids, values, classes).
+    expect(Nimbus::LossFunctions.gini_index(ids, values, classes)).to eq 0.61111
   end

 end
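The new pseudo-Huber examples pin down the arithmetic: for ids [3, 7, 85] the values are 21, 11 and 22, their average is 18, and log(cosh(21-18)) + log(cosh(11-18)) + log(cosh(22-18)) ≈ 2.30933 + 6.30685 + 3.30719 ≈ 11.92337, while the loss variant is that error divided by the number of ids (for ids [1, 4]: 2·log(cosh(1))/2 ≈ 0.43378). The implementation added to data/lib/nimbus/loss_functions.rb is not part of this diff, so the following is only a minimal sketch consistent with these expectations; the method names come from the spec, everything else is an assumption.

# Hypothetical sketch, not the gem's actual code: loss functions that would
# satisfy the pseudo-Huber expectations in the spec above.
module PseudoHuberSketch
  # Sum of log(cosh(value - mean)) over the given ids.
  def self.pseudo_huber_error(ids, value_table)
    mean = ids.sum { |i| value_table[i] } / ids.size.to_f
    ids.sum { |i| Math.log(Math.cosh(value_table[i] - mean)) }
  end

  # Error averaged over the number of ids, mirroring quadratic_loss vs mean_squared_error.
  def self.pseudo_huber_loss(ids, value_table)
    pseudo_huber_error(ids, value_table) / ids.size
  end
end

values = {3 => 21, 7 => 11, 85 => 22}
PseudoHuberSketch.pseudo_huber_error([3, 7, 85], values).round(5)  # => 11.92337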
data/spec/nimbus_spec.rb CHANGED

@@ -6,13 +6,13 @@ describe 'Nimbus module' do

   it "manages a Nimbus::Application object" do
     app = Nimbus.application
-    app.
+    expect(app).to be_kind_of Nimbus::Application
   end

   it "accepts setting an external Nimbus::Application" do
     app = Nimbus::Application.new
     Nimbus.application = app
-    Nimbus.application.
+    expect(Nimbus.application).to eq app
   end

 end
data/spec/regression_tree_spec.rb CHANGED

@@ -4,31 +4,31 @@ describe Nimbus::RegressionTree do

   before(:each) do
     @config = Nimbus::Configuration.new
-    @config.load fixture_file('
+    @config.load fixture_file('regression/config.yml')

     @tree = Nimbus::RegressionTree.new @config.tree
   end

   it "is initialized with tree config info" do
-    @tree.snp_total_count.
-    @tree.snp_sample_size.
-    @tree.node_min_size.
+    expect(@tree.snp_total_count).to eq 200
+    expect(@tree.snp_sample_size).to eq 60
+    expect(@tree.node_min_size).to eq 5
   end

   it "creates a tree structure when seeded with training data" do
     @config.load_training_data
-    @tree.structure.
+    expect(@tree.structure).to be_nil
     @tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
-    @tree.structure.
-    @tree.structure.
+    expect(@tree.structure).to_not be_nil
+    expect(@tree.structure).to be_kind_of Hash

-    @tree.structure.keys.first.
-    @tree.used_snps.
+    expect(@tree.structure.keys.first).to eq @tree.used_snps.last
+    expect(@tree.used_snps).to_not be_empty
   end

-  it "split node
+  it "split node when building a node and finds a suitable split" do
     @config.load_training_data
-
+    allow_any_instance_of(Nimbus::RegressionTree).to receive(:snps_random_sample).and_return((141..200).to_a) #189 is best split

     @tree.individuals = @config.training_set.individuals
     @tree.id_to_fenotype = @config.training_set.ids_fenotypes

@@ -36,29 +36,29 @@ describe Nimbus::RegressionTree do
     @tree.predictions = {}

     branch = @tree.build_node @config.training_set.all_ids, Nimbus::LossFunctions.average(@config.training_set.all_ids, @config.training_set.ids_fenotypes)
-    branch.keys.size.
-    branch.keys.first.
-    branch[189].size.
-    branch[189][0].
-    branch[189][1]
-    branch[189][2].
+    expect(branch.keys.size).to eq 1
+    expect(branch.keys.first).to eq 189
+    expect(branch[189].size).to eq 3
+    expect(branch[189][0]).to be_kind_of Hash
+    expect([Nimbus::Tree::NODE_SPLIT_01_2, Nimbus::Tree::NODE_SPLIT_0_12]).to include(branch[189][1])
+    expect(branch[189][2]).to be_kind_of Hash
   end

   it "keeps track of all SNPs used for the tree" do
     @config.load_training_data
     snps = (131..190).to_a
-
-    @tree.used_snps.
+    allow_any_instance_of(Nimbus::RegressionTree).to receive(:snps_random_sample).and_return(snps)
+    expect(@tree.used_snps).to be_nil
     @tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
-    @tree.used_snps.size.
+    expect(@tree.used_snps.size).to be > 4
     @tree.used_snps.each{|snp|
-      snps.include?(snp).
+      expect(snps.include?(snp)).to be true
     }
   end

   it "labels node when building a node and there is not a suitable split" do
     @config.load_training_data
-
+    allow_any_instance_of(Nimbus::RegressionTree).to receive(:snps_random_sample).and_return([91])

     @tree.individuals = @config.training_set.individuals
     @tree.id_to_fenotype = @config.training_set.ids_fenotypes

@@ -66,9 +66,9 @@ describe Nimbus::RegressionTree do
     @tree.predictions = {}

     branch = @tree.build_node @config.training_set.all_ids, Nimbus::LossFunctions.average(@config.training_set.all_ids, @config.training_set.ids_fenotypes)
-    branch[
-    branch[
-    branch[
+    expect(branch[91][0]).to be_kind_of Numeric
+    expect([Nimbus::Tree::NODE_SPLIT_01_2, Nimbus::Tree::NODE_SPLIT_0_12]).to include(branch[91][1])
+    expect(branch[91][2]).to be_kind_of Numeric
   end

   it "labels node when building a node with less individuals than the minimum node size" do

@@ -80,50 +80,50 @@ describe Nimbus::RegressionTree do
     @tree.predictions = {}

     label = @tree.build_node [1, 10, 33], Nimbus::LossFunctions.average(@config.training_set.all_ids, @config.training_set.ids_fenotypes)
-    label.
+    expect(label).to be_kind_of Numeric

     label = @tree.build_node [2, 10], Nimbus::LossFunctions.average(@config.training_set.all_ids, @config.training_set.ids_fenotypes)
-    label.
+    expect(label).to be_kind_of Numeric

     label = @tree.build_node [1, 10, 33], Nimbus::LossFunctions.average(@config.training_set.all_ids, @config.training_set.ids_fenotypes)
-    label.
+    expect(label).to be_kind_of Numeric

     label = @tree.build_node [108, 22, 10, 33], Nimbus::LossFunctions.average(@config.training_set.all_ids, @config.training_set.ids_fenotypes)
-    label.
+    expect(label).to be_kind_of Numeric
   end

   it 'computes generalization error for the tree' do
     @config.load_training_data
     @tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
-    @tree.generalization_error.
+    expect(@tree.generalization_error).to be_nil
     @tree.generalization_error_from_oob((2..200).to_a)
-    @tree.generalization_error.
-    @tree.generalization_error.
-    @tree.generalization_error.
+    expect(@tree.generalization_error).to be_kind_of Numeric
+    expect(@tree.generalization_error).to be > 0.0
+    expect(@tree.generalization_error).to be < 1.0
   end

   it 'estimates importance for all SNPs' do
     @config.load_training_data
     @tree.seed(@config.training_set.individuals, @config.training_set.all_ids, @config.training_set.ids_fenotypes)
-    @tree.importances.
+    expect(@tree.importances).to be_nil
     @tree.estimate_importances((300..533).to_a)
-    @tree.importances.
-    @tree.importances.keys.
-    (@tree.importances.keys - (1..200).to_a).
+    expect(@tree.importances).to be_kind_of Hash
+    expect(@tree.importances.keys).to_not be_empty
+    expect((@tree.importances.keys - (1..200).to_a)).to be_empty #all keys are snp indexes (200 snps in training file)
   end

   it 'get prediction for an individual pushing it down a tree structure' do
-    tree_structure = Psych.load(File.open fixture_file('
+    tree_structure = Psych.load(File.open fixture_file('regression/random_forest.yml')).first
     individual_data = [0]*200
     prediction = Nimbus::Tree.traverse tree_structure, individual_data
-    prediction.
+    expect(prediction).to eq -0.90813

-    individual_data[
-    individual_data[
-    individual_data[
-    individual_data[
+    individual_data[44-1] = 2
+    individual_data[98-1] = 1
+    individual_data[22-1] = 1
+    individual_data[31-1] = 2
     prediction = Nimbus::Tree.traverse tree_structure, individual_data
-    prediction.
+    expect(prediction).to eq -0.95805
   end

 end
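The reworked tree specs also document the node layout introduced by the data/lib/nimbus/tree.rb changes: build_node now returns a single-key Hash of the form {snp_index => [branch, split_type, branch]}, where each branch is either a nested node (Hash) or a leaf label (Numeric), split_type is one of Nimbus::Tree::NODE_SPLIT_01_2 or Nimbus::Tree::NODE_SPLIT_0_12, and Nimbus::Tree.traverse walks an individual's 0/1/2 genotypes down that structure. The tree.rb code itself is not shown in this diff, so the traversal below is only a hypothetical sketch: the constant values and the genotype partition implied by the names ({0,1} vs {2}, and {0} vs {1,2}) are assumptions.

# Hypothetical sketch of traversing the node shape implied by the specs above;
# constant values and split semantics are assumptions, not the gem's real code.
NODE_SPLIT_01_2 = '01_2'.freeze  # assumed: genotypes 0 and 1 follow the left branch, 2 the right
NODE_SPLIT_0_12 = '0_12'.freeze  # assumed: genotype 0 follows the left branch, 1 and 2 the right

def traverse_sketch(node, snp_list)
  return node unless node.is_a?(Hash)      # Numeric leaf: the prediction label
  snp = node.keys.first
  left, split_type, right = node[snp]
  genotype = snp_list[snp - 1]              # specs index SNPs from 1, arrays from 0
  follow_left = (split_type == NODE_SPLIT_01_2 ? genotype <= 1 : genotype == 0)
  traverse_sketch(follow_left ? left : right, snp_list)
end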
|