RubyGems - nimbus - Versions diffs - 1.0.1 → 2.0.0 - Mend

nimbus 1.0.1 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29)

data/README.md +149 -0
data/lib/nimbus.rb +15 -11
data/lib/nimbus/application.rb +20 -23
data/lib/nimbus/classification_tree.rb +111 -0
data/lib/nimbus/configuration.rb +52 -37
data/lib/nimbus/forest.rb +56 -20
data/lib/nimbus/individual.rb +7 -7
data/lib/nimbus/loss_functions.rb +44 -10
data/lib/nimbus/regression_tree.rb +103 -0
data/lib/nimbus/training_set.rb +4 -4
data/lib/nimbus/tree.rb +20 -83
data/lib/nimbus/version.rb +3 -0
data/spec/classification_tree_spec.rb +132 -0
data/spec/configuration_spec.rb +46 -19
data/spec/fixtures/classification_config.yml +13 -0
data/spec/fixtures/classification_random_forest.yml +922 -0
data/spec/fixtures/classification_testing.data +500 -0
data/spec/fixtures/classification_training.data +1000 -0
data/spec/forest_spec.rb +109 -50
data/spec/individual_spec.rb +2 -2
data/spec/loss_functions_spec.rb +71 -0
data/spec/nimbus_spec.rb +4 -4
data/spec/regression_tree_spec.rb +129 -0
data/spec/training_set_spec.rb +5 -5
data/spec/tree_spec.rb +4 -115
metadata +53 -45
data/spec/fixtures/regression_snp_importances.txt +0 -200
data/spec/fixtures/regression_testing_file_predictions.txt +0 -200
data/spec/fixtures/regression_training_file_predictions.txt +0 -758

data/lib/nimbus/configuration.rb CHANGED Viewed

@@ -1,12 +1,12 @@
 module Nimbus
   #####################################################################
-  # Nimbus configuration object.
-  #
+  # Nimbus configuration object.
+  #
   # This class reads every user file.
-  # Once the user's config.yml file is loaded, a set of default and
+  # Once the user's config.yml file is loaded, a set of default and
   # custom options is created and stored.
-  #
-  # Nimbus::Configuration also reads the testing files and the data
+  #
+  # Nimbus::Configuration also reads the testing files and the data
   # to create the training set to be passed to the Nimbus::Forest random
   # forest generator and the Nimbus::Tree classes in it.
   #
@@ -15,6 +15,7 @@ module Nimbus
       :training_file,
       :testing_file,
       :forest_file,
+      :classes,
       :config_file,
       :forest_size,
       :tree_SNP_sample_size,
@@ -32,65 +33,66 @@ module Nimbus
       :output_snp_importances_file,
       :silent
     )
     DEFAULTS = {
       :forest_size          => 500,
       :tree_SNP_sample_size => 60,
       :tree_SNP_total_count => 200,
       :tree_node_min_size   => 5,
       :loss_function_discrete   => 'majority_class',
-      :loss_function_continuous => 'mean',
+      :loss_function_continuous => 'average',
       :training_file => 'training.data',
       :testing_file  => 'testing.data',
       :forest_file   => 'forest.yml',
       :config_file   => 'config.yml',
       :output_forest_file   => 'random_forest.yml',
       :output_training_file => 'training_file_predictions.txt',
       :output_testing_file  => 'testing_file_predictions.txt',
       :output_tree_errors_file => 'generalization_errors.txt',
       :output_snp_importances_file => 'snp_importances.txt',
       :silent => false
     }
     # Initialize a Nimbus::Configuration object.
     #
     # Set all options to their default values.
     def initialize
       @do_training = false
       @do_testing  = false
       @forest_size              = DEFAULTS[:forest_size]
       @tree_SNP_sample_size     = DEFAULTS[:tree_SNP_sample_size]
       @tree_SNP_total_count     = DEFAULTS[:tree_SNP_total_count]
       @tree_node_min_size       = DEFAULTS[:tree_node_min_size]
       @loss_function_discrete   = DEFAULTS[:loss_function_discrete]
       @loss_function_continuous = DEFAULTS[:loss_function_continuous]
       @output_forest_file   = File.expand_path(DEFAULTS[:output_forest_file], Dir.pwd)
       @output_training_file = File.expand_path(DEFAULTS[:output_training_file], Dir.pwd)
       @output_testing_file  = File.expand_path(DEFAULTS[:output_testing_file], Dir.pwd)
       @output_tree_errors_file  = File.expand_path(DEFAULTS[:output_tree_errors_file], Dir.pwd)
       @output_snp_importances_file = File.expand_path(DEFAULTS[:output_snp_importances_file], Dir.pwd)
       @silent = ENV['nimbus_test'] == 'running_nimbus_tests' ? true : DEFAULTS[:silent]
     end
     # Accessor method for the tree-related subset of options.
     def tree
-      {
+      {
         :snp_sample_size => @tree_SNP_sample_size,
         :snp_total_count => @tree_SNP_total_count,
-        :tree_node_min_size => @tree_node_min_size
+        :tree_node_min_size => @tree_node_min_size,
+        :classes => @classes
       }
     end
     # This is the first method to be called on Configuration when a config.yml file
     # exists with user input options for the forest.
-    #
+    #
     # * The method will read the config file and change the default value of the selected options.
     # * Then based on the options and the existence of training, testing and forest files, it will mark:
     #   - if training is needed,
@@ -110,24 +112,26 @@ module Nimbus
           raise Nimbus::WrongFormatFileError, "It was not posible to parse the config file (#{config_file}): \r\n#{e.message} "
         end
       end
       if user_config_params['input']
         @training_file = File.expand_path(user_config_params['input']['training'], dirname) if user_config_params['input']['training']
         @testing_file  = File.expand_path(user_config_params['input']['testing' ], dirname) if user_config_params['input']['testing']
         @forest_file   = File.expand_path(user_config_params['input']['forest'  ], dirname) if user_config_params['input']['forest']
+        @classes       = user_config_params['input']['classes'] if user_config_params['input']['classes']
       else
         @training_file = File.expand_path(DEFAULTS[:training_file], Dir.pwd) if File.exists? File.expand_path(DEFAULTS[:training_file], Dir.pwd)
         @testing_file  = File.expand_path(DEFAULTS[:testing_file ], Dir.pwd) if File.exists? File.expand_path(DEFAULTS[:testing_file ], Dir.pwd)
         @forest_file   = File.expand_path(DEFAULTS[:forest_file  ], Dir.pwd) if File.exists? File.expand_path(DEFAULTS[:forest_file  ], Dir.pwd)
       end
       @do_training = true if @training_file
       @do_testing  = true if @testing_file
+      @classes = @classes.map{|c| c.to_s.strip} if @classes
       if @do_testing && !@do_training && !@forest_file
         raise Nimbus::InputFileError, "There is not random forest data (training file not defined, and forest file not found)."
       end
       if user_config_params['forest']
         @forest_size          = user_config_params['forest']['forest_size'].to_i if user_config_params['forest']['forest_size']
         @tree_SNP_total_count = user_config_params['forest']['SNP_total_count'].to_i if user_config_params['forest']['SNP_total_count']
@@ -138,7 +142,7 @@ module Nimbus
       check_configuration
       log_configuration
     end
     # The method reads the training file, and if the data is valid, creates a Nimbus::TrainingSet
     # containing every individual to be used as training sample for a random forest.
     def load_training_data
@@ -150,12 +154,15 @@ module Nimbus
           raise Nimbus::InputFileError, "Individual ##{data_id} from training set has no value for all #{@tree_SNP_total_count} SNPs" unless snp_list.size == @tree_SNP_total_count
           raise Nimbus::InputFileError, "There are individuals with no ID, please check data in training file." unless (!data_id.nil? && data_id.strip != '')
           raise Nimbus::InputFileError, "Individual ##{data_id} has no fenotype value, please check data in training file." unless (!data_feno.nil? && data_feno.strip != '')
-          @training_set.individuals[data_id.to_i] = Nimbus::Individual.new(data_id.to_i, data_feno.to_f, snp_list.map{|snp| snp.to_i})
-          @training_set.ids_fenotypes[data_id.to_i] = data_feno.to_f
+          raise Nimbus::InputFileError, "Individual ##{data_id} has invalid class (not in [#{classes*', '}]), please check data in training file." unless (@classes.nil? || @classes.include?(data_feno))
+          data_feno = (@classes ? data_feno.to_s : data_feno.to_f)
+          @training_set.individuals[data_id.to_i] = Nimbus::Individual.new(data_id.to_i, data_feno, snp_list.map{|snp| snp.to_i})
+          @training_set.ids_fenotypes[data_id.to_i] = data_feno
         end
       }
     end
     # Reads the testing file, and if the data is valid, yields one Nimbus::Individual at a time.
     def read_testing_data
       File.open(@testing_file) {|file|
@@ -169,7 +176,7 @@ module Nimbus
         end
       }
     end
     # Creates a Nimbus::Forest object from a user defined random forest data file.
     #
     # The format of the input file should be the same as the forest output data of a Nimbus Application.
@@ -186,14 +193,14 @@ module Nimbus
       forest.trees = trees
       forest
     end
     # Include tests to be passed by the info contained in the config file.
     #
     # If some of the configuration data provided by the user is invalid, an error is raised and execution stops.
     def check_configuration
       raise Nimbus::ConfigurationError, "The mtry sample size must be smaller than the total SNPs count." if @tree_SNP_sample_size > @tree_SNP_total_count
     end
     # Prints the information stored in the Nimbus::Configuration object
     #
     # It could include errors on the configuration input data, training related info and/or testing related info.
@@ -209,31 +216,39 @@ module Nimbus
         Nimbus.message "*" * 50
         Nimbus.stop "Error: No input data. Nimbus finished."
       end
       Nimbus.message "*" * 50
-      Nimbus.message "* Nimbus configured with the following parameters: "
+      Nimbus.message "* Nimbus version #{::Nimbus::VERSION}"
+      Nimbus.message "* configured with the following parameters: "
       Nimbus.message "*   Forest size: #{@forest_size} trees"
       Nimbus.message "*   Total SNP count: #{@tree_SNP_total_count}"
       Nimbus.message "*   SNPs sample size (mtry): #{@tree_SNP_sample_size}"
       Nimbus.message "*   Minimun node size in tree: #{@tree_node_min_size}"
+      if @classes
+        Nimbus.message "*   Mode: CLASSIFICATION"
+        Nimbus.message "*     Classes: [#{@classes*', '}]"
+      else
+        Nimbus.message "*   Mode: REGRESSION"
+      end
       Nimbus.message "*" * 50
       if @do_training
         Nimbus.message "* Training data:"
         Nimbus.message "*   Training file: #{@training_file}"
         Nimbus.message "*" * 50
       end
       if @do_testing
         Nimbus.message "* Data to be tested:"
         Nimbus.message "*   Testing file: #{@testing_file}"
         if @forest_file
-          Nimbus.message "* using the structure of the random forest stored in:"
+          Nimbus.message "* using the structure of the random forest stored in:"
           Nimbus.message "*   Random forest file: #{@forest_file}"
         end
         Nimbus.message "*" * 50
       end
     end
   end
 end

data/lib/nimbus/forest.rb CHANGED Viewed

@@ -1,13 +1,13 @@
 module Nimbus
   #####################################################################
-  # Forest represents the Random forest being generated
+  # Forest represents the Random forest being generated
   # (or used to test samples) by the application object.
   #
   class Forest
     attr_accessor :size, :trees, :bag, :predictions, :tree_errors, :snp_importances
     attr_accessor :options
     # Initialize Forest object with options included in the Nimbus::Configuration object received.
     def initialize(config)
       @trees = []
@@ -20,7 +20,7 @@ module Nimbus
       @tree_snp_importances = []
       raise Nimbus::ForestError, "Forest size parameter (#{@size}) is invalid. You need at least one tree." if @size < 1
     end
     # Creates a random forest based on the TrainingSet included in the configuration, creating N random trees (size N defined in the configuration).
     #
     # This is the method called when the application's configuration flags training on.
@@ -35,10 +35,11 @@ module Nimbus
     # Every tree of the forest is created with a different random sample of the individuals in the training set.
     def grow
       @size.times do |i|
-        Nimbus.write("Creating trees: #{i+1}/#{@size} ")
+        Nimbus.write("\rCreating trees: #{i+1}/#{@size} ")
         tree_individuals_bag = individuals_random_sample
         tree_out_of_bag = oob tree_individuals_bag
-        tree = Tree.new @options.tree
+        tree_class = (classification? ? ClassificationTree : RegressionTree)
+        tree = tree_class.new @options.tree
         @trees << tree.seed(@options.training_set.individuals, tree_individuals_bag, @options.training_set.ids_fenotypes)
         @tree_errors << tree.generalization_error_from_oob(tree_out_of_bag)
         @tree_snp_importances << tree.estimate_importances(tree_out_of_bag)
@@ -46,13 +47,18 @@ module Nimbus
         Nimbus.clear_line!
       end
       average_snp_importances
-      average_predictions
+      totalize_predictions
     end
-    # Traverse a testing set through every tree of the forest and get averaged predictions for every individual in the sample.
+    # Traverse a testing set through every tree of the forest.
     #
     # This is the method called when the application's configuration flags testing on.
     def traverse
+      classification? ? traverse_classification_forest : traverse_regression_forest
+    end
+    # Traverse a testing set through every regression tree of the forest and get averaged predictions for every individual in the sample.
+    def traverse_regression_forest
       @predictions = {}
       prediction_count = trees.size
       @options.read_testing_data{|individual|
@@ -63,44 +69,66 @@ module Nimbus
         @predictions[individual.id] = (individual_prediction / prediction_count).round(5)
       }
     end
+    # Traverse a testing set through every classification tree of the forest and get majority class predictions for every individual in the sample.
+    def traverse_classification_forest
+      @predictions = {}
+      @options.read_testing_data{|individual|
+        individual_prediction = []
+        trees.each do |t|
+          individual_prediction << Nimbus::Tree.traverse(t, individual.snp_list)
+        end
+        @predictions[individual.id] = Nimbus::LossFunctions.majority_class_in_list(individual_prediction, @options.tree[:classes])
+      }
+    end
     # The array containing every tree in the forest, to YAML format.
     def to_yaml
       @trees.to_yaml
     end
     private
     def individuals_random_sample
       individuals_sample = bag.inject([]){|items, i| items << bag.sample }.sort
     end
     def oob(in_bag=[])
       bag - in_bag.uniq
     end
     def bag
       @bag ||= @options.training_set.all_ids
     end
     def acumulate_predictions(preds)
       preds.each_pair.each{|id, value|
         if @predictions[id].nil?
-          @predictions[id] = value
+          @predictions[id] = (classification? ? [value] : value)
           @times_predicted[id] = 1.0
         else
-          @predictions[id] += value
+          classification? ? (@predictions[id] << value) : (@predictions[id] += value)
           @times_predicted[id] += 1
         end
       }
     end
+    def totalize_predictions
+      classification? ? majority_class_predicted : average_predictions
+    end
     def average_predictions
       @predictions.each_pair{|id, value|
         @predictions[id] = (@predictions[id] / @times_predicted[id]).round(5)
       }
     end
+    def majority_class_predicted
+      @predictions.each_pair{|id, values|
+        @predictions[id] = Nimbus::LossFunctions.majority_class_in_list(values, @options.tree[:classes])
+      }
+    end
     def average_snp_importances
       1.upto(@options.tree_SNP_total_count) {|snp|
         @snp_importances[snp] = 0.0
@@ -110,7 +138,15 @@ module Nimbus
         @snp_importances[snp] = @snp_importances[snp] / @size
       }
     end
+    def classification?
+      @options.tree[:classes]
+    end
+    def regression?
+      @options.tree[:classes].nil?
+    end
   end
 end

data/lib/nimbus/individual.rb CHANGED Viewed

@@ -1,19 +1,19 @@
 module Nimbus
   #####################################################################
-  # Nimbus Individual object.
-  #
-  # It represents a single individual of a training or testing sample.
-  #
+  # Nimbus Individual object.
+  #
+  # It represents a single individual of a training or testing sample.
+  #
   # This class stores information about an individual:
   #
   # * id,
   # * values for all the SNPs of the individual,
-  # * fenotype if present,
+  # * fenotype if present,
   # * the prediction if it exists.
   #
   class Individual
     attr_accessor :id, :fenotype, :prediction, :snp_list
     # Initialize individual with passed data.
     def initialize(i, fen, snps=[])
       self.id = i
@@ -21,5 +21,5 @@ module Nimbus
       self.snp_list = snps
     end
   end
 end

data/lib/nimbus/loss_functions.rb CHANGED Viewed

@@ -1,23 +1,24 @@
 # encoding: utf-8
 module Nimbus
   #####################################################################
   # Math functions.
-  #
+  #
   # The LossFunctions class provides handy mathematical functions as class methods
   # to be used by Tree and Forest when estimating predictions, errors and loss functions
-  # for training and testing data.
+  # for training and testing data.
   #
   module LossFunctions
     class << self
+      ## REGRESSION
       # Simple average: sum(n) / n
       def average(ids, value_table)
         ids.inject(0.0){|sum, i| sum + value_table[i]} / ids.size
       end
-      # Mean squared error: sum (x-y)^2
+      # Mean squared error: sum (x-y)^2
       def mean_squared_error(ids, value_table, mean = nil)
         mean ||= self.average ids, value_table
         ids.inject(0.0){|sum, i| sum + ((value_table[i] - mean)**2) }
@@ -29,14 +30,47 @@ module Nimbus
       def quadratic_loss(ids, value_table, mean = nil)
         self.mean_squared_error(ids, value_table, mean) / ids.size
       end
       # Difference between two values, squared. (x-y)^2
       def squared_difference(x,y)
         0.0 + (x-y)**2
       end
+      ## CLASSIFICATION
+      # Gini index of a list of classified individuals.
+      #
+      # If a dataset T contains examples from n classes, then:
+      # gini(T) = 1 - Sum (Pj)^2
+      # where Pj is the relative frequency of class j in T
+      def gini_index(ids, value_table, classes)
+        total_size = ids.size.to_f
+        gini = 1 - class_sizes(ids, value_table, classes).inject(0.0){|sum, size|
+          sum + (size/total_size)**2}
+        gini.round(5)
+      end
+      # Majority class of a list of classified individuals.
+      # If more than one class has the same number of individuals,
+      # one of the majority classes is selected randomly.
+      def majority_class(ids, value_table, classes)
+        sizes = class_sizes(ids, value_table, classes)
+        Hash[classes.zip sizes].keep_if{|k,v| v == sizes.max}.keys.sample
+      end
+      # Majority class of a list of classes.
+      # If more than one class has the same number of individuals,
+      # one of the majority classes is selected randomly.
+      def majority_class_in_list(list, classes)
+        sizes = classes.map{|c| list.count{|i| i == c}}
+        Hash[classes.zip sizes].keep_if{|k,v| v == sizes.max}.keys.sample
+      end
+      # Array with the list of sizes of each class in the given list of individuals.
+      def class_sizes(ids, value_table, classes)
+        classes.map{|c| ids.count{|i| value_table[i] == c}}
+      end
     end
   end
 end