RubyGems - lazar - Versions diffs - 1.0.0 → 1.0.1 - Mend

lazar 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

checksums.yaml +4 -4
data/README.md +64 -1
data/VERSION +1 -1
data/lib/algorithm.rb +1 -0
data/lib/caret.rb +11 -2
data/lib/classification.rb +6 -1
data/lib/compound.rb +32 -23
data/lib/crossvalidation.rb +22 -0
data/lib/dataset.rb +30 -3
data/lib/feature.rb +7 -0
data/lib/feature_selection.rb +4 -1
data/lib/import.rb +5 -1
data/lib/leave-one-out-validation.rb +6 -0
data/lib/model.rb +77 -3
data/lib/nanoparticle.rb +19 -0
data/lib/overwrite.rb +46 -11
data/lib/physchem.rb +23 -5
data/lib/regression.rb +5 -0
data/lib/rest-client-wrapper.rb +1 -0
data/lib/similarity.rb +22 -2
data/lib/substance.rb +1 -0
data/lib/train-test-validation.rb +12 -0
data/lib/validation-statistics.rb +19 -0
data/lib/validation.rb +3 -0
data/test/feature.rb +2 -2
data/test/model-nanoparticle.rb +7 -0
data/test/nanomaterial-model-validation.rb +2 -3
data/test/setup.rb +1 -5
data/test/validation-regression.rb +2 -3
metadata +50 -5
data/lib/experiment.rb +0 -99

data/lib/model.rb CHANGED Viewed

@@ -9,6 +9,8 @@ module OpenTox
       include Mongoid::Timestamps
       store_in collection: "models"
+      attr_writer :independent_variables # store in GridFS to avoid Mongo database size limit problems
       field :name, type: String
       field :creator, type: String, default: __FILE__
       field :algorithms, type: Hash, default:{}
@@ -17,7 +19,7 @@ module OpenTox
       field :prediction_feature_id, type: BSON::ObjectId
       field :dependent_variables, type: Array, default:[]
       field :descriptor_ids, type:Array, default:[]
-      field :independent_variables, type: Array, default:[]
+      field :independent_variables_id, type: BSON::ObjectId
       field :fingerprints, type: Array, default:[]
       field :descriptor_weights, type: Array, default:[]
       field :descriptor_means, type: Array, default:[]
@@ -25,7 +27,15 @@ module OpenTox
       field :scaled_variables, type: Array, default:[]
       field :version, type: Hash, default:{}
-      def self.create prediction_feature:nil, training_dataset:nil, algorithms:{}
+      # Create a lazar model
+      # @param [OpenTox::Dataset] training_dataset
+      # @param [OpenTox::Feature, nil] prediction_feature
+      #   By default the first feature of the training dataset will be predicted, specify a prediction_feature if you want to predict another feature
+      # @param [Hash, nil] algorithms
+      #   Default algorithms will be used, if no algorithms parameter is provided. The algorithms hash has the following keys: :descriptors (specifies the descriptors to be used for similarity calculations and local QSAR models), :similarity (similarity algorithm and threshold), :feature_selection (feature selection algorithm), :prediction (local QSAR algorithm). Default parameters are used for unspecified keys.
+      #
+      # @return [OpenTox::Model::Lazar]
+      def self.create prediction_feature:nil, training_dataset:, algorithms:{}
         bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset
         prediction_feature = training_dataset.features.first unless prediction_feature
         # TODO: prediction_feature without training_dataset: use all available data
@@ -119,6 +129,7 @@ module OpenTox
         end
         descriptor_method = model.algorithms[:descriptors][:method]
+        model.independent_variables = []
         case descriptor_method
         # parse fingerprints
         when "fingerprint"
@@ -177,8 +188,12 @@ module OpenTox
         model
       end
+      # Predict a substance (compound or nanoparticle)
+      # @param [OpenTox::Substance]
+      # @return [Hash]
       def predict_substance substance
+        @independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
         case algorithms[:similarity][:method]
         when /tanimoto/ # binary features
           similarity_descriptors = substance.fingerprint algorithms[:descriptors][:type]
@@ -234,7 +249,7 @@ module OpenTox
               neighbor_dependent_variables << dependent_variables[i]
               independent_variables.each_with_index do |c,j|
                 neighbor_independent_variables[j] ||= []
-                neighbor_independent_variables[j] << independent_variables[j][i]
+                neighbor_independent_variables[j] << @independent_variables[j][i]
               end
             end
           end
@@ -256,6 +271,9 @@ module OpenTox
         prediction
       end
+      # Predict a substance (compound or nanoparticle), an array of substances or a dataset
+      # @param [OpenTox::Compound, OpenTox::Nanoparticle, Array<OpenTox::Substance>, OpenTox::Dataset]
+      # @return [Hash, Array<Hash>, OpenTox::Dataset]
       def predict object
         training_dataset = Dataset.find training_dataset_id
@@ -302,34 +320,62 @@ module OpenTox
       end
+      # Save the model
+      #   Stores independent_variables in GridFS to avoid Mongo database size limit problems
+      def save
+        file = Mongo::Grid::File.new(Marshal.dump(@independent_variables), :filename => "#{id}.independent_variables")
+        self.independent_variables_id = $gridfs.insert_one(file)
+        super
+      end
+      # Get independent variables
+      # @return [Array<Array>]
+      def independent_variables
+        @independent_variables ||= Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
+        @independent_variables
+      end
+      # Get training dataset
+      # @return [OpenTox::Dataset]
       def training_dataset
         Dataset.find(training_dataset_id)
       end
+      # Get prediction feature
+      # @return [OpenTox::Feature]
       def prediction_feature
         Feature.find(prediction_feature_id)
       end
+      # Get training descriptors
+      # @return [Array<OpenTox::Feature>]
       def descriptors
         descriptor_ids.collect{|id| Feature.find(id)}
       end
+      # Get training substances
+      # @return [Array<OpenTox::Substance>]
       def substances
         substance_ids.collect{|id| Substance.find(id)}
       end
+      # Are fingerprints used as descriptors
+      # @return [TrueClass, FalseClass]
       def fingerprints?
         # String#== already yields true/false, so no explicit ternary is needed
         descriptor_method = algorithms[:descriptors][:method]
         descriptor_method == "fingerprint"
       end
     end
+    # Classification model
     class LazarClassification < Lazar
     end
+    # Regression model
     class LazarRegression < Lazar
     end
+    # Convenience class for generating and validating lazar models in a single step and predicting substances (compounds and nanoparticles), arrays of substances and datasets
     class Validation
       include OpenTox
@@ -343,42 +389,64 @@ module OpenTox
       field :model_id, type: BSON::ObjectId
       field :repeated_crossvalidation_id, type: BSON::ObjectId
+      # Predict a substance (compound or nanoparticle), an array of substances or a dataset
+      # @param [OpenTox::Compound, OpenTox::Nanoparticle, Array<OpenTox::Substance>, OpenTox::Dataset]
+      # @return [Hash, Array<Hash>, OpenTox::Dataset]
       def predict object
         model.predict object
       end
+      # Get training dataset
+      # @return [OpenTox::Dataset]
       def training_dataset
         model.training_dataset
       end
+      # Get lazar model
+      # @return [OpenTox::Model::Lazar]
       def model
         Lazar.find model_id
       end
+      # Get algorithms
+      # @return [Hash]
       def algorithms
         model.algorithms
       end
+      # Get prediction feature
+      # @return [OpenTox::Feature]
       def prediction_feature
         model.prediction_feature
       end
+      # Get repeated crossvalidations
+      # @return [OpenTox::Validation::RepeatedCrossValidation]
       def repeated_crossvalidation
         OpenTox::Validation::RepeatedCrossValidation.find repeated_crossvalidation_id # full class name required
       end
+      # Get crossvalidations
+      # @return [Array<OpenTox::CrossValidation>]
       def crossvalidations
         repeated_crossvalidation.crossvalidations
       end
+      # Is it a regression model
+      # @return [TrueClass, FalseClass]
       def regression?
         model.is_a? LazarRegression
       end
+      # Is it a classification model
+      # @return [TrueClass, FalseClass]
       def classification?
         model.is_a? LazarClassification
       end
+      # Create and validate a lazar model from a csv file with training data and a json file with metadata
+      # @param [File] CSV file with two columns. The first line should contain either SMILES or InChI (first column) and the endpoint (second column). The first column should contain either the SMILES or InChI of the training compounds, the second column the training compounds' toxic activities (qualitative or quantitative). Use -log10 transformed values for regression datasets. Add metadata to a JSON file with the same basename containing the fields "species", "endpoint", "source" and "unit" (regression only). You can find example training data at https://github.com/opentox/lazar-public-data.
+      # @return [OpenTox::Model::Validation] lazar model with three independent 10-fold crossvalidations
       def self.from_csv_file file
         metadata_file = file.sub(/csv$/,"json")
         bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file
@@ -391,6 +459,12 @@ module OpenTox
         model_validation
       end
+      # Create and validate a nano-lazar model, import data from eNanoMapper if necessary
+      #   nano-lazar methods are described in detail in https://github.com/enanomapper/nano-lazar-paper/blob/master/nano-lazar.pdf
+      # @param [OpenTox::Dataset, nil] training_dataset
+      # @param [OpenTox::Feature, nil] prediction_feature
+      # @param [Hash, nil] algorithms
+      # @return [OpenTox::Model::Validation] lazar model with five independent 10-fold crossvalidations
       def self.from_enanomapper training_dataset: nil, prediction_feature:nil, algorithms: nil
         # find/import training_dataset

data/lib/nanoparticle.rb CHANGED Viewed

@@ -1,25 +1,36 @@
 module OpenTox
+  # Nanoparticles
   class Nanoparticle < Substance
     include OpenTox
     field :core_id, type: String, default: nil
     field :coating_ids, type: Array, default: []
+    # Get core compound
+    # @return [OpenTox::Compound]
     def core
       Compound.find core_id
     end
+    # Get coatings
+    # @return [Array<OpenTox::Compound>]
     def coating
       coating_ids.collect{|i| Compound.find i }
     end
+    # Get nanoparticle fingerprint (union of core and coating fingerprints)
+    # @param [String] fingerprint type
+    # @return [Array<String>]
     def fingerprint type=DEFAULT_FINGERPRINT
       core_bits = core.fingerprint(type)
       coating_bits = coating.map { |compound| compound.fingerprint(type) }.flatten.uniq.compact
       # no fingerprint unless both the core and at least one coating contribute
       return [] if core_bits.empty? || coating_bits.empty?
       # Array#| is the order-preserving, duplicate-free union
       (core_bits | coating_bits).compact
     end
+    # Calculate physchem properties
+    # @param [Array<Hash>] list of descriptors
+    # @return [Array<Float>]
     def calculate_properties descriptors=PhysChem::OPENBABEL
       if core.smiles and !coating.collect{|c| c.smiles}.compact.empty?
         core_prop = core.calculate_properties descriptors
@@ -28,6 +39,10 @@ module OpenTox
       end
     end
+    # Add (measured) feature values
+    # @param [OpenTox::Feature]
+    # @param [TrueClass,FalseClass,Float]
+    # @param [OpenTox::Dataset]
     def add_feature feature, value, dataset
       unless feature.name == "ATOMIC COMPOSITION" or feature.name == "FUNCTIONAL GROUP" # redundant
         case feature.category
@@ -55,6 +70,10 @@ module OpenTox
       end
     end
+    # Parse values from Ambit database
+    # @param [OpenTox::Feature]
+    # @param [TrueClass,FalseClass,Float]
+    # @param [OpenTox::Dataset]
     def parse_ambit_value feature, v, dataset
       # TODO add study id to warnings
       v.delete "unit"

data/lib/overwrite.rb CHANGED Viewed

@@ -2,41 +2,51 @@ require "base64"
 class Object
   # An object is blank if it's false, empty, or a whitespace string.
   # For example, "", "   ", +nil+, [], and {} are all blank.
+  # @return [TrueClass,FalseClass]
   def blank?
     # Whitespace-only strings are blank as well; String#empty? alone reports
     # "   " as non-empty, so strings are stripped before the check.
     return strip.empty? if is_a?(String)
     # Collections answer via empty?; everything else is blank only when falsy (nil/false).
     respond_to?(:empty?) ? empty? : !self
   end
+  # Is it a numeric object
+  # @return [TrueClass,FalseClass]
   def numeric?
     # Kernel#Float raises for anything it cannot convert; any StandardError means "not numeric"
     Float(self)
     true
   rescue StandardError
     false
   end
   # Returns dimension of nested arrays
+  # @return [Fixnum]
   def dimension
     # anything that is not exactly an Array terminates the recursion
     return 0 unless self.class == Array
     # depth is measured along the first element only
     1 + self[0].dimension
   end
 end
 class Numeric
+  # Convert number to percent
+  # @return [Float]
   def percent_of(n)
     # both operands are converted to Float, so Integer receivers do not truncate
     ratio = to_f / n.to_f
     ratio * 100.0
   end
 end
 class Float
-  # round to n significant digits
-  # http://stackoverflow.com/questions/8382619/how-to-round-a-float-to-a-specified-number-of-significant-digits-in-ruby
+  # Round to n significant digits
+  #   http://stackoverflow.com/questions/8382619/how-to-round-a-float-to-a-specified-number-of-significant-digits-in-ruby
+  # @param [Fixnum]
+  # @return [Float]
   def signif(n)
     # the %g conversion keeps n significant digits; parsing the text back yields the rounded Float
     rounded_text = format("%.#{n}g", self)
     Float(rounded_text)
   end
-  # converts -10 logarithmized values back
+  # Convert -10 log values to original values
+  # @return [Float]
   def delog10
     # inverse of x -> -log10(x)
     exponent = -1 * self
     10**exponent
   end
 end
 module Enumerable
-  # @return [Array] only the duplicates of an enumerable
+  # Get duplicates
+  # @return [Array]
   def duplicates
     # count occurrences in first-seen order, then keep the values seen more than once
     counts = Hash.new(0)
     each { |item| counts[item] += 1 }
     counts.select { |_item, seen| seen > 1 }.keys
   end
@@ -51,7 +61,10 @@ module Enumerable
 end
 class String
-  # @return [String] converts camel-case to underscore-case (OpenTox::SuperModel -> open_tox/super_model)
+  # Convert camel-case to underscore-case
+  # @example
+  #   OpenTox::SuperModel -> open_tox/super_model
+  # @return [String]
   def underscore
     self.gsub(/::/, '/').
     gsub(/([A-Z]+)([A-Z][a-z])/,'\1_\2').
@@ -60,7 +73,7 @@ class String
     downcase
   end
-  # convert strings to boolean values
+  # Convert strings to boolean values
   # @return [TrueClass,FalseClass] true or false
   def to_boolean
     return true if self == true || self =~ (/(true|t|yes|y|1)$/i)
@@ -71,7 +84,8 @@ class String
 end
 class File
-  # @return [String] mime_type including charset using linux cmd command
+  # Get mime_type including charset using linux file command
+  # @return [String]
   def mime_type
     # The path is handed to `file` as its own argv entry, without a shell in between,
     # so quotes, spaces or shell metacharacters in a file name can neither break
     # the command nor inject additional ones.
     # Without a shell, a missing `file` utility raises Errno::ENOENT.
     IO.popen(["file", "-ib", self.path], &:read).chomp
   end
@@ -79,7 +93,7 @@ end
 class Array
-  # Sum up the size of single arrays in an array of arrays
+  # Sum the size of single arrays in an array of arrays
   # @param [Array] Array of arrays
   # @return [Integer] Sum of size of array elements
   def sum_size
@@ -92,33 +106,43 @@ class Array
     }
   end
-  # For symbolic features
+  # Check if the array has just one unique value.
   # @param [Array] Array to test.
-  # @return [Boolean] Whether the array has just one unique value.
+  # @return [TrueClass,FalseClass]
   def zero_variance?
     # exactly one distinct value; an empty array therefore yields false
     distinct = uniq
     distinct.length == 1
   end
+  # Get the median of an array
+  # @return [Numeric]
   def median
     # nil entries are ignored (as in #mean); sorting an array that contains nil would raise
     sorted = self.compact.sort
     len = sorted.length
     # an array without values has no median; return nil instead of failing on nil + nil
     return nil if len.zero?
     # even count: average of the two middle elements; odd count: both indices coincide
     (sorted[(len - 1) / 2] + sorted[len / 2]) / 2.0
   end
+  # Get the mean of an array
+  # @return [Numeric]
   def mean
     # nil entries are excluded from both the sum and the count
     values = self.compact
     values.inject { |sum, el| sum + el }.to_f / values.size
   end
+  # Get the variance of an array
+  # @return [Numeric]
   def sample_variance
     values = self.compact
     center = self.mean
     # sum of squared deviations from the mean, divided by (n - 1)
     squared_deviations = values.inject(0) { |total, v| total + (v - center)**2 }
     squared_deviations / (values.length - 1).to_f
   end
+  # Get the standard deviation of an array
+  # @return [Numeric]
   def standard_deviation
     Math.sqrt(self.sample_variance)
   end
+  # Convert array values for R
+  # @return [Array]
   def for_R
     if self.first.is_a?(String)
       #"\"#{self.collect{|v| v.sub('[','').sub(']','')}.join(" ")}\"" # quote and remove square brackets
@@ -128,6 +152,8 @@ class Array
     end
   end
+  # Collect array with index
+  #   in analogy to each_with_index
   def collect_with_index
     result = []
     self.each_with_index do |elt, idx|
@@ -139,11 +165,15 @@ end
 module URI
+  # Is it a https connection
+  # @param [String]
+  # @return [TrueClass,FalseClass]
   def self.ssl? uri
     # true only when parsing yields exactly a URI::HTTPS object
     parsed = URI.parse(uri)
     parsed.instance_of?(URI::HTTPS)
   end
-  # @return [Boolean] checks if resource exists by making a HEAD-request
+  # Check if a http resource exists by making a HEAD-request
+  # @return [TrueClass,FalseClass]
   def self.accessible?(uri)
     parsed_uri = URI.parse(uri + (OpenTox::RestClientWrapper.subjectid ? "?subjectid=#{CGI.escape OpenTox::RestClientWrapper.subjectid}" : ""))
     http_code = URI.task?(uri) ? 600 : 400
@@ -163,6 +193,9 @@ module URI
     false
   end
+  # Is the URI valid
+  # @param [String]
+  # @return [TrueClass,FalseClass]
   def self.valid? uri
     u = URI.parse(uri)
     u.scheme!=nil and u.host!=nil
@@ -170,6 +203,8 @@ module URI
     false
   end
+  # Is the URI a task URI
+  # @param [String]
   def self.task? uri
     uri =~ /task/ and URI.valid? uri
   end

data/lib/physchem.rb CHANGED Viewed

@@ -39,6 +39,9 @@ module OpenTox
     require_relative "unique_descriptors.rb"
+    # Get descriptor features
+    # @param [Hash]
+    # @return [Array<OpenTox::PhysChem>]
     def self.descriptors desc=DESCRIPTORS
       desc.collect do |name,description|
         lib,desc = name.split('.',2)
@@ -46,6 +49,8 @@ module OpenTox
       end
     end
+    # Get unique descriptor features
+    # @return [Array<OpenTox::PhysChem>]
     def self.unique_descriptors
       udesc = []
       UNIQUEDESCRIPTORS.each do |name|
@@ -64,23 +69,28 @@ module OpenTox
       udesc
     end
+    # Get OpenBabel descriptor features
+    # @return [Array<OpenTox::PhysChem>]
     def self.openbabel_descriptors
       descriptors OPENBABEL
     end
+    # Get CDK descriptor features
+    # @return [Array<OpenTox::PhysChem>]
     def self.cdk_descriptors
       descriptors CDK
     end
+    # Get JOELIB descriptor features
+    # @return [Array<OpenTox::PhysChem>]
     def self.joelib_descriptors
       descriptors JOELIB
     end
-    def calculate compound
-      result = send library.downcase,descriptor,compound
-      result[self.name]
-    end
+    # Calculate OpenBabel descriptors
+    # @param [String] descriptor type
+    # @param [OpenTox::Compound]
+    # @return [Hash]
     def openbabel descriptor, compound
       obdescriptor = OpenBabel::OBDescriptor.find_type descriptor
       obmol = OpenBabel::OBMol.new
@@ -90,10 +100,18 @@ module OpenTox
       {"#{library.capitalize}.#{descriptor}" => fix_value(obdescriptor.predict(obmol))}
     end
+    # Calculate CDK descriptors
+    # @param [String] descriptor type
+    # @param [OpenTox::Compound]
+    # @return [Hash]
     def cdk descriptor, compound
       java_descriptor "cdk", descriptor, compound
     end
+    # Calculate JOELIB descriptors
+    # @param [String] descriptor type
+    # @param [OpenTox::Compound]
+    # @return [Hash]
     def joelib descriptor, compound
       java_descriptor "joelib", descriptor, compound
     end

data/lib/regression.rb CHANGED Viewed

@@ -1,8 +1,13 @@
 module OpenTox
   module Algorithm
+    # Regression algorithms
     class Regression
+      # Weighted average
+      # @param [Array<TrueClass,FalseClass>] dependent_variables
+      # @param [Array<Float>] weights
+      # @return [Hash]
       def self.weighted_average dependent_variables:, independent_variables:nil, weights:, query_variables:nil
         # TODO: prediction_interval
         weighted_sum = 0.0

data/lib/rest-client-wrapper.rb CHANGED Viewed

@@ -1,5 +1,6 @@
 module OpenTox
+  # Adjustments to the rest-client gem for OpenTox
   class RestClientWrapper
     attr_accessor :request, :response

data/lib/similarity.rb CHANGED Viewed

@@ -2,6 +2,10 @@ module OpenTox
   module Algorithm
     class Vector
+      # Get dot product
+      # @param [Vector]
+      # @param [Vector]
+      # @return [Numeric]
       def self.dot_product(a, b)
         products = a.zip(b).map{|a, b| a * b}
         products.inject(0) {|s,p| s + p}
@@ -15,6 +19,9 @@ module OpenTox
     class Similarity
+      # Get Tanimoto similarity
+      # @param [Array<Array<Float>>]
+      # @return [Float]
       def self.tanimoto fingerprints
         first, second = fingerprints[0], fingerprints[1]
         shared = first & second      # elements present in both fingerprints
         combined = first | second    # elements present in either fingerprint
         shared.size / combined.size.to_f
       end
@@ -23,18 +30,28 @@ module OpenTox
         #( fingerprints[0] & fingerprints[1]).size/(fingerprints[0]|fingerprints[1]).size.to_f
       #end
+      # Get Euclidean distance
+      # @param [Array<Array<Float>>]
+      # @return [Float]
       def self.euclid scaled_properties
         left, right = scaled_properties[0], scaled_properties[1]
         # accumulate squared component-wise differences, then take the root
         sum_of_squares = 0
         left.zip(right).each { |a, b| sum_of_squares += (a - b) ** 2 }
         Math.sqrt(sum_of_squares)
       end
-      # http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity
+      # Get cosine similarity
+      #   http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity
+      # @param [Array<Array<Float>>]
+      # @return [Float]
       def self.cosine scaled_properties
         scaled_properties = remove_nils scaled_properties
         Algorithm::Vector.dot_product(scaled_properties[0], scaled_properties[1]) / (Algorithm::Vector.magnitude(scaled_properties[0]) * Algorithm::Vector.magnitude(scaled_properties[1]))
       end
-      def self.weighted_cosine scaled_properties # [a,b,weights]
+      # Get weighted cosine similarity
+      #   http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity
+      # @param [Array<Array<Float>>] [a,b,weights]
+      # @return [Float]
+      def self.weighted_cosine scaled_properties
         a,b,w = remove_nils scaled_properties
         return cosine(scaled_properties) if w.uniq.size == 1
         dot_product = 0
@@ -48,6 +65,9 @@ module OpenTox
         dot_product/(Math.sqrt(magnitude_a)*Math.sqrt(magnitude_b))
       end
+      # Remove nil values
+      # @param [Array<Array<Float>>] [a,b,weights]
+      # @return [Array<Array<Float>>] [a,b,weights]
       def self.remove_nils scaled_properties
         a =[]; b = []; w = []
         (0..scaled_properties.first.size-1).each do |i|

data/lib/substance.rb CHANGED Viewed

@@ -1,5 +1,6 @@
 module OpenTox
+  # Base class for substances (e.g. compounds, nanoparticles)
   class Substance
     field :properties, type: Hash, default: {}
     field :dataset_ids, type: Array, default: []

data/lib/train-test-validation.rb CHANGED Viewed

@@ -2,11 +2,17 @@ module OpenTox
   module Validation
+    # Training test set validation
     class TrainTest < Validation
       field :training_dataset_id, type: BSON::ObjectId
       field :test_dataset_id, type: BSON::ObjectId
+      # Create a training test set validation
+      # @param [OpenTox::Model::Lazar]
+      # @param [OpenTox::Dataset] training dataset
+      # @param [OpenTox::Dataset] test dataset
+      # @return [OpenTox::Validation::TrainTest]
       def self.create model, training_set, test_set
         validation_model = model.class.create prediction_feature: model.prediction_feature, training_dataset: training_set, algorithms: model.algorithms
@@ -32,16 +38,21 @@ module OpenTox
         validation
       end
+      # Get test dataset
+      # @return [OpenTox::Dataset]
       def test_dataset
         Dataset.find test_dataset_id
       end
+      # Get training dataset
+      # @return [OpenTox::Dataset]
       def training_dataset
         Dataset.find training_dataset_id
       end
     end
+    # Training test set validation for classification models
     class ClassificationTrainTest < TrainTest
       include ClassificationStatistics
       field :accept_values, type: Array
@@ -54,6 +65,7 @@ module OpenTox
       field :probability_plot_id, type: BSON::ObjectId
     end
+    # Training test set validation for regression models
     class RegressionTrainTest < TrainTest
       include RegressionStatistics
       field :rmse, type: Float, default:0