RubyGems - lazar - Versions diffs - 0.0.7 → 0.0.9 - Mend

lazar 0.0.7 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (61)

checksums.yaml +4 -4
data/.gitignore +3 -0
data/README.md +2 -1
data/VERSION +1 -1
data/ext/lazar/extconf.rb +15 -76
data/ext/lazar/rinstall.R +9 -0
data/lazar.gemspec +7 -7
data/lib/classification.rb +5 -78
data/lib/compound.rb +201 -44
data/lib/crossvalidation.rb +224 -121
data/lib/dataset.rb +83 -93
data/lib/error.rb +1 -1
data/lib/experiment.rb +99 -0
data/lib/feature.rb +2 -54
data/lib/lazar.rb +47 -34
data/lib/leave-one-out-validation.rb +205 -0
data/lib/model.rb +131 -76
data/lib/opentox.rb +2 -2
data/lib/overwrite.rb +37 -0
data/lib/physchem.rb +133 -0
data/lib/regression.rb +117 -189
data/lib/rest-client-wrapper.rb +4 -5
data/lib/unique_descriptors.rb +6 -7
data/lib/validation.rb +63 -69
data/test/all.rb +2 -2
data/test/classification.rb +41 -0
data/test/compound.rb +116 -7
data/test/data/LOAEL_log_mg_corrected_smiles.csv +567 -567
data/test/data/LOAEL_log_mmol_corrected_smiles.csv +566 -566
data/test/data/LOAEL_mmol_corrected_smiles.csv +568 -0
data/test/data/batch_prediction.csv +25 -0
data/test/data/batch_prediction_inchi_small.csv +4 -0
data/test/data/batch_prediction_smiles_small.csv +4 -0
data/test/data/hamster_carcinogenicity.json +3 -0
data/test/data/loael.csv +568 -0
data/test/dataset-long.rb +5 -8
data/test/dataset.rb +31 -11
data/test/default_environment.rb +11 -0
data/test/descriptor.rb +26 -41
data/test/error.rb +1 -3
data/test/experiment.rb +301 -0
data/test/feature.rb +22 -10
data/test/lazar-long.rb +43 -23
data/test/lazar-physchem-short.rb +19 -16
data/test/prediction_models.rb +20 -0
data/test/regression.rb +43 -0
data/test/setup.rb +3 -1
data/test/test_environment.rb +10 -0
data/test/validation.rb +92 -26
metadata +64 -38
data/lib/SMARTS_InteLigand.txt +0 -983
data/lib/bbrc.rb +0 -165
data/lib/descriptor.rb +0 -247
data/lib/neighbor.rb +0 -25
data/lib/similarity.rb +0 -58
data/mongoid.yml +0 -8
data/test/descriptor-long.rb +0 -26
data/test/fminer-long.rb +0 -38
data/test/fminer.rb +0 -52
data/test/lazar-fminer.rb +0 -50
data/test/lazar-regression.rb +0 -27

data/lib/crossvalidation.rb CHANGED

@@ -6,12 +6,58 @@ module OpenTox
     field :folds, type: Integer
     field :nr_instances, type: Integer
     field :nr_unpredicted, type: Integer
-    field :predictions, type: Array
+    field :predictions, type: Array, default: []
     field :finished_at, type: Time
     def time
       finished_at - created_at
     end
+    def validations
+      validation_ids.collect{|vid| Validation.find vid}
+    end
+    def model
+      Model::Lazar.find model_id
+    end
+    def self.create model, n=10
+      model.training_dataset.features.first.nominal? ? klass = ClassificationCrossValidation : klass = RegressionCrossValidation
+      bad_request_error "#{dataset.features.first} is neither nominal nor numeric." unless klass
+      cv = klass.new(
+        name: model.name,
+        model_id: model.id,
+        folds: n
+      )
+      cv.save # set created_at
+      nr_instances = 0
+      nr_unpredicted = 0
+      predictions = []
+      training_dataset = Dataset.find model.training_dataset_id
+      training_dataset.folds(n).each_with_index do |fold,fold_nr|
+        #fork do # parallel execution of validations
+          $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started"
+          t = Time.now
+          validation = Validation.create(model, fold[0], fold[1],cv)
+          $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}:  #{Time.now-t} seconds"
+        #end
+      end
+      #Process.waitall
+      cv.validation_ids = Validation.where(:crossvalidation_id => cv.id).distinct(:_id)
+      cv.validations.each do |validation|
+        nr_instances += validation.nr_instances
+        nr_unpredicted += validation.nr_unpredicted
+        predictions += validation.predictions
+      end
+      cv.update_attributes(
+        nr_instances: nr_instances,
+        nr_unpredicted: nr_unpredicted,
+        predictions: predictions#.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
+      )
+      $logger.debug "Nr unpredicted: #{nr_unpredicted}"
+      cv.statistics
+      cv
+    end
   end
   class ClassificationCrossValidation < CrossValidation
@@ -23,39 +69,41 @@ module OpenTox
     field :weighted_accuracy, type: Float
     field :true_rate, type: Hash
     field :predictivity, type: Hash
+    field :confidence_plot_id, type: BSON::ObjectId
     # TODO auc, f-measure (usability??)
-    def self.create model, n=10
-      cv = self.new
-      cv.save # set created_at
-      validation_ids = []
-      nr_instances = 0
-      nr_unpredicted = 0
-      predictions = []
-      validation_class = Object.const_get(self.to_s.sub(/Cross/,''))
+    def statistics
       accept_values = Feature.find(model.prediction_feature_id).accept_values
       confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
       weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
       true_rate = {}
       predictivity = {}
-      fold_nr = 1
-      training_dataset = Dataset.find model.training_dataset_id
-      training_dataset.folds(n).each do |fold|
-        t = Time.now
-        $logger.debug "Fold #{fold_nr}"
-        validation = validation_class.create(model, fold[0], fold[1])
-        validation_ids << validation.id
-        nr_instances += validation.nr_instances
-        nr_unpredicted += validation.nr_unpredicted
-        predictions += validation.predictions
-        validation.confusion_matrix.each_with_index do |r,i|
-          r.each_with_index do |c,j|
-            confusion_matrix[i][j] += c
-            weighted_confusion_matrix[i][j] += validation.weighted_confusion_matrix[i][j]
+      predictions.each do |pred|
+        compound_id,activities,prediction,confidence = pred
+        if activities and prediction #and confidence.numeric?
+          if activities.uniq.size == 1
+            activity = activities.uniq.first
+            if prediction == activity
+              if prediction == accept_values[0]
+                confusion_matrix[0][0] += 1
+                #weighted_confusion_matrix[0][0] += confidence
+              elsif prediction == accept_values[1]
+                confusion_matrix[1][1] += 1
+                #weighted_confusion_matrix[1][1] += confidence
+              end
+            elsif prediction != activity
+              if prediction == accept_values[0]
+                confusion_matrix[0][1] += 1
+                #weighted_confusion_matrix[0][1] += confidence
+              elsif prediction == accept_values[1]
+                confusion_matrix[1][0] += 1
+                #weighted_confusion_matrix[1][0] += confidence
+              end
+            end
           end
+        else
+          nr_unpredicted += 1 if prediction.nil?
         end
-        $logger.debug "Fold #{fold_nr}:  #{Time.now-t} seconds"
-        fold_nr +=1
       end
       true_rate = {}
       predictivity = {}
@@ -64,30 +112,48 @@ module OpenTox
         predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
       end
       confidence_sum = 0
-      weighted_confusion_matrix.each do |r|
-        r.each do |c|
-          confidence_sum += c
-        end
-      end
-      cv.update_attributes(
-        name: model.name,
-        model_id: model.id,
-        folds: n,
-        validation_ids: validation_ids,
-        nr_instances: nr_instances,
-        nr_unpredicted: nr_unpredicted,
+      #weighted_confusion_matrix.each do |r|
+        #r.each do |c|
+          #confidence_sum += c
+        #end
+      #end
+      update_attributes(
         accept_values: accept_values,
         confusion_matrix: confusion_matrix,
-        weighted_confusion_matrix: weighted_confusion_matrix,
+        #weighted_confusion_matrix: weighted_confusion_matrix,
         accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
-        weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
+        #weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
         true_rate: true_rate,
         predictivity: predictivity,
-        predictions: predictions.sort{|a,b| b[3] <=> a[3]}, # sort according to confidence
         finished_at: Time.now
       )
-      cv.save
-      cv
+      $logger.debug "Accuracy #{accuracy}"
+    end
+    def confidence_plot
+      unless confidence_plot_id
+        tmpfile = "/tmp/#{id.to_s}_confidence.png"
+        accuracies = []
+        confidences = []
+        correct_predictions = 0
+        incorrect_predictions = 0
+        predictions.each do |p|
+          if p[1] and p[2]
+            p[1] == p[2] ? correct_predictions += 1 : incorrect_predictions += 1
+            accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
+            confidences << p[3]
+          end
+        end
+        R.assign "accuracy", accuracies
+        R.assign "confidence", confidences
+        R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()"
+        R.eval "ggsave(file='#{tmpfile}', plot=image)"
+        file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.png")
+        plot_id = $gridfs.insert_one(file)
+        update(:confidence_plot_id => plot_id)
+      end
+      $gridfs.find_one(_id: confidence_plot_id).data
     end
     #Average area under roc  0.646
@@ -99,98 +165,135 @@ module OpenTox
     field :rmse, type: Float
     field :mae, type: Float
-    field :weighted_rmse, type: Float
-    field :weighted_mae, type: Float
+    field :r_squared, type: Float
+    field :correlation_plot_id, type: BSON::ObjectId
-    def self.create model, n=10
-      cv = self.new
-      cv.save # set created_at
-      validation_ids = []
-      nr_instances = 0
-      nr_unpredicted = 0
-      predictions = []
-      validation_class = Object.const_get(self.to_s.sub(/Cross/,''))
-      fold_nr = 1
-      training_dataset = Dataset.find model.training_dataset_id
-      training_dataset.folds(n).each do |fold|
-        t = Time.now
-        $logger.debug "Predicting fold #{fold_nr}"
-        validation = validation_class.create(model, fold[0], fold[1])
-        validation_ids << validation.id
-        nr_instances += validation.nr_instances
-        nr_unpredicted += validation.nr_unpredicted
-        predictions += validation.predictions
-        $logger.debug "Fold #{fold_nr}:  #{Time.now-t} seconds"
-        fold_nr +=1
-      end
+    def statistics
       rmse = 0
-      weighted_rmse = 0
-      rse = 0
-      weighted_rse = 0
       mae = 0
-      weighted_mae = 0
-      rae = 0
-      weighted_rae = 0
-      n = 0
-      confidence_sum = 0
+      x = []
+      y = []
       predictions.each do |pred|
         compound_id,activity,prediction,confidence = pred
-        if activity and prediction
-          error = prediction-activity
-          rmse += error**2
-          weighted_rmse += confidence*error**2
-          mae += error.abs
-          weighted_mae += confidence*error.abs
-          n += 1
-          confidence_sum += confidence
+        if activity and prediction
+          unless activity == [nil]
+            x << -Math.log10(activity.median)
+            y << -Math.log10(prediction)
+            error = Math.log10(prediction)-Math.log10(activity.median)
+            rmse += error**2
+            #weighted_rmse += confidence*error**2
+            mae += error.abs
+            #weighted_mae += confidence*error.abs
+            #confidence_sum += confidence
+          end
         else
-          # TODO: create warnings
-          p pred
+          warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
+          $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
         end
       end
-      mae = mae/n
-      weighted_mae = weighted_mae/confidence_sum
-      rmse = Math.sqrt(rmse/n)
-      weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
-      cv.update_attributes(
-        name: model.name,
-        model_id: model.id,
-        folds: n,
-        validation_ids: validation_ids,
-        nr_instances: nr_instances,
-        nr_unpredicted: nr_unpredicted,
-        predictions: predictions.sort{|a,b| b[3] <=> a[3]},
+      R.assign "measurement", x
+      R.assign "prediction", y
+      R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
+      r = R.eval("r").to_ruby
+      mae = mae/predictions.size
+      #weighted_mae = weighted_mae/confidence_sum
+      rmse = Math.sqrt(rmse/predictions.size)
+      #weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
+      update_attributes(
         mae: mae,
         rmse: rmse,
-        weighted_mae: weighted_mae,
-        weighted_rmse: weighted_rmse
+        #weighted_mae: weighted_mae,
+        #weighted_rmse: weighted_rmse,
+        r_squared: r**2,
+        finished_at: Time.now
       )
-      cv.save
-      cv
+      $logger.debug "R^2 #{r**2}"
+      $logger.debug "RMSE #{rmse}"
+      $logger.debug "MAE #{mae}"
+    end
+    def misclassifications n=nil
+      #n = predictions.size unless n
+      n ||= 10
+      model = Model::Lazar.find(self.model_id)
+      training_dataset = Dataset.find(model.training_dataset_id)
+      prediction_feature = training_dataset.features.first
+      predictions.collect do |p|
+        unless p.include? nil
+          compound = Compound.find(p[0])
+          neighbors = compound.send(model.neighbor_algorithm,model.neighbor_algorithm_parameters)
+          neighbors.collect! do |n|
+            neighbor = Compound.find(n[0])
+            values = training_dataset.values(neighbor,prediction_feature)
+            { :smiles => neighbor.smiles, :similarity => n[1], :measurements => values}
+          end
+          {
+            :smiles => compound.smiles,
+            #:fingerprint => compound.fp4.collect{|id|  Smarts.find(id).name},
+            :measured => p[1],
+            :predicted => p[2],
+            #:relative_error => (Math.log10(p[1])-Math.log10(p[2])).abs/Math.log10(p[1]).to_f.abs,
+            :log_error => (Math.log10(p[1])-Math.log10(p[2])).abs,
+            :relative_error => (p[1]-p[2]).abs/p[1],
+            :confidence => p[3],
+            :neighbors => neighbors
+          }
+        end
+      end.compact.sort{|a,b| b[:relative_error] <=> a[:relative_error]}[0..n-1]
+    end
+    def confidence_plot
+      tmpfile = "/tmp/#{id.to_s}_confidence.png"
+      sorted_predictions = predictions.collect{|p| [(Math.log10(p[1])-Math.log10(p[2])).abs,p[3]] if p[1] and p[2]}.compact
+      R.assign "error", sorted_predictions.collect{|p| p[0]}
+      R.assign "confidence", sorted_predictions.collect{|p| p[1]}
+      # TODO fix axis names
+      R.eval "image = qplot(confidence,error)"
+      R.eval "image = image + stat_smooth(method='lm', se=FALSE)"
+      R.eval "ggsave(file='#{tmpfile}', plot=image)"
+      file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.png")
+      plot_id = $gridfs.insert_one(file)
+      update(:confidence_plot_id => plot_id)
+      $gridfs.find_one(_id: confidence_plot_id).data
+    end
+    def correlation_plot
+      unless correlation_plot_id
+        tmpfile = "/tmp/#{id.to_s}_correlation.png"
+        x = predictions.collect{|p| p[1]}
+        y = predictions.collect{|p| p[2]}
+        attributes = Model::Lazar.find(self.model_id).attributes
+        attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key}
+        attributes = attributes.values.collect{|v| v.is_a?(String) ? v.sub(/OpenTox::/,'') : v}.join("\n")
+        R.assign "measurement", x
+        R.assign "prediction", y
+        R.eval "all = c(-log(measurement),-log(prediction))"
+        R.eval "range = c(min(all), max(all))"
+        R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)"
+        R.eval "image = image + geom_abline(intercept=0, slope=1)"
+        R.eval "ggsave(file='#{tmpfile}', plot=image)"
+        file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.png")
+        plot_id = $gridfs.insert_one(file)
+        update(:correlation_plot_id => plot_id)
+      end
+      $gridfs.find_one(_id: correlation_plot_id).data
     end
+  end
-    def plot
-      # RMSE
-      x = predictions.collect{|p| p[1]}
-      y = predictions.collect{|p| p[2]}
-      R.assign "Measurement", x
-      R.assign "Prediction", y
-      R.eval "par(pty='s')" # sets the plot type to be square
-      #R.eval "fitline <- lm(log(Prediction) ~ log(Measurement))"
-      #R.eval "error <- log(Measurement)-log(Prediction)"
-      R.eval "error <- Measurement-Prediction"
-      R.eval "rmse <- sqrt(mean(error^2,na.rm=T))"
-      R.eval "mae <- mean( abs(error), na.rm = TRUE)"
-      R.eval "r <- cor(log(Prediction),log(Measurement))"
-      R.eval "svg(filename='/tmp/#{id.to_s}.svg')"
-      R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', sub=paste('RMSE: ',rmse, 'MAE :',mae, 'r^2: ',r^2),asp=1)"
-      #R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', sub=paste('RMSE: ',rmse, 'MAE :',mae, 'r^2: '),asp=1)"
-      #R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', ,asp=1)"
-      R.eval "abline(0,1,col='blue')"
-      #R.eval "abline(fitline,col='red')"
-      R.eval "dev.off()"
-      "/tmp/#{id.to_s}.svg"
+  class RepeatedCrossValidation
+    field :crossvalidation_ids, type: Array, default: []
+    def self.create model, folds=10, repeats=3
+      repeated_cross_validation = self.new
+      repeats.times do |n|
+        $logger.debug "Crossvalidation #{n+1} for #{model.name}"
+        repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id
+      end
+      repeated_cross_validation.save
+      repeated_cross_validation
+    end
+    def crossvalidations
+      crossvalidation_ids.collect{|id| CrossValidation.find(id)}
     end
   end