lazar 0.0.7 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. checksums.yaml +4 -4
  2. data/.gitignore +3 -0
  3. data/README.md +2 -1
  4. data/VERSION +1 -1
  5. data/ext/lazar/extconf.rb +15 -76
  6. data/ext/lazar/rinstall.R +9 -0
  7. data/lazar.gemspec +7 -7
  8. data/lib/classification.rb +5 -78
  9. data/lib/compound.rb +201 -44
  10. data/lib/crossvalidation.rb +224 -121
  11. data/lib/dataset.rb +83 -93
  12. data/lib/error.rb +1 -1
  13. data/lib/experiment.rb +99 -0
  14. data/lib/feature.rb +2 -54
  15. data/lib/lazar.rb +47 -34
  16. data/lib/leave-one-out-validation.rb +205 -0
  17. data/lib/model.rb +131 -76
  18. data/lib/opentox.rb +2 -2
  19. data/lib/overwrite.rb +37 -0
  20. data/lib/physchem.rb +133 -0
  21. data/lib/regression.rb +117 -189
  22. data/lib/rest-client-wrapper.rb +4 -5
  23. data/lib/unique_descriptors.rb +6 -7
  24. data/lib/validation.rb +63 -69
  25. data/test/all.rb +2 -2
  26. data/test/classification.rb +41 -0
  27. data/test/compound.rb +116 -7
  28. data/test/data/LOAEL_log_mg_corrected_smiles.csv +567 -567
  29. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +566 -566
  30. data/test/data/LOAEL_mmol_corrected_smiles.csv +568 -0
  31. data/test/data/batch_prediction.csv +25 -0
  32. data/test/data/batch_prediction_inchi_small.csv +4 -0
  33. data/test/data/batch_prediction_smiles_small.csv +4 -0
  34. data/test/data/hamster_carcinogenicity.json +3 -0
  35. data/test/data/loael.csv +568 -0
  36. data/test/dataset-long.rb +5 -8
  37. data/test/dataset.rb +31 -11
  38. data/test/default_environment.rb +11 -0
  39. data/test/descriptor.rb +26 -41
  40. data/test/error.rb +1 -3
  41. data/test/experiment.rb +301 -0
  42. data/test/feature.rb +22 -10
  43. data/test/lazar-long.rb +43 -23
  44. data/test/lazar-physchem-short.rb +19 -16
  45. data/test/prediction_models.rb +20 -0
  46. data/test/regression.rb +43 -0
  47. data/test/setup.rb +3 -1
  48. data/test/test_environment.rb +10 -0
  49. data/test/validation.rb +92 -26
  50. metadata +64 -38
  51. data/lib/SMARTS_InteLigand.txt +0 -983
  52. data/lib/bbrc.rb +0 -165
  53. data/lib/descriptor.rb +0 -247
  54. data/lib/neighbor.rb +0 -25
  55. data/lib/similarity.rb +0 -58
  56. data/mongoid.yml +0 -8
  57. data/test/descriptor-long.rb +0 -26
  58. data/test/fminer-long.rb +0 -38
  59. data/test/fminer.rb +0 -52
  60. data/test/lazar-fminer.rb +0 -50
  61. data/test/lazar-regression.rb +0 -27
data/lib/validation.rb CHANGED
@@ -2,7 +2,9 @@ module OpenTox
2
2
 
3
3
  class Validation
4
4
 
5
+ field :model_id, type: BSON::ObjectId
5
6
  field :prediction_dataset_id, type: BSON::ObjectId
7
+ field :crossvalidation_id, type: BSON::ObjectId
6
8
  field :test_dataset_id, type: BSON::ObjectId
7
9
  field :nr_instances, type: Integer
8
10
  field :nr_unpredicted, type: Integer
@@ -16,98 +18,90 @@ module OpenTox
16
18
  Dataset.find test_dataset_id
17
19
  end
18
20
 
19
- end
21
+ def model
22
+ Model::Lazar.find model_id
23
+ end
20
24
 
21
- class ClassificationValidation < Validation
22
- field :accept_values, type: String
23
- field :confusion_matrix, type: Array
24
- field :weighted_confusion_matrix, type: Array
25
+ def self.create model, training_set, test_set, crossvalidation=nil
26
+
27
+ atts = model.attributes.dup # do not modify attributes from original model
28
+ atts["_id"] = BSON::ObjectId.new
29
+ atts[:training_dataset_id] = training_set.id
30
+ validation_model = model.class.create training_set, atts
31
+ validation_model.save
32
+ cids = test_set.compound_ids
25
33
 
26
- def self.create model, training_set, test_set
27
- validation = self.class.new
28
- #feature_dataset = Dataset.find model.feature_dataset_id
29
- # TODO check and delegate to Algorithm
30
- #features = Algorithm.run feature_dataset.training_algorithm, training_set, feature_dataset.training_parameters
31
- validation_model = model.class.create training_set#, features
32
- test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used
34
+ test_set_without_activities = Dataset.new(:compound_ids => cids.uniq) # remove duplicates and make sure that activities cannot be used
33
35
  prediction_dataset = validation_model.predict test_set_without_activities
34
- accept_values = prediction_dataset.prediction_feature.accept_values
35
- confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
36
- weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
37
36
  predictions = []
38
37
  nr_unpredicted = 0
39
- prediction_dataset.data_entries.each_with_index do |pe,i|
40
- if pe[0] and pe[1] and pe[1].numeric?
41
- prediction = pe[0]
42
- # TODO prediction_feature, convention??
43
- # TODO generalize for multiple classes
44
- activity = test_set.data_entries[i].first
45
- confidence = prediction_dataset.data_entries[i][1]
46
- predictions << [prediction_dataset.compound_ids[i], activity, prediction, confidence]
47
- if prediction == activity
48
- if prediction == accept_values[0]
49
- confusion_matrix[0][0] += 1
50
- weighted_confusion_matrix[0][0] += confidence
51
- elsif prediction == accept_values[1]
52
- confusion_matrix[1][1] += 1
53
- weighted_confusion_matrix[1][1] += confidence
54
- end
55
- elsif prediction != activity
56
- if prediction == accept_values[0]
57
- confusion_matrix[0][1] += 1
58
- weighted_confusion_matrix[0][1] += confidence
59
- elsif prediction == accept_values[1]
60
- confusion_matrix[1][0] += 1
61
- weighted_confusion_matrix[1][0] += confidence
62
- end
63
- end
38
+ activities = test_set.data_entries.collect{|de| de.first}
39
+ prediction_dataset.data_entries.each_with_index do |de,i|
40
+ if de[0] #and de[1]
41
+ cid = prediction_dataset.compound_ids[i]
42
+ rows = cids.each_index.select{|r| cids[r] == cid }
43
+ activities = rows.collect{|r| test_set.data_entries[r][0]}
44
+ prediction = de.first
45
+ confidence = de[1]
46
+ predictions << [prediction_dataset.compound_ids[i], activities, prediction, de[1]]
64
47
  else
65
- nr_unpredicted += 1 if pe[0].nil?
48
+ nr_unpredicted += 1
66
49
  end
67
50
  end
68
51
  validation = self.new(
52
+ :model_id => validation_model.id,
69
53
  :prediction_dataset_id => prediction_dataset.id,
70
54
  :test_dataset_id => test_set.id,
71
55
  :nr_instances => test_set.compound_ids.size,
72
56
  :nr_unpredicted => nr_unpredicted,
73
- :accept_values => accept_values,
74
- :confusion_matrix => confusion_matrix,
75
- :weighted_confusion_matrix => weighted_confusion_matrix,
76
- :predictions => predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
57
+ :predictions => predictions#.sort{|a,b| p a; b[3] <=> a[3]} # sort according to confidence
77
58
  )
59
+ validation.crossvalidation_id = crossvalidation.id if crossvalidation
78
60
  validation.save
79
61
  validation
80
62
  end
63
+
64
+ end
65
+
66
+ class ClassificationValidation < Validation
81
67
  end
82
68
 
83
69
  class RegressionValidation < Validation
84
- def self.create model, training_set, test_set
85
-
86
- validation_model = Model::LazarRegression.create training_set
87
- test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used
88
- prediction_dataset = validation_model.predict test_set_without_activities
89
- predictions = []
90
- nr_unpredicted = 0
91
- activities = test_set.data_entries.collect{|de| de.first}
92
- prediction_dataset.data_entries.each_with_index do |de,i|
93
- if de[0] and de[1] and de[1].numeric?
94
- activity = activities[i]
95
- prediction = de.first
96
- confidence = de[1]
97
- predictions << [prediction_dataset.compound_ids[i], activity, prediction,confidence]
70
+
71
+ def statistics
72
+ rmse = 0
73
+ weighted_rmse = 0
74
+ rse = 0
75
+ weighted_rse = 0
76
+ mae = 0
77
+ weighted_mae = 0
78
+ confidence_sum = 0
79
+ predictions.each do |pred|
80
+ compound_id,activity,prediction,confidence = pred
81
+ if activity and prediction
82
+ error = Math.log10(prediction)-Math.log10(activity.median)
83
+ rmse += error**2
84
+ weighted_rmse += confidence*error**2
85
+ mae += error.abs
86
+ weighted_mae += confidence*error.abs
87
+ confidence_sum += confidence
98
88
  else
99
- nr_unpredicted += 1
89
+ warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
90
+ $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
100
91
  end
101
92
  end
102
- validation = self.new(
103
- :prediction_dataset_id => prediction_dataset.id,
104
- :test_dataset_id => test_set.id,
105
- :nr_instances => test_set.compound_ids.size,
106
- :nr_unpredicted => nr_unpredicted,
107
- :predictions => predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
108
- )
109
- validation.save
110
- validation
93
+ x = predictions.collect{|p| p[1].median}
94
+ y = predictions.collect{|p| p[2]}
95
+ R.assign "measurement", x
96
+ R.assign "prediction", y
97
+ R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
98
+ r = R.eval("r").to_ruby
99
+
100
+ mae = mae/predictions.size
101
+ weighted_mae = weighted_mae/confidence_sum
102
+ rmse = Math.sqrt(rmse/predictions.size)
103
+ weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
104
+ { "R^2" => r**2, "RMSE" => rmse, "MAE" => mae }
111
105
  end
112
106
  end
113
107
 
data/test/all.rb CHANGED
@@ -1,5 +1,5 @@
1
- exclude = ["./setup.rb","./all.rb"]
1
+ # "./default_environment.rb" has to be executed separately
2
+ exclude = ["./setup.rb","./all.rb", "./default_environment.rb"]
2
3
  (Dir[File.join(File.dirname(__FILE__),"*.rb")]-exclude).each do |test|
3
- p test
4
4
  require_relative test
5
5
  end
@@ -0,0 +1,41 @@
1
+ require_relative "setup.rb"
2
+
3
+ class LazarClassificationTest < MiniTest::Test
4
+
5
+ def test_lazar_classification
6
+ training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
7
+ model = Model::LazarClassification.create training_dataset
8
+
9
+ [ {
10
+ :compound => OpenTox::Compound.from_inchi("InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"),
11
+ :prediction => "false",
12
+ :confidence => 0.25281385281385277,
13
+ :nr_neighbors => 11
14
+ },{
15
+ :compound => OpenTox::Compound.from_smiles("c1ccccc1NN"),
16
+ :prediction => "false",
17
+ :confidence => 0.3639589577089577,
18
+ :nr_neighbors => 14
19
+ } ].each do |example|
20
+ prediction = model.predict example[:compound]
21
+ assert_equal example[:prediction], prediction[:value]
22
+ #assert_equal example[:confidence], prediction[:confidence]
23
+ #assert_equal example[:nr_neighbors], prediction[:neighbors].size
24
+ end
25
+
26
+ compound = Compound.from_smiles "CCO"
27
+ prediction = model.predict compound
28
+ assert_equal ["false"], prediction[:database_activities]
29
+ assert_equal "true", prediction[:value]
30
+
31
+ # make a dataset prediction
32
+ compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini.csv")
33
+ prediction = model.predict compound_dataset
34
+ assert_equal compound_dataset.compounds, prediction.compounds
35
+
36
+ assert_equal "Could not find similar compounds with experimental data in the training dataset.", prediction.data_entries[7][3]
37
+ assert_equal "1 compounds have been removed from neighbors, because they have the same structure as the query compound.", prediction.data_entries[14][3]
38
+ # cleanup
39
+ [training_dataset,model,compound_dataset].each{|o| o.delete}
40
+ end
41
+ end
data/test/compound.rb CHANGED
@@ -54,7 +54,6 @@ print c.sdf
54
54
 
55
55
  def test_inchikey
56
56
  c = OpenTox::Compound.from_inchi "InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"
57
- p c
58
57
  assert_equal "UHOVQNZJYSORNB-UHFFFAOYSA-N", c.inchikey
59
58
  end
60
59
 
@@ -65,8 +64,7 @@ print c.sdf
65
64
 
66
65
  def test_chemblid
67
66
  c = OpenTox::Compound.from_inchi "InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"
68
- #assert_equal "CHEMBL277500", c.chemblid
69
- assert_equal "CHEMBL581676", c.chemblid
67
+ assert_equal "CHEMBL277500", c.chemblid
70
68
  end
71
69
 
72
70
  def test_sdf_storage
@@ -78,17 +76,17 @@ print c.sdf
78
76
  def test_fingerprint
79
77
  c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
80
78
 
81
- assert c.fp4.collect{|fid| Feature.find(fid).name}.include? ("1,3-Tautomerizable")
82
- assert_equal c.fp4.size, c.fp4_size
79
+ assert_equal 9, c.fingerprint("FP4").size
83
80
  end
84
81
 
85
82
  def test_neighbors
86
83
  d = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.csv")
87
84
  d.compounds.each do |c|
88
- refute_nil c.fp4
85
+ refute_nil c.fingerprint("MP2D")
89
86
  end
90
87
  c = d.compounds[371]
91
- assert c.neighbors.size >= 19
88
+ n = c.fingerprint_neighbors({:type => "FP4", :min_sim => 0.7, :training_dataset_id => d.id })
89
+ assert n.size >= 18, "Neighbors size (#{n.size}) should be larger than 17"
92
90
  end
93
91
 
94
92
  def test_openbabel_segfault
@@ -97,4 +95,115 @@ print c.sdf
97
95
  c = Compound.from_inchi(inchi)
98
96
  assert_equal inchi, c.inchi
99
97
  end
98
+
99
+ def test_openbabel_fingerprint
100
+ [
101
+ "CC(=O)CC(C)C#N",
102
+ "CC(=O)CC(C)C",
103
+ "C(=O)CC(C)C#N",
104
+ ].each do |smi|
105
+ c = OpenTox::Compound.from_smiles smi
106
+ refute_nil c.fingerprint("FP4")
107
+ end
108
+ end
109
+
110
+ def test_fingerprint_neighbors
111
+ types = ["FP2", "FP3", "FP4", "MACCS"]
112
+ min_sim = 0.7
113
+ training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.csv")
114
+ [
115
+ "CC(=O)CC(C)C#N",
116
+ "CC(=O)CC(C)C",
117
+ "C(=O)CC(C)C#N",
118
+ ].each do |smi|
119
+ c = OpenTox::Compound.from_smiles smi
120
+ types.each do |type|
121
+ neighbors = c.fingerprint_neighbors({:type => type, :training_dataset_id => training_dataset.id, :min_sim => min_sim})
122
+ unless type == "FP2" and smi == "CC(=O)CC(C)C#N" or smi == "C(=O)CC(C)C#N" and (type == "FP2" or type == "MACCS")
123
+ refute_empty neighbors
124
+ end
125
+ end
126
+ end
127
+ end
128
+
129
+ def test_mna
130
+ c = OpenTox::Compound.from_smiles "N#[N+]C1=CC=CC=C1.F[B-](F)(F)F"
131
+ assert_equal 18, c.fingerprint("MNA").size
132
+ assert_equal 9, c.fingerprint("MNA").uniq.size
133
+ end
134
+
135
+ def test_mpd
136
+ c = OpenTox::Compound.from_smiles "N#[N+]C1=CC=CC=C1.F[B-](F)(F)F"
137
+ assert 13, c.fingerprint("MP2D").size
138
+ assert 7, c.fingerprint("MP2D").uniq.size
139
+ end
140
+
141
+ def test_fingerprint_count_neighbors
142
+ types = ["MP2D", "MNA"]
143
+ min_sim = 0.0
144
+ training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.csv")
145
+ [
146
+ "CC(=O)CC(C)C#N",
147
+ "CC(=O)CC(C)C",
148
+ "C(=O)CC(C)C#N",
149
+ ].each do |smi|
150
+ c = OpenTox::Compound.from_smiles smi
151
+ types.each do |type|
152
+ neighbors = c.fingerprint_count_neighbors({:type => type, :training_dataset_id => training_dataset.id, :min_sim => min_sim})
153
+ if type == "FP4"
154
+ fp4_neighbors = c.neighbors
155
+ neighbors.each do |n|
156
+ assert_includes fp4_neighbors, n
157
+ end
158
+ end
159
+ end
160
+ end
161
+ end
162
+
163
+ def test_fingerprint_db_neighbors
164
+ #skip
165
+ training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.csv")
166
+ [
167
+ "CC(=O)CC(C)C#N",
168
+ "CC(=O)CC(C)C",
169
+ "C(=O)CC(C)C#N",
170
+ ].each do |smi|
171
+ c = OpenTox::Compound.from_smiles smi
172
+ t = Time.now
173
+ neighbors = c.db_neighbors(:training_dataset_id => training_dataset.id, :min_sim => 0.2)
174
+ p Time.now - t
175
+ t = Time.now
176
+ neighbors2 = c.fingerprint_neighbors({:type => "MP2D", :training_dataset_id => training_dataset.id, :min_sim => 0.2})
177
+ p Time.now - t
178
+ p neighbors.size
179
+ p neighbors2.size
180
+ #p neighbors
181
+ #p neighbors2
182
+ #p neighbors2 - neighbors
183
+ #assert_equal neighbors, neighbors2
184
+ end
185
+ end
186
+
187
+ def test_molecular_weight
188
+ c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C"
189
+ assert_equal 100.15888, c.molecular_weight
190
+ end
191
+
192
+ def test_mg_conversions
193
+ # TODO fix!
194
+ skip
195
+ c = OpenTox::Compound.from_smiles "O"
196
+ mw = c.molecular_weight
197
+ assert_equal 18.01528, mw
198
+ assert_equal 0.8105107141417474, c.logmmol_to_mg(4.34688225631145, mw)
199
+ assert_equal 9007.64, c.mmol_to_mg(500, mw)
200
+ assert_equal 2437.9999984148976, c.logmg_to_mg(3.387033701)
201
+ end
202
+
203
+ def test_physchem
204
+ c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C"
205
+ assert_equal PhysChem::OBDESCRIPTORS.size, c.physchem.size
206
+ assert_equal PhysChem::OBDESCRIPTORS.size, c.physchem(PhysChem.openbabel_descriptors).size
207
+ assert_equal PhysChem::unique_descriptors.size, c.physchem(PhysChem.unique_descriptors).size
208
+ end
100
209
  end