lazar 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.yardopts +4 -0
- data/Gemfile +2 -0
- data/LICENSE +674 -0
- data/README.md +44 -0
- data/Rakefile +1 -0
- data/VERSION +1 -0
- data/ext/lazar/extconf.rb +87 -0
- data/java/CdkDescriptorInfo.class +0 -0
- data/java/CdkDescriptorInfo.java +22 -0
- data/java/CdkDescriptors.class +0 -0
- data/java/CdkDescriptors.java +141 -0
- data/java/Jmol.jar +0 -0
- data/java/JoelibDescriptorInfo.class +0 -0
- data/java/JoelibDescriptorInfo.java +15 -0
- data/java/JoelibDescriptors.class +0 -0
- data/java/JoelibDescriptors.java +60 -0
- data/java/Rakefile +15 -0
- data/java/cdk-1.4.19.jar +0 -0
- data/java/joelib2.jar +0 -0
- data/java/log4j.jar +0 -0
- data/lazar.gemspec +29 -0
- data/lib/SMARTS_InteLigand.txt +983 -0
- data/lib/algorithm.rb +21 -0
- data/lib/bbrc.rb +165 -0
- data/lib/classification.rb +107 -0
- data/lib/compound.rb +254 -0
- data/lib/crossvalidation.rb +187 -0
- data/lib/dataset.rb +334 -0
- data/lib/descriptor.rb +247 -0
- data/lib/error.rb +66 -0
- data/lib/feature.rb +97 -0
- data/lib/lazar-model.rb +170 -0
- data/lib/lazar.rb +69 -0
- data/lib/neighbor.rb +25 -0
- data/lib/opentox.rb +22 -0
- data/lib/overwrite.rb +119 -0
- data/lib/regression.rb +199 -0
- data/lib/rest-client-wrapper.rb +98 -0
- data/lib/similarity.rb +58 -0
- data/lib/unique_descriptors.rb +120 -0
- data/lib/validation.rb +114 -0
- data/mongoid.yml +8 -0
- data/test/all.rb +5 -0
- data/test/compound.rb +100 -0
- data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +13553 -0
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +436 -0
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +568 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +87 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +978 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +1120 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +1113 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +850 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +829 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +1198 -0
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +1505 -0
- data/test/data/EPAFHM.csv +618 -0
- data/test/data/EPAFHM.medi.csv +100 -0
- data/test/data/EPAFHM.mini.csv +22 -0
- data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +581 -0
- data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +1217 -0
- data/test/data/ISSCAN-multi.csv +59 -0
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +568 -0
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +568 -0
- data/test/data/acetaldehyde.sdf +14 -0
- data/test/data/boiling_points.ext.sdf +11460 -0
- data/test/data/cpdb_100.csv +101 -0
- data/test/data/hamster_carcinogenicity.csv +86 -0
- data/test/data/hamster_carcinogenicity.mini.bool_float.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.bool_int.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.bool_string.csv +11 -0
- data/test/data/hamster_carcinogenicity.mini.csv +11 -0
- data/test/data/hamster_carcinogenicity.ntriples +618 -0
- data/test/data/hamster_carcinogenicity.sdf +2805 -0
- data/test/data/hamster_carcinogenicity.xls +0 -0
- data/test/data/hamster_carcinogenicity.yaml +352 -0
- data/test/data/hamster_carcinogenicity_with_errors.csv +88 -0
- data/test/data/kazius.csv +4070 -0
- data/test/data/multi_cell_call.csv +1067 -0
- data/test/data/multi_cell_call_no_dup.csv +1057 -0
- data/test/data/multicolumn.csv +8 -0
- data/test/data/rat_feature_dataset.csv +1179 -0
- data/test/data/wrong_dataset.csv +8 -0
- data/test/dataset-long.rb +117 -0
- data/test/dataset.rb +199 -0
- data/test/descriptor-long.rb +26 -0
- data/test/descriptor.rb +83 -0
- data/test/error.rb +24 -0
- data/test/feature.rb +65 -0
- data/test/fminer-long.rb +38 -0
- data/test/fminer.rb +52 -0
- data/test/lazar-fminer.rb +50 -0
- data/test/lazar-long.rb +72 -0
- data/test/lazar-physchem-short.rb +27 -0
- data/test/setup.rb +6 -0
- data/test/validation.rb +41 -0
- metadata +212 -0
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
module OpenTox

  # Base class for cross-validation results.
  # Holds the statistics that are aggregated over all folds; subclasses add
  # endpoint-specific metrics (classification / regression).
  class CrossValidation
    # ids of the per-fold Validation documents
    field :validation_ids, type: Array, default: []
    # number of folds (n)
    field :folds, type: Integer
    # total number of instances over all folds
    field :nr_instances, type: Integer
    # number of instances without a prediction
    field :nr_unpredicted, type: Integer
    # flat list of [compound_id, measured, predicted, confidence] entries
    field :predictions, type: Array
    field :finished_at, type: Time
  end
|
|
11
|
+
|
|
12
|
+
class ClassificationCrossValidation < CrossValidation
|
|
13
|
+
|
|
14
|
+
field :accept_values, type: Array
|
|
15
|
+
field :confusion_matrix, type: Array
|
|
16
|
+
field :weighted_confusion_matrix, type: Array
|
|
17
|
+
field :accuracy, type: Float
|
|
18
|
+
field :weighted_accuracy, type: Float
|
|
19
|
+
field :true_rate, type: Hash
|
|
20
|
+
field :predictivity, type: Hash
|
|
21
|
+
# TODO auc, f-measure (usability??)
|
|
22
|
+
|
|
23
|
+
def self.create model, n=10
|
|
24
|
+
cv = self.new
|
|
25
|
+
validation_ids = []
|
|
26
|
+
nr_instances = 0
|
|
27
|
+
nr_unpredicted = 0
|
|
28
|
+
predictions = []
|
|
29
|
+
validation_class = Object.const_get(self.to_s.sub(/Cross/,''))
|
|
30
|
+
accept_values = Feature.find(model.prediction_feature_id).accept_values
|
|
31
|
+
confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
|
|
32
|
+
weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
|
|
33
|
+
true_rate = {}
|
|
34
|
+
predictivity = {}
|
|
35
|
+
fold_nr = 1
|
|
36
|
+
training_dataset = Dataset.find model.training_dataset_id
|
|
37
|
+
training_dataset.folds(n).each do |fold|
|
|
38
|
+
t = Time.now
|
|
39
|
+
$logger.debug "Fold #{fold_nr}"
|
|
40
|
+
validation = validation_class.create(model, fold[0], fold[1])
|
|
41
|
+
validation_ids << validation.id
|
|
42
|
+
nr_instances += validation.nr_instances
|
|
43
|
+
nr_unpredicted += validation.nr_unpredicted
|
|
44
|
+
predictions += validation.predictions
|
|
45
|
+
validation.confusion_matrix.each_with_index do |r,i|
|
|
46
|
+
r.each_with_index do |c,j|
|
|
47
|
+
confusion_matrix[i][j] += c
|
|
48
|
+
weighted_confusion_matrix[i][j] += validation.weighted_confusion_matrix[i][j]
|
|
49
|
+
end
|
|
50
|
+
end
|
|
51
|
+
$logger.debug "Fold #{fold_nr}: #{Time.now-t} seconds"
|
|
52
|
+
fold_nr +=1
|
|
53
|
+
end
|
|
54
|
+
true_rate = {}
|
|
55
|
+
predictivity = {}
|
|
56
|
+
accept_values.each_with_index do |v,i|
|
|
57
|
+
true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
|
|
58
|
+
predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
|
|
59
|
+
end
|
|
60
|
+
confidence_sum = 0
|
|
61
|
+
weighted_confusion_matrix.each do |r|
|
|
62
|
+
r.each do |c|
|
|
63
|
+
confidence_sum += c
|
|
64
|
+
end
|
|
65
|
+
end
|
|
66
|
+
cv.update_attributes(
|
|
67
|
+
nr_instances: nr_instances,
|
|
68
|
+
nr_unpredicted: nr_unpredicted,
|
|
69
|
+
accept_values: accept_values,
|
|
70
|
+
confusion_matrix: confusion_matrix,
|
|
71
|
+
weighted_confusion_matrix: weighted_confusion_matrix,
|
|
72
|
+
accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
|
|
73
|
+
weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
|
|
74
|
+
true_rate: true_rate,
|
|
75
|
+
predictivity: predictivity,
|
|
76
|
+
predictions: predictions.sort{|a,b| b[3] <=> a[3]}, # sort according to confidence
|
|
77
|
+
finished_at: Time.now
|
|
78
|
+
)
|
|
79
|
+
cv.save
|
|
80
|
+
cv
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
#Average area under roc 0.646
|
|
84
|
+
#Area under roc 0.646
|
|
85
|
+
#F measure carcinogen: 0.769, noncarcinogen: 0.348
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
class RegressionCrossValidation < Validation
|
|
89
|
+
|
|
90
|
+
field :validation_ids, type: Array, default: []
|
|
91
|
+
field :folds, type: Integer
|
|
92
|
+
field :rmse, type: Float
|
|
93
|
+
field :mae, type: Float
|
|
94
|
+
field :weighted_rmse, type: Float
|
|
95
|
+
field :weighted_mae, type: Float
|
|
96
|
+
|
|
97
|
+
def self.create model, n=10
|
|
98
|
+
cv = self.new
|
|
99
|
+
validation_ids = []
|
|
100
|
+
nr_instances = 0
|
|
101
|
+
nr_unpredicted = 0
|
|
102
|
+
predictions = []
|
|
103
|
+
validation_class = Object.const_get(self.to_s.sub(/Cross/,''))
|
|
104
|
+
fold_nr = 1
|
|
105
|
+
training_dataset = Dataset.find model.training_dataset_id
|
|
106
|
+
training_dataset.folds(n).each do |fold|
|
|
107
|
+
t = Time.now
|
|
108
|
+
$logger.debug "Predicting fold #{fold_nr}"
|
|
109
|
+
|
|
110
|
+
validation = validation_class.create(model, fold[0], fold[1])
|
|
111
|
+
validation_ids << validation.id
|
|
112
|
+
nr_instances += validation.nr_instances
|
|
113
|
+
nr_unpredicted += validation.nr_unpredicted
|
|
114
|
+
predictions += validation.predictions
|
|
115
|
+
$logger.debug "Fold #{fold_nr}: #{Time.now-t} seconds"
|
|
116
|
+
fold_nr +=1
|
|
117
|
+
end
|
|
118
|
+
rmse = 0
|
|
119
|
+
weighted_rmse = 0
|
|
120
|
+
rse = 0
|
|
121
|
+
weighted_rse = 0
|
|
122
|
+
mae = 0
|
|
123
|
+
weighted_mae = 0
|
|
124
|
+
rae = 0
|
|
125
|
+
weighted_rae = 0
|
|
126
|
+
n = 0
|
|
127
|
+
confidence_sum = 0
|
|
128
|
+
predictions.each do |pred|
|
|
129
|
+
compound_id,activity,prediction,confidence = pred
|
|
130
|
+
if activity and prediction
|
|
131
|
+
error = prediction-activity
|
|
132
|
+
rmse += error**2
|
|
133
|
+
weighted_rmse += confidence*error**2
|
|
134
|
+
mae += error.abs
|
|
135
|
+
weighted_mae += confidence*error.abs
|
|
136
|
+
n += 1
|
|
137
|
+
confidence_sum += confidence
|
|
138
|
+
else
|
|
139
|
+
# TODO: create warnings
|
|
140
|
+
p pred
|
|
141
|
+
end
|
|
142
|
+
end
|
|
143
|
+
mae = mae/n
|
|
144
|
+
weighted_mae = weighted_mae/confidence_sum
|
|
145
|
+
rmse = Math.sqrt(rmse/n)
|
|
146
|
+
weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
|
|
147
|
+
cv.update_attributes(
|
|
148
|
+
folds: n,
|
|
149
|
+
validation_ids: validation_ids,
|
|
150
|
+
nr_instances: nr_instances,
|
|
151
|
+
nr_unpredicted: nr_unpredicted,
|
|
152
|
+
predictions: predictions.sort{|a,b| b[3] <=> a[3]},
|
|
153
|
+
mae: mae,
|
|
154
|
+
rmse: rmse,
|
|
155
|
+
weighted_mae: weighted_mae,
|
|
156
|
+
weighted_rmse: weighted_rmse
|
|
157
|
+
)
|
|
158
|
+
cv.save
|
|
159
|
+
cv
|
|
160
|
+
end
|
|
161
|
+
|
|
162
|
+
def plot
|
|
163
|
+
# RMSE
|
|
164
|
+
x = predictions.collect{|p| p[1]}
|
|
165
|
+
y = predictions.collect{|p| p[2]}
|
|
166
|
+
R.assign "Measurement", x
|
|
167
|
+
R.assign "Prediction", y
|
|
168
|
+
R.eval "par(pty='s')" # sets the plot type to be square
|
|
169
|
+
#R.eval "fitline <- lm(log(Prediction) ~ log(Measurement))"
|
|
170
|
+
#R.eval "error <- log(Measurement)-log(Prediction)"
|
|
171
|
+
R.eval "error <- Measurement-Prediction"
|
|
172
|
+
R.eval "rmse <- sqrt(mean(error^2,na.rm=T))"
|
|
173
|
+
R.eval "mae <- mean( abs(error), na.rm = TRUE)"
|
|
174
|
+
R.eval "r <- cor(log(Prediction),log(Measurement))"
|
|
175
|
+
R.eval "svg(filename='/tmp/#{id.to_s}.svg')"
|
|
176
|
+
R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', sub=paste('RMSE: ',rmse, 'MAE :',mae, 'r^2: ',r^2),asp=1)"
|
|
177
|
+
#R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', sub=paste('RMSE: ',rmse, 'MAE :',mae, 'r^2: '),asp=1)"
|
|
178
|
+
#R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', ,asp=1)"
|
|
179
|
+
R.eval "abline(0,1,col='blue')"
|
|
180
|
+
#R.eval "abline(fitline,col='red')"
|
|
181
|
+
R.eval "dev.off()"
|
|
182
|
+
"/tmp/#{id.to_s}.svg"
|
|
183
|
+
end
|
|
184
|
+
end
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
end
|
data/lib/dataset.rb
ADDED
|
@@ -0,0 +1,334 @@
|
|
|
1
|
+
require 'csv'
|
|
2
|
+
require 'tempfile'
|
|
3
|
+
|
|
4
|
+
module OpenTox
|
|
5
|
+
|
|
6
|
+
class Dataset
|
|
7
|
+
|
|
8
|
+
attr_writer :data_entries
|
|
9
|
+
|
|
10
|
+
# associations like has_many, belongs_to deteriorate performance
|
|
11
|
+
field :feature_ids, type: Array, default: []
|
|
12
|
+
field :compound_ids, type: Array, default: []
|
|
13
|
+
field :data_entries_id, type: BSON::ObjectId, default: []
|
|
14
|
+
field :source, type: String
|
|
15
|
+
field :warnings, type: Array, default: []
|
|
16
|
+
|
|
17
|
+
# Save all data including data_entries
|
|
18
|
+
# Should be used instead of save
|
|
19
|
+
def save_all
|
|
20
|
+
dump = Marshal.dump(@data_entries)
|
|
21
|
+
file = Mongo::Grid::File.new(dump, :filename => "#{self.id.to_s}.data_entries")
|
|
22
|
+
data_entries_id = $gridfs.insert_one(file)
|
|
23
|
+
update(:data_entries_id => data_entries_id)
|
|
24
|
+
save
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
# Readers
|
|
28
|
+
|
|
29
|
+
# Get all compounds
|
|
30
|
+
def compounds
|
|
31
|
+
@compounds ||= self.compound_ids.collect{|id| OpenTox::Compound.find id}
|
|
32
|
+
@compounds
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
# Get all features
|
|
36
|
+
def features
|
|
37
|
+
@features ||= self.feature_ids.collect{|id| OpenTox::Feature.find(id)}
|
|
38
|
+
@features
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
# Get all data_entries
|
|
42
|
+
def data_entries
|
|
43
|
+
unless @data_entries
|
|
44
|
+
t = Time.now
|
|
45
|
+
data_entry_file = $gridfs.find_one(_id: data_entries_id)
|
|
46
|
+
if data_entry_file.nil?
|
|
47
|
+
@data_entries = []
|
|
48
|
+
else
|
|
49
|
+
@data_entries = Marshal.load(data_entry_file.data)
|
|
50
|
+
bad_request_error "Data entries (#{data_entries_id}) are not a 2D-Array" unless @data_entries.is_a? Array and @data_entries.first.is_a? Array
|
|
51
|
+
bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.size} rows, but dataset (#{id}) has #{compound_ids.size} compounds" unless @data_entries.size == compound_ids.size
|
|
52
|
+
bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries..first.size} columns, but dataset (#{id}) has #{feature_ids.size} features" unless @data_entries.first.size == feature_ids.size
|
|
53
|
+
$logger.debug "Retrieving data: #{Time.now-t}"
|
|
54
|
+
end
|
|
55
|
+
end
|
|
56
|
+
@data_entries
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
# Find data entry values for a given compound and feature
|
|
60
|
+
# @param compound [OpenTox::Compound] OpenTox Compound object
|
|
61
|
+
# @param feature [OpenTox::Feature] OpenTox Feature object
|
|
62
|
+
# @return [Array] Data entry values
|
|
63
|
+
def values(compound, feature)
|
|
64
|
+
rows = compound_ids.each_index.select{|r| compound_ids[r] == compound.id }
|
|
65
|
+
col = feature_ids.index feature.id
|
|
66
|
+
rows.collect{|row| data_entries[row][col]}
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
# Writers
|
|
70
|
+
|
|
71
|
+
# Set compounds
|
|
72
|
+
def compounds=(compounds)
|
|
73
|
+
self.compound_ids = compounds.collect{|c| c.id}
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
# Set features
|
|
77
|
+
def features=(features)
|
|
78
|
+
self.feature_ids = features.collect{|f| f.id}
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
# Dataset operations
|
|
82
|
+
|
|
83
|
+
# Split a dataset into n folds
|
|
84
|
+
# @param [Integer] number of folds
|
|
85
|
+
# @return [Array] Array with folds [training_dataset,test_dataset]
|
|
86
|
+
def folds n
|
|
87
|
+
len = self.compound_ids.size
|
|
88
|
+
indices = (0..len-1).to_a.shuffle
|
|
89
|
+
mid = (len/n)
|
|
90
|
+
chunks = []
|
|
91
|
+
start = 0
|
|
92
|
+
1.upto(n) do |i|
|
|
93
|
+
last = start+mid
|
|
94
|
+
last = last-1 unless len%n >= i
|
|
95
|
+
test_idxs = indices[start..last] || []
|
|
96
|
+
test_cids = test_idxs.collect{|i| self.compound_ids[i]}
|
|
97
|
+
test_data_entries = test_idxs.collect{|i| self.data_entries[i]}
|
|
98
|
+
test_dataset = self.class.new(:compound_ids => test_cids, :feature_ids => self.feature_ids, :data_entries => test_data_entries)
|
|
99
|
+
training_idxs = indices-test_idxs
|
|
100
|
+
training_cids = training_idxs.collect{|i| self.compound_ids[i]}
|
|
101
|
+
training_data_entries = training_idxs.collect{|i| self.data_entries[i]}
|
|
102
|
+
training_dataset = self.class.new(:compound_ids => training_cids, :feature_ids => self.feature_ids, :data_entries => training_data_entries)
|
|
103
|
+
test_dataset.save_all
|
|
104
|
+
training_dataset.save_all
|
|
105
|
+
chunks << [training_dataset,test_dataset]
|
|
106
|
+
start = last+1
|
|
107
|
+
end
|
|
108
|
+
chunks
|
|
109
|
+
end
|
|
110
|
+
|
|
111
|
+
# Diagnostics
|
|
112
|
+
|
|
113
|
+
def correlation_plot training_dataset
|
|
114
|
+
# TODO: create/store svg
|
|
115
|
+
R.assign "features", data_entries
|
|
116
|
+
R.assign "activities", training_dataset.data_entries.collect{|de| de.first}
|
|
117
|
+
R.eval "featurePlot(features,activities)"
|
|
118
|
+
end
|
|
119
|
+
|
|
120
|
+
def density_plot
|
|
121
|
+
# TODO: create/store svg
|
|
122
|
+
R.assign "acts", data_entries.collect{|r| r.first }#.compact
|
|
123
|
+
R.eval "plot(density(log(acts),na.rm= TRUE), main='log(#{features.first.name})')"
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
# Serialisation
|
|
127
|
+
|
|
128
|
+
# converts dataset to csv format including compound smiles as first column, other column headers are feature titles
|
|
129
|
+
# @return [String]
|
|
130
|
+
def to_csv(inchi=false)
|
|
131
|
+
CSV.generate() do |csv| #{:force_quotes=>true}
|
|
132
|
+
csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.title}
|
|
133
|
+
compounds.each_with_index do |c,i|
|
|
134
|
+
csv << [inchi ? c.inchi : c.smiles] + data_entries[i]
|
|
135
|
+
end
|
|
136
|
+
end
|
|
137
|
+
end
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
# Parsers
|
|
141
|
+
|
|
142
|
+
# Create a dataset from file (csv,sdf,...)
|
|
143
|
+
# @param filename [String]
|
|
144
|
+
# @return [String] dataset uri
|
|
145
|
+
# TODO
|
|
146
|
+
#def self.from_sdf_file
|
|
147
|
+
#end
|
|
148
|
+
|
|
149
|
+
# Create a dataset from CSV file
|
|
150
|
+
# TODO: document structure
|
|
151
|
+
def self.from_csv_file file, source=nil, bioassay=true
|
|
152
|
+
source ||= file
|
|
153
|
+
table = CSV.read file, :skip_blanks => true
|
|
154
|
+
dataset = self.new(:source => source, :name => File.basename(file))
|
|
155
|
+
dataset.parse_table table, bioassay
|
|
156
|
+
dataset
|
|
157
|
+
end
|
|
158
|
+
|
|
159
|
+
# parse data in tabular format (e.g. from csv)
|
|
160
|
+
# does a lot of guesswork in order to determine feature types
|
|
161
|
+
def parse_table table, bioassay=true
|
|
162
|
+
|
|
163
|
+
time = Time.now
|
|
164
|
+
|
|
165
|
+
# features
|
|
166
|
+
feature_names = table.shift.collect{|f| f.strip}
|
|
167
|
+
warnings << "Duplicate features in table header." unless feature_names.size == feature_names.uniq.size
|
|
168
|
+
compound_format = feature_names.shift.strip
|
|
169
|
+
bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i
|
|
170
|
+
|
|
171
|
+
numeric = []
|
|
172
|
+
# guess feature types
|
|
173
|
+
feature_names.each_with_index do |f,i|
|
|
174
|
+
metadata = {:name => f}
|
|
175
|
+
values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact
|
|
176
|
+
types = values.collect{|v| v.numeric? ? true : false}.uniq
|
|
177
|
+
if values.size == 0 # empty feature
|
|
178
|
+
elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes
|
|
179
|
+
metadata["numeric"] = true
|
|
180
|
+
numeric[i] = true
|
|
181
|
+
else
|
|
182
|
+
metadata["nominal"] = true
|
|
183
|
+
metadata["accept_values"] = values
|
|
184
|
+
numeric[i] = false
|
|
185
|
+
end
|
|
186
|
+
if bioassay
|
|
187
|
+
if metadata["numeric"]
|
|
188
|
+
feature = NumericBioAssay.find_or_create_by(metadata)
|
|
189
|
+
elsif metadata["nominal"]
|
|
190
|
+
feature = NominalBioAssay.find_or_create_by(metadata)
|
|
191
|
+
end
|
|
192
|
+
else
|
|
193
|
+
metadata.merge({:measured => false, :calculated => true})
|
|
194
|
+
if metadata["numeric"]
|
|
195
|
+
feature = NumericFeature.find_or_create_by(metadata)
|
|
196
|
+
elsif metadata["nominal"]
|
|
197
|
+
feature = NominalFeature.find_or_create_by(metadata)
|
|
198
|
+
end
|
|
199
|
+
end
|
|
200
|
+
feature_ids << feature.id
|
|
201
|
+
end
|
|
202
|
+
|
|
203
|
+
$logger.debug "Feature values: #{Time.now-time}"
|
|
204
|
+
time = Time.now
|
|
205
|
+
|
|
206
|
+
r = -1
|
|
207
|
+
compound_time = 0
|
|
208
|
+
value_time = 0
|
|
209
|
+
|
|
210
|
+
# compounds and values
|
|
211
|
+
@data_entries = [] #Array.new(table.size){Array.new(table.first.size-1)}
|
|
212
|
+
|
|
213
|
+
table.each_with_index do |vals,i|
|
|
214
|
+
ct = Time.now
|
|
215
|
+
identifier = vals.shift
|
|
216
|
+
warnings << "No feature values for compound at position #{i+2}." if vals.compact.empty?
|
|
217
|
+
begin
|
|
218
|
+
case compound_format
|
|
219
|
+
when /SMILES/i
|
|
220
|
+
compound = OpenTox::Compound.from_smiles(identifier)
|
|
221
|
+
when /InChI/i
|
|
222
|
+
compound = OpenTox::Compound.from_inchi(identifier)
|
|
223
|
+
end
|
|
224
|
+
rescue
|
|
225
|
+
compound = nil
|
|
226
|
+
end
|
|
227
|
+
if compound.nil?
|
|
228
|
+
# compound parsers may return nil
|
|
229
|
+
warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored."
|
|
230
|
+
next
|
|
231
|
+
end
|
|
232
|
+
# TODO insert empty compounds to keep positions?
|
|
233
|
+
compound_time += Time.now-ct
|
|
234
|
+
|
|
235
|
+
r += 1
|
|
236
|
+
unless vals.size == feature_ids.size # way cheaper than accessing features
|
|
237
|
+
warnings << "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored."
|
|
238
|
+
next
|
|
239
|
+
end
|
|
240
|
+
|
|
241
|
+
compound_ids << compound.id
|
|
242
|
+
@data_entries << Array.new(table.first.size-1)
|
|
243
|
+
|
|
244
|
+
vals.each_with_index do |v,j|
|
|
245
|
+
if v.blank?
|
|
246
|
+
warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})."
|
|
247
|
+
next
|
|
248
|
+
elsif numeric[j]
|
|
249
|
+
@data_entries.last[j] = v.to_f
|
|
250
|
+
else
|
|
251
|
+
@data_entries.last[j] = v.strip
|
|
252
|
+
end
|
|
253
|
+
end
|
|
254
|
+
end
|
|
255
|
+
compounds.duplicates.each do |compound|
|
|
256
|
+
positions = []
|
|
257
|
+
compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi == compound.inchi}
|
|
258
|
+
warnings << "Duplicate compound #{compound.inchi} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
|
|
259
|
+
end
|
|
260
|
+
|
|
261
|
+
$logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})"
|
|
262
|
+
time = Time.now
|
|
263
|
+
save_all
|
|
264
|
+
$logger.debug "Saving: #{Time.now-time}"
|
|
265
|
+
|
|
266
|
+
end
|
|
267
|
+
|
|
268
|
+
=begin
|
|
269
|
+
# TODO remove
|
|
270
|
+
|
|
271
|
+
# Create a dataset with compounds and features
|
|
272
|
+
def self.create compounds, features, warnings=[], source=nil
|
|
273
|
+
dataset = Dataset.new(:warnings => warnings)
|
|
274
|
+
dataset.compounds = compounds
|
|
275
|
+
dataset.features = features
|
|
276
|
+
dataset
|
|
277
|
+
end
|
|
278
|
+
# merge dataset (i.e. append features)
|
|
279
|
+
def +(dataset)
|
|
280
|
+
bad_request_error "Dataset merge failed because the argument is not a OpenTox::Dataset but a #{dataset.class}" unless dataset.is_a? Dataset
|
|
281
|
+
bad_request_error "Dataset merge failed because compounds are unequal in datasets #{self.id} and #{dataset.id}" unless compound_ids == dataset.compound_ids
|
|
282
|
+
self.feature_ids ||= []
|
|
283
|
+
self.feature_ids = self.feature_ids + dataset.feature_ids
|
|
284
|
+
@data_entries ||= Array.new(compound_ids.size){[]}
|
|
285
|
+
@data_entries.each_with_index do |row,i|
|
|
286
|
+
@data_entries[i] = row + dataset.fingerprint(compounds[i])
|
|
287
|
+
end
|
|
288
|
+
self
|
|
289
|
+
|
|
290
|
+
end
|
|
291
|
+
|
|
292
|
+
def fingerprint(compound)
|
|
293
|
+
i = compound_ids.index(compound.id)
|
|
294
|
+
i.nil? ? nil : data_entries[i]
|
|
295
|
+
end
|
|
296
|
+
=end
|
|
297
|
+
|
|
298
|
+
# Fill unset data entries
|
|
299
|
+
# @param any value
|
|
300
|
+
def fill_nil_with n
|
|
301
|
+
(0 .. compound_ids.size-1).each do |i|
|
|
302
|
+
@data_entries[i] ||= []
|
|
303
|
+
(0 .. feature_ids.size-1).each do |j|
|
|
304
|
+
@data_entries[i][j] ||= n
|
|
305
|
+
end
|
|
306
|
+
end
|
|
307
|
+
end
|
|
308
|
+
end
|
|
309
|
+
|
|
310
|
+
# Dataset for lazar predictions
|
|
311
|
+
class LazarPrediction < Dataset
|
|
312
|
+
field :creator, type: String
|
|
313
|
+
field :prediction_feature_id, type: String
|
|
314
|
+
|
|
315
|
+
def prediction_feature
|
|
316
|
+
Feature.find prediction_feature_id
|
|
317
|
+
end
|
|
318
|
+
|
|
319
|
+
end
|
|
320
|
+
|
|
321
|
+
# Dataset for descriptors (physchem)
|
|
322
|
+
  # Dataset for descriptors (physchem)
  class DescriptorDataset < Dataset
    # name of the algorithm that calculated the descriptor values
    field :feature_calculation_algorithm, type: String
  end
|
|
325
|
+
|
|
326
|
+
# Dataset for fminer descriptors
|
|
327
|
+
  # Dataset for fminer descriptors
  class FminerDataset < DescriptorDataset
    # fminer variant used for training (e.g. bbrc, last)
    field :training_algorithm, type: String
    # dataset the fragments were mined from
    field :training_dataset_id, type: BSON::ObjectId
    # endpoint feature used for class-correlated mining
    field :training_feature_id, type: BSON::ObjectId
    # parameters passed to the mining algorithm
    field :training_parameters, type: Hash
  end
|
|
333
|
+
|
|
334
|
+
end
|