opentox-ruby 3.1.0 → 4.0.0

data/ChangeLog CHANGED
@@ -1,13 +1,23 @@
+ v4.0.0 2012-07-12
+ * fminer addition of compounds fixed
+ * improved performance for CSV download
+ * switch to opentox-ruby version 4.0.0
+
+ 2012-04-20
+ * Support for joelib and openbabel descriptors in a completely unified interface with CDK (Ambit)
+ * Features can have multiple types (nominal and numeric), PC descriptors have detailed meta data
+ * Myriads of bugfixes to CSV download code (e.g. missing descriptors, handling of duplicates)
+
  v3.1.0 2012-02-24
- * utils.rb: added for special routines (e.g. descriptor calculation)
- * task.rb: Polling with increasing interval
- * parser.rb: CSV up and download fixed
- * transform.rb: routines to create machine learning data matrices
- * algorithm.rb: SVM parameter grid search, cos similarity as algorithm,
- gauss() removed
+ * utils.rb: added for special routines (e.g. descriptor calculation)
+ * task.rb: Polling with increasing interval
+ * parser.rb: CSV up and download fixed
+ * transform.rb: routines to create machine learning data matrices
+ * algorithm.rb: SVM parameter grid search, cos similarity as algorithm, gauss() removed

  v3.0.1 2011-10-19
- * feature: model registration to ontology service
- * ontology lib gets endpoints from ontology service
+ * feature: model registration to ontology service
+ * ontology lib gets endpoints from ontology service
+
  v3.0.0 2011-09-23
- * datasets stored as json (with Yajl) to improve performance
+ * datasets stored as json (with Yajl) to improve performance
@@ -38,4 +38,4 @@ This example shows how to create a lazar model and predict a compound, it assume
  Copyright
  ---------

- Copyright (c) 2009-2011 Christoph Helma, Martin Guetlein, Micha Rautenberg, Andreas Maunz, David Vorgrimmler, Denis Gebele. See LICENSE for details.
+ Copyright (c) 2009-2012 Christoph Helma, Martin Guetlein, Micha Rautenberg, Andreas Maunz, David Vorgrimmler, Denis Gebele. See LICENSE for details.
data/Rakefile CHANGED
@@ -42,9 +42,10 @@ begin
  gem.add_dependency "dm-migrations", "=1.1.0"
  gem.add_dependency "dm-validations", "=1.1.0"
  gem.add_dependency "dm-sqlite-adapter", "=1.1.0"
- gem.add_dependency "ruby-plot", "=0.6.0"
+ gem.add_dependency "ruby-plot", "=0.6.1"
  gem.add_dependency "gsl", "=1.14.7"
  gem.add_dependency "statsample", "=1.1.0"
+ gem.add_dependency "redis", "=2.2.2"

  gem.add_development_dependency 'jeweler'
  gem.files = FileList["[A-Z]*", "{bin,generators,lib,test}/**/*", 'lib/jeweler/templates/.gitignore']
data/VERSION CHANGED
@@ -1 +1 @@
- 3.1.0
+ 4.0.0
@@ -56,25 +56,73 @@ module OpenTox

  def check_params(params,per_mil,subjectid=nil)
  raise OpenTox::NotFoundError.new "Please submit a dataset_uri." unless params[:dataset_uri] and !params[:dataset_uri].nil?
- raise OpenTox::NotFoundError.new "Please submit a prediction_feature." unless params[:prediction_feature] and !params[:prediction_feature].nil?
- @prediction_feature = OpenTox::Feature.find params[:prediction_feature], subjectid
  @training_dataset = OpenTox::Dataset.find "#{params[:dataset_uri]}", subjectid
+
+ unless params[:prediction_feature] # try to read prediction_feature from dataset
+ raise OpenTox::NotFoundError.new "Please provide a prediction_feature parameter" unless @training_dataset.features.size == 1
+ prediction_feature = OpenTox::Feature.find(@training_dataset.features.keys.first,@subjectid)
+ params[:prediction_feature] = prediction_feature.uri
+ end
+ @prediction_feature = OpenTox::Feature.find params[:prediction_feature], subjectid
+
  raise OpenTox::NotFoundError.new "No feature #{params[:prediction_feature]} in dataset #{params[:dataset_uri]}" unless @training_dataset.features and @training_dataset.features.include?(params[:prediction_feature])

  unless params[:min_frequency].nil?
- @minfreq=params[:min_frequency].to_i
- raise "Minimum frequency must be a number >0!" unless @minfreq>0
- else
- @minfreq=OpenTox::Algorithm.min_frequency(@training_dataset,per_mil) # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
+ # check for percentage
+ if params[:min_frequency].include? "pc"
+ per_mil=params[:min_frequency].gsub(/pc/,"")
+ if OpenTox::Algorithm.numeric? per_mil
+ per_mil = per_mil.to_i * 10
+ else
+ bad_request=true
+ end
+ # check for per-mil
+ elsif params[:min_frequency].include? "pm"
+ per_mil=params[:min_frequency].gsub(/pm/,"")
+ if OpenTox::Algorithm.numeric? per_mil
+ per_mil = per_mil.to_i
+ else
+ bad_request=true
+ end
+ # set minfreq directly
+ else
+ if OpenTox::Algorithm.numeric? params[:min_frequency]
+ @minfreq=params[:min_frequency].to_i
+ LOGGER.debug "min_frequency #{@minfreq}"
+ else
+ bad_request=true
+ end
+ end
+ raise OpenTox::BadRequestError.new "Minimum frequency must be integer [n], or a percentage [n]pc, or a per-mil [n]pm , with n greater 0" if bad_request
+ end
+ if @minfreq.nil?
+ @minfreq=OpenTox::Algorithm.min_frequency(@training_dataset,per_mil)
+ LOGGER.debug "min_frequency #{@minfreq} (input was #{per_mil} per-mil)"
  end
  end

- def add_fminer_data(fminer_instance, params, value_map)
+ def add_fminer_data(fminer_instance, value_map)
+
+
+ # detect nr duplicates per compound
+ compound_sizes = {}
+ @training_dataset.compounds.each do |compound|
+ entries=@training_dataset.data_entries[compound]
+ entries.each do |feature, values|
+ compound_sizes[compound] || compound_sizes[compound] = []
+ compound_sizes[compound] << values.size unless values.size == 0
+ end
+ compound_sizes[compound].uniq!
+ raise "Inappropriate data for fminer" if compound_sizes[compound].size > 1
+ compound_sizes[compound] = compound_sizes[compound][0] # integer instead of array
+ end

  id = 1 # fminer start id is not 0
- @training_dataset.data_entries.each do |compound,entry|
+
+ @training_dataset.compounds.each do |compound|
+ entry=@training_dataset.data_entries[compound]
  begin
- smiles = OpenTox::Compound.smiles(compound.to_s)
+ smiles = OpenTox::Compound.new(compound).to_smiles
  rescue
  LOGGER.warn "No resource for #{compound.to_s}"
  next
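The reworked check_params above accepts min_frequency in three forms: a plain integer n (used directly as @minfreq), a percentage "[n]pc" (converted to per-mil by multiplying by 10), or a per-mil value "[n]pm"; the latter two are passed on to OpenTox::Algorithm.min_frequency. A minimal standalone sketch of that parsing rule (parse_min_frequency is a hypothetical helper, not part of the gem):

    # Hypothetical helper mirroring the min_frequency handling in check_params.
    # Returns [minfreq, per_mil]; only one of the two is derived from the input.
    def parse_min_frequency(raw, default_per_mil = 8)
      per_mil = default_per_mil
      minfreq = nil
      case raw.to_s
      when /\A(\d+)pc\z/ then per_mil = $1.to_i * 10 # percentage -> per-mil
      when /\A(\d+)pm\z/ then per_mil = $1.to_i      # per-mil as given
      when /\A\d+\z/     then minfreq = raw.to_i     # absolute minimum frequency
      else raise ArgumentError, "min_frequency must be [n], [n]pc or [n]pm with n > 0"
      end
      [minfreq, per_mil]
    end

    parse_min_frequency("30")  # => [30, 8]
    parse_min_frequency("5pc") # => [nil, 50]
    parse_min_frequency("8pm") # => [nil, 8]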
@@ -84,32 +132,31 @@ module OpenTox
  next
  end

- value_map=params[:value_map] unless params[:value_map].nil?
  entry.each do |feature,values|
  if feature == @prediction_feature.uri
- values.each do |value|
- if value.nil?
+ (0...compound_sizes[compound]).each { |i|
+ if values[i].nil?
  LOGGER.warn "No #{feature} activity for #{compound.to_s}."
  else
  if @prediction_feature.feature_type == "classification"
- activity= value_map.invert[value.to_s].to_i # activities are mapped to 1..n
+ activity= value_map.invert[values[i]].to_i # activities are mapped to 1..n
  @db_class_sizes[activity-1].nil? ? @db_class_sizes[activity-1]=1 : @db_class_sizes[activity-1]+=1 # AM effect
  elsif @prediction_feature.feature_type == "regression"
- activity= value.to_f
+ activity= values[i].to_f
  end
  begin
- fminer_instance.AddCompound(smiles,id)
- fminer_instance.AddActivity(activity, id)
+ fminer_instance.AddCompound(smiles,id) if fminer_instance
+ fminer_instance.AddActivity(activity, id) if fminer_instance
  @all_activities[id]=activity # DV: insert global information
  @compounds[id] = compound
  @smi[id] = smiles
  id += 1
  rescue Exception => e
- LOGGER.warn "Could not add " + smiles + "\t" + value.to_s + " to fminer"
+ LOGGER.warn "Could not add " + smiles + "\t" + values[i].to_s + " to fminer"
  LOGGER.warn e.backtrace
  end
  end
- end
+ }
  end
  end
  end
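The rewritten add_fminer_data above iterates over @training_dataset.compounds rather than data_entries, so a compound with several measurements is added to fminer once per measurement. The number of duplicates per compound is derived from the lengths of the value arrays, which must agree across features. A rough sketch of that counting step on a toy data_entries hash (the compound and feature URIs are placeholders; the nested structure is assumed to match opentox-ruby datasets):

    # Toy data_entries: compound URI => { feature URI => [measured values] }.
    data_entries = {
      "http://host/compound/c1" => { "http://host/feature/f1" => [1.0, 2.0] }, # duplicate measurement
      "http://host/compound/c2" => { "http://host/feature/f1" => [0.5] },
    }

    # Count measurements per compound; mixed counts across features are rejected.
    compound_sizes = {}
    data_entries.each do |compound, entry|
      sizes = entry.values.map { |values| values.size }.reject { |s| s == 0 }.uniq
      raise "Inappropriate data for fminer" if sizes.size > 1
      compound_sizes[compound] = sizes.first
    end

    compound_sizes # => {"http://host/compound/c1"=>2, "http://host/compound/c2"=>1}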
@@ -380,11 +427,11 @@ module OpenTox
  prediction = acts[0]
  else
  #LOGGER.debug gram_matrix.to_yaml
- @r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests
- @r.eval "set.seed(1)"
+ @r = RinRuby.new(true,false) # global R instance leads to Socket errors after a large number of requests
  @r.eval "suppressPackageStartupMessages(library('caret'))" # requires R packages "caret" and "kernlab"
  @r.eval "suppressPackageStartupMessages(library('doMC'))" # requires R packages "multicore"
  @r.eval "registerDoMC()" # switch on parallel processing
+ @r.eval "set.seed(1)"
  begin

  # set data
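The model-building code drives R through a dedicated RinRuby session (a global instance caused socket errors, as the comment above notes), with set.seed(1) now issued after the packages are loaded. For readers unfamiliar with the pattern, a minimal RinRuby round trip (a sketch assuming R and the rinruby gem are installed; the variables x and m are illustrative):

    require 'rubygems'
    require 'rinruby'

    r = RinRuby.new(true, false)   # echo R output, non-interactive session
    r.eval "set.seed(1)"
    r.assign "x", [1.0, 2.0, 3.0]  # push a Ruby array into the R session
    r.eval "m <- mean(x)"
    puts r.pull("m")               # => 2.0
    r.quit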
@@ -400,7 +447,14 @@ module OpenTox

  # prepare data
  LOGGER.debug "Preparing R data ..."
- @r.eval "if (class(y) == 'character') { y = factor(y); suppressPackageStartupMessages(library('class')) }" # For classification
+ @r.eval <<-EOR
+ weights=NULL
+ if (class(y) == 'character') {
+ y = factor(y)
+ suppressPackageStartupMessages(library('class'))
+ #weights=unlist(as.list(prop.table(table(y))))
+ }
+ EOR

  @r.eval <<-EOR
  rem = nearZeroVar(prop_matrix)
@@ -417,8 +471,18 @@ module OpenTox

  # model + support vectors
  LOGGER.debug "Creating R SVM model ..."
- @r.eval <<-EOR
- model = train(prop_matrix,y,method="svmradial",tuneLength=8,trControl=trainControl(method="LGOCV",number=10),preProcess=c("center", "scale"))
+ train_success = @r.eval <<-EOR
+ # AM: TODO: evaluate class weight effect by altering:
+ # AM: comment in 'weights' above run and class.weights=weights vs. class.weights=1-weights
+ # AM: vs
+ # AM: comment out 'weights' above (status quo), thereby disabling weights
+ model = train(prop_matrix,y,
+ method="svmradial",
+ preProcess=c("center", "scale"),
+ class.weights=weights,
+ trControl=trainControl(method="LGOCV",number=10),
+ tuneLength=8
+ )
  perf = ifelse ( class(y)!='numeric', max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
  EOR

@@ -431,6 +495,7 @@ module OpenTox

  # censoring
  prediction = nil if ( @r.perf.nan? || @r.perf < min_train_performance )
+ prediction = nil unless train_success
  LOGGER.debug "Performance: #{sprintf("%.2f", @r.perf)}"
  rescue Exception => e
  LOGGER.debug "#{e.class}: #{e.message}"
@@ -456,30 +521,42 @@ module OpenTox
  @r.del_missing = params[:del_missing] == true ? 1 : 0
  r_result_file = params[:fds_csv_file].sub("rfe_", "rfe_R_")
  @r.f_fds_r = r_result_file.to_s
-
+
  # need packs 'randomForest', 'RANN'
  @r.eval <<-EOR
- set.seed(1)
  suppressPackageStartupMessages(library('caret'))
  suppressPackageStartupMessages(library('randomForest'))
  suppressPackageStartupMessages(library('RANN'))
  suppressPackageStartupMessages(library('doMC'))
  registerDoMC()
-
+ set.seed(1)
+
  acts = read.csv(ds_csv_file, check.names=F)
  feats = read.csv(fds_csv_file, check.names=F)
  ds = merge(acts, feats, by="SMILES") # duplicates features for duplicate SMILES :-)
-
+
  features = ds[,(dim(acts)[2]+1):(dim(ds)[2])]
  y = ds[,which(names(ds) == prediction_feature)]
-
+
  # assumes a data matrix 'features' and a vector 'y' of target values
  row.names(features)=NULL
-
+
+ # features with all values missing removed
+ na_col = names ( which ( apply ( features, 2, function(x) all ( is.na ( x ) ) ) ) )
+ features = features[,!names(features) %in% na_col]
+
+ # features with infinite values removed
+ inf_col = names ( which ( apply ( features, 2, function(x) any ( is.infinite ( x ) ) ) ) )
+ features = features[,!names(features) %in% inf_col]
+
+ # features with zero variance removed
+ zero_var = names ( which ( apply ( features, 2, function(x) var(x, na.rm=T) ) == 0 ) )
+ features = features[,!names(features) %in% zero_var]
+
  pp = NULL
  if (del_missing) {
  # needed if rows should be removed
- na_ids = apply(features,1,function(x)any(is.na(x)))
+ na_ids = apply ( features,1,function(x) any ( is.na ( x ) ) )
  features = features[!na_ids,]
  y = y[!na_ids]
  pp = preProcess(features, method=c("scale", "center"))
@@ -488,17 +565,23 @@ module OpenTox
  pp = preProcess(features, method=c("scale", "center", "knnImpute"))
  }
  features = predict(pp, features)
-
+
+ # features with nan values removed (sometimes preProcess return NaN values)
+ nan_col = names ( which ( apply ( features, 2, function(x) any ( is.nan ( x ) ) ) ) )
+ features = features[,!names(features) %in% nan_col]
+
  # determine subsets
- subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
- subsets = c(2,3,4,5,7,10,subsets)
+ subsets = dim(features)[2]*c(0.3, 0.32, 0.34, 0.36, 0.38, 0.4, 0.42, 0.44, 0.46, 0.48, 0.5, 0.52, 0.54, 0.56, 0.58, 0.6, 0.62, 0.64, 0.66, 0.68, 0.7)
+ #subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
+ #subsets = c(2,3,4,5,7,10,subsets)
+ #subsets = c(2,3,4,5,7,10,13,16,19,22,25,28,30)
  subsets = unique(sort(round(subsets)))
  subsets = subsets[subsets<=dim(features)[2]]
  subsets = subsets[subsets>1]
-
+
  # Recursive feature elimination
- rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=50), sizes=subsets)
-
+ rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=150), sizes=subsets)
+
  # read existing dataset and select most useful features
  csv=feats[,c("SMILES", rfProfile$optVariables)]
  write.csv(x=csv,file=f_fds_r, row.names=F, quote=F, na='')
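The R blocks above first drop descriptor columns that are entirely missing, contain infinite values or have zero variance, remove NaN columns after preprocessing, and then run caret's recursive feature elimination over a revised set of subset sizes with more resampling iterations (number=150). For orientation, a rough Ruby analogue of the three column filters on a feature => values hash (illustrative only; the gem performs this step in R):

    # features: descriptor name => array of numeric values (nil = missing).
    features = {
      "logP"  => [1.2, 0.8, 2.4],
      "empty" => [nil, nil, nil],              # all values missing -> dropped
      "inf"   => [1.0, Float::INFINITY, 2.0],  # infinite value -> dropped
      "const" => [3.0, 3.0, 3.0],              # zero variance -> dropped
    }

    def variance(values)
      mean = values.inject(0.0) { |s, v| s + v } / values.size
      values.inject(0.0) { |s, v| s + (v - mean)**2 } / values.size
    end

    cleaned = features.reject do |_name, values|
      present = values.compact
      present.empty? ||
        present.any? { |v| v.infinite? } ||
        variance(present).zero?
    end

    cleaned.keys # => ["logP"]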
@@ -527,7 +610,7 @@ module OpenTox
  # @param [Hash] required keys: compound, features, feature_dataset_uri, pc_type
  # @return [Hash] Hash with matching Smarts and number of hits
  def self.lookup(params)
- params[:compound].lookup(params[:features], params[:feature_dataset_uri],params[:pc_type],params[:subjectid])
+ params[:compound].lookup(params[:features], params[:feature_dataset_uri], params[:pc_type], params[:lib], params[:subjectid])
  end
  end

@@ -539,3 +622,26 @@ module OpenTox
  end
  end
  end
+
+ class Array
+ # collect method extended for parallel processing.
+ # Note: assign return value as: ans = arr.pcollect(n) { |obj| ... }
+ # @param n the number of processes to spawn (default: unlimited)
+ def pcollect(n = nil)
+ nproc = 0
+ result = collect do |*a|
+ r, w = IO.pipe
+ fork do
+ r.close
+ w.write( Marshal.dump( yield(*a) ) )
+ end
+ if n and (nproc+=1) >= n
+ Process.wait ; nproc -= 1
+ end
+ [ w.close, r ].last
+ end
+ Process.waitall
+ result.collect{|r| Marshal.load [ r.read, r.close ].first}
+ end
+ end
+
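A short usage sketch for the Array#pcollect extension added above: it forks one child process per element (optionally capped at n) and ships each block result back through a pipe with Marshal, so the block's return value must be marshal-able and a fork-capable (Unix-like) platform is required:

    # Square each number in its own process, at most 4 processes at a time.
    squares = [1, 2, 3, 4, 5].pcollect(4) { |i| i * i }
    puts squares.inspect # => [1, 4, 9, 16, 25]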
@@ -3,6 +3,7 @@

  module OpenTox

+ require "rexml/document"
  # Ruby wrapper for OpenTox Compound Webservices (http://opentox.org/dev/apis/api-1.2/structure).
  class Compound

@@ -17,16 +18,20 @@ module OpenTox
  # @return [Compound] Compound
  def initialize(uri=nil)
  @uri = uri
- case @uri
- when /InChI/ # shortcut for IST services
- @inchi = @uri.sub(/^.*InChI/, 'InChI')
+ if (@uri =~ URI::regexp) || @uri.nil?
+ case @uri
+ when /InChI/ # shortcut for IST services
+ @inchi = @uri.sub(/^.*InChI/, 'InChI')
+ else
+ @inchi = RestClientWrapper.get(@uri, :accept => 'chemical/x-inchi').to_s.chomp if @uri
+ end
+
+ if @uri and @inchi.to_s.size==0
+ LOGGER.warn "REMOVE ABMIT HACK: no inchi for compound "+@uri.to_s+", load via smiles"
+ @inchi = Compound.smiles2inchi(Compound.smiles(@uri))
+ end
  else
- @inchi = RestClientWrapper.get(@uri, :accept => 'chemical/x-inchi').to_s.chomp if @uri
- end
-
- if @uri and @inchi.to_s.size==0
- LOGGER.warn "REMOVE ABMIT HACK: no inchi for compound "+@uri.to_s+", load via smiles"
- @inchi = Compound.smiles2inchi(Compound.smiles(@uri))
+ raise "Not able to create compound with uri: #{@uri}"
  end
  end
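With the guard added above, Compound.new accepts nil or a string matching URI::regexp and raises for anything else. A brief example (the compound URI below is illustrative; resolving non-InChI URIs requires the corresponding webservice):

    require 'rubygems'
    require 'opentox-ruby'

    # InChI-style URI: the InChI is extracted directly from the URI.
    c = OpenTox::Compound.new("http://webservices.in-silico.ch/compound/InChI=1S/CH4/h1H4")
    puts c.to_inchi

    # Anything that is not a URI now raises instead of yielding an empty compound.
    begin
      OpenTox::Compound.new("not a uri")
    rescue => e
      puts e.message # => "Not able to create compound with uri: not a uri"
    end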
 
@@ -130,6 +135,47 @@ module OpenTox
  "not available"
  end
  end
+
+
+ # Get all known compound names sorted by classification. Relies on an external service for name lookups.
+ # @example
+ # names = compound.to_names_hash
+ # @return [Hash] Classification => Name Array
+ def to_names_hash
+ begin
+ xml = RestClientWrapper.get("#{@@cactus_uri}#{@inchi}/names/xml")
+ xmldoc = REXML::Document.new(xml)
+ data = {}
+
+ xmldoc.root.elements[1].elements.each{|e|
+ if data.has_key?(e.attribute("classification").value) == false
+ data[e.attribute("classification").value] = [e.text]
+ else
+ data[e.attribute("classification").value].push(e.text)
+ end
+ }
+ data
+ rescue
+ "not available"
+ end
+ end
+
+ # Get all known compound names sorted by classification. Relies on an external service for name lookups.
+ # @example
+ # names = compound.to_names_hash
+ # @return [Hash] Classification => Name Array
+ def to_ambit_names_hash
+ begin
+ ds = OpenTox::Dataset.new
+ ds.save
+ ds.load_rdfxml(RestClientWrapper.get("http://apps.ideaconsult.net:8080/ambit2/query/compound/search/names?type=smiles&property=&search=#{@inchi}"))
+ ds.save
+ ds.uri
+ rescue
+ "not available"
+ end
+ end
+

  # Match a smarts string
  # @example
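The two helpers added above look up names for the compound's InChI: to_names_hash queries the Cactus resolver and groups the returned names by their classification attribute, while to_ambit_names_hash runs a name search against Ambit and returns the URI of a freshly saved dataset. A usage sketch (network access required; the hash keys shown are only illustrative of Cactus classifications):

    compound = OpenTox::Compound.from_smiles("c1ccccc1") # benzene
    names = compound.to_names_hash
    # e.g. {"name" => ["benzene", ...], "registry number" => ["71-43-2", ...], ...}
    names.each { |classification, list| puts "#{classification}: #{list.join(', ')}" } if names.is_a?(Hash)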
@@ -197,25 +243,28 @@ module OpenTox
  # Lookup numerical values, returns hash with feature name as key and value as value
  # @param [Array] Array of feature names
  # @param [String] Feature dataset uri
+ # @param [String] Comma separated pc types
+ # @param [String] Comma separated lib
  # @return [Hash] Hash with feature name as key and value as value
- def lookup(feature_array,feature_dataset_uri,pc_type,subjectid=nil)
+ def lookup(feature_array,feature_dataset_uri,pc_type,lib,subjectid=nil)
  ds = OpenTox::Dataset.find(feature_dataset_uri,subjectid)
  #entry = ds.data_entries[self.uri]
  entry = nil
- ds.data_entries.each { |c_uri, values|
- if c_uri.split('/compound/').last == self.to_inchi
- entry = ds.data_entries[self.uri]
+ ds.data_entries.each { |c_uri, values|
+ compound = OpenTox::Compound.new(c_uri)
+ if compound.to_inchi == self.to_inchi # Compare compounds by InChI
+ entry = ds.data_entries[c_uri]
  break
  end
  }
  LOGGER.debug "#{entry.size} entries in feature ds for query." unless entry.nil?
-
  if entry.nil?
- uri, smiles_to_inchi = OpenTox::Algorithm.get_pc_descriptors({:compounds => [self.uri], :pc_type => pc_type})
- uri = OpenTox::Algorithm.load_ds_csv(uri, smiles_to_inchi, subjectid)
- ds = OpenTox::Dataset.find(uri,subjectid)
+ temp_ds = OpenTox::Dataset.create(CONFIG[:services]["opentox-dataset"],subjectid); temp_ds.add_compound(self.uri); temp_uri = temp_ds.save(subjectid)
+ uri = RestClientWrapper.post(File.join(CONFIG[:services]["opentox-algorithm"], "/pc/AllDescriptors"), {:dataset_uri => temp_uri, :pc_type => pc_type, :lib => lib, :subjectid => subjectid})
+ ds = OpenTox::Dataset.find(uri, subjectid)
  entry = ds.data_entries[self.uri]
  ds.delete(subjectid)
+ temp_ds.delete(subjectid)
  end
  features = entry.keys
  features.each { |feature|
@@ -224,7 +273,6 @@ module OpenTox
  entry.delete(feature) unless feature == new_feature # e.g. when loading from ambit
  }
  #res = feature_array.collect {|v| entry[v]}
- #LOGGER.debug "----- am #{entry.to_yaml}"
  entry
  end
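Compound#lookup now takes comma-separated pc_type and lib selections; when the compound is not already in the feature dataset, it wraps the query compound in a temporary dataset, posts it to the pc/AllDescriptors algorithm service and deletes both temporary datasets afterwards. A hedged call sketch (all URIs and descriptor selections below are placeholders):

    compound = OpenTox::Compound.from_smiles("CCO")

    feature_names   = ["XLogP", "TPSA"]               # features of interest
    feature_dataset = "http://my-ot-host/dataset/123" # placeholder URI
    values = compound.lookup(feature_names, feature_dataset,
                             "constitutional,electronic", # pc_type selection
                             "cdk",                       # descriptor lib
                             nil)                         # subjectid
    # => hash of feature name => numeric value for this compound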