opentox-ruby 3.0.1 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -70,7 +70,7 @@ module OpenTox
 
   begin
     #LOGGER.debug "RestCall: "+rest_call.to_s+" "+uri.to_s+" "+headers.inspect+" "+payload.inspect
-    resource = RestClient::Resource.new(uri,{:timeout => 60})
+    resource = RestClient::Resource.new(uri,{:timeout => 600})
     if rest_call=="post" || rest_call=="put"
       result = resource.send(rest_call, payload, headers)
     else
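
The only change in this hunk raises the client-side timeout from 60 to 600 seconds, so long-running service calls (e.g. model building) are no longer cut off after a minute. A minimal sketch of the rest-client API in question (the URI is illustrative, not from the gem):

    require 'rubygems'
    require 'rest_client'

    # :timeout is passed through to the underlying HTTP request;
    # 600 s matches the new value in 3.1.0.
    resource = RestClient::Resource.new("http://example.org/task/1", {:timeout => 600})
    result = resource.get   # GET; post/put additionally take (payload, headers)
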
data/lib/serializer.rb CHANGED
@@ -55,7 +55,7 @@ module OpenTox
   OT.predictedVariables => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
   OT.paramValue => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
 
-  #object props for validation#
+  #object props for validation#
   OT.model => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
   OT.trainingDataset => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
   OT.predictionFeature => { RDF["type"] => [{ "type" => "uri", "value" => OWL.ObjectProperty }] } ,
@@ -87,7 +87,7 @@ module OpenTox
   OT.percentageCompleted => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
   OT.acceptValue => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
 
-  # annotation props for validation
+  # annotation props for validation
   OT.numUnpredicted => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
   OT.crossvalidationFold => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
   OT.numInstances => { RDF["type"] => [{ "type" => "uri", "value" => OWL.AnnotationProperty }] } ,
@@ -143,8 +143,8 @@ module OpenTox
   @data_entries = {}
   @values_id = 0
   @parameter_id = 0
-
-  @classes = Set.new
+
+  @classes = Set.new
   @object_properties = Set.new
   @annotation_properties = Set.new
   @datatype_properties = Set.new
@@ -208,7 +208,7 @@ module OpenTox
     @object[uri] = { RDF["type"] => [{ "type" => "uri", "value" => OT.Task }] }
     add_metadata uri, metadata
   end
-
+
   # Add a resource defined by resource_class and content
   # (see documentation of add_content for example)
   # @param [String] uri of resource
@@ -223,10 +223,10 @@ module OpenTox
   def add_uri(uri,type)
     @object[uri] = { RDF["type"] => [{ "type" => "uri", "value" => type }] }
   end
-
+
   private
   @@content_id = 1
-
+
   #Recursiv function to add content
   #@example
   #  { DC.description => "bla",
@@ -244,7 +244,7 @@ module OpenTox
   hash.each do |u,v|
     if v.is_a? Hash
       # value is again a hash, i.e. a new owl class is added
-      # first make sure type (==class) is set
+      # first make sure type (==class) is set
       type = v[RDF.type]
       raise "type missing for "+u.to_s+" content:\n"+v.inspect unless type
       raise "class unknown "+type.to_s+" (for "+u.to_s+")" unless @object.has_key?(type)
@@ -256,7 +256,7 @@ module OpenTox
       # add content to new class
       add_content(genid,v)
     elsif v.is_a? Array
-      # value is an array, i.e. a list of values with property is added
+      # value is an array, i.e. a list of values with property is added
       v.each{ |vv| add_content( uri, { u => vv } ) }
     else # v.is_a? String
       # simple string value
@@ -268,7 +268,7 @@ module OpenTox
       end
     end
   end
-
+
   public
 
   # Add metadata
@@ -329,7 +329,7 @@ module OpenTox
     v = [{ "type" => "uri", "value" => value}]
   when "literal"
     v = [{ "type" => "literal", "value" => value, "datatype" => datatype(value) }]
-  else
+  else
     raise "Illegal type #{type(value)} for #{value}."
   end
   @object[values] = {
@@ -342,7 +342,7 @@ module OpenTox
   end
 
   # Serializers
-
+
   # Convert to N-Triples
   # @return [text/plain] Object OWL-DL in N-Triples format
   def to_ntriples
@@ -353,7 +353,7 @@ module OpenTox
   entry.each do |p,objects|
     p = url(p)
     objects.each do |o|
-      case o["type"]
+      case o["type"]
       when "uri"
         o = url(o["value"])
       when "literal"
@@ -371,9 +371,15 @@ module OpenTox
   # Convert to RDF/XML
   # @return [text/plain] Object OWL-DL in RDF/XML format
   def to_rdfxml
-    Tempfile.open("owl-serializer"){|f| f.write(self.to_ntriples); @path = f.path}
+    tmpf = Tempfile.open("owl-serializer")
+    tmpf.write(self.to_ntriples)
+    tmpf.flush
+    @path = tmpf.path
     # TODO: add base uri for ist services
-    `rapper -i ntriples -f 'xmlns:ot="#{OT.uri}"' -f 'xmlns:ota="#{OTA.uri}"' -f 'xmlns:dc="#{DC.uri}"' -f 'xmlns:rdf="#{RDF.uri}"' -f 'xmlns:owl="#{OWL.uri}"' -o rdfxml #{@path} 2>/dev/null`
+    res=`rapper -i ntriples -f 'xmlns:ot="#{OT.uri}"' -f 'xmlns:ota="#{OTA.uri}"' -f 'xmlns:dc="#{DC.uri}"' -f 'xmlns:rdf="#{RDF.uri}"' -f 'xmlns:owl="#{OWL.uri}"' -o rdfxml #{@path} 2>/dev/null`
+    tmpf.close
+    tmpf.delete
+    res
   end
 
   # Convert to JSON as specified in http://n2.talis.com/wiki/RDF_JSON_Specification
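
The to_rdfxml rewrite keeps the Tempfile handle alive until the external rapper call has read the file, flushing explicitly before the shell-out and deleting afterwards; the old one-liner block could leave the buffer unflushed and the file eligible for finalization before rapper ran. A stdlib-only sketch of the pattern (the wc call stands in for rapper):

    require 'tempfile'

    tmpf = Tempfile.open("owl-serializer")
    tmpf.write("<a> <b> <c> .")  # payload for the external tool
    tmpf.flush                   # ensure bytes are on disk before shelling out
    res = `wc -c #{tmpf.path}`   # external process reads the file by path
    tmpf.close
    tmpf.delete                  # deterministic cleanup instead of waiting for GC
    res
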
@@ -427,20 +433,20 @@ module OpenTox
   end
 
   def literal(value,type)
-    # concat and << are faster string concatination operators than +
+    # concat and << are faster string concatination operators than +
     '"'.concat(value.to_s).concat('"^^<').concat(type).concat('>')
   end
 
   def url(uri)
-    # concat and << are faster string concatination operators than +
+    # concat and << are faster string concatination operators than +
     '<'.concat(uri).concat('>')
   end
 
   def rdf_types
-    @classes.each { |c| @object[c] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['Class'] }] } }
-    @object_properties.each { |p| @object[p] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['ObjectProperty'] }] } }
-    @annotation_properties.each { |a| @object[a] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['AnnotationProperty'] }] } }
-    @datatype_properties.each { |d| @object[d] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['DatatypeProperty'] }] } }
+    @classes.each { |c| @object[c] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['Class'] }] } }
+    @object_properties.each { |p| @object[p] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['ObjectProperty'] }] } }
+    @annotation_properties.each { |a| @object[a] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['AnnotationProperty'] }] } }
+    @datatype_properties.each { |d| @object[d] = { RDF["type"] => [{ "type" => "uri", "value" => OWL['DatatypeProperty'] }] } }
   end
 
 end
@@ -457,35 +463,46 @@ module OpenTox
     @rows.first << features
     @rows.first.flatten!
     dataset.data_entries.each do |compound,entries|
-      smiles = Compound.new(compound).to_smiles
+      cmpd = Compound.new(compound)
+      smiles = cmpd.to_smiles
+      inchi = URI.encode_www_form_component(cmpd.to_inchi)
+      row_container = Array.new
       row = Array.new(@rows.first.size)
-      row[0] = smiles
+      row_container << row
+      #row[0] = smiles
+      row[0] = inchi
       entries.each do |feature, values|
         i = features.index(feature)+1
         values.each do |value|
-          if row[i]
-            row[i] = "#{row[i]} #{value}" # multiple values
+          if row_container[0][i]
+            #LOGGER.debug "Feature '#{feature}' (nr '#{i}'): '#{value}'"
+            row_container << row_container.last.collect
+            row_container.last[i] = value
+            #LOGGER.debug "RC: #{row_container.to_yaml}"
           else
-            row[i] = value
+            row_container.each { |r| r[i] = value }
           end
         end
       end
-      @rows << row
+      row_container.each { |r| @rows << r }
     end
   end
 
   # Convert to CSV string
   # @return [String] CSV string
   def to_csv
-    @rows.collect{|r| r.join(", ")}.join("\n")
+    rows = @rows.collect
+    result = ""
+    result << rows.shift.collect { |f| f.split('/').last }.join(",") << "\n" # only feature name
+    result << rows.collect{ |r| r.join(",") }.join("\n")
   end
 
   # Convert to spreadsheet workbook
   # @return [Spreadsheet::Workbook] Workbook object (use the spreadsheet gemc to write a file)
-  def to_spreadsheet
+  def to_spreadsheet(sheetname="sheet1")
     Spreadsheet.client_encoding = 'UTF-8'
     book = Spreadsheet::Workbook.new
-    sheet = book.create_worksheet(:name => '')
+    sheet = book.create_worksheet(:name => "#{sheetname}")
     sheet.column(0).width = 100
     i = 0
     @rows.each do |row|
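
This hunk changes several things at once: rows are keyed by URL-encoded InChI instead of SMILES; multiple values per feature now expand into one row per value (via row_container) instead of being joined into a single cell; to_csv emits only the last path segment of each feature URI as the header; and to_spreadsheet takes an optional worksheet name. A usage sketch of the spreadsheet-gem calls the new code relies on (dataset and output path are illustrative):

    require 'rubygems'
    require 'spreadsheet'

    Spreadsheet.client_encoding = 'UTF-8'
    book = Spreadsheet::Workbook.new
    sheet = book.create_worksheet(:name => "training")  # 3.1.0 names the sheet
    sheet.row(0).concat ["inchi", "feature_1"]
    book.write("/tmp/example.xls")
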
@@ -0,0 +1,201 @@
+
+ nominal_to_binary <- function( data )
+ {
+   result = NULL
+   for (i in 1:ncol(data))
+   {
+     #print(i)
+     if (is.numeric( data[,i] ) )
+     {
+       if (is.null(result))
+         result = data.frame(data[,i])
+       else
+         result = data.frame(result, data[,i])
+       colnames(result)[ncol(result)] <- colnames(data)[i]
+     }
+     else
+     {
+       vals = unique(data[,i])
+       for (j in 1:length(vals))
+       {
+         #print(j)
+         bins = c()
+         for (k in 1:nrow(data))
+         {
+           if(data[,i][k] == vals[j])
+             bins = c(bins,1)
+           else
+             bins = c(bins,0)
+         }
+         #print(bins)
+         if (is.null(result))
+           result = data.frame(bins)
+         else
+           result = data.frame(result, bins)
+         colnames(result)[ncol(result)] <- paste(colnames(data)[i],"is",vals[j])
+         if (length(vals)==2) break
+       }
+     }
+   }
+   #print(head(result))
+   result
+ }
+
+ process_data <- function( data )
+ {
+   data.num <- as.data.frame(data)
+   if (!is.numeric(data.num))
+   {
+     data.num = nominal_to_binary(data.num)
+   }
+   if(any(is.na(data.num)))
+   {
+     require("gam")
+     data.repl = na.gam.replace(data.num)
+   }
+   else
+     data.repl = data.num
+   data.repl
+ }
+
+ cluster <- function( data, min=10, max=15 )
+ {
+   require("vegan")
+   max <- min(max,nrow(unique(data)))
+   max <- min(max,nrow(data)-1)
+   if (min>max)
+     min=max
+   print(paste("cascade k-means ",min," - ",max))
+   s = cascadeKM(data,min,max,iter=30)
+   m = max.col(s$results)[2]
+   print(paste("best k-means clustering result: ",((m-1)+min)," num clusters"))
+   cbind(s$partition[,m])
+ }
+
+ stratified_split <- function( data, ratio=0.3, method="cluster" )
+ {
+   data.processed = as.matrix(process_data( data ))
+   if (method == "samplecube")
+   {
+     require("sampling")
+     # adjust ratio to make samplecube return exact number of samples
+     ratio = round(nrow(data.processed)*ratio)/nrow(data.processed)
+     pik = rep(ratio,times=nrow(data.processed))
+     data.strat = cbind(pik,data.processed)
+     samplecube(data.strat,pik,order=2,comment=F)
+   }
+   else if (method == "cluster")
+   {
+     cl = cluster(data.processed)
+     # require("caret")
+     # res = createDataPartition(cl,p=ratio)
+     # split = rep(1, times=nrow(data))
+     # for (j in 1:nrow(data))
+     #   if ( is.na(match(j,res$Resample1)) )
+     #     split[j]=0
+     # split
+     require("sampling")
+     stratified_split(cl,ratio,"samplecube")
+   }
+   else
+     stop("unknown method")
+ }
+
+ stratified_k_fold_split <- function( data, num_folds=10, method="cluster" )
+ {
+   print(paste(num_folds,"-fold-split, data-size",nrow(data)))
+   data.processed = as.matrix(process_data( data ))
+   if (method == "samplecube")
+   {
+     folds = rep(0, times=nrow(data))
+     for (i in 1:(num_folds-1))
+     {
+       require("sampling")
+       prop = 1/(num_folds-(i-1))
+       print(paste("fold",i,"/",num_folds," prop",prop))
+       pik = rep(prop,times=nrow(data))
+       for (j in 1:nrow(data))
+         if(folds[j]!=0)
+           pik[j]=0
+       data.strat = cbind(pik,data.processed)
+       s<-samplecube(data.strat,pik,order=2,comment=F)
+       print(paste("fold size: ",sum(s)))
+       for (j in 1:nrow(data))
+         if (s[j] == 1)
+           folds[j]=i
+     }
+     for (j in 1:nrow(data))
+       if (folds[j] == 0)
+         folds[j]=num_folds
+     folds
+   }
+   else if (method == "cluster")
+   {
+     require("TunePareto")
+     cl = cluster(data.processed)
+     res = generateCVRuns(cl,ntimes=1,nfold=3)
+     folds = rep(0, times=nrow(data))
+     for (i in 1:num_folds)
+       for(j in 1:length(res[[1]][[i]]))
+         folds[res[[1]][[i]][j]]=i
+     folds
+   }
+   else
+     stop("unknown method")
+ }
+
+ plot_pre_process <- function( data, method="pca" )
+ {
+   data.processed = process_data( data )
+   if (method == "pca")
+   {
+     data.pca <- prcomp(data.processed, scale=TRUE)
+     as.data.frame(data.pca$x)[1:2]
+   }
+   else if (method == "smacof")
+   {
+     require("smacof")
+     data.emb <- smacofSym(dist(data.processed, method = "euclidean"), ndim=2, verbose=T)
+     data.emb$conf
+   }
+   else
+     stop("unknown method")
+ }
+
+ plot_split <- function( data, split, names=NULL, ... )
+ {
+   if (ncol(data)!=2 || !is.numeric(data[,1]) || !is.numeric(data[,2]))
+     stop("data not suitable for plotting, plot_pre_process() first")
+
+   plot( NULL, xlim = extendrange(data[,1]), ylim = extendrange(data[,2]), ... )
+   if (is.null(names))
+     names <- c("split 1","split 2")
+   colos = as.double(rep(2:(max(split)+2)))
+   legend("topleft",names,pch=2,col=colos)
+
+   for (j in max(split):0)
+   {
+     set = c()
+     for (i in 1:nrow(data))
+       if (split[i] == j)
+         set = c(set,i)
+     points(data[set,], pch = 2, col=(j+2))
+   }
+ }
+
+ #a<-matrix(rnorm(100, mean=50, sd=4), ncol=5)
+ #b<-matrix(rnorm(5000, mean=0, sd=10), ncol=5)
+ #data<-rbind(a,b)
+ #c<-matrix(rnorm(50, mean=-50, sd=2), ncol=5)
+ #data<-rbind(data,c)
+ #data=iris
+ #split = stratified_k_fold_split(data, num_folds=3)
+ #split = stratified_split(data, ratio=0.33, method="cluster")
+ #print(sum(split))
+ #plot_split(plot_pre_process(data),split,c("training","test"))
+
+ #cl = cluster(data)
+
+
+
+
data/lib/task.rb CHANGED
@@ -242,16 +242,20 @@ module OpenTox
   # waits for a task, unless time exceeds or state is no longer running
   # @param [optional,OpenTox::Task] waiting_task (can be a OpenTox::Subtask as well), progress is updated accordingly
   # @param [optional,Numeric] dur seconds pausing before cheking again for completion
-  def wait_for_completion( waiting_task=nil, dur=0.3)
+  def wait_for_completion( waiting_task=nil)
 
     waiting_task.waiting_for(self.uri) if waiting_task
     due_to_time = Time.new + DEFAULT_TASK_MAX_DURATION
+    start_time = Time.new
+    dur = 0
     LOGGER.debug "start waiting for task "+@uri.to_s+" at: "+Time.new.to_s+", waiting at least until "+due_to_time.to_s
 
     load_metadata # for extremely fast tasks
     check_state
     while self.running? or self.queued?
       sleep dur
+      dur = [[(Time.new - start_time)/20.0,0.3].max,300.0].min
+      #LOGGER.debug "task-object-id: #{self.object_id} - wait: #{"%.2f"%(Time.new - start_time)} - dur: #{"%.2f"%dur}"
       load_metadata
       # if another (sub)task is waiting for self, set progress accordingly
       waiting_task.progress(@metadata[OT.percentageCompleted].to_f) if waiting_task
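
wait_for_completion drops the fixed dur parameter in favour of an adaptive poll interval: one twentieth of the elapsed waiting time, clamped to [0.3 s, 300 s], so fast tasks are polled quickly while long-running ones stop hammering the service. A standalone sketch of the schedule (the helper name is hypothetical; the formula is the one from the diff):

    # Poll interval as a function of elapsed seconds.
    def poll_interval(elapsed)
      [[elapsed / 20.0, 0.3].max, 300.0].min
    end

    [1, 10, 60, 600, 10_000].each do |t|
      puts "after #{t}s: sleep #{poll_interval(t)}s"
    end
    # => 0.3, 0.5, 3.0, 30.0, 300.0
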
data/lib/transform.rb ADDED
@@ -0,0 +1,520 @@
+ module OpenTox
+   module Transform
+     # Uses Statsample Library (http://ruby-statsample.rubyforge.org/) by C. Bustos
+
+     # LogAutoScaler for GSL vectors.
+     # Take log and scale.
+     class LogAutoScale
+       attr_accessor :vs, :offset, :autoscaler
+
+       # @param [GSL::Vector] Values to transform using LogAutoScaling.
+       def initialize values
+         @distance_to_zero = 1.0
+         begin
+           raise "Cannot transform, values empty." if values.size==0
+           vs = values.clone
+           @offset = vs.min - @distance_to_zero
+           @autoscaler = OpenTox::Transform::AutoScale.new mvlog(vs)
+           @vs = @autoscaler.vs
+         rescue Exception => e
+           LOGGER.debug "#{e.class}: #{e.message}"
+           LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+         end
+       end
+
+       # @param [GSL::Vector] values to restore.
+       # @return [GSL::Vector] transformed values.
+       def restore values
+         begin
+           raise "Cannot transform, values empty." if values.size==0
+           vs = values.clone
+           rv = @autoscaler.restore(vs)
+           rv.to_a.collect { |v| (10**v) + @offset }.to_gv
+         rescue Exception => e
+           LOGGER.debug "#{e.class}: #{e.message}"
+           LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+         end
+       end
+
+       # @param [GSL::Vector] values to transform.
+       # @return [GSL::Vector] transformed values.
+       def mvlog values
+         values.to_a.collect { |v| Math::log10(v - @offset) }.to_gv
+       end
+
+     end
+
+
+     # Auto-Scaler for GSL vectors.
+     # Center on mean and divide by standard deviation.
+     class AutoScale
+       attr_accessor :vs, :mean, :stdev
+
+       # @param [GSL::Vector] values to transform using AutoScaling.
+       def initialize values
+         begin
+           raise "Cannot transform, values empty." if values.size==0
+           vs = values.clone
+           @mean = vs.to_scale.mean
+           @stdev = vs.to_scale.standard_deviation_population
+           @stdev = 0.0 if @stdev.nan?
+           @vs = transform vs
+         rescue Exception => e
+           LOGGER.debug "#{e.class}: #{e.message}"
+           LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+         end
+       end
+
+       # @param [GSL::Vector] values to transform.
+       # @return [GSL::Vector] transformed values.
+       def transform values
+         begin
+           raise "Cannot transform, values empty." if values.size==0
+           autoscale values.clone
+         rescue Exception => e
+           LOGGER.debug "#{e.class}: #{e.message}"
+           LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+         end
+       end
+
+       # @param [GSL::Vector] Values to restore.
+       # @return [GSL::Vector] transformed values.
+       def restore values
+         begin
+           raise "Cannot transform, values empty." if values.size==0
+           rv_ss = values.clone.to_scale * @stdev unless @stdev == 0.0
+           (rv_ss + @mean).to_gsl
+         rescue Exception => e
+           LOGGER.debug "#{e.class}: #{e.message}"
+           LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+         end
+       end
+
+       # @param [GSL::Vector] values to transform.
+       # @return [GSL::Vector] transformed values.
+       def autoscale values
+         vs_ss = values.clone.to_scale - @mean
+         @stdev == 0.0 ? vs_ss.to_gsl : ( vs_ss * ( 1 / @stdev) ).to_gsl
+       end
+
+     end
+
+
+     # Principal Components Analysis.
+     class PCA
+       attr_accessor :data_matrix, :data_transformed_matrix, :eigenvector_matrix, :eigenvalue_sums, :autoscaler
+
+       # Creates a transformed dataset as GSL::Matrix.
+       #
+       # @param [GSL::Matrix] Data matrix.
+       # @param [Float] Compression ratio from [0,1], default 0.05.
+       # @return [GSL::Matrix] Data transformed matrix.
+       def initialize data_matrix, compression=0.05, maxcols=(1.0/0.0)
+         begin
+           @data_matrix = data_matrix.clone
+           @compression = compression.to_f
+           @mean = Array.new
+           @autoscaler = Array.new
+           @cols = Array.new
+           @maxcols = maxcols
+
+           # Objective Feature Selection
+           raise "Error! PCA needs at least two dimensions." if data_matrix.size2 < 2
+           @data_matrix_selected = nil
+           (0..@data_matrix.size2-1).each { |i|
+             if !Algorithm::zero_variance?(@data_matrix.col(i).to_a)
+               if @data_matrix_selected.nil?
+                 @data_matrix_selected = GSL::Matrix.alloc(@data_matrix.size1, 1)
+                 @data_matrix_selected.col(0)[0..@data_matrix.size1-1] = @data_matrix.col(i)
+               else
+                 @data_matrix_selected = @data_matrix_selected.horzcat(GSL::Matrix.alloc(@data_matrix.col(i).to_a,@data_matrix.size1, 1))
+               end
+               @cols << i
+             end
+           }
+           raise "Error! PCA needs at least two dimensions." if (@data_matrix_selected.nil? || @data_matrix_selected.size2 < 2)
+
+           # PCA uses internal centering on 0
+           @data_matrix_scaled = GSL::Matrix.alloc(@data_matrix_selected.size1, @cols.size)
+           (0..@cols.size-1).each { |i|
+             as = OpenTox::Transform::AutoScale.new(@data_matrix_selected.col(i))
+             @data_matrix_scaled.col(i)[0..@data_matrix.size1-1] = as.vs * as.stdev # re-adjust by stdev
+             @mean << as.mean
+             @autoscaler << as
+           }
+
+           # PCA
+           data_matrix_hash = Hash.new
+           (0..@cols.size-1).each { |i|
+             column_view = @data_matrix_scaled.col(i)
+             data_matrix_hash[i] = column_view.to_scale
+           }
+           dataset_hash = data_matrix_hash.to_dataset # see http://goo.gl/7XcW9
+           cor_matrix=Statsample::Bivariate.correlation_matrix(dataset_hash)
+           pca=Statsample::Factor::PCA.new(cor_matrix)
+
+           # Select best eigenvectors
+           pca.eigenvalues.each { |ev| raise "PCA failed!" unless !ev.nan? }
+           @eigenvalue_sums = Array.new
+           (0..@cols.size-1).each { |i|
+             @eigenvalue_sums << pca.eigenvalues[0..i].inject{ |sum, ev| sum + ev }
+           }
+           eigenvectors_selected = Array.new
+           pca.eigenvectors.each_with_index { |ev, i|
+             if (@eigenvalue_sums[i] <= ((1.0-@compression)*@cols.size)) || (eigenvectors_selected.size == 0)
+               eigenvectors_selected << ev.to_a unless @maxcols <= eigenvectors_selected.size
+             end
+           }
+           @eigenvector_matrix = GSL::Matrix.alloc(eigenvectors_selected.flatten, eigenvectors_selected.size, @cols.size).transpose
+           @data_transformed_matrix = (@eigenvector_matrix.transpose * @data_matrix_scaled.transpose).transpose
+
+         rescue Exception => e
+           LOGGER.debug "#{e.class}: #{e.message}"
+           LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+         end
+       end
+
+       # Transforms data to feature space found by PCA.
+       #
+       # @param [GSL::Matrix] Data matrix.
+       # @return [GSL::Matrix] Transformed data matrix.
+       def transform values
+         begin
+           vs = values.clone
+           raise "Error! Too few columns for transformation." if vs.size2 < @cols.max
+           data_matrix_scaled = GSL::Matrix.alloc(vs.size1, @cols.size)
+           @cols.each_with_index { |i,j|
+             data_matrix_scaled.col(j)[0..data_matrix_scaled.size1-1] = @autoscaler[j].transform(vs.col(i).to_a) * @autoscaler[j].stdev
+           }
+           (@eigenvector_matrix.transpose * data_matrix_scaled.transpose).transpose
+         rescue Exception => e
+           LOGGER.debug "#{e.class}: #{e.message}"
+           LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+         end
+       end
+
+       # Restores data in the original feature space (possibly with compression loss).
+       #
+       # @param [GSL::Matrix] Transformed data matrix.
+       # @return [GSL::Matrix] Data matrix.
+       def restore
+         begin
+           data_matrix_restored = (@eigenvector_matrix * @data_transformed_matrix.transpose).transpose # reverse pca
+           # reverse scaling
+           (0..@cols.size-1).each { |i|
+             data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] += @mean[i]
+           }
+           data_matrix_restored
+         rescue Exception => e
+           LOGGER.debug "#{e.class}: #{e.message}"
+           LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+         end
+       end
+
+     end
+
+
+     # Singular Value Decomposition
+     class SVD
+       attr_accessor :data_matrix, :compression, :data_transformed_matrix, :uk, :vk, :eigk, :eigk_inv
+
+       # Creates a transformed dataset as GSL::Matrix.
+       #
+       # @param [GSL::Matrix] Data matrix
+       # @param [Float] Compression ratio from [0,1], default 0.05
+       # @return [GSL::Matrix] Data transformed matrix
+
+       def initialize data_matrix, compression=0.05
+         begin
+           @data_matrix = data_matrix.clone
+           @compression = compression
+
+           # Compute the SV Decomposition X=USV
+           # vt is *not* the transpose of V here, but V itself (see http://goo.gl/mm2xz)!
+           u, vt, s = data_matrix.SV_decomp
+
+           # Determine cutoff index
+           s2 = s.mul(s) ; s2_sum = s2.sum
+           s2_run = 0
+           k = s2.size - 1
+           s2.to_a.reverse.each { |v|
+             s2_run += v
+             frac = s2_run / s2_sum
+             break if frac > compression
+             k -= 1
+           }
+           k += 1 if k == 0 # avoid uni-dimensional (always cos sim of 1)
+
+           # Take the k-rank approximation of the Matrix
+           # - Take first k columns of u
+           # - Take first k columns of vt
+           # - Take the first k eigenvalues
+           @uk = u.submatrix(nil, (0..k)) # used to transform column format data
+           @vk = vt.submatrix(nil, (0..k)) # used to transform row format data
+           s = GSL::Matrix.diagonal(s)
+           @eigk = s.submatrix((0..k), (0..k))
+           @eigk_inv = @eigk.inv
+
+           # Transform data
+           @data_transformed_matrix = @uk # = u for all SVs
+           # NOTE: @data_transformed_matrix is also equal to
+           # @data_matrix * @vk * @eigk_inv
+
+         rescue Exception => e
+           LOGGER.debug "#{e.class}: #{e.message}"
+           LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+         end
+       end
+
+
+       # Transforms data instance (1 row) to feature space found by SVD.
+       #
+       # @param [GSL::Matrix] Data matrix (1 x m).
+       # @return [GSL::Matrix] Transformed data matrix.
+       def transform_instance values
+         begin
+           values * @vk * @eigk_inv
+         rescue Exception => e
+           LOGGER.debug "#{e.class}: #{e.message}"
+           LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+         end
+       end
+       alias :transform :transform_instance # make this the default (see PCA interface)
+
+       # Transforms data feature (1 column) to feature space found by SVD.
+       #
+       # @param [GSL::Matrix] Data matrix (1 x n).
+       # @return [GSL::Matrix] Transformed data matrix.
+       def transform_feature values
+         begin
+           values * @uk * @eigk_inv
+         rescue Exception => e
+           LOGGER.debug "#{e.class}: #{e.message}"
+           LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+         end
+       end
+
+
+       # Restores data in the original feature space (possibly with compression loss).
+       #
+       # @param [GSL::Matrix] Transformed data matrix.
+       # @return [GSL::Matrix] Data matrix.
+       def restore
+         begin
+           @data_transformed_matrix * @eigk * @vk.transpose # reverse svd
+         rescue Exception => e
+           LOGGER.debug "#{e.class}: #{e.message}"
+           LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+         end
+       end
+
+
+     end
+
+
+
+     # Attaches transformations to an OpenTox::Model
+     # Stores props, sims, performs similarity calculations
+     class ModelTransformer
+       attr_accessor :model, :similarity_algorithm, :acts, :sims
+
+       # @params[OpenTox::Model] model to transform
+       def initialize model
+         @model = model
+         @similarity_algorithm = @model.similarity_algorithm
+       end
+
+       def transform
+         get_matrices # creates @n_prop, @q_prop, @acts from ordered fps
+         @ids = (0..((@n_prop.length)-1)).to_a # surviving compounds; become neighbors
+
+         # Preprocessing
+         if (@model.similarity_algorithm == "Similarity.cosine")
+           # truncate nil-columns and -rows
+           LOGGER.debug "O: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+           while @q_prop.size>0
+             idx = @q_prop.index(nil)
+             break if idx.nil?
+             @q_prop.slice!(idx)
+             @n_prop.each { |r| r.slice!(idx) }
+           end
+           LOGGER.debug "Q: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+           remove_nils # removes nil cells (for cosine); alters @n_props, @q_props, cuts down @ids to survivors
+           LOGGER.debug "M: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+
+           # adjust rest
+           fps_tmp = []; @ids.each { |idx| fps_tmp << @fps[idx] }; @fps = fps_tmp
+           cmpds_tmp = []; @ids.each { |idx| cmpds_tmp << @cmpds[idx] }; @cmpds = cmpds_tmp
+           acts_tmp = []; @ids.each { |idx| acts_tmp << @acts[idx] }; @acts = acts_tmp
+
+           # scale and svd
+           nr_cases, nr_features = @n_prop.size, @n_prop[0].size
+           gsl_n_prop = GSL::Matrix.alloc(@n_prop.flatten, nr_cases, nr_features); gsl_n_prop_orig = gsl_n_prop.clone # make backup
+           gsl_q_prop = GSL::Matrix.alloc(@q_prop.flatten, 1, nr_features); gsl_q_prop_orig = gsl_q_prop.clone # make backup
+           (0...nr_features).each { |i|
+             autoscaler = OpenTox::Transform::AutoScale.new(gsl_n_prop.col(i))
+             gsl_n_prop.col(i)[0..nr_cases-1] = autoscaler.vs
+             gsl_q_prop.col(i)[0..0] = autoscaler.transform gsl_q_prop.col(i)
+           }
+           svd = OpenTox::Algorithm::Transform::SVD.new(gsl_n_prop, 0.0)
+           @n_prop = svd.data_transformed_matrix.to_a
+           @q_prop = svd.transform(gsl_q_prop).row(0).to_a
+           LOGGER.debug "S: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+         else
+           convert_nils # convert nil cells (for tanimoto); leave @n_props, @q_props, @ids untouched
+         end
+
+         # neighbor calculation
+         @ids = [] # surviving compounds become neighbors
+         @sims = [] # calculated by neighbor routine
+         neighbors
+         n_prop_tmp = []; @ids.each { |idx| n_prop_tmp << @n_prop[idx] }; @n_prop = n_prop_tmp # select neighbors from matrix
+         acts_tmp = []; @ids.each { |idx| acts_tmp << @acts[idx] }; @acts = acts_tmp
+
+
+         # Sims between neighbors, if necessary
+         gram_matrix = []
+         if !@model.parameter("propositionalized") # need gram matrix for standard setting (n. prop.)
+           @n_prop.each_index do |i|
+             gram_matrix[i] = [] unless gram_matrix[i]
+             @n_prop.each_index do |j|
+               if (j>i)
+                 sim = eval("OpenTox::Algorithm::#{@similarity_algorithm}(@n_prop[i], @n_prop[j])")
+                 gram_matrix[i][j] = sim
+                 gram_matrix[j] = [] unless gram_matrix[j]
+                 gram_matrix[j][i] = gram_matrix[i][j]
+               end
+             end
+             gram_matrix[i][i] = 1.0
+           end
+         end
+
+         # reclaim original data (if svd was performed)
+         if svd
+           @n_prop = gsl_n_prop_orig.to_a
+           n_prop_tmp = []; @ids.each { |idx| n_prop_tmp << @n_prop[idx] }; @n_prop = n_prop_tmp
+           @q_prop = gsl_q_prop_orig.row(0).to_a
+         end
+
+         LOGGER.debug "F: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+         LOGGER.debug "Sims: #{@sims.size}, Acts: #{@acts.size}"
+
+         @sims = [ gram_matrix, @sims ]
+
+       end
+
+
+
+
+       # Find neighbors and store them as object variable, access all compounds for that.
+       def neighbors
+         @model.neighbors = []
+         @n_prop.each_with_index do |fp, idx| # AM: access all compounds
+           add_neighbor fp, idx
+         end
+       end
+
+
+       # Adds a neighbor to @neighbors if it passes the similarity threshold
+       # adjusts @ids to signal the
+       def add_neighbor(training_props, idx)
+
+         sim = similarity(training_props)
+         if sim > @model.parameter("min_sim")
+           if @model.activities[@cmpds[idx]]
+             @model.activities[@cmpds[idx]].each do |act|
+               @model.neighbors << {
+                 :compound => @cmpds[idx],
+                 :similarity => sim,
+                 :features => @fps[idx].keys,
+                 :activity => act
+               }
+               @sims << sim
+               @ids << idx
+             end
+           end
+         end
+       end
+
+
+       # Removes nil entries from n_prop and q_prop.
+       # Matrix is a nested two-dimensional array.
+       # Removes iteratively rows or columns with the highest fraction of nil entries, until all nil entries are removed.
+       # Tie break: columns take precedence.
+       # Deficient input such as [[nil],[nil]] will not be completely reduced, as the algorithm terminates if any matrix dimension (x or y) is zero.
+       # Enables the use of cosine similarity / SVD
+       def remove_nils
+         return @n_prop if (@n_prop.length == 0 || @n_prop[0].length == 0)
+         col_nr_nils = (Matrix.rows(@n_prop)).column_vectors.collect{ |cv| (cv.to_a.count(nil) / cv.size.to_f) }
+         row_nr_nils = (Matrix.rows(@n_prop)).row_vectors.collect{ |rv| (rv.to_a.count(nil) / rv.size.to_f) }
+         m_cols = col_nr_nils.max
+         m_rows = row_nr_nils.max
+         idx_cols = col_nr_nils.index(m_cols)
+         idx_rows = row_nr_nils.index(m_rows)
+         while ((m_cols > 0) || (m_rows > 0)) do
+           if m_cols >= m_rows
+             @n_prop.each { |row| row.slice!(idx_cols) }
+             @q_prop.slice!(idx_cols)
+           else
+             @n_prop.slice!(idx_rows)
+             @ids.slice!(idx_rows)
+           end
+           break if (@n_prop.length == 0) || (@n_prop[0].length == 0)
+           col_nr_nils = Matrix.rows(@n_prop).column_vectors.collect{ |cv| (cv.to_a.count(nil) / cv.size.to_f) }
+           row_nr_nils = Matrix.rows(@n_prop).row_vectors.collect{ |rv| (rv.to_a.count(nil) / rv.size.to_f) }
+           m_cols = col_nr_nils.max
+           m_rows = row_nr_nils.max
+           idx_cols= col_nr_nils.index(m_cols)
+           idx_rows = row_nr_nils.index(m_rows)
+         end
+       end
+
+
+       # Replaces nils by zeroes in n_prop and q_prop
+       # Enables the use of Tanimoto similarities with arrays (rows of n_prop and q_prop)
+       def convert_nils
+         @n_prop.each { |row| row.collect! { |v| v.nil? ? 0 : v } }
+         @q_prop.collect! { |v| v.nil? ? 0 : v }
+       end
+
+
+       # Executes model similarity_algorithm
+       def similarity(training_props)
+         eval("OpenTox::Algorithm::#{@model.similarity_algorithm}(training_props, @q_prop)")
+       end
+
+
+       # Converts fingerprints to matrix, order of rows by fingerprints. nil values allowed.
+       # Same for compound fingerprints.
+       def get_matrices
+
+         @cmpds = []; @fps = []; @acts = []; @n_prop = []; @q_prop = []
+
+         @model.fingerprints.each { |fp|
+           cmpd = fp[0]; fp = fp[1]
+           if @model.activities[cmpd] # row good
+             acts = @model.activities[cmpd]; @acts += acts
+             LOGGER.debug "#{acts.size} activities for '#{cmpd}'" if acts.size > 1
+             row = []; @model.features.each { |f| row << fp[f] } # nils for non-existent f's
+             acts.size.times { # multiple additions for multiple activities
+               @n_prop << row.collect
+               @cmpds << cmpd
+               @fps << Marshal.load(Marshal.dump(fp))
+             }
+           else
+             LOGGER.warn "No activity found for compound '#{cmpd}' in model '#{@model.uri}'"
+           end
+         }
+
+         @model.features.each { |f| @q_prop << @model.compound_fingerprints[f] } # query structure
+
+       end
+
+       def props
+         @model.parameter("propositionalized") ? [ @n_prop, @q_prop ] : nil
+       end
+
+     end
+
+   end
+ end
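
The new Transform module wraps scaling, PCA, and SVD behind a uniform initialize/transform/restore interface. A minimal round-trip sketch for AutoScale, assuming the gsl and statsample gems the module builds on are installed and opentox-ruby is loaded:

    require 'rubygems'
    require 'opentox-ruby'

    values = GSL::Vector.alloc(1.0, 2.0, 3.0, 4.0)
    scaler = OpenTox::Transform::AutoScale.new(values)
    scaled = scaler.vs                 # centered on the mean, divided by stdev
    restored = scaler.restore(scaled)  # recovers the original vector
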