RubyGems - opentox-ruby - Versions diffs - 2.0.1 → 2.1.0 - Mend

opentox-ruby 2.0.1 → 2.1.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

data/Rakefile CHANGED

@@ -8,53 +8,46 @@ begin
     gem.summary = %Q{Ruby wrapper for the OpenTox REST API}
     gem.description = %Q{Ruby wrapper for the OpenTox REST API (http://www.opentox.org)}
     gem.email = "helma@in-silico.ch"
-    gem.homepage = "http://github.com/helma/opentox-ruby"
+    gem.homepage = "http://github.com/opentox/opentox-ruby"
     gem.authors = ["Christoph Helma, Martin Guetlein, Andreas Maunz, Micha Rautenberg, David Vorgrimmler"]
-    # dependencies
-    [ "sinatra",
-      "emk-sinatra-url-for",
-      "sinatra-respond_to",
-      "sinatra-static-assets",
-      "rest-client",
-      "rack",
-      "rack-contrib",
-      "rack-flash",
-      "nokogiri",
-      "rubyzip",
-      "roo",
-      "spreadsheet",
-      "google-spreadsheet-ruby",
-      "yajl-ruby",
-      "tmail",
-      "rinruby",
-      "ohm",
-      "ohm-contrib",
-      "SystemTimer",
-      "rjb",
-      #valiation-gems
-      "dm-core",
-      "dm-serializer",
-      "dm-timestamps",
-      "dm-types",
-      "dm-migrations",
-      "dm-validations",
-      "dm-sqlite-adapter"
-    ].each { |dep| gem.add_dependency dep }
-=begin
-    [ "dm-core",
-      'dm-serializer',
-      'dm-timestamps',
-      'dm-types',
-      'dm-migrations',
-      "dm-mysql-adapter",
-      "dm-validations",
-    ].each {|dep| gem.add_dependency dep, ">= 1" }
-=end
-    #valiation-gem
-    gem.add_dependency "haml", ">=3"
-    # validation-gems
-    gem.add_dependency "ruby-plot", "~>0.4.0"
-    ['jeweler'].each { |dep| gem.add_development_dependency dep }
+    # dependencies with versions
+    gem.add_dependency "sinatra", "=1.2.6"
+    gem.add_dependency "emk-sinatra-url-for", "=0.2.1"
+    gem.add_dependency "sinatra-respond_to", "=0.7.0"
+    gem.add_dependency "sinatra-static-assets", "=0.5.0"
+    gem.add_dependency "rest-client", "=1.6.1"
+    gem.add_dependency "rack", "=1.3.1"
+    gem.add_dependency "rack-contrib", "=1.1.0"
+    gem.add_dependency "rack-flash", "=0.1.1"
+    gem.add_dependency "nokogiri", "=1.4.4"
+    gem.add_dependency "rubyzip", "=0.9.4"
+    gem.add_dependency "roo", "=1.9.3"
+    gem.add_dependency "spreadsheet", "=0.6.5.4"
+    gem.add_dependency "google-spreadsheet-ruby", "=0.1.5"
+    gem.add_dependency "yajl-ruby", "=0.8.2"
+    #gem.add_dependency "mail", "=2.3.0"
+    gem.add_dependency "rinruby", "=2.0.2"
+    gem.add_dependency "ohm", "=0.1.3"
+    gem.add_dependency "ohm-contrib", "=0.1.1"
+    gem.add_dependency "SystemTimer", "=1.2.3"
+    gem.add_dependency "rjb", "=1.3.4"
+    gem.add_dependency "haml", "=3.1.1"
+    # for headless browser tests
+    gem.add_dependency "akephalos", "=0.2.5"
+    #valiation-gems
+    gem.add_dependency "dm-core",  "=1.1.0"
+    gem.add_dependency "dm-serializer",  "=1.1.0"
+    gem.add_dependency "dm-timestamps", "=1.1.0"
+    gem.add_dependency "dm-types",  "=1.1.0"
+    gem.add_dependency "dm-migrations",  "=1.1.0"
+    gem.add_dependency "dm-validations",  "=1.1.0"
+    gem.add_dependency "dm-sqlite-adapter", "=1.1.0"
+    gem.add_dependency "ruby-plot", "=0.5.0"
+    gem.add_dependency "gsl", "=1.14.7"
+    gem.add_dependency "statsample", "=1.1.0"
+    #gem.add_dependency "statsample-optimization", "=2.1.0"
+    gem.add_development_dependency 'jeweler'
     gem.files =  FileList["[A-Z]*", "{bin,generators,lib,test}/**/*", 'lib/jeweler/templates/.gitignore']
   end
   Jeweler::GemcutterTasks.new

data/VERSION CHANGED

	@@ -1 +1 @@
1	- 2.0.1
1	+ 2.1.0

data/lib/algorithm.rb CHANGED

@@ -3,6 +3,8 @@
 # avoids compiling R with X
 R = nil
 require "rinruby"
+require "statsample"
+require 'uri'
 module OpenTox
@@ -16,6 +18,7 @@ module OpenTox
     # @param [optional,OpenTox::Task] waiting_task (can be a OpenTox::Subtask as well), progress is updated accordingly
     # @return [String] URI of new resource (dataset, model, ...)
     def run(params=nil, waiting_task=nil)
+      LOGGER.info "Running algorithm '"+@uri.to_s+"' with params: "+params.inspect
       RestClientWrapper.post(@uri, params, {:accept => 'text/uri-list'}, waiting_task).to_s
     end
@@ -45,12 +48,75 @@ module OpenTox
     end
     # Fminer algorithms (https://github.com/amaunz/fminer2)
-    module Fminer
+    class Fminer
       include Algorithm
+      attr_accessor :prediction_feature, :training_dataset, :minfreq, :compounds, :db_class_sizes, :all_activities, :smi
+      def check_params(params,per_mil,subjectid=nil)
+        raise OpenTox::NotFoundError.new "Please submit a dataset_uri." unless params[:dataset_uri] and  !params[:dataset_uri].nil?
+        raise OpenTox::NotFoundError.new "Please submit a prediction_feature." unless params[:prediction_feature] and  !params[:prediction_feature].nil?
+        @prediction_feature = OpenTox::Feature.find params[:prediction_feature], subjectid
+        @training_dataset = OpenTox::Dataset.find "#{params[:dataset_uri]}", subjectid
+        raise OpenTox::NotFoundError.new "No feature #{params[:prediction_feature]} in dataset #{params[:dataset_uri]}" unless @training_dataset.features and @training_dataset.features.include?(params[:prediction_feature])
+        unless params[:min_frequency].nil?
+          @minfreq=params[:min_frequency].to_i
+          raise "Minimum frequency must be a number >0!" unless @minfreq>0
+        else
+          @minfreq=OpenTox::Algorithm.min_frequency(@training_dataset,per_mil) # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
+        end
+      end
+      def add_fminer_data(fminer_instance, params, value_map)
+        id = 1 # fminer start id is not 0
+        @training_dataset.data_entries.each do |compound,entry|
+          begin
+            smiles = OpenTox::Compound.smiles(compound.to_s)
+          rescue
+            LOGGER.warn "No resource for #{compound.to_s}"
+            next
+          end
+          if smiles == '' or smiles.nil?
+            LOGGER.warn "Cannot find smiles for #{compound.to_s}."
+            next
+          end
+          value_map=params[:value_map] unless params[:value_map].nil?
+          entry.each do |feature,values|
+            if feature == @prediction_feature.uri
+              values.each do |value|
+                if value.nil?
+                  LOGGER.warn "No #{feature} activity for #{compound.to_s}."
+                else
+                  if @prediction_feature.feature_type == "classification"
+                    activity= value_map.invert[value].to_i # activities are mapped to 1..n
+                    @db_class_sizes[activity-1].nil? ? @db_class_sizes[activity-1]=1 : @db_class_sizes[activity-1]+=1 # AM effect
+                  elsif @prediction_feature.feature_type == "regression"
+                    activity= value.to_f
+                  end
+                  begin
+                    fminer_instance.AddCompound(smiles,id)
+                    fminer_instance.AddActivity(activity, id)
+                    @all_activities[id]=activity # DV: insert global information
+                    @compounds[id] = compound
+                    @smi[id] = smiles
+                    id += 1
+                  rescue Exception => e
+                    LOGGER.warn "Could not add " + smiles + "\t" + value.to_s + " to fminer"
+                    LOGGER.warn e.backtrace
+                  end
+                end
+              end
+            end
+          end
+        end
+      end
+    end
       # Backbone Refinement Class mining (http://bbrc.maunz.de/)
-      class BBRC
-        include Fminer
+      class BBRC < Fminer
         # Initialize bbrc algorithm
         def initialize(subjectid=nil)
           super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/bbrc")
@@ -59,8 +125,7 @@ module OpenTox
       end
       # LAtent STructure Pattern Mining (http://last-pm.maunz.de)
-      class LAST
-        include Fminer
+      class LAST < Fminer
         # Initialize last algorithm
         def initialize(subjectid=nil)
           super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/last")
@@ -68,7 +133,6 @@ module OpenTox
         end
       end
-    end
     # Create lazar prediction model
     class Lazar
@@ -90,19 +154,34 @@ module OpenTox
       # @param [Array] features_a Features of first compound
       # @param [Array] features_b Features of second compound
       # @param [optional, Hash] weights Weights for all features
+      # @param [optional, Hash] params Keys: `:training_compound, :compound, :training_compound_features_hits, :nr_hits, :compound_features_hits` are required
       # @return [Float] (Weighted) tanimoto similarity
-      def self.tanimoto(features_a,features_b,weights=nil)
+      def self.tanimoto(features_a,features_b,weights=nil,params=nil)
         common_features = features_a & features_b
         all_features = (features_a + features_b).uniq
-        common_p_sum = 0.0
+        #LOGGER.debug "dv --------------- common: #{common_features}, all: #{all_features}"
         if common_features.size > 0
           if weights
-            common_features.each{|f| common_p_sum += Algorithm.gauss(weights[f])}
-            all_p_sum = 0.0
-            all_features.each{|f| all_p_sum += Algorithm.gauss(weights[f])}
+            #LOGGER.debug "nr_hits: #{params[:nr_hits]}"
+            if !params.nil? && params[:nr_hits]
+              params[:weights] = weights
+              params[:mode] = "min"
+              params[:features] = common_features
+              common_p_sum = Algorithm.p_sum_support(params)
+              params[:mode] = "max"
+              params[:features] = all_features
+              all_p_sum = Algorithm.p_sum_support(params)
+            else
+              common_p_sum = 0.0
+              common_features.each{|f| common_p_sum += Algorithm.gauss(weights[f])}
+              all_p_sum = 0.0
+              all_features.each{|f| all_p_sum += Algorithm.gauss(weights[f])}
+            end
+            #LOGGER.debug "common_p_sum: #{common_p_sum}, all_p_sum: #{all_p_sum}, c/a: #{common_p_sum/all_p_sum}"
             common_p_sum/all_p_sum
           else
-            common_features.to_f/all_features
+            #LOGGER.debug "common_features : #{common_features}, all_features: #{all_features}, c/a: #{(common_features.size/all_features.size).to_f}"
+            common_features.size.to_f/all_features.size.to_f
           end
         else
           0.0
@@ -132,65 +211,300 @@ module OpenTox
       end
     end
+    # Structural Graph Clustering by TU Munich
+    # Finds clusters similar to a query structure in a given training dataset
+    # May be queried for cluster membership of an unknown compound
+    class StructuralClustering
+      attr_accessor :training_dataset_uri, :training_threshold, :query_dataset_uri, :query_threshold, :target_clusters_array
+      # @params[String] Training dataset_uri
+      # @params[Float]  Similarity threshold for training (optional)
+      # @params[String] Cluster service uri (no AA)
+      def initialize training_dataset_uri, training_threshold=0.8, cluster_service_uri = "http://opentox-dev.informatik.tu-muenchen.de:8080/OpenTox/algorithm/StructuralClustering"
+        if (training_dataset_uri =~ URI::regexp).nil? || (cluster_service_uri =~ URI::regexp).nil?
+          raise "Invalid URI."
+        end
+        @training_dataset_uri = training_dataset_uri
+        if !OpenTox::Algorithm.numeric? training_threshold || training_threshold <0 || training_threshold >1
+          raise "Training threshold out of bounds."
+        end
+        @training_threshold = training_threshold.to_f
+        # Train a cluster model
+        params = {:dataset_uri => @training_dataset_uri, :threshold => @training_threshold }
+        @cluster_model_uri = OpenTox::RestClientWrapper.post cluster_service_uri, params
+        cluster_model_rdf = OpenTox::RestClientWrapper.get @cluster_model_uri
+        @datasets = OpenTox::Parser::Owl.from_rdf cluster_model_rdf, OT.Dataset, true # must extract OT.Datasets from model
+        # Process parsed OWL objects
+        @clusterid_dataset_map = Hash.new
+        @datasets.each { |d|
+          begin
+            d.metadata[OT.hasSource]["Structural Clustering cluster "] = "" # must parse in metadata for string (not elegant)
+            @clusterid_dataset_map[d.metadata[OT.hasSource].to_i] = d.uri
+          rescue Exception => e
+            # ignore other entries!
+          end
+        }
+      end
+      # Whether a model has been trained
+      def trained?
+        !@cluster_model_uri.nil?
+      end
+      # Instance query: clusters for a compound
+      # @params[String] Query compound
+      # @params[Float]  Similarity threshold for query to clusters (optional)
+      def get_clusters query_compound_uri, query_threshold = 0.5
+        if !OpenTox::Algorithm.numeric? query_threshold || query_threshold <0 || query_threshold >1
+          raise "Query threshold out of bounds."
+        end
+        @query_threshold = query_threshold.to_f
+        # Preparing a query dataset
+        query_dataset = OpenTox::Dataset.new
+        @query_dataset_uri = query_dataset.save
+        query_dataset = OpenTox::Dataset.find @query_dataset_uri
+        query_dataset.add_compound query_compound_uri
+        @query_dataset_uri = query_dataset.save
+        # Obtaining a clustering for query compound
+        params = { :dataset_uri => @query_dataset_uri, :threshold => @query_threshold }
+        cluster_query_dataset_uri = OpenTox::RestClientWrapper.post @cluster_model_uri, params
+        cluster_query_dataset = OpenTox::Dataset.new cluster_query_dataset_uri
+        cluster_query_dataset.load_all
+        # Reading cluster ids for features from metadata
+        feature_clusterid_map = Hash.new
+        pattern="Prediction feature for cluster assignment " # must parse for string in metadata (not elegant)
+        cluster_query_dataset.features.each { |feature_uri,metadata|
+          metadata[DC.title][pattern]=""
+          feature_clusterid_map[feature_uri] = metadata[DC.title].to_i
+        }
+        # Integrity check
+        unless cluster_query_dataset.compounds.size == 1
+          raise "Number of predicted compounds is != 1."
+        end
+        # Process data entry
+        query_compound_uri = cluster_query_dataset.compounds[0]
+        @target_clusters_array = Array.new
+        cluster_query_dataset.features.keys.each { |cluster_membership_feature|
+          # Getting dataset URI for cluster
+          target_cluster = feature_clusterid_map[cluster_membership_feature]
+          dataset = @clusterid_dataset_map[target_cluster]
+          # Finally look up presence
+          data_entry = cluster_query_dataset.data_entries[query_compound_uri]
+          present = data_entry[cluster_membership_feature][0]
+          # Store result
+          @target_clusters_array << dataset if present > 0.5 # 0.0 for absence, 1.0 for presence
+        }
+      end
+    end
     module Neighbors
+      # Local multi-linear regression (MLR) prediction from neighbors.
+      # Uses propositionalized setting.
+      # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
+      # @return [Numeric] A prediction value.
+      def self.local_mlr_prop(params)
+        confidence=0.0
+        prediction=nil
+        if params[:neighbors].size>0
+          props = params[:prop_kernel] ? get_props(params) : nil
+          acts = params[:neighbors].collect { |n| act = n[:activity].to_f }
+          sims = params[:neighbors].collect { |n| Algorithm.gauss(n[:similarity]) }
+          LOGGER.debug "Local MLR (Propositionalization / GSL)."
+          prediction = mlr( {:n_prop => props[0], :q_prop => props[1], :sims => sims, :acts => acts} )
+          transformer = eval("OpenTox::Algorithm::Transform::#{params[:transform]["class"]}.new ([#{prediction}], #{params[:transform]["offset"]})")
+          prediction = transformer.values[0]
+          prediction = nil if prediction.infinite? || params[:prediction_min_max][1] < prediction || params[:prediction_min_max][0] > prediction
+          LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
+          params[:conf_stdev] = false if params[:conf_stdev].nil?
+          confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]})
+          confidence = nil if prediction.nil?
+        end
+        {:prediction => prediction, :confidence => confidence}
+      end
+      # Multi-linear regression weighted by similarity.
+      # Objective Feature Selection, Principal Components Analysis, Scaling of Axes.
+      # @param [Hash] params Keys `:n_prop, :q_prop, :sims, :acts` are required
+      # @return [Numeric] A prediction value.
+      def self.mlr(params)
+        # GSL matrix operations:
+        # to_a : row-wise conversion to nested array
+        #
+        # Statsample operations (build on GSL):
+        # to_scale: convert into Statsample format
+        begin
+          n_prop = params[:n_prop].collect { |v| v }
+          q_prop = params[:q_prop].collect { |v| v }
+          n_prop << q_prop # attach q_prop
+          nr_cases, nr_features = get_sizes n_prop
+          data_matrix = GSL::Matrix.alloc(n_prop.flatten, nr_cases, nr_features)
+          # Principal Components Analysis
+          LOGGER.debug "PCA..."
+          pca = OpenTox::Algorithm::Transform::PCA.new(data_matrix)
+          data_matrix = pca.data_transformed_matrix
+          # Attach intercept column to data
+          intercept = GSL::Matrix.alloc(Array.new(nr_cases,1.0),nr_cases,1)
+          data_matrix = data_matrix.horzcat(intercept)
+          (0..data_matrix.size2-2).each { |i|
+            autoscaler = OpenTox::Algorithm::Transform::AutoScale.new(data_matrix.col(i))
+            data_matrix.col(i)[0..data_matrix.size1-1] = autoscaler.scaled_values
+          }
+          # Detach query instance
+          n_prop = data_matrix.to_a
+          q_prop = n_prop.pop
+          nr_cases, nr_features = get_sizes n_prop
+          data_matrix = GSL::Matrix.alloc(n_prop.flatten, nr_cases, nr_features)
+          # model + support vectors
+          LOGGER.debug "Creating MLR model ..."
+          c, cov, chisq, status = GSL::MultiFit::wlinear(data_matrix, params[:sims].to_scale.to_gsl, params[:acts].to_scale.to_gsl)
+          GSL::MultiFit::linear_est(q_prop.to_scale.to_gsl, c, cov)[0]
+        rescue Exception => e
+          LOGGER.debug "#{e.class}: #{e.message}"
+        end
+      end
       # Classification with majority vote from neighbors weighted by similarity
-      # @param [Array] neighbors, each neighbor is a hash with keys `:similarity, :activity`
-      # @param [optional] params Ignored (only for compatibility with local_svm_regression)
-      # @return [Hash] Hash with keys `:prediction, :confidence`
-      def self.weighted_majority_vote(neighbors,params={})
-        conf = 0.0
+      # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
+      # @return [Numeric] A prediction value.
+      def self.weighted_majority_vote(params)
+        neighbor_contribution = 0.0
+        confidence_sum = 0.0
         confidence = 0.0
-        neighbors.each do |neighbor|
-          case neighbor[:activity].to_s
-          when 'true'
-            conf += Algorithm.gauss(neighbor[:similarity])
-          when 'false'
-            conf -= Algorithm.gauss(neighbor[:similarity])
+        prediction = nil
+        params[:neighbors].each do |neighbor|
+          neighbor_weight = Algorithm.gauss(neighbor[:similarity]).to_f
+          neighbor_contribution += neighbor[:activity].to_f * neighbor_weight
+          if params[:value_map].size == 2 # AM: provide compat to binary classification: 1=>false 2=>true
+            case neighbor[:activity]
+            when 1
+              confidence_sum -= neighbor_weight
+            when 2
+              confidence_sum += neighbor_weight
+            end
+          else
+            confidence_sum += neighbor_weight
           end
         end
-        if conf > 0.0
-          prediction = true
-        elsif conf < 0.0
-          prediction = false
-        else
-          prediction = nil
-        end
-        confidence = conf/neighbors.size if neighbors.size > 0
-        {:prediction => prediction, :confidence => confidence.abs}
+        if params[:value_map].size == 2
+          if confidence_sum >= 0.0
+            prediction = 2 unless params[:neighbors].size==0
+          elsif confidence_sum < 0.0
+            prediction = 1 unless params[:neighbors].size==0
+          end
+        else
+          prediction = (neighbor_contribution/confidence_sum).round  unless params[:neighbors].size==0  # AM: new multinomial prediction
+        end
+        LOGGER.debug "Prediction is: '" + prediction.to_s + "'." unless prediction.nil?
+        confidence = confidence_sum/params[:neighbors].size if params[:neighbors].size > 0
+        LOGGER.debug "Confidence is: '" + confidence.to_s + "'." unless prediction.nil?
+        return {:prediction => prediction, :confidence => confidence.abs}
       end
       # Local support vector regression from neighbors
-      # @param [Array] neighbors, each neighbor is a hash with keys `:similarity, :activity, :features`
-      # @param [Hash] params Keys `:similarity_algorithm,:p_values` are required
-      # @return [Hash] Hash with keys `:prediction, :confidence`
-      def self.local_svm_regression(neighbors,params )
-        sims = neighbors.collect{ |n| Algorithm.gauss(n[:similarity]) } # similarity values between query and neighbors
-        conf = sims.inject{|sum,x| sum + x }
-        # AM: Control log taking
-        take_logs=true
-        neighbors.each do |n|
-          if (! n[:activity].nil?) && (n[:activity].to_f < 0.0)
-            take_logs = false
-          end
+      # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
+      # @return [Numeric] A prediction value.
+      def self.local_svm_regression(params)
+        confidence = 0.0
+        prediction = nil
+        if params[:neighbors].size>0
+          props = params[:prop_kernel] ? get_props(params) : nil
+          acts = params[:neighbors].collect{ |n| n[:activity].to_f }
+          sims = params[:neighbors].collect{ |n| Algorithm.gauss(n[:similarity]) }
+          prediction = props.nil? ? local_svm(acts, sims, "nu-svr", params) : local_svm_prop(props, acts, "nu-svr")
+          transformer = eval("OpenTox::Algorithm::Transform::#{params[:transform]["class"]}.new ([#{prediction}], #{params[:transform]["offset"]})")
+          prediction = transformer.values[0]
+          prediction = nil if prediction.infinite? || params[:prediction_min_max][1] < prediction || params[:prediction_min_max][0] > prediction
+          LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
+          params[:conf_stdev] = false if params[:conf_stdev].nil?
+          confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]})
+          confidence = nil if prediction.nil?
         end
-        acts = neighbors.collect do |n|
-          act = n[:activity]
-          take_logs ? Math.log10(act.to_f) : act.to_f
-        end # activities of neighbors for supervised learning
+        {:prediction => prediction, :confidence => confidence}
+      end
+      # Local support vector classification from neighbors
+      # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
+      # @return [Numeric] A prediction value.
+      def self.local_svm_classification(params)
-        neighbor_matches = neighbors.collect{ |n| n[:features] } # as in classification: URIs of matches
+        confidence = 0.0
+        prediction = nil
+        if params[:neighbors].size>0
+          props = params[:prop_kernel] ? get_props(params) : nil
+          acts = params[:neighbors].collect { |n| act = n[:activity] }
+          sims = params[:neighbors].collect{ |n| Algorithm.gauss(n[:similarity]) } # similarity values btwn q and nbors
+          prediction = props.nil? ? local_svm(acts, sims, "C-bsvc", params) : local_svm_prop(props, acts, "C-bsvc")
+          LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
+          params[:conf_stdev] = false if params[:conf_stdev].nil?
+          confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]})
+        end
+        {:prediction => prediction, :confidence => confidence}
+      end
+      # Local support vector prediction from neighbors.
+      # Uses pre-defined Kernel Matrix.
+      # Not to be called directly (use local_svm_regression or local_svm_classification).
+      # @param [Array] acts, activities for neighbors.
+      # @param [Array] sims, similarities for neighbors.
+      # @param [String] type, one of "nu-svr" (regression) or "C-bsvc" (classification).
+      # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
+      # @return [Numeric] A prediction value.
+      def self.local_svm(acts, sims, type, params)
+        LOGGER.debug "Local SVM (Weighted Tanimoto Kernel)."
+        neighbor_matches = params[:neighbors].collect{ |n| n[:features] } # URIs of matches
         gram_matrix = [] # square matrix of similarities between neighbors; implements weighted tanimoto kernel
-        if neighbor_matches.size == 0
-          raise "No neighbors found"
+        prediction = nil
+        if Algorithm::zero_variance? acts
+          prediction = acts[0]
         else
           # gram matrix
           (0..(neighbor_matches.length-1)).each do |i|
+            neighbor_i_hits = params[:fingerprints][params[:neighbors][i][:compound]]
             gram_matrix[i] = [] unless gram_matrix[i]
             # upper triangle
             ((i+1)..(neighbor_matches.length-1)).each do |j|
-              sim = eval("#{params[:similarity_algorithm]}(neighbor_matches[i], neighbor_matches[j], params[:p_values])")
+              neighbor_j_hits= params[:fingerprints][params[:neighbors][j][:compound]]
+              sim_params = {}
+              if params[:nr_hits]
+                sim_params[:nr_hits] = true
+                sim_params[:compound_features_hits] = neighbor_i_hits
+                sim_params[:training_compound_features_hits] = neighbor_j_hits
+              end
+              sim = eval("#{params[:similarity_algorithm]}(neighbor_matches[i], neighbor_matches[j], params[:p_values], sim_params)")
               gram_matrix[i][j] = Algorithm.gauss(sim)
               gram_matrix[j] = [] unless gram_matrix[j]
               gram_matrix[j][i] = gram_matrix[i][j] # lower triangle
@@ -198,6 +512,7 @@ module OpenTox
             gram_matrix[i][i] = 1.0
           end
           #LOGGER.debug gram_matrix.to_yaml
           @r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests
           @r.eval "library('kernlab')" # this requires R package "kernlab" to be installed
@@ -208,27 +523,171 @@ module OpenTox
           @r.y = acts
           @r.sims = sims
-          LOGGER.debug "Preparing R data ..."
-          # prepare data
-          @r.eval "y<-as.vector(y)"
-          @r.eval "gram_matrix<-as.kernelMatrix(matrix(gram_matrix,n,n))"
-          @r.eval "sims<-as.vector(sims)"
-          # model + support vectors
-          LOGGER.debug "Creating SVM model ..."
-          @r.eval "model<-ksvm(gram_matrix, y, kernel=matrix, type=\"nu-svr\", nu=0.8)"
-          @r.eval "sv<-as.vector(SVindex(model))"
-          @r.eval "sims<-sims[sv]"
-          @r.eval "sims<-as.kernelMatrix(matrix(sims,1))"
-          LOGGER.debug "Predicting ..."
-          @r.eval "p<-predict(model,sims)[1,1]"
-          prediction = 10**(@r.p.to_f) if take_logs
-          LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
-          @r.quit # free R
+          begin
+            LOGGER.debug "Preparing R data ..."
+            # prepare data
+            @r.eval "y<-as.vector(y)"
+            @r.eval "gram_matrix<-as.kernelMatrix(matrix(gram_matrix,n,n))"
+            @r.eval "sims<-as.vector(sims)"
+            # model + support vectors
+            LOGGER.debug "Creating SVM model ..."
+            @r.eval "model<-ksvm(gram_matrix, y, kernel=matrix, type=\"#{type}\", nu=0.5)"
+            @r.eval "sv<-as.vector(SVindex(model))"
+            @r.eval "sims<-sims[sv]"
+            @r.eval "sims<-as.kernelMatrix(matrix(sims,1))"
+            LOGGER.debug "Predicting ..."
+            if type == "nu-svr"
+              @r.eval "p<-predict(model,sims)[1,1]"
+            elsif type == "C-bsvc"
+              @r.eval "p<-predict(model,sims)"
+            end
+            if type == "nu-svr"
+              prediction = @r.p
+            elsif type == "C-bsvc"
+              #prediction = (@r.p.to_f == 1.0 ? true : false)
+              prediction = @r.p
+            end
+            @r.quit # free R
+          rescue Exception => e
+            LOGGER.debug "#{e.class}: #{e.message}"
+            LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+          end
         end
-        confidence = conf/neighbors.size if neighbors.size > 0
-        {:prediction => prediction, :confidence => confidence}
+        prediction
+      end
+      # Local support vector prediction from neighbors.
+      # Uses propositionalized setting.
+      # Not to be called directly (use local_svm_regression or local_svm_classification).
+      # @param [Array] props Propositionalization of neighbors and query structure, in the order [ two-nested-Arrays_for_n, Array_for_q ].
+      # @param [Array] acts Activities for neighbors.
+      # @param [String] type One of "nu-svr" (regression) or "C-bsvc" (classification).
+      # @return [Numeric, nil] The prediction as delivered by R; acts[0] if the activities have zero variance;
+      #   nil if the R computation failed or type is neither of the two values above.
+      def self.local_svm_prop(props, acts, type)
+        LOGGER.debug "Local SVM (Propositionalization / Kernlab Kernel)."
+        n_prop = props[0] # is a matrix, i.e. two nested Arrays.
+        q_prop = props[1] # is an Array.
+        # Zero variance in the activities: no model is built, the first activity is the prediction.
+        return acts[0] if Algorithm::zero_variance? acts
+        prediction = nil
+        @r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests
+        begin
+          @r.eval "library('kernlab')" # this requires R package "kernlab" to be installed
+          LOGGER.debug "Setting R data ..."
+          # set data
+          @r.n_prop = n_prop.flatten
+          @r.n_prop_x_size = n_prop.size
+          @r.n_prop_y_size = n_prop[0].size
+          @r.y = acts
+          @r.q_prop = q_prop
+          begin
+            LOGGER.debug "Preparing R data ..."
+            # prepare data
+            @r.eval "y<-matrix(y)"
+            @r.eval "prop_matrix<-matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=TRUE)"
+            @r.eval "q_prop<-matrix(q_prop, 1, n_prop_y_size, byrow=TRUE)"
+            # model + support vectors
+            LOGGER.debug "Creating SVM model ..."
+            @r.eval "model<-ksvm(prop_matrix, y, type=\"#{type}\", nu=0.5)"
+            LOGGER.debug "Predicting ..."
+            case type
+            when "nu-svr"
+              @r.eval "p<-predict(model,q_prop)[1,1]"
+              prediction = @r.p
+            when "C-bsvc"
+              @r.eval "p<-predict(model,q_prop)"
+              prediction = @r.p # handed back as delivered by R, not converted to true/false
+            end
+          rescue Exception => e
+            # Model building / prediction is best effort: log and fall through with prediction == nil.
+            LOGGER.debug "#{e.class}: #{e.message}"
+            LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+          end
+        ensure
+          # Free R on every path, including failures while loading kernlab or transferring data.
+          begin
+            @r.quit
+          rescue Exception => e
+            LOGGER.debug "#{e.class}: #{e.message}"
+            LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+          end
+        end
+        prediction
+      end
+      # Get confidence for regression.
+      # If :conf_stdev is set: median similarity multiplied by exp(-sd), where sd is the sample standard
+      # deviation of the neighbor activities (absolute value is taken).
+      # Otherwise: sum of the similarities divided by the number of neighbors.
+      # @param [Hash] params Required keys: :sims, :acts, :neighbors, :conf_stdev
+      # @return [Float, nil] Confidence; nil if it cannot be computed (no median, NaN result, no neighbors or no similarities).
+      def self.get_confidence(params)
+        confidence = nil
+        if params[:conf_stdev]
+          sim_median = params[:sims].to_scale.median
+          unless sim_median.nil?
+            standard_deviation = params[:acts].to_scale.standard_deviation_sample
+            confidence = (sim_median*Math.exp(-1*standard_deviation)).abs
+            confidence = nil if confidence.nan?
+          end
+        elsif params[:neighbors].size > 0
+          # inject without a seed yields nil for an empty list; guard instead of dividing nil.
+          conf = params[:sims].inject{|sum,x| sum + x }
+          confidence = conf/params[:neighbors].size unless conf.nil?
+        end
+        LOGGER.debug "Confidence is: '" + confidence.to_s + "'."
+        return confidence
+      end
+      # Get X and Y size of a nested Array (Matrix).
+      # @param [Array] matrix Nested Array; the second size is taken from its first row.
+      # @return [Array] [ number of rows, size of the first row ]; an entry stays nil if it could not be determined (the error is logged).
+      def self.get_sizes(matrix)
+        sizes = [ nil, nil ]
+        begin
+          sizes[0] = matrix.size
+          sizes[1] = matrix[0].size
+        rescue Exception => err
+          LOGGER.debug "#{err.class}: #{err.message}"
+          LOGGER.debug "Backtrace:\n\t#{err.backtrace.join("\n\t")}"
+        end
+        sizes
+      end
+      # Calculate the propositionalization matrix aka instantiation matrix (one row per neighbor, one entry per feature)
+      # and the corresponding vector describing the query compound.
+      # Neighbor entries: p-value times fingerprint value if the neighbor's fingerprint includes the feature, else 0.0.
+      # Query entries: 0.0 if the feature does not match; otherwise the p-value, multiplied by the hit count if :nr_hits is set.
+      # @param [Hash] params Keys read: :neighbors (Array of Hashes with :compound), :compound (query compound),
+      #   :features (dataset features), :fingerprints (fingerprints of neighbors), :p_values (p-values of features), :nr_hits.
+      # @return [Array] [ matrix, row ]. On failure the error is logged and whatever was built so far is returned
+      #   (row is nil if the query vector was not started).
+      def self.get_props(params)
+        matrix = Array.new
+        query_row = nil
+        begin
+          params[:neighbors].each do |neighbor|
+            fingerprint = params[:fingerprints][neighbor[:compound]]
+            neighbor_row = params[:features].collect do |f|
+              if !fingerprint.nil? && fingerprint.include?(f)
+                params[:p_values][f] * fingerprint[f]
+              else
+                0.0
+              end
+            end
+            matrix << neighbor_row
+          end
+          query_row = []
+          params[:features].each do |f|
+            if params[:nr_hits]
+              compound_feature_hits = params[:compound].match_hits([f])
+              query_row << (compound_feature_hits.size == 0 ? 0.0 : (params[:p_values][f] * compound_feature_hits[f]))
+            else
+              query_row << (params[:compound].match([f]).size == 0 ? 0.0 : params[:p_values][f])
+            end
+          end
+        rescue Exception => e
+          # Interpolate the message: concatenating the exception object to a String raises TypeError.
+          LOGGER.debug "get_props failed with '#{e.message}'"
+        end
+        [ matrix, query_row ]
       end
     end
@@ -250,6 +709,195 @@ module OpenTox
       # Placeholder with an empty body: accepts a dataset URI and a compound URI and returns nil.
       def features(dataset_uri,compound_uri)
       end
     end
+    module Transform
+      include Algorithm
+      # The transformer that inverts values.
+      # 1/x is used, after values have been moved >= 1.
+      class Inverter
+        attr_accessor :offset, :values
+        # With one argument the values are transformed (and the offset needed for restoring is stored);
+        # with two arguments the transformed values are restored using the given offset.
+        # Errors during the transformation are logged, not raised.
+        # @params[Array] Values to transform / restore.
+        # @params[Float] Offset for restore.
+        def initialize *args
+          case args.size
+          when 1
+            begin
+              values=args[0]
+              # Check the local argument: @values is still unassigned at this point.
+              raise "Cannot transform, values empty." if values.size==0
+              @values = values.collect { |v| -1.0 * v }
+              @offset = 1.0 - @values.minmax[0]
+              @offset = -1.0 * @offset if @offset>0.0
+              @values.collect! { |v| v - @offset }   # slide >1
+              @values.collect! { |v| 1 / v }         # invert to [0,1]
+            rescue Exception => e
+              LOGGER.debug "#{e.class}: #{e.message}"
+              LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+            end
+          when 2
+            # Reverse of the steps above, in opposite order: un-invert, slide back, negate.
+            @offset = args[1].to_f
+            @values = args[0].collect { |v| 1 / v }
+            @values.collect! { |v| v + @offset }
+            @values.collect! { |v| -1.0 * v }
+          end
+        end
+      end
+      # The transformer that takes logs.
+      # Log10 is used, after values have been moved > 0.
+      class Log10
+        attr_accessor :offset, :values
+        # One argument: transform the values and remember the offset.
+        # Two arguments: restore transformed values with the given offset.
+        # Errors during the transformation are logged, not raised.
+        # @params[Array] Values to transform / restore.
+        # @params[Float] Offset for restore.
+        def initialize *args
+          @distance_to_zero = 1.0e-9 # 1 / 1 billion
+          if args.size == 1
+            begin
+              raw = args[0]
+              raise "Cannot transform, values empty." if raw.size==0
+              @offset = raw.minmax.first
+              @offset = -1.0 * @offset if @offset>0.0
+              @values = raw.map { |x| x - @offset }         # slide > anchor
+              @values.map! { |x| x + @distance_to_zero }    # keep strictly away from zero
+              @values.map! { |x| Math.log10(x) }            # log10 (can fail)
+            rescue Exception => err
+              LOGGER.debug "#{err.class}: #{err.message}"
+              LOGGER.debug "Backtrace:\n\t#{err.backtrace.join("\n\t")}"
+            end
+          elsif args.size == 2
+            # Undo the three steps in reverse order.
+            @offset = args[1].to_f
+            @values = args[0].map { |x| 10**x }
+            @values.map! { |x| x - @distance_to_zero }
+            @values.map! { |x| x + @offset }
+          end
+        end
+      end
+      # The transformer that does nothing (No OPeration).
+      class NOP
+        attr_accessor :offset, :values
+        # Stores the first argument unchanged when called with one or two arguments; the offset is always 0.0.
+        # @params[Array] Values to transform / restore.
+        # @params[Float] Offset for restore (not used).
+        def initialize *args
+          @offset = 0.0
+          @distance_to_zero = 0.0
+          @values = args[0] if [1, 2].include?(args.size)
+        end
+      end
+      # Auto-Scaler for Arrays
+      # Center on mean and divide by standard deviation
+      class AutoScale
+        attr_accessor :scaled_values, :mean, :stdev
+        # Computes mean and sample standard deviation of the values and stores the centered values,
+        # divided by the standard deviation unless that is 0.0.
+        # @params[Array] Values to transform.
+        def initialize values
+          vector = values.to_scale
+          @mean = vector.mean
+          @stdev = vector.standard_deviation_sample
+          @scaled_values = values.collect { |value| value - @mean }
+          @scaled_values.collect! { |value| value / @stdev } if @stdev != 0.0
+        end
+      end
+      # Principal Components Analysis
+      # Statsample Library (http://ruby-statsample.rubyforge.org/) by C. Bustos
+      # Columns with zero variance are dropped, the remaining columns are auto-scaled,
+      # and the scaled data is projected onto a selection of eigenvectors of its correlation matrix.
+      class PCA
+        attr_accessor :data_matrix, :data_transformed_matrix, :eigenvector_matrix, :eigenvalue_sums, :autoscaler
+        # Creates a transformed dataset as GSL::Matrix (available through data_transformed_matrix).
+        # Any error, including the ones raised explicitly below, is logged and not re-raised,
+        # so the result attributes may be left unset.
+        # @param [GSL::Matrix] Data matrix.
+        # @param [Float] Compression ratio from [0,1].
+        # @return [GSL::Matrix] Data transformed matrix.
+        def initialize data_matrix, compression=0.05
+          begin
+            @data_matrix = data_matrix
+            @compression = compression.to_f
+            @stdev = Array.new
+            @mean = Array.new
+            # Objective Feature Selection
+            raise "Error! PCA needs at least two dimensions." if data_matrix.size2 < 2
+            # Collect the columns that do not have zero variance, appending one column at a time.
+            @data_matrix_selected = nil
+            (0..@data_matrix.size2-1).each { |i|
+              if !Algorithm::zero_variance?(@data_matrix.col(i).to_a)
+                if @data_matrix_selected.nil?
+                  @data_matrix_selected = GSL::Matrix.alloc(@data_matrix.size1, 1)
+                  @data_matrix_selected.col(0)[0..@data_matrix.size1-1] = @data_matrix.col(i)
+                else
+                  @data_matrix_selected = @data_matrix_selected.horzcat(GSL::Matrix.alloc(@data_matrix.col(i).to_a,@data_matrix.size1, 1))
+                end
+              end
+            }
+            raise "Error! PCA needs at least two dimensions." if (@data_matrix_selected.nil? || @data_matrix_selected.size2 < 2)
+            # Scaling of Axes
+            # Each selected column is auto-scaled; its stdev and mean are recorded for restore.
+            # @autoscaler is reassigned per column, so afterwards it refers to the last column's scaler.
+            @data_matrix_scaled = GSL::Matrix.alloc(@data_matrix_selected.size1, @data_matrix_selected.size2)
+            (0..@data_matrix_selected.size2-1).each { |i|
+              @autoscaler = OpenTox::Algorithm::Transform::AutoScale.new(@data_matrix_selected.col(i))
+              @data_matrix_scaled.col(i)[0..@data_matrix.size1-1] = @autoscaler.scaled_values
+              @stdev << @autoscaler.stdev
+              @mean << @autoscaler.mean
+            }
+            # Convert the scaled columns into a Statsample dataset keyed by column index.
+            data_matrix_hash = Hash.new
+            (0..@data_matrix_scaled.size2-1).each { |i|
+              column_view = @data_matrix_scaled.col(i)
+              data_matrix_hash[i] = column_view.to_scale
+            }
+            dataset_hash = data_matrix_hash.to_dataset # see http://goo.gl/7XcW9
+            cor_matrix=Statsample::Bivariate.correlation_matrix(dataset_hash)
+            pca=Statsample::Factor::PCA.new(cor_matrix)
+            # A NaN eigenvalue counts as failure.
+            pca.eigenvalues.each { |ev| raise "PCA failed!" unless !ev.nan? }
+            # Cumulative sums of the eigenvalues: entry i is the sum of eigenvalues 0..i.
+            @eigenvalue_sums = Array.new
+            (0..dataset_hash.fields.size-1).each { |i|
+              @eigenvalue_sums << pca.eigenvalues[0..i].inject{ |sum, ev| sum + ev }
+            }
+            # Keep eigenvector i if the cumulative sum up to i does not exceed
+            # (1 - compression) * number of fields, or if nothing has been kept yet (at least one is kept).
+            eigenvectors_selected = Array.new
+            pca.eigenvectors.each_with_index { |ev, i|
+              if (@eigenvalue_sums[i] <= ((1.0-@compression)*dataset_hash.fields.size)) || (eigenvectors_selected.size == 0)
+                eigenvectors_selected << ev.to_a
+              end
+            }
+            # Kept eigenvectors become the columns of @eigenvector_matrix; the data is projected onto them.
+            @eigenvector_matrix = GSL::Matrix.alloc(eigenvectors_selected.flatten, eigenvectors_selected.size, dataset_hash.fields.size).transpose
+            dataset_matrix = dataset_hash.to_gsl.transpose
+            @data_transformed_matrix = (@eigenvector_matrix.transpose * dataset_matrix).transpose
+          rescue Exception => e
+              LOGGER.debug "#{e.class}: #{e.message}"
+              LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+          end
+        end
+        # Restores data in the original feature space (possibly with compression loss).
+        # Errors are logged and not re-raised.
+        # NOTE(review): @stdev / @mean hold one entry per selected column, so columns dropped as
+        # zero-variance during construction are presumably not part of the restored matrix; confirm.
+        # @return [GSL::Matrix] Data matrix.
+        def restore
+          begin
+            data_matrix_restored = (@eigenvector_matrix * @data_transformed_matrix.transpose).transpose # reverse pca
+            # reverse scaling
+            (0..data_matrix_restored.size2-1).each { |i|
+              data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] *= @stdev[i] unless @stdev[i] == 0.0
+              data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] += @mean[i]
+            }
+            data_matrix_restored
+          rescue Exception => e
+            LOGGER.debug "#{e.class}: #{e.message}"
+            LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+          end
+        end
+      end
+    end
     # Gauss kernel
     # @return [Float]
@@ -257,16 +905,85 @@ module OpenTox
       d = 1.0 - x.to_f
       Math.exp(-(d*d)/(2*sigma*sigma))
     end
+    # For symbolic features
+    # @param [Array] Array to test, must indicate non-occurrence with 0.
+    # @return [Boolean] Whether the feature is singular or non-occurring or present everywhere.
+    def self.isnull_or_singular?(array)
+      zero_count = array.count(0)
+      removable_counts = [
+        array.size,     # non-occurring feature: only zeroes
+        array.size - 1, # singular feature: exactly one non-zero entry
+        0               # feature present everywhere: no zero at all
+      ]
+      removable_counts.include?(zero_count)
+    end
+    # Numeric value test
+    # @param[Object] value
+    # @return [Boolean] Whether value is a number, i.e. whether Float(value) succeeds
+    def self.numeric?(value)
+      begin
+        Float(value)
+        true
+      rescue StandardError
+        false
+      end
+    end
+    # Zero variance test
+    # @param [Array] Array to test.
+    # @return [Boolean] Whether the sample variance of the array is 0.0.
+    def self.zero_variance?(array)
+      sample_variance = array.to_scale.variance_sample
+      sample_variance == 0.0
+    end
-    # Median of an array
+    # Sum of an array for Arrays.
     # @param [Array] Array with values
-    # @return [Float] Median
-    def self.median(array)
-      return nil if array.empty?
-      array.sort!
-      m_pos = array.size / 2
-      return array.size % 2 == 1 ? array[m_pos] : (array[m_pos-1] + array[m_pos])/2
+    # @return [Integer] Sum of size of values
+    def self.sum_size(array)
+      # Add up the size of every element; 0 for an empty array.
+      array.inject(0) { |total, element| total + element.size }
+    end
+    # Minimum Frequency
+    # Scales the per-mil value by the number of compounds in the training dataset; the result is at least 2.
+    # @param [Object] training dataset (its compounds.size is used)
+    # @param [Integer] per-mil value
+    # @return [Integer] min-frequency
+    def self.min_frequency(training_dataset,per_mil)
+      # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
+      scaled = per_mil * training_dataset.compounds.size.to_f / 1000.0
+      scaled > 2 ? Integer(scaled) : 2
     end
+    # Effect calculation for classification
+    # For every class the actual share of occurrences is compared with the expected share of database instances;
+    # the class with the largest relative excess (actual - expected) / actual wins.
+    # @param [Array] Array of occurrences per class in the form of Enumerables.
+    # @param [Array] Array of database instance counts per class.
+    # @return [Integer] Index of the winning class; 0 if no class exceeds its expected share.
+    def self.effect(occurrences, db_instances)
+      best_class = 0
+      best_gain = 0 # running maximum of the relative excess
+      total_occurrences = self.sum_size(occurrences)
+      total_instances = db_instances.to_scale.sum
+      # fminer outputs occurrences sorted reverse by activity.
+      occurrences.each_with_index do |class_occurrences, class_index|
+        actual = class_occurrences.size.to_f/total_occurrences
+        expected = db_instances[class_index].to_f/total_instances
+        next unless actual > expected
+        gain = (actual - expected) / actual
+        next unless gain > best_gain
+        best_gain = gain
+        best_class = class_index
+      end
+      best_class
+    end
+    # Returns Support value of a fingerprint
+    # For every feature, the gauss-weighted feature weight is multiplied by the result of calling :mode
+    # on the non-nil hit counts of query and training compound; the products are summed.
+    # @param [Hash] params Keys read: `:features, :weights, :compound_features_hits, :training_compound_features_hits, :mode`
+    # @return [Numeric] Support value
+    def self.p_sum_support(params)
+      # NOTE(review): params[:mode] is interpolated into an eval'ed string; it must never come from untrusted input.
+      params[:features].inject(0.0) do |p_sum, f|
+        compound_hits = params[:compound_features_hits][f]
+        neighbor_hits = params[:training_compound_features_hits][f]
+        p_sum + eval("(Algorithm.gauss(params[:weights][f]) * ([compound_hits, neighbor_hits].compact.#{params[:mode]}))")
+      end
+    end
   end
 end