opentox-ruby 3.1.0 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +19 -9
- data/README.markdown +1 -1
- data/Rakefile +2 -1
- data/VERSION +1 -1
- data/lib/algorithm.rb +143 -37
- data/lib/compound.rb +66 -18
- data/lib/dataset.rb +38 -3
- data/lib/model.rb +36 -13
- data/lib/parser.rb +34 -19
- data/lib/r-util.rb +93 -34
- data/lib/serializer.rb +70 -22
- data/lib/stratification.R +71 -7
- data/lib/transform.rb +5 -3
- data/lib/utils.rb +356 -97
- data/lib/validation.rb +6 -4
- metadata +20 -4
data/lib/dataset.rb
CHANGED
@@ -197,7 +197,12 @@ module OpenTox
|
|
197
197
|
accept_values
|
198
198
|
end
|
199
199
|
|
200
|
-
# Detect feature type(
|
200
|
+
# Detect feature type (reduced to one across all features)
|
201
|
+
# Classification takes precedence over regression
|
202
|
+
# DEPRECATED --
|
203
|
+
# HAS NO SENSE FOR DATASETS WITH MORE THAN 1 FEATURE
|
204
|
+
# FEATURES CAN HAVE MULTIPLE TYPES
|
205
|
+
# Replacement: see feature_types()
|
201
206
|
# @return [String] "classification", "regression", "mixed" or "unknown"
|
202
207
|
def feature_type(subjectid=nil)
|
203
208
|
load_features(subjectid)
|
@@ -210,6 +215,24 @@ module OpenTox
|
|
210
215
|
"unknown"
|
211
216
|
end
|
212
217
|
end
|
218
|
+
|
219
|
+
|
220
|
+
# Detect feature types. A feature can have multiple types.
|
221
|
+
# Returns types hashed by feature URI, with missing features omitted.
|
222
|
+
# Example (YAML):
|
223
|
+
# http://toxcreate3.in-silico.ch:8082/dataset/152/feature/nHal:
|
224
|
+
# - http://www.opentox.org/api/1.1#NumericFeature
|
225
|
+
# - http://www.opentox.org/api/1.1#NominalFeature
|
226
|
+
# ...
|
227
|
+
#
|
228
|
+
# @return [Hash] Keys: feature URIs, Values: Array of types
|
229
|
+
def feature_types(subjectid=nil)
|
230
|
+
load_features(subjectid)
|
231
|
+
@features.inject({}){ |h,(f,metadata)|
|
232
|
+
h[f]=metadata[RDF.type] unless metadata[RDF.type][0].include? "MissingFeature"
|
233
|
+
h
|
234
|
+
}
|
235
|
+
end
|
213
236
|
=begin
|
214
237
|
=end
|
215
238
|
|
@@ -316,11 +339,14 @@ module OpenTox
|
|
316
339
|
end
|
317
340
|
|
318
341
|
# Complete feature values by adding zeroes
|
319
|
-
|
342
|
+
# @param [Hash] key: compound, value: duplicate sizes
|
343
|
+
def complete_data_entries(compound_sizes)
|
320
344
|
all_features = @features.keys
|
321
345
|
@data_entries.each { |c, e|
|
322
346
|
(Set.new(all_features.collect)).subtract(Set.new e.keys).to_a.each { |f|
|
323
|
-
|
347
|
+
compound_sizes[c].times {
|
348
|
+
self.add(c,f,0)
|
349
|
+
}
|
324
350
|
}
|
325
351
|
}
|
326
352
|
end
|
@@ -454,6 +480,14 @@ module OpenTox
|
|
454
480
|
end
|
455
481
|
end
|
456
482
|
|
483
|
+
def value_map(prediction_feature_uri)
|
484
|
+
training_classes = accept_values(prediction_feature_uri).sort
|
485
|
+
value_map=Hash.new
|
486
|
+
training_classes.each_with_index { |c,i| value_map[i+1] = c }
|
487
|
+
value_map
|
488
|
+
end
|
489
|
+
|
490
|
+
|
457
491
|
private
|
458
492
|
# Copy a dataset (rewrites URI)
|
459
493
|
def copy(dataset)
|
@@ -504,6 +538,7 @@ module OpenTox
|
|
504
538
|
@data_entries[compound.uri].collect{|f,v| @features[f] if f.match(/neighbor/)}.compact if @data_entries[compound.uri]
|
505
539
|
end
|
506
540
|
|
541
|
+
|
507
542
|
# def errors(compound)
|
508
543
|
# features = @data_entries[compound.uri].keys
|
509
544
|
# features.collect{|f| @features[f][OT.error]}.join(" ") if features
|
data/lib/model.rb
CHANGED
@@ -103,7 +103,7 @@ module OpenTox
|
|
103
103
|
include Model
|
104
104
|
|
105
105
|
|
106
|
-
attr_accessor :compound, :prediction_dataset, :features, :effects, :activities, :p_values, :fingerprints, :feature_calculation_algorithm, :similarity_algorithm, :prediction_algorithm, :subjectid, :value_map, :compound_fingerprints, :feature_calculation_algorithm, :neighbors
|
106
|
+
attr_accessor :compound, :prediction_dataset, :features, :effects, :activities, :p_values, :fingerprints, :feature_calculation_algorithm, :similarity_algorithm, :prediction_algorithm, :subjectid, :value_map, :compound_fingerprints, :feature_calculation_algorithm, :neighbors, :compounds
|
107
107
|
def initialize(uri=nil)
|
108
108
|
|
109
109
|
if uri
|
@@ -169,12 +169,13 @@ module OpenTox
|
|
169
169
|
lazar.prediction_algorithm = hash["prediction_algorithm"] if hash["prediction_algorithm"]
|
170
170
|
lazar.subjectid = hash["subjectid"] if hash["subjectid"]
|
171
171
|
lazar.value_map = hash["value_map"] if hash["value_map"]
|
172
|
+
lazar.compounds = hash["compounds"] if hash["compounds"]
|
172
173
|
|
173
174
|
lazar
|
174
175
|
end
|
175
176
|
|
176
177
|
def to_json
|
177
|
-
Yajl::Encoder.encode({:uri => @uri,:metadata => @metadata, :compound => @compound, :prediction_dataset => @prediction_dataset, :features => @features, :effects => @effects, :activities => @activities, :p_values => @p_values, :fingerprints => @fingerprints, :feature_calculation_algorithm => @feature_calculation_algorithm, :similarity_algorithm => @similarity_algorithm, :prediction_algorithm => @prediction_algorithm, :subjectid => @subjectid, :value_map => @value_map})
|
178
|
+
Yajl::Encoder.encode({:uri => @uri,:metadata => @metadata, :compound => @compound, :prediction_dataset => @prediction_dataset, :features => @features, :effects => @effects, :activities => @activities, :p_values => @p_values, :fingerprints => @fingerprints, :feature_calculation_algorithm => @feature_calculation_algorithm, :similarity_algorithm => @similarity_algorithm, :prediction_algorithm => @prediction_algorithm, :subjectid => @subjectid, :value_map => @value_map, :compounds => @compounds})
|
178
179
|
end
|
179
180
|
|
180
181
|
def run( params, accept_header=nil, waiting_task=nil )
|
@@ -237,6 +238,7 @@ module OpenTox
|
|
237
238
|
|
238
239
|
@compound = Compound.new compound_uri
|
239
240
|
features = {}
|
241
|
+
|
240
242
|
#LOGGER.debug self.to_yaml
|
241
243
|
unless @prediction_dataset
|
242
244
|
@prediction_dataset = Dataset.create(CONFIG[:services]["opentox-dataset"], subjectid)
|
@@ -247,19 +249,19 @@ module OpenTox
|
|
247
249
|
OT.parameters => [{DC.title => "compound_uri", OT.paramValue => compound_uri}]
|
248
250
|
} )
|
249
251
|
end
|
250
|
-
|
251
|
-
all_activities = []
|
252
|
-
all_activities = @activities.values.flatten.collect! { |i| i.to_f }
|
253
|
-
end
|
252
|
+
|
254
253
|
unless database_activity(subjectid) # adds database activity to @prediction_dataset
|
254
|
+
|
255
255
|
# Calculation of needed values for query compound
|
256
256
|
@compound_features = eval("#{@feature_calculation_algorithm}({
|
257
257
|
:compound => @compound,
|
258
258
|
:features => @features,
|
259
259
|
:feature_dataset_uri => @metadata[OT.featureDataset],
|
260
260
|
:pc_type => self.parameter(\"pc_type\"),
|
261
|
+
:lib => self.parameter(\"lib\"),
|
261
262
|
:subjectid => subjectid
|
262
263
|
})")
|
264
|
+
|
263
265
|
# Adding fingerprint of query compound with features and values(p_value*nr_hits)
|
264
266
|
@compound_fingerprints = {}
|
265
267
|
@compound_features.each do |feature, value| # value is nil if "Substructure.match"
|
@@ -314,6 +316,16 @@ module OpenTox
|
|
314
316
|
@prediction_dataset.add @compound.uri, feature_uri, true
|
315
317
|
f+=1
|
316
318
|
end
|
319
|
+
elsif @feature_calculation_algorithm == "Substructure.lookup"
|
320
|
+
f = 0
|
321
|
+
@compound_features.each do |feature, value|
|
322
|
+
features[feature] = feature
|
323
|
+
@prediction_dataset.add_feature(feature, {
|
324
|
+
RDF.type => [OT.NumericFeature]
|
325
|
+
})
|
326
|
+
@prediction_dataset.add @compound.uri, feature, value
|
327
|
+
f+=1
|
328
|
+
end
|
317
329
|
else
|
318
330
|
@compound_features.each do |feature|
|
319
331
|
features[feature] = feature
|
@@ -337,15 +349,26 @@ module OpenTox
|
|
337
349
|
else
|
338
350
|
feature_uri = feature
|
339
351
|
end
|
340
|
-
@
|
352
|
+
if @feature_calculation_algorithm == "Substructure.lookup"
|
353
|
+
@prediction_dataset.add neighbor[:compound], feature_uri, @fingerprints[neighbor[:compound]][feature_uri]
|
354
|
+
else
|
355
|
+
@prediction_dataset.add neighbor[:compound], feature_uri, true
|
356
|
+
end
|
357
|
+
|
341
358
|
unless features.has_key? feature
|
342
359
|
features[feature] = feature_uri
|
343
|
-
@
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
360
|
+
if @feature_calculation_algorithm == "Substructure.lookup"
|
361
|
+
@prediction_dataset.add_feature(feature_uri, {
|
362
|
+
RDF.type => [OT.NumericFeature]
|
363
|
+
})
|
364
|
+
else
|
365
|
+
@prediction_dataset.add_feature(feature_uri, {
|
366
|
+
RDF.type => [OT.Substructure],
|
367
|
+
OT.smarts => feature,
|
368
|
+
OT.pValue => @p_values[feature],
|
369
|
+
OT.effect => @effects[feature]
|
370
|
+
})
|
371
|
+
end
|
349
372
|
f+=1
|
350
373
|
end
|
351
374
|
end
|
data/lib/parser.rb
CHANGED
@@ -349,11 +349,15 @@ module OpenTox
|
|
349
349
|
|
350
350
|
# Load CSV string (format specification: http://toxcreate.org/help)
|
351
351
|
# @param [String] csv CSV representation of the dataset
|
352
|
+
# @param [Boolean] drop_missing Whether completely missing rows should be dropped
|
353
|
+
# @param [Boolean] all_numeric Whether all features should be treated as numeric
|
354
|
+
# @param [Boolean] del_nominal All nominal features will be removed
|
352
355
|
# @return [OpenTox::Dataset] Dataset object with CSV data
|
353
|
-
def load_csv(csv, drop_missing=false)
|
356
|
+
def load_csv(csv, drop_missing=false, all_numeric=false)
|
354
357
|
row = 0
|
355
358
|
input = csv.split("\n")
|
356
359
|
headers = split_row(input.shift)
|
360
|
+
headers.collect! {|header| header.to_s.gsub(/[\/.\\\(\)\{\}\[\]]/,"_")}
|
357
361
|
add_features(headers)
|
358
362
|
value_maps = Array.new
|
359
363
|
regression_features=Array.new
|
@@ -362,7 +366,7 @@ module OpenTox
|
|
362
366
|
row = split_row(row)
|
363
367
|
value_maps = detect_new_values(row, value_maps)
|
364
368
|
value_maps.each_with_index { |vm,j|
|
365
|
-
if vm.size > @max_class_values # max @max_class_values classes.
|
369
|
+
if (vm.size > @max_class_values) || all_numeric # max @max_class_values classes.
|
366
370
|
regression_features[j]=true
|
367
371
|
else
|
368
372
|
regression_features[j]=false
|
@@ -392,22 +396,30 @@ module OpenTox
|
|
392
396
|
|
393
397
|
def warnings
|
394
398
|
|
395
|
-
info = ''
|
399
|
+
info = '<br>'
|
396
400
|
@feature_types.each do |feature,types|
|
401
|
+
@dataset.add_feature_metadata(feature,{RDF.type => []})
|
397
402
|
if types.uniq.size == 0
|
398
|
-
|
399
|
-
|
400
|
-
|
403
|
+
@dataset.add_feature_metadata(
|
404
|
+
feature, {RDF.type => ( @dataset.features[feature][RDF.type] << "helper#MissingFeature" ) } # TODO: Fit to OT ontology!
|
405
|
+
)
|
406
|
+
info += "'#{@dataset.feature_name(feature)}' detected as 'MissingFeature'<br>"
|
401
407
|
else
|
402
|
-
|
408
|
+
info += "'#{@dataset.feature_name(feature)}' detected as "
|
409
|
+
types_arr = []
|
410
|
+
types.uniq.each { |t|
|
411
|
+
types_arr << t
|
412
|
+
info += "'#{t.split('#').last}', "
|
413
|
+
}
|
414
|
+
|
415
|
+
@dataset.add_feature_metadata(
|
416
|
+
feature, {RDF.type => types_arr.sort} # nominal should be first for downward compatibility
|
417
|
+
)
|
418
|
+
|
419
|
+
info.chop!.chop!
|
420
|
+
info += "<br>"
|
403
421
|
end
|
404
|
-
@dataset.add_feature_metadata(feature,{RDF.type => [type]})
|
405
|
-
info += "\"#{@dataset.feature_name(feature)}\" detected as #{type.split('#').last}." if type
|
406
|
-
|
407
|
-
# TODO: rewrite feature values
|
408
|
-
# TODO if value.to_f == 0 @activity_errors << "#{id} Zero values not allowed for regression datasets - entry ignored."
|
409
422
|
end
|
410
|
-
|
411
423
|
@dataset.metadata[OT.Info] = info
|
412
424
|
|
413
425
|
warnings = ''
|
@@ -469,28 +481,31 @@ module OpenTox
|
|
469
481
|
unless @duplicate_feature_indices.include? i
|
470
482
|
|
471
483
|
value = row[i]
|
472
|
-
#LOGGER.warn "Missing values for #{id}" if value.size == 0 # String is empty
|
473
484
|
feature = @features[feature_idx]
|
474
485
|
|
475
486
|
type = feature_type(value) # May be NIL
|
476
|
-
type = OT.NominalFeature unless (type.nil? || regression_features[i])
|
477
487
|
@feature_types[feature] << type if type
|
488
|
+
# Add nominal type if number of distinct values <= @max_class_values
|
489
|
+
if type == OT.NumericFeature
|
490
|
+
@feature_types[feature] << OT.NominalFeature unless regression_features[i]
|
491
|
+
end
|
478
492
|
|
479
493
|
val = nil
|
480
494
|
case type
|
481
495
|
when OT.NumericFeature
|
482
496
|
val = value.to_f
|
497
|
+
val = nil if val.infinite?
|
483
498
|
when OT.NominalFeature
|
484
499
|
val = value.to_s
|
485
500
|
end
|
486
501
|
|
487
502
|
feature_idx += 1
|
488
503
|
|
489
|
-
if val != nil
|
504
|
+
if val != nil
|
490
505
|
@dataset.add(compound.uri, feature, val)
|
491
|
-
if
|
506
|
+
if @feature_types[feature].include? OT.NominalFeature
|
492
507
|
@dataset.features[feature][OT.acceptValue] = [] unless @dataset.features[feature][OT.acceptValue]
|
493
|
-
@dataset.features[feature][OT.acceptValue] << val
|
508
|
+
@dataset.features[feature][OT.acceptValue] << val unless @dataset.features[feature][OT.acceptValue].include?(val)
|
494
509
|
end
|
495
510
|
end
|
496
511
|
|
@@ -654,7 +669,7 @@ module OpenTox
|
|
654
669
|
obmol.get_data.each { |d| row[d.get_attribute] = d.get_value if properties.include?(d.get_attribute) }
|
655
670
|
table.data[compound.uri] = row
|
656
671
|
end
|
657
|
-
|
672
|
+
|
658
673
|
# find and remove ignored_features
|
659
674
|
@activity_errors = table.clean_features
|
660
675
|
table.add_to_dataset @dataset
|
data/lib/r-util.rb
CHANGED
@@ -8,6 +8,18 @@ PACKAGE_DIR = package_dir
|
|
8
8
|
|
9
9
|
require "tempfile"
|
10
10
|
|
11
|
+
class Array
|
12
|
+
|
13
|
+
def check_uniq
|
14
|
+
hash = {}
|
15
|
+
self.each do |x|
|
16
|
+
raise "duplicate #{x}" if hash[x]
|
17
|
+
hash[x] = true
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
end
|
22
|
+
|
11
23
|
module OpenTox
|
12
24
|
|
13
25
|
class RUtil
|
@@ -75,12 +87,10 @@ module OpenTox
|
|
75
87
|
end
|
76
88
|
|
77
89
|
# embeds feature values of two datasets into 2D and plots it
|
78
|
-
# fast_plot = true -> PCA, fast_plot = false -> SMACOF (iterative optimisation method)
|
79
90
|
#
|
80
91
|
def feature_value_plot(files, dataset_uri1, dataset_uri2, dataset_name1, dataset_name2,
|
81
|
-
features=nil,
|
92
|
+
features=nil, subjectid=nil, waiting_task=nil)
|
82
93
|
|
83
|
-
raise "r-package smacof missing" if fast_plot==false and !package_installed?("smacof")
|
84
94
|
LOGGER.debug("r-util> create feature value plot")
|
85
95
|
d1 = OpenTox::Dataset.find(dataset_uri1,subjectid)
|
86
96
|
d2 = OpenTox::Dataset.find(dataset_uri2,subjectid)
|
@@ -102,17 +112,13 @@ module OpenTox
|
|
102
112
|
@r.eval "split <- c(rep(0,nrow(#{df1})),rep(1,nrow(#{df2})))"
|
103
113
|
@r.names = [dataset_name1, dataset_name2]
|
104
114
|
LOGGER.debug("r-util> - convert data to 2d")
|
105
|
-
|
115
|
+
#@r.eval "save.image(\"/tmp/image.R\")"
|
116
|
+
@r.eval "df.2d <- plot_pre_process(df, method='sammon')"
|
106
117
|
waiting_task.progress(75) if waiting_task
|
107
118
|
|
108
|
-
if fast_plot
|
109
|
-
info = "main='PCA-Embedding of #{features.size} features',xlab='PC1',ylab='PC2'"
|
110
|
-
else
|
111
|
-
info = "main='SMACOF-Embedding of #{features.size} features',xlab='x',ylab='y'"
|
112
|
-
end
|
113
119
|
LOGGER.debug("r-util> - plot data")
|
114
120
|
plot_to_files(files) do |file|
|
115
|
-
@r.eval "plot_split( df.2d, split, names, #{
|
121
|
+
@r.eval "plot_split( df.2d, split, names, main='Sammon embedding of #{features.size} features',xlab='x',ylab='y')"
|
116
122
|
end
|
117
123
|
end
|
118
124
|
|
@@ -170,19 +176,68 @@ module OpenTox
|
|
170
176
|
end
|
171
177
|
end
|
172
178
|
|
173
|
-
# stratified splits a dataset into two dataset the feature values
|
179
|
+
# stratified splits a dataset into two datasets according to the feature values
|
180
|
+
# all features are taken into account unless <split_features> is given
|
181
|
+
# returns two datasets
|
182
|
+
def stratified_split( dataset, metadata={}, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil )
|
183
|
+
stratified_split_internal( dataset, metadata, missing_values, nil, pct, subjectid, seed, split_features )
|
184
|
+
end
|
185
|
+
|
186
|
+
# stratified splits a dataset into k datasets according to the feature values
|
174
187
|
# all features are taken into account unless <split_features> is given
|
175
|
-
|
188
|
+
# returns two arrays of datasets
|
189
|
+
def stratified_k_fold_split( dataset, metadata={}, missing_values="NA", num_folds=10, subjectid=nil, seed=42, split_features=nil )
|
190
|
+
stratified_split_internal( dataset, metadata, missing_values, num_folds, nil, subjectid, seed, split_features )
|
191
|
+
end
|
192
|
+
|
193
|
+
private
|
194
|
+
def stratified_split_internal( dataset, metadata={}, missing_values="NA", num_folds=nil, pct=nil, subjectid=nil, seed=42, split_features=nil )
|
195
|
+
raise "internal error" if num_folds!=nil and pct!=nil
|
196
|
+
k_fold_split = num_folds!=nil
|
197
|
+
if k_fold_split
|
198
|
+
raise "num_folds not a fixnum: #{num_folds}" unless num_folds.is_a?(Fixnum)
|
199
|
+
else
|
200
|
+
raise "pct is not a numeric: #{pct}" unless pct.is_a?(Numeric)
|
201
|
+
end
|
176
202
|
raise "not a loaded ot-dataset" unless dataset.is_a?(OpenTox::Dataset) and dataset.compounds.size>0 and dataset.features.size>0
|
203
|
+
raise "missing_values=#{missing_values}" unless missing_values.is_a?(String) or missing_values==0
|
204
|
+
raise "subjectid=#{subjectid}" unless subjectid==nil or subjectid.is_a?(String)
|
177
205
|
LOGGER.debug("r-util> apply stratified split to #{dataset.uri}")
|
178
206
|
|
179
|
-
df = dataset_to_dataframe( dataset, missing_values, subjectid
|
207
|
+
df = dataset_to_dataframe( dataset, missing_values, subjectid)
|
180
208
|
@r.eval "set.seed(#{seed})"
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
209
|
+
str_split_features = ""
|
210
|
+
if split_features
|
211
|
+
@r.split_features = split_features if split_features
|
212
|
+
str_split_features = "colnames=split_features"
|
213
|
+
end
|
214
|
+
#@r.eval "save.image(\"/tmp/image.R\")"
|
215
|
+
|
216
|
+
if k_fold_split
|
217
|
+
@r.eval "split <- stratified_k_fold_split(#{df}, num_folds=#{num_folds}, #{str_split_features})"
|
218
|
+
split = @r.pull 'split'
|
219
|
+
train = []
|
220
|
+
test = []
|
221
|
+
num_folds.times do |f|
|
222
|
+
datasetname = 'dataset fold '+(f+1).to_s+' of '+num_folds.to_s
|
223
|
+
metadata[DC.title] = "training "+datasetname
|
224
|
+
train << split_to_dataset( df, split, metadata, subjectid ){ |i| i!=(f+1) }
|
225
|
+
metadata[DC.title] = "test "+datasetname
|
226
|
+
test << split_to_dataset( df, split, metadata, subjectid ){ |i| i==(f+1) }
|
227
|
+
end
|
228
|
+
return train, test
|
229
|
+
else
|
230
|
+
puts "split <- stratified_split(#{df}, ratio=#{pct}, #{str_split_features})"
|
231
|
+
@r.eval "split <- stratified_split(#{df}, ratio=#{pct}, #{str_split_features})"
|
232
|
+
split = @r.pull 'split'
|
233
|
+
metadata[DC.title] = "Training dataset split of "+dataset.uri
|
234
|
+
train = split_to_dataset( df, split, metadata, subjectid ){ |i| i==1 }
|
235
|
+
metadata[DC.title] = "Test dataset split of "+dataset.uri
|
236
|
+
test = split_to_dataset( df, split, metadata, subjectid ){ |i| i==0 }
|
237
|
+
return train, test
|
238
|
+
end
|
185
239
|
end
|
240
|
+
public
|
186
241
|
|
187
242
|
# dataset should be loaded completely (use Dataset.find)
|
188
243
|
# takes duplicates into account
|
@@ -212,9 +267,13 @@ module OpenTox
|
|
212
267
|
features = dataset.features.keys.sort
|
213
268
|
end
|
214
269
|
compounds = []
|
270
|
+
compound_names = []
|
215
271
|
dataset.compounds.each do |c|
|
272
|
+
count = 0
|
216
273
|
num_compounds[c].times do |i|
|
217
274
|
compounds << c
|
275
|
+
compound_names << "#{c}$#{count}"
|
276
|
+
count+=1
|
218
277
|
end
|
219
278
|
end
|
220
279
|
|
@@ -238,7 +297,7 @@ module OpenTox
|
|
238
297
|
end
|
239
298
|
end
|
240
299
|
df_name = "df_#{dataset.uri.split("/")[-1].split("?")[0]}"
|
241
|
-
assign_dataframe(df_name,d_values,
|
300
|
+
assign_dataframe(df_name,d_values,compound_names,features)
|
242
301
|
|
243
302
|
# set dataframe column types accordingly
|
244
303
|
f_count = 1 #R starts at 1
|
@@ -264,25 +323,27 @@ module OpenTox
|
|
264
323
|
|
265
324
|
# converts a dataframe into a dataset (a new dataset is created at the dataset webservice)
|
266
325
|
# this is only possible if a superset of the dataframe was created by dataset_to_dataframe (metadata and URIs!)
|
267
|
-
def dataframe_to_dataset( df, subjectid=nil )
|
268
|
-
dataframe_to_dataset_indices( df, subjectid, nil)
|
326
|
+
def dataframe_to_dataset( df, metadata={}, subjectid=nil )
|
327
|
+
dataframe_to_dataset_indices( df, metadata, subjectid, nil)
|
269
328
|
end
|
270
329
|
|
271
330
|
private
|
272
|
-
def dataframe_to_dataset_indices( df, subjectid=nil, compound_indices=nil )
|
331
|
+
def dataframe_to_dataset_indices( df, metadata={}, subjectid=nil, compound_indices=nil )
|
273
332
|
raise unless @@feats[df].size>0
|
274
|
-
values,
|
333
|
+
values, compound_names, features = pull_dataframe(df)
|
334
|
+
compounds = compound_names.collect{|c| c.split("$")[0]}
|
275
335
|
features.each{|f| raise unless @@feats[df][f]}
|
276
336
|
dataset = OpenTox::Dataset.create(CONFIG[:services]["opentox-dataset"],subjectid)
|
337
|
+
dataset.add_metadata(metadata)
|
277
338
|
LOGGER.debug "r-util> convert dataframe to dataset #{dataset.uri}"
|
278
339
|
compounds.size.times{|i| dataset.add_compound(compounds[i]) if compound_indices==nil or compound_indices.include?(i)}
|
279
340
|
features.each{|f| dataset.add_feature(f,@@feats[df][f])}
|
280
341
|
features.size.times do |c|
|
281
342
|
feat = OpenTox::Feature.find(features[c],subjectid)
|
282
|
-
|
343
|
+
numeric = feat.metadata[RDF.type].to_a.flatten.include?(OT.NumericFeature)
|
283
344
|
compounds.size.times do |r|
|
284
345
|
if compound_indices==nil or compound_indices.include?(r)
|
285
|
-
dataset.add(compounds[r],features[c],
|
346
|
+
dataset.add(compounds[r],features[c],numeric ? values[r][c].to_f : values[r][c]) if values[r][c]!="NA"
|
286
347
|
end
|
287
348
|
end
|
288
349
|
end
|
@@ -290,16 +351,12 @@ module OpenTox
|
|
290
351
|
dataset
|
291
352
|
end
|
292
353
|
|
293
|
-
def
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
LOGGER.debug("r-util> split into #{dataset.uri}, c:#{dataset.compounds.size}, f:#{dataset.features.size}")
|
300
|
-
sets << dataset
|
301
|
-
end
|
302
|
-
sets
|
354
|
+
def split_to_dataset( df, split, metadata={}, subjectid=nil )
|
355
|
+
indices = []
|
356
|
+
split.size.times{|i| indices<<i if yield(split[i]) }
|
357
|
+
dataset = dataframe_to_dataset_indices( df, metadata, subjectid, indices )
|
358
|
+
LOGGER.debug("r-util> split into #{dataset.uri}, c:#{dataset.compounds.size}, f:#{dataset.features.size}")
|
359
|
+
dataset
|
303
360
|
end
|
304
361
|
|
305
362
|
def pull_dataframe(df)
|
@@ -323,6 +380,8 @@ module OpenTox
|
|
323
380
|
end
|
324
381
|
|
325
382
|
def assign_dataframe(df,input,rownames,colnames)
|
383
|
+
rownames.check_uniq if rownames
|
384
|
+
colnames.check_uniq if colnames
|
326
385
|
tmp = File.join(Dir.tmpdir,Time.new.to_f.to_s+"_"+rand(10000).to_s+".csv")
|
327
386
|
file = File.new(tmp, 'w')
|
328
387
|
input.each{|i| file.puts(i.collect{|e| "\"#{e}\""}.join("#")+"\n")}
|