RubyGems - opentox-ruby - Versions diffs - 3.0.1 → 3.1.0 - Mend

opentox-ruby 3.0.1 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20)

data/lib/parser.rb CHANGED

@@ -57,7 +57,7 @@ module OpenTox
         `rapper -i rdfxml -o ntriples #{file.path} 2>/dev/null`.each_line do |line|
           triple = line.to_triple
           if triple[0] == @uri
-            if triple[1] == RDF.type || triple[1]==OT.predictedVariables # allow multiple types
+            if triple[1] == RDF.type || triple[1]==OT.predictedVariables || triple[1]==OT.independentVariables # allow multiple types
               @metadata[triple[1]] = [] unless @metadata[triple[1]]
               @metadata[triple[1]] << triple[2].split('^^').first
             else
@@ -290,10 +290,11 @@ module OpenTox
         @features = []
         @feature_types = {}
-        @format_errors = ""
-        @smiles_errors = []
+        @format_errors = []
+        @id_errors = []
         @activity_errors = []
         @duplicates = {}
+        @max_class_values = 3
       end
       def detect_new_values(row, value_maps)
@@ -309,9 +310,10 @@ module OpenTox
       # Load Spreadsheet book (created with roo gem http://roo.rubyforge.org/, excel format specification: http://toxcreate.org/help)
       # @param [Excel] book Excel workbook object (created with roo gem)
       # @return [OpenTox::Dataset] Dataset object with Excel data
-      def load_spreadsheet(book)
+      def load_spreadsheet(book, drop_missing=false)
         book.default_sheet = 0
-        add_features book.row(1)
+        headers = book.row(1)
+        add_features headers
         value_maps = Array.new
         regression_features=Array.new
@@ -319,15 +321,27 @@ module OpenTox
           row = book.row(i)
           value_maps = detect_new_values(row, value_maps)
           value_maps.each_with_index { |vm,j|
-            if vm.size > 5 # 5 is the maximum nr of classes supported by Fminer.
+            if vm.size > @max_class_values # 5 is the maximum nr of classes supported by Fminer.
               regression_features[j]=true
             else
               regression_features[j]=false
             end
           }
         }
         2.upto(book.last_row) { |i|
-          add_values book.row(i), regression_features
+          drop=false
+          row = book.row(i)
+          raise "Entry has size #{row.size}, different from headers (#{headers.size})" if row.size != headers.size
+          if row.include?("")
+            @format_errors << "Row #{i} has #{row.count("")} missing values"
+            drop=true
+            drop_missing=true if (row.count("") == row.size-1)
+          end
+          add_values(row, regression_features) unless (drop_missing && drop)
+          if (drop_missing && drop)
+            @format_errors << "Row #{i} not added"
+          end
         }
         warnings
         @dataset
@@ -336,10 +350,11 @@ module OpenTox
       # Load CSV string (format specification: http://toxcreate.org/help)
       # @param [String] csv CSV representation of the dataset
       # @return [OpenTox::Dataset] Dataset object with CSV data
-      def load_csv(csv)
+      def load_csv(csv, drop_missing=false)
         row = 0
         input = csv.split("\n")
-        add_features split_row(input.shift)
+        headers = split_row(input.shift)
+        add_features(headers)
         value_maps = Array.new
         regression_features=Array.new
@@ -347,15 +362,27 @@ module OpenTox
           row = split_row(row)
           value_maps = detect_new_values(row, value_maps)
           value_maps.each_with_index { |vm,j|
-            if vm.size > 5 # 5 is the maximum nr of classes supported by Fminer.
+            if vm.size > @max_class_values # max @max_class_values classes.
               regression_features[j]=true
             else
               regression_features[j]=false
             end
           }
         }
-        input.each { |row|
-          add_values split_row(row), regression_features
+        input.each_with_index { |row, i|
+          drop=false
+          row = split_row(row)
+          raise "Entry has size #{row.size}, different from headers (#{headers.size})" if row.size != headers.size
+          if row.include?("")
+            @format_errors << "Row #{i} has #{row.count("")} missing values"
+            drop=true
+            drop_missing=true if (row.count("") == row.size-1)
+          end
+          add_values(row, regression_features) unless (drop_missing && drop)
+          if (drop_missing && drop)
+            @format_errors << "Row #{i} not added"
+          end
         }
         warnings
         @dataset
@@ -367,88 +394,115 @@ module OpenTox
         info = ''
         @feature_types.each do |feature,types|
-          if types.uniq.size > 1
+          if types.uniq.size == 0
+            type = "helper#MissingFeature"
+          elsif types.uniq.size > 1
             type = OT.NumericFeature
           else
             type = types.first
           end
           @dataset.add_feature_metadata(feature,{RDF.type => [type]})
-          info += "\"#{@dataset.feature_name(feature)}\" detected as #{type.split('#').last}."
+          info += "\"#{@dataset.feature_name(feature)}\" detected as #{type.split('#').last}." if type
           # TODO: rewrite feature values
-          # TODO if value.to_f == 0 @activity_errors << "#{smiles} Zero values not allowed for regression datasets - entry ignored."
+          # TODO if value.to_f == 0 @activity_errors << "#{id} Zero values not allowed for regression datasets - entry ignored."
         end
         @dataset.metadata[OT.Info] = info
         warnings = ''
-        warnings += "<p>Incorrect Smiles structures (ignored):</p>" + @smiles_errors.join("<br/>") unless @smiles_errors.empty?
+        warnings += "<p>Incorrect structures (ignored):</p>" + @id_errors.join("<br/>") unless @id_errors.empty?
         warnings += "<p>Irregular activities (ignored):</p>" + @activity_errors.join("<br/>") unless @activity_errors.empty?
+        warnings += "<p>Format errors:</p>" + @format_errors.join("<br/>") unless @format_errors.empty?
         duplicate_warnings = ''
         @duplicates.each {|inchi,lines| duplicate_warnings << "<p>#{lines.join('<br/>')}</p>" if lines.size > 1 }
-        warnings += "<p>Duplicated structures (all structures/activities used for model building, please  make sure, that the results were obtained from <em>independent</em> experiments):</p>" + duplicate_warnings unless duplicate_warnings.empty?
+        warnings += "<p>Duplicate structures (all structures/activities used for model building, please make sure that the results were obtained from <em>independent</em> experiments):</p>" + duplicate_warnings unless duplicate_warnings.empty?
         @dataset.metadata[OT.Warnings] = warnings
       end
+      # Adds a row of features to a dataset
+      # @param Array A row split up as an array
+      # @return Array Indices for duplicate features
       def add_features(row)
-        row.shift  # get rid of smiles entry
-        row.each do |feature_name|
+        row=row.collect
+        row.shift  # get rid of id entry
+        @duplicate_feature_indices = [] # starts with 0 at first f after id
+        row.each_with_index do |feature_name, idx|
           feature_uri = File.join(@dataset.uri,"feature",URI.encode(feature_name))
-          @feature_types[feature_uri] = []
-          @features << feature_uri
-          @dataset.add_feature(feature_uri,{DC.title => feature_name})
+          unless @features.include? feature_uri
+            @feature_types[feature_uri] = []
+            @features << feature_uri
+            @dataset.add_feature(feature_uri,{DC.title => feature_name})
+          else
+            @duplicate_feature_indices << idx
+            @format_errors << "Duplicate Feature '#{feature_name}' at pos #{idx}"
+          end
         end
       end
       # Adds a row to a dataset
       # @param Array A row split up as an array
       # @param Array Indicator for regression for each field
+      # @param Array Indices for duplicate features
       def add_values(row, regression_features)
-        smiles = row.shift
-        compound = Compound.from_smiles(smiles)
+        id = row.shift
+        case id
+        when /InChI/
+          compound = Compound.from_inchi(URI.decode_www_form_component(id))
+        else
+          compound = Compound.from_smiles(id)
+        end
         if compound.nil? or compound.inchi.nil? or compound.inchi == ""
-          @smiles_errors << smiles+", "+row.join(", ")
+          @id_errors << id+", "+row.join(", ")
           return false
         end
         @duplicates[compound.inchi] = [] unless @duplicates[compound.inchi]
-        @duplicates[compound.inchi] << smiles+", "+row.join(", ")
+        @duplicates[compound.inchi] << id+", "+row.join(", ")
+        feature_idx = 0
         row.each_index do |i|
-          value = row[i]
-          feature = @features[i]
-          type = nil
-          if (regression_features[i])
-            type = feature_type(value)
-            if type != OT.NumericFeature
-              raise "Error! Expected numeric values."
+          unless @duplicate_feature_indices.include? i
+            value = row[i]
+            #LOGGER.warn "Missing values for #{id}" if value.size == 0 # String is empty
+            feature = @features[feature_idx]
+            type = feature_type(value) # May be NIL
+            type = OT.NominalFeature unless (type.nil? || regression_features[i])
+            @feature_types[feature] << type if type
+            val = nil
+            case type
+            when OT.NumericFeature
+              val = value.to_f
+            when OT.NominalFeature
+              val = value.to_s
             end
-          else
-            type = OT.NominalFeature
-          end
-          @feature_types[feature] << type
-          case type
-          when OT.NumericFeature
-            val = value.to_f
-          when OT.NominalFeature
-            val = value.to_s
-          end
-          if val!=nil
-            @dataset.add(compound.uri, feature, val)
-            if type!=OT.NumericFeature
-              @dataset.features[feature][OT.acceptValue] = [] unless @dataset.features[feature][OT.acceptValue]
-              @dataset.features[feature][OT.acceptValue] << val.to_s unless @dataset.features[feature][OT.acceptValue].include?(val.to_s)
+            feature_idx += 1
+            if val != nil
+              @dataset.add(compound.uri, feature, val)
+              if type != OT.NumericFeature
+                @dataset.features[feature][OT.acceptValue] = [] unless @dataset.features[feature][OT.acceptValue]
+                @dataset.features[feature][OT.acceptValue] << val.to_s unless @dataset.features[feature][OT.acceptValue].include?(val.to_s)
+              end
             end
           end
         end
       end
       def feature_type(value)
-        if OpenTox::Algorithm::numeric? value
+        if value == ""
+          return nil
+        elsif OpenTox::Algorithm::numeric? value
           return OT.NumericFeature
         else
           return OT.NominalFeature
@@ -456,7 +510,7 @@ module OpenTox
       end
       def split_row(row)
-        row.chomp.gsub(/["']/,'').split(/\s*[,;\t]\s*/) # remove quotes
+        row.chomp.gsub(/["']/,'').split(/\s*[,;\t]\s*/,-1) # -1: do not skip empty cells
       end
     end
@@ -468,6 +522,7 @@ module OpenTox
       def initialize
         @data = {}
         @activity_errors = []
+        @max_class_values = 3
       end
       def feature_values(feature)
@@ -485,14 +540,14 @@ module OpenTox
       def clean_features
         ignored_features = []
         features.each do |feature|
-          if feature_values(feature).size > 5
+          if feature_values(feature).size > @max_class_values
             if feature_types(feature).size == 1 and feature_types(feature).first == OT.NumericFeature
               # REGRESSION
             elsif feature_types(feature).include? OT.NumericFeature
               @data.each{|c,row| row[feature] = nil unless OpenTox::Algorithm::numeric?(row[feature]) } # delete nominal features
               @activity_errors << "Nominal feature values of #{feature} ignored (using numeric features for regression models)."
             else
-              @activity_errors << "Feature #{feature} ignored (more than 5 nominal feature values and no numeric values)."
+              @activity_errors << "Feature #{feature} ignored (more than #{@max_class_values} nominal feature values and no numeric values)."
               ignored_features << feature
               next
             end
@@ -543,12 +598,15 @@ module OpenTox
       private
       def feature_type(value)
-        if OpenTox::Algorithm::numeric? value
+        if value.nil?
+          return nil
+        elsif OpenTox::Algorithm::numeric? value
           return OT.NumericFeature
         else
           return OT.NominalFeature
         end
       end
     end
     # quick hack to enable sdf import via csv
@@ -589,20 +647,20 @@ module OpenTox
             @duplicates[inchi] << rec #inchi#+", "+row.join(", ")
             compound = Compound.from_inchi inchi
           rescue
-            @compound_errors << "Could not convert structure to InChI, all entries for this compound (record #{rec} have been ignored! \n#{s}"
+            @compound_errors << "Could not convert structure to InChI, all entries for this compound (record #{rec}) have been ignored! \n#{s}"
             next
           end
           row = {}
           obmol.get_data.each { |d| row[d.get_attribute] = d.get_value if properties.include?(d.get_attribute) }
           table.data[compound.uri] = row
         end
-        # finda and remove ignored_features
+        # find and remove ignored_features
         @activity_errors = table.clean_features
         table.add_to_dataset @dataset
         warnings = ''
-        warnings += "<p>Incorrect Smiles structures (ignored):</p>" + @compound_errors.join("<br/>") unless @compound_errors.empty?
+        warnings += "<p>Incorrect structures (ignored):</p>" + @compound_errors.join("<br/>") unless @compound_errors.empty?
         warnings += "<p>Irregular activities (ignored):</p>" + @activity_errors.join("<br/>") unless @activity_errors.empty?
         duplicate_warnings = ''
         @duplicates.each {|inchi,lines| duplicate_warnings << "<p>#{lines.join('<br/>')}</p>" if lines.size > 1 }

data/lib/r-util.rb ADDED

@@ -0,0 +1,354 @@
+# pending: package dir hack ---------
+# CONFIG[:base_dir] = "/home/<user>/opentox-ruby/www"
+# PACKAGE_DIR = "/home/<user>/opentox-ruby/r-packages"
+package_dir = CONFIG[:base_dir].split("/")
+package_dir[-1] = "r-packages"
+package_dir = package_dir.join("/")
+PACKAGE_DIR = package_dir
+require "tempfile"
+module OpenTox
+  class RUtil
+    @@feats = {}
+    def initialize
+      @r = RinRuby.new(true,false) unless defined?(@r) and @r
+      @r.eval ".libPaths('#{PACKAGE_DIR}')"
+      @r_packages = @r.pull "installed.packages()[,1]"
+      ["sampling","gam","vegan"].each{|l| install_package(l)} #"caret", "smacof", "TunePareto"
+      @r.eval "source('#{File.join(Gem.loaded_specs['opentox-ruby'].full_gem_path,'lib/stratification.R')}')"
+    end
+    def quit_r
+      begin
+        @r.quit
+        @r = nil
+      rescue
+      end
+    end
+    def r
+      @r
+    end
+    def package_installed?( package )
+      @r_packages.include?(package)
+    end
+    def install_package( package )
+      unless package_installed?(package)
+        LOGGER.debug "r-util> installing r-package #{package} to #{PACKAGE_DIR}"
+        @r.eval "install.packages('#{package}', repos='http://cran.r-project.org', lib='#{PACKAGE_DIR}')"
+      end
+    end
+    # <0 -> array1 << array2
+    # 0  -> no significant difference
+    # >0 -> array2 >> array1
+    def paired_ttest(array1, array2, significance_level=0.95)
+      @r.assign "v1",array1
+      @r.assign "v2",array2
+      @r.eval "ttest = t.test(as.numeric(v1),as.numeric(v2),paired=T)"
+      t = @r.pull "ttest$statistic"
+      p = @r.pull "ttest$p.value"
+      if (1-significance_level > p)
+        t
+      else
+        0
+      end
+    end
+    # example:
+    # files = ["/tmp/box.svg","/tmp/box.png"]
+    # data = [ [ :method, [4,4,5,5,4,3,2] ], [ :method2, [1,2,3,4,5,4,6] ], [ :asdf, [9,1,8,0,7,1,6] ] ]
+    # boxplot(files, data, "comparison1" )
+    #
+    def boxplot(files, data, title="")
+      LOGGER.debug("r-util> create boxplot")
+      assign_dataframe("boxdata",data.collect{|e| e[1]}.transpose,nil,data.collect{|e| e[0].to_s})
+      plot_to_files(files) do |file|
+        @r.eval "boxplot(boxdata,main='#{title}',col=rep(2:#{data.size+1}))"
+      end
+    end
+    # embedds feature values of two datasets into 2D and plots it
+    # fast_plot = true -> PCA, fast_plot = false -> SMACOF (iterative optimisation method)
+    #
+    def feature_value_plot(files, dataset_uri1, dataset_uri2, dataset_name1, dataset_name2,
+        features=nil, fast_plot=true, subjectid=nil, waiting_task=nil)
+      raise "r-package smacof missing" if fast_plot==false and !package_installed?("smacof")
+      LOGGER.debug("r-util> create feature value plot")
+      d1 = OpenTox::Dataset.find(dataset_uri1,subjectid)
+      d2 = OpenTox::Dataset.find(dataset_uri2,subjectid)
+      if features
+        [d1, d2].each{|d| features.each{|f| raise "feature not included" unless d.features.keys.include?(f)}}
+      else
+        raise "different\n#{d1.features.keys.sort.to_yaml}\n#{d2.features.keys.sort.to_yaml}" if
+          (d1.features.keys.sort != d2.features.keys.sort)
+        features = d1.features.keys
+      end
+      raise "at least two features needed" if d1.features.keys.size<2
+      waiting_task.progress(25) if waiting_task
+      df1 = dataset_to_dataframe(d1,0,subjectid,features)
+      df2 = dataset_to_dataframe(d2,0,subjectid,features)
+      waiting_task.progress(50) if waiting_task
+      @r.eval "df <- rbind(#{df1},#{df2})"
+      @r.eval "split <- c(rep(0,nrow(#{df1})),rep(1,nrow(#{df2})))"
+      @r.names = [dataset_name1, dataset_name2]
+      LOGGER.debug("r-util> - convert data to 2d")
+      @r.eval "df.2d <- plot_pre_process(df, method='#{(fast_plot ? "pca" : "smacof")}')"
+      waiting_task.progress(75) if waiting_task
+      if fast_plot
+        info = "main='PCA-Embedding of #{features.size} features',xlab='PC1',ylab='PC2'"
+      else
+        info = "main='SMACOF-Embedding of #{features.size} features',xlab='x',ylab='y'"
+      end
+      LOGGER.debug("r-util> - plot data")
+      plot_to_files(files) do |file|
+        @r.eval "plot_split( df.2d, split, names, #{info})"
+      end
+    end
+    # plots a double histogram
+    # data1 and data2 are arrays with values, either numerical or categorial (string values)
+    # is_numerical, boolean flag indicating value types
+    # log (only for numerical), plot logarithm of values
+    def double_hist_plot(files, data1, data2, is_numerical, log=false, name1="first", name2="second", title="title", xaxis="x-values")
+      LOGGER.debug("r-util> create double hist plot")
+      all = data1 + data2
+      if (is_numerical)
+        @r.eval "double_plot <- function(data1, data2, log=FALSE, names=c('data1','data2'), title='title', xlab='x-values')
+        {
+          if (log)
+          {
+            data1 <- log(data1)
+            data2 <- log(data2)
+            xlab = paste('logarithm of',xlab,sep=' ')
+          }
+          xlims <- round(c(min(c(min(data1),min(data2))),max(c(max(data1),max(data2)))))
+          h <- hist(rbind(data1,data2),plot=F)
+          h1 <- hist(data1,plot=F,breaks=h$breaks)
+          h2 <- hist(data2,plot=F,breaks=h$breaks)
+          xlims = c(min(h$breaks),max(h$breaks))
+          ylims = c(0,max(h1$counts,h2$counts))
+          xaxps = c(min(h$breaks),max(h$breaks),(length(h$breaks)-1))
+          plot(h1, col=rgb(1,0,0,2/4), xlim=xlims, xaxp=xaxps, ylim=ylims,
+            main=title, xlab=xlab, ylab='counts' )
+          plot(h2, col=rgb(0,1,0,2/4), add=T )
+          legend('topleft',names,lty=c(1,1),col=c('red','green'))
+        }"
+        @r.assign("data1",data1)
+        @r.assign("data2",data2)
+        @r.legend = [name1, name2]
+      else
+        raise "log not valid for categorial" if log
+        vals = all.uniq.sort!
+        counts1 = vals.collect{|e| data1.count(e)}
+        counts2 = vals.collect{|e| data2.count(e)}
+        @r.data1 = counts1
+        @r.data2 = counts2
+        @r.value_names = [name1, name2]
+        @r.legend = vals
+        @r.eval("data <- cbind(data1,data2)")
+      end
+      plot_to_files(files) do |file|
+        if (is_numerical)
+          @r.eval "double_plot(data1,data2,log=#{log ? "T":"F"},names=legend,title='#{title}',xlab='#{xaxis}')"
+        else
+          @r.eval("bp <- barplot(data, beside=T, names.arg=value_names,
+            main='#{title}', col=sort(rep(2:3,length(legend))))") #legend.text=c(legend),
+          @r.eval "text(bp, 0, round(data, 1),cex=1,pos=3)"
+        end
+      end
+    end
+    # stratified splits a dataset into two dataset the feature values
+    # all features are taken into account unless <split_features> is given
+    def stratified_split( dataset, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil )
+      raise "not a loaded ot-dataset" unless dataset.is_a?(OpenTox::Dataset) and dataset.compounds.size>0 and dataset.features.size>0
+      LOGGER.debug("r-util> apply stratified split to #{dataset.uri}")
+      df = dataset_to_dataframe( dataset, missing_values, subjectid, split_features )
+      @r.eval "set.seed(#{seed})"
+      @r.eval "split <- stratified_split(#{df}, ratio=#{pct})"
+      split = @r.pull 'split'
+      split = split.collect{|s| 1-s.to_i} # reverse 1s and 0s, as 1 means selected, but 0 will be first set
+      split_to_datasets( df, split, subjectid )
+    end
+    # dataset should be loaded completely (use Dataset.find)
+    # takes duplicates into account
+    # replaces missing values with param <missing_value>
+    # returns dataframe-variable-name in R
+    def dataset_to_dataframe( dataset, missing_value="NA", subjectid=nil, features=nil )
+      LOGGER.debug "r-util> convert dataset to dataframe #{dataset.uri}"
+      # count duplicates
+      num_compounds = {}
+      dataset.features.keys.each do |f|
+        dataset.compounds.each do |c|
+          if dataset.data_entries[c]
+            val = dataset.data_entries[c][f]
+            size = val==nil ? 1 : val.size
+            num_compounds[c] = num_compounds[c]==nil ? size : [num_compounds[c],size].max
+          else
+            num_compounds[c] = 1
+          end
+        end
+      end
+      # use either all, or the provided features, sorting is important as col-index := features
+      if features
+        features.sort!
+      else
+        features = dataset.features.keys.sort
+      end
+      compounds = []
+      dataset.compounds.each do |c|
+        num_compounds[c].times do |i|
+          compounds << c
+        end
+      end
+      # values into 2D array, then to dataframe
+      d_values = []
+      dataset.compounds.each do |c|
+        num_compounds[c].times do |i|
+          c_values = []
+          features.each do |f|
+            if dataset.data_entries[c]
+              val = dataset.data_entries[c][f]
+              v = val==nil ? "" : val[i].to_s
+            else
+              raise "wtf" if i>0
+              v = ""
+            end
+            v = missing_value if v.size()==0
+            c_values << v
+          end
+          d_values << c_values
+        end
+      end
+      df_name = "df_#{dataset.uri.split("/")[-1].split("?")[0]}"
+      assign_dataframe(df_name,d_values,compounds,features)
+      # set dataframe column types accordingly
+      f_count = 1 #R starts at 1
+      features.each do |f|
+        feat = OpenTox::Feature.find(f,subjectid)
+        nominal = feat.metadata[RDF.type].to_a.flatten.include?(OT.NominalFeature)
+        if nominal
+          @r.eval "#{df_name}[,#{f_count}] <- as.character(#{df_name}[,#{f_count}])"
+        else
+          @r.eval "#{df_name}[,#{f_count}] <- as.numeric(#{df_name}[,#{f_count}])"
+        end
+        f_count += 1
+      end
+      #@r.eval "head(#{df_name})"
+      # store compounds, and features (including metainformation)
+      @@feats[df_name] = {}
+      features.each do |f|
+        @@feats[df_name][f] = dataset.features[f]
+      end
+      df_name
+    end
+    # converts a dataframe into a dataset (a new dataset is created at the dataset webservice)
+    # this is only possible if a superset of the dataframe was created by dataset_to_dataframe (metadata and URIs!)
+    def dataframe_to_dataset( df, subjectid=nil )
+      dataframe_to_dataset_indices( df, subjectid, nil)
+    end
+    private
+    def dataframe_to_dataset_indices( df, subjectid=nil, compound_indices=nil )
+      raise unless @@feats[df].size>0
+      values, compounds, features = pull_dataframe(df)
+      features.each{|f| raise unless @@feats[df][f]}
+      dataset = OpenTox::Dataset.create(CONFIG[:services]["opentox-dataset"],subjectid)
+      LOGGER.debug "r-util> convert dataframe to dataset #{dataset.uri}"
+      compounds.size.times{|i| dataset.add_compound(compounds[i]) if compound_indices==nil or compound_indices.include?(i)}
+      features.each{|f| dataset.add_feature(f,@@feats[df][f])}
+      features.size.times do |c|
+        feat = OpenTox::Feature.find(features[c],subjectid)
+        nominal = feat.metadata[RDF.type].to_a.flatten.include?(OT.NominalFeature)
+        compounds.size.times do |r|
+          if compound_indices==nil or compound_indices.include?(r)
+            dataset.add(compounds[r],features[c],nominal ? values[r][c] : values[r][c].to_f) if values[r][c]!="NA"
+          end
+        end
+      end
+      dataset.save(subjectid)
+      dataset
+    end
+    def split_to_datasets( df, split, subjectid=nil )
+      sets = []
+      (split.min.to_i .. split.max.to_i).each do |i|
+        indices = []
+        split.size.times{|j| indices<<j if split[j]==i}
+        dataset = dataframe_to_dataset_indices( df, subjectid, indices )
+        LOGGER.debug("r-util> split into #{dataset.uri}, c:#{dataset.compounds.size}, f:#{dataset.features.size}")
+        sets << dataset
+      end
+      sets
+    end
+    def pull_dataframe(df)
+      tmp = File.join(Dir.tmpdir,Time.new.to_f.to_s+"_"+rand(10000).to_s+".csv")
+      @r.eval "write.table(#{df},file='#{tmp}',sep='#')"
+      res = []; compounds = []; features = []
+      first = true
+      file = File.new(tmp, 'r')
+      file.each_line("\n") do |row|
+        if first
+           features = row.chomp.split("#").collect{|e| e.gsub("\"","")}
+           first = false
+        else
+           vals = row.chomp.split("#").collect{|e| e.gsub("\"","")}
+           compounds << vals[0]
+           res << vals[1..-1]
+        end
+      end
+      begin File.delete(tmp); rescue; end
+      return res, compounds, features
+    end
+    def assign_dataframe(df,input,rownames,colnames)
+      tmp = File.join(Dir.tmpdir,Time.new.to_f.to_s+"_"+rand(10000).to_s+".csv")
+      file = File.new(tmp, 'w')
+      input.each{|i| file.puts(i.collect{|e| "\"#{e}\""}.join("#")+"\n")}
+      file.flush
+      @r.rownames = rownames if rownames
+      @r.colnames = colnames
+      @r.eval "#{df} <- read.table(file='#{tmp}',sep='#',"+
+        "#{rownames ? "row.names=rownames" : ""},col.names=colnames,check.names=F)"
+      begin File.delete(tmp); rescue; end
+    end
+    def plot_to_files(files)
+      files.each do |file|
+        if file=~/(?i)\.svg/
+          @r.eval("svg('#{file}',10,8)")
+        elsif file=~/(?i)\.png/
+          @r.eval("png('#{file}')")
+        else
+          raise "invalid format: "+file.to_s
+        end
+        yield file
+        LOGGER.debug "r-util> plotted to #{file}"
+        @r.eval("dev.off()")
+      end
+    end
+  end
+end