opentox-ruby 3.1.0 → 4.0.0

@@ -197,7 +197,12 @@ module OpenTox
       accept_values
     end
 
-    # Detect feature type(s) in the dataset
+    # Detect feature type (reduced to one across all features)
+    # Classification takes precedence over regression
+    # DEPRECATED --
+    # MAKES NO SENSE FOR DATASETS WITH MORE THAN 1 FEATURE
+    # FEATURES CAN HAVE MULTIPLE TYPES
+    # Replacement: see feature_types()
     # @return [String] "classification", "regression", "mixed" or "unknown"
     def feature_type(subjectid=nil)
       load_features(subjectid)
@@ -210,6 +215,24 @@ module OpenTox
         "unknown"
       end
     end
+
+
+    # Detect feature types. A feature can have multiple types.
+    # Returns types hashed by feature URI, with missing features omitted.
+    # Example (YAML):
+    #   http://toxcreate3.in-silico.ch:8082/dataset/152/feature/nHal:
+    #   - http://www.opentox.org/api/1.1#NumericFeature
+    #   - http://www.opentox.org/api/1.1#NominalFeature
+    #   ...
+    #
+    # @return [Hash] Keys: feature URIs, Values: Array of types
+    def feature_types(subjectid=nil)
+      load_features(subjectid)
+      @features.inject({}){ |h,(f,metadata)|
+        h[f]=metadata[RDF.type] unless metadata[RDF.type][0].include? "MissingFeature"
+        h
+      }
+    end
 
=begin
=end
 
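
The new feature_types accessor returns the per-feature type arrays directly. A minimal usage sketch (the dataset URI is a placeholder; OT.NominalFeature and OT.NumericFeature are the constants used throughout this diff):

    dataset = OpenTox::Dataset.find("http://example.org/dataset/152")
    dataset.feature_types.each do |feature_uri, types|
      # a feature can carry several types at once
      puts "#{feature_uri} is nominal" if types.include?(OT.NominalFeature)
      puts "#{feature_uri} is numeric" if types.include?(OT.NumericFeature)
    end
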
@@ -316,11 +339,14 @@ module OpenTox
     end
 
     # Complete feature values by adding zeroes
-    def complete_data_entries
+    # @param [Hash] key: compound, value: duplicate sizes
+    def complete_data_entries(compound_sizes)
       all_features = @features.keys
       @data_entries.each { |c, e|
         (Set.new(all_features.collect)).subtract(Set.new e.keys).to_a.each { |f|
-          self.add(c,f,0)
+          compound_sizes[c].times {
+            self.add(c,f,0)
+          }
         }
       }
     end
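
Since complete_data_entries now zero-fills once per duplicate entry, callers have to supply the duplicate counts. A hedged sketch of building that hash, assuming dataset.compounds lists each compound once per duplicate:

    compound_sizes = Hash.new(0)
    dataset.compounds.each { |c| compound_sizes[c] += 1 }
    dataset.complete_data_entries(compound_sizes)
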
@@ -454,6 +480,14 @@ module OpenTox
       end
     end
 
+    def value_map(prediction_feature_uri)
+      training_classes = accept_values(prediction_feature_uri).sort
+      value_map=Hash.new
+      training_classes.each_with_index { |c,i| value_map[i+1] = c }
+      value_map
+    end
+
+
     private
     # Copy a dataset (rewrites URI)
     def copy(dataset)
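
value_map numbers the sorted accept values starting at 1. For a nominal prediction feature with classes "active" and "inactive" (the feature URI is hypothetical):

    dataset.value_map("http://example.org/feature/hamster_carcinogenicity")
    # => {1 => "active", 2 => "inactive"}
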
@@ -504,6 +538,7 @@ module OpenTox
       @data_entries[compound.uri].collect{|f,v| @features[f] if f.match(/neighbor/)}.compact if @data_entries[compound.uri]
     end
 
+
     # def errors(compound)
     #   features = @data_entries[compound.uri].keys
     #   features.collect{|f| @features[f][OT.error]}.join(" ") if features
@@ -103,7 +103,7 @@ module OpenTox
     include Model
 
 
-    attr_accessor :compound, :prediction_dataset, :features, :effects, :activities, :p_values, :fingerprints, :feature_calculation_algorithm, :similarity_algorithm, :prediction_algorithm, :subjectid, :value_map, :compound_fingerprints, :feature_calculation_algorithm, :neighbors
+    attr_accessor :compound, :prediction_dataset, :features, :effects, :activities, :p_values, :fingerprints, :feature_calculation_algorithm, :similarity_algorithm, :prediction_algorithm, :subjectid, :value_map, :compound_fingerprints, :feature_calculation_algorithm, :neighbors, :compounds
     def initialize(uri=nil)
 
       if uri
@@ -169,12 +169,13 @@ module OpenTox
       lazar.prediction_algorithm = hash["prediction_algorithm"] if hash["prediction_algorithm"]
       lazar.subjectid = hash["subjectid"] if hash["subjectid"]
       lazar.value_map = hash["value_map"] if hash["value_map"]
+      lazar.compounds = hash["compounds"] if hash["compounds"]
 
       lazar
     end
 
     def to_json
-      Yajl::Encoder.encode({:uri => @uri,:metadata => @metadata, :compound => @compound, :prediction_dataset => @prediction_dataset, :features => @features, :effects => @effects, :activities => @activities, :p_values => @p_values, :fingerprints => @fingerprints, :feature_calculation_algorithm => @feature_calculation_algorithm, :similarity_algorithm => @similarity_algorithm, :prediction_algorithm => @prediction_algorithm, :subjectid => @subjectid, :value_map => @value_map})
+      Yajl::Encoder.encode({:uri => @uri,:metadata => @metadata, :compound => @compound, :prediction_dataset => @prediction_dataset, :features => @features, :effects => @effects, :activities => @activities, :p_values => @p_values, :fingerprints => @fingerprints, :feature_calculation_algorithm => @feature_calculation_algorithm, :similarity_algorithm => @similarity_algorithm, :prediction_algorithm => @prediction_algorithm, :subjectid => @subjectid, :value_map => @value_map, :compounds => @compounds})
     end
 
     def run( params, accept_header=nil, waiting_task=nil )
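
With :compounds serialized on both sides, the model survives a JSON round trip; a sketch, assuming the hash-based loader shown above is exposed as from_json:

    json = lazar.to_json                   # now carries :compounds
    restored = OpenTox::Model::Lazar.from_json(json)
    restored.compounds == lazar.compounds  # expected to hold
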
@@ -237,6 +238,7 @@ module OpenTox
 
       @compound = Compound.new compound_uri
       features = {}
+
       #LOGGER.debug self.to_yaml
       unless @prediction_dataset
         @prediction_dataset = Dataset.create(CONFIG[:services]["opentox-dataset"], subjectid)
@@ -247,19 +249,19 @@ module OpenTox
           OT.parameters => [{DC.title => "compound_uri", OT.paramValue => compound_uri}]
         } )
       end
-      if OpenTox::Feature.find(metadata[OT.dependentVariables], subjectid).feature_type == "regression"
-        all_activities = []
-        all_activities = @activities.values.flatten.collect! { |i| i.to_f }
-      end
+
       unless database_activity(subjectid) # adds database activity to @prediction_dataset
+
         # Calculation of needed values for query compound
         @compound_features = eval("#{@feature_calculation_algorithm}({
           :compound => @compound,
           :features => @features,
           :feature_dataset_uri => @metadata[OT.featureDataset],
           :pc_type => self.parameter(\"pc_type\"),
+          :lib => self.parameter(\"lib\"),
           :subjectid => subjectid
         })")
+
         # Adding fingerprint of query compound with features and values(p_value*nr_hits)
         @compound_fingerprints = {}
         @compound_features.each do |feature, value| # value is nil if "Substructure.match"
@@ -314,6 +316,16 @@ module OpenTox
             @prediction_dataset.add @compound.uri, feature_uri, true
             f+=1
           end
+        elsif @feature_calculation_algorithm == "Substructure.lookup"
+          f = 0
+          @compound_features.each do |feature, value|
+            features[feature] = feature
+            @prediction_dataset.add_feature(feature, {
+              RDF.type => [OT.NumericFeature]
+            })
+            @prediction_dataset.add @compound.uri, feature, value
+            f+=1
+          end
         else
           @compound_features.each do |feature|
             features[feature] = feature
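
For orientation: with "Substructure.match" @compound_features is a list of matched SMARTS, while "Substructure.lookup" apparently yields feature/value pairs, which is why this branch stores numeric values instead of true. A hypothetical illustration of the two shapes:

    # Substructure.match  -> ["[#6]-[#8]", "c1ccccc1"]           (matched SMARTS)
    # Substructure.lookup -> {"http://.../feature/nHal" => 2.0}  (feature URI => numeric value)
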
@@ -337,15 +349,26 @@ module OpenTox
           else
             feature_uri = feature
           end
-          @prediction_dataset.add neighbor[:compound], feature_uri, true
+          if @feature_calculation_algorithm == "Substructure.lookup"
+            @prediction_dataset.add neighbor[:compound], feature_uri, @fingerprints[neighbor[:compound]][feature_uri]
+          else
+            @prediction_dataset.add neighbor[:compound], feature_uri, true
+          end
+
           unless features.has_key? feature
             features[feature] = feature_uri
-            @prediction_dataset.add_feature(feature_uri, {
-              RDF.type => [OT.Substructure],
-              OT.smarts => feature,
-              OT.pValue => @p_values[feature],
-              OT.effect => @effects[feature]
-            })
+            if @feature_calculation_algorithm == "Substructure.lookup"
+              @prediction_dataset.add_feature(feature_uri, {
+                RDF.type => [OT.NumericFeature]
+              })
+            else
+              @prediction_dataset.add_feature(feature_uri, {
+                RDF.type => [OT.Substructure],
+                OT.smarts => feature,
+                OT.pValue => @p_values[feature],
+                OT.effect => @effects[feature]
+              })
+            end
             f+=1
           end
         end
@@ -349,11 +349,15 @@ module OpenTox
 
     # Load CSV string (format specification: http://toxcreate.org/help)
     # @param [String] csv CSV representation of the dataset
+    # @param [Boolean] drop_missing Whether completely missing rows should be dropped
+    # @param [Boolean] all_numeric Whether all features should be treated as numeric
+    # @param [Boolean] del_nominal All nominal features will be removed
     # @return [OpenTox::Dataset] Dataset object with CSV data
-    def load_csv(csv, drop_missing=false)
+    def load_csv(csv, drop_missing=false, all_numeric=false)
       row = 0
       input = csv.split("\n")
       headers = split_row(input.shift)
+      headers.collect! {|header| header.to_s.gsub(/[\/.\\\(\)\{\}\[\]]/,"_")}
       add_features(headers)
       value_maps = Array.new
       regression_features=Array.new
@@ -362,7 +366,7 @@ module OpenTox
       row = split_row(row)
       value_maps = detect_new_values(row, value_maps)
       value_maps.each_with_index { |vm,j|
-        if vm.size > @max_class_values # max @max_class_values classes.
+        if (vm.size > @max_class_values) || all_numeric # max @max_class_values classes.
           regression_features[j]=true
         else
          regression_features[j]=false
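
A hedged sketch of the new all_numeric switch (the CSV content is illustrative, and the entry point is assumed to be reachable on a dataset object as in earlier versions):

    csv = "SMILES,LC50\nCCO,1.5\nCCC,2.3\n"
    dataset = OpenTox::Dataset.new
    # treat every column as numeric even if it has few distinct values
    dataset.load_csv(csv, false, true)   # drop_missing=false, all_numeric=true
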
@@ -392,22 +396,30 @@ module OpenTox
 
     def warnings
 
-      info = ''
+      info = '<br>'
       @feature_types.each do |feature,types|
+        @dataset.add_feature_metadata(feature,{RDF.type => []})
         if types.uniq.size == 0
-          type = "helper#MissingFeature"
-        elsif types.uniq.size > 1
-          type = OT.NumericFeature
+          @dataset.add_feature_metadata(
+            feature, {RDF.type => ( @dataset.features[feature][RDF.type] << "helper#MissingFeature" ) } # TODO: Fit to OT ontology!
+          )
+          info += "'#{@dataset.feature_name(feature)}' detected as 'MissingFeature'<br>"
         else
-          type = types.first
+          info += "'#{@dataset.feature_name(feature)}' detected as "
+          types_arr = []
+          types.uniq.each { |t|
+            types_arr << t
+            info += "'#{t.split('#').last}', "
+          }
+
+          @dataset.add_feature_metadata(
+            feature, {RDF.type => types_arr.sort} # nominal should be first for downward compatibility
+          )
+
+          info.chop!.chop!
+          info += "<br>"
         end
-        @dataset.add_feature_metadata(feature,{RDF.type => [type]})
-        info += "\"#{@dataset.feature_name(feature)}\" detected as #{type.split('#').last}." if type
-
-        # TODO: rewrite feature values
-        # TODO if value.to_f == 0 @activity_errors << "#{id} Zero values not allowed for regression datasets - entry ignored."
       end
-
       @dataset.metadata[OT.Info] = info
 
       warnings = ''
@@ -469,28 +481,31 @@ module OpenTox
         unless @duplicate_feature_indices.include? i
 
           value = row[i]
-          #LOGGER.warn "Missing values for #{id}" if value.size == 0 # String is empty
           feature = @features[feature_idx]
 
           type = feature_type(value) # May be NIL
-          type = OT.NominalFeature unless (type.nil? || regression_features[i])
           @feature_types[feature] << type if type
+          # Add nominal type if number of distinct values <= @max_class_values
+          if type == OT.NumericFeature
+            @feature_types[feature] << OT.NominalFeature unless regression_features[i]
+          end
 
           val = nil
           case type
           when OT.NumericFeature
            val = value.to_f
+            val = nil if val.infinite?
          when OT.NominalFeature
            val = value.to_s
          end
 
          feature_idx += 1
 
-          if val != nil
+          if val != nil
            @dataset.add(compound.uri, feature, val)
-            if type != OT.NumericFeature
+            if @feature_types[feature].include? OT.NominalFeature
              @dataset.features[feature][OT.acceptValue] = [] unless @dataset.features[feature][OT.acceptValue]
-              @dataset.features[feature][OT.acceptValue] << val.to_s unless @dataset.features[feature][OT.acceptValue].include?(val.to_s)
+              @dataset.features[feature][OT.acceptValue] << val unless @dataset.features[feature][OT.acceptValue].include?(val)
            end
          end
 
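
Net effect: a numeric column whose distinct values stay within @max_class_values is now typed as both numeric and nominal, and its raw (non-stringified) values become acceptValue entries. A hypothetical outcome for a 0/1 column:

    # @feature_types[feature]                    => [OT.NumericFeature, OT.NominalFeature, ...]
    # @dataset.features[feature][OT.acceptValue] => [0.0, 1.0]
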
@@ -654,7 +669,7 @@ module OpenTox
         obmol.get_data.each { |d| row[d.get_attribute] = d.get_value if properties.include?(d.get_attribute) }
         table.data[compound.uri] = row
       end
-
+
       # find and remove ignored_features
       @activity_errors = table.clean_features
       table.add_to_dataset @dataset
@@ -8,6 +8,18 @@ PACKAGE_DIR = package_dir
 
 require "tempfile"
 
+class Array
+
+  def check_uniq
+    hash = {}
+    self.each do |x|
+      raise "duplicate #{x}" if hash[x]
+      hash[x] = true
+    end
+  end
+
+end
+
 module OpenTox
 
   class RUtil
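
check_uniq raises on the first repeated element and returns quietly otherwise; it guards the dataframe row and column names in assign_dataframe below. For example:

    ["a", "b", "c"].check_uniq   # passes
    ["a", "b", "a"].check_uniq   # raises RuntimeError: duplicate a
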
@@ -75,12 +87,10 @@ module OpenTox
     end
 
     # embeds feature values of two datasets into 2D and plots it
-    # fast_plot = true -> PCA, fast_plot = false -> SMACOF (iterative optimisation method)
     #
     def feature_value_plot(files, dataset_uri1, dataset_uri2, dataset_name1, dataset_name2,
-        features=nil, fast_plot=true, subjectid=nil, waiting_task=nil)
+        features=nil, subjectid=nil, waiting_task=nil)
 
-      raise "r-package smacof missing" if fast_plot==false and !package_installed?("smacof")
       LOGGER.debug("r-util> create feature value plot")
       d1 = OpenTox::Dataset.find(dataset_uri1,subjectid)
       d2 = OpenTox::Dataset.find(dataset_uri2,subjectid)
@@ -102,17 +112,13 @@ module OpenTox
       @r.eval "split <- c(rep(0,nrow(#{df1})),rep(1,nrow(#{df2})))"
       @r.names = [dataset_name1, dataset_name2]
       LOGGER.debug("r-util> - convert data to 2d")
-      @r.eval "df.2d <- plot_pre_process(df, method='#{(fast_plot ? "pca" : "smacof")}')"
+      #@r.eval "save.image(\"/tmp/image.R\")"
+      @r.eval "df.2d <- plot_pre_process(df, method='sammon')"
       waiting_task.progress(75) if waiting_task
 
-      if fast_plot
-        info = "main='PCA-Embedding of #{features.size} features',xlab='PC1',ylab='PC2'"
-      else
-        info = "main='SMACOF-Embedding of #{features.size} features',xlab='x',ylab='y'"
-      end
       LOGGER.debug("r-util> - plot data")
       plot_to_files(files) do |file|
-        @r.eval "plot_split( df.2d, split, names, #{info})"
+        @r.eval "plot_split( df.2d, split, names, main='Sammon embedding of #{features.size} features',xlab='x',ylab='y')"
       end
     end
 
@@ -170,19 +176,68 @@ module OpenTox
       end
     end
 
-    # stratified splits a dataset into two dataset the feature values
+    # stratified splits a dataset into two datasets according to the feature values
+    # all features are taken into account unless <split_features> is given
+    # returns two datasets
+    def stratified_split( dataset, metadata={}, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil )
+      stratified_split_internal( dataset, metadata, missing_values, nil, pct, subjectid, seed, split_features )
+    end
+
+    # stratified splits a dataset into k datasets according to the feature values
     # all features are taken into account unless <split_features> is given
-    def stratified_split( dataset, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil )
+    # returns two arrays of datasets
+    def stratified_k_fold_split( dataset, metadata={}, missing_values="NA", num_folds=10, subjectid=nil, seed=42, split_features=nil )
+      stratified_split_internal( dataset, metadata, missing_values, num_folds, nil, subjectid, seed, split_features )
+    end
+
+    private
+    def stratified_split_internal( dataset, metadata={}, missing_values="NA", num_folds=nil, pct=nil, subjectid=nil, seed=42, split_features=nil )
+      raise "internal error" if num_folds!=nil and pct!=nil
+      k_fold_split = num_folds!=nil
+      if k_fold_split
+        raise "num_folds not a fixnum: #{num_folds}" unless num_folds.is_a?(Fixnum)
+      else
+        raise "pct is not a numeric: #{pct}" unless pct.is_a?(Numeric)
+      end
       raise "not a loaded ot-dataset" unless dataset.is_a?(OpenTox::Dataset) and dataset.compounds.size>0 and dataset.features.size>0
+      raise "missing_values=#{missing_values}" unless missing_values.is_a?(String) or missing_values==0
+      raise "subjectid=#{subjectid}" unless subjectid==nil or subjectid.is_a?(String)
       LOGGER.debug("r-util> apply stratified split to #{dataset.uri}")
 
-      df = dataset_to_dataframe( dataset, missing_values, subjectid, split_features )
+      df = dataset_to_dataframe( dataset, missing_values, subjectid)
       @r.eval "set.seed(#{seed})"
-      @r.eval "split <- stratified_split(#{df}, ratio=#{pct})"
-      split = @r.pull 'split'
-      split = split.collect{|s| 1-s.to_i} # reverse 1s and 0s, as 1 means selected, but 0 will be first set
-      split_to_datasets( df, split, subjectid )
+      str_split_features = ""
+      if split_features
+        @r.split_features = split_features if split_features
+        str_split_features = "colnames=split_features"
+      end
+      #@r.eval "save.image(\"/tmp/image.R\")"
+
+      if k_fold_split
+        @r.eval "split <- stratified_k_fold_split(#{df}, num_folds=#{num_folds}, #{str_split_features})"
+        split = @r.pull 'split'
+        train = []
+        test = []
+        num_folds.times do |f|
+          datasetname = 'dataset fold '+(f+1).to_s+' of '+num_folds.to_s
+          metadata[DC.title] = "training "+datasetname
+          train << split_to_dataset( df, split, metadata, subjectid ){ |i| i!=(f+1) }
+          metadata[DC.title] = "test "+datasetname
+          test << split_to_dataset( df, split, metadata, subjectid ){ |i| i==(f+1) }
+        end
+        return train, test
+      else
+        puts "split <- stratified_split(#{df}, ratio=#{pct}, #{str_split_features})"
+        @r.eval "split <- stratified_split(#{df}, ratio=#{pct}, #{str_split_features})"
+        split = @r.pull 'split'
+        metadata[DC.title] = "Training dataset split of "+dataset.uri
+        train = split_to_dataset( df, split, metadata, subjectid ){ |i| i==1 }
+        metadata[DC.title] = "Test dataset split of "+dataset.uri
+        test = split_to_dataset( df, split, metadata, subjectid ){ |i| i==0 }
+        return train, test
+      end
     end
+    public
 
     # dataset should be loaded completely (use Dataset.find)
     # takes duplicates into account
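
A hedged usage sketch of the two split entry points (dataset and subjectid are placeholders; both calls return train/test results):

    rutil = OpenTox::RUtil.new
    # single 30% test split with the default seed
    train, test = rutil.stratified_split(dataset, {}, "NA", 0.3, subjectid)
    # 10-fold split: two parallel arrays of training and test datasets
    trains, tests = rutil.stratified_k_fold_split(dataset, {}, "NA", 10, subjectid)
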
@@ -212,9 +267,13 @@ module OpenTox
         features = dataset.features.keys.sort
       end
       compounds = []
+      compound_names = []
       dataset.compounds.each do |c|
+        count = 0
         num_compounds[c].times do |i|
           compounds << c
+          compound_names << "#{c}$#{count}"
+          count+=1
         end
       end
 
@@ -238,7 +297,7 @@ module OpenTox
         end
       end
       df_name = "df_#{dataset.uri.split("/")[-1].split("?")[0]}"
-      assign_dataframe(df_name,d_values,compounds,features)
+      assign_dataframe(df_name,d_values,compound_names,features)
 
       # set dataframe column types accordingly
       f_count = 1 #R starts at 1
@@ -264,25 +323,27 @@ module OpenTox
 
     # converts a dataframe into a dataset (a new dataset is created at the dataset webservice)
     # this is only possible if a superset of the dataframe was created by dataset_to_dataframe (metadata and URIs!)
-    def dataframe_to_dataset( df, subjectid=nil )
-      dataframe_to_dataset_indices( df, subjectid, nil)
+    def dataframe_to_dataset( df, metadata={}, subjectid=nil )
+      dataframe_to_dataset_indices( df, metadata, subjectid, nil)
     end
 
     private
-    def dataframe_to_dataset_indices( df, subjectid=nil, compound_indices=nil )
+    def dataframe_to_dataset_indices( df, metadata={}, subjectid=nil, compound_indices=nil )
       raise unless @@feats[df].size>0
-      values, compounds, features = pull_dataframe(df)
+      values, compound_names, features = pull_dataframe(df)
+      compounds = compound_names.collect{|c| c.split("$")[0]}
       features.each{|f| raise unless @@feats[df][f]}
       dataset = OpenTox::Dataset.create(CONFIG[:services]["opentox-dataset"],subjectid)
+      dataset.add_metadata(metadata)
       LOGGER.debug "r-util> convert dataframe to dataset #{dataset.uri}"
       compounds.size.times{|i| dataset.add_compound(compounds[i]) if compound_indices==nil or compound_indices.include?(i)}
       features.each{|f| dataset.add_feature(f,@@feats[df][f])}
       features.size.times do |c|
         feat = OpenTox::Feature.find(features[c],subjectid)
-        nominal = feat.metadata[RDF.type].to_a.flatten.include?(OT.NominalFeature)
+        numeric = feat.metadata[RDF.type].to_a.flatten.include?(OT.NumericFeature)
         compounds.size.times do |r|
           if compound_indices==nil or compound_indices.include?(r)
-            dataset.add(compounds[r],features[c],nominal ? values[r][c] : values[r][c].to_f) if values[r][c]!="NA"
+            dataset.add(compounds[r],features[c],numeric ? values[r][c].to_f : values[r][c]) if values[r][c]!="NA"
           end
         end
       end
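
dataframe_to_dataset now forwards metadata to the newly created dataset; a hypothetical call for a dataframe previously produced by dataset_to_dataframe:

    ds = rutil.dataframe_to_dataset("df_123", {DC.title => "restored from dataframe"}, subjectid)
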
@@ -290,16 +351,12 @@ module OpenTox
       dataset
     end
 
-    def split_to_datasets( df, split, subjectid=nil )
-      sets = []
-      (split.min.to_i .. split.max.to_i).each do |i|
-        indices = []
-        split.size.times{|j| indices<<j if split[j]==i}
-        dataset = dataframe_to_dataset_indices( df, subjectid, indices )
-        LOGGER.debug("r-util> split into #{dataset.uri}, c:#{dataset.compounds.size}, f:#{dataset.features.size}")
-        sets << dataset
-      end
-      sets
+    def split_to_dataset( df, split, metadata={}, subjectid=nil )
+      indices = []
+      split.size.times{|i| indices<<i if yield(split[i]) }
+      dataset = dataframe_to_dataset_indices( df, metadata, subjectid, indices )
+      LOGGER.debug("r-util> split into #{dataset.uri}, c:#{dataset.compounds.size}, f:#{dataset.features.size}")
+      dataset
     end
 
     def pull_dataframe(df)
@@ -323,6 +380,8 @@ module OpenTox
     end
 
     def assign_dataframe(df,input,rownames,colnames)
+      rownames.check_uniq if rownames
+      colnames.check_uniq if colnames
       tmp = File.join(Dir.tmpdir,Time.new.to_f.to_s+"_"+rand(10000).to_s+".csv")
       file = File.new(tmp, 'w')
       input.each{|i| file.puts(i.collect{|e| "\"#{e}\""}.join("#")+"\n")}