opentox-ruby 3.1.0 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -197,7 +197,12 @@ module OpenTox
      accept_values
    end

-    # Detect feature type(s) in the dataset
+    # Detect feature type (reduced to one across all features)
+    # Classification takes precedence over regression
+    # DEPRECATED --
+    # MAKES NO SENSE FOR DATASETS WITH MORE THAN ONE FEATURE;
+    # FEATURES CAN HAVE MULTIPLE TYPES
+    # Replacement: see feature_types()
    # @return [String] "classification", "regression", "mixed" or "unknown"
    def feature_type(subjectid=nil)
      load_features(subjectid)
@@ -210,6 +215,24 @@ module OpenTox
        "unknown"
      end
    end
+
+
+    # Detect feature types. A feature can have multiple types.
+    # Returns types hashed by feature URI, with missing features omitted.
+    # Example (YAML):
+    #   http://toxcreate3.in-silico.ch:8082/dataset/152/feature/nHal:
+    #   - http://www.opentox.org/api/1.1#NumericFeature
+    #   - http://www.opentox.org/api/1.1#NominalFeature
+    #   ...
+    #
+    # @return [Hash] Keys: feature URIs, Values: Array of types
+    def feature_types(subjectid=nil)
+      load_features(subjectid)
+      @features.inject({}){ |h,(f,metadata)|
+        h[f] = metadata[RDF.type] unless metadata[RDF.type][0].include? "MissingFeature"
+        h
+      }
+    end
 =begin
 =end

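The new feature_types accessor replaces the deprecated feature_type above. A minimal usage sketch, assuming a dataset whose features already carry RDF.type annotations (the dataset URI and feature names are hypothetical):

    # Hedged sketch -- URIs are made up for illustration
    dataset = OpenTox::Dataset.find("http://example.org/dataset/152", subjectid)
    dataset.feature_types(subjectid).each do |feature_uri, types|
      puts "#{feature_uri} => #{types.collect{|t| t.split('#').last}.join(', ')}"
    end
    # e.g. ".../feature/nHal => NumericFeature, NominalFeature"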
@@ -316,11 +339,14 @@ module OpenTox
    end

    # Complete feature values by adding zeroes
-    def complete_data_entries
+    # @param [Hash] compound_sizes key: compound URI, value: number of duplicate entries
+    def complete_data_entries(compound_sizes)
      all_features = @features.keys
      @data_entries.each { |c, e|
        (Set.new(all_features.collect)).subtract(Set.new e.keys).to_a.each { |f|
-          self.add(c,f,0)
+          compound_sizes[c].times {
+            self.add(c,f,0)
+          }
        }
      }
    end
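Callers of complete_data_entries must now pass the duplicate counts, so that a compound occurring n times receives n zero entries per missing feature. A sketch of the calling convention (the counting shown is illustrative; how duplicates are tracked depends on the caller):

    # Hedged sketch: build compound URI => duplicate count, then zero-pad
    compound_sizes = Hash.new(0)
    dataset.compounds.each { |c| compound_sizes[c] += 1 }
    dataset.complete_data_entries(compound_sizes)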
@@ -454,6 +480,14 @@ module OpenTox
      end
    end

+    def value_map(prediction_feature_uri)
+      training_classes = accept_values(prediction_feature_uri).sort
+      value_map = Hash.new
+      training_classes.each_with_index { |c,i| value_map[i+1] = c }
+      value_map
+    end
+
+
    private
    # Copy a dataset (rewrites URI)
    def copy(dataset)
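value_map gives classification code a stable 1-based numeric encoding of the sorted class labels. A worked example, assuming a feature whose accept values are "active" and "inactive" (URI hypothetical):

    vm = dataset.value_map("http://example.org/dataset/1/feature/activity")
    # sorted accept values => { 1 => "active", 2 => "inactive" }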
@@ -504,6 +538,7 @@ module OpenTox
      @data_entries[compound.uri].collect{|f,v| @features[f] if f.match(/neighbor/)}.compact if @data_entries[compound.uri]
    end

+
    # def errors(compound)
    #   features = @data_entries[compound.uri].keys
    #   features.collect{|f| @features[f][OT.error]}.join(" ") if features
@@ -103,7 +103,7 @@ module OpenTox
    include Model


-    attr_accessor :compound, :prediction_dataset, :features, :effects, :activities, :p_values, :fingerprints, :feature_calculation_algorithm, :similarity_algorithm, :prediction_algorithm, :subjectid, :value_map, :compound_fingerprints, :feature_calculation_algorithm, :neighbors
+    attr_accessor :compound, :prediction_dataset, :features, :effects, :activities, :p_values, :fingerprints, :feature_calculation_algorithm, :similarity_algorithm, :prediction_algorithm, :subjectid, :value_map, :compound_fingerprints, :feature_calculation_algorithm, :neighbors, :compounds
    def initialize(uri=nil)

      if uri
@@ -169,12 +169,13 @@ module OpenTox
      lazar.prediction_algorithm = hash["prediction_algorithm"] if hash["prediction_algorithm"]
      lazar.subjectid = hash["subjectid"] if hash["subjectid"]
      lazar.value_map = hash["value_map"] if hash["value_map"]
+      lazar.compounds = hash["compounds"] if hash["compounds"]

      lazar
    end

    def to_json
-      Yajl::Encoder.encode({:uri => @uri,:metadata => @metadata, :compound => @compound, :prediction_dataset => @prediction_dataset, :features => @features, :effects => @effects, :activities => @activities, :p_values => @p_values, :fingerprints => @fingerprints, :feature_calculation_algorithm => @feature_calculation_algorithm, :similarity_algorithm => @similarity_algorithm, :prediction_algorithm => @prediction_algorithm, :subjectid => @subjectid, :value_map => @value_map})
+      Yajl::Encoder.encode({:uri => @uri,:metadata => @metadata, :compound => @compound, :prediction_dataset => @prediction_dataset, :features => @features, :effects => @effects, :activities => @activities, :p_values => @p_values, :fingerprints => @fingerprints, :feature_calculation_algorithm => @feature_calculation_algorithm, :similarity_algorithm => @similarity_algorithm, :prediction_algorithm => @prediction_algorithm, :subjectid => @subjectid, :value_map => @value_map, :compounds => @compounds})
    end

    def run( params, accept_header=nil, waiting_task=nil )
@@ -237,6 +238,7 @@ module OpenTox

      @compound = Compound.new compound_uri
      features = {}
+
      #LOGGER.debug self.to_yaml
      unless @prediction_dataset
        @prediction_dataset = Dataset.create(CONFIG[:services]["opentox-dataset"], subjectid)
@@ -247,19 +249,19 @@ module OpenTox
          OT.parameters => [{DC.title => "compound_uri", OT.paramValue => compound_uri}]
        } )
      end
-      if OpenTox::Feature.find(metadata[OT.dependentVariables], subjectid).feature_type == "regression"
-        all_activities = []
-        all_activities = @activities.values.flatten.collect! { |i| i.to_f }
-      end
+
      unless database_activity(subjectid) # adds database activity to @prediction_dataset
+
        # Calculation of needed values for query compound
        @compound_features = eval("#{@feature_calculation_algorithm}({
          :compound => @compound,
          :features => @features,
          :feature_dataset_uri => @metadata[OT.featureDataset],
          :pc_type => self.parameter(\"pc_type\"),
+          :lib => self.parameter(\"lib\"),
          :subjectid => subjectid
        })")
+
        # Adding fingerprint of query compound with features and values(p_value*nr_hits)
        @compound_fingerprints = {}
        @compound_features.each do |feature, value| # value is nil if "Substructure.match"
@@ -314,6 +316,16 @@ module OpenTox
          @prediction_dataset.add @compound.uri, feature_uri, true
          f+=1
        end
+      elsif @feature_calculation_algorithm == "Substructure.lookup"
+        f = 0
+        @compound_features.each do |feature, value|
+          features[feature] = feature
+          @prediction_dataset.add_feature(feature, {
+            RDF.type => [OT.NumericFeature]
+          })
+          @prediction_dataset.add @compound.uri, feature, value
+          f+=1
+        end
      else
        @compound_features.each do |feature|
          features[feature] = feature
@@ -337,15 +349,26 @@ module OpenTox
        else
          feature_uri = feature
        end
-        @prediction_dataset.add neighbor[:compound], feature_uri, true
+        if @feature_calculation_algorithm == "Substructure.lookup"
+          @prediction_dataset.add neighbor[:compound], feature_uri, @fingerprints[neighbor[:compound]][feature_uri]
+        else
+          @prediction_dataset.add neighbor[:compound], feature_uri, true
+        end
+
        unless features.has_key? feature
          features[feature] = feature_uri
-          @prediction_dataset.add_feature(feature_uri, {
-            RDF.type => [OT.Substructure],
-            OT.smarts => feature,
-            OT.pValue => @p_values[feature],
-            OT.effect => @effects[feature]
-          })
+          if @feature_calculation_algorithm == "Substructure.lookup"
+            @prediction_dataset.add_feature(feature_uri, {
+              RDF.type => [OT.NumericFeature]
+            })
+          else
+            @prediction_dataset.add_feature(feature_uri, {
+              RDF.type => [OT.Substructure],
+              OT.smarts => feature,
+              OT.pValue => @p_values[feature],
+              OT.effect => @effects[feature]
+            })
+          end
          f+=1
        end
      end
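Both branches above follow one rule: with "Substructure.lookup" the neighbor's precomputed numeric fingerprint value is written and the feature is typed OT.NumericFeature; otherwise a boolean match is written and the feature keeps its OT.Substructure metadata. A condensed sketch of that rule (names as in the diff):

    lookup = @feature_calculation_algorithm == "Substructure.lookup"
    value  = lookup ? @fingerprints[neighbor[:compound]][feature_uri] : true
    @prediction_dataset.add neighbor[:compound], feature_uri, value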
@@ -349,11 +349,15 @@ module OpenTox

    # Load CSV string (format specification: http://toxcreate.org/help)
    # @param [String] csv CSV representation of the dataset
+    # @param [Boolean] drop_missing Whether completely missing rows should be dropped
+    # @param [Boolean] all_numeric Whether all features should be treated as numeric
+    # @param [Boolean] del_nominal All nominal features will be removed
    # @return [OpenTox::Dataset] Dataset object with CSV data
-    def load_csv(csv, drop_missing=false)
+    def load_csv(csv, drop_missing=false, all_numeric=false)
      row = 0
      input = csv.split("\n")
      headers = split_row(input.shift)
+      headers.collect! {|header| header.to_s.gsub(/[\/.\\\(\)\{\}\[\]]/,"_")}
      add_features(headers)
      value_maps = Array.new
      regression_features=Array.new
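A calling sketch for the extended load_csv; the receiver and CSV content are hypothetical, and the argument order follows the new signature:

    csv = "SMILES,LogP\nc1ccccc1,2.13\nCCO,-0.31"  # hypothetical content
    dataset = parser.load_csv(csv, false, true)    # drop_missing=false, all_numeric=true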
@@ -362,7 +366,7 @@ module OpenTox
      row = split_row(row)
      value_maps = detect_new_values(row, value_maps)
      value_maps.each_with_index { |vm,j|
-        if vm.size > @max_class_values # max @max_class_values classes.
+        if (vm.size > @max_class_values) || all_numeric # max @max_class_values classes.
          regression_features[j]=true
        else
          regression_features[j]=false
@@ -392,22 +396,30 @@ module OpenTox

    def warnings

-      info = ''
+      info = '<br>'
      @feature_types.each do |feature,types|
+        @dataset.add_feature_metadata(feature,{RDF.type => []})
        if types.uniq.size == 0
-          type = "helper#MissingFeature"
-        elsif types.uniq.size > 1
-          type = OT.NumericFeature
+          @dataset.add_feature_metadata(
+            feature, {RDF.type => ( @dataset.features[feature][RDF.type] << "helper#MissingFeature" ) } # TODO: Fit to OT ontology!
+          )
+          info += "'#{@dataset.feature_name(feature)}' detected as 'MissingFeature'<br>"
        else
-          type = types.first
+          info += "'#{@dataset.feature_name(feature)}' detected as "
+          types_arr = []
+          types.uniq.each { |t|
+            types_arr << t
+            info += "'#{t.split('#').last}', "
+          }
+
+          @dataset.add_feature_metadata(
+            feature, {RDF.type => types_arr.sort} # nominal should be first for downward compatibility
+          )
+
+          info.chop!.chop!
+          info += "<br>"
        end
-        @dataset.add_feature_metadata(feature,{RDF.type => [type]})
-        info += "\"#{@dataset.feature_name(feature)}\" detected as #{type.split('#').last}." if type
-
-        # TODO: rewrite feature values
-        # TODO if value.to_f == 0 @activity_errors << "#{id} Zero values not allowed for regression datasets - entry ignored."
      end
-
      @dataset.metadata[OT.Info] = info

      warnings = ''
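For a feature detected as both numeric and nominal, the rewritten loop yields an OT.Info entry like the following (the feature name is hypothetical; the two chop! calls strip the trailing comma and space):

    <br>'nHal' detected as 'NumericFeature', 'NominalFeature'<br>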
@@ -469,28 +481,31 @@ module OpenTox
    unless @duplicate_feature_indices.include? i

      value = row[i]
-      #LOGGER.warn "Missing values for #{id}" if value.size == 0 # String is empty
      feature = @features[feature_idx]

      type = feature_type(value) # May be NIL
-      type = OT.NominalFeature unless (type.nil? || regression_features[i])
      @feature_types[feature] << type if type
+      # Add nominal type if #distinct values <= @max_class_values
+      if type == OT.NumericFeature
+        @feature_types[feature] << OT.NominalFeature unless regression_features[i]
+      end

      val = nil
      case type
      when OT.NumericFeature
        val = value.to_f
+        val = nil if val.infinite?
      when OT.NominalFeature
        val = value.to_s
      end

      feature_idx += 1

-      if val != nil
+      if val != nil
        @dataset.add(compound.uri, feature, val)
-        if type != OT.NumericFeature
+        if @feature_types[feature].include? OT.NominalFeature
          @dataset.features[feature][OT.acceptValue] = [] unless @dataset.features[feature][OT.acceptValue]
-          @dataset.features[feature][OT.acceptValue] << val.to_s unless @dataset.features[feature][OT.acceptValue].include?(val.to_s)
+          @dataset.features[feature][OT.acceptValue] << val unless @dataset.features[feature][OT.acceptValue].include?(val)
        end
      end

@@ -654,7 +669,7 @@ module OpenTox
      obmol.get_data.each { |d| row[d.get_attribute] = d.get_value if properties.include?(d.get_attribute) }
      table.data[compound.uri] = row
    end
-
+
    # find and remove ignored_features
    @activity_errors = table.clean_features
    table.add_to_dataset @dataset
@@ -8,6 +8,18 @@ PACKAGE_DIR = package_dir

 require "tempfile"

+class Array
+
+  def check_uniq
+    hash = {}
+    self.each do |x|
+      raise "duplicate #{x}" if hash[x]
+      hash[x] = true
+    end
+  end
+
+end
+
 module OpenTox

   class RUtil
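Array#check_uniq raises on the first duplicate; it guards the dataframe row and column names used by assign_dataframe further down. A brief example:

    ["a", "b", "c"].check_uniq   # passes
    ["a", "b", "a"].check_uniq   # raises RuntimeError: duplicate a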
@@ -75,12 +87,10 @@ module OpenTox
    end

    # embeds feature values of two datasets into 2D and plots it
-    # fast_plot = true -> PCA, fast_plot = false -> SMACOF (iterative optimisation method)
    #
    def feature_value_plot(files, dataset_uri1, dataset_uri2, dataset_name1, dataset_name2,
-      features=nil, fast_plot=true, subjectid=nil, waiting_task=nil)
+      features=nil, subjectid=nil, waiting_task=nil)

-      raise "r-package smacof missing" if fast_plot==false and !package_installed?("smacof")
      LOGGER.debug("r-util> create feature value plot")
      d1 = OpenTox::Dataset.find(dataset_uri1,subjectid)
      d2 = OpenTox::Dataset.find(dataset_uri2,subjectid)
@@ -102,17 +112,13 @@ module OpenTox
      @r.eval "split <- c(rep(0,nrow(#{df1})),rep(1,nrow(#{df2})))"
      @r.names = [dataset_name1, dataset_name2]
      LOGGER.debug("r-util> - convert data to 2d")
-      @r.eval "df.2d <- plot_pre_process(df, method='#{(fast_plot ? "pca" : "smacof")}')"
+      #@r.eval "save.image(\"/tmp/image.R\")"
+      @r.eval "df.2d <- plot_pre_process(df, method='sammon')"
      waiting_task.progress(75) if waiting_task

-      if fast_plot
-        info = "main='PCA-Embedding of #{features.size} features',xlab='PC1',ylab='PC2'"
-      else
-        info = "main='SMACOF-Embedding of #{features.size} features',xlab='x',ylab='y'"
-      end
      LOGGER.debug("r-util> - plot data")
      plot_to_files(files) do |file|
-        @r.eval "plot_split( df.2d, split, names, #{info})"
+        @r.eval "plot_split( df.2d, split, names, main='Sammon embedding of #{features.size} features',xlab='x',ylab='y')"
      end
    end

@@ -170,19 +176,68 @@ module OpenTox
      end
    end

-    # stratified splits a dataset into two dataset the feature values
+    # splits a dataset into two datasets, stratified according to the feature values
+    # all features are taken into account unless <split_features> is given
+    # returns two datasets
+    def stratified_split( dataset, metadata={}, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil )
+      stratified_split_internal( dataset, metadata, missing_values, nil, pct, subjectid, seed, split_features )
+    end
+
+    # splits a dataset into k datasets, stratified according to the feature values
    # all features are taken into account unless <split_features> is given
-    def stratified_split( dataset, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil )
+    # returns two arrays of datasets
+    def stratified_k_fold_split( dataset, metadata={}, missing_values="NA", num_folds=10, subjectid=nil, seed=42, split_features=nil )
+      stratified_split_internal( dataset, metadata, missing_values, num_folds, nil, subjectid, seed, split_features )
+    end
+
+    private
+    def stratified_split_internal( dataset, metadata={}, missing_values="NA", num_folds=nil, pct=nil, subjectid=nil, seed=42, split_features=nil )
+      raise "internal error" if num_folds!=nil and pct!=nil
+      k_fold_split = num_folds!=nil
+      if k_fold_split
+        raise "num_folds not a fixnum: #{num_folds}" unless num_folds.is_a?(Fixnum)
+      else
+        raise "pct is not a numeric: #{pct}" unless pct.is_a?(Numeric)
+      end
      raise "not a loaded ot-dataset" unless dataset.is_a?(OpenTox::Dataset) and dataset.compounds.size>0 and dataset.features.size>0
+      raise "missing_values=#{missing_values}" unless missing_values.is_a?(String) or missing_values==0
+      raise "subjectid=#{subjectid}" unless subjectid==nil or subjectid.is_a?(String)
      LOGGER.debug("r-util> apply stratified split to #{dataset.uri}")

-      df = dataset_to_dataframe( dataset, missing_values, subjectid, split_features )
+      df = dataset_to_dataframe( dataset, missing_values, subjectid)
      @r.eval "set.seed(#{seed})"
-      @r.eval "split <- stratified_split(#{df}, ratio=#{pct})"
-      split = @r.pull 'split'
-      split = split.collect{|s| 1-s.to_i} # reverse 1s and 0s, as 1 means selected, but 0 will be first set
-      split_to_datasets( df, split, subjectid )
+      str_split_features = ""
+      if split_features
+        @r.split_features = split_features if split_features
+        str_split_features = "colnames=split_features"
+      end
+      #@r.eval "save.image(\"/tmp/image.R\")"
+
+      if k_fold_split
+        @r.eval "split <- stratified_k_fold_split(#{df}, num_folds=#{num_folds}, #{str_split_features})"
+        split = @r.pull 'split'
+        train = []
+        test = []
+        num_folds.times do |f|
+          datasetname = 'dataset fold '+(f+1).to_s+' of '+num_folds.to_s
+          metadata[DC.title] = "training "+datasetname
+          train << split_to_dataset( df, split, metadata, subjectid ){ |i| i!=(f+1) }
+          metadata[DC.title] = "test "+datasetname
+          test << split_to_dataset( df, split, metadata, subjectid ){ |i| i==(f+1) }
+        end
+        return train, test
+      else
+        puts "split <- stratified_split(#{df}, ratio=#{pct}, #{str_split_features})"
+        @r.eval "split <- stratified_split(#{df}, ratio=#{pct}, #{str_split_features})"
+        split = @r.pull 'split'
+        metadata[DC.title] = "Training dataset split of "+dataset.uri
+        train = split_to_dataset( df, split, metadata, subjectid ){ |i| i==1 }
+        metadata[DC.title] = "Test dataset split of "+dataset.uri
+        test = split_to_dataset( df, split, metadata, subjectid ){ |i| i==0 }
+        return train, test
+      end
    end
+    public

    # dataset should be loaded completely (use Dataset.find)
    # takes duplicates into account
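A usage sketch for the two public entry points, assuming a fully loaded dataset (URI hypothetical). Both return training data first, test data second:

    rutil = OpenTox::RUtil.new
    dataset = OpenTox::Dataset.find("http://example.org/dataset/1", subjectid)

    # single stratified split, 30% test fraction
    train, test = rutil.stratified_split(dataset, {}, "NA", 0.3, subjectid)

    # stratified 10-fold split: two arrays of 10 datasets each
    train_folds, test_folds = rutil.stratified_k_fold_split(dataset, {}, "NA", 10, subjectid)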
@@ -212,9 +267,13 @@ module OpenTox
        features = dataset.features.keys.sort
      end
      compounds = []
+      compound_names = []
      dataset.compounds.each do |c|
+        count = 0
        num_compounds[c].times do |i|
          compounds << c
+          compound_names << "#{c}$#{count}"
+          count+=1
        end
      end

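The $-suffix keeps duplicate compounds distinguishable as dataframe row names, which the check_uniq guard in assign_dataframe now requires; dataframe_to_dataset_indices strips the suffix again via split("$")[0]. For a compound occurring twice the row names become, e.g.:

    http://example.org/compound/42$0
    http://example.org/compound/42$1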
@@ -238,7 +297,7 @@ module OpenTox
        end
      end
      df_name = "df_#{dataset.uri.split("/")[-1].split("?")[0]}"
-      assign_dataframe(df_name,d_values,compounds,features)
+      assign_dataframe(df_name,d_values,compound_names,features)

      # set dataframe column types accordingly
      f_count = 1 #R starts at 1
@@ -264,25 +323,27 @@ module OpenTox

    # converts a dataframe into a dataset (a new dataset is created at the dataset webservice)
    # this is only possible if a superset of the dataframe was created by dataset_to_dataframe (metadata and URIs!)
-    def dataframe_to_dataset( df, subjectid=nil )
-      dataframe_to_dataset_indices( df, subjectid, nil)
+    def dataframe_to_dataset( df, metadata={}, subjectid=nil )
+      dataframe_to_dataset_indices( df, metadata, subjectid, nil)
    end

    private
-    def dataframe_to_dataset_indices( df, subjectid=nil, compound_indices=nil )
+    def dataframe_to_dataset_indices( df, metadata={}, subjectid=nil, compound_indices=nil )
      raise unless @@feats[df].size>0
-      values, compounds, features = pull_dataframe(df)
+      values, compound_names, features = pull_dataframe(df)
+      compounds = compound_names.collect{|c| c.split("$")[0]}
      features.each{|f| raise unless @@feats[df][f]}
      dataset = OpenTox::Dataset.create(CONFIG[:services]["opentox-dataset"],subjectid)
+      dataset.add_metadata(metadata)
      LOGGER.debug "r-util> convert dataframe to dataset #{dataset.uri}"
      compounds.size.times{|i| dataset.add_compound(compounds[i]) if compound_indices==nil or compound_indices.include?(i)}
      features.each{|f| dataset.add_feature(f,@@feats[df][f])}
      features.size.times do |c|
        feat = OpenTox::Feature.find(features[c],subjectid)
-        nominal = feat.metadata[RDF.type].to_a.flatten.include?(OT.NominalFeature)
+        numeric = feat.metadata[RDF.type].to_a.flatten.include?(OT.NumericFeature)
        compounds.size.times do |r|
          if compound_indices==nil or compound_indices.include?(r)
-            dataset.add(compounds[r],features[c],nominal ? values[r][c] : values[r][c].to_f) if values[r][c]!="NA"
+            dataset.add(compounds[r],features[c],numeric ? values[r][c].to_f : values[r][c]) if values[r][c]!="NA"
          end
        end
      end
@@ -290,16 +351,12 @@ module OpenTox
      dataset
    end

-    def split_to_datasets( df, split, subjectid=nil )
-      sets = []
-      (split.min.to_i .. split.max.to_i).each do |i|
-        indices = []
-        split.size.times{|j| indices<<j if split[j]==i}
-        dataset = dataframe_to_dataset_indices( df, subjectid, indices )
-        LOGGER.debug("r-util> split into #{dataset.uri}, c:#{dataset.compounds.size}, f:#{dataset.features.size}")
-        sets << dataset
-      end
-      sets
+    def split_to_dataset( df, split, metadata={}, subjectid=nil )
+      indices = []
+      split.size.times{|i| indices<<i if yield(split[i]) }
+      dataset = dataframe_to_dataset_indices( df, metadata, subjectid, indices )
+      LOGGER.debug("r-util> split into #{dataset.uri}, c:#{dataset.compounds.size}, f:#{dataset.features.size}")
+      dataset
    end

    def pull_dataframe(df)
@@ -323,6 +380,8 @@ module OpenTox
    end

    def assign_dataframe(df,input,rownames,colnames)
+      rownames.check_uniq if rownames
+      colnames.check_uniq if colnames
      tmp = File.join(Dir.tmpdir,Time.new.to_f.to_s+"_"+rand(10000).to_s+".csv")
      file = File.new(tmp, 'w')
      input.each{|i| file.puts(i.collect{|e| "\"#{e}\""}.join("#")+"\n")}