opentox-ruby 3.0.1 → 3.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/parser.rb CHANGED
@@ -57,7 +57,7 @@ module OpenTox
57
57
  `rapper -i rdfxml -o ntriples #{file.path} 2>/dev/null`.each_line do |line|
58
58
  triple = line.to_triple
59
59
  if triple[0] == @uri
60
- if triple[1] == RDF.type || triple[1]==OT.predictedVariables # allow multiple types
60
+ if triple[1] == RDF.type || triple[1]==OT.predictedVariables || triple[1]==OT.independentVariables # allow multiple types
61
61
  @metadata[triple[1]] = [] unless @metadata[triple[1]]
62
62
  @metadata[triple[1]] << triple[2].split('^^').first
63
63
  else
@@ -290,10 +290,11 @@ module OpenTox
290
290
  @features = []
291
291
  @feature_types = {}
292
292
 
293
- @format_errors = ""
294
- @smiles_errors = []
293
+ @format_errors = []
294
+ @id_errors = []
295
295
  @activity_errors = []
296
296
  @duplicates = {}
297
+ @max_class_values = 3
297
298
  end
298
299
 
299
300
  def detect_new_values(row, value_maps)
@@ -309,9 +310,10 @@ module OpenTox
309
310
  # Load Spreadsheet book (created with roo gem http://roo.rubyforge.org/, excel format specification: http://toxcreate.org/help)
310
311
  # @param [Excel] book Excel workbook object (created with roo gem)
311
312
  # @return [OpenTox::Dataset] Dataset object with Excel data
312
- def load_spreadsheet(book)
313
+ def load_spreadsheet(book, drop_missing=false)
313
314
  book.default_sheet = 0
314
- add_features book.row(1)
315
+ headers = book.row(1)
316
+ add_features headers
315
317
  value_maps = Array.new
316
318
  regression_features=Array.new
317
319
 
@@ -319,15 +321,27 @@ module OpenTox
319
321
  row = book.row(i)
320
322
  value_maps = detect_new_values(row, value_maps)
321
323
  value_maps.each_with_index { |vm,j|
322
- if vm.size > 5 # 5 is the maximum nr of classes supported by Fminer.
324
+ if vm.size > @max_class_values # max @max_class_values classes (Fminer limit).
323
325
  regression_features[j]=true
324
326
  else
325
327
  regression_features[j]=false
326
328
  end
327
329
  }
328
330
  }
331
+
329
332
  2.upto(book.last_row) { |i|
330
- add_values book.row(i), regression_features
333
+ drop=false
334
+ row = book.row(i)
335
+ raise "Entry has size #{row.size}, different from headers (#{headers.size})" if row.size != headers.size
336
+ if row.include?("")
337
+ @format_errors << "Row #{i} has #{row.count("")} missing values"
338
+ drop=true
339
+ drop_missing=true if (row.count("") == row.size-1)
340
+ end
341
+ add_values(row, regression_features) unless (drop_missing && drop)
342
+ if (drop_missing && drop)
343
+ @format_errors << "Row #{i} not added"
344
+ end
331
345
  }
332
346
  warnings
333
347
  @dataset
@@ -336,10 +350,11 @@ module OpenTox
336
350
  # Load CSV string (format specification: http://toxcreate.org/help)
337
351
  # @param [String] csv CSV representation of the dataset
338
352
  # @return [OpenTox::Dataset] Dataset object with CSV data
339
- def load_csv(csv)
353
+ def load_csv(csv, drop_missing=false)
340
354
  row = 0
341
355
  input = csv.split("\n")
342
- add_features split_row(input.shift)
356
+ headers = split_row(input.shift)
357
+ add_features(headers)
343
358
  value_maps = Array.new
344
359
  regression_features=Array.new
345
360
 
@@ -347,15 +362,27 @@ module OpenTox
347
362
  row = split_row(row)
348
363
  value_maps = detect_new_values(row, value_maps)
349
364
  value_maps.each_with_index { |vm,j|
350
- if vm.size > 5 # 5 is the maximum nr of classes supported by Fminer.
365
+ if vm.size > @max_class_values # max @max_class_values classes.
351
366
  regression_features[j]=true
352
367
  else
353
368
  regression_features[j]=false
354
369
  end
355
370
  }
356
371
  }
357
- input.each { |row|
358
- add_values split_row(row), regression_features
372
+
373
+ input.each_with_index { |row, i|
374
+ drop=false
375
+ row = split_row(row)
376
+ raise "Entry has size #{row.size}, different from headers (#{headers.size})" if row.size != headers.size
377
+ if row.include?("")
378
+ @format_errors << "Row #{i} has #{row.count("")} missing values"
379
+ drop=true
380
+ drop_missing=true if (row.count("") == row.size-1)
381
+ end
382
+ add_values(row, regression_features) unless (drop_missing && drop)
383
+ if (drop_missing && drop)
384
+ @format_errors << "Row #{i} not added"
385
+ end
359
386
  }
360
387
  warnings
361
388
  @dataset
@@ -367,88 +394,115 @@ module OpenTox
367
394
 
368
395
  info = ''
369
396
  @feature_types.each do |feature,types|
370
- if types.uniq.size > 1
397
+ if types.uniq.size == 0
398
+ type = "helper#MissingFeature"
399
+ elsif types.uniq.size > 1
371
400
  type = OT.NumericFeature
372
401
  else
373
402
  type = types.first
374
403
  end
375
404
  @dataset.add_feature_metadata(feature,{RDF.type => [type]})
376
- info += "\"#{@dataset.feature_name(feature)}\" detected as #{type.split('#').last}."
405
+ info += "\"#{@dataset.feature_name(feature)}\" detected as #{type.split('#').last}." if type
377
406
 
378
407
  # TODO: rewrite feature values
379
- # TODO if value.to_f == 0 @activity_errors << "#{smiles} Zero values not allowed for regression datasets - entry ignored."
408
+ # TODO if value.to_f == 0 @activity_errors << "#{id} Zero values not allowed for regression datasets - entry ignored."
380
409
  end
381
410
 
382
411
  @dataset.metadata[OT.Info] = info
383
412
 
384
413
  warnings = ''
385
- warnings += "<p>Incorrect Smiles structures (ignored):</p>" + @smiles_errors.join("<br/>") unless @smiles_errors.empty?
414
+ warnings += "<p>Incorrect structures (ignored):</p>" + @id_errors.join("<br/>") unless @id_errors.empty?
386
415
  warnings += "<p>Irregular activities (ignored):</p>" + @activity_errors.join("<br/>") unless @activity_errors.empty?
416
+ warnings += "<p>Format errors:</p>" + @format_errors.join("<br/>") unless @format_errors.empty?
387
417
  duplicate_warnings = ''
388
418
  @duplicates.each {|inchi,lines| duplicate_warnings << "<p>#{lines.join('<br/>')}</p>" if lines.size > 1 }
389
- warnings += "<p>Duplicated structures (all structures/activities used for model building, please make sure, that the results were obtained from <em>independent</em> experiments):</p>" + duplicate_warnings unless duplicate_warnings.empty?
419
+ warnings += "<p>Duplicate structures (all structures/activities used for model building, please make sure that the results were obtained from <em>independent</em> experiments):</p>" + duplicate_warnings unless duplicate_warnings.empty?
390
420
 
391
421
  @dataset.metadata[OT.Warnings] = warnings
392
422
 
393
423
  end
394
424
 
425
+ # Adds a row of features to a dataset
426
+ # @param Array A row split up as an array
427
+ # @return Array Indices for duplicate features
395
428
  def add_features(row)
396
- row.shift # get rid of smiles entry
397
- row.each do |feature_name|
429
+ row=row.collect
430
+ row.shift # get rid of id entry
431
+ @duplicate_feature_indices = [] # index 0 is the first feature after the id column
432
+ row.each_with_index do |feature_name, idx|
398
433
  feature_uri = File.join(@dataset.uri,"feature",URI.encode(feature_name))
399
- @feature_types[feature_uri] = []
400
- @features << feature_uri
401
- @dataset.add_feature(feature_uri,{DC.title => feature_name})
434
+ unless @features.include? feature_uri
435
+ @feature_types[feature_uri] = []
436
+ @features << feature_uri
437
+ @dataset.add_feature(feature_uri,{DC.title => feature_name})
438
+ else
439
+ @duplicate_feature_indices << idx
440
+ @format_errors << "Duplicate Feature '#{feature_name}' at pos #{idx}"
441
+ end
402
442
  end
403
443
  end
404
444
 
405
445
  # Adds a row to a dataset
406
446
  # @param Array A row split up as an array
407
447
  # @param Array Indicator for regression for each field
448
+ # @param Array Indices for duplicate features
408
449
  def add_values(row, regression_features)
409
450
 
410
- smiles = row.shift
411
- compound = Compound.from_smiles(smiles)
451
+ id = row.shift
452
+ case id
453
+ when /InChI/
454
+ compound = Compound.from_inchi(URI.decode_www_form_component(id))
455
+ else
456
+ compound = Compound.from_smiles(id)
457
+ end
458
+
412
459
  if compound.nil? or compound.inchi.nil? or compound.inchi == ""
413
- @smiles_errors << smiles+", "+row.join(", ")
460
+ @id_errors << id+", "+row.join(", ")
414
461
  return false
415
462
  end
416
463
  @duplicates[compound.inchi] = [] unless @duplicates[compound.inchi]
417
- @duplicates[compound.inchi] << smiles+", "+row.join(", ")
464
+ @duplicates[compound.inchi] << id+", "+row.join(", ")
418
465
 
466
+ feature_idx = 0
419
467
  row.each_index do |i|
420
- value = row[i]
421
- feature = @features[i]
422
468
 
423
- type = nil
424
- if (regression_features[i])
425
- type = feature_type(value)
426
- if type != OT.NumericFeature
427
- raise "Error! Expected numeric values."
469
+ unless @duplicate_feature_indices.include? i
470
+
471
+ value = row[i]
472
+ #LOGGER.warn "Missing values for #{id}" if value.size == 0 # String is empty
473
+ feature = @features[feature_idx]
474
+
475
+ type = feature_type(value) # May be NIL
476
+ type = OT.NominalFeature unless (type.nil? || regression_features[i])
477
+ @feature_types[feature] << type if type
478
+
479
+ val = nil
480
+ case type
481
+ when OT.NumericFeature
482
+ val = value.to_f
483
+ when OT.NominalFeature
484
+ val = value.to_s
428
485
  end
429
- else
430
- type = OT.NominalFeature
431
- end
432
- @feature_types[feature] << type
433
486
 
434
- case type
435
- when OT.NumericFeature
436
- val = value.to_f
437
- when OT.NominalFeature
438
- val = value.to_s
439
- end
440
- if val!=nil
441
- @dataset.add(compound.uri, feature, val)
442
- if type!=OT.NumericFeature
443
- @dataset.features[feature][OT.acceptValue] = [] unless @dataset.features[feature][OT.acceptValue]
444
- @dataset.features[feature][OT.acceptValue] << val.to_s unless @dataset.features[feature][OT.acceptValue].include?(val.to_s)
487
+ feature_idx += 1
488
+
489
+ if val != nil
490
+ @dataset.add(compound.uri, feature, val)
491
+ if type != OT.NumericFeature
492
+ @dataset.features[feature][OT.acceptValue] = [] unless @dataset.features[feature][OT.acceptValue]
493
+ @dataset.features[feature][OT.acceptValue] << val.to_s unless @dataset.features[feature][OT.acceptValue].include?(val.to_s)
494
+ end
445
495
  end
496
+
446
497
  end
498
+
447
499
  end
448
500
  end
449
501
 
450
502
  def feature_type(value)
451
- if OpenTox::Algorithm::numeric? value
503
+ if value == ""
504
+ return nil
505
+ elsif OpenTox::Algorithm::numeric? value
452
506
  return OT.NumericFeature
453
507
  else
454
508
  return OT.NominalFeature
@@ -456,7 +510,7 @@ module OpenTox
456
510
  end
457
511
 
458
512
  def split_row(row)
459
- row.chomp.gsub(/["']/,'').split(/\s*[,;\t]\s*/) # remove quotes
513
+ row.chomp.gsub(/["']/,'').split(/\s*[,;\t]\s*/,-1) # -1: do not skip empty cells
460
514
  end
461
515
 
462
516
  end
@@ -468,6 +522,7 @@ module OpenTox
468
522
  def initialize
469
523
  @data = {}
470
524
  @activity_errors = []
525
+ @max_class_values = 3
471
526
  end
472
527
 
473
528
  def feature_values(feature)
@@ -485,14 +540,14 @@ module OpenTox
485
540
  def clean_features
486
541
  ignored_features = []
487
542
  features.each do |feature|
488
- if feature_values(feature).size > 5
543
+ if feature_values(feature).size > @max_class_values
489
544
  if feature_types(feature).size == 1 and feature_types(feature).first == OT.NumericFeature
490
545
  # REGRESSION
491
546
  elsif feature_types(feature).include? OT.NumericFeature
492
547
  @data.each{|c,row| row[feature] = nil unless OpenTox::Algorithm::numeric?(row[feature]) } # delete nominal features
493
548
  @activity_errors << "Nominal feature values of #{feature} ignored (using numeric features for regression models)."
494
549
  else
495
- @activity_errors << "Feature #{feature} ignored (more than 5 nominal feature values and no numeric values)."
550
+ @activity_errors << "Feature #{feature} ignored (more than #{@max_class_values} nominal feature values and no numeric values)."
496
551
  ignored_features << feature
497
552
  next
498
553
  end
@@ -543,12 +598,15 @@ module OpenTox
543
598
  private
544
599
 
545
600
  def feature_type(value)
546
- if OpenTox::Algorithm::numeric? value
601
+ if value.nil?
602
+ return nil
603
+ elsif OpenTox::Algorithm::numeric? value
547
604
  return OT.NumericFeature
548
605
  else
549
606
  return OT.NominalFeature
550
607
  end
551
608
  end
609
+
552
610
  end
553
611
 
554
612
  # quick hack to enable sdf import via csv
@@ -589,20 +647,20 @@ module OpenTox
589
647
  @duplicates[inchi] << rec #inchi#+", "+row.join(", ")
590
648
  compound = Compound.from_inchi inchi
591
649
  rescue
592
- @compound_errors << "Could not convert structure to InChI, all entries for this compound (record #{rec} have been ignored! \n#{s}"
650
+ @compound_errors << "Could not convert structure to InChI, all entries for this compound (record #{rec}) have been ignored! \n#{s}"
593
651
  next
594
652
  end
595
653
  row = {}
596
654
  obmol.get_data.each { |d| row[d.get_attribute] = d.get_value if properties.include?(d.get_attribute) }
597
655
  table.data[compound.uri] = row
598
656
  end
599
-
600
- # finda and remove ignored_features
657
+
658
+ # find and remove ignored_features
601
659
  @activity_errors = table.clean_features
602
660
  table.add_to_dataset @dataset
603
661
 
604
662
  warnings = ''
605
- warnings += "<p>Incorrect Smiles structures (ignored):</p>" + @compound_errors.join("<br/>") unless @compound_errors.empty?
663
+ warnings += "<p>Incorrect structures (ignored):</p>" + @compound_errors.join("<br/>") unless @compound_errors.empty?
606
664
  warnings += "<p>Irregular activities (ignored):</p>" + @activity_errors.join("<br/>") unless @activity_errors.empty?
607
665
  duplicate_warnings = ''
608
666
  @duplicates.each {|inchi,lines| duplicate_warnings << "<p>#{lines.join('<br/>')}</p>" if lines.size > 1 }
data/lib/r-util.rb ADDED
@@ -0,0 +1,354 @@
1
+ # pending: package dir hack ---------
2
+ # CONFIG[:base_dir] = "/home/<user>/opentox-ruby/www"
3
+ # PACKAGE_DIR = "/home/<user>/opentox-ruby/r-packages"
4
+ package_dir = CONFIG[:base_dir].split("/")
5
+ package_dir[-1] = "r-packages"
6
+ package_dir = package_dir.join("/")
7
+ PACKAGE_DIR = package_dir
8
+
9
+ require "tempfile"
10
+
11
+ module OpenTox
12
+
13
+ class RUtil
14
+
15
+ @@feats = {}
16
+
17
+ def initialize
18
+ @r = RinRuby.new(true,false) unless defined?(@r) and @r
19
+ @r.eval ".libPaths('#{PACKAGE_DIR}')"
20
+ @r_packages = @r.pull "installed.packages()[,1]"
21
+ ["sampling","gam","vegan"].each{|l| install_package(l)} #"caret", "smacof", "TunePareto"
22
+ @r.eval "source('#{File.join(Gem.loaded_specs['opentox-ruby'].full_gem_path,'lib/stratification.R')}')"
23
+ end
24
+
25
+ def quit_r
26
+ begin
27
+ @r.quit
28
+ @r = nil
29
+ rescue
30
+ end
31
+ end
32
+
33
+ def r
34
+ @r
35
+ end
36
+
37
+ def package_installed?( package )
38
+ @r_packages.include?(package)
39
+ end
40
+
41
+ def install_package( package )
42
+ unless package_installed?(package)
43
+ LOGGER.debug "r-util> installing r-package #{package} to #{PACKAGE_DIR}"
44
+ @r.eval "install.packages('#{package}', repos='http://cran.r-project.org', lib='#{PACKAGE_DIR}')"
45
+ end
46
+ end
47
+
48
+ # <0 -> array1 << array2
49
+ # 0 -> no significant difference
50
+ # >0 -> array1 >> array2
51
+ def paired_ttest(array1, array2, significance_level=0.95)
52
+ @r.assign "v1",array1
53
+ @r.assign "v2",array2
54
+ @r.eval "ttest = t.test(as.numeric(v1),as.numeric(v2),paired=T)"
55
+ t = @r.pull "ttest$statistic"
56
+ p = @r.pull "ttest$p.value"
57
+ if (1-significance_level > p)
58
+ t
59
+ else
60
+ 0
61
+ end
62
+ end
63
+
64
+ # example:
65
+ # files = ["/tmp/box.svg","/tmp/box.png"]
66
+ # data = [ [ :method, [4,4,5,5,4,3,2] ], [ :method2, [1,2,3,4,5,4,6] ], [ :asdf, [9,1,8,0,7,1,6] ] ]
67
+ # boxplot(files, data, "comparison1" )
68
+ #
69
+ def boxplot(files, data, title="")
70
+ LOGGER.debug("r-util> create boxplot")
71
+ assign_dataframe("boxdata",data.collect{|e| e[1]}.transpose,nil,data.collect{|e| e[0].to_s})
72
+ plot_to_files(files) do |file|
73
+ @r.eval "boxplot(boxdata,main='#{title}',col=rep(2:#{data.size+1}))"
74
+ end
75
+ end
76
+
77
+ # embedds feature values of two datasets into 2D and plots it
78
+ # fast_plot = true -> PCA, fast_plot = false -> SMACOF (iterative optimisation method)
79
+ #
80
+ def feature_value_plot(files, dataset_uri1, dataset_uri2, dataset_name1, dataset_name2,
81
+ features=nil, fast_plot=true, subjectid=nil, waiting_task=nil)
82
+
83
+ raise "r-package smacof missing" if fast_plot==false and !package_installed?("smacof")
84
+ LOGGER.debug("r-util> create feature value plot")
85
+ d1 = OpenTox::Dataset.find(dataset_uri1,subjectid)
86
+ d2 = OpenTox::Dataset.find(dataset_uri2,subjectid)
87
+ if features
88
+ [d1, d2].each{|d| features.each{|f| raise "feature not included" unless d.features.keys.include?(f)}}
89
+ else
90
+ raise "different\n#{d1.features.keys.sort.to_yaml}\n#{d2.features.keys.sort.to_yaml}" if
91
+ (d1.features.keys.sort != d2.features.keys.sort)
92
+ features = d1.features.keys
93
+ end
94
+ raise "at least two features needed" if d1.features.keys.size<2
95
+ waiting_task.progress(25) if waiting_task
96
+
97
+ df1 = dataset_to_dataframe(d1,0,subjectid,features)
98
+ df2 = dataset_to_dataframe(d2,0,subjectid,features)
99
+ waiting_task.progress(50) if waiting_task
100
+
101
+ @r.eval "df <- rbind(#{df1},#{df2})"
102
+ @r.eval "split <- c(rep(0,nrow(#{df1})),rep(1,nrow(#{df2})))"
103
+ @r.names = [dataset_name1, dataset_name2]
104
+ LOGGER.debug("r-util> - convert data to 2d")
105
+ @r.eval "df.2d <- plot_pre_process(df, method='#{(fast_plot ? "pca" : "smacof")}')"
106
+ waiting_task.progress(75) if waiting_task
107
+
108
+ if fast_plot
109
+ info = "main='PCA-Embedding of #{features.size} features',xlab='PC1',ylab='PC2'"
110
+ else
111
+ info = "main='SMACOF-Embedding of #{features.size} features',xlab='x',ylab='y'"
112
+ end
113
+ LOGGER.debug("r-util> - plot data")
114
+ plot_to_files(files) do |file|
115
+ @r.eval "plot_split( df.2d, split, names, #{info})"
116
+ end
117
+ end
118
+
119
+ # plots a double histogram
120
+ # data1 and data2 are arrays with values, either numerical or categorial (string values)
121
+ # is_numerical, boolean flag indicating value types
122
+ # log (only for numerical), plot logarithm of values
123
+ def double_hist_plot(files, data1, data2, is_numerical, log=false, name1="first", name2="second", title="title", xaxis="x-values")
124
+ LOGGER.debug("r-util> create double hist plot")
125
+ all = data1 + data2
126
+ if (is_numerical)
127
+ @r.eval "double_plot <- function(data1, data2, log=FALSE, names=c('data1','data2'), title='title', xlab='x-values')
128
+ {
129
+ if (log)
130
+ {
131
+ data1 <- log(data1)
132
+ data2 <- log(data2)
133
+ xlab = paste('logarithm of',xlab,sep=' ')
134
+ }
135
+ xlims <- round(c(min(c(min(data1),min(data2))),max(c(max(data1),max(data2)))))
136
+ h <- hist(rbind(data1,data2),plot=F)
137
+ h1 <- hist(data1,plot=F,breaks=h$breaks)
138
+ h2 <- hist(data2,plot=F,breaks=h$breaks)
139
+ xlims = c(min(h$breaks),max(h$breaks))
140
+ ylims = c(0,max(h1$counts,h2$counts))
141
+ xaxps = c(min(h$breaks),max(h$breaks),(length(h$breaks)-1))
142
+ plot(h1, col=rgb(1,0,0,2/4), xlim=xlims, xaxp=xaxps, ylim=ylims,
143
+ main=title, xlab=xlab, ylab='counts' )
144
+ plot(h2, col=rgb(0,1,0,2/4), add=T )
145
+ legend('topleft',names,lty=c(1,1),col=c('red','green'))
146
+ }"
147
+ @r.assign("data1",data1)
148
+ @r.assign("data2",data2)
149
+ @r.legend = [name1, name2]
150
+ else
151
+ raise "log not valid for categorial" if log
152
+ vals = all.uniq.sort!
153
+ counts1 = vals.collect{|e| data1.count(e)}
154
+ counts2 = vals.collect{|e| data2.count(e)}
155
+ @r.data1 = counts1
156
+ @r.data2 = counts2
157
+ @r.value_names = [name1, name2]
158
+ @r.legend = vals
159
+ @r.eval("data <- cbind(data1,data2)")
160
+ end
161
+
162
+ plot_to_files(files) do |file|
163
+ if (is_numerical)
164
+ @r.eval "double_plot(data1,data2,log=#{log ? "T":"F"},names=legend,title='#{title}',xlab='#{xaxis}')"
165
+ else
166
+ @r.eval("bp <- barplot(data, beside=T, names.arg=value_names,
167
+ main='#{title}', col=sort(rep(2:3,length(legend))))") #legend.text=c(legend),
168
+ @r.eval "text(bp, 0, round(data, 1),cex=1,pos=3)"
169
+ end
170
+ end
171
+ end
172
+
173
+ # stratified split of a dataset into two datasets, based on the feature values
174
+ # all features are taken into account unless <split_features> is given
175
+ def stratified_split( dataset, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil )
176
+ raise "not a loaded ot-dataset" unless dataset.is_a?(OpenTox::Dataset) and dataset.compounds.size>0 and dataset.features.size>0
177
+ LOGGER.debug("r-util> apply stratified split to #{dataset.uri}")
178
+
179
+ df = dataset_to_dataframe( dataset, missing_values, subjectid, split_features )
180
+ @r.eval "set.seed(#{seed})"
181
+ @r.eval "split <- stratified_split(#{df}, ratio=#{pct})"
182
+ split = @r.pull 'split'
183
+ split = split.collect{|s| 1-s.to_i} # reverse 1s and 0s, as 1 means selected, but 0 will be first set
184
+ split_to_datasets( df, split, subjectid )
185
+ end
186
+
187
+ # dataset should be loaded completely (use Dataset.find)
188
+ # takes duplicates into account
189
+ # replaces missing values with param <missing_value>
190
+ # returns dataframe-variable-name in R
191
+ def dataset_to_dataframe( dataset, missing_value="NA", subjectid=nil, features=nil )
192
+ LOGGER.debug "r-util> convert dataset to dataframe #{dataset.uri}"
193
+
194
+ # count duplicates
195
+ num_compounds = {}
196
+ dataset.features.keys.each do |f|
197
+ dataset.compounds.each do |c|
198
+ if dataset.data_entries[c]
199
+ val = dataset.data_entries[c][f]
200
+ size = val==nil ? 1 : val.size
201
+ num_compounds[c] = num_compounds[c]==nil ? size : [num_compounds[c],size].max
202
+ else
203
+ num_compounds[c] = 1
204
+ end
205
+ end
206
+ end
207
+
208
+ # use either all, or the provided features, sorting is important as col-index := features
209
+ if features
210
+ features.sort!
211
+ else
212
+ features = dataset.features.keys.sort
213
+ end
214
+ compounds = []
215
+ dataset.compounds.each do |c|
216
+ num_compounds[c].times do |i|
217
+ compounds << c
218
+ end
219
+ end
220
+
221
+ # values into 2D array, then to dataframe
222
+ d_values = []
223
+ dataset.compounds.each do |c|
224
+ num_compounds[c].times do |i|
225
+ c_values = []
226
+ features.each do |f|
227
+ if dataset.data_entries[c]
228
+ val = dataset.data_entries[c][f]
229
+ v = val==nil ? "" : val[i].to_s
230
+ else
231
+ raise "wtf" if i>0
232
+ v = ""
233
+ end
234
+ v = missing_value if v.size()==0
235
+ c_values << v
236
+ end
237
+ d_values << c_values
238
+ end
239
+ end
240
+ df_name = "df_#{dataset.uri.split("/")[-1].split("?")[0]}"
241
+ assign_dataframe(df_name,d_values,compounds,features)
242
+
243
+ # set dataframe column types accordingly
244
+ f_count = 1 #R starts at 1
245
+ features.each do |f|
246
+ feat = OpenTox::Feature.find(f,subjectid)
247
+ nominal = feat.metadata[RDF.type].to_a.flatten.include?(OT.NominalFeature)
248
+ if nominal
249
+ @r.eval "#{df_name}[,#{f_count}] <- as.character(#{df_name}[,#{f_count}])"
250
+ else
251
+ @r.eval "#{df_name}[,#{f_count}] <- as.numeric(#{df_name}[,#{f_count}])"
252
+ end
253
+ f_count += 1
254
+ end
255
+ #@r.eval "head(#{df_name})"
256
+
257
+ # store compounds, and features (including metainformation)
258
+ @@feats[df_name] = {}
259
+ features.each do |f|
260
+ @@feats[df_name][f] = dataset.features[f]
261
+ end
262
+ df_name
263
+ end
264
+
265
+ # converts a dataframe into a dataset (a new dataset is created at the dataset webservice)
266
+ # this is only possible if a superset of the dataframe was created by dataset_to_dataframe (metadata and URIs!)
267
+ def dataframe_to_dataset( df, subjectid=nil )
268
+ dataframe_to_dataset_indices( df, subjectid, nil)
269
+ end
270
+
271
+ private
272
+ def dataframe_to_dataset_indices( df, subjectid=nil, compound_indices=nil )
273
+ raise unless @@feats[df].size>0
274
+ values, compounds, features = pull_dataframe(df)
275
+ features.each{|f| raise unless @@feats[df][f]}
276
+ dataset = OpenTox::Dataset.create(CONFIG[:services]["opentox-dataset"],subjectid)
277
+ LOGGER.debug "r-util> convert dataframe to dataset #{dataset.uri}"
278
+ compounds.size.times{|i| dataset.add_compound(compounds[i]) if compound_indices==nil or compound_indices.include?(i)}
279
+ features.each{|f| dataset.add_feature(f,@@feats[df][f])}
280
+ features.size.times do |c|
281
+ feat = OpenTox::Feature.find(features[c],subjectid)
282
+ nominal = feat.metadata[RDF.type].to_a.flatten.include?(OT.NominalFeature)
283
+ compounds.size.times do |r|
284
+ if compound_indices==nil or compound_indices.include?(r)
285
+ dataset.add(compounds[r],features[c],nominal ? values[r][c] : values[r][c].to_f) if values[r][c]!="NA"
286
+ end
287
+ end
288
+ end
289
+ dataset.save(subjectid)
290
+ dataset
291
+ end
292
+
293
+ def split_to_datasets( df, split, subjectid=nil )
294
+ sets = []
295
+ (split.min.to_i .. split.max.to_i).each do |i|
296
+ indices = []
297
+ split.size.times{|j| indices<<j if split[j]==i}
298
+ dataset = dataframe_to_dataset_indices( df, subjectid, indices )
299
+ LOGGER.debug("r-util> split into #{dataset.uri}, c:#{dataset.compounds.size}, f:#{dataset.features.size}")
300
+ sets << dataset
301
+ end
302
+ sets
303
+ end
304
+
305
+ def pull_dataframe(df)
306
+ tmp = File.join(Dir.tmpdir,Time.new.to_f.to_s+"_"+rand(10000).to_s+".csv")
307
+ @r.eval "write.table(#{df},file='#{tmp}',sep='#')"
308
+ res = []; compounds = []; features = []
309
+ first = true
310
+ file = File.new(tmp, 'r')
311
+ file.each_line("\n") do |row|
312
+ if first
313
+ features = row.chomp.split("#").collect{|e| e.gsub("\"","")}
314
+ first = false
315
+ else
316
+ vals = row.chomp.split("#").collect{|e| e.gsub("\"","")}
317
+ compounds << vals[0]
318
+ res << vals[1..-1]
319
+ end
320
+ end
321
+ begin File.delete(tmp); rescue; end
322
+ return res, compounds, features
323
+ end
324
+
325
+ def assign_dataframe(df,input,rownames,colnames)
326
+ tmp = File.join(Dir.tmpdir,Time.new.to_f.to_s+"_"+rand(10000).to_s+".csv")
327
+ file = File.new(tmp, 'w')
328
+ input.each{|i| file.puts(i.collect{|e| "\"#{e}\""}.join("#")+"\n")}
329
+ file.flush
330
+ @r.rownames = rownames if rownames
331
+ @r.colnames = colnames
332
+ @r.eval "#{df} <- read.table(file='#{tmp}',sep='#',"+
333
+ "#{rownames ? "row.names=rownames" : ""},col.names=colnames,check.names=F)"
334
+ begin File.delete(tmp); rescue; end
335
+ end
336
+
337
+ def plot_to_files(files)
338
+ files.each do |file|
339
+ if file=~/(?i)\.svg/
340
+ @r.eval("svg('#{file}',10,8)")
341
+ elsif file=~/(?i)\.png/
342
+ @r.eval("png('#{file}')")
343
+ else
344
+ raise "invalid format: "+file.to_s
345
+ end
346
+ yield file
347
+ LOGGER.debug "r-util> plotted to #{file}"
348
+ @r.eval("dev.off()")
349
+ end
350
+ end
351
+ end
352
+ end
353
+
354
+