opentox-ruby 3.0.1 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/parser.rb CHANGED
@@ -57,7 +57,7 @@ module OpenTox
57
57
  `rapper -i rdfxml -o ntriples #{file.path} 2>/dev/null`.each_line do |line|
58
58
  triple = line.to_triple
59
59
  if triple[0] == @uri
60
- if triple[1] == RDF.type || triple[1]==OT.predictedVariables # allow multiple types
60
+ if triple[1] == RDF.type || triple[1]==OT.predictedVariables || triple[1]==OT.independentVariables # allow multiple types
61
61
  @metadata[triple[1]] = [] unless @metadata[triple[1]]
62
62
  @metadata[triple[1]] << triple[2].split('^^').first
63
63
  else
@@ -290,10 +290,11 @@ module OpenTox
290
290
  @features = []
291
291
  @feature_types = {}
292
292
 
293
- @format_errors = ""
294
- @smiles_errors = []
293
+ @format_errors = []
294
+ @id_errors = []
295
295
  @activity_errors = []
296
296
  @duplicates = {}
297
+ @max_class_values = 3
297
298
  end
298
299
 
299
300
  def detect_new_values(row, value_maps)
@@ -309,9 +310,10 @@ module OpenTox
309
310
  # Load Spreadsheet book (created with roo gem http://roo.rubyforge.org/, excel format specification: http://toxcreate.org/help)
310
311
  # @param [Excel] book Excel workbook object (created with roo gem)
311
312
  # @return [OpenTox::Dataset] Dataset object with Excel data
312
- def load_spreadsheet(book)
313
+ def load_spreadsheet(book, drop_missing=false)
313
314
  book.default_sheet = 0
314
- add_features book.row(1)
315
+ headers = book.row(1)
316
+ add_features headers
315
317
  value_maps = Array.new
316
318
  regression_features=Array.new
317
319
 
@@ -319,15 +321,27 @@ module OpenTox
319
321
  row = book.row(i)
320
322
  value_maps = detect_new_values(row, value_maps)
321
323
  value_maps.each_with_index { |vm,j|
322
- if vm.size > 5 # 5 is the maximum nr of classes supported by Fminer.
324
+ if vm.size > @max_class_values # 5 is the maximum nr of classes supported by Fminer.
323
325
  regression_features[j]=true
324
326
  else
325
327
  regression_features[j]=false
326
328
  end
327
329
  }
328
330
  }
331
+
329
332
  2.upto(book.last_row) { |i|
330
- add_values book.row(i), regression_features
333
+ drop=false
334
+ row = book.row(i)
335
+ raise "Entry has size #{row.size}, different from headers (#{headers.size})" if row.size != headers.size
336
+ if row.include?("")
337
+ @format_errors << "Row #{i} has #{row.count("")} missing values"
338
+ drop=true
339
+ drop_missing=true if (row.count("") == row.size-1)
340
+ end
341
+ add_values(row, regression_features) unless (drop_missing && drop)
342
+ if (drop_missing && drop)
343
+ @format_errors << "Row #{i} not added"
344
+ end
331
345
  }
332
346
  warnings
333
347
  @dataset
@@ -336,10 +350,11 @@ module OpenTox
336
350
  # Load CSV string (format specification: http://toxcreate.org/help)
337
351
  # @param [String] csv CSV representation of the dataset
338
352
  # @return [OpenTox::Dataset] Dataset object with CSV data
339
- def load_csv(csv)
353
+ def load_csv(csv, drop_missing=false)
340
354
  row = 0
341
355
  input = csv.split("\n")
342
- add_features split_row(input.shift)
356
+ headers = split_row(input.shift)
357
+ add_features(headers)
343
358
  value_maps = Array.new
344
359
  regression_features=Array.new
345
360
 
@@ -347,15 +362,27 @@ module OpenTox
347
362
  row = split_row(row)
348
363
  value_maps = detect_new_values(row, value_maps)
349
364
  value_maps.each_with_index { |vm,j|
350
- if vm.size > 5 # 5 is the maximum nr of classes supported by Fminer.
365
+ if vm.size > @max_class_values # max @max_class_values classes.
351
366
  regression_features[j]=true
352
367
  else
353
368
  regression_features[j]=false
354
369
  end
355
370
  }
356
371
  }
357
- input.each { |row|
358
- add_values split_row(row), regression_features
372
+
373
+ input.each_with_index { |row, i|
374
+ drop=false
375
+ row = split_row(row)
376
+ raise "Entry has size #{row.size}, different from headers (#{headers.size})" if row.size != headers.size
377
+ if row.include?("")
378
+ @format_errors << "Row #{i} has #{row.count("")} missing values"
379
+ drop=true
380
+ drop_missing=true if (row.count("") == row.size-1)
381
+ end
382
+ add_values(row, regression_features) unless (drop_missing && drop)
383
+ if (drop_missing && drop)
384
+ @format_errors << "Row #{i} not added"
385
+ end
359
386
  }
360
387
  warnings
361
388
  @dataset
@@ -367,88 +394,115 @@ module OpenTox
367
394
 
368
395
  info = ''
369
396
  @feature_types.each do |feature,types|
370
- if types.uniq.size > 1
397
+ if types.uniq.size == 0
398
+ type = "helper#MissingFeature"
399
+ elsif types.uniq.size > 1
371
400
  type = OT.NumericFeature
372
401
  else
373
402
  type = types.first
374
403
  end
375
404
  @dataset.add_feature_metadata(feature,{RDF.type => [type]})
376
- info += "\"#{@dataset.feature_name(feature)}\" detected as #{type.split('#').last}."
405
+ info += "\"#{@dataset.feature_name(feature)}\" detected as #{type.split('#').last}." if type
377
406
 
378
407
  # TODO: rewrite feature values
379
- # TODO if value.to_f == 0 @activity_errors << "#{smiles} Zero values not allowed for regression datasets - entry ignored."
408
+ # TODO if value.to_f == 0 @activity_errors << "#{id} Zero values not allowed for regression datasets - entry ignored."
380
409
  end
381
410
 
382
411
  @dataset.metadata[OT.Info] = info
383
412
 
384
413
  warnings = ''
385
- warnings += "<p>Incorrect Smiles structures (ignored):</p>" + @smiles_errors.join("<br/>") unless @smiles_errors.empty?
414
+ warnings += "<p>Incorrect structures (ignored):</p>" + @id_errors.join("<br/>") unless @id_errors.empty?
386
415
  warnings += "<p>Irregular activities (ignored):</p>" + @activity_errors.join("<br/>") unless @activity_errors.empty?
416
+ warnings += "<p>Format errors:</p>" + @format_errors.join("<br/>") unless @format_errors.empty?
387
417
  duplicate_warnings = ''
388
418
  @duplicates.each {|inchi,lines| duplicate_warnings << "<p>#{lines.join('<br/>')}</p>" if lines.size > 1 }
389
- warnings += "<p>Duplicated structures (all structures/activities used for model building, please make sure, that the results were obtained from <em>independent</em> experiments):</p>" + duplicate_warnings unless duplicate_warnings.empty?
419
+ warnings += "<p>Duplicate structures (all structures/activities used for model building, please make sure that the results were obtained from <em>independent</em> experiments):</p>" + duplicate_warnings unless duplicate_warnings.empty?
390
420
 
391
421
  @dataset.metadata[OT.Warnings] = warnings
392
422
 
393
423
  end
394
424
 
425
+ # Adds a row of features to a dataset
426
+ # @param Array A row split up as an array
427
+ # @return Array Indices for duplicate features
395
428
  def add_features(row)
396
- row.shift # get rid of smiles entry
397
- row.each do |feature_name|
429
+ row=row.collect
430
+ row.shift # get rid of id entry
431
+ @duplicate_feature_indices = [] # starts with 0 at first f after id
432
+ row.each_with_index do |feature_name, idx|
398
433
  feature_uri = File.join(@dataset.uri,"feature",URI.encode(feature_name))
399
- @feature_types[feature_uri] = []
400
- @features << feature_uri
401
- @dataset.add_feature(feature_uri,{DC.title => feature_name})
434
+ unless @features.include? feature_uri
435
+ @feature_types[feature_uri] = []
436
+ @features << feature_uri
437
+ @dataset.add_feature(feature_uri,{DC.title => feature_name})
438
+ else
439
+ @duplicate_feature_indices << idx
440
+ @format_errors << "Duplicate Feature '#{feature_name}' at pos #{idx}"
441
+ end
402
442
  end
403
443
  end
404
444
 
405
445
  # Adds a row to a dataset
406
446
  # @param Array A row split up as an array
407
447
  # @param Array Indicator for regression for each field
448
+ # @param Array Indices for duplicate features
408
449
  def add_values(row, regression_features)
409
450
 
410
- smiles = row.shift
411
- compound = Compound.from_smiles(smiles)
451
+ id = row.shift
452
+ case id
453
+ when /InChI/
454
+ compound = Compound.from_inchi(URI.decode_www_form_component(id))
455
+ else
456
+ compound = Compound.from_smiles(id)
457
+ end
458
+
412
459
  if compound.nil? or compound.inchi.nil? or compound.inchi == ""
413
- @smiles_errors << smiles+", "+row.join(", ")
460
+ @id_errors << id+", "+row.join(", ")
414
461
  return false
415
462
  end
416
463
  @duplicates[compound.inchi] = [] unless @duplicates[compound.inchi]
417
- @duplicates[compound.inchi] << smiles+", "+row.join(", ")
464
+ @duplicates[compound.inchi] << id+", "+row.join(", ")
418
465
 
466
+ feature_idx = 0
419
467
  row.each_index do |i|
420
- value = row[i]
421
- feature = @features[i]
422
468
 
423
- type = nil
424
- if (regression_features[i])
425
- type = feature_type(value)
426
- if type != OT.NumericFeature
427
- raise "Error! Expected numeric values."
469
+ unless @duplicate_feature_indices.include? i
470
+
471
+ value = row[i]
472
+ #LOGGER.warn "Missing values for #{id}" if value.size == 0 # String is empty
473
+ feature = @features[feature_idx]
474
+
475
+ type = feature_type(value) # May be NIL
476
+ type = OT.NominalFeature unless (type.nil? || regression_features[i])
477
+ @feature_types[feature] << type if type
478
+
479
+ val = nil
480
+ case type
481
+ when OT.NumericFeature
482
+ val = value.to_f
483
+ when OT.NominalFeature
484
+ val = value.to_s
428
485
  end
429
- else
430
- type = OT.NominalFeature
431
- end
432
- @feature_types[feature] << type
433
486
 
434
- case type
435
- when OT.NumericFeature
436
- val = value.to_f
437
- when OT.NominalFeature
438
- val = value.to_s
439
- end
440
- if val!=nil
441
- @dataset.add(compound.uri, feature, val)
442
- if type!=OT.NumericFeature
443
- @dataset.features[feature][OT.acceptValue] = [] unless @dataset.features[feature][OT.acceptValue]
444
- @dataset.features[feature][OT.acceptValue] << val.to_s unless @dataset.features[feature][OT.acceptValue].include?(val.to_s)
487
+ feature_idx += 1
488
+
489
+ if val != nil
490
+ @dataset.add(compound.uri, feature, val)
491
+ if type != OT.NumericFeature
492
+ @dataset.features[feature][OT.acceptValue] = [] unless @dataset.features[feature][OT.acceptValue]
493
+ @dataset.features[feature][OT.acceptValue] << val.to_s unless @dataset.features[feature][OT.acceptValue].include?(val.to_s)
494
+ end
445
495
  end
496
+
446
497
  end
498
+
447
499
  end
448
500
  end
449
501
 
450
502
  def feature_type(value)
451
- if OpenTox::Algorithm::numeric? value
503
+ if value == ""
504
+ return nil
505
+ elsif OpenTox::Algorithm::numeric? value
452
506
  return OT.NumericFeature
453
507
  else
454
508
  return OT.NominalFeature
@@ -456,7 +510,7 @@ module OpenTox
456
510
  end
457
511
 
458
512
  def split_row(row)
459
- row.chomp.gsub(/["']/,'').split(/\s*[,;\t]\s*/) # remove quotes
513
+ row.chomp.gsub(/["']/,'').split(/\s*[,;\t]\s*/,-1) # -1: do not skip empty cells
460
514
  end
461
515
 
462
516
  end
@@ -468,6 +522,7 @@ module OpenTox
468
522
  def initialize
469
523
  @data = {}
470
524
  @activity_errors = []
525
+ @max_class_values = 3
471
526
  end
472
527
 
473
528
  def feature_values(feature)
@@ -485,14 +540,14 @@ module OpenTox
485
540
  def clean_features
486
541
  ignored_features = []
487
542
  features.each do |feature|
488
- if feature_values(feature).size > 5
543
+ if feature_values(feature).size > @max_class_values
489
544
  if feature_types(feature).size == 1 and feature_types(feature).first == OT.NumericFeature
490
545
  # REGRESSION
491
546
  elsif feature_types(feature).include? OT.NumericFeature
492
547
  @data.each{|c,row| row[feature] = nil unless OpenTox::Algorithm::numeric?(row[feature]) } # delete nominal features
493
548
  @activity_errors << "Nominal feature values of #{feature} ignored (using numeric features for regression models)."
494
549
  else
495
- @activity_errors << "Feature #{feature} ignored (more than 5 nominal feature values and no numeric values)."
550
+ @activity_errors << "Feature #{feature} ignored (more than #{@max_class_values} nominal feature values and no numeric values)."
496
551
  ignored_features << feature
497
552
  next
498
553
  end
@@ -543,12 +598,15 @@ module OpenTox
543
598
  private
544
599
 
545
600
  def feature_type(value)
546
- if OpenTox::Algorithm::numeric? value
601
+ if value.nil?
602
+ return nil
603
+ elsif OpenTox::Algorithm::numeric? value
547
604
  return OT.NumericFeature
548
605
  else
549
606
  return OT.NominalFeature
550
607
  end
551
608
  end
609
+
552
610
  end
553
611
 
554
612
  # quick hack to enable sdf import via csv
@@ -589,20 +647,20 @@ module OpenTox
589
647
  @duplicates[inchi] << rec #inchi#+", "+row.join(", ")
590
648
  compound = Compound.from_inchi inchi
591
649
  rescue
592
- @compound_errors << "Could not convert structure to InChI, all entries for this compound (record #{rec} have been ignored! \n#{s}"
650
+ @compound_errors << "Could not convert structure to InChI, all entries for this compound (record #{rec}) have been ignored! \n#{s}"
593
651
  next
594
652
  end
595
653
  row = {}
596
654
  obmol.get_data.each { |d| row[d.get_attribute] = d.get_value if properties.include?(d.get_attribute) }
597
655
  table.data[compound.uri] = row
598
656
  end
599
-
600
- # finda and remove ignored_features
657
+
658
+ # find and remove ignored_features
601
659
  @activity_errors = table.clean_features
602
660
  table.add_to_dataset @dataset
603
661
 
604
662
  warnings = ''
605
- warnings += "<p>Incorrect Smiles structures (ignored):</p>" + @compound_errors.join("<br/>") unless @compound_errors.empty?
663
+ warnings += "<p>Incorrect structures (ignored):</p>" + @compound_errors.join("<br/>") unless @compound_errors.empty?
606
664
  warnings += "<p>Irregular activities (ignored):</p>" + @activity_errors.join("<br/>") unless @activity_errors.empty?
607
665
  duplicate_warnings = ''
608
666
  @duplicates.each {|inchi,lines| duplicate_warnings << "<p>#{lines.join('<br/>')}</p>" if lines.size > 1 }
data/lib/r-util.rb ADDED
@@ -0,0 +1,354 @@
1
+ # pending: package dir hack ---------
2
+ # CONFIG[:base_dir] = "/home/<user>/opentox-ruby/www"
3
+ # PACKAGE_DIR = "/home/<user>/opentox-ruby/r-packages"
4
+ package_dir = CONFIG[:base_dir].split("/")
5
+ package_dir[-1] = "r-packages"
6
+ package_dir = package_dir.join("/")
7
+ PACKAGE_DIR = package_dir
8
+
9
+ require "tempfile"
10
+
11
+ module OpenTox
12
+
13
+ class RUtil
14
+
15
+ @@feats = {}
16
+
17
+ def initialize
18
+ @r = RinRuby.new(true,false) unless defined?(@r) and @r
19
+ @r.eval ".libPaths('#{PACKAGE_DIR}')"
20
+ @r_packages = @r.pull "installed.packages()[,1]"
21
+ ["sampling","gam","vegan"].each{|l| install_package(l)} #"caret", "smacof", "TunePareto"
22
+ @r.eval "source('#{File.join(Gem.loaded_specs['opentox-ruby'].full_gem_path,'lib/stratification.R')}')"
23
+ end
24
+
25
+ def quit_r
26
+ begin
27
+ @r.quit
28
+ @r = nil
29
+ rescue
30
+ end
31
+ end
32
+
33
+ def r
34
+ @r
35
+ end
36
+
37
+ def package_installed?( package )
38
+ @r_packages.include?(package)
39
+ end
40
+
41
+ def install_package( package )
42
+ unless package_installed?(package)
43
+ LOGGER.debug "r-util> installing r-package #{package} to #{PACKAGE_DIR}"
44
+ @r.eval "install.packages('#{package}', repos='http://cran.r-project.org', lib='#{PACKAGE_DIR}')"
45
+ end
46
+ end
47
+
48
+ # <0 -> array1 << array2
49
+ # 0 -> no significant difference
50
+ # >0 -> array2 >> array1
51
+ def paired_ttest(array1, array2, significance_level=0.95)
52
+ @r.assign "v1",array1
53
+ @r.assign "v2",array2
54
+ @r.eval "ttest = t.test(as.numeric(v1),as.numeric(v2),paired=T)"
55
+ t = @r.pull "ttest$statistic"
56
+ p = @r.pull "ttest$p.value"
57
+ if (1-significance_level > p)
58
+ t
59
+ else
60
+ 0
61
+ end
62
+ end
63
+
64
+ # example:
65
+ # files = ["/tmp/box.svg","/tmp/box.png"]
66
+ # data = [ [ :method, [4,4,5,5,4,3,2] ], [ :method2, [1,2,3,4,5,4,6] ], [ :asdf, [9,1,8,0,7,1,6] ] ]
67
+ # boxplot(files, data, "comparison1" )
68
+ #
69
+ def boxplot(files, data, title="")
70
+ LOGGER.debug("r-util> create boxplot")
71
+ assign_dataframe("boxdata",data.collect{|e| e[1]}.transpose,nil,data.collect{|e| e[0].to_s})
72
+ plot_to_files(files) do |file|
73
+ @r.eval "boxplot(boxdata,main='#{title}',col=rep(2:#{data.size+1}))"
74
+ end
75
+ end
76
+
77
+ # embeds feature values of two datasets into 2D and plots them
78
+ # fast_plot = true -> PCA, fast_plot = false -> SMACOF (iterative optimisation method)
79
+ #
80
+ def feature_value_plot(files, dataset_uri1, dataset_uri2, dataset_name1, dataset_name2,
81
+ features=nil, fast_plot=true, subjectid=nil, waiting_task=nil)
82
+
83
+ raise "r-package smacof missing" if fast_plot==false and !package_installed?("smacof")
84
+ LOGGER.debug("r-util> create feature value plot")
85
+ d1 = OpenTox::Dataset.find(dataset_uri1,subjectid)
86
+ d2 = OpenTox::Dataset.find(dataset_uri2,subjectid)
87
+ if features
88
+ [d1, d2].each{|d| features.each{|f| raise "feature not included" unless d.features.keys.include?(f)}}
89
+ else
90
+ raise "different\n#{d1.features.keys.sort.to_yaml}\n#{d2.features.keys.sort.to_yaml}" if
91
+ (d1.features.keys.sort != d2.features.keys.sort)
92
+ features = d1.features.keys
93
+ end
94
+ raise "at least two features needed" if d1.features.keys.size<2
95
+ waiting_task.progress(25) if waiting_task
96
+
97
+ df1 = dataset_to_dataframe(d1,0,subjectid,features)
98
+ df2 = dataset_to_dataframe(d2,0,subjectid,features)
99
+ waiting_task.progress(50) if waiting_task
100
+
101
+ @r.eval "df <- rbind(#{df1},#{df2})"
102
+ @r.eval "split <- c(rep(0,nrow(#{df1})),rep(1,nrow(#{df2})))"
103
+ @r.names = [dataset_name1, dataset_name2]
104
+ LOGGER.debug("r-util> - convert data to 2d")
105
+ @r.eval "df.2d <- plot_pre_process(df, method='#{(fast_plot ? "pca" : "smacof")}')"
106
+ waiting_task.progress(75) if waiting_task
107
+
108
+ if fast_plot
109
+ info = "main='PCA-Embedding of #{features.size} features',xlab='PC1',ylab='PC2'"
110
+ else
111
+ info = "main='SMACOF-Embedding of #{features.size} features',xlab='x',ylab='y'"
112
+ end
113
+ LOGGER.debug("r-util> - plot data")
114
+ plot_to_files(files) do |file|
115
+ @r.eval "plot_split( df.2d, split, names, #{info})"
116
+ end
117
+ end
118
+
119
+ # plots a double histogram
120
+ # data1 and data2 are arrays with values, either numerical or categorical (string values)
121
+ # is_numerical, boolean flag indicating value types
122
+ # log (only for numerical), plot logarithm of values
123
+ def double_hist_plot(files, data1, data2, is_numerical, log=false, name1="first", name2="second", title="title", xaxis="x-values")
124
+ LOGGER.debug("r-util> create double hist plot")
125
+ all = data1 + data2
126
+ if (is_numerical)
127
+ @r.eval "double_plot <- function(data1, data2, log=FALSE, names=c('data1','data2'), title='title', xlab='x-values')
128
+ {
129
+ if (log)
130
+ {
131
+ data1 <- log(data1)
132
+ data2 <- log(data2)
133
+ xlab = paste('logarithm of',xlab,sep=' ')
134
+ }
135
+ xlims <- round(c(min(c(min(data1),min(data2))),max(c(max(data1),max(data2)))))
136
+ h <- hist(rbind(data1,data2),plot=F)
137
+ h1 <- hist(data1,plot=F,breaks=h$breaks)
138
+ h2 <- hist(data2,plot=F,breaks=h$breaks)
139
+ xlims = c(min(h$breaks),max(h$breaks))
140
+ ylims = c(0,max(h1$counts,h2$counts))
141
+ xaxps = c(min(h$breaks),max(h$breaks),(length(h$breaks)-1))
142
+ plot(h1, col=rgb(1,0,0,2/4), xlim=xlims, xaxp=xaxps, ylim=ylims,
143
+ main=title, xlab=xlab, ylab='counts' )
144
+ plot(h2, col=rgb(0,1,0,2/4), add=T )
145
+ legend('topleft',names,lty=c(1,1),col=c('red','green'))
146
+ }"
147
+ @r.assign("data1",data1)
148
+ @r.assign("data2",data2)
149
+ @r.legend = [name1, name2]
150
+ else
151
+ raise "log not valid for categorial" if log
152
+ vals = all.uniq.sort!
153
+ counts1 = vals.collect{|e| data1.count(e)}
154
+ counts2 = vals.collect{|e| data2.count(e)}
155
+ @r.data1 = counts1
156
+ @r.data2 = counts2
157
+ @r.value_names = [name1, name2]
158
+ @r.legend = vals
159
+ @r.eval("data <- cbind(data1,data2)")
160
+ end
161
+
162
+ plot_to_files(files) do |file|
163
+ if (is_numerical)
164
+ @r.eval "double_plot(data1,data2,log=#{log ? "T":"F"},names=legend,title='#{title}',xlab='#{xaxis}')"
165
+ else
166
+ @r.eval("bp <- barplot(data, beside=T, names.arg=value_names,
167
+ main='#{title}', col=sort(rep(2:3,length(legend))))") #legend.text=c(legend),
168
+ @r.eval "text(bp, 0, round(data, 1),cex=1,pos=3)"
169
+ end
170
+ end
171
+ end
172
+
173
+ # splits a dataset into two datasets, stratified by the feature values
174
+ # all features are taken into account unless <split_features> is given
175
+ def stratified_split( dataset, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil )
176
+ raise "not a loaded ot-dataset" unless dataset.is_a?(OpenTox::Dataset) and dataset.compounds.size>0 and dataset.features.size>0
177
+ LOGGER.debug("r-util> apply stratified split to #{dataset.uri}")
178
+
179
+ df = dataset_to_dataframe( dataset, missing_values, subjectid, split_features )
180
+ @r.eval "set.seed(#{seed})"
181
+ @r.eval "split <- stratified_split(#{df}, ratio=#{pct})"
182
+ split = @r.pull 'split'
183
+ split = split.collect{|s| 1-s.to_i} # reverse 1s and 0s, as 1 means selected, but 0 will be first set
184
+ split_to_datasets( df, split, subjectid )
185
+ end
186
+
187
+ # dataset should be loaded completely (use Dataset.find)
188
+ # takes duplicates into account
189
+ # replaces missing values with param <missing_value>
190
+ # returns dataframe-variable-name in R
191
+ def dataset_to_dataframe( dataset, missing_value="NA", subjectid=nil, features=nil )
192
+ LOGGER.debug "r-util> convert dataset to dataframe #{dataset.uri}"
193
+
194
+ # count duplicates
195
+ num_compounds = {}
196
+ dataset.features.keys.each do |f|
197
+ dataset.compounds.each do |c|
198
+ if dataset.data_entries[c]
199
+ val = dataset.data_entries[c][f]
200
+ size = val==nil ? 1 : val.size
201
+ num_compounds[c] = num_compounds[c]==nil ? size : [num_compounds[c],size].max
202
+ else
203
+ num_compounds[c] = 1
204
+ end
205
+ end
206
+ end
207
+
208
+ # use either all, or the provided features, sorting is important as col-index := features
209
+ if features
210
+ features.sort!
211
+ else
212
+ features = dataset.features.keys.sort
213
+ end
214
+ compounds = []
215
+ dataset.compounds.each do |c|
216
+ num_compounds[c].times do |i|
217
+ compounds << c
218
+ end
219
+ end
220
+
221
+ # values into 2D array, then to dataframe
222
+ d_values = []
223
+ dataset.compounds.each do |c|
224
+ num_compounds[c].times do |i|
225
+ c_values = []
226
+ features.each do |f|
227
+ if dataset.data_entries[c]
228
+ val = dataset.data_entries[c][f]
229
+ v = val==nil ? "" : val[i].to_s
230
+ else
231
+ raise "wtf" if i>0
232
+ v = ""
233
+ end
234
+ v = missing_value if v.size()==0
235
+ c_values << v
236
+ end
237
+ d_values << c_values
238
+ end
239
+ end
240
+ df_name = "df_#{dataset.uri.split("/")[-1].split("?")[0]}"
241
+ assign_dataframe(df_name,d_values,compounds,features)
242
+
243
+ # set dataframe column types accordingly
244
+ f_count = 1 #R starts at 1
245
+ features.each do |f|
246
+ feat = OpenTox::Feature.find(f,subjectid)
247
+ nominal = feat.metadata[RDF.type].to_a.flatten.include?(OT.NominalFeature)
248
+ if nominal
249
+ @r.eval "#{df_name}[,#{f_count}] <- as.character(#{df_name}[,#{f_count}])"
250
+ else
251
+ @r.eval "#{df_name}[,#{f_count}] <- as.numeric(#{df_name}[,#{f_count}])"
252
+ end
253
+ f_count += 1
254
+ end
255
+ #@r.eval "head(#{df_name})"
256
+
257
+ # store compounds, and features (including metainformation)
258
+ @@feats[df_name] = {}
259
+ features.each do |f|
260
+ @@feats[df_name][f] = dataset.features[f]
261
+ end
262
+ df_name
263
+ end
264
+
265
+ # converts a dataframe into a dataset (a new dataset is created at the dataset webservice)
266
+ # this is only possible if a superset of the dataframe was created by dataset_to_dataframe (metadata and URIs!)
267
+ def dataframe_to_dataset( df, subjectid=nil )
268
+ dataframe_to_dataset_indices( df, subjectid, nil)
269
+ end
270
+
271
+ private
272
+ def dataframe_to_dataset_indices( df, subjectid=nil, compound_indices=nil )
273
+ raise unless @@feats[df].size>0
274
+ values, compounds, features = pull_dataframe(df)
275
+ features.each{|f| raise unless @@feats[df][f]}
276
+ dataset = OpenTox::Dataset.create(CONFIG[:services]["opentox-dataset"],subjectid)
277
+ LOGGER.debug "r-util> convert dataframe to dataset #{dataset.uri}"
278
+ compounds.size.times{|i| dataset.add_compound(compounds[i]) if compound_indices==nil or compound_indices.include?(i)}
279
+ features.each{|f| dataset.add_feature(f,@@feats[df][f])}
280
+ features.size.times do |c|
281
+ feat = OpenTox::Feature.find(features[c],subjectid)
282
+ nominal = feat.metadata[RDF.type].to_a.flatten.include?(OT.NominalFeature)
283
+ compounds.size.times do |r|
284
+ if compound_indices==nil or compound_indices.include?(r)
285
+ dataset.add(compounds[r],features[c],nominal ? values[r][c] : values[r][c].to_f) if values[r][c]!="NA"
286
+ end
287
+ end
288
+ end
289
+ dataset.save(subjectid)
290
+ dataset
291
+ end
292
+
293
+ def split_to_datasets( df, split, subjectid=nil )
294
+ sets = []
295
+ (split.min.to_i .. split.max.to_i).each do |i|
296
+ indices = []
297
+ split.size.times{|j| indices<<j if split[j]==i}
298
+ dataset = dataframe_to_dataset_indices( df, subjectid, indices )
299
+ LOGGER.debug("r-util> split into #{dataset.uri}, c:#{dataset.compounds.size}, f:#{dataset.features.size}")
300
+ sets << dataset
301
+ end
302
+ sets
303
+ end
304
+
305
+ def pull_dataframe(df)
306
+ tmp = File.join(Dir.tmpdir,Time.new.to_f.to_s+"_"+rand(10000).to_s+".csv")
307
+ @r.eval "write.table(#{df},file='#{tmp}',sep='#')"
308
+ res = []; compounds = []; features = []
309
+ first = true
310
+ file = File.new(tmp, 'r')
311
+ file.each_line("\n") do |row|
312
+ if first
313
+ features = row.chomp.split("#").collect{|e| e.gsub("\"","")}
314
+ first = false
315
+ else
316
+ vals = row.chomp.split("#").collect{|e| e.gsub("\"","")}
317
+ compounds << vals[0]
318
+ res << vals[1..-1]
319
+ end
320
+ end
321
+ begin File.delete(tmp); rescue; end
322
+ return res, compounds, features
323
+ end
324
+
325
+ def assign_dataframe(df,input,rownames,colnames)
326
+ tmp = File.join(Dir.tmpdir,Time.new.to_f.to_s+"_"+rand(10000).to_s+".csv")
327
+ file = File.new(tmp, 'w')
328
+ input.each{|i| file.puts(i.collect{|e| "\"#{e}\""}.join("#")+"\n")}
329
+ file.flush
330
+ @r.rownames = rownames if rownames
331
+ @r.colnames = colnames
332
+ @r.eval "#{df} <- read.table(file='#{tmp}',sep='#',"+
333
+ "#{rownames ? "row.names=rownames" : ""},col.names=colnames,check.names=F)"
334
+ begin File.delete(tmp); rescue; end
335
+ end
336
+
337
+ def plot_to_files(files)
338
+ files.each do |file|
339
+ if file=~/(?i)\.svg/
340
+ @r.eval("svg('#{file}',10,8)")
341
+ elsif file=~/(?i)\.png/
342
+ @r.eval("png('#{file}')")
343
+ else
344
+ raise "invalid format: "+file.to_s
345
+ end
346
+ yield file
347
+ LOGGER.debug "r-util> plotted to #{file}"
348
+ @r.eval("dev.off()")
349
+ end
350
+ end
351
+ end
352
+ end
353
+
354
+