opentox-ruby 3.0.1 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +8 -0
- data/Rakefile +2 -3
- data/VERSION +1 -1
- data/lib/algorithm.rb +227 -675
- data/lib/authorization.rb +10 -8
- data/lib/compound.rb +47 -11
- data/lib/dataset.rb +50 -2
- data/lib/environment.rb +6 -1
- data/lib/model.rb +37 -72
- data/lib/opentox-ruby.rb +1 -1
- data/lib/parser.rb +115 -57
- data/lib/r-util.rb +354 -0
- data/lib/rest_client_wrapper.rb +1 -1
- data/lib/serializer.rb +47 -30
- data/lib/stratification.R +201 -0
- data/lib/task.rb +5 -1
- data/lib/transform.rb +520 -0
- data/lib/utils.rb +372 -0
- data/lib/validation.rb +52 -6
- metadata +413 -428
data/lib/parser.rb
CHANGED
@@ -57,7 +57,7 @@ module OpenTox
|
|
57
57
|
`rapper -i rdfxml -o ntriples #{file.path} 2>/dev/null`.each_line do |line|
|
58
58
|
triple = line.to_triple
|
59
59
|
if triple[0] == @uri
|
60
|
-
if triple[1] == RDF.type || triple[1]==OT.predictedVariables # allow multiple types
|
60
|
+
if triple[1] == RDF.type || triple[1]==OT.predictedVariables || triple[1]==OT.independentVariables # allow multiple types
|
61
61
|
@metadata[triple[1]] = [] unless @metadata[triple[1]]
|
62
62
|
@metadata[triple[1]] << triple[2].split('^^').first
|
63
63
|
else
|
@@ -290,10 +290,11 @@ module OpenTox
|
|
290
290
|
@features = []
|
291
291
|
@feature_types = {}
|
292
292
|
|
293
|
-
@format_errors =
|
294
|
-
@
|
293
|
+
@format_errors = []
|
294
|
+
@id_errors = []
|
295
295
|
@activity_errors = []
|
296
296
|
@duplicates = {}
|
297
|
+
@max_class_values = 3
|
297
298
|
end
|
298
299
|
|
299
300
|
def detect_new_values(row, value_maps)
|
@@ -309,9 +310,10 @@ module OpenTox
|
|
309
310
|
# Load Spreadsheet book (created with roo gem http://roo.rubyforge.org/, excel format specification: http://toxcreate.org/help)
|
310
311
|
# @param [Excel] book Excel workbook object (created with roo gem)
|
311
312
|
# @return [OpenTox::Dataset] Dataset object with Excel data
|
312
|
-
def load_spreadsheet(book)
|
313
|
+
def load_spreadsheet(book, drop_missing=false)
|
313
314
|
book.default_sheet = 0
|
314
|
-
|
315
|
+
headers = book.row(1)
|
316
|
+
add_features headers
|
315
317
|
value_maps = Array.new
|
316
318
|
regression_features=Array.new
|
317
319
|
|
@@ -319,15 +321,27 @@ module OpenTox
|
|
319
321
|
row = book.row(i)
|
320
322
|
value_maps = detect_new_values(row, value_maps)
|
321
323
|
value_maps.each_with_index { |vm,j|
|
322
|
-
if vm.size >
|
324
|
+
if vm.size > @max_class_values # 5 is the maximum nr of classes supported by Fminer.
|
323
325
|
regression_features[j]=true
|
324
326
|
else
|
325
327
|
regression_features[j]=false
|
326
328
|
end
|
327
329
|
}
|
328
330
|
}
|
331
|
+
|
329
332
|
2.upto(book.last_row) { |i|
|
330
|
-
|
333
|
+
drop=false
|
334
|
+
row = book.row(i)
|
335
|
+
raise "Entry has size #{row.size}, different from headers (#{headers.size})" if row.size != headers.size
|
336
|
+
if row.include?("")
|
337
|
+
@format_errors << "Row #{i} has #{row.count("")} missing values"
|
338
|
+
drop=true
|
339
|
+
drop_missing=true if (row.count("") == row.size-1)
|
340
|
+
end
|
341
|
+
add_values(row, regression_features) unless (drop_missing && drop)
|
342
|
+
if (drop_missing && drop)
|
343
|
+
@format_errors << "Row #{i} not added"
|
344
|
+
end
|
331
345
|
}
|
332
346
|
warnings
|
333
347
|
@dataset
|
@@ -336,10 +350,11 @@ module OpenTox
|
|
336
350
|
# Load CSV string (format specification: http://toxcreate.org/help)
|
337
351
|
# @param [String] csv CSV representation of the dataset
|
338
352
|
# @return [OpenTox::Dataset] Dataset object with CSV data
|
339
|
-
def load_csv(csv)
|
353
|
+
def load_csv(csv, drop_missing=false)
|
340
354
|
row = 0
|
341
355
|
input = csv.split("\n")
|
342
|
-
|
356
|
+
headers = split_row(input.shift)
|
357
|
+
add_features(headers)
|
343
358
|
value_maps = Array.new
|
344
359
|
regression_features=Array.new
|
345
360
|
|
@@ -347,15 +362,27 @@ module OpenTox
|
|
347
362
|
row = split_row(row)
|
348
363
|
value_maps = detect_new_values(row, value_maps)
|
349
364
|
value_maps.each_with_index { |vm,j|
|
350
|
-
if vm.size >
|
365
|
+
if vm.size > @max_class_values # max @max_class_values classes.
|
351
366
|
regression_features[j]=true
|
352
367
|
else
|
353
368
|
regression_features[j]=false
|
354
369
|
end
|
355
370
|
}
|
356
371
|
}
|
357
|
-
|
358
|
-
|
372
|
+
|
373
|
+
input.each_with_index { |row, i|
|
374
|
+
drop=false
|
375
|
+
row = split_row(row)
|
376
|
+
raise "Entry has size #{row.size}, different from headers (#{headers.size})" if row.size != headers.size
|
377
|
+
if row.include?("")
|
378
|
+
@format_errors << "Row #{i} has #{row.count("")} missing values"
|
379
|
+
drop=true
|
380
|
+
drop_missing=true if (row.count("") == row.size-1)
|
381
|
+
end
|
382
|
+
add_values(row, regression_features) unless (drop_missing && drop)
|
383
|
+
if (drop_missing && drop)
|
384
|
+
@format_errors << "Row #{i} not added"
|
385
|
+
end
|
359
386
|
}
|
360
387
|
warnings
|
361
388
|
@dataset
|
@@ -367,88 +394,115 @@ module OpenTox
|
|
367
394
|
|
368
395
|
info = ''
|
369
396
|
@feature_types.each do |feature,types|
|
370
|
-
if types.uniq.size
|
397
|
+
if types.uniq.size == 0
|
398
|
+
type = "helper#MissingFeature"
|
399
|
+
elsif types.uniq.size > 1
|
371
400
|
type = OT.NumericFeature
|
372
401
|
else
|
373
402
|
type = types.first
|
374
403
|
end
|
375
404
|
@dataset.add_feature_metadata(feature,{RDF.type => [type]})
|
376
|
-
info += "\"#{@dataset.feature_name(feature)}\" detected as #{type.split('#').last}."
|
405
|
+
info += "\"#{@dataset.feature_name(feature)}\" detected as #{type.split('#').last}." if type
|
377
406
|
|
378
407
|
# TODO: rewrite feature values
|
379
|
-
# TODO if value.to_f == 0 @activity_errors << "#{
|
408
|
+
# TODO if value.to_f == 0 @activity_errors << "#{id} Zero values not allowed for regression datasets - entry ignored."
|
380
409
|
end
|
381
410
|
|
382
411
|
@dataset.metadata[OT.Info] = info
|
383
412
|
|
384
413
|
warnings = ''
|
385
|
-
warnings += "<p>Incorrect
|
414
|
+
warnings += "<p>Incorrect structures (ignored):</p>" + @id_errors.join("<br/>") unless @id_errors.empty?
|
386
415
|
warnings += "<p>Irregular activities (ignored):</p>" + @activity_errors.join("<br/>") unless @activity_errors.empty?
|
416
|
+
warnings += "<p>Format errors:</p>" + @format_errors.join("<br/>") unless @format_errors.empty?
|
387
417
|
duplicate_warnings = ''
|
388
418
|
@duplicates.each {|inchi,lines| duplicate_warnings << "<p>#{lines.join('<br/>')}</p>" if lines.size > 1 }
|
389
|
-
warnings += "<p>
|
419
|
+
warnings += "<p>Duplicate structures (all structures/activities used for model building, please make sure that the results were obtained from <em>independent</em> experiments):</p>" + duplicate_warnings unless duplicate_warnings.empty?
|
390
420
|
|
391
421
|
@dataset.metadata[OT.Warnings] = warnings
|
392
422
|
|
393
423
|
end
|
394
424
|
|
425
|
+
# Adds a row of features to a dataset
|
426
|
+
# @param Array A row split up as an array
|
427
|
+
# @return Array Indices for duplicate features
|
395
428
|
# Registers the header row of a table as dataset features.
#
# The first cell is the compound id column and is skipped. Every remaining
# cell becomes a feature URI below @dataset.uri. A feature name that occurs
# again is not registered twice: its position (0-based, counted after the id
# column) is stored in @duplicate_feature_indices and a message is appended
# to @format_errors.
#
# @param [Array] row header cells, id column first (left unmodified)
# @return [Array] the feature names that were processed (header without id)
def add_features(row)
  # NOTE: `row.collect` without a block returns an Enumerator on Ruby >= 1.9,
  # which has no #shift. #drop gives a copy without the id entry instead.
  feature_names = row.drop(1)
  @duplicate_feature_indices = [] # starts with 0 at first f after id
  feature_names.each_with_index do |feature_name, idx|
    feature_uri = File.join(@dataset.uri,"feature",URI.encode(feature_name))
    if @features.include? feature_uri
      @duplicate_feature_indices << idx
      @format_errors << "Duplicate Feature '#{feature_name}' at pos #{idx}"
    else
      @feature_types[feature_uri] = []
      @features << feature_uri
      @dataset.add_feature(feature_uri,{DC.title => feature_name})
    end
  end
end
|
404
444
|
|
405
445
|
# Adds a row to a dataset
|
406
446
|
# @param Array A row split up as an array
|
407
447
|
# @param Array Indicator for regression for each field
|
448
|
+
# @param Array Indices for duplicate features
|
408
449
|
# Adds one data row to @dataset.
#
# The first cell identifies the compound (parsed as InChI when it contains
# "InChI", as SMILES otherwise) and is removed from +row+. Cells whose
# position is listed in @duplicate_feature_indices are skipped. Empty cells
# (feature_type returns nil) add no value.
#
# @param [Array] row cells of the row, id first (the id is shifted off)
# @param [Array] regression_features per-column flag, true for numeric columns
# @return [false, Array] false when the compound could not be parsed,
#   otherwise the remaining row
def add_values(row, regression_features)
  id = row.shift
  compound =
    if /InChI/ === id
      Compound.from_inchi(URI.decode_www_form_component(id))
    else
      Compound.from_smiles(id)
    end

  if compound.nil? or compound.inchi.nil? or compound.inchi == ""
    @id_errors << id+", "+row.join(", ")
    return false
  end
  @duplicates[compound.inchi] ||= []
  @duplicates[compound.inchi] << id+", "+row.join(", ")

  # @features holds no entry for duplicate columns, so it needs its own cursor
  feature_idx = 0
  row.each_with_index do |value, i|
    next if @duplicate_feature_indices.include? i

    feature = @features[feature_idx]
    feature_idx += 1

    type = feature_type(value) # may be nil
    # a non-empty cell of a column not flagged for regression is nominal
    type = OT.NominalFeature unless (type.nil? || regression_features[i])
    @feature_types[feature] << type if type

    val = case type
          when OT.NumericFeature then value.to_f
          when OT.NominalFeature then value.to_s
          end
    next if val == nil

    @dataset.add(compound.uri, feature, val)
    next if type == OT.NumericFeature

    # remember every distinct nominal value as an accepted value of the feature
    @dataset.features[feature][OT.acceptValue] = [] unless @dataset.features[feature][OT.acceptValue]
    accepted = @dataset.features[feature][OT.acceptValue]
    accepted << val.to_s unless accepted.include?(val.to_s)
  end
end
|
449
501
|
|
450
502
|
def feature_type(value)
|
451
|
-
if
|
503
|
+
if value == ""
|
504
|
+
return nil
|
505
|
+
elsif OpenTox::Algorithm::numeric? value
|
452
506
|
return OT.NumericFeature
|
453
507
|
else
|
454
508
|
return OT.NominalFeature
|
@@ -456,7 +510,7 @@ module OpenTox
|
|
456
510
|
end
|
457
511
|
|
458
512
|
# Splits one text line into cells.
# Single and double quotes are removed; cells are separated by comma,
# semicolon or tab with optional surrounding whitespace.
#
# @param [String] row one line of the input
# @return [Array<String>] the cells, empty cells included
def split_row(row)
  unquoted = row.chomp.delete("\"'")
  # limit -1: keep trailing empty cells instead of dropping them
  unquoted.split(/\s*[,;\t]\s*/,-1)
end
|
461
515
|
|
462
516
|
end
|
@@ -468,6 +522,7 @@ module OpenTox
|
|
468
522
|
def initialize
|
469
523
|
@data = {}
|
470
524
|
@activity_errors = []
|
525
|
+
@max_class_values = 3
|
471
526
|
end
|
472
527
|
|
473
528
|
def feature_values(feature)
|
@@ -485,14 +540,14 @@ module OpenTox
|
|
485
540
|
def clean_features
|
486
541
|
ignored_features = []
|
487
542
|
features.each do |feature|
|
488
|
-
if feature_values(feature).size >
|
543
|
+
if feature_values(feature).size > @max_class_values
|
489
544
|
if feature_types(feature).size == 1 and feature_types(feature).first == OT.NumericFeature
|
490
545
|
# REGRESSION
|
491
546
|
elsif feature_types(feature).include? OT.NumericFeature
|
492
547
|
@data.each{|c,row| row[feature] = nil unless OpenTox::Algorithm::numeric?(row[feature]) } # delete nominal features
|
493
548
|
@activity_errors << "Nominal feature values of #{feature} ignored (using numeric features for regression models)."
|
494
549
|
else
|
495
|
-
@activity_errors << "Feature #{feature} ignored (more than
|
550
|
+
@activity_errors << "Feature #{feature} ignored (more than #{@max_class_values} nominal feature values and no numeric values)."
|
496
551
|
ignored_features << feature
|
497
552
|
next
|
498
553
|
end
|
@@ -543,12 +598,15 @@ module OpenTox
|
|
543
598
|
private
|
544
599
|
|
545
600
|
# Classifies a single cell value.
#
# @param value the cell value, may be nil
# @return nil for a nil value, OT.NumericFeature when
#   OpenTox::Algorithm::numeric? accepts the value, OT.NominalFeature otherwise
def feature_type(value)
  return nil if value.nil?
  OpenTox::Algorithm::numeric?(value) ? OT.NumericFeature : OT.NominalFeature
end
|
609
|
+
|
552
610
|
end
|
553
611
|
|
554
612
|
# quick hack to enable sdf import via csv
|
@@ -589,20 +647,20 @@ module OpenTox
|
|
589
647
|
@duplicates[inchi] << rec #inchi#+", "+row.join(", ")
|
590
648
|
compound = Compound.from_inchi inchi
|
591
649
|
rescue
|
592
|
-
@compound_errors << "Could not convert structure to InChI, all entries for this compound (record #{rec} have been ignored! \n#{s}"
|
650
|
+
@compound_errors << "Could not convert structure to InChI, all entries for this compound (record #{rec}) have been ignored! \n#{s}"
|
593
651
|
next
|
594
652
|
end
|
595
653
|
row = {}
|
596
654
|
obmol.get_data.each { |d| row[d.get_attribute] = d.get_value if properties.include?(d.get_attribute) }
|
597
655
|
table.data[compound.uri] = row
|
598
656
|
end
|
599
|
-
|
600
|
-
#
|
657
|
+
|
658
|
+
# find and remove ignored_features
|
601
659
|
@activity_errors = table.clean_features
|
602
660
|
table.add_to_dataset @dataset
|
603
661
|
|
604
662
|
warnings = ''
|
605
|
-
warnings += "<p>Incorrect
|
663
|
+
warnings += "<p>Incorrect structures (ignored):</p>" + @compound_errors.join("<br/>") unless @compound_errors.empty?
|
606
664
|
warnings += "<p>Irregular activities (ignored):</p>" + @activity_errors.join("<br/>") unless @activity_errors.empty?
|
607
665
|
duplicate_warnings = ''
|
608
666
|
@duplicates.each {|inchi,lines| duplicate_warnings << "<p>#{lines.join('<br/>')}</p>" if lines.size > 1 }
|
data/lib/r-util.rb
ADDED
@@ -0,0 +1,354 @@
|
|
1
|
+
# pending: package dir hack ---------
|
2
|
+
# CONFIG[:base_dir] = "/home/<user>/opentox-ruby/www"
|
3
|
+
# PACKAGE_DIR = "/home/<user>/opentox-ruby/r-packages"
|
4
|
+
# PACKAGE_DIR is a sibling of CONFIG[:base_dir]: the last path segment of the
# base directory is replaced with "r-packages".
path_segments = CONFIG[:base_dir].split("/")
path_segments[-1] = "r-packages"
PACKAGE_DIR = path_segments.join("/")
|
8
|
+
|
9
|
+
require "tempfile"
|
10
|
+
|
11
|
+
module OpenTox
|
12
|
+
|
13
|
+
class RUtil
|
14
|
+
|
15
|
+
@@feats = {}
|
16
|
+
|
17
|
+
# Creates the R session (unless one is already set), adds PACKAGE_DIR to R's
# library paths, caches the names of the installed R packages, installs the
# required packages that are missing and sources lib/stratification.R from
# the loaded opentox-ruby gem.
def initialize
  @r ||= RinRuby.new(true,false)
  @r.eval ".libPaths('#{PACKAGE_DIR}')"
  @r_packages = @r.pull "installed.packages()[,1]"
  # not installed automatically: "caret", "smacof", "TunePareto"
  %w[sampling gam vegan].each do |package|
    install_package(package)
  end
  stratification_script = File.join(Gem.loaded_specs['opentox-ruby'].full_gem_path,'lib/stratification.R')
  @r.eval "source('#{stratification_script}')"
end
|
24
|
+
|
25
|
+
# Shuts down the R session held in @r.
#
# Quitting stays best-effort (no exception reaches the caller), but a failure
# is logged instead of being swallowed silently, and the handle is always
# cleared so that a broken session is not kept around.
#
# @return [nil]
def quit_r
  return nil unless @r
  @r.quit
  nil
rescue => e
  LOGGER.warn "r-util> could not quit R cleanly: #{e.message}"
  nil
ensure
  @r = nil
end
|
32
|
+
|
33
|
+
# Accessor for the underlying R session object.
# @return the object stored in @r (nil when none is set)
def r
  return @r
end
|
36
|
+
|
37
|
+
# Checks the package list cached in @r_packages (no new query is sent to R).
# @param [String] package name of the R package
# @return [Boolean] whether the cached list includes the package
def package_installed?( package )
  installed = @r_packages
  installed.include?(package)
end
|
40
|
+
|
41
|
+
# Installs an R package from CRAN into PACKAGE_DIR unless
# package_installed? already reports it.
# @param [String] package name of the R package
# @return nil when nothing had to be done, otherwise the result of @r.eval
def install_package( package )
  return if package_installed?(package)
  LOGGER.debug "r-util> installing r-package #{package} to #{PACKAGE_DIR}"
  @r.eval "install.packages('#{package}', repos='http://cran.r-project.org', lib='#{PACKAGE_DIR}')"
end
|
47
|
+
|
48
|
+
# <0 -> array1 << array2
|
49
|
+
# 0 -> no significant difference
|
50
|
+
# >0 -> array1 >> array2
|
51
|
+
# Runs R's paired t.test on the two arrays (passed to R as v1 and v2).
# @param [Array] array1 first sample
# @param [Array] array2 second sample
# @param [Float] significance_level the result counts as significant when
#   the p-value is below 1-significance_level
# @return the t statistic pulled from R when significant, 0 otherwise
def paired_ttest(array1, array2, significance_level=0.95)
  @r.assign "v1",array1
  @r.assign "v2",array2
  @r.eval "ttest = t.test(as.numeric(v1),as.numeric(v2),paired=T)"
  statistic = @r.pull "ttest$statistic"
  p_value = @r.pull "ttest$p.value"
  alpha = 1-significance_level
  alpha > p_value ? statistic : 0
end
|
63
|
+
|
64
|
+
# example:
|
65
|
+
# files = ["/tmp/box.svg","/tmp/box.png"]
|
66
|
+
# data = [ [ :method, [4,4,5,5,4,3,2] ], [ :method2, [1,2,3,4,5,4,6] ], [ :asdf, [9,1,8,0,7,1,6] ] ]
|
67
|
+
# boxplot(files, data, "comparison1" )
|
68
|
+
#
|
69
|
+
# Draws one box per data series into every given file.
# @param [Array<String>] files target files (svg or png, see plot_to_files)
# @param [Array] data pairs of [name, values]; all value arrays must have
#   the same length because they are transposed into dataframe rows
# @param [String] title plot title
def boxplot(files, data, title="")
  LOGGER.debug("r-util> create boxplot")
  rows = data.collect{|entry| entry[1]}.transpose
  column_names = data.collect{|entry| entry[0].to_s}
  assign_dataframe("boxdata",rows,nil,column_names)
  last_color = data.size+1 # one colour per series, starting at colour 2
  plot_to_files(files) do |file|
    @r.eval "boxplot(boxdata,main='#{title}',col=rep(2:#{last_color}))"
  end
end
|
76
|
+
|
77
|
+
# embeds the feature values of two datasets into 2D and plots them
|
78
|
+
# fast_plot = true -> PCA, fast_plot = false -> SMACOF (iterative optimisation method)
|
79
|
+
#
|
80
|
+
# @param [Array<String>] files target files (svg or png, see plot_to_files)
# @param [String] dataset_uri1 URI of the first dataset
# @param [String] dataset_uri2 URI of the second dataset
# @param [String] dataset_name1 legend name of the first dataset
# @param [String] dataset_name2 legend name of the second dataset
# @param [Array<String>] features feature URIs to use; when nil, both
#   datasets must have identical features and all of them are used
# @param [Boolean] fast_plot true -> PCA, false -> SMACOF
# @param [String] subjectid passed on to the dataset/feature lookups
# @param waiting_task optional task object that receives progress updates
# @raise [RuntimeError] if smacof is needed but missing, a feature is not
#   part of both datasets, the feature sets differ, or fewer than two
#   features are selected
def feature_value_plot(files, dataset_uri1, dataset_uri2, dataset_name1, dataset_name2,
    features=nil, fast_plot=true, subjectid=nil, waiting_task=nil)

  raise "r-package smacof missing" if fast_plot==false and !package_installed?("smacof")
  LOGGER.debug("r-util> create feature value plot")
  d1 = OpenTox::Dataset.find(dataset_uri1,subjectid)
  d2 = OpenTox::Dataset.find(dataset_uri2,subjectid)
  if features
    [d1, d2].each{|d| features.each{|f| raise "feature not included" unless d.features.keys.include?(f)}}
  else
    raise "different\n#{d1.features.keys.sort.to_yaml}\n#{d2.features.keys.sort.to_yaml}" if
      (d1.features.keys.sort != d2.features.keys.sort)
    features = d1.features.keys
  end
  # the embedding works on the selected features, so those are what must be
  # counted here (not all features the first dataset happens to have)
  raise "at least two features needed" if features.size<2
  waiting_task.progress(25) if waiting_task

  df1 = dataset_to_dataframe(d1,0,subjectid,features)
  df2 = dataset_to_dataframe(d2,0,subjectid,features)
  waiting_task.progress(50) if waiting_task

  @r.eval "df <- rbind(#{df1},#{df2})"
  # 0 marks rows of the first dataset, 1 rows of the second
  @r.eval "split <- c(rep(0,nrow(#{df1})),rep(1,nrow(#{df2})))"
  @r.names = [dataset_name1, dataset_name2]
  LOGGER.debug("r-util> - convert data to 2d")
  @r.eval "df.2d <- plot_pre_process(df, method='#{(fast_plot ? "pca" : "smacof")}')"
  waiting_task.progress(75) if waiting_task

  if fast_plot
    info = "main='PCA-Embedding of #{features.size} features',xlab='PC1',ylab='PC2'"
  else
    info = "main='SMACOF-Embedding of #{features.size} features',xlab='x',ylab='y'"
  end
  LOGGER.debug("r-util> - plot data")
  plot_to_files(files) do |file|
    @r.eval "plot_split( df.2d, split, names, #{info})"
  end
end
|
118
|
+
|
119
|
+
# plots a double histogram
|
120
|
+
# data1 and data2 are arrays of values, either numerical or categorical (string values)
|
121
|
+
# is_numerical, boolean flag indicating value types
|
122
|
+
# log (only for numerical), plot logarithm of values
|
123
|
+
# @param [Array<String>] files target files (svg or png, see plot_to_files)
# @param [Array] data1 values of the first series
# @param [Array] data2 values of the second series
# @param [Boolean] is_numerical true: overlaid histograms, false: grouped
#   bar plot of the counts per distinct value
# @param [Boolean] log plot the logarithm of the values (numerical only)
# @param [String] name1 label of the first series
# @param [String] name2 label of the second series
# @param [String] title plot title
# @param [String] xaxis x-axis label (numerical only)
# @raise [RuntimeError] if log is requested for categorical data
def double_hist_plot(files, data1, data2, is_numerical, log=false, name1="first", name2="second", title="title", xaxis="x-values")
  LOGGER.debug("r-util> create double hist plot")
  if (is_numerical)
    # The common breaks are computed on the pooled values. c() concatenates
    # the two vectors; rbind() would recycle the shorter one when the lengths
    # differ.
    @r.eval "double_plot <- function(data1, data2, log=FALSE, names=c('data1','data2'), title='title', xlab='x-values')
    {
      if (log)
      {
        data1 <- log(data1)
        data2 <- log(data2)
        xlab = paste('logarithm of',xlab,sep=' ')
      }
      h <- hist(c(data1,data2),plot=F)
      h1 <- hist(data1,plot=F,breaks=h$breaks)
      h2 <- hist(data2,plot=F,breaks=h$breaks)
      xlims = c(min(h$breaks),max(h$breaks))
      ylims = c(0,max(h1$counts,h2$counts))
      xaxps = c(min(h$breaks),max(h$breaks),(length(h$breaks)-1))
      plot(h1, col=rgb(1,0,0,2/4), xlim=xlims, xaxp=xaxps, ylim=ylims,
        main=title, xlab=xlab, ylab='counts' )
      plot(h2, col=rgb(0,1,0,2/4), add=T )
      legend('topleft',names,lty=c(1,1),col=c('red','green'))
    }"
    @r.assign("data1",data1)
    @r.assign("data2",data2)
    @r.legend = [name1, name2]
  else
    raise "log not valid for categorial" if log
    # one bar pair per distinct value, in sorted order
    vals = (data1 + data2).uniq.sort
    @r.data1 = vals.collect{|e| data1.count(e)}
    @r.data2 = vals.collect{|e| data2.count(e)}
    @r.value_names = [name1, name2]
    @r.legend = vals
    @r.eval("data <- cbind(data1,data2)")
  end

  plot_to_files(files) do |file|
    if (is_numerical)
      @r.eval "double_plot(data1,data2,log=#{log ? "T":"F"},names=legend,title='#{title}',xlab='#{xaxis}')"
    else
      @r.eval("bp <- barplot(data, beside=T, names.arg=value_names,
        main='#{title}', col=sort(rep(2:3,length(legend))))") #legend.text=c(legend),
      @r.eval "text(bp, 0, round(data, 1),cex=1,pos=3)"
    end
  end
end
|
172
|
+
|
173
|
+
# splits a dataset into two datasets, stratified by the feature values
|
174
|
+
# all features are taken into account unless <split_features> is given
|
175
|
+
# @param [OpenTox::Dataset] dataset loaded dataset with compounds and features
# @param missing_values replacement for missing values in the dataframe
# @param [Float] pct ratio handed to the R function stratified_split
# @param [String] subjectid passed on to the dataset/feature services
# @param [Integer] seed seed for R's random number generator
# @param [Array<String>] split_features features to stratify by (nil: all)
# @return [Array<OpenTox::Dataset>] the datasets built by split_to_datasets
# @raise [RuntimeError] if dataset is not a non-empty OpenTox::Dataset
def stratified_split( dataset, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil )
  loaded = (dataset.is_a?(OpenTox::Dataset) and dataset.compounds.size>0 and dataset.features.size>0)
  raise "not a loaded ot-dataset" unless loaded
  LOGGER.debug("r-util> apply stratified split to #{dataset.uri}")

  df = dataset_to_dataframe( dataset, missing_values, subjectid, split_features )
  @r.eval "set.seed(#{seed})"
  @r.eval "split <- stratified_split(#{df}, ratio=#{pct})"
  selection = @r.pull 'split'
  # reverse 1s and 0s, as 1 means selected, but 0 will be first set
  inverted = selection.collect{|flag| 1-flag.to_i}
  split_to_datasets( df, inverted, subjectid )
end
|
186
|
+
|
187
|
+
# dataset should be loaded completely (use Dataset.find)
|
188
|
+
# takes duplicates into account
|
189
|
+
# replaces missing values with param <missing_value>
|
190
|
+
# returns dataframe-variable-name in R
|
191
|
+
# @param dataset completely loaded dataset (uri, features, compounds and
#   data_entries are read)
# @param missing_value value written for empty/missing entries
# @param [String] subjectid passed on to OpenTox::Feature.find
# @param [Array<String>] features feature URIs to export (nil: all). The
#   array is not modified; a sorted copy is used.
# @return [String] name of the dataframe variable created in R
def dataset_to_dataframe( dataset, missing_value="NA", subjectid=nil, features=nil )
  LOGGER.debug "r-util> convert dataset to dataframe #{dataset.uri}"

  # count duplicates: a compound gets as many rows as its longest value list
  num_compounds = {}
  dataset.features.keys.each do |f|
    dataset.compounds.each do |c|
      if dataset.data_entries[c]
        val = dataset.data_entries[c][f]
        size = val==nil ? 1 : val.size
        num_compounds[c] = num_compounds[c]==nil ? size : [num_compounds[c],size].max
      else
        num_compounds[c] = 1
      end
    end
  end

  # use either all, or the provided features, sorting is important as col-index := features
  # (sort, not sort!: the caller's array must not be reordered behind its back)
  features = features ? features.sort : dataset.features.keys.sort

  # row names: every compound repeated once per row it occupies
  compounds = []
  dataset.compounds.each do |c|
    num_compounds[c].times{ compounds << c }
  end

  # values into 2D array, then to dataframe
  d_values = []
  dataset.compounds.each do |c|
    num_compounds[c].times do |i|
      c_values = []
      features.each do |f|
        if dataset.data_entries[c]
          val = dataset.data_entries[c][f]
          v = val==nil ? "" : val[i].to_s
        else
          raise "wtf" if i>0
          v = ""
        end
        v = missing_value if v.size()==0
        c_values << v
      end
      d_values << c_values
    end
  end
  # dataframe name: last path segment of the dataset URI without query string
  df_name = "df_#{dataset.uri.split("/")[-1].split("?")[0]}"
  assign_dataframe(df_name,d_values,compounds,features)

  # set dataframe column types accordingly
  f_count = 1 #R starts at 1
  features.each do |f|
    feat = OpenTox::Feature.find(f,subjectid)
    nominal = feat.metadata[RDF.type].to_a.flatten.include?(OT.NominalFeature)
    if nominal
      @r.eval "#{df_name}[,#{f_count}] <- as.character(#{df_name}[,#{f_count}])"
    else
      @r.eval "#{df_name}[,#{f_count}] <- as.numeric(#{df_name}[,#{f_count}])"
    end
    f_count += 1
  end
  #@r.eval "head(#{df_name})"

  # store compounds, and features (including metainformation)
  @@feats[df_name] = {}
  features.each do |f|
    @@feats[df_name][f] = dataset.features[f]
  end
  df_name
end
|
264
|
+
|
265
|
+
# converts a dataframe into a dataset (a new dataset is created at the dataset webservice)
|
266
|
+
# this is only possible if a superset of the dataframe was created by dataset_to_dataframe (metadata and URIs!)
|
267
|
+
# @param [String] df name of the dataframe variable in R
# @param [String] subjectid passed on to the dataset/feature services
# @return the dataset returned by dataframe_to_dataset_indices
def dataframe_to_dataset( df, subjectid=nil )
  all_rows = nil # nil selects every row of the dataframe
  dataframe_to_dataset_indices( df, subjectid, all_rows )
end
|
270
|
+
|
271
|
+
private
|
272
|
+
# Builds and saves a new dataset from (a subset of) the rows of an R dataframe.
#
# @param [String] df name of the dataframe variable in R; feature metadata
#   must have been stored for it in @@feats (done by dataset_to_dataframe)
# @param [String] subjectid passed on to the dataset/feature services
# @param [Array<Integer>] compound_indices 0-based row indices to include
#   (nil: all rows)
# @return the saved dataset
# @raise [RuntimeError] if no feature metadata is stored for the dataframe or
#   the dataframe contains a feature without stored metadata
def dataframe_to_dataset_indices( df, subjectid=nil, compound_indices=nil )
  known_features = @@feats[df]
  raise "no feature metadata stored for dataframe '#{df}'" if known_features.nil? or known_features.size==0
  values, compounds, features = pull_dataframe(df)
  features.each{|f| raise "feature '#{f}' of dataframe '#{df}' has no stored metadata" unless known_features[f]}
  dataset = OpenTox::Dataset.create(CONFIG[:services]["opentox-dataset"],subjectid)
  LOGGER.debug "r-util> convert dataframe to dataset #{dataset.uri}"

  # resolve the row selection once instead of scanning compound_indices for
  # every cell (assumes integer indices, as passed by split_to_datasets)
  if compound_indices
    wanted = {}
    compound_indices.each{|i| wanted[i] = true}
    rows = (0...compounds.size).select{|i| wanted[i]}
  else
    rows = (0...compounds.size).to_a
  end

  rows.each{|r| dataset.add_compound(compounds[r])}
  features.each{|f| dataset.add_feature(f,known_features[f])}
  features.each_with_index do |feature,c|
    feat = OpenTox::Feature.find(feature,subjectid)
    nominal = feat.metadata[RDF.type].to_a.flatten.include?(OT.NominalFeature)
    rows.each do |r|
      value = values[r][c]
      # "NA" cells are missing values and add no entry
      dataset.add(compounds[r],feature,nominal ? value : value.to_f) if value!="NA"
    end
  end
  dataset.save(subjectid)
  dataset
end
|
292
|
+
|
293
|
+
# Creates one dataset per group number found in +split+.
# @param [String] df name of the dataframe variable in R
# @param [Array] split group number of every dataframe row
# @param [String] subjectid passed on to dataframe_to_dataset_indices
# @return [Array] one dataset per integer from split.min to split.max,
#   in ascending order of the group number
def split_to_datasets( df, split, subjectid=nil )
  first_group = split.min.to_i
  last_group = split.max.to_i
  (first_group .. last_group).collect do |group|
    members = (0...split.size).select{|position| split[position]==group}
    subset = dataframe_to_dataset_indices( df, subjectid, members )
    LOGGER.debug("r-util> split into #{subset.uri}, c:#{subset.compounds.size}, f:#{subset.features.size}")
    subset
  end
end
|
304
|
+
|
305
|
+
# Reads an R dataframe back into Ruby via a temporary '#'-separated file
# written by R's write.table.
#
# The file is opened in block form so the handle is closed, and it is deleted
# in an ensure clause so it does not survive a failure while reading.
#
# @param [String] df name of the dataframe variable in R
# @return [Array] three arrays: the rows of cell values (strings), the first
#   cell of every data row (used as compound), and the cells of the header line
def pull_dataframe(df)
  tmp = File.join(Dir.tmpdir,Time.new.to_f.to_s+"_"+rand(10000).to_s+".csv")
  res = []; compounds = []; features = []
  begin
    @r.eval "write.table(#{df},file='#{tmp}',sep='#')"
    File.open(tmp, 'r') do |file|
      header = true
      file.each_line("\n") do |row|
        cells = row.chomp.split("#").collect{|e| e.gsub("\"","")}
        if header
          features = cells
          header = false
        else
          compounds << cells[0]
          res << cells[1..-1]
        end
      end
    end
  ensure
    begin File.delete(tmp); rescue; end # best effort, the file may not exist
  end
  return res, compounds, features
end
|
324
|
+
|
325
|
+
# Creates an R dataframe from a 2D Ruby array via a temporary '#'-separated
# file read by R's read.table.
#
# The file is written in block form so it is closed before R reads it, and it
# is deleted in an ensure clause. The argument list of read.table is built
# from its parts so that no empty argument (",,") is emitted when there are
# no row names.
#
# @param [String] df name of the dataframe variable to create in R
# @param [Array<Array>] input rows of cell values
# @param [Array, nil] rownames row names (nil: none are passed to R)
# @param [Array] colnames column names
def assign_dataframe(df,input,rownames,colnames)
  tmp = File.join(Dir.tmpdir,Time.new.to_f.to_s+"_"+rand(10000).to_s+".csv")
  begin
    File.open(tmp, 'w') do |file|
      input.each{|row| file.puts(row.collect{|e| "\"#{e}\""}.join("#"))}
    end
    @r.rownames = rownames if rownames
    @r.colnames = colnames
    args = ["file='#{tmp}'","sep='#'"]
    args << "row.names=rownames" if rownames
    args << "col.names=colnames" << "check.names=F"
    @r.eval "#{df} <- read.table(#{args.join(",")})"
  ensure
    begin File.delete(tmp); rescue; end # best effort, the file may not exist
  end
end
|
336
|
+
|
337
|
+
# Opens an R graphics device for every file, yields the file name so the
# caller can issue its plot commands, and closes the device again.
#
# The device is closed in an ensure clause, so a failing plot block does not
# leave an open device behind in the R session.
#
# @param [Array<String>] files target files; the name must contain ".svg"
#   or ".png" (case-insensitive)
# @yield [file] called once per file while its device is open
# @raise [RuntimeError] for a file that is neither svg nor png
def plot_to_files(files)
  files.each do |file|
    if file=~/(?i)\.svg/
      @r.eval("svg('#{file}',10,8)")
    elsif file=~/(?i)\.png/
      @r.eval("png('#{file}')")
    else
      raise "invalid format: "+file.to_s
    end
    begin
      yield file
      LOGGER.debug "r-util> plotted to #{file}"
    ensure
      @r.eval("dev.off()")
    end
  end
end
|
351
|
+
end
|
352
|
+
end
|
353
|
+
|
354
|
+
|