opentox-ruby 3.0.1 → 3.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog +8 -0
- data/Rakefile +2 -3
- data/VERSION +1 -1
- data/lib/algorithm.rb +227 -675
- data/lib/authorization.rb +10 -8
- data/lib/compound.rb +47 -11
- data/lib/dataset.rb +50 -2
- data/lib/environment.rb +6 -1
- data/lib/model.rb +37 -72
- data/lib/opentox-ruby.rb +1 -1
- data/lib/parser.rb +115 -57
- data/lib/r-util.rb +354 -0
- data/lib/rest_client_wrapper.rb +1 -1
- data/lib/serializer.rb +47 -30
- data/lib/stratification.R +201 -0
- data/lib/task.rb +5 -1
- data/lib/transform.rb +520 -0
- data/lib/utils.rb +372 -0
- data/lib/validation.rb +52 -6
- metadata +413 -428
data/lib/parser.rb
CHANGED
@@ -57,7 +57,7 @@ module OpenTox
|
|
57
57
|
`rapper -i rdfxml -o ntriples #{file.path} 2>/dev/null`.each_line do |line|
|
58
58
|
triple = line.to_triple
|
59
59
|
if triple[0] == @uri
|
60
|
-
if triple[1] == RDF.type || triple[1]==OT.predictedVariables # allow multiple types
|
60
|
+
if triple[1] == RDF.type || triple[1]==OT.predictedVariables || triple[1]==OT.independentVariables # allow multiple types
|
61
61
|
@metadata[triple[1]] = [] unless @metadata[triple[1]]
|
62
62
|
@metadata[triple[1]] << triple[2].split('^^').first
|
63
63
|
else
|
@@ -290,10 +290,11 @@ module OpenTox
|
|
290
290
|
@features = []
|
291
291
|
@feature_types = {}
|
292
292
|
|
293
|
-
@format_errors =
|
294
|
-
@
|
293
|
+
@format_errors = []
|
294
|
+
@id_errors = []
|
295
295
|
@activity_errors = []
|
296
296
|
@duplicates = {}
|
297
|
+
@max_class_values = 3
|
297
298
|
end
|
298
299
|
|
299
300
|
def detect_new_values(row, value_maps)
|
@@ -309,9 +310,10 @@ module OpenTox
|
|
309
310
|
# Load Spreadsheet book (created with roo gem http://roo.rubyforge.org/, excel format specification: http://toxcreate.org/help)
|
310
311
|
# @param [Excel] book Excel workbook object (created with roo gem)
|
311
312
|
# @return [OpenTox::Dataset] Dataset object with Excel data
|
312
|
-
def load_spreadsheet(book)
|
313
|
+
def load_spreadsheet(book, drop_missing=false)
|
313
314
|
book.default_sheet = 0
|
314
|
-
|
315
|
+
headers = book.row(1)
|
316
|
+
add_features headers
|
315
317
|
value_maps = Array.new
|
316
318
|
regression_features=Array.new
|
317
319
|
|
@@ -319,15 +321,27 @@ module OpenTox
|
|
319
321
|
row = book.row(i)
|
320
322
|
value_maps = detect_new_values(row, value_maps)
|
321
323
|
value_maps.each_with_index { |vm,j|
|
322
|
-
if vm.size >
|
324
|
+
if vm.size > @max_class_values # 5 is the maximum nr of classes supported by Fminer.
|
323
325
|
regression_features[j]=true
|
324
326
|
else
|
325
327
|
regression_features[j]=false
|
326
328
|
end
|
327
329
|
}
|
328
330
|
}
|
331
|
+
|
329
332
|
2.upto(book.last_row) { |i|
|
330
|
-
|
333
|
+
drop=false
|
334
|
+
row = book.row(i)
|
335
|
+
raise "Entry has size #{row.size}, different from headers (#{headers.size})" if row.size != headers.size
|
336
|
+
if row.include?("")
|
337
|
+
@format_errors << "Row #{i} has #{row.count("")} missing values"
|
338
|
+
drop=true
|
339
|
+
drop_missing=true if (row.count("") == row.size-1)
|
340
|
+
end
|
341
|
+
add_values(row, regression_features) unless (drop_missing && drop)
|
342
|
+
if (drop_missing && drop)
|
343
|
+
@format_errors << "Row #{i} not added"
|
344
|
+
end
|
331
345
|
}
|
332
346
|
warnings
|
333
347
|
@dataset
|
@@ -336,10 +350,11 @@ module OpenTox
|
|
336
350
|
# Load CSV string (format specification: http://toxcreate.org/help)
|
337
351
|
# @param [String] csv CSV representation of the dataset
|
338
352
|
# @return [OpenTox::Dataset] Dataset object with CSV data
|
339
|
-
def load_csv(csv)
|
353
|
+
def load_csv(csv, drop_missing=false)
|
340
354
|
row = 0
|
341
355
|
input = csv.split("\n")
|
342
|
-
|
356
|
+
headers = split_row(input.shift)
|
357
|
+
add_features(headers)
|
343
358
|
value_maps = Array.new
|
344
359
|
regression_features=Array.new
|
345
360
|
|
@@ -347,15 +362,27 @@ module OpenTox
|
|
347
362
|
row = split_row(row)
|
348
363
|
value_maps = detect_new_values(row, value_maps)
|
349
364
|
value_maps.each_with_index { |vm,j|
|
350
|
-
if vm.size >
|
365
|
+
if vm.size > @max_class_values # max @max_class_values classes.
|
351
366
|
regression_features[j]=true
|
352
367
|
else
|
353
368
|
regression_features[j]=false
|
354
369
|
end
|
355
370
|
}
|
356
371
|
}
|
357
|
-
|
358
|
-
|
372
|
+
|
373
|
+
input.each_with_index { |row, i|
|
374
|
+
drop=false
|
375
|
+
row = split_row(row)
|
376
|
+
raise "Entry has size #{row.size}, different from headers (#{headers.size})" if row.size != headers.size
|
377
|
+
if row.include?("")
|
378
|
+
@format_errors << "Row #{i} has #{row.count("")} missing values"
|
379
|
+
drop=true
|
380
|
+
drop_missing=true if (row.count("") == row.size-1)
|
381
|
+
end
|
382
|
+
add_values(row, regression_features) unless (drop_missing && drop)
|
383
|
+
if (drop_missing && drop)
|
384
|
+
@format_errors << "Row #{i} not added"
|
385
|
+
end
|
359
386
|
}
|
360
387
|
warnings
|
361
388
|
@dataset
|
@@ -367,88 +394,115 @@ module OpenTox
|
|
367
394
|
|
368
395
|
info = ''
|
369
396
|
@feature_types.each do |feature,types|
|
370
|
-
if types.uniq.size
|
397
|
+
if types.uniq.size == 0
|
398
|
+
type = "helper#MissingFeature"
|
399
|
+
elsif types.uniq.size > 1
|
371
400
|
type = OT.NumericFeature
|
372
401
|
else
|
373
402
|
type = types.first
|
374
403
|
end
|
375
404
|
@dataset.add_feature_metadata(feature,{RDF.type => [type]})
|
376
|
-
info += "\"#{@dataset.feature_name(feature)}\" detected as #{type.split('#').last}."
|
405
|
+
info += "\"#{@dataset.feature_name(feature)}\" detected as #{type.split('#').last}." if type
|
377
406
|
|
378
407
|
# TODO: rewrite feature values
|
379
|
-
# TODO if value.to_f == 0 @activity_errors << "#{
|
408
|
+
# TODO if value.to_f == 0 @activity_errors << "#{id} Zero values not allowed for regression datasets - entry ignored."
|
380
409
|
end
|
381
410
|
|
382
411
|
@dataset.metadata[OT.Info] = info
|
383
412
|
|
384
413
|
warnings = ''
|
385
|
-
warnings += "<p>Incorrect
|
414
|
+
warnings += "<p>Incorrect structures (ignored):</p>" + @id_errors.join("<br/>") unless @id_errors.empty?
|
386
415
|
warnings += "<p>Irregular activities (ignored):</p>" + @activity_errors.join("<br/>") unless @activity_errors.empty?
|
416
|
+
warnings += "<p>Format errors:</p>" + @format_errors.join("<br/>") unless @format_errors.empty?
|
387
417
|
duplicate_warnings = ''
|
388
418
|
@duplicates.each {|inchi,lines| duplicate_warnings << "<p>#{lines.join('<br/>')}</p>" if lines.size > 1 }
|
389
|
-
warnings += "<p>
|
419
|
+
warnings += "<p>Duplicate structures (all structures/activities used for model building, please make sure that the results were obtained from <em>independent</em> experiments):</p>" + duplicate_warnings unless duplicate_warnings.empty?
|
390
420
|
|
391
421
|
@dataset.metadata[OT.Warnings] = warnings
|
392
422
|
|
393
423
|
end
|
394
424
|
|
425
|
+
# Adds a row of features to a dataset
|
426
|
+
# @param Array A row split up as an array
|
427
|
+
# @return Array Indices for duplicate features
|
395
428
|
def add_features(row)
|
396
|
-
row.
|
397
|
-
row.
|
429
|
+
row=row.collect
|
430
|
+
row.shift # get rid of id entry
|
431
|
+
@duplicate_feature_indices = [] # starts with 0 at first f after id
|
432
|
+
row.each_with_index do |feature_name, idx|
|
398
433
|
feature_uri = File.join(@dataset.uri,"feature",URI.encode(feature_name))
|
399
|
-
@
|
400
|
-
|
401
|
-
|
434
|
+
unless @features.include? feature_uri
|
435
|
+
@feature_types[feature_uri] = []
|
436
|
+
@features << feature_uri
|
437
|
+
@dataset.add_feature(feature_uri,{DC.title => feature_name})
|
438
|
+
else
|
439
|
+
@duplicate_feature_indices << idx
|
440
|
+
@format_errors << "Duplicate Feature '#{feature_name}' at pos #{idx}"
|
441
|
+
end
|
402
442
|
end
|
403
443
|
end
|
404
444
|
|
405
445
|
# Adds a row to a dataset
|
406
446
|
# @param Array A row split up as an array
|
407
447
|
# @param Array Indicator for regression for each field
|
448
|
+
# @param Array Indices for duplicate features
|
408
449
|
def add_values(row, regression_features)
|
409
450
|
|
410
|
-
|
411
|
-
|
451
|
+
id = row.shift
|
452
|
+
case id
|
453
|
+
when /InChI/
|
454
|
+
compound = Compound.from_inchi(URI.decode_www_form_component(id))
|
455
|
+
else
|
456
|
+
compound = Compound.from_smiles(id)
|
457
|
+
end
|
458
|
+
|
412
459
|
if compound.nil? or compound.inchi.nil? or compound.inchi == ""
|
413
|
-
@
|
460
|
+
@id_errors << id+", "+row.join(", ")
|
414
461
|
return false
|
415
462
|
end
|
416
463
|
@duplicates[compound.inchi] = [] unless @duplicates[compound.inchi]
|
417
|
-
@duplicates[compound.inchi] <<
|
464
|
+
@duplicates[compound.inchi] << id+", "+row.join(", ")
|
418
465
|
|
466
|
+
feature_idx = 0
|
419
467
|
row.each_index do |i|
|
420
|
-
value = row[i]
|
421
|
-
feature = @features[i]
|
422
468
|
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
if
|
427
|
-
|
469
|
+
unless @duplicate_feature_indices.include? i
|
470
|
+
|
471
|
+
value = row[i]
|
472
|
+
#LOGGER.warn "Missing values for #{id}" if value.size == 0 # String is empty
|
473
|
+
feature = @features[feature_idx]
|
474
|
+
|
475
|
+
type = feature_type(value) # May be NIL
|
476
|
+
type = OT.NominalFeature unless (type.nil? || regression_features[i])
|
477
|
+
@feature_types[feature] << type if type
|
478
|
+
|
479
|
+
val = nil
|
480
|
+
case type
|
481
|
+
when OT.NumericFeature
|
482
|
+
val = value.to_f
|
483
|
+
when OT.NominalFeature
|
484
|
+
val = value.to_s
|
428
485
|
end
|
429
|
-
else
|
430
|
-
type = OT.NominalFeature
|
431
|
-
end
|
432
|
-
@feature_types[feature] << type
|
433
486
|
|
434
|
-
|
435
|
-
|
436
|
-
val
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
if type!=OT.NumericFeature
|
443
|
-
@dataset.features[feature][OT.acceptValue] = [] unless @dataset.features[feature][OT.acceptValue]
|
444
|
-
@dataset.features[feature][OT.acceptValue] << val.to_s unless @dataset.features[feature][OT.acceptValue].include?(val.to_s)
|
487
|
+
feature_idx += 1
|
488
|
+
|
489
|
+
if val != nil
|
490
|
+
@dataset.add(compound.uri, feature, val)
|
491
|
+
if type != OT.NumericFeature
|
492
|
+
@dataset.features[feature][OT.acceptValue] = [] unless @dataset.features[feature][OT.acceptValue]
|
493
|
+
@dataset.features[feature][OT.acceptValue] << val.to_s unless @dataset.features[feature][OT.acceptValue].include?(val.to_s)
|
494
|
+
end
|
445
495
|
end
|
496
|
+
|
446
497
|
end
|
498
|
+
|
447
499
|
end
|
448
500
|
end
|
449
501
|
|
450
502
|
def feature_type(value)
|
451
|
-
if
|
503
|
+
if value == ""
|
504
|
+
return nil
|
505
|
+
elsif OpenTox::Algorithm::numeric? value
|
452
506
|
return OT.NumericFeature
|
453
507
|
else
|
454
508
|
return OT.NominalFeature
|
@@ -456,7 +510,7 @@ module OpenTox
|
|
456
510
|
end
|
457
511
|
|
458
512
|
def split_row(row)
|
459
|
-
row.chomp.gsub(/["']/,'').split(/\s*[,;\t]\s
|
513
|
+
row.chomp.gsub(/["']/,'').split(/\s*[,;\t]\s*/,-1) # -1: do not skip empty cells
|
460
514
|
end
|
461
515
|
|
462
516
|
end
|
@@ -468,6 +522,7 @@ module OpenTox
|
|
468
522
|
def initialize
|
469
523
|
@data = {}
|
470
524
|
@activity_errors = []
|
525
|
+
@max_class_values = 3
|
471
526
|
end
|
472
527
|
|
473
528
|
def feature_values(feature)
|
@@ -485,14 +540,14 @@ module OpenTox
|
|
485
540
|
def clean_features
|
486
541
|
ignored_features = []
|
487
542
|
features.each do |feature|
|
488
|
-
if feature_values(feature).size >
|
543
|
+
if feature_values(feature).size > @max_class_values
|
489
544
|
if feature_types(feature).size == 1 and feature_types(feature).first == OT.NumericFeature
|
490
545
|
# REGRESSION
|
491
546
|
elsif feature_types(feature).include? OT.NumericFeature
|
492
547
|
@data.each{|c,row| row[feature] = nil unless OpenTox::Algorithm::numeric?(row[feature]) } # delete nominal features
|
493
548
|
@activity_errors << "Nominal feature values of #{feature} ignored (using numeric features for regression models)."
|
494
549
|
else
|
495
|
-
@activity_errors << "Feature #{feature} ignored (more than
|
550
|
+
@activity_errors << "Feature #{feature} ignored (more than #{@max_class_values} nominal feature values and no numeric values)."
|
496
551
|
ignored_features << feature
|
497
552
|
next
|
498
553
|
end
|
@@ -543,12 +598,15 @@ module OpenTox
|
|
543
598
|
private
|
544
599
|
|
545
600
|
def feature_type(value)
|
546
|
-
if
|
601
|
+
if value.nil?
|
602
|
+
return nil
|
603
|
+
elsif OpenTox::Algorithm::numeric? value
|
547
604
|
return OT.NumericFeature
|
548
605
|
else
|
549
606
|
return OT.NominalFeature
|
550
607
|
end
|
551
608
|
end
|
609
|
+
|
552
610
|
end
|
553
611
|
|
554
612
|
# quick hack to enable sdf import via csv
|
@@ -589,20 +647,20 @@ module OpenTox
|
|
589
647
|
@duplicates[inchi] << rec #inchi#+", "+row.join(", ")
|
590
648
|
compound = Compound.from_inchi inchi
|
591
649
|
rescue
|
592
|
-
@compound_errors << "Could not convert structure to InChI, all entries for this compound (record #{rec} have been ignored! \n#{s}"
|
650
|
+
@compound_errors << "Could not convert structure to InChI, all entries for this compound (record #{rec}) have been ignored! \n#{s}"
|
593
651
|
next
|
594
652
|
end
|
595
653
|
row = {}
|
596
654
|
obmol.get_data.each { |d| row[d.get_attribute] = d.get_value if properties.include?(d.get_attribute) }
|
597
655
|
table.data[compound.uri] = row
|
598
656
|
end
|
599
|
-
|
600
|
-
#
|
657
|
+
|
658
|
+
# find and remove ignored_features
|
601
659
|
@activity_errors = table.clean_features
|
602
660
|
table.add_to_dataset @dataset
|
603
661
|
|
604
662
|
warnings = ''
|
605
|
-
warnings += "<p>Incorrect
|
663
|
+
warnings += "<p>Incorrect structures (ignored):</p>" + @compound_errors.join("<br/>") unless @compound_errors.empty?
|
606
664
|
warnings += "<p>Irregular activities (ignored):</p>" + @activity_errors.join("<br/>") unless @activity_errors.empty?
|
607
665
|
duplicate_warnings = ''
|
608
666
|
@duplicates.each {|inchi,lines| duplicate_warnings << "<p>#{lines.join('<br/>')}</p>" if lines.size > 1 }
|
data/lib/r-util.rb
ADDED
@@ -0,0 +1,354 @@
|
|
1
|
+
# pending: package dir hack ---------
|
2
|
+
# CONFIG[:base_dir] = "/home/<user>/opentox-ruby/www"
|
3
|
+
# PACKAGE_DIR = "/home/<user>/opentox-ruby/r-packages"
|
4
|
+
package_dir = CONFIG[:base_dir].split("/")
|
5
|
+
package_dir[-1] = "r-packages"
|
6
|
+
package_dir = package_dir.join("/")
|
7
|
+
PACKAGE_DIR = package_dir
|
8
|
+
|
9
|
+
require "tempfile"
|
10
|
+
|
11
|
+
module OpenTox
|
12
|
+
|
13
|
+
class RUtil
|
14
|
+
|
15
|
+
@@feats = {}
|
16
|
+
|
17
|
+
def initialize
|
18
|
+
@r = RinRuby.new(true,false) unless defined?(@r) and @r
|
19
|
+
@r.eval ".libPaths('#{PACKAGE_DIR}')"
|
20
|
+
@r_packages = @r.pull "installed.packages()[,1]"
|
21
|
+
["sampling","gam","vegan"].each{|l| install_package(l)} #"caret", "smacof", "TunePareto"
|
22
|
+
@r.eval "source('#{File.join(Gem.loaded_specs['opentox-ruby'].full_gem_path,'lib/stratification.R')}')"
|
23
|
+
end
|
24
|
+
|
25
|
+
def quit_r
|
26
|
+
begin
|
27
|
+
@r.quit
|
28
|
+
@r = nil
|
29
|
+
rescue
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
def r
|
34
|
+
@r
|
35
|
+
end
|
36
|
+
|
37
|
+
def package_installed?( package )
|
38
|
+
@r_packages.include?(package)
|
39
|
+
end
|
40
|
+
|
41
|
+
def install_package( package )
|
42
|
+
unless package_installed?(package)
|
43
|
+
LOGGER.debug "r-util> installing r-package #{package} to #{PACKAGE_DIR}"
|
44
|
+
@r.eval "install.packages('#{package}', repos='http://cran.r-project.org', lib='#{PACKAGE_DIR}')"
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
# <0 -> array1 << array2
|
49
|
+
# 0 -> no significant difference
|
50
|
+
# >0 -> array1 >> array2
|
51
|
+
def paired_ttest(array1, array2, significance_level=0.95)
|
52
|
+
@r.assign "v1",array1
|
53
|
+
@r.assign "v2",array2
|
54
|
+
@r.eval "ttest = t.test(as.numeric(v1),as.numeric(v2),paired=T)"
|
55
|
+
t = @r.pull "ttest$statistic"
|
56
|
+
p = @r.pull "ttest$p.value"
|
57
|
+
if (1-significance_level > p)
|
58
|
+
t
|
59
|
+
else
|
60
|
+
0
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
# example:
|
65
|
+
# files = ["/tmp/box.svg","/tmp/box.png"]
|
66
|
+
# data = [ [ :method, [4,4,5,5,4,3,2] ], [ :method2, [1,2,3,4,5,4,6] ], [ :asdf, [9,1,8,0,7,1,6] ] ]
|
67
|
+
# boxplot(files, data, "comparison1" )
|
68
|
+
#
|
69
|
+
def boxplot(files, data, title="")
|
70
|
+
LOGGER.debug("r-util> create boxplot")
|
71
|
+
assign_dataframe("boxdata",data.collect{|e| e[1]}.transpose,nil,data.collect{|e| e[0].to_s})
|
72
|
+
plot_to_files(files) do |file|
|
73
|
+
@r.eval "boxplot(boxdata,main='#{title}',col=rep(2:#{data.size+1}))"
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
# embeds feature values of two datasets into 2D and plots them
|
78
|
+
# fast_plot = true -> PCA, fast_plot = false -> SMACOF (iterative optimisation method)
|
79
|
+
#
|
80
|
+
def feature_value_plot(files, dataset_uri1, dataset_uri2, dataset_name1, dataset_name2,
|
81
|
+
features=nil, fast_plot=true, subjectid=nil, waiting_task=nil)
|
82
|
+
|
83
|
+
raise "r-package smacof missing" if fast_plot==false and !package_installed?("smacof")
|
84
|
+
LOGGER.debug("r-util> create feature value plot")
|
85
|
+
d1 = OpenTox::Dataset.find(dataset_uri1,subjectid)
|
86
|
+
d2 = OpenTox::Dataset.find(dataset_uri2,subjectid)
|
87
|
+
if features
|
88
|
+
[d1, d2].each{|d| features.each{|f| raise "feature not included" unless d.features.keys.include?(f)}}
|
89
|
+
else
|
90
|
+
raise "different\n#{d1.features.keys.sort.to_yaml}\n#{d2.features.keys.sort.to_yaml}" if
|
91
|
+
(d1.features.keys.sort != d2.features.keys.sort)
|
92
|
+
features = d1.features.keys
|
93
|
+
end
|
94
|
+
raise "at least two features needed" if d1.features.keys.size<2
|
95
|
+
waiting_task.progress(25) if waiting_task
|
96
|
+
|
97
|
+
df1 = dataset_to_dataframe(d1,0,subjectid,features)
|
98
|
+
df2 = dataset_to_dataframe(d2,0,subjectid,features)
|
99
|
+
waiting_task.progress(50) if waiting_task
|
100
|
+
|
101
|
+
@r.eval "df <- rbind(#{df1},#{df2})"
|
102
|
+
@r.eval "split <- c(rep(0,nrow(#{df1})),rep(1,nrow(#{df2})))"
|
103
|
+
@r.names = [dataset_name1, dataset_name2]
|
104
|
+
LOGGER.debug("r-util> - convert data to 2d")
|
105
|
+
@r.eval "df.2d <- plot_pre_process(df, method='#{(fast_plot ? "pca" : "smacof")}')"
|
106
|
+
waiting_task.progress(75) if waiting_task
|
107
|
+
|
108
|
+
if fast_plot
|
109
|
+
info = "main='PCA-Embedding of #{features.size} features',xlab='PC1',ylab='PC2'"
|
110
|
+
else
|
111
|
+
info = "main='SMACOF-Embedding of #{features.size} features',xlab='x',ylab='y'"
|
112
|
+
end
|
113
|
+
LOGGER.debug("r-util> - plot data")
|
114
|
+
plot_to_files(files) do |file|
|
115
|
+
@r.eval "plot_split( df.2d, split, names, #{info})"
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
119
|
+
# plots a double histogram
|
120
|
+
# data1 and data2 are arrays with values, either numerical or categorical (string values)
|
121
|
+
# is_numerical, boolean flag indicating value types
|
122
|
+
# log (only for numerical), plot logarithm of values
|
123
|
+
def double_hist_plot(files, data1, data2, is_numerical, log=false, name1="first", name2="second", title="title", xaxis="x-values")
|
124
|
+
LOGGER.debug("r-util> create double hist plot")
|
125
|
+
all = data1 + data2
|
126
|
+
if (is_numerical)
|
127
|
+
@r.eval "double_plot <- function(data1, data2, log=FALSE, names=c('data1','data2'), title='title', xlab='x-values')
|
128
|
+
{
|
129
|
+
if (log)
|
130
|
+
{
|
131
|
+
data1 <- log(data1)
|
132
|
+
data2 <- log(data2)
|
133
|
+
xlab = paste('logarithm of',xlab,sep=' ')
|
134
|
+
}
|
135
|
+
xlims <- round(c(min(c(min(data1),min(data2))),max(c(max(data1),max(data2)))))
|
136
|
+
h <- hist(rbind(data1,data2),plot=F)
|
137
|
+
h1 <- hist(data1,plot=F,breaks=h$breaks)
|
138
|
+
h2 <- hist(data2,plot=F,breaks=h$breaks)
|
139
|
+
xlims = c(min(h$breaks),max(h$breaks))
|
140
|
+
ylims = c(0,max(h1$counts,h2$counts))
|
141
|
+
xaxps = c(min(h$breaks),max(h$breaks),(length(h$breaks)-1))
|
142
|
+
plot(h1, col=rgb(1,0,0,2/4), xlim=xlims, xaxp=xaxps, ylim=ylims,
|
143
|
+
main=title, xlab=xlab, ylab='counts' )
|
144
|
+
plot(h2, col=rgb(0,1,0,2/4), add=T )
|
145
|
+
legend('topleft',names,lty=c(1,1),col=c('red','green'))
|
146
|
+
}"
|
147
|
+
@r.assign("data1",data1)
|
148
|
+
@r.assign("data2",data2)
|
149
|
+
@r.legend = [name1, name2]
|
150
|
+
else
|
151
|
+
raise "log not valid for categorial" if log
|
152
|
+
vals = all.uniq.sort!
|
153
|
+
counts1 = vals.collect{|e| data1.count(e)}
|
154
|
+
counts2 = vals.collect{|e| data2.count(e)}
|
155
|
+
@r.data1 = counts1
|
156
|
+
@r.data2 = counts2
|
157
|
+
@r.value_names = [name1, name2]
|
158
|
+
@r.legend = vals
|
159
|
+
@r.eval("data <- cbind(data1,data2)")
|
160
|
+
end
|
161
|
+
|
162
|
+
plot_to_files(files) do |file|
|
163
|
+
if (is_numerical)
|
164
|
+
@r.eval "double_plot(data1,data2,log=#{log ? "T":"F"},names=legend,title='#{title}',xlab='#{xaxis}')"
|
165
|
+
else
|
166
|
+
@r.eval("bp <- barplot(data, beside=T, names.arg=value_names,
|
167
|
+
main='#{title}', col=sort(rep(2:3,length(legend))))") #legend.text=c(legend),
|
168
|
+
@r.eval "text(bp, 0, round(data, 1),cex=1,pos=3)"
|
169
|
+
end
|
170
|
+
end
|
171
|
+
end
|
172
|
+
|
173
|
+
# splits a dataset into two datasets, stratified by the feature values
|
174
|
+
# all features are taken into account unless <split_features> is given
|
175
|
+
def stratified_split( dataset, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil )
|
176
|
+
raise "not a loaded ot-dataset" unless dataset.is_a?(OpenTox::Dataset) and dataset.compounds.size>0 and dataset.features.size>0
|
177
|
+
LOGGER.debug("r-util> apply stratified split to #{dataset.uri}")
|
178
|
+
|
179
|
+
df = dataset_to_dataframe( dataset, missing_values, subjectid, split_features )
|
180
|
+
@r.eval "set.seed(#{seed})"
|
181
|
+
@r.eval "split <- stratified_split(#{df}, ratio=#{pct})"
|
182
|
+
split = @r.pull 'split'
|
183
|
+
split = split.collect{|s| 1-s.to_i} # reverse 1s and 0s, as 1 means selected, but 0 will be first set
|
184
|
+
split_to_datasets( df, split, subjectid )
|
185
|
+
end
|
186
|
+
|
187
|
+
# dataset should be loaded completely (use Dataset.find)
|
188
|
+
# takes duplicates into account
|
189
|
+
# replaces missing values with param <missing_value>
|
190
|
+
# returns dataframe-variable-name in R
|
191
|
+
def dataset_to_dataframe( dataset, missing_value="NA", subjectid=nil, features=nil )
|
192
|
+
LOGGER.debug "r-util> convert dataset to dataframe #{dataset.uri}"
|
193
|
+
|
194
|
+
# count duplicates
|
195
|
+
num_compounds = {}
|
196
|
+
dataset.features.keys.each do |f|
|
197
|
+
dataset.compounds.each do |c|
|
198
|
+
if dataset.data_entries[c]
|
199
|
+
val = dataset.data_entries[c][f]
|
200
|
+
size = val==nil ? 1 : val.size
|
201
|
+
num_compounds[c] = num_compounds[c]==nil ? size : [num_compounds[c],size].max
|
202
|
+
else
|
203
|
+
num_compounds[c] = 1
|
204
|
+
end
|
205
|
+
end
|
206
|
+
end
|
207
|
+
|
208
|
+
# use either all, or the provided features, sorting is important as col-index := features
|
209
|
+
if features
|
210
|
+
features.sort!
|
211
|
+
else
|
212
|
+
features = dataset.features.keys.sort
|
213
|
+
end
|
214
|
+
compounds = []
|
215
|
+
dataset.compounds.each do |c|
|
216
|
+
num_compounds[c].times do |i|
|
217
|
+
compounds << c
|
218
|
+
end
|
219
|
+
end
|
220
|
+
|
221
|
+
# values into 2D array, then to dataframe
|
222
|
+
d_values = []
|
223
|
+
dataset.compounds.each do |c|
|
224
|
+
num_compounds[c].times do |i|
|
225
|
+
c_values = []
|
226
|
+
features.each do |f|
|
227
|
+
if dataset.data_entries[c]
|
228
|
+
val = dataset.data_entries[c][f]
|
229
|
+
v = val==nil ? "" : val[i].to_s
|
230
|
+
else
|
231
|
+
raise "wtf" if i>0
|
232
|
+
v = ""
|
233
|
+
end
|
234
|
+
v = missing_value if v.size()==0
|
235
|
+
c_values << v
|
236
|
+
end
|
237
|
+
d_values << c_values
|
238
|
+
end
|
239
|
+
end
|
240
|
+
df_name = "df_#{dataset.uri.split("/")[-1].split("?")[0]}"
|
241
|
+
assign_dataframe(df_name,d_values,compounds,features)
|
242
|
+
|
243
|
+
# set dataframe column types accordingly
|
244
|
+
f_count = 1 #R starts at 1
|
245
|
+
features.each do |f|
|
246
|
+
feat = OpenTox::Feature.find(f,subjectid)
|
247
|
+
nominal = feat.metadata[RDF.type].to_a.flatten.include?(OT.NominalFeature)
|
248
|
+
if nominal
|
249
|
+
@r.eval "#{df_name}[,#{f_count}] <- as.character(#{df_name}[,#{f_count}])"
|
250
|
+
else
|
251
|
+
@r.eval "#{df_name}[,#{f_count}] <- as.numeric(#{df_name}[,#{f_count}])"
|
252
|
+
end
|
253
|
+
f_count += 1
|
254
|
+
end
|
255
|
+
#@r.eval "head(#{df_name})"
|
256
|
+
|
257
|
+
# store compounds, and features (including metainformation)
|
258
|
+
@@feats[df_name] = {}
|
259
|
+
features.each do |f|
|
260
|
+
@@feats[df_name][f] = dataset.features[f]
|
261
|
+
end
|
262
|
+
df_name
|
263
|
+
end
|
264
|
+
|
265
|
+
# converts a dataframe into a dataset (a new dataset is created at the dataset webservice)
|
266
|
+
# this is only possible if a superset of the dataframe was created by dataset_to_dataframe (metadata and URIs!)
|
267
|
+
def dataframe_to_dataset( df, subjectid=nil )
|
268
|
+
dataframe_to_dataset_indices( df, subjectid, nil)
|
269
|
+
end
|
270
|
+
|
271
|
+
private
|
272
|
+
def dataframe_to_dataset_indices( df, subjectid=nil, compound_indices=nil )
|
273
|
+
raise unless @@feats[df].size>0
|
274
|
+
values, compounds, features = pull_dataframe(df)
|
275
|
+
features.each{|f| raise unless @@feats[df][f]}
|
276
|
+
dataset = OpenTox::Dataset.create(CONFIG[:services]["opentox-dataset"],subjectid)
|
277
|
+
LOGGER.debug "r-util> convert dataframe to dataset #{dataset.uri}"
|
278
|
+
compounds.size.times{|i| dataset.add_compound(compounds[i]) if compound_indices==nil or compound_indices.include?(i)}
|
279
|
+
features.each{|f| dataset.add_feature(f,@@feats[df][f])}
|
280
|
+
features.size.times do |c|
|
281
|
+
feat = OpenTox::Feature.find(features[c],subjectid)
|
282
|
+
nominal = feat.metadata[RDF.type].to_a.flatten.include?(OT.NominalFeature)
|
283
|
+
compounds.size.times do |r|
|
284
|
+
if compound_indices==nil or compound_indices.include?(r)
|
285
|
+
dataset.add(compounds[r],features[c],nominal ? values[r][c] : values[r][c].to_f) if values[r][c]!="NA"
|
286
|
+
end
|
287
|
+
end
|
288
|
+
end
|
289
|
+
dataset.save(subjectid)
|
290
|
+
dataset
|
291
|
+
end
|
292
|
+
|
293
|
+
def split_to_datasets( df, split, subjectid=nil )
|
294
|
+
sets = []
|
295
|
+
(split.min.to_i .. split.max.to_i).each do |i|
|
296
|
+
indices = []
|
297
|
+
split.size.times{|j| indices<<j if split[j]==i}
|
298
|
+
dataset = dataframe_to_dataset_indices( df, subjectid, indices )
|
299
|
+
LOGGER.debug("r-util> split into #{dataset.uri}, c:#{dataset.compounds.size}, f:#{dataset.features.size}")
|
300
|
+
sets << dataset
|
301
|
+
end
|
302
|
+
sets
|
303
|
+
end
|
304
|
+
|
305
|
+
def pull_dataframe(df)
|
306
|
+
tmp = File.join(Dir.tmpdir,Time.new.to_f.to_s+"_"+rand(10000).to_s+".csv")
|
307
|
+
@r.eval "write.table(#{df},file='#{tmp}',sep='#')"
|
308
|
+
res = []; compounds = []; features = []
|
309
|
+
first = true
|
310
|
+
file = File.new(tmp, 'r')
|
311
|
+
file.each_line("\n") do |row|
|
312
|
+
if first
|
313
|
+
features = row.chomp.split("#").collect{|e| e.gsub("\"","")}
|
314
|
+
first = false
|
315
|
+
else
|
316
|
+
vals = row.chomp.split("#").collect{|e| e.gsub("\"","")}
|
317
|
+
compounds << vals[0]
|
318
|
+
res << vals[1..-1]
|
319
|
+
end
|
320
|
+
end
|
321
|
+
begin File.delete(tmp); rescue; end
|
322
|
+
return res, compounds, features
|
323
|
+
end
|
324
|
+
|
325
|
+
def assign_dataframe(df,input,rownames,colnames)
|
326
|
+
tmp = File.join(Dir.tmpdir,Time.new.to_f.to_s+"_"+rand(10000).to_s+".csv")
|
327
|
+
file = File.new(tmp, 'w')
|
328
|
+
input.each{|i| file.puts(i.collect{|e| "\"#{e}\""}.join("#")+"\n")}
|
329
|
+
file.flush
|
330
|
+
@r.rownames = rownames if rownames
|
331
|
+
@r.colnames = colnames
|
332
|
+
@r.eval "#{df} <- read.table(file='#{tmp}',sep='#',"+
|
333
|
+
"#{rownames ? "row.names=rownames" : ""},col.names=colnames,check.names=F)"
|
334
|
+
begin File.delete(tmp); rescue; end
|
335
|
+
end
|
336
|
+
|
337
|
+
def plot_to_files(files)
|
338
|
+
files.each do |file|
|
339
|
+
if file=~/(?i)\.svg/
|
340
|
+
@r.eval("svg('#{file}',10,8)")
|
341
|
+
elsif file=~/(?i)\.png/
|
342
|
+
@r.eval("png('#{file}')")
|
343
|
+
else
|
344
|
+
raise "invalid format: "+file.to_s
|
345
|
+
end
|
346
|
+
yield file
|
347
|
+
LOGGER.debug "r-util> plotted to #{file}"
|
348
|
+
@r.eval("dev.off()")
|
349
|
+
end
|
350
|
+
end
|
351
|
+
end
|
352
|
+
end
|
353
|
+
|
354
|
+
|