opentox-ruby 2.0.1 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +39 -46
- data/VERSION +1 -1
- data/lib/algorithm.rb +797 -80
- data/lib/compound.rb +40 -0
- data/lib/config/config_ru.rb +2 -0
- data/lib/dataset.rb +57 -18
- data/lib/environment.rb +3 -3
- data/lib/feature.rb +15 -13
- data/lib/helper.rb +1 -2
- data/lib/model.rb +185 -82
- data/lib/opentox-ruby.rb +1 -1
- data/lib/overwrite.rb +2 -1
- data/lib/parser.rb +247 -69
- data/lib/rest_client_wrapper.rb +3 -2
- data/lib/serializer.rb +24 -10
- data/lib/task.rb +10 -3
- data/lib/to-html.rb +66 -41
- data/lib/validation.rb +93 -29
- metadata +206 -117
data/lib/opentox-ruby.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
['rubygems', 'sinatra', 'sinatra/url_for', 'ohm', 'rest_client', 'yaml', 'cgi', 'spork', 'error', 'overwrite', 'environment'].each do |lib|
|
1
|
+
['rubygems', 'sinatra', 'sinatra/url_for', 'ohm', 'rest_client', 'yaml', 'cgi', 'spork', 'error', 'overwrite', 'environment', 'gsl'].each do |lib|
|
2
2
|
require lib
|
3
3
|
end
|
4
4
|
|
data/lib/overwrite.rb
CHANGED
@@ -50,7 +50,8 @@ class Sinatra::Base
|
|
50
50
|
halt task.http_code,task.to_yaml # PENDING differs from task-webservice
|
51
51
|
when /html/
|
52
52
|
response['Content-Type'] = "text/html"
|
53
|
-
|
53
|
+
# html -> task created with html form -> redirect to task uri
|
54
|
+
redirect task.uri
|
54
55
|
else # default /uri-list/
|
55
56
|
response['Content-Type'] = "text/uri-list"
|
56
57
|
if task.completed?
|
data/lib/parser.rb
CHANGED
@@ -40,8 +40,9 @@ module OpenTox
|
|
40
40
|
else
|
41
41
|
file = Tempfile.new("ot-rdfxml")
|
42
42
|
if @dataset
|
43
|
-
# do not concat /metadata to uri string, this would not work for dataset/R401577?max=3
|
44
43
|
uri = URI::parse(@uri)
|
44
|
+
#remove params like dataset/<id>?max=3 from uri, not needed for metadata
|
45
|
+
uri.query = nil
|
45
46
|
uri.path = File.join(uri.path,"metadata")
|
46
47
|
uri = uri.to_s
|
47
48
|
else
|
@@ -56,7 +57,7 @@ module OpenTox
|
|
56
57
|
`rapper -i rdfxml -o ntriples #{file.path} 2>/dev/null`.each_line do |line|
|
57
58
|
triple = line.to_triple
|
58
59
|
if triple[0] == @uri
|
59
|
-
if triple[1] == RDF.type # allow multiple types
|
60
|
+
if triple[1] == RDF.type || triple[1]==OT.predictedVariables # allow multiple types
|
60
61
|
@metadata[triple[1]] = [] unless @metadata[triple[1]]
|
61
62
|
@metadata[triple[1]] << triple[2].split('^^').first
|
62
63
|
else
|
@@ -75,6 +76,9 @@ module OpenTox
|
|
75
76
|
@metadata[OT.parameters] << parameter
|
76
77
|
end
|
77
78
|
end
|
79
|
+
#@metadata.each do |k,v|
|
80
|
+
#v = v.first if v and v.size == 1
|
81
|
+
#end
|
78
82
|
@metadata
|
79
83
|
end
|
80
84
|
|
@@ -82,7 +86,11 @@ module OpenTox
|
|
82
86
|
# @param [String] rdf
|
83
87
|
# @param [String] type of the info (e.g. OT.Task, OT.ErrorReport) needed to get the subject-uri
|
84
88
|
# @return [Owl] with uri and metadata set
|
85
|
-
def self.from_rdf( rdf, type )
|
89
|
+
def self.from_rdf( rdf, type, allow_multiple = false )
|
90
|
+
|
91
|
+
uris = Array.new
|
92
|
+
owls = Array.new
|
93
|
+
|
86
94
|
# write to file and convert with rapper into triples
|
87
95
|
file = Tempfile.new("ot-rdfxml")
|
88
96
|
file.puts rdf
|
@@ -95,20 +103,27 @@ module OpenTox
|
|
95
103
|
triples.each_line do |line|
|
96
104
|
triple = line.to_triple
|
97
105
|
if triple[1] == RDF['type'] and triple[2]==type
|
98
|
-
|
106
|
+
if !allow_multiple
|
107
|
+
raise "uri already set, two uris found with type: "+type.to_s if uri
|
108
|
+
end
|
99
109
|
uri = triple[0]
|
110
|
+
uris << uri
|
100
111
|
end
|
101
112
|
end
|
102
113
|
File.delete(file.path)
|
114
|
+
|
103
115
|
# load metadata
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
116
|
+
uris.each { |uri|
|
117
|
+
metadata = {}
|
118
|
+
triples.each_line do |line|
|
119
|
+
triple = line.to_triple
|
120
|
+
metadata[triple[1]] = triple[2].split('^^').first if triple[0] == uri and triple[1] != RDF['type']
|
121
|
+
end
|
122
|
+
owl = Owl::Generic.new(uri)
|
123
|
+
owl.metadata = metadata
|
124
|
+
owls << owl
|
125
|
+
}
|
126
|
+
allow_multiple ? owls : owls[0]
|
112
127
|
end
|
113
128
|
|
114
129
|
# Generic parser for all OpenTox classes
|
@@ -228,7 +243,12 @@ module OpenTox
|
|
228
243
|
file = Tempfile.new("ot-rdfxml")
|
229
244
|
# do not concat /features to uri string, this would not work for dataset/R401577?max=3
|
230
245
|
uri = URI::parse(@uri)
|
231
|
-
|
246
|
+
# PENDING
|
247
|
+
# ambit models return http://host/dataset/id?feature_uris[]=sth but
|
248
|
+
# ambit dataset services do not support http://host/dataset/id/features?feature_uris[]=sth
|
249
|
+
# and features are not included in http://host/dataset/id/features
|
250
|
+
# -> load features from complete dataset
|
251
|
+
uri.path = File.join(uri.path,"features") unless @uri=~/\?(feature_uris|page|pagesize)/
|
232
252
|
uri = uri.to_s
|
233
253
|
file.puts OpenTox::RestClientWrapper.get uri,{:subjectid => subjectid,:accept => "application/rdf+xml"},nil,false
|
234
254
|
file.close
|
@@ -244,8 +264,13 @@ module OpenTox
|
|
244
264
|
File.delete(to_delete) if to_delete
|
245
265
|
statements.each do |triple|
|
246
266
|
if features.include? triple[0]
|
247
|
-
@dataset.features[triple[0]] = {} unless @dataset.features[triple[0]]
|
248
|
-
|
267
|
+
@dataset.features[triple[0]] = {} unless @dataset.features[triple[0]]
|
268
|
+
if triple[1] == RDF.type
|
269
|
+
@dataset.features[triple[0]][triple[1]] = [] unless @dataset.features[triple[0]][triple[1]]
|
270
|
+
@dataset.features[triple[0]][triple[1]] << triple[2].split('^^').first
|
271
|
+
else
|
272
|
+
@dataset.features[triple[0]][triple[1]] = triple[2].split('^^').first
|
273
|
+
end
|
249
274
|
end
|
250
275
|
end
|
251
276
|
@dataset.features
|
@@ -271,22 +296,39 @@ module OpenTox
|
|
271
296
|
@duplicates = {}
|
272
297
|
end
|
273
298
|
|
299
|
+
def detect_new_values(row, value_maps)
|
300
|
+
row.shift
|
301
|
+
row.each_index do |i|
|
302
|
+
value = row[i]
|
303
|
+
value_maps[i] = Hash.new if value_maps[i].nil?
|
304
|
+
value_maps[i][value].nil? ? value_maps[i][value]=0 : value_maps[i][value] += 1
|
305
|
+
end
|
306
|
+
value_maps
|
307
|
+
end
|
308
|
+
|
274
309
|
# Load Spreadsheet book (created with roo gem http://roo.rubyforge.org/, excel format specification: http://toxcreate.org/help)
|
275
310
|
# @param [Excel] book Excel workbook object (created with roo gem)
|
276
311
|
# @return [OpenTox::Dataset] Dataset object with Excel data
|
277
312
|
def load_spreadsheet(book)
|
278
313
|
book.default_sheet = 0
|
279
314
|
add_features book.row(1)
|
315
|
+
value_maps = Array.new
|
316
|
+
regression_features=Array.new
|
280
317
|
|
281
|
-
# AM: fix mixed read in
|
282
|
-
regression_features=false
|
283
318
|
2.upto(book.last_row) { |i|
|
284
319
|
row = book.row(i)
|
285
|
-
|
286
|
-
|
320
|
+
value_maps = detect_new_values(row, value_maps)
|
321
|
+
value_maps.each_with_index { |vm,j|
|
322
|
+
if vm.size > 5 # 5 is the maximum nr of classes supported by Fminer.
|
323
|
+
regression_features[j]=true
|
324
|
+
else
|
325
|
+
regression_features[j]=false
|
326
|
+
end
|
327
|
+
}
|
328
|
+
}
|
329
|
+
2.upto(book.last_row) { |i|
|
330
|
+
add_values book.row(i), regression_features
|
287
331
|
}
|
288
|
-
|
289
|
-
2.upto(book.last_row) { |i| add_values book.row(i),regression_features }
|
290
332
|
warnings
|
291
333
|
@dataset
|
292
334
|
end
|
@@ -298,21 +340,27 @@ module OpenTox
|
|
298
340
|
row = 0
|
299
341
|
input = csv.split("\n")
|
300
342
|
add_features split_row(input.shift)
|
343
|
+
value_maps = Array.new
|
344
|
+
regression_features=Array.new
|
301
345
|
|
302
|
-
|
303
|
-
# AM: fix mixed read in
|
304
|
-
regression_features=false
|
305
346
|
input.each { |row|
|
306
347
|
row = split_row(row)
|
307
|
-
|
308
|
-
|
348
|
+
value_maps = detect_new_values(row, value_maps)
|
349
|
+
value_maps.each_with_index { |vm,j|
|
350
|
+
if vm.size > 5 # 5 is the maximum nr of classes supported by Fminer.
|
351
|
+
regression_features[j]=true
|
352
|
+
else
|
353
|
+
regression_features[j]=false
|
354
|
+
end
|
355
|
+
}
|
356
|
+
}
|
357
|
+
input.each { |row|
|
358
|
+
add_values split_row(row), regression_features
|
309
359
|
}
|
310
|
-
input.each { |row| add_values split_row(row),regression_features }
|
311
360
|
warnings
|
312
361
|
@dataset
|
313
362
|
end
|
314
363
|
|
315
|
-
|
316
364
|
private
|
317
365
|
|
318
366
|
def warnings
|
@@ -354,20 +402,10 @@ module OpenTox
|
|
354
402
|
end
|
355
403
|
end
|
356
404
|
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
value = row[i]
|
362
|
-
type = feature_type(value)
|
363
|
-
if type == OT.NumericFeature
|
364
|
-
regression_features=true
|
365
|
-
end
|
366
|
-
end
|
367
|
-
regression_features
|
368
|
-
end
|
369
|
-
|
370
|
-
def add_values(row, regression_features=false)
|
405
|
+
# Adds a row to a dataset
|
406
|
+
# @param [Array] row A row split up as an array
|
407
|
+
# @param [Array] regression_features Indicator for regression for each field
|
408
|
+
def add_values(row, regression_features)
|
371
409
|
|
372
410
|
smiles = row.shift
|
373
411
|
compound = Compound.from_smiles(smiles)
|
@@ -381,27 +419,23 @@ module OpenTox
|
|
381
419
|
row.each_index do |i|
|
382
420
|
value = row[i]
|
383
421
|
feature = @features[i]
|
384
|
-
type = feature_type(value)
|
385
422
|
|
423
|
+
type = nil
|
424
|
+
if (regression_features[i])
|
425
|
+
type = feature_type(value)
|
426
|
+
if type != OT.NumericFeature
|
427
|
+
raise "Error! Expected numeric values."
|
428
|
+
end
|
429
|
+
else
|
430
|
+
type = OT.NominalFeature
|
431
|
+
end
|
386
432
|
@feature_types[feature] << type
|
387
433
|
|
388
|
-
|
434
|
+
case type
|
435
|
+
when OT.NumericFeature
|
389
436
|
val = value.to_f
|
390
|
-
|
391
|
-
|
392
|
-
when OT.NominalFeature
|
393
|
-
case value.to_s
|
394
|
-
when TRUE_REGEXP
|
395
|
-
val = true
|
396
|
-
when FALSE_REGEXP
|
397
|
-
val = false
|
398
|
-
end
|
399
|
-
when OT.NumericFeature
|
400
|
-
val = value.to_f
|
401
|
-
when OT.StringFeature
|
402
|
-
val = value.to_s
|
403
|
-
@activity_errors << smiles+", "+row.join(", ")
|
404
|
-
end
|
437
|
+
when OT.NominalFeature
|
438
|
+
val = value.to_s
|
405
439
|
end
|
406
440
|
if val!=nil
|
407
441
|
@dataset.add(compound.uri, feature, val)
|
@@ -413,26 +447,170 @@ module OpenTox
|
|
413
447
|
end
|
414
448
|
end
|
415
449
|
|
416
|
-
def
|
417
|
-
|
450
|
+
def feature_type(value)
|
451
|
+
if OpenTox::Algorithm::numeric? value
|
452
|
+
return OT.NumericFeature
|
453
|
+
else
|
454
|
+
return OT.NominalFeature
|
455
|
+
end
|
456
|
+
end
|
457
|
+
|
458
|
+
def split_row(row)
|
459
|
+
row.chomp.gsub(/["']/,'').split(/\s*[,;\t]\s*/) # remove quotes
|
460
|
+
end
|
461
|
+
|
462
|
+
end
|
463
|
+
|
464
|
+
class Table
|
465
|
+
|
466
|
+
attr_accessor :data, :features, :compounds
|
467
|
+
|
468
|
+
def initialize
|
469
|
+
@data = {}
|
470
|
+
@activity_errors = []
|
471
|
+
end
|
472
|
+
|
473
|
+
def feature_values(feature)
|
474
|
+
@data.collect{|c, row| row[feature]}.uniq.compact
|
475
|
+
end
|
476
|
+
|
477
|
+
def feature_types(feature)
|
478
|
+
@data.collect{|c, row| feature_type(row[feature])}.uniq.compact
|
479
|
+
end
|
480
|
+
|
481
|
+
def features
|
482
|
+
@data.collect{|c,row| row.keys}.flatten.uniq
|
483
|
+
end
|
484
|
+
|
485
|
+
def clean_features
|
486
|
+
ignored_features = []
|
487
|
+
features.each do |feature|
|
488
|
+
if feature_values(feature).size > 5
|
489
|
+
if feature_types(feature).size == 1 and feature_types(feature).first == OT.NumericFeature
|
490
|
+
# REGRESSION
|
491
|
+
elsif feature_types(feature).include? OT.NumericFeature
|
492
|
+
@data.each{|c,row| row[feature] = nil unless OpenTox::Algorithm::numeric?(row[feature]) } # delete nominal features
|
493
|
+
@activity_errors << "Nominal feature values of #{feature} ignored (using numeric features for regression models)."
|
494
|
+
else
|
495
|
+
@activity_errors << "Feature #{feature} ignored (more than 5 nominal feature values and no numeric values)."
|
496
|
+
ignored_features << feature
|
497
|
+
next
|
498
|
+
end
|
499
|
+
elsif feature_values(feature).size <= 1
|
500
|
+
@activity_errors << "Feature #{feature} ignored (less than 2 feature values)."
|
501
|
+
ignored_features << feature
|
502
|
+
else
|
503
|
+
# CLASSIFICATION
|
504
|
+
end
|
505
|
+
end
|
506
|
+
ignored_features.each do |feature|
|
507
|
+
@data.each{ |c,row| row.delete feature }
|
508
|
+
end
|
509
|
+
@activity_errors
|
418
510
|
end
|
419
511
|
|
420
|
-
def
|
421
|
-
|
512
|
+
def add_to_dataset(dataset)
|
513
|
+
features.each do |feature_name|
|
514
|
+
feature_uri = File.join(dataset.uri,"feature",URI.encode(feature_name))
|
515
|
+
dataset.add_feature(feature_uri,{DC.title => feature_name})
|
516
|
+
end
|
517
|
+
|
518
|
+
@data.each do |compound,row|
|
519
|
+
unless row.empty?
|
520
|
+
row.each do |feature,value|
|
521
|
+
if OpenTox::Algorithm::numeric?(value)
|
522
|
+
value = value.to_f
|
523
|
+
elsif value.nil? or value.empty?
|
524
|
+
value = nil
|
525
|
+
else
|
526
|
+
value = value.to_s
|
527
|
+
end
|
528
|
+
feature_uri = File.join(dataset.uri,"feature",URI.encode(feature))
|
529
|
+
dataset.add(compound, feature_uri, value)
|
530
|
+
#dataset.features[feature_uri][RDF.type] = feature_types(feature)
|
531
|
+
#dataset.features[feature_uri][OT.acceptValue] = feature_values(feature)
|
532
|
+
if feature_types(feature).include? OT.NumericFeature
|
533
|
+
dataset.features[feature_uri][RDF.type] = [OT.NumericFeature]
|
534
|
+
else
|
535
|
+
dataset.features[feature_uri][RDF.type] = [OT.NominalFeature]
|
536
|
+
dataset.features[feature_uri][OT.acceptValue] = feature_values(feature)
|
537
|
+
end
|
538
|
+
end
|
539
|
+
end
|
540
|
+
end
|
422
541
|
end
|
423
542
|
|
543
|
+
private
|
544
|
+
|
424
545
|
def feature_type(value)
|
425
|
-
if
|
426
|
-
return OT.NominalFeature
|
427
|
-
elsif numeric? value
|
546
|
+
if OpenTox::Algorithm::numeric? value
|
428
547
|
return OT.NumericFeature
|
429
548
|
else
|
430
|
-
return OT.
|
549
|
+
return OT.NominalFeature
|
431
550
|
end
|
432
551
|
end
|
552
|
+
end
|
553
|
+
|
554
|
+
# quick hack to enable sdf import via csv
|
555
|
+
# should be refactored
|
556
|
+
class Sdf
|
557
|
+
|
558
|
+
attr_accessor :dataset
|
559
|
+
|
560
|
+
def initialize
|
561
|
+
@data = {}
|
562
|
+
|
563
|
+
@compound_errors = []
|
564
|
+
@activity_errors = []
|
565
|
+
@duplicates = {}
|
566
|
+
end
|
567
|
+
|
568
|
+
def load_sdf(sdf)
|
569
|
+
|
570
|
+
obconversion = OpenBabel::OBConversion.new
|
571
|
+
obmol = OpenBabel::OBMol.new
|
572
|
+
obconversion.set_in_and_out_formats "sdf", "inchi"
|
573
|
+
|
574
|
+
table = Table.new
|
575
|
+
|
576
|
+
properties = []
|
577
|
+
sdf.each_line { |l| properties << l.to_s if l.match(/</) }
|
578
|
+
properties.uniq!
|
579
|
+
properties.sort!
|
580
|
+
properties.collect!{ |p| p.gsub(/<|>/,'').strip.chomp }
|
581
|
+
|
582
|
+
rec = 0
|
583
|
+
sdf.split(/\$\$\$\$\r*\n/).each do |s|
|
584
|
+
rec += 1
|
585
|
+
obconversion.read_string obmol, s
|
586
|
+
begin
|
587
|
+
inchi = obconversion.write_string(obmol).gsub(/\s/,'').chomp
|
588
|
+
@duplicates[inchi] = [] unless @duplicates[inchi]
|
589
|
+
@duplicates[inchi] << rec #inchi#+", "+row.join(", ")
|
590
|
+
compound = Compound.from_inchi inchi
|
591
|
+
rescue
|
592
|
+
@compound_errors << "Could not convert structure to InChI, all entries for this compound (record #{rec} have been ignored! \n#{s}"
|
593
|
+
next
|
594
|
+
end
|
595
|
+
row = {}
|
596
|
+
obmol.get_data.each { |d| row[d.get_attribute] = d.get_value if properties.include?(d.get_attribute) }
|
597
|
+
table.data[compound.uri] = row
|
598
|
+
end
|
599
|
+
|
600
|
+
# find and remove ignored_features
|
601
|
+
@activity_errors = table.clean_features
|
602
|
+
table.add_to_dataset @dataset
|
603
|
+
|
604
|
+
warnings = ''
|
605
|
+
warnings += "<p>Incorrect Smiles structures (ignored):</p>" + @compound_errors.join("<br/>") unless @compound_errors.empty?
|
606
|
+
warnings += "<p>Irregular activities (ignored):</p>" + @activity_errors.join("<br/>") unless @activity_errors.empty?
|
607
|
+
duplicate_warnings = ''
|
608
|
+
@duplicates.each {|inchi,lines| duplicate_warnings << "<p>#{lines.join('<br/>')}</p>" if lines.size > 1 }
|
609
|
+
warnings += "<p>Duplicated structures (all structures/activities used for model building, please make sure, that the results were obtained from <em>independent</em> experiments):</p>" + duplicate_warnings unless duplicate_warnings.empty?
|
610
|
+
|
611
|
+
@dataset.metadata[OT.Warnings] = warnings
|
612
|
+
@dataset
|
433
613
|
|
434
|
-
def split_row(row)
|
435
|
-
row.chomp.gsub(/["']/,'').split(/\s*[,;]\s*/) # remove quotes
|
436
614
|
end
|
437
615
|
|
438
616
|
end
|
data/lib/rest_client_wrapper.rb
CHANGED
@@ -131,13 +131,14 @@ module OpenTox
|
|
131
131
|
raise "unknown content-type for task : '"+res.content_type.to_s+"'"+" base-uri: "+base_uri.to_s+" content: "+res[0..200].to_s
|
132
132
|
end
|
133
133
|
|
134
|
-
LOGGER.debug "result is a task '"+task.uri.to_s+"', wait for completion"
|
134
|
+
#LOGGER.debug "result is a task '"+task.uri.to_s+"', wait for completion"
|
135
135
|
task.wait_for_completion waiting_task
|
136
136
|
unless task.completed? # maybe task was cancelled / error
|
137
137
|
if task.errorReport
|
138
138
|
received_error task.errorReport, task.http_code, nil, {:rest_uri => task.uri, :rest_code => task.http_code}
|
139
139
|
else
|
140
|
-
raise "task
|
140
|
+
raise "status of task '"+task.uri.to_s+"' is no longer running (hasStatus is '"+task.status+
|
141
|
+
"'), but it is neither completed nor has an errorReport"
|
141
142
|
end
|
142
143
|
end
|
143
144
|
|