opentox-ruby 3.1.0 → 4.0.0
- data/ChangeLog +19 -9
- data/README.markdown +1 -1
- data/Rakefile +2 -1
- data/VERSION +1 -1
- data/lib/algorithm.rb +143 -37
- data/lib/compound.rb +66 -18
- data/lib/dataset.rb +38 -3
- data/lib/model.rb +36 -13
- data/lib/parser.rb +34 -19
- data/lib/r-util.rb +93 -34
- data/lib/serializer.rb +70 -22
- data/lib/stratification.R +71 -7
- data/lib/transform.rb +5 -3
- data/lib/utils.rb +356 -97
- data/lib/validation.rb +6 -4
- metadata +20 -4
data/lib/serializer.rb
CHANGED
@@ -459,32 +459,80 @@ module OpenTox
       def initialize(dataset)
         @rows = []
         @rows << ["SMILES"]
+
         features = dataset.features.keys
-
+
+        # prepare for subgraphs
+        have_substructures = features.collect{ |id| dataset.features[id][RDF.type].include? OT.Substructure}.compact.uniq
+        if have_substructures.size == 1 && have_substructures[0]
+          features_smarts = features.collect{ |id| "'" + dataset.features[id][OT.smarts] + "'" }
+        end
+
+        # gather missing features
+        delete_features = []
+        features.each{ |id|
+          dataset.features[id][RDF.type].each { |typestr|
+            if typestr.include? "MissingFeature"
+              delete_features << id
+            end
+          }
+        }
+        features = features - delete_features
+
+        # detect nr duplicates per compound
+        compound_sizes = {}
+        dataset.compounds.each do |compound|
+          entries=dataset.data_entries[compound]
+          if entries
+            entries.each do |feature, values|
+              compound_sizes[compound] || compound_sizes[compound] = []
+              compound_sizes[compound] << values.size
+            end
+            compound_sizes[compound].uniq!
+            raise "Inappropriate data for CSV export for compound #{compound}" if compound_sizes[compound].size > 1
+            compound_sizes[compound] = compound_sizes[compound][0] # integer instead of array
+          end
+        end
+
+        # get headers
+        features_smarts && @rows.first << features_smarts || @rows.first << features
         @rows.first.flatten!
-
-
-
-
-
-
-
-
-
-        entries.
-
-
-
-
-
-
-
-
-
-
+
+        # feature positions pre-calculated
+        feature_positions = features.inject({}) { |h,f|
+          h.merge!({f => features.index(f)+1}) # +1 due to ID
+          h
+        }
+
+        # serialize to csv
+        dataset.compounds.each do |compound|
+          entries=dataset.data_entries[compound]
+          if entries
+            inchi = URI.encode_www_form_component(Compound.new(compound).to_inchi)
+
+            # allocate container
+            row_container = Array.new(compound_sizes[compound])
+            (0...row_container.size).each do |i|
+              row_container[i] = Array.new(@rows.first.size)
+              row_container[i][0] = inchi
+            end
+
+            # fill entries
+            entries.each { |feature, values|
+              (0...compound_sizes[compound]).each { |i|
+                row_container[i][feature_positions[feature]] = values[i]
+              }
+            }
+
+            # fill zeroes for subgraphs
+            if (features_smarts)
+              row_container.collect! { |row|
+                row.collect! { |x| x ? x : 0 }
+              }
             end
+            row_container.each { |row| @rows << row }
+
           end
-          row_container.each { |r| @rows << r }
         end
       end
 
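Note: the new CSV export expands a compound with multiple measurements into several rows. A minimal standalone sketch of that idea, using a hypothetical data_entries hash rather than the Serializer class:

    # Sketch: one CSV row per duplicate measurement of a compound.
    # Every feature of a compound must report the same number of
    # values, otherwise the export is ambiguous.
    data_entries = { "f1" => [1.0, 2.0], "f2" => [0.5, 0.7] }  # hypothetical

    sizes = data_entries.values.collect { |values| values.size }.uniq
    raise "Inappropriate data for CSV export" if sizes.size > 1

    rows = Array.new(sizes[0]) { [] }
    data_entries.each { |feature, values|
      (0...sizes[0]).each { |i| rows[i] << values[i] }
    }
    rows  # => [[1.0, 0.5], [2.0, 0.7]] -- two rows for one compound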
data/lib/stratification.R
CHANGED
@@ -1,4 +1,13 @@
 
+round_it <- function( x )
+{
+  if(isTRUE((x - floor(x))>=0.5))
+    ceiling(x)
+  else
+    floor(x)
+}
+
+
 nominal_to_binary <- function( data )
 {
   result = NULL
@@ -41,9 +50,13 @@ nominal_to_binary <- function( data )
   result
 }
 
-process_data <- function( data )
+process_data <- function( data, colnames=NULL )
 {
   data.num <- as.data.frame(data)
+  if (!is.null(colnames))
+  {
+    data.num = subset(data.num, select = colnames)
+  }
   if (!is.numeric(data.num))
   {
     data.num = nominal_to_binary(data.num)
@@ -72,14 +85,15 @@ cluster <- function( data, min=10, max=15 )
   cbind(s$partition[,m])
 }
 
-stratified_split <- function( data, ratio=0.3, method="cluster" )
+stratified_split <- function( data, ratio=0.3, method="cluster", colnames=NULL )
 {
-  data.processed = as.matrix(process_data( data ))
+  data.processed = as.matrix(process_data( data, colnames ))
+  print(paste("split using #features: ",ncol(data.processed)))
   if (method == "samplecube")
   {
     require("sampling")
     # adjust ratio to make samplecube return exact number of samples
-    ratio =
+    ratio = round_it(nrow(data.processed)*ratio)/nrow(data.processed)
     pik = rep(ratio,times=nrow(data.processed))
     data.strat = cbind(pik,data.processed)
     samplecube(data.strat,pik,order=2,comment=F)
@@ -101,10 +115,11 @@ stratified_split <- function( data, ratio=0.3, method="cluster" )
   stop("unknown method")
 }
 
-stratified_k_fold_split <- function( data, num_folds=10, method="cluster" )
+stratified_k_fold_split <- function( data, num_folds=10, method="cluster", colnames=NULL )
 {
   print(paste(num_folds,"-fold-split, data-size",nrow(data)))
-  data.processed = as.matrix(process_data( data ))
+  data.processed = as.matrix(process_data( data, colnames ))
+  print(paste("split using #features: ",ncol(data.processed)))
   if (method == "samplecube")
   {
     folds = rep(0, times=nrow(data))
@@ -133,7 +148,7 @@ stratified_k_fold_split <- function( data, num_folds=10, method="cluster" )
   {
     require("TunePareto")
     cl = cluster(data.processed)
-    res = generateCVRuns(cl,ntimes=1,nfold=
+    res = generateCVRuns(cl,ntimes=1,nfold=num_folds)
     folds = rep(0, times=nrow(data))
     for (i in 1:num_folds)
       for(j in 1:length(res[[1]][[i]]))
@@ -144,6 +159,50 @@ stratified_k_fold_split <- function( data, num_folds=10, method="cluster" )
   stop("unknown method")
 }
 
+duplicate_indices <- function( data ) {
+  indices = 1:nrow(data)
+  z = data
+  duplicate_index = anyDuplicated(z)
+  while(duplicate_index) {
+    duplicate_to_index = anyDuplicated(z[1:duplicate_index,],fromLast=T)
+    #print(paste(duplicate_index,'is dupl to',duplicate_to_index))
+    indices[duplicate_index] <- duplicate_to_index
+    z[duplicate_index,] <- paste('123$§%',duplicate_index)
+    duplicate_index = anyDuplicated(z)
+  }
+  indices
+}
+
+add_duplicates <- function( data, dup_indices ) {
+  result = data[1,]
+  for(i in 2:length(dup_indices)) {
+    row = data[rownames(data)==dup_indices[i],]
+    if(length(row)==0)
+      stop(paste('index ',i,' dup-index ',dup_indices[i],'not found in data'))
+    result = rbind(result, row)
+  }
+  rownames(result)<-NULL
+  result
+}
+
+sammon_duplicates <- function( data, ... ) {
+  di <- duplicate_indices(data)
+  print(di)
+  u <- unique(data)
+  print(paste('unique data points',nrow(u),'of',nrow(data)))
+  if(nrow(u) <= 4) stop("number of unique datapoints <= 4")
+  points_unique <- sammon(dist(u), ...)$points
+  if (nrow(u)<nrow(data))
+  {
+    points <- add_duplicates(points_unique, di)
+    points
+  }
+  else
+  {
+    points_unique
+  }
+}
+
 plot_pre_process <- function( data, method="pca" )
 {
   data.processed = process_data( data )
@@ -158,6 +217,11 @@ plot_pre_process <- function( data, method="pca" )
     data.emb <- smacofSym(dist(data.processed, method = "euclidean"), ndim=2, verbose=T)
     data.emb$conf
   }
+  else if (method == "sammon")
+  {
+    require("MASS")
+    sammon_duplicates(data.processed, k=2)
+  }
   else
     stop("unknown method")
 }
data/lib/transform.rb
CHANGED
@@ -396,7 +396,7 @@ module OpenTox
         @q_prop = gsl_q_prop_orig.row(0).to_a
       end
 
-      LOGGER.debug "F: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+      LOGGER.debug "F: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}" if (@n_prop && @n_prop[0] && @q_prop)
       LOGGER.debug "Sims: #{@sims.size}, Acts: #{@acts.size}"
 
       @sims = [ gram_matrix, @sims ]
@@ -490,8 +490,10 @@ module OpenTox
 
       @cmpds = []; @fps = []; @acts = []; @n_prop = []; @q_prop = []
 
-      @model.
-
+      # Major BUG! Must loop over @model.compounds, hash is unordered!
+      # @model.fingerprints.each
+      @model.compounds.each { |cmpd|
+        fp = @model.fingerprints[cmpd]
         if @model.activities[cmpd] # row good
           acts = @model.activities[cmpd]; @acts += acts
           LOGGER.debug "#{acts.size} activities for '#{cmpd}'" if acts.size > 1
data/lib/utils.rb
CHANGED
@@ -1,155 +1,414 @@
 require 'csv'
+require 'tempfile'
 
 
 module OpenTox
 
   module Algorithm
 
+    @ambit_descriptor_algorithm_uri = "http://apps.ideaconsult.net:8080/ambit2/algorithm/org.openscience.cdk.qsar.descriptors.molecular."
+    @ambit_ds_service_uri = "http://apps.ideaconsult.net:8080/ambit2/dataset/"
+    @ambit_mopac_model_uri = "http://apps.ideaconsult.net:8080/ambit2/model/69632"
+    @keysfile = File.join(ENV['HOME'], ".opentox", "config", "pc_descriptors.yaml")
+
     include OpenTox
 
     # Calculate physico-chemical descriptors.
-    # @param[Hash]
+    # @param[Hash] required: :dataset_uri, :pc_type, :rjb, :task, :add_uri, optional: :descriptor, :lib, :subjectid
     # @return[String] dataset uri
-
     def self.pc_descriptors(params)
 
+      ds = OpenTox::Dataset.find(params[:dataset_uri],params[:subjectid])
+      compounds = ds.compounds.collect
+      task_weights = {"joelib"=> 20, "openbabel"=> 1, "cdk"=> 50 }
+      task_weights.keys.each { |step| task_weights.delete(step) if (params[:lib] && (!params[:lib].split(",").include?(step)))}
+      task_weights["load"] = 10
+      task_sum = Float task_weights.values.sum
+      task_weights.keys.each { |step| task_weights[step] /= task_sum }
+      task_weights.keys.each { |step| task_weights[step] = (task_weights[step]*100).floor }
+
+      jl_master=nil
+      cdk_master=nil
+      ob_master=nil
+
+
+      # # # openbabel (via ruby bindings)
+      if !params[:lib] || params[:lib].split(",").include?("openbabel")
+        ob_master, ob_ids = get_ob_descriptors( { :compounds => compounds, :pc_type => params[:pc_type], :descriptor => params[:descriptor] } )
+        params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights["openbabel"]) if params[:task]
+      end
+
+
+      # # # joelib (via rjb)
+      if !params[:lib] || params[:lib].split(",").include?("joelib")
+        jl_master, jl_ids = get_jl_descriptors( { :compounds => compounds, :rjb => params[:rjb], :pc_type => params[:pc_type], :descriptor => params[:descriptor] } )
+        params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights["joelib"]) if params[:task]
+      end
+
+
+      # # # cdk (via REST)
+      if !params[:lib] || params[:lib].split(",").include?("cdk")
+        ambit_result_uri, smiles_to_inchi, cdk_ids = get_cdk_descriptors( { :compounds => compounds, :pc_type => params[:pc_type], :task => params[:task], :step => task_weights["cdk"], :descriptor => params[:descriptor] } )
+        #LOGGER.debug "Ambit result uri for #{params.inspect}: '#{ambit_result_uri.to_yaml}'"
+        cdk_master, cdk_ids, ambit_ids = load_ds_csv(ambit_result_uri, smiles_to_inchi, cdk_ids )
+        params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights["load"]) if params[:task]
+      end
+
+      # # # fuse CSVs ("master" structures)
+      if jl_master && cdk_master
+        nr_cols = (jl_master[0].size)-1
+        LOGGER.debug "Merging #{nr_cols} new columns"
+        cdk_master.each {|row| nr_cols.times { row.push(nil) } }
+        jl_master.each do |row|
+          temp = cdk_master.assoc(row[0]) # Finds the appropriate line in master
+          ((-1*nr_cols)..-1).collect.each { |idx|
+            temp[idx] = row[nr_cols+idx+1] if temp # Updates columns if line is found
+          }
+        end
+        master = cdk_master
+      else # either jl_master or cdk_master nil
+        master = jl_master || cdk_master
+      end
+
+      if ob_master && master
+        nr_cols = (ob_master[0].size)-1
+        LOGGER.debug "Merging #{nr_cols} new columns"
+        master.each {|row| nr_cols.times { row.push(nil) } } # Adds empty columns to all rows
+        ob_master.each do |row|
+          temp = master.assoc(row[0]) # Finds the appropriate line in master
+          ((-1*nr_cols)..-1).collect.each { |idx|
+            temp[idx] = row[nr_cols+idx+1] if temp # Updates columns if line is found
+          }
+        end
+      else # either ob_master or master nil
+        master = ob_master || master
+      end
+
+      if master
+
+        ds = OpenTox::Dataset.find(
+          OpenTox::RestClientWrapper.post(
+            File.join(CONFIG[:services]["opentox-dataset"]), master.collect { |row| row.join(",") }.join("\n"), {:content_type => "text/csv", :subjectid => params[:subjectid]}
+          ),params[:subjectid]
+        )
+
+        # # # add feature metadata
+        pc_descriptors = YAML::load_file(@keysfile)
+        ambit_ids && ambit_ids.each_with_index { |id,idx|
+          raise "Feature not found" if ! ds.features[File.join(ds.uri, "feature", id.to_s)]
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{DC.description => "#{pc_descriptors[cdk_ids[idx]][:name]} [#{pc_descriptors[cdk_ids[idx]][:pc_type]}, #{pc_descriptors[cdk_ids[idx]][:lib]}]"})
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{DC.creator => @ambit_descriptor_algorithm_uri + cdk_ids[idx]})
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{OT.hasSource => params[:dataset_uri]})
+        }
+        ob_ids && ob_ids.each { |id|
+          raise "Feature not found" if ! ds.features[File.join(ds.uri, "feature", id.to_s)]
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{DC.description => "#{pc_descriptors[id][:name]} [#{pc_descriptors[id][:pc_type]}, #{pc_descriptors[id][:lib]}]"})
+          creator_uri = ds.uri.gsub(/\/dataset\/.*/, "/algorithm/pc")
+          creator_uri += "/#{id}" if params[:add_uri]
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{DC.creator => creator_uri})
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{OT.hasSource => params[:dataset_uri]})
+        }
+        jl_ids && jl_ids.each { |id|
+          raise "Feature not found" if ! ds.features[File.join(ds.uri, "feature", id.to_s)]
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{DC.description => "#{pc_descriptors[id][:name]} [#{pc_descriptors[id][:pc_type]}, #{pc_descriptors[id][:lib]}]"})
+          creator_uri = ds.uri.gsub(/\/dataset\/.*/, "/algorithm/pc")
+          creator_uri += "/#{id}" if params[:add_uri]
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{DC.creator => creator_uri})
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{OT.hasSource => params[:dataset_uri]})
+        }
+
+        ds.save(params[:subjectid])
+      else
+        raise OpenTox::BadRequestError.new "No descriptors matching your criteria found."
+      end
+
+    end
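Note: the task weighting at the top of pc_descriptors turns rough per-library costs into integer percent increments for task progress. Worked through in isolation (Array#sum as used in the patch; use inject(:+) on older Rubies):

    task_weights = { "joelib" => 20, "openbabel" => 1, "cdk" => 50 }
    task_weights["load"] = 10
    task_sum = Float task_weights.values.sum             # 81.0
    task_weights.keys.each { |k| task_weights[k] /= task_sum }
    task_weights.keys.each { |k| task_weights[k] = (task_weights[k] * 100).floor }
    # => {"joelib"=>24, "openbabel"=>1, "cdk"=>61, "load"=>12}
    # flooring keeps the increments integral and their sum <= 100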
+
+
+    # Calculate OpenBabel physico-chemical descriptors.
+    # @param[Hash] required: :compounds, :pc_type, :task, optional: :descriptor
+    # @return[Array] CSV, array of field ids, array of field descriptions
+    def self.get_ob_descriptors(params)
+
+      master = nil
+
       begin
-
-
-
-
-
-
+        csvfile = Tempfile.open(['ob_descriptors-','.csv'])
+
+        pc_descriptors = YAML::load_file(@keysfile)
+        ids = pc_descriptors.collect{ |id, info|
+          id if info[:lib] == "openbabel" && params[:pc_type].split(",").include?(info[:pc_type]) && (!params[:descriptor] || id == params[:descriptor])
+        }.compact
+
+        if ids.length > 0
+          csvfile.puts((["SMILES"] + ids).join(","))
+
+          # remember inchis
+          inchis = params[:compounds].collect { |c_uri|
+            URI.encode_www_form_component(OpenTox::Compound.new(c_uri).to_inchi)
+          }
+
+          # Process compounds
+          obmol = OpenBabel::OBMol.new
+          obconversion = OpenBabel::OBConversion.new
+          obconversion.set_in_and_out_formats 'inchi', 'can'
+
+          inchis.each_with_index { |inchi, c_idx|
+            row = [inchis[c_idx]]
+            obconversion.read_string(obmol, URI.decode_www_form_component(inchi))
+            ids.each { |name|
+              if obmol.respond_to?(name.underscore)
+                val = eval("obmol.#{name.underscore}") if obmol.respond_to?(name.underscore)
+              else
+                if name != "nF" && name != "spinMult" && name != "nHal" && name != "logP"
+                  val = OpenBabel::OBDescriptor.find_type(name.underscore).predict(obmol)
+                elsif name == "nF"
+                  val = OpenBabel::OBDescriptor.find_type("nf").predict(obmol)
+                elsif name == "spinMult" || name == "nHal" || name == "logP"
+                  val = OpenBabel::OBDescriptor.find_type(name).predict(obmol)
+                end
+              end
+              if OpenTox::Algorithm.numeric?(val)
+                val = Float(val)
+                val = nil if val.nan?
+                val = nil if (val && val.infinite?)
+              end
+              row << val
+            }
+            LOGGER.debug "Compound #{c_idx+1} (#{inchis.size}), #{row.size} entries"
+            csvfile.puts(row.join(","))
+            csvfile.flush
+          }
+          master = CSV::parse(File.open(csvfile.path, "rb").read)
+        end
+
       rescue Exception => e
         LOGGER.debug "#{e.class}: #{e.message}"
         LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+      ensure
+        csvfile.close!
       end
 
+      [ master, ids ]
+
     end
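Note: get_ob_descriptors dispatches on the camelCased descriptor id, calling the OBMol binding directly when it responds to the snake_cased name and falling back to OBDescriptor otherwise. The method-call half can be written without eval; a sketch where underscore is a hypothetical helper (in the gem it comes from the String extensions):

    # naive camelCase -> snake_case, e.g. "NumAtoms" -> "num_atoms"
    def underscore(name)
      name.gsub(/([a-z\d])([A-Z])/, '\1_\2').downcase
    end

    def descriptor_value(obmol, name)
      meth = underscore(name)
      # send avoids building code strings with eval
      obmol.respond_to?(meth) ? obmol.send(meth) : nil
    end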
|
-
-    # Calculates PC descriptors via Ambit -- DO NOT OVERLOAD Ambit.
-    # @param[Hash] Required keys: :compounds, :pc_type
-    # @return[Array] Ambit result uri, piecewise (1st: base, 2nd: SMILES, 3rd+: features
-    def self.get_pc_descriptors(params)
 
+
+
+    # Calculate Joelib2 physico-chemical descriptors.
+    # @param[Hash] required: :compounds, :pc_type, :task, optional: :descriptor
+    # @return[Array] CSV, array of field ids, array of field descriptions
+    def self.get_jl_descriptors(params)
+
+      master = nil
+      s = params[:rjb]; raise "No Java environment" unless s
+
+      # Load keys, enter CSV headers
       begin
+        csvfile = Tempfile.open(['jl_descriptors-','.csv'])
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+        pc_descriptors = YAML::load_file(@keysfile)
+        ids = pc_descriptors.collect{ |id, info|
+          id if info[:lib] == "joelib" && params[:pc_type].split(",").include?(info[:pc_type]) && (!params[:descriptor] || id == params[:descriptor])
+        }.compact
+
+
+        if ids.length > 0
+          csvfile.puts((["SMILES"] + ids).join(","))
+
+          # remember inchis
+          inchis = params[:compounds].collect { |c_uri|
+            cmpd = OpenTox::Compound.new(c_uri)
+            URI.encode_www_form_component(cmpd.to_inchi)
+          }
+
+          # Process compounds
+          params[:compounds].each_with_index { |c_uri, c_idx|
+            cmpd = OpenTox::Compound.new(c_uri)
+            inchi = cmpd.to_inchi
+            sdf_data = cmpd.to_sdf
+
+            infile = Tempfile.open(['jl_descriptors-in-','.sdf'])
+            outfile_path = infile.path.gsub(/jl_descriptors-in/,"jl_descriptors-out")
+
+            begin
+              infile.puts sdf_data
+              infile.flush
+              s.new(infile.path, outfile_path) # runs joelib
+
+              row = [inchis[c_idx]]
+              ids.each_with_index do |k,i| # Fill row
+                re = Regexp.new(k)
+                open(outfile_path) do |f|
+                  f.each do |line|
+                    if @prev == k
+                      entry = line.chomp
+                      val = nil
+                      if OpenTox::Algorithm.numeric?(entry)
+                        val = Float(entry)
+                        val = nil if val.nan?
+                        val = nil if (val && val.infinite?)
+                      end
+                      row << val
+                      break
+                    end
+                    @prev = line.gsub(/^.*types./,"").gsub(/count./,"").gsub(/>/,"").chomp if line =~ re
+                  end
+                end
+              end
+              LOGGER.debug "Compound #{c_idx+1} (#{inchis.size}), #{row.size} entries"
+              csvfile.puts(row.join(","))
+              csvfile.flush
+
+            rescue Exception => e
+              LOGGER.debug "#{e.class}: #{e.message}"
+              LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+            ensure
+              File.delete(infile.path.gsub(/\.sdf/,".numeric.sdf"))
+              File.delete(outfile_path)
+              infile.close!
+            end
+          }
+          master = CSV::parse(File.open(csvfile.path, "rb").read)
         end
-        #LOGGER.debug "Ambit descriptor URIs: #{descs_uris.join(", ")}"
 
+      rescue Exception => e
+        LOGGER.debug "#{e.class}: #{e.message}"
+        LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+      ensure
+        [ csvfile].each { |f| f.close! }
+      end
+
+      [ master, ids ]
+
+    end
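Note: the JOELib loop scans the output file line by line, remembering a key when a line matches the descriptor tag and reading the following line as that descriptor's value. The same pattern in isolation, with a toy tag format and hypothetical file content:

    # Toy parser: a "> <...types.SomeDescriptor>" tag line is followed
    # by its value line, as in SD-file data blocks.
    text = "> <joelib2.feature.types.LogP>\n2.13\n" +
           "> <joelib2.feature.types.count.NumAtoms>\n42\n"

    values = {}
    prev = nil
    text.each_line do |line|
      if prev
        values[prev] = Float(line.chomp) rescue nil  # value follows the tag
        prev = nil
        next
      end
      prev = line.gsub(/^.*types./, "").gsub(/count./, "").gsub(/>/, "").chomp if line =~ /types\./
    end
    values  # => {"LogP"=>2.13, "NumAtoms"=>42.0}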
|
+
+    # Calculate CDK physico-chemical descriptors via Ambit -- DO NOT OVERLOAD Ambit.
+    # @param[Hash] required: :compounds, :pc_type, :task, :step optional: :descriptor
+    # @return[Array] array of Ambit result uri, piecewise (1st: base, 2nd: SMILES, 3rd+: features), hash smiles to inchi, array of field descriptions
+    def self.get_cdk_descriptors(params)
+
+      ambit_result_uri = [] # 1st pos: base uri, then features
+      smiles_to_inchi = {}
+      task_weights = {"electronic"=> 4, "topological"=> 19, "constitutional"=> 12, "geometrical"=> 3, "hybrid"=> 2, "cpsa"=> 1 }
+      task_weights.keys.each { |pc_type| task_weights.delete(pc_type) if (params[:pc_type] && (!params[:pc_type].split(",").include?(pc_type)))}
+      task_sum = Float task_weights.values.sum
+      task_weights.keys.each { |pc_type| task_weights[pc_type] /= task_sum }
+      task_weights.keys.each { |pc_type| task_weights[pc_type] *= params[:step] }
+
+
+      # extract wanted descriptors from config file and parameters
+      pc_descriptors = YAML::load_file(@keysfile)
+
+      ids = pc_descriptors.collect { |id, info|
+        "#{info[:pc_type]}:::#{id}" if info[:lib] == "cdk" && params[:pc_type].split(",").include?(info[:pc_type]) && (!params[:descriptor] || id == params[:descriptor])
+      }.compact
+
+      if ids.size > 0
+        ids.sort!
+        ids.collect! { |id| id.split(":::").last }
+
+        # create dataset at Ambit
         begin
-          # Create SMI
-          smiles_array = []; smiles_to_inchi = {}
           params[:compounds].each do |n|
             cmpd = OpenTox::Compound.new(n)
             smiles_string = cmpd.to_smiles
             smiles_to_inchi[smiles_string] = URI.encode_www_form_component(cmpd.to_inchi)
-            smiles_array << smiles_string
           end
-          smi_file = Tempfile.open(['pc_ambit', '.csv'])
-
-
-          # Create Ambit dataset
-          smi_file.puts( "SMILES\n" )
-          smi_file.puts( smiles_array.join("\n") )
-          smi_file.flush
-          ambit_ds_uri = OpenTox::RestClientWrapper.post(ambit_ds_service_uri, {:file => File.new(smi_file.path)}, {:content_type => "multipart/form-data", :accept => "text/uri-list"} )
+          smi_file = Tempfile.open(['pc_ambit', '.csv']) ; smi_file.puts( "SMILES\n" + smiles_to_inchi.keys.join("\n") ) ; smi_file.flush
+          ambit_ds_uri = OpenTox::RestClientWrapper.post(@ambit_ds_service_uri, {:file => File.new(smi_file.path)}, {:content_type => "multipart/form-data", :accept => "text/uri-list"} )
+          ambit_result_uri = [ ambit_ds_uri + "?" ] # 1st pos: base uri, then features
         rescue Exception => e
           LOGGER.debug "#{e.class}: #{e.message}"
           LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
         ensure
           smi_file.close! if smi_file
         end
-
-
-
-
-
-        LOGGER.debug "MOPAC dataset: #{ambit_ds_mopac_uri }"
-      end
-
-      # Get Ambit results
-      ambit_result_uri = [] # 1st pos: base uri, then features
-      ambit_result_uri << ambit_ds_uri + "?"
+        # get SMILES feature URI
+        ambit_smiles_uri = OpenTox::RestClientWrapper.get(
+          ambit_ds_uri + "/features",
+          {:accept=> "text/uri-list"}
+        ).chomp
         ambit_result_uri << ("feature_uris[]=" + URI.encode_www_form_component(ambit_smiles_uri) + "&")
-
-
+        # always calculate 3D (http://goo.gl/Tk81j), then get results
+        OpenTox::RestClientWrapper.post(
+          @ambit_mopac_model_uri,
+          {:dataset_uri => ambit_ds_uri},
+          {:accept => "text/uri-list"}
+        )
+        current_cat = ""
+        ids.each_with_index do |id, i|
+          old_cat = current_cat; current_cat = pc_descriptors[id][:pc_type]
+          params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights[old_cat]) if params[:task] && old_cat != current_cat && old_cat != ""
+          algorithm = Algorithm::Generic.new(@ambit_descriptor_algorithm_uri+id)
           result_uri = algorithm.run({:dataset_uri => ambit_ds_uri})
           ambit_result_uri << result_uri.split("?")[1] + "&"
-          LOGGER.debug "Ambit (#{
+          LOGGER.debug "Ambit (#{ids.size}): #{i+1}"
         end
+        params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights[current_cat]) if params[:task]
         #LOGGER.debug "Ambit result: #{ambit_result_uri.join('')}"
-        [ ambit_result_uri, smiles_to_inchi ]
-
-      rescue Exception => e
-        LOGGER.debug "#{e.class}: #{e.message}"
-        LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
       end
+
+      [ ambit_result_uri, smiles_to_inchi, ids ]
+
     end
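Note: get_cdk_descriptors returns the Ambit result piecewise: element 0 is the dataset base URI, each later element a feature_uris[] query fragment, so load_ds_csv can fetch one feature block per request. Illustrated with dummy values:

    ambit_result_uri = [ "http://host/ambit2/dataset/1?" ]            # base
    ambit_result_uri << "feature_uris[]=http%3A%2F%2Fhost%2Ffeature%2F7&"
    ambit_result_uri << "feature_uris[]=http%3A%2F%2Fhost%2Ffeature%2F8&"

    # a caller combines base + one fragment per request:
    (1...ambit_result_uri.size).each { |idx|
      curr_uri = ambit_result_uri[0] + ambit_result_uri[idx]
      # GET curr_uri with Accept: text/csv, then merge the columns
    }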
|
 
 
     # Load dataset via CSV
     # @param[Array] Ambit result uri, piecewise (1st: base, 2nd: SMILES, 3rd+: features)
-    # @
-
+    # @param[Hash] keys: SMILES, values: InChIs
+    # @param[Array] field descriptions, one for each feature
+    # @return[Array] CSV, array of field ids, array of field descriptions
+    def self.load_ds_csv(ambit_result_uri, smiles_to_inchi, single_ids, subjectid=nil)
 
       master=nil
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-      }
+      ids=[]
+      ambit_ids=[]
+
+      if ambit_result_uri.size > 0
+        (1...ambit_result_uri.size).collect { |idx|
+          curr_uri = ambit_result_uri[0] + ambit_result_uri[idx]
+          #LOGGER.debug "Requesting #{curr_uri}"
+          csv_data = CSV.parse( OpenTox::RestClientWrapper.get(curr_uri, {:accept => "text/csv", :subjectid => subjectid}) )
+          if csv_data[0] && csv_data[0].size>1
+            if master.nil? # This is the smiles entry
+              (1...csv_data.size).each{ |idx| csv_data[idx][1] = smiles_to_inchi[csv_data[idx][1]] }
+              master = csv_data
+              next
+            else
+              index_uri = csv_data[0].index("SMILES")
+              csv_data.map {|i| i.delete_at(index_uri)} if index_uri #Removes additional SMILES information
+
+              nr_cols = (csv_data[0].size)-1
+              LOGGER.debug "Merging #{nr_cols} new columns"
+              ids += Array.new(nr_cols, single_ids[idx-2])
+              master.each {|row| nr_cols.times { row.push(nil) } } # Adds empty columns to all rows
+              csv_data.each do |row|
+                temp = master.assoc(row[0]) # Finds the appropriate line in master
+                ((-1*nr_cols)..-1).collect.each { |idx|
+                  temp[idx] = row[nr_cols+idx+1] if temp # Updates columns if line is found
+                }
+              end
             end
           end
-
-        }
+        }
 
-
-
-
-
-
-
+        index_uri = master[0].index("Compound")
+        master.map {|i| i.delete_at(index_uri)}
+        master[0].each {|cell| cell.chomp!(" ")}
+        master[0][0] = "Compound" #"SMILES"
+        index_smi = master[0].index("SMILES")
+        master.map {|i| i.delete_at(index_smi)} if index_smi
+        master[0][0] = "SMILES"
+        ambit_ids=master[0].collect {|header| header.to_s.gsub(/[\/.\\\(\)\{\}\[\]]/,"_")}
+        ambit_ids.shift
+      end
 
       #LOGGER.debug "-------- AM: Writing to dumpfile"
       #File.open("/tmp/test.csv", 'w') {|f| f.write( master.collect {|r| r.join(",")}.join("\n") ) }
 
-
-
-      ds.save(subjectid)
-      parser.dataset = ds
-      ds = parser.load_csv(master.collect{|r| r.join(",")}.join("\n"))
-      ds.save(subjectid)
+      [ master, ids, ambit_ids ]
+
     end
 
 
@@ -208,8 +467,8 @@ module OpenTox
     end
 
 
-    # Effect calculation for classification
-    # @param [Array] Array of occurrences per class in the form of Enumerables.
+    # Effect calculation for classification. It is assumed that the elements of the arrays match each other pairwise.
+    # @param [Array] Array of occurrences per class (in the form of Enumerables).
     # @param [Array] Array of database instance counts per class.
     def self.effect(occurrences, db_instances)
       max=0