opentox-ruby 3.1.0 → 4.0.0
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- data/ChangeLog +19 -9
- data/README.markdown +1 -1
- data/Rakefile +2 -1
- data/VERSION +1 -1
- data/lib/algorithm.rb +143 -37
- data/lib/compound.rb +66 -18
- data/lib/dataset.rb +38 -3
- data/lib/model.rb +36 -13
- data/lib/parser.rb +34 -19
- data/lib/r-util.rb +93 -34
- data/lib/serializer.rb +70 -22
- data/lib/stratification.R +71 -7
- data/lib/transform.rb +5 -3
- data/lib/utils.rb +356 -97
- data/lib/validation.rb +6 -4
- metadata +20 -4
data/lib/serializer.rb
CHANGED
@@ -459,32 +459,80 @@ module OpenTox
     def initialize(dataset)
       @rows = []
       @rows << ["SMILES"]
+
       features = dataset.features.keys
-
+
+      # prepare for subgraphs
+      have_substructures = features.collect{ |id| dataset.features[id][RDF.type].include? OT.Substructure}.compact.uniq
+      if have_substructures.size == 1 && have_substructures[0]
+        features_smarts = features.collect{ |id| "'" + dataset.features[id][OT.smarts] + "'" }
+      end
+
+      # gather missing features
+      delete_features = []
+      features.each{ |id|
+        dataset.features[id][RDF.type].each { |typestr|
+          if typestr.include? "MissingFeature"
+            delete_features << id
+          end
+        }
+      }
+      features = features - delete_features
+
+      # detect nr duplicates per compound
+      compound_sizes = {}
+      dataset.compounds.each do |compound|
+        entries=dataset.data_entries[compound]
+        if entries
+          entries.each do |feature, values|
+            compound_sizes[compound] || compound_sizes[compound] = []
+            compound_sizes[compound] << values.size
+          end
+          compound_sizes[compound].uniq!
+          raise "Inappropriate data for CSV export for compound #{compound}" if compound_sizes[compound].size > 1
+          compound_sizes[compound] = compound_sizes[compound][0] # integer instead of array
+        end
+      end
+
+      # get headers
+      features_smarts && @rows.first << features_smarts || @rows.first << features
       @rows.first.flatten!
-
-
-
-
-
-
-
-
-
-          entries.
-
-
-
-
-
-
-
-
-
-
+
+      # feature positions pre-calculated
+      feature_positions = features.inject({}) { |h,f|
+        h.merge!({f => features.index(f)+1}) # +1 due to ID
+        h
+      }
+
+      # serialize to csv
+      dataset.compounds.each do |compound|
+        entries=dataset.data_entries[compound]
+        if entries
+          inchi = URI.encode_www_form_component(Compound.new(compound).to_inchi)
+
+          # allocate container
+          row_container = Array.new(compound_sizes[compound])
+          (0...row_container.size).each do |i|
+            row_container[i] = Array.new(@rows.first.size)
+            row_container[i][0] = inchi
+          end
+
+          # fill entries
+          entries.each { |feature, values|
+            (0...compound_sizes[compound]).each { |i|
+              row_container[i][feature_positions[feature]] = values[i]
+            }
+          }
+
+          # fill zeroes for subgraphs
+          if (features_smarts)
+            row_container.collect! { |row|
+              row.collect! { |x| x ? x : 0 }
+            }
           end
+          row_container.each { |row| @rows << row }
+
         end
-        row_container.each { |r| @rows << r }
       end
     end
 
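The rewritten initialize above handles datasets in which a compound carries several measured values per feature: after checking that every feature of a compound has the same number of values, it emits one CSV row per repeated measurement. A minimal sketch of that row layout in plain Ruby, with hypothetical data instead of the OpenTox dataset classes:

features = ["f1", "f2"]
# column of each feature; +1 because column 0 holds the compound identifier
feature_positions = features.inject({}) { |h, f| h.merge!({f => features.index(f) + 1}) }

entries = { "f1" => [1.0, 2.0], "f2" => [0.5, 0.7] }    # two measurements each
n_dup = entries.values.collect { |v| v.size }.uniq[0]   # 2, after the uniqueness check

rows = Array.new(n_dup) { ["compound-1"] + Array.new(features.size) }
entries.each { |feature, values|
  (0...n_dup).each { |i| rows[i][feature_positions[feature]] = values[i] }
}
rows.each { |row| puts row.join(",") }
# compound-1,1.0,0.5
# compound-1,2.0,0.7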
data/lib/stratification.R
CHANGED
@@ -1,4 +1,13 @@
 
+round_it <- function( x )
+{
+  if(isTRUE((x - floor(x))>=0.5))
+    ceiling(x)
+  else
+    floor(x)
+}
+
+
 nominal_to_binary <- function( data )
 {
   result = NULL
@@ -41,9 +50,13 @@ nominal_to_binary <- function( data )
   result
 }
 
-process_data <- function( data )
+process_data <- function( data, colnames=NULL )
 {
   data.num <- as.data.frame(data)
+  if (!is.null(colnames))
+  {
+    data.num = subset(data.num, select = colnames)
+  }
   if (!is.numeric(data.num))
   {
     data.num = nominal_to_binary(data.num)
@@ -72,14 +85,15 @@ cluster <- function( data, min=10, max=15 )
     cbind(s$partition[,m])
 }
 
-stratified_split <- function( data, ratio=0.3, method="cluster" )
+stratified_split <- function( data, ratio=0.3, method="cluster", colnames=NULL )
 {
-  data.processed = as.matrix(process_data( data ))
+  data.processed = as.matrix(process_data( data, colnames ))
+  print(paste("split using #features: ",ncol(data.processed)))
   if (method == "samplecube")
   {
     require("sampling")
     # adjust ratio to make samplecube return exact number of samples
-    ratio =
+    ratio = round_it(nrow(data.processed)*ratio)/nrow(data.processed)
     pik = rep(ratio,times=nrow(data.processed))
     data.strat = cbind(pik,data.processed)
     samplecube(data.strat,pik,order=2,comment=F)
@@ -101,10 +115,11 @@ stratified_split <- function( data, ratio=0.3, method="cluster" )
     stop("unknown method")
 }
 
-stratified_k_fold_split <- function( data, num_folds=10, method="cluster" )
+stratified_k_fold_split <- function( data, num_folds=10, method="cluster", colnames=NULL )
 {
   print(paste(num_folds,"-fold-split, data-size",nrow(data)))
-  data.processed = as.matrix(process_data( data ))
+  data.processed = as.matrix(process_data( data, colnames ))
+  print(paste("split using #features: ",ncol(data.processed)))
   if (method == "samplecube")
   {
     folds = rep(0, times=nrow(data))
@@ -133,7 +148,7 @@ stratified_k_fold_split <- function( data, num_folds=10, method="cluster" )
   {
     require("TunePareto")
     cl = cluster(data.processed)
-    res = generateCVRuns(cl,ntimes=1,nfold=
+    res = generateCVRuns(cl,ntimes=1,nfold=num_folds)
     folds = rep(0, times=nrow(data))
     for (i in 1:num_folds)
       for(j in 1:length(res[[1]][[i]]))
@@ -144,6 +159,50 @@ stratified_k_fold_split <- function( data, num_folds=10, method="cluster" )
     stop("unknown method")
 }
 
+duplicate_indices <- function( data ) {
+  indices = 1:nrow(data)
+  z = data
+  duplicate_index = anyDuplicated(z)
+  while(duplicate_index) {
+    duplicate_to_index = anyDuplicated(z[1:duplicate_index,],fromLast=T)
+    #print(paste(duplicate_index,'is dupl to',duplicate_to_index))
+    indices[duplicate_index] <- duplicate_to_index
+    z[duplicate_index,] <- paste('123$§%',duplicate_index)
+    duplicate_index = anyDuplicated(z)
+  }
+  indices
+}
+
+add_duplicates <- function( data, dup_indices ) {
+  result = data[1,]
+  for(i in 2:length(dup_indices)) {
+    row = data[rownames(data)==dup_indices[i],]
+    if(length(row)==0)
+      stop(paste('index ',i,' dup-index ',dup_indices[i],'not found in data'))
+    result = rbind(result, row)
+  }
+  rownames(result)<-NULL
+  result
+}
+
+sammon_duplicates <- function( data, ... ) {
+  di <- duplicate_indices(data)
+  print(di)
+  u <- unique(data)
+  print(paste('unique data points',nrow(u),'of',nrow(data)))
+  if(nrow(u) <= 4) stop("number of unqiue datapoints <= 4")
+  points_unique <- sammon(dist(u), ...)$points
+  if (nrow(u)<nrow(data))
+  {
+    points <- add_duplicates(points_unique, di)
+    points
+  }
+  else
+  {
+    points_unique
+  }
+}
+
 plot_pre_process <- function( data, method="pca" )
 {
   data.processed = process_data( data )
@@ -158,6 +217,11 @@ plot_pre_process <- function( data, method="pca" )
     data.emb <- smacofSym(dist(data.processed, method = "euclidean"), ndim=2, verbose=T)
     data.emb$conf
   }
+  else if (method == "sammon")
+  {
+    require("MASS")
+    sammon_duplicates(data.processed, k=2)
+  }
   else
     stop("unknown method")
 }
data/lib/transform.rb
CHANGED
@@ -396,7 +396,7 @@ module OpenTox
       @q_prop = gsl_q_prop_orig.row(0).to_a
     end
 
-      LOGGER.debug "F: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+      LOGGER.debug "F: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}" if (@n_prop && @n_prop[0] && @q_prop)
       LOGGER.debug "Sims: #{@sims.size}, Acts: #{@acts.size}"
 
       @sims = [ gram_matrix, @sims ]
@@ -490,8 +490,10 @@
 
       @cmpds = []; @fps = []; @acts = []; @n_prop = []; @q_prop = []
 
-      @model.
-
+      # Major BUG! Must loop over @model.compounds, hash is unordered!
+      # @model.fingerprints.each
+      @model.compounds.each { |cmpd|
+        fp = @model.fingerprints[cmpd]
         if @model.activities[cmpd] # row good
           acts = @model.activities[cmpd]; @acts += acts
           LOGGER.debug "#{acts.size} activities for '#{cmpd}'" if acts.size > 1
data/lib/utils.rb
CHANGED
@@ -1,155 +1,414 @@
 require 'csv'
+require 'tempfile'
 
 
 module OpenTox
 
   module Algorithm
 
+    @ambit_descriptor_algorithm_uri = "http://apps.ideaconsult.net:8080/ambit2/algorithm/org.openscience.cdk.qsar.descriptors.molecular."
+    @ambit_ds_service_uri = "http://apps.ideaconsult.net:8080/ambit2/dataset/"
+    @ambit_mopac_model_uri = "http://apps.ideaconsult.net:8080/ambit2/model/69632"
+    @keysfile = File.join(ENV['HOME'], ".opentox", "config", "pc_descriptors.yaml")
+
     include OpenTox
 
     # Calculate physico-chemical descriptors.
-    # @param[Hash]
+    # @param[Hash] required: :dataset_uri, :pc_type, :rjb, :task, :add_uri, optional: :descriptor, :lib, :subjectid
     # @return[String] dataset uri
-
     def self.pc_descriptors(params)
 
+      ds = OpenTox::Dataset.find(params[:dataset_uri],params[:subjectid])
+      compounds = ds.compounds.collect
+      task_weights = {"joelib"=> 20, "openbabel"=> 1, "cdk"=> 50 }
+      task_weights.keys.each { |step| task_weights.delete(step) if (params[:lib] && (!params[:lib].split(",").include?(step)))}
+      task_weights["load"] = 10
+      task_sum = Float task_weights.values.sum
+      task_weights.keys.each { |step| task_weights[step] /= task_sum }
+      task_weights.keys.each { |step| task_weights[step] = (task_weights[step]*100).floor }
+
+      jl_master=nil
+      cdk_master=nil
+      ob_master=nil
+
+
+      # # # openbabel (via ruby bindings)
+      if !params[:lib] || params[:lib].split(",").include?("openbabel")
+        ob_master, ob_ids = get_ob_descriptors( { :compounds => compounds, :pc_type => params[:pc_type], :descriptor => params[:descriptor] } )
+        params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights["openbabel"]) if params[:task]
+      end
+
+
+      # # # joelib (via rjb)
+      if !params[:lib] || params[:lib].split(",").include?("joelib")
+        jl_master, jl_ids = get_jl_descriptors( { :compounds => compounds, :rjb => params[:rjb], :pc_type => params[:pc_type], :descriptor => params[:descriptor] } )
+        params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights["joelib"]) if params[:task]
+      end
+
+
+      # # # cdk (via REST)
+      if !params[:lib] || params[:lib].split(",").include?("cdk")
+        ambit_result_uri, smiles_to_inchi, cdk_ids = get_cdk_descriptors( { :compounds => compounds, :pc_type => params[:pc_type], :task => params[:task], :step => task_weights["cdk"], :descriptor => params[:descriptor] } )
+        #LOGGER.debug "Ambit result uri for #{params.inspect}: '#{ambit_result_uri.to_yaml}'"
+        cdk_master, cdk_ids, ambit_ids = load_ds_csv(ambit_result_uri, smiles_to_inchi, cdk_ids )
+        params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights["load"]) if params[:task]
+      end
+
+      # # # fuse CSVs ("master" structures)
+      if jl_master && cdk_master
+        nr_cols = (jl_master[0].size)-1
+        LOGGER.debug "Merging #{nr_cols} new columns"
+        cdk_master.each {|row| nr_cols.times { row.push(nil) } }
+        jl_master.each do |row|
+          temp = cdk_master.assoc(row[0]) # Finds the appropriate line in master
+          ((-1*nr_cols)..-1).collect.each { |idx|
+            temp[idx] = row[nr_cols+idx+1] if temp # Updates columns if line is found
+          }
+        end
+        master = cdk_master
+      else # either jl_master or cdk_master nil
+        master = jl_master || cdk_master
+      end
+
+      if ob_master && master
+        nr_cols = (ob_master[0].size)-1
+        LOGGER.debug "Merging #{nr_cols} new columns"
+        master.each {|row| nr_cols.times { row.push(nil) } } # Adds empty columns to all rows
+        ob_master.each do |row|
+          temp = master.assoc(row[0]) # Finds the appropriate line in master
+          ((-1*nr_cols)..-1).collect.each { |idx|
+            temp[idx] = row[nr_cols+idx+1] if temp # Updates columns if line is found
+          }
+        end
+      else # either ob_master or master nil
+        master = ob_master || master
+      end
+
+      if master
+
+        ds = OpenTox::Dataset.find(
+          OpenTox::RestClientWrapper.post(
+            File.join(CONFIG[:services]["opentox-dataset"]), master.collect { |row| row.join(",") }.join("\n"), {:content_type => "text/csv", :subjectid => params[:subjectid]}
+          ),params[:subjectid]
+        )
+
+        # # # add feature metadata
+        pc_descriptors = YAML::load_file(@keysfile)
+        ambit_ids && ambit_ids.each_with_index { |id,idx|
+          raise "Feature not found" if ! ds.features[File.join(ds.uri, "feature", id.to_s)]
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{DC.description => "#{pc_descriptors[cdk_ids[idx]][:name]} [#{pc_descriptors[cdk_ids[idx]][:pc_type]}, #{pc_descriptors[cdk_ids[idx]][:lib]}]"})
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{DC.creator => @ambit_descriptor_algorithm_uri + cdk_ids[idx]})
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{OT.hasSource => params[:dataset_uri]})
+        }
+        ob_ids && ob_ids.each { |id|
+          raise "Feature not found" if ! ds.features[File.join(ds.uri, "feature", id.to_s)]
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{DC.description => "#{pc_descriptors[id][:name]} [#{pc_descriptors[id][:pc_type]}, #{pc_descriptors[id][:lib]}]"})
+          creator_uri = ds.uri.gsub(/\/dataset\/.*/, "/algorithm/pc")
+          creator_uri += "/#{id}" if params[:add_uri]
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{DC.creator => creator_uri})
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{OT.hasSource => params[:dataset_uri]})
+        }
+        jl_ids && jl_ids.each { |id|
+          raise "Feature not found" if ! ds.features[File.join(ds.uri, "feature", id.to_s)]
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{DC.description => "#{pc_descriptors[id][:name]} [#{pc_descriptors[id][:pc_type]}, #{pc_descriptors[id][:lib]}]"})
+          creator_uri = ds.uri.gsub(/\/dataset\/.*/, "/algorithm/pc")
+          creator_uri += "/#{id}" if params[:add_uri]
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{DC.creator => creator_uri})
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{OT.hasSource => params[:dataset_uri]})
+        }
+
+        ds.save(params[:subjectid])
+      else
+        raise OpenTox::BadRequestError.new "No descriptors matching your criteria found."
+      end
+
+    end
+
+
+    # Calculate OpenBabel physico-chemical descriptors.
+    # @param[Hash] required: :compounds, :pc_type, :task, optional: :descriptor
+    # @return[Array] CSV, array of field ids, array of field descriptions
+    def self.get_ob_descriptors(params)
+
+      master = nil
+
       begin
-
-
-
-
-
-
+        csvfile = Tempfile.open(['ob_descriptors-','.csv'])
+
+        pc_descriptors = YAML::load_file(@keysfile)
+        ids = pc_descriptors.collect{ |id, info|
+          id if info[:lib] == "openbabel" && params[:pc_type].split(",").include?(info[:pc_type]) && (!params[:descriptor] || id == params[:descriptor])
+        }.compact
+
+        if ids.length > 0
+          csvfile.puts((["SMILES"] + ids).join(","))
+
+          # remember inchis
+          inchis = params[:compounds].collect { |c_uri|
+            URI.encode_www_form_component(OpenTox::Compound.new(c_uri).to_inchi)
+          }
+
+          # Process compounds
+          obmol = OpenBabel::OBMol.new
+          obconversion = OpenBabel::OBConversion.new
+          obconversion.set_in_and_out_formats 'inchi', 'can'
+
+          inchis.each_with_index { |inchi, c_idx|
+            row = [inchis[c_idx]]
+            obconversion.read_string(obmol, URI.decode_www_form_component(inchi))
+            ids.each { |name|
+              if obmol.respond_to?(name.underscore)
+                val = eval("obmol.#{name.underscore}") if obmol.respond_to?(name.underscore)
+              else
+                if name != "nF" && name != "spinMult" && name != "nHal" && name != "logP"
+                  val = OpenBabel::OBDescriptor.find_type(name.underscore).predict(obmol)
+                elsif name == "nF"
+                  val = OpenBabel::OBDescriptor.find_type("nf").predict(obmol)
+                elsif name == "spinMult" || name == "nHal" || name == "logP"
+                  val = OpenBabel::OBDescriptor.find_type(name).predict(obmol)
+                end
+              end
+              if OpenTox::Algorithm.numeric?(val)
+                val = Float(val)
+                val = nil if val.nan?
+                val = nil if (val && val.infinite?)
+              end
+              row << val
+            }
+            LOGGER.debug "Compound #{c_idx+1} (#{inchis.size}), #{row.size} entries"
+            csvfile.puts(row.join(","))
+            csvfile.flush
+          }
+          master = CSV::parse(File.open(csvfile.path, "rb").read)
+        end
+
       rescue Exception => e
         LOGGER.debug "#{e.class}: #{e.message}"
         LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+      ensure
+        csvfile.close!
       end
 
+      [ master, ids ]
+
     end
-
-    # Calculates PC descriptors via Ambit -- DO NOT OVERLOAD Ambit.
-    # @param[Hash] Required keys: :compounds, :pc_type
-    # @return[Array] Ambit result uri, piecewise (1st: base, 2nd: SMILES, 3rd+: features
-    def self.get_pc_descriptors(params)
 
+
+
+    # Calculate Joelib2 physico-chemical descriptors.
+    # @param[Hash] required: :compounds, :pc_type, :task, optional: :descriptor
+    # @return[Array] CSV, array of field ids, array of field descriptions
+    def self.get_jl_descriptors(params)
+
+      master = nil
+      s = params[:rjb]; raise "No Java environment" unless s
+
+      # Load keys, enter CSV headers
       begin
+        csvfile = Tempfile.open(['jl_descriptors-','.csv'])
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+        pc_descriptors = YAML::load_file(@keysfile)
+        ids = pc_descriptors.collect{ |id, info|
+          id if info[:lib] == "joelib" && params[:pc_type].split(",").include?(info[:pc_type]) && (!params[:descriptor] || id == params[:descriptor])
+        }.compact
+
+
+        if ids.length > 0
+          csvfile.puts((["SMILES"] + ids).join(","))
+
+          # remember inchis
+          inchis = params[:compounds].collect { |c_uri|
+            cmpd = OpenTox::Compound.new(c_uri)
+            URI.encode_www_form_component(cmpd.to_inchi)
+          }
+
+          # Process compounds
+          params[:compounds].each_with_index { |c_uri, c_idx|
+            cmpd = OpenTox::Compound.new(c_uri)
+            inchi = cmpd.to_inchi
+            sdf_data = cmpd.to_sdf
+
+            infile = Tempfile.open(['jl_descriptors-in-','.sdf'])
+            outfile_path = infile.path.gsub(/jl_descriptors-in/,"jl_descriptors-out")
+
+            begin
+              infile.puts sdf_data
+              infile.flush
+              s.new(infile.path, outfile_path) # runs joelib
+
+              row = [inchis[c_idx]]
+              ids.each_with_index do |k,i| # Fill row
+                re = Regexp.new(k)
+                open(outfile_path) do |f|
+                  f.each do |line|
+                    if @prev == k
+                      entry = line.chomp
+                      val = nil
+                      if OpenTox::Algorithm.numeric?(entry)
+                        val = Float(entry)
+                        val = nil if val.nan?
+                        val = nil if (val && val.infinite?)
+                      end
+                      row << val
+                      break
+                    end
+                    @prev = line.gsub(/^.*types./,"").gsub(/count./,"").gsub(/>/,"").chomp if line =~ re
+                  end
+                end
+              end
+              LOGGER.debug "Compound #{c_idx+1} (#{inchis.size}), #{row.size} entries"
+              csvfile.puts(row.join(","))
+              csvfile.flush
+
+            rescue Exception => e
+              LOGGER.debug "#{e.class}: #{e.message}"
+              LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+            ensure
+              File.delete(infile.path.gsub(/\.sdf/,".numeric.sdf"))
+              File.delete(outfile_path)
+              infile.close!
+            end
+          }
+          master = CSV::parse(File.open(csvfile.path, "rb").read)
        end
-        #LOGGER.debug "Ambit descriptor URIs: #{descs_uris.join(", ")}"
 
+      rescue Exception => e
+        LOGGER.debug "#{e.class}: #{e.message}"
+        LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+      ensure
+        [ csvfile].each { |f| f.close! }
+      end
+
+      [ master, ids ]
+
+    end
+
+    # Calculate CDK physico-chemical descriptors via Ambit -- DO NOT OVERLOAD Ambit.
+    # @param[Hash] required: :compounds, :pc_type, :task, :step optional: :descriptor
+    # @return[Array] array of Ambit result uri, piecewise (1st: base, 2nd: SMILES, 3rd+: features, hash smiles to inchi, array of field descriptions
+    def self.get_cdk_descriptors(params)
+
+      ambit_result_uri = [] # 1st pos: base uri, then features
+      smiles_to_inchi = {}
+      task_weights = {"electronic"=> 4, "topological"=> 19, "constitutional"=> 12, "geometrical"=> 3, "hybrid"=> 2, "cpsa"=> 1 }
+      task_weights.keys.each { |pc_type| task_weights.delete(pc_type) if (params[:pc_type] && (!params[:pc_type].split(",").include?(pc_type)))}
+      task_sum = Float task_weights.values.sum
+      task_weights.keys.each { |pc_type| task_weights[pc_type] /= task_sum }
+      task_weights.keys.each { |pc_type| task_weights[pc_type] *= params[:step] }
+
+
+      # extract wanted descriptors from config file and parameters
+      pc_descriptors = YAML::load_file(@keysfile)
+
+      ids = pc_descriptors.collect { |id, info|
+        "#{info[:pc_type]}:::#{id}" if info[:lib] == "cdk" && params[:pc_type].split(",").include?(info[:pc_type]) && (!params[:descriptor] || id == params[:descriptor])
+      }.compact
+
+      if ids.size > 0
+        ids.sort!
+        ids.collect! { |id| id.split(":::").last }
+
+        # create dataset at Ambit
        begin
-          # Create SMI
-          smiles_array = []; smiles_to_inchi = {}
          params[:compounds].each do |n|
            cmpd = OpenTox::Compound.new(n)
            smiles_string = cmpd.to_smiles
            smiles_to_inchi[smiles_string] = URI.encode_www_form_component(cmpd.to_inchi)
-            smiles_array << smiles_string
          end
-          smi_file = Tempfile.open(['pc_ambit', '.csv'])
-
-
-          # Create Ambit dataset
-          smi_file.puts( "SMILES\n" )
-          smi_file.puts( smiles_array.join("\n") )
-          smi_file.flush
-          ambit_ds_uri = OpenTox::RestClientWrapper.post(ambit_ds_service_uri, {:file => File.new(smi_file.path)}, {:content_type => "multipart/form-data", :accept => "text/uri-list"} )
+          smi_file = Tempfile.open(['pc_ambit', '.csv']) ; smi_file.puts( "SMILES\n" + smiles_to_inchi.keys.join("\n") ) ; smi_file.flush
+          ambit_ds_uri = OpenTox::RestClientWrapper.post(@ambit_ds_service_uri, {:file => File.new(smi_file.path)}, {:content_type => "multipart/form-data", :accept => "text/uri-list"} )
+          ambit_result_uri = [ ambit_ds_uri + "?" ] # 1st pos: base uri, then features
        rescue Exception => e
          LOGGER.debug "#{e.class}: #{e.message}"
          LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
        ensure
          smi_file.close! if smi_file
        end
-
-
-
-
-
-        LOGGER.debug "MOPAC dataset: #{ambit_ds_mopac_uri }"
-      end
-
-      # Get Ambit results
-      ambit_result_uri = [] # 1st pos: base uri, then features
-      ambit_result_uri << ambit_ds_uri + "?"
+        # get SMILES feature URI
+        ambit_smiles_uri = OpenTox::RestClientWrapper.get(
+          ambit_ds_uri + "/features",
+          {:accept=> "text/uri-list"}
+        ).chomp
        ambit_result_uri << ("feature_uris[]=" + URI.encode_www_form_component(ambit_smiles_uri) + "&")
-
-
+        # always calculate 3D (http://goo.gl/Tk81j), then get results
+        OpenTox::RestClientWrapper.post(
+          @ambit_mopac_model_uri,
+          {:dataset_uri => ambit_ds_uri},
+          {:accept => "text/uri-list"}
+        )
+        current_cat = ""
+        ids.each_with_index do |id, i|
+          old_cat = current_cat; current_cat = pc_descriptors[id][:pc_type]
+          params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights[old_cat]) if params[:task] && old_cat != current_cat && old_cat != ""
+          algorithm = Algorithm::Generic.new(@ambit_descriptor_algorithm_uri+id)
          result_uri = algorithm.run({:dataset_uri => ambit_ds_uri})
          ambit_result_uri << result_uri.split("?")[1] + "&"
-          LOGGER.debug "Ambit (#{
+          LOGGER.debug "Ambit (#{ids.size}): #{i+1}"
        end
+        params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights[current_cat]) if params[:task]
        #LOGGER.debug "Ambit result: #{ambit_result_uri.join('')}"
-      [ ambit_result_uri, smiles_to_inchi ]
-
-      rescue Exception => e
-        LOGGER.debug "#{e.class}: #{e.message}"
-        LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
      end
+
+      [ ambit_result_uri, smiles_to_inchi, ids ]
+
    end
 
 
    # Load dataset via CSV
    # @param[Array] Ambit result uri, piecewise (1st: base, 2nd: SMILES, 3rd+: features
-    # @
-
+    # @param[Hash] keys: SMILES, values: InChIs
+    # @param[Array] field descriptions, one for each feature
+    # @return[Array] CSV, array of field ids, array of field descriptions
+    def self.load_ds_csv(ambit_result_uri, smiles_to_inchi, single_ids, subjectid=nil)
 
      master=nil
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-      }
+      ids=[]
+      ambit_ids=[]
+
+      if ambit_result_uri.size > 0
+        (1...ambit_result_uri.size).collect { |idx|
+          curr_uri = ambit_result_uri[0] + ambit_result_uri[idx]
+          #LOGGER.debug "Requesting #{curr_uri}"
+          csv_data = CSV.parse( OpenTox::RestClientWrapper.get(curr_uri, {:accept => "text/csv", :subjectid => subjectid}) )
+          if csv_data[0] && csv_data[0].size>1
+            if master.nil? # This is the smiles entry
+              (1...csv_data.size).each{ |idx| csv_data[idx][1] = smiles_to_inchi[csv_data[idx][1]] }
+              master = csv_data
+              next
+            else
+              index_uri = csv_data[0].index("SMILES")
+              csv_data.map {|i| i.delete_at(index_uri)} if index_uri #Removes additional SMILES information
+
+              nr_cols = (csv_data[0].size)-1
+              LOGGER.debug "Merging #{nr_cols} new columns"
+              ids += Array.new(nr_cols, single_ids[idx-2])
+              master.each {|row| nr_cols.times { row.push(nil) } } # Adds empty columns to all rows
+              csv_data.each do |row|
+                temp = master.assoc(row[0]) # Finds the appropriate line in master
+                ((-1*nr_cols)..-1).collect.each { |idx|
+                  temp[idx] = row[nr_cols+idx+1] if temp # Updates columns if line is found
+                }
+              end
            end
          end
-
-    }
+        }
 
-
-
-
-
-
-
-
+        index_uri = master[0].index("Compound")
+        master.map {|i| i.delete_at(index_uri)}
+        master[0].each {|cell| cell.chomp!(" ")}
+        master[0][0] = "Compound" #"SMILES"
+        index_smi = master[0].index("SMILES")
+        master.map {|i| i.delete_at(index_smi)} if index_smi
+        master[0][0] = "SMILES"
+        ambit_ids=master[0].collect {|header| header.to_s.gsub(/[\/.\\\(\)\{\}\[\]]/,"_")}
+        ambit_ids.shift
+      end
 
      #LOGGER.debug "-------- AM: Writing to dumpfile"
      #File.open("/tmp/test.csv", 'w') {|f| f.write( master.collect {|r| r.join(",")}.join("\n") ) }
 
-
-
-      ds.save(subjectid)
-      parser.dataset = ds
-      ds = parser.load_csv(master.collect{|r| r.join(",")}.join("\n"))
-      ds.save(subjectid)
+      [ master, ids, ambit_ids ]
+
    end
 
 
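The task_weights bookkeeping in pc_descriptors and get_cdk_descriptors above turns relative step costs into progress increments: the weights are normalized to fractions of their sum, then scaled (to whole percentage points in pc_descriptors, to a share of the :step budget in get_cdk_descriptors). The pc_descriptors arithmetic in isolation, using the values from the diff:

task_weights = {"joelib" => 20, "openbabel" => 1, "cdk" => 50}
task_weights["load"] = 10
task_sum = Float(task_weights.values.inject(:+))                     # 81.0
task_weights.keys.each { |step| task_weights[step] /= task_sum }
task_weights.keys.each { |step| task_weights[step] = (task_weights[step] * 100).floor }
task_weights
# => {"joelib"=>24, "openbabel"=>1, "cdk"=>61, "load"=>12}   (increments sum to <= 100)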
@@ -208,8 +467,8 @@ module OpenTox
     end
 
 
-    # Effect calculation for classification
-    # @param [Array] Array of occurrences per class in the form of Enumerables.
+    # Effect calculation for classification. It is assumed that the elements of the arrays match each other pairwise
+    # @param [Array] Array of occurrences per class (in the form of Enumerables).
     # @param [Array] Array of database instance counts per class.
     def self.effect(occurrences, db_instances)
       max=0