opentox-ruby 3.1.0 → 4.0.0

@@ -459,32 +459,80 @@ module OpenTox
     def initialize(dataset)
       @rows = []
       @rows << ["SMILES"]
+
       features = dataset.features.keys
-      @rows.first << features
+
+      # prepare for subgraphs
+      have_substructures = features.collect{ |id| dataset.features[id][RDF.type].include? OT.Substructure }.compact.uniq
+      if have_substructures.size == 1 && have_substructures[0]
+        features_smarts = features.collect{ |id| "'" + dataset.features[id][OT.smarts] + "'" }
+      end
+
+      # gather missing features
+      delete_features = []
+      features.each{ |id|
+        dataset.features[id][RDF.type].each { |typestr|
+          if typestr.include? "MissingFeature"
+            delete_features << id
+          end
+        }
+      }
+      features = features - delete_features
+
+      # detect the number of values (rows) per compound
+      compound_sizes = {}
+      dataset.compounds.each do |compound|
+        entries = dataset.data_entries[compound]
+        if entries
+          entries.each do |feature, values|
+            compound_sizes[compound] ||= []
+            compound_sizes[compound] << values.size
+          end
+          compound_sizes[compound].uniq!
+          raise "Inappropriate data for CSV export for compound #{compound}" if compound_sizes[compound].size > 1
+          compound_sizes[compound] = compound_sizes[compound][0] # integer instead of array
+        end
+      end
+
+      # get headers: SMARTS strings if all features are substructures, else feature URIs
+      features_smarts && @rows.first << features_smarts || @rows.first << features
       @rows.first.flatten!
-      dataset.data_entries.each do |compound,entries|
-        cmpd = Compound.new(compound)
-        smiles = cmpd.to_smiles
-        inchi = URI.encode_www_form_component(cmpd.to_inchi)
-        row_container = Array.new
-        row = Array.new(@rows.first.size)
-        row_container << row
-        #row[0] = smiles
-        row[0] = inchi
-        entries.each do |feature, values|
-          i = features.index(feature)+1
-          values.each do |value|
-            if row_container[0][i]
-              #LOGGER.debug "Feature '#{feature}' (nr '#{i}'): '#{value}'"
-              row_container << row_container.last.collect
-              row_container.last[i] = value
-              #LOGGER.debug "RC: #{row_container.to_yaml}"
-            else
-              row_container.each { |r| r[i] = value }
-            end
+
+      # feature positions pre-calculated
+      feature_positions = features.inject({}) { |h,f|
+        h.merge!({f => features.index(f)+1}) # +1 due to ID column
+        h
+      }
+
+      # serialize to csv
+      dataset.compounds.each do |compound|
+        entries = dataset.data_entries[compound]
+        if entries
+          inchi = URI.encode_www_form_component(Compound.new(compound).to_inchi)
+
+          # allocate one row per measurement
+          row_container = Array.new(compound_sizes[compound])
+          (0...row_container.size).each do |i|
+            row_container[i] = Array.new(@rows.first.size)
+            row_container[i][0] = inchi
+          end
+
+          # fill entries
+          entries.each { |feature, values|
+            (0...compound_sizes[compound]).each { |i|
+              row_container[i][feature_positions[feature]] = values[i]
+            }
+          }
+
+          # fill zeroes for subgraphs
+          if (features_smarts)
+            row_container.collect! { |row|
+              row.collect! { |x| x ? x : 0 }
+            }
           end
+          row_container.each { |row| @rows << row }
+
         end
-        row_container.each { |r| @rows << r }
       end
     end
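
The rewritten exporter pre-computes column positions and emits one CSV row per repeated measurement, instead of cloning rows on collision as the old code did. A minimal sketch of the layout logic, using hypothetical toy data rather than the OpenTox API:

    # Column positions are computed once; one row is allocated per measurement.
    features = ["logP", "mp"]
    feature_positions = {}
    features.each_with_index { |f, i| feature_positions[f] = i + 1 } # column 0 holds the compound ID
    entries = { "logP" => [1.2, 1.3], "mp" => [41.0, 43.0] }         # two measurements per feature
    rows = Array.new(2) { ["InChI=1S/..."] + Array.new(features.size) }
    entries.each { |f, values| 2.times { |i| rows[i][feature_positions[f]] = values[i] } }
    rows.each { |r| puts r.join(",") }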
 
@@ -1,4 +1,13 @@
 
+# round half up; R's built-in round() does round-half-to-even, e.g. round(0.5) == 0
+round_it <- function( x )
+{
+  if (isTRUE((x - floor(x)) >= 0.5))
+    ceiling(x)
+  else
+    floor(x)
+}
+
+
 nominal_to_binary <- function( data )
 {
   result = NULL
@@ -41,9 +50,13 @@ nominal_to_binary <- function( data )
   result
 }
 
-process_data <- function( data )
+process_data <- function( data, colnames=NULL )
 {
   data.num <- as.data.frame(data)
+  if (!is.null(colnames))
+  {
+    data.num = subset(data.num, select = colnames)
+  }
   if (!is.numeric(data.num))
   {
     data.num = nominal_to_binary(data.num)
@@ -72,14 +85,15 @@ cluster <- function( data, min=10, max=15 )
   cbind(s$partition[,m])
 }
 
-stratified_split <- function( data, ratio=0.3, method="cluster" )
+stratified_split <- function( data, ratio=0.3, method="cluster", colnames=NULL )
 {
-  data.processed = as.matrix(process_data( data ))
+  data.processed = as.matrix(process_data( data, colnames ))
+  print(paste("split using #features: ", ncol(data.processed)))
   if (method == "samplecube")
   {
     require("sampling")
     # adjust ratio to make samplecube return exact number of samples
-    ratio = round(nrow(data.processed)*ratio)/nrow(data.processed)
+    ratio = round_it(nrow(data.processed)*ratio)/nrow(data.processed)
     pik = rep(ratio, times=nrow(data.processed))
     data.strat = cbind(pik, data.processed)
     samplecube(data.strat, pik, order=2, comment=F)
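
The ratio adjustment exists because samplecube draws a whole number of samples: the requested fraction is snapped to the nearest multiple of 1/n. The same arithmetic in Ruby, with illustrative numbers:

    # For n = 7 rows and ratio = 0.3, 7 * 0.3 = 2.1 samples is impossible;
    # snapping the ratio to 2.0/7 lets samplecube return exactly 2 rows.
    n = 7
    ratio = 0.3
    adjusted = (n * ratio).round / n.to_f   # => 0.2857...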
@@ -101,10 +115,11 @@ stratified_split <- function( data, ratio=0.3, method="cluster" )
   stop("unknown method")
 }
 
-stratified_k_fold_split <- function( data, num_folds=10, method="cluster" )
+stratified_k_fold_split <- function( data, num_folds=10, method="cluster", colnames=NULL )
 {
   print(paste(num_folds, "-fold-split, data-size", nrow(data)))
-  data.processed = as.matrix(process_data( data ))
+  data.processed = as.matrix(process_data( data, colnames ))
+  print(paste("split using #features: ", ncol(data.processed)))
   if (method == "samplecube")
   {
     folds = rep(0, times=nrow(data))
@@ -133,7 +148,7 @@ stratified_k_fold_split <- function( data, num_folds=10, method="cluster" )
   {
     require("TunePareto")
     cl = cluster(data.processed)
-    res = generateCVRuns(cl,ntimes=1,nfold=3)
+    res = generateCVRuns(cl,ntimes=1,nfold=num_folds)  # was hard-coded to 3 folds
     folds = rep(0, times=nrow(data))
     for (i in 1:num_folds)
       for(j in 1:length(res[[1]][[i]]))
@@ -144,6 +159,50 @@ stratified_k_fold_split <- function( data, num_folds=10, method="cluster" )
   stop("unknown method")
 }
 
+# map each row to the index of its first occurrence (a row's own index if unique)
+duplicate_indices <- function( data ) {
+  indices = 1:nrow(data)
+  z = data
+  duplicate_index = anyDuplicated(z)
+  while(duplicate_index) {
+    duplicate_to_index = anyDuplicated(z[1:duplicate_index,], fromLast=T)
+    #print(paste(duplicate_index, 'is dupl to', duplicate_to_index))
+    indices[duplicate_index] <- duplicate_to_index
+    z[duplicate_index,] <- paste('123$§%', duplicate_index)  # overwrite with unique sentinel
+    duplicate_index = anyDuplicated(z)
+  }
+  indices
+}
+
+# re-expand a matrix of unique rows back to the original row count
+add_duplicates <- function( data, dup_indices ) {
+  result = data[1,]
+  for(i in 2:length(dup_indices)) {
+    row = data[rownames(data)==dup_indices[i],]
+    if(length(row)==0)
+      stop(paste('index ', i, ' dup-index ', dup_indices[i], ' not found in data'))
+    result = rbind(result, row)
+  }
+  rownames(result) <- NULL
+  result
+}
+
+# sammon() rejects duplicate rows: embed unique rows, then re-add duplicates
+sammon_duplicates <- function( data, ... ) {
+  di <- duplicate_indices(data)
+  print(di)
+  u <- unique(data)
+  print(paste('unique data points', nrow(u), 'of', nrow(data)))
+  if(nrow(u) <= 4) stop("number of unique datapoints <= 4")
+  points_unique <- sammon(dist(u), ...)$points
+  if (nrow(u) < nrow(data))
+  {
+    points <- add_duplicates(points_unique, di)
+    points
+  }
+  else
+  {
+    points_unique
+  }
+}
+
 plot_pre_process <- function( data, method="pca" )
 {
   data.processed = process_data( data )
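
The three helpers work around the fact that sammon() rejects duplicate rows: only unique rows are embedded, and every duplicate then receives the coordinates of its original. A hypothetical Ruby rendering of that bookkeeping:

    # Map each row to the index of its first occurrence, embed only unique rows,
    # then re-expand so duplicates share their original's coordinates.
    rows = [[1, 2], [3, 4], [1, 2]]
    first_seen = {}
    dup_indices = rows.each_with_index.map { |row, i| first_seen[row] ||= i } # => [0, 1, 0]
    unique_rows = rows.uniq
    embedded = unique_rows.map { |r| r.map { |x| x * 0.1 } } # stand-in for sammon()
    expanded = dup_indices.map { |orig| embedded[unique_rows.index(rows[orig])] }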
@@ -158,6 +217,11 @@ plot_pre_process <- function( data, method="pca" )
     data.emb <- smacofSym(dist(data.processed, method = "euclidean"), ndim=2, verbose=T)
     data.emb$conf
   }
+  else if (method == "sammon")
+  {
+    require("MASS")
+    sammon_duplicates(data.processed, k=2)
+  }
   else
     stop("unknown method")
 }
@@ -396,7 +396,7 @@ module OpenTox
       @q_prop = gsl_q_prop_orig.row(0).to_a
     end
 
-    LOGGER.debug "F: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+    LOGGER.debug "F: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}" if (@n_prop && @n_prop[0] && @q_prop)
     LOGGER.debug "Sims: #{@sims.size}, Acts: #{@acts.size}"
 
     @sims = [ gram_matrix, @sims ]
@@ -490,8 +490,10 @@ module OpenTox
 
       @cmpds = []; @fps = []; @acts = []; @n_prop = []; @q_prop = []
 
-      @model.fingerprints.each { |fp|
-        cmpd = fp[0]; fp = fp[1]
+      # Major bug fix: iterate over @model.compounds here; the fingerprints hash is unordered!
+      # @model.fingerprints.each
+      @model.compounds.each { |cmpd|
+        fp = @model.fingerprints[cmpd]
         if @model.activities[cmpd] # row good
           acts = @model.activities[cmpd]; @acts += acts
           LOGGER.debug "#{acts.size} activities for '#{cmpd}'" if acts.size > 1
@@ -1,155 +1,414 @@
 require 'csv'
+require 'tempfile'
 
 
 module OpenTox
 
   module Algorithm
 
+    @ambit_descriptor_algorithm_uri = "http://apps.ideaconsult.net:8080/ambit2/algorithm/org.openscience.cdk.qsar.descriptors.molecular."
+    @ambit_ds_service_uri = "http://apps.ideaconsult.net:8080/ambit2/dataset/"
+    @ambit_mopac_model_uri = "http://apps.ideaconsult.net:8080/ambit2/model/69632"
+    @keysfile = File.join(ENV['HOME'], ".opentox", "config", "pc_descriptors.yaml")
+
     include OpenTox
 
     # Calculate physico-chemical descriptors.
-    # @param[Hash] Required keys: :dataset_uri, :pc_type
+    # @param[Hash] required: :dataset_uri, :pc_type, :rjb, :task, :add_uri; optional: :descriptor, :lib, :subjectid
     # @return[String] dataset uri
-
     def self.pc_descriptors(params)
 
+      ds = OpenTox::Dataset.find(params[:dataset_uri], params[:subjectid])
+      compounds = ds.compounds.collect
+      task_weights = { "joelib" => 20, "openbabel" => 1, "cdk" => 50 }
+      task_weights.keys.each { |step| task_weights.delete(step) if (params[:lib] && (!params[:lib].split(",").include?(step))) }
+      task_weights["load"] = 10
+      task_sum = Float task_weights.values.sum
+      task_weights.keys.each { |step| task_weights[step] /= task_sum }
+      task_weights.keys.each { |step| task_weights[step] = (task_weights[step]*100).floor }
+
+      jl_master = nil
+      cdk_master = nil
+      ob_master = nil
+
+
+      # # # openbabel (via ruby bindings)
+      if !params[:lib] || params[:lib].split(",").include?("openbabel")
+        ob_master, ob_ids = get_ob_descriptors( { :compounds => compounds, :pc_type => params[:pc_type], :descriptor => params[:descriptor] } )
+        params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights["openbabel"]) if params[:task]
+      end
+
+
+      # # # joelib (via rjb)
+      if !params[:lib] || params[:lib].split(",").include?("joelib")
+        jl_master, jl_ids = get_jl_descriptors( { :compounds => compounds, :rjb => params[:rjb], :pc_type => params[:pc_type], :descriptor => params[:descriptor] } )
+        params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights["joelib"]) if params[:task]
+      end
+
+
+      # # # cdk (via REST)
+      if !params[:lib] || params[:lib].split(",").include?("cdk")
+        ambit_result_uri, smiles_to_inchi, cdk_ids = get_cdk_descriptors( { :compounds => compounds, :pc_type => params[:pc_type], :task => params[:task], :step => task_weights["cdk"], :descriptor => params[:descriptor] } )
+        #LOGGER.debug "Ambit result uri for #{params.inspect}: '#{ambit_result_uri.to_yaml}'"
+        cdk_master, cdk_ids, ambit_ids = load_ds_csv(ambit_result_uri, smiles_to_inchi, cdk_ids)
+        params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights["load"]) if params[:task]
+      end
+
+      # # # fuse CSVs ("master" structures)
+      if jl_master && cdk_master
+        nr_cols = (jl_master[0].size)-1
+        LOGGER.debug "Merging #{nr_cols} new columns"
+        cdk_master.each {|row| nr_cols.times { row.push(nil) } }
+        jl_master.each do |row|
+          temp = cdk_master.assoc(row[0]) # finds the appropriate row in master
+          ((-1*nr_cols)..-1).collect.each { |idx|
+            temp[idx] = row[nr_cols+idx+1] if temp # updates columns if row is found
+          }
+        end
+        master = cdk_master
+      else # either jl_master or cdk_master is nil
+        master = jl_master || cdk_master
+      end
+
+      if ob_master && master
+        nr_cols = (ob_master[0].size)-1
+        LOGGER.debug "Merging #{nr_cols} new columns"
+        master.each {|row| nr_cols.times { row.push(nil) } } # adds empty columns to all rows
+        ob_master.each do |row|
+          temp = master.assoc(row[0]) # finds the appropriate row in master
+          ((-1*nr_cols)..-1).collect.each { |idx|
+            temp[idx] = row[nr_cols+idx+1] if temp # updates columns if row is found
+          }
+        end
+      else # either ob_master or master is nil
+        master = ob_master || master
+      end
+
+      if master
+
+        ds = OpenTox::Dataset.find(
+          OpenTox::RestClientWrapper.post(
+            File.join(CONFIG[:services]["opentox-dataset"]), master.collect { |row| row.join(",") }.join("\n"), {:content_type => "text/csv", :subjectid => params[:subjectid]}
+          ), params[:subjectid]
+        )
+
+        # # # add feature metadata
+        pc_descriptors = YAML::load_file(@keysfile)
+        ambit_ids && ambit_ids.each_with_index { |id,idx|
+          raise "Feature not found" if ! ds.features[File.join(ds.uri, "feature", id.to_s)]
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s), {DC.description => "#{pc_descriptors[cdk_ids[idx]][:name]} [#{pc_descriptors[cdk_ids[idx]][:pc_type]}, #{pc_descriptors[cdk_ids[idx]][:lib]}]"})
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s), {DC.creator => @ambit_descriptor_algorithm_uri + cdk_ids[idx]})
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s), {OT.hasSource => params[:dataset_uri]})
+        }
+        ob_ids && ob_ids.each { |id|
+          raise "Feature not found" if ! ds.features[File.join(ds.uri, "feature", id.to_s)]
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s), {DC.description => "#{pc_descriptors[id][:name]} [#{pc_descriptors[id][:pc_type]}, #{pc_descriptors[id][:lib]}]"})
+          creator_uri = ds.uri.gsub(/\/dataset\/.*/, "/algorithm/pc")
+          creator_uri += "/#{id}" if params[:add_uri]
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s), {DC.creator => creator_uri})
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s), {OT.hasSource => params[:dataset_uri]})
+        }
+        jl_ids && jl_ids.each { |id|
+          raise "Feature not found" if ! ds.features[File.join(ds.uri, "feature", id.to_s)]
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s), {DC.description => "#{pc_descriptors[id][:name]} [#{pc_descriptors[id][:pc_type]}, #{pc_descriptors[id][:lib]}]"})
+          creator_uri = ds.uri.gsub(/\/dataset\/.*/, "/algorithm/pc")
+          creator_uri += "/#{id}" if params[:add_uri]
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s), {DC.creator => creator_uri})
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s), {OT.hasSource => params[:dataset_uri]})
+        }
+
+        ds.save(params[:subjectid])
+      else
+        raise OpenTox::BadRequestError.new "No descriptors matching your criteria found."
+      end
+
+    end
+
+
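
The task weights drive progress reporting: each enabled backend receives a share of the task's percentage proportional to its typical runtime. The normalization in isolation, with the same illustrative weights:

    # Disabled steps would be deleted first; the rest are normalized to 100.
    task_weights = { "joelib" => 20, "openbabel" => 1, "cdk" => 50, "load" => 10 }
    task_sum = task_weights.values.inject(0.0) { |s, w| s + w }
    task_weights.each_key { |k| task_weights[k] = (task_weights[k] / task_sum * 100).floor }
    # => {"joelib"=>24, "openbabel"=>1, "cdk"=>61, "load"=>12}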
+    # Calculate OpenBabel physico-chemical descriptors.
+    # @param[Hash] required: :compounds, :pc_type, :task; optional: :descriptor
+    # @return[Array] CSV, array of field ids, array of field descriptions
+    def self.get_ob_descriptors(params)
+
+      master = nil
+
       begin
-        ds = OpenTox::Dataset.find(params[:dataset_uri])
-        compounds = ds.compounds.collect
-        ambit_result_uri, smiles_to_inchi = get_pc_descriptors( { :compounds => compounds, :pc_type => params[:pc_type] } )
-        #ambit_result_uri = ["http://apps.ideaconsult.net:8080/ambit2/dataset/987103?" ,"feature_uris[]=http%3A%2F%2Fapps.ideaconsult.net%3A8080%2Fambit2%2Ffeature%2F4276789&", "feature_uris[]=http%3A%2F%2Fapps.ideaconsult.net%3A8080%2Fambit2%2Fmodel%2F16%2Fpredicted"] # for testing
-        LOGGER.debug "Ambit result uri for #{params.inspect}: '#{ambit_result_uri.to_yaml}'"
-        load_ds_csv(ambit_result_uri, smiles_to_inchi)
+        csvfile = Tempfile.open(['ob_descriptors-','.csv'])
+
+        pc_descriptors = YAML::load_file(@keysfile)
+        ids = pc_descriptors.collect{ |id, info|
+          id if info[:lib] == "openbabel" && params[:pc_type].split(",").include?(info[:pc_type]) && (!params[:descriptor] || id == params[:descriptor])
+        }.compact
+
+        if ids.length > 0
+          csvfile.puts((["SMILES"] + ids).join(","))
+
+          # remember inchis
+          inchis = params[:compounds].collect { |c_uri|
+            URI.encode_www_form_component(OpenTox::Compound.new(c_uri).to_inchi)
+          }
+
+          # process compounds
+          obmol = OpenBabel::OBMol.new
+          obconversion = OpenBabel::OBConversion.new
+          obconversion.set_in_and_out_formats 'inchi', 'can'
+
+          inchis.each_with_index { |inchi, c_idx|
+            row = [inchis[c_idx]]
+            obconversion.read_string(obmol, URI.decode_www_form_component(inchi))
+            ids.each { |name|
+              if obmol.respond_to?(name.underscore)
+                val = eval("obmol.#{name.underscore}") if obmol.respond_to?(name.underscore)
+              else
+                if name != "nF" && name != "spinMult" && name != "nHal" && name != "logP"
+                  val = OpenBabel::OBDescriptor.find_type(name.underscore).predict(obmol)
+                elsif name == "nF"
+                  val = OpenBabel::OBDescriptor.find_type("nf").predict(obmol)
+                elsif name == "spinMult" || name == "nHal" || name == "logP"
+                  val = OpenBabel::OBDescriptor.find_type(name).predict(obmol)
+                end
+              end
+              if OpenTox::Algorithm.numeric?(val)
+                val = Float(val)
+                val = nil if val.nan?
+                val = nil if (val && val.infinite?)
+              end
+              row << val
+            }
+            LOGGER.debug "Compound #{c_idx+1} (#{inchis.size}), #{row.size} entries"
+            csvfile.puts(row.join(","))
+            csvfile.flush
+          }
+          master = CSV::parse(File.open(csvfile.path, "rb").read)
+        end
+
       rescue Exception => e
         LOGGER.debug "#{e.class}: #{e.message}"
         LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+      ensure
+        csvfile.close!
       end
 
+      [ master, ids ]
+
     end
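
For reference, the OpenBabel call pattern used above in a minimal standalone form (assumes the openbabel gem with its SWIG bindings; the SMILES input and descriptor name are illustrative):

    require 'openbabel'

    obmol = OpenBabel::OBMol.new
    obconversion = OpenBabel::OBConversion.new
    obconversion.set_in_format 'smi'
    obconversion.read_string(obmol, 'c1ccccc1O') # phenol

    puts obmol.get_mol_wt                                          # method on OBMol itself
    puts OpenBabel::OBDescriptor.find_type('logP').predict(obmol)  # plugin descriptor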
-
-    # Calculates PC descriptors via Ambit -- DO NOT OVERLOAD Ambit.
-    # @param[Hash] Required keys: :compounds, :pc_type
-    # @return[Array] Ambit result uri, piecewise (1st: base, 2nd: SMILES, 3rd+: features)
-    def self.get_pc_descriptors(params)
 
+
+
+    # Calculate Joelib2 physico-chemical descriptors.
+    # @param[Hash] required: :compounds, :pc_type, :task; optional: :descriptor
+    # @return[Array] CSV, array of field ids, array of field descriptions
+    def self.get_jl_descriptors(params)
+
+      master = nil
+      s = params[:rjb]; raise "No Java environment" unless s
+
+      # load keys, enter CSV headers
       begin
+        csvfile = Tempfile.open(['jl_descriptors-','.csv'])
 
-        ambit_ds_service_uri = "http://apps.ideaconsult.net:8080/ambit2/dataset/"
-        ambit_mopac_model_uri = "http://apps.ideaconsult.net:8080/ambit2/model/69632"
-        descs = YAML::load_file( File.join(ENV['HOME'], ".opentox", "config", "ambit_descriptors.yaml") )
-        descs_uris = []
-        params[:pc_type] = "electronic,cpsa" if params[:pc_type].nil? # rescue missing pc_type
-        types = params[:pc_type].split(",")
-        descs.each { |uri, cat_name|
-          if types.include? cat_name[:category]
-            descs_uris << uri
-          end
-        }
-        if descs_uris.size == 0
-          raise "Error! Empty set of descriptors. Did you supply one of [geometrical, topological, electronic, constitutional, hybrid, cpsa]?"
+        pc_descriptors = YAML::load_file(@keysfile)
+        ids = pc_descriptors.collect{ |id, info|
+          id if info[:lib] == "joelib" && params[:pc_type].split(",").include?(info[:pc_type]) && (!params[:descriptor] || id == params[:descriptor])
+        }.compact
+
+
+        if ids.length > 0
+          csvfile.puts((["SMILES"] + ids).join(","))
+
+          # remember inchis
+          inchis = params[:compounds].collect { |c_uri|
+            cmpd = OpenTox::Compound.new(c_uri)
+            URI.encode_www_form_component(cmpd.to_inchi)
+          }
+
+          # process compounds
+          params[:compounds].each_with_index { |c_uri, c_idx|
+            cmpd = OpenTox::Compound.new(c_uri)
+            inchi = cmpd.to_inchi
+            sdf_data = cmpd.to_sdf
+
+            infile = Tempfile.open(['jl_descriptors-in-','.sdf'])
+            outfile_path = infile.path.gsub(/jl_descriptors-in/, "jl_descriptors-out")
+
+            begin
+              infile.puts sdf_data
+              infile.flush
+              s.new(infile.path, outfile_path) # runs joelib
+
+              row = [inchis[c_idx]]
+              ids.each_with_index do |k,i| # fill row
+                re = Regexp.new(k)
+                open(outfile_path) do |f|
+                  f.each do |line|
+                    if @prev == k
+                      entry = line.chomp
+                      val = nil
+                      if OpenTox::Algorithm.numeric?(entry)
+                        val = Float(entry)
+                        val = nil if val.nan?
+                        val = nil if (val && val.infinite?)
+                      end
+                      row << val
+                      break
+                    end
+                    @prev = line.gsub(/^.*types./,"").gsub(/count./,"").gsub(/>/,"").chomp if line =~ re
+                  end
+                end
+              end
+              LOGGER.debug "Compound #{c_idx+1} (#{inchis.size}), #{row.size} entries"
+              csvfile.puts(row.join(","))
+              csvfile.flush
+
+            rescue Exception => e
+              LOGGER.debug "#{e.class}: #{e.message}"
+              LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+            ensure
+              File.delete(infile.path.gsub(/\.sdf/, ".numeric.sdf"))
+              File.delete(outfile_path)
+              infile.close!
+            end
+          }
+          master = CSV::parse(File.open(csvfile.path, "rb").read)
         end
-        #LOGGER.debug "Ambit descriptor URIs: #{descs_uris.join(", ")}"
 
+      rescue Exception => e
+        LOGGER.debug "#{e.class}: #{e.message}"
+        LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+      ensure
+        [ csvfile ].each { |f| f.close! }
+      end
+
+      [ master, ids ]
+
+    end
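
The scanner above exploits the SD-file property format written by JOELib: a tag line introduces each descriptor, and its value follows on the next line. The shape of that format in a toy parser (the tag names are hypothetical):

    # An SD-file property block is a "> <name>" line followed by the value line.
    lines = ["> <joelib2.feature.types.logP>", "1.46",
             "> <joelib2.feature.types.mp>",   "41.0"]
    want = "logP"
    val = nil
    lines.each_with_index { |line, i| val = lines[i + 1] if line =~ /#{want}>/ }
    puts val # => "1.46"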
+
+    # Calculate CDK physico-chemical descriptors via Ambit -- DO NOT OVERLOAD Ambit.
+    # @param[Hash] required: :compounds, :pc_type, :task, :step; optional: :descriptor
+    # @return[Array] array of Ambit result uris, piecewise (1st: base, 2nd: SMILES, 3rd+: features), hash smiles to inchi, array of field descriptions
+    def self.get_cdk_descriptors(params)
+
+      ambit_result_uri = [] # 1st pos: base uri, then features
+      smiles_to_inchi = {}
+      task_weights = { "electronic" => 4, "topological" => 19, "constitutional" => 12, "geometrical" => 3, "hybrid" => 2, "cpsa" => 1 }
+      task_weights.keys.each { |pc_type| task_weights.delete(pc_type) if (params[:pc_type] && (!params[:pc_type].split(",").include?(pc_type))) }
+      task_sum = Float task_weights.values.sum
+      task_weights.keys.each { |pc_type| task_weights[pc_type] /= task_sum }
+      task_weights.keys.each { |pc_type| task_weights[pc_type] *= params[:step] }
+
+
+      # extract wanted descriptors from config file and parameters
+      pc_descriptors = YAML::load_file(@keysfile)
+
+      ids = pc_descriptors.collect { |id, info|
+        "#{info[:pc_type]}:::#{id}" if info[:lib] == "cdk" && params[:pc_type].split(",").include?(info[:pc_type]) && (!params[:descriptor] || id == params[:descriptor])
+      }.compact
+
+      if ids.size > 0
+        ids.sort!
+        ids.collect! { |id| id.split(":::").last }
+
+        # create dataset at Ambit
         begin
-          # Create SMI
-          smiles_array = []; smiles_to_inchi = {}
           params[:compounds].each do |n|
             cmpd = OpenTox::Compound.new(n)
             smiles_string = cmpd.to_smiles
             smiles_to_inchi[smiles_string] = URI.encode_www_form_component(cmpd.to_inchi)
-            smiles_array << smiles_string
           end
-          smi_file = Tempfile.open(['pc_ambit', '.csv'])
-          pc_descriptors = nil
-
-          # Create Ambit dataset
-          smi_file.puts( "SMILES\n" )
-          smi_file.puts( smiles_array.join("\n") )
-          smi_file.flush
-          ambit_ds_uri = OpenTox::RestClientWrapper.post(ambit_ds_service_uri, {:file => File.new(smi_file.path)}, {:content_type => "multipart/form-data", :accept => "text/uri-list"} )
+          smi_file = Tempfile.open(['pc_ambit', '.csv']) ; smi_file.puts( "SMILES\n" + smiles_to_inchi.keys.join("\n") ) ; smi_file.flush
+          ambit_ds_uri = OpenTox::RestClientWrapper.post(@ambit_ds_service_uri, {:file => File.new(smi_file.path)}, {:content_type => "multipart/form-data", :accept => "text/uri-list"} )
+          ambit_result_uri = [ ambit_ds_uri + "?" ] # 1st pos: base uri, then features
        rescue Exception => e
          LOGGER.debug "#{e.class}: #{e.message}"
          LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
        ensure
          smi_file.close! if smi_file
        end
-        ambit_smiles_uri = OpenTox::RestClientWrapper.get(ambit_ds_uri + "/features", {:accept=> "text/uri-list"} ).chomp
-
-        # Calculate 3D for CPSA
-        if types.include? "cpsa"
-          ambit_ds_mopac_uri = OpenTox::RestClientWrapper.post(ambit_mopac_model_uri, {:dataset_uri => ambit_ds_uri}, {:accept => "text/uri-list"} )
-          LOGGER.debug "MOPAC dataset: #{ambit_ds_mopac_uri}"
-        end
-
-        # Get Ambit results
-        ambit_result_uri = [] # 1st pos: base uri, then features
-        ambit_result_uri << ambit_ds_uri + "?"
+        # get SMILES feature URI
+        ambit_smiles_uri = OpenTox::RestClientWrapper.get(
+          ambit_ds_uri + "/features",
+          {:accept => "text/uri-list"}
+        ).chomp
        ambit_result_uri << ("feature_uris[]=" + URI.encode_www_form_component(ambit_smiles_uri) + "&")
-        descs_uris.each_with_index do |uri, i|
-          algorithm = Algorithm::Generic.new(uri)
+        # always calculate 3D (http://goo.gl/Tk81j), then get results
+        OpenTox::RestClientWrapper.post(
+          @ambit_mopac_model_uri,
+          {:dataset_uri => ambit_ds_uri},
+          {:accept => "text/uri-list"}
+        )
+        current_cat = ""
+        ids.each_with_index do |id, i|
+          old_cat = current_cat; current_cat = pc_descriptors[id][:pc_type]
+          params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights[old_cat]) if params[:task] && old_cat != current_cat && old_cat != ""
+          algorithm = Algorithm::Generic.new(@ambit_descriptor_algorithm_uri+id)
          result_uri = algorithm.run({:dataset_uri => ambit_ds_uri})
          ambit_result_uri << result_uri.split("?")[1] + "&"
-          LOGGER.debug "Ambit (#{descs_uris.size}): #{i+1}"
+          LOGGER.debug "Ambit (#{ids.size}): #{i+1}"
        end
+        params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights[current_cat]) if params[:task]
        #LOGGER.debug "Ambit result: #{ambit_result_uri.join('')}"
-        [ ambit_result_uri, smiles_to_inchi ]
-
-      rescue Exception => e
-        LOGGER.debug "#{e.class}: #{e.message}"
-        LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
      end
+
+      [ ambit_result_uri, smiles_to_inchi, ids ]
+
    end
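
load_ds_csv consumes the piecewise URI assembled here: element 0 is the dataset base URI, and every following element is a pre-encoded feature_uris[] query fragment. Its shape, with illustrative values:

    ambit_result_uri = [
      "http://host/ambit2/dataset/42?",                   # base
      "feature_uris[]=http%3A%2F%2Fhost%2Ffeature%2F1&",  # SMILES feature
      "feature_uris[]=http%3A%2F%2Fhost%2Ffeature%2F2&"   # one per descriptor run
    ]
    per_feature_page = ambit_result_uri[0] + ambit_result_uri[2] # fetched per index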
 
 
     # Load dataset via CSV
     # @param[Array] Ambit result uri, piecewise (1st: base, 2nd: SMILES, 3rd+: features)
-    # @return[String] dataset uri
-    def self.load_ds_csv(ambit_result_uri, smiles_to_inchi, subjectid=nil)
+    # @param[Hash] keys: SMILES, values: InChIs
+    # @param[Array] field descriptions, one for each feature
+    # @return[Array] CSV, array of field ids, array of field descriptions
+    def self.load_ds_csv(ambit_result_uri, smiles_to_inchi, single_ids, subjectid=nil)
 
       master = nil
-      (1...ambit_result_uri.size).collect { |idx|
-        curr_uri = ambit_result_uri[0] + ambit_result_uri[idx]
-        LOGGER.debug "Requesting #{curr_uri}"
-        csv_data = CSV.parse( OpenTox::RestClientWrapper.get(curr_uri, {:accept => "text/csv", :subjectid => subjectid}) )
-        if csv_data[0] && csv_data[0].size>1
-          if master.nil? # this is the SMILES entry
-            (1...csv_data.size).each{ |idx| csv_data[idx][1] = smiles_to_inchi[csv_data[idx][1]] }
-            master = csv_data
-            next
-          else
-            index_uri = csv_data[0].index("SMILES")
-            csv_data.map {|i| i.delete_at(index_uri)} if index_uri # removes additional SMILES information
-
-            nr_cols = (csv_data[0].size)-1
-            LOGGER.debug "Merging #{nr_cols} new columns"
-            master.each {|row| nr_cols.times { row.push(nil) } } # adds empty columns to all rows
-            csv_data.each do |row|
-              temp = master.assoc(row[0]) # finds the appropriate row in master
-              ((-1*nr_cols)..-1).collect.each { |idx|
-                temp[idx] = row[nr_cols+idx+1] if temp # updates columns if row is found
-              }
+      ids = []
+      ambit_ids = []
+
+      if ambit_result_uri.size > 0
+        (1...ambit_result_uri.size).collect { |idx|
+          curr_uri = ambit_result_uri[0] + ambit_result_uri[idx]
+          #LOGGER.debug "Requesting #{curr_uri}"
+          csv_data = CSV.parse( OpenTox::RestClientWrapper.get(curr_uri, {:accept => "text/csv", :subjectid => subjectid}) )
+          if csv_data[0] && csv_data[0].size>1
+            if master.nil? # this is the SMILES entry
+              (1...csv_data.size).each{ |idx| csv_data[idx][1] = smiles_to_inchi[csv_data[idx][1]] }
+              master = csv_data
+              next
+            else
+              index_uri = csv_data[0].index("SMILES")
+              csv_data.map {|i| i.delete_at(index_uri)} if index_uri # removes additional SMILES information
+
+              nr_cols = (csv_data[0].size)-1
+              LOGGER.debug "Merging #{nr_cols} new columns"
+              ids += Array.new(nr_cols, single_ids[idx-2])
+              master.each {|row| nr_cols.times { row.push(nil) } } # adds empty columns to all rows
+              csv_data.each do |row|
+                temp = master.assoc(row[0]) # finds the appropriate row in master
+                ((-1*nr_cols)..-1).collect.each { |idx|
+                  temp[idx] = row[nr_cols+idx+1] if temp # updates columns if row is found
+                }
+              end
             end
           end
-        end
-      }
+        }
 
-      index_uri = master[0].index("Compound")
-      master.map {|i| i.delete_at(index_uri)}
-      master[0].each {|cell| cell.chomp!(" ")}
-      master[0][0] = "Compound" #"SMILES"
-      index_smi = master[0].index("SMILES")
-      master.map {|i| i.delete_at(index_smi)} if index_smi
-      #master[0][0] = "SMILES"
+        index_uri = master[0].index("Compound")
+        master.map {|i| i.delete_at(index_uri)}
+        master[0].each {|cell| cell.chomp!(" ")}
+        master[0][0] = "Compound" #"SMILES"
+        index_smi = master[0].index("SMILES")
+        master.map {|i| i.delete_at(index_smi)} if index_smi
+        master[0][0] = "SMILES"
+        ambit_ids = master[0].collect {|header| header.to_s.gsub(/[\/.\\\(\)\{\}\[\]]/,"_")}
+        ambit_ids.shift
+      end
 
       #LOGGER.debug "-------- AM: Writing to dumpfile"
       #File.open("/tmp/test.csv", 'w') {|f| f.write( master.collect {|r| r.join(",")}.join("\n") ) }
 
-      parser = OpenTox::Parser::Spreadsheets.new
-      ds = OpenTox::Dataset.new(nil,subjectid)
-      ds.save(subjectid)
-      parser.dataset = ds
-      ds = parser.load_csv(master.collect{|r| r.join(",")}.join("\n"))
-      ds.save(subjectid)
+      [ master, ids, ambit_ids ]
+
     end
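
The assoc-based merge used throughout keys every row by its first element, pads the new columns with nil, and fills them where the key matches. A self-contained miniature:

    # master gains column "b" from extra; rows are matched on their first cell.
    master = [["SMILES", "a"], ["C", 1], ["O", 2]]
    extra  = [["SMILES", "b"], ["O", 9]]
    nr_cols = extra[0].size - 1
    master.each { |row| nr_cols.times { row.push(nil) } }
    extra.each do |row|
      temp = master.assoc(row[0])                 # find matching row by key
      ((-1*nr_cols)..-1).each { |idx| temp[idx] = row[nr_cols + idx + 1] if temp }
    end
    # master => [["SMILES", "a", "b"], ["C", 1, nil], ["O", 2, 9]]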
 
@@ -208,8 +467,8 @@ module OpenTox
     end
 
 
-    # Effect calculation for classification
-    # @param [Array] Array of occurrences per class in the form of Enumerables.
+    # Effect calculation for classification. It is assumed that the elements of the arrays match each other pairwise.
+    # @param [Array] Array of occurrences per class (in the form of Enumerables).
     # @param [Array] Array of database instance counts per class.
     def self.effect(occurrences, db_instances)
       max = 0