opentox-ruby 3.1.0 → 4.0.0

This diff shows the changes between publicly released versions of the package, as published to their public registries. It is provided for informational purposes only.
@@ -459,32 +459,80 @@ module OpenTox
  def initialize(dataset)
  @rows = []
  @rows << ["SMILES"]
+
  features = dataset.features.keys
- @rows.first << features
+
+ # prepare for subgraphs
+ have_substructures = features.collect{ |id| dataset.features[id][RDF.type].include? OT.Substructure}.compact.uniq
+ if have_substructures.size == 1 && have_substructures[0]
+ features_smarts = features.collect{ |id| "'" + dataset.features[id][OT.smarts] + "'" }
+ end
+
+ # gather missing features
+ delete_features = []
+ features.each{ |id|
+ dataset.features[id][RDF.type].each { |typestr|
+ if typestr.include? "MissingFeature"
+ delete_features << id
+ end
+ }
+ }
+ features = features - delete_features
+
+ # detect nr duplicates per compound
+ compound_sizes = {}
+ dataset.compounds.each do |compound|
+ entries=dataset.data_entries[compound]
+ if entries
+ entries.each do |feature, values|
+ compound_sizes[compound] || compound_sizes[compound] = []
+ compound_sizes[compound] << values.size
+ end
+ compound_sizes[compound].uniq!
+ raise "Inappropriate data for CSV export for compound #{compound}" if compound_sizes[compound].size > 1
+ compound_sizes[compound] = compound_sizes[compound][0] # integer instead of array
+ end
+ end
+
+ # get headers
+ features_smarts && @rows.first << features_smarts || @rows.first << features
  @rows.first.flatten!
- dataset.data_entries.each do |compound,entries|
- cmpd = Compound.new(compound)
- smiles = cmpd.to_smiles
- inchi = URI.encode_www_form_component(cmpd.to_inchi)
- row_container = Array.new
- row = Array.new(@rows.first.size)
- row_container << row
- #row[0] = smiles
- row[0] = inchi
- entries.each do |feature, values|
- i = features.index(feature)+1
- values.each do |value|
- if row_container[0][i]
- #LOGGER.debug "Feature '#{feature}' (nr '#{i}'): '#{value}'"
- row_container << row_container.last.collect
- row_container.last[i] = value
- #LOGGER.debug "RC: #{row_container.to_yaml}"
- else
- row_container.each { |r| r[i] = value }
- end
+
+ # feature positions pre-calculated
+ feature_positions = features.inject({}) { |h,f|
+ h.merge!({f => features.index(f)+1}) # +1 due to ID
+ h
+ }
+
+ # serialize to csv
+ dataset.compounds.each do |compound|
+ entries=dataset.data_entries[compound]
+ if entries
+ inchi = URI.encode_www_form_component(Compound.new(compound).to_inchi)
+
+ # allocate container
+ row_container = Array.new(compound_sizes[compound])
+ (0...row_container.size).each do |i|
+ row_container[i] = Array.new(@rows.first.size)
+ row_container[i][0] = inchi
+ end
+
+ # fill entries
+ entries.each { |feature, values|
+ (0...compound_sizes[compound]).each { |i|
+ row_container[i][feature_positions[feature]] = values[i]
+ }
+ }
+
+ # fill zeroes for subgraphs
+ if (features_smarts)
+ row_container.collect! { |row|
+ row.collect! { |x| x ? x : 0 }
+ }
  end
+ row_container.each { |row| @rows << row }
+
  end
- row_container.each { |r| @rows << r }
  end
  end
 
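Note on the hunk above: the rewritten serializer first checks that every feature of a compound carries the same number of measurements (`compound_sizes`), then allocates one CSV row per duplicate measurement and writes each value into a pre-computed column. A minimal standalone sketch of that strategy (all names and values are illustrative, not the gem's API):

```ruby
# One CSV row per duplicate measurement; the column index comes from a
# pre-computed feature -> position map (+1 because column 0 is the ID).
feature_positions = { "f1" => 1, "f2" => 2 }
entries = { "f1" => [0.5, 0.7], "f2" => [1.0, 2.0] }  # two measurements each
n_dups  = entries.values.map(&:size).uniq.first       # => 2 rows for this compound

rows = Array.new(n_dups) { |i| r = Array.new(3); r[0] = "InChI..."; r }
entries.each do |feature, values|
  n_dups.times { |i| rows[i][feature_positions[feature]] = values[i] }
end
# rows => [["InChI...", 0.5, 1.0], ["InChI...", 0.7, 2.0]]
```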
@@ -1,4 +1,13 @@
 
+ round_it <- function( x )
+ {
+ if(isTRUE((x - floor(x))>=0.5))
+ ceiling(x)
+ else
+ floor(x)
+ }
+
+
  nominal_to_binary <- function( data )
  {
  result = NULL
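The new `round_it` helper appears to exist because R's built-in `round()` rounds half to even (banker's rounding), so `round(2.5)` yields 2; the split code below needs deterministic half-up rounding. The same rule in Ruby, for comparison:

```ruby
# Half-up rounding, mirroring round_it above; R's round(2.5) would give 2.
def round_half_up(x)
  (x - x.floor) >= 0.5 ? x.ceil : x.floor
end

round_half_up(2.5)  # => 3
round_half_up(2.4)  # => 2
```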
@@ -41,9 +50,13 @@ nominal_to_binary <- function( data )
  result
  }
 
- process_data <- function( data )
+ process_data <- function( data, colnames=NULL )
  {
  data.num <- as.data.frame(data)
+ if (!is.null(colnames))
+ {
+ data.num = subset(data.num, select = colnames)
+ }
  if (!is.numeric(data.num))
  {
  data.num = nominal_to_binary(data.num)
@@ -72,14 +85,15 @@ cluster <- function( data, min=10, max=15 )
  cbind(s$partition[,m])
  }
 
- stratified_split <- function( data, ratio=0.3, method="cluster" )
+ stratified_split <- function( data, ratio=0.3, method="cluster", colnames=NULL )
  {
- data.processed = as.matrix(process_data( data ))
+ data.processed = as.matrix(process_data( data, colnames ))
+ print(paste("split using #features: ",ncol(data.processed)))
  if (method == "samplecube")
  {
  require("sampling")
  # adjust ratio to make samplecube return exact number of samples
- ratio = round(nrow(data.processed)*ratio)/nrow(data.processed)
+ ratio = round_it(nrow(data.processed)*ratio)/nrow(data.processed)
  pik = rep(ratio,times=nrow(data.processed))
  data.strat = cbind(pik,data.processed)
  samplecube(data.strat,pik,order=2,comment=F)
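The ratio adjustment above snaps the requested split ratio to one that corresponds to a whole number of rows, so that `samplecube` can return exactly that many samples. Illustrative arithmetic (the numbers are made up):

```ruby
# 103 rows at ratio 0.3 would ask for 30.9 samples; snap to 31/103 instead.
n, ratio = 103, 0.3
target   = (n * ratio).round  # => 31 samples
adjusted = target.to_f / n    # => 0.3009...; the inclusion probabilities sum to 31
```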
@@ -101,10 +115,11 @@ stratified_split <- function( data, ratio=0.3, method="cluster" )
  stop("unknown method")
  }
 
- stratified_k_fold_split <- function( data, num_folds=10, method="cluster" )
+ stratified_k_fold_split <- function( data, num_folds=10, method="cluster", colnames=NULL )
  {
  print(paste(num_folds,"-fold-split, data-size",nrow(data)))
- data.processed = as.matrix(process_data( data ))
+ data.processed = as.matrix(process_data( data, colnames ))
+ print(paste("split using #features: ",ncol(data.processed)))
  if (method == "samplecube")
  {
  folds = rep(0, times=nrow(data))
@@ -133,7 +148,7 @@ stratified_k_fold_split <- function( data, num_folds=10, method="cluster" )
  {
  require("TunePareto")
  cl = cluster(data.processed)
- res = generateCVRuns(cl,ntimes=1,nfold=3)
+ res = generateCVRuns(cl,ntimes=1,nfold=num_folds)
  folds = rep(0, times=nrow(data))
  for (i in 1:num_folds)
  for(j in 1:length(res[[1]][[i]]))
@@ -144,6 +159,50 @@ stratified_k_fold_split <- function( data, num_folds=10, method="cluster" )
  stop("unknown method")
  }
 
+ duplicate_indices <- function( data ) {
+ indices = 1:nrow(data)
+ z = data
+ duplicate_index = anyDuplicated(z)
+ while(duplicate_index) {
+ duplicate_to_index = anyDuplicated(z[1:duplicate_index,],fromLast=T)
+ #print(paste(duplicate_index,'is dupl to',duplicate_to_index))
+ indices[duplicate_index] <- duplicate_to_index
+ z[duplicate_index,] <- paste('123$§%',duplicate_index)
+ duplicate_index = anyDuplicated(z)
+ }
+ indices
+ }
+
+ add_duplicates <- function( data, dup_indices ) {
+ result = data[1,]
+ for(i in 2:length(dup_indices)) {
+ row = data[rownames(data)==dup_indices[i],]
+ if(length(row)==0)
+ stop(paste('index ',i,' dup-index ',dup_indices[i],'not found in data'))
+ result = rbind(result, row)
+ }
+ rownames(result)<-NULL
+ result
+ }
+
+ sammon_duplicates <- function( data, ... ) {
+ di <- duplicate_indices(data)
+ print(di)
+ u <- unique(data)
+ print(paste('unique data points',nrow(u),'of',nrow(data)))
+ if(nrow(u) <= 4) stop("number of unique datapoints <= 4")
+ points_unique <- sammon(dist(u), ...)$points
+ if (nrow(u)<nrow(data))
+ {
+ points <- add_duplicates(points_unique, di)
+ points
+ }
+ else
+ {
+ points_unique
+ }
+ }
+
  plot_pre_process <- function( data, method="pca" )
  {
  data.processed = process_data( data )
@@ -158,6 +217,11 @@ plot_pre_process <- function( data, method="pca" )
  data.emb <- smacofSym(dist(data.processed, method = "euclidean"), ndim=2, verbose=T)
  data.emb$conf
  }
+ else if (method == "sammon")
+ {
+ require("MASS")
+ sammon_duplicates(data.processed, k=2)
+ }
  else
  stop("unknown method")
  }
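`sammon_duplicates` works around the fact that Sammon mapping needs pairwise-distinct points: it embeds only the unique rows, then copies coordinates back to every duplicate via the index map from `duplicate_indices`. The bookkeeping idea, sketched standalone in Ruby (the `embedded` hash stands in for the actual `sammon()` call):

```ruby
# Map each row to the index of its first occurrence, embed unique rows only,
# then let duplicate rows share the coordinates of their representative.
data     = [[1, 2], [3, 4], [1, 2]]
indices  = data.map { |row| data.index(row) }              # => [0, 1, 0]
embedded = { [1, 2] => [0.1, 0.9], [3, 4] => [0.8, 0.2] }  # stand-in for sammon()
points   = indices.map { |i| embedded[data[i]] }           # duplicates share coords
```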
@@ -396,7 +396,7 @@ module OpenTox
  @q_prop = gsl_q_prop_orig.row(0).to_a
  end
 
- LOGGER.debug "F: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}"
+ LOGGER.debug "F: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}" if (@n_prop && @n_prop[0] && @q_prop)
  LOGGER.debug "Sims: #{@sims.size}, Acts: #{@acts.size}"
 
  @sims = [ gram_matrix, @sims ]
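The added guard only logs the matrix dimensions when the operands exist; interpolating `#{@n_prop[0].size}` with a nil or empty `@n_prop` would raise `NoMethodError` before anything is logged. The same pattern in isolation:

```ruby
# Guarded interpolation: the message is skipped instead of crashing on nil.
n_prop = nil
puts "F: #{n_prop.size}x#{n_prop[0].size}" if n_prop && n_prop[0]
```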
@@ -490,8 +490,10 @@ module OpenTox
 
  @cmpds = []; @fps = []; @acts = []; @n_prop = []; @q_prop = []
 
- @model.fingerprints.each { |fp|
- cmpd = fp[0]; fp = fp[1]
+ # Major BUG! Must loop over @model.compounds, hash is unordered!
+ # @model.fingerprints.each
+ @model.compounds.each { |cmpd|
+ fp = @model.fingerprints[cmpd]
  if @model.activities[cmpd] # row good
  acts = @model.activities[cmpd]; @acts += acts
  LOGGER.debug "#{acts.size} activities for '#{cmpd}'" if acts.size > 1
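The "Major BUG" fix matters on Ruby 1.8, where `Hash` preserves no insertion order: iterating `@model.fingerprints` could emit rows in an order unrelated to `@model.compounds`, silently misaligning rows with their activities. Driving the loop from the ordered array and using the hash purely as a lookup keeps everything aligned (hypothetical data):

```ruby
# Iterate the authoritative Array; treat the Hash as a lookup table only.
compounds    = ["c1", "c2", "c3"]                                  # defines row order
fingerprints = { "c2" => [0, 1], "c1" => [1, 0], "c3" => [1, 1] }  # arbitrary order
compounds.each do |cmpd|
  fp = fingerprints[cmpd]  # row i always belongs to compounds[i]
end
```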
@@ -1,155 +1,414 @@
  require 'csv'
+ require 'tempfile'
 
 
  module OpenTox
 
  module Algorithm
 
+ @ambit_descriptor_algorithm_uri = "http://apps.ideaconsult.net:8080/ambit2/algorithm/org.openscience.cdk.qsar.descriptors.molecular."
+ @ambit_ds_service_uri = "http://apps.ideaconsult.net:8080/ambit2/dataset/"
+ @ambit_mopac_model_uri = "http://apps.ideaconsult.net:8080/ambit2/model/69632"
+ @keysfile = File.join(ENV['HOME'], ".opentox", "config", "pc_descriptors.yaml")
+
  include OpenTox
 
  # Calculate physico-chemical descriptors.
- # @param[Hash] Required keys: :dataset_uri, :pc_type
+ # @param[Hash] required: :dataset_uri, :pc_type, :rjb, :task, :add_uri, optional: :descriptor, :lib, :subjectid
  # @return[String] dataset uri
-
  def self.pc_descriptors(params)
 
+ ds = OpenTox::Dataset.find(params[:dataset_uri],params[:subjectid])
+ compounds = ds.compounds.collect
+ task_weights = {"joelib"=> 20, "openbabel"=> 1, "cdk"=> 50 }
+ task_weights.keys.each { |step| task_weights.delete(step) if (params[:lib] && (!params[:lib].split(",").include?(step)))}
+ task_weights["load"] = 10
+ task_sum = Float task_weights.values.sum
+ task_weights.keys.each { |step| task_weights[step] /= task_sum }
+ task_weights.keys.each { |step| task_weights[step] = (task_weights[step]*100).floor }
+
+ jl_master=nil
+ cdk_master=nil
+ ob_master=nil
+
+
+ # # # openbabel (via ruby bindings)
+ if !params[:lib] || params[:lib].split(",").include?("openbabel")
+ ob_master, ob_ids = get_ob_descriptors( { :compounds => compounds, :pc_type => params[:pc_type], :descriptor => params[:descriptor] } )
+ params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights["openbabel"]) if params[:task]
+ end
+
+
+ # # # joelib (via rjb)
+ if !params[:lib] || params[:lib].split(",").include?("joelib")
+ jl_master, jl_ids = get_jl_descriptors( { :compounds => compounds, :rjb => params[:rjb], :pc_type => params[:pc_type], :descriptor => params[:descriptor] } )
+ params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights["joelib"]) if params[:task]
+ end
+
+
+ # # # cdk (via REST)
+ if !params[:lib] || params[:lib].split(",").include?("cdk")
+ ambit_result_uri, smiles_to_inchi, cdk_ids = get_cdk_descriptors( { :compounds => compounds, :pc_type => params[:pc_type], :task => params[:task], :step => task_weights["cdk"], :descriptor => params[:descriptor] } )
+ #LOGGER.debug "Ambit result uri for #{params.inspect}: '#{ambit_result_uri.to_yaml}'"
+ cdk_master, cdk_ids, ambit_ids = load_ds_csv(ambit_result_uri, smiles_to_inchi, cdk_ids )
+ params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights["load"]) if params[:task]
+ end
+
+ # # # fuse CSVs ("master" structures)
+ if jl_master && cdk_master
+ nr_cols = (jl_master[0].size)-1
+ LOGGER.debug "Merging #{nr_cols} new columns"
+ cdk_master.each {|row| nr_cols.times { row.push(nil) } }
+ jl_master.each do |row|
+ temp = cdk_master.assoc(row[0]) # Finds the appropriate line in master
+ ((-1*nr_cols)..-1).collect.each { |idx|
+ temp[idx] = row[nr_cols+idx+1] if temp # Updates columns if line is found
+ }
+ end
+ master = cdk_master
+ else # either jl_master or cdk_master nil
+ master = jl_master || cdk_master
+ end
+
+ if ob_master && master
+ nr_cols = (ob_master[0].size)-1
+ LOGGER.debug "Merging #{nr_cols} new columns"
+ master.each {|row| nr_cols.times { row.push(nil) } } # Adds empty columns to all rows
+ ob_master.each do |row|
+ temp = master.assoc(row[0]) # Finds the appropriate line in master
+ ((-1*nr_cols)..-1).collect.each { |idx|
+ temp[idx] = row[nr_cols+idx+1] if temp # Updates columns if line is found
+ }
+ end
+ else # either ob_master or master nil
+ master = ob_master || master
+ end
+
+ if master
+
+ ds = OpenTox::Dataset.find(
+ OpenTox::RestClientWrapper.post(
+ File.join(CONFIG[:services]["opentox-dataset"]), master.collect { |row| row.join(",") }.join("\n"), {:content_type => "text/csv", :subjectid => params[:subjectid]}
+ ),params[:subjectid]
+ )
+
+ # # # add feature metadata
+ pc_descriptors = YAML::load_file(@keysfile)
+ ambit_ids && ambit_ids.each_with_index { |id,idx|
+ raise "Feature not found" if ! ds.features[File.join(ds.uri, "feature", id.to_s)]
+ ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{DC.description => "#{pc_descriptors[cdk_ids[idx]][:name]} [#{pc_descriptors[cdk_ids[idx]][:pc_type]}, #{pc_descriptors[cdk_ids[idx]][:lib]}]"})
+ ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{DC.creator => @ambit_descriptor_algorithm_uri + cdk_ids[idx]})
+ ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{OT.hasSource => params[:dataset_uri]})
+ }
+ ob_ids && ob_ids.each { |id|
+ raise "Feature not found" if ! ds.features[File.join(ds.uri, "feature", id.to_s)]
+ ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{DC.description => "#{pc_descriptors[id][:name]} [#{pc_descriptors[id][:pc_type]}, #{pc_descriptors[id][:lib]}]"})
+ creator_uri = ds.uri.gsub(/\/dataset\/.*/, "/algorithm/pc")
+ creator_uri += "/#{id}" if params[:add_uri]
+ ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{DC.creator => creator_uri})
+ ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{OT.hasSource => params[:dataset_uri]})
+ }
+ jl_ids && jl_ids.each { |id|
+ raise "Feature not found" if ! ds.features[File.join(ds.uri, "feature", id.to_s)]
+ ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{DC.description => "#{pc_descriptors[id][:name]} [#{pc_descriptors[id][:pc_type]}, #{pc_descriptors[id][:lib]}]"})
+ creator_uri = ds.uri.gsub(/\/dataset\/.*/, "/algorithm/pc")
+ creator_uri += "/#{id}" if params[:add_uri]
+ ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{DC.creator => creator_uri})
+ ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{OT.hasSource => params[:dataset_uri]})
+ }
+
+ ds.save(params[:subjectid])
+ else
+ raise OpenTox::BadRequestError.new "No descriptors matching your criteria found."
+ end
+
+ end
+
+
+ # Calculate OpenBabel physico-chemical descriptors.
+ # @param[Hash] required: :compounds, :pc_type, :task, optional: :descriptor
+ # @return[Array] CSV, array of field ids, array of field descriptions
+ def self.get_ob_descriptors(params)
+
+ master = nil
+
  begin
- ds = OpenTox::Dataset.find(params[:dataset_uri])
- compounds = ds.compounds.collect
- ambit_result_uri, smiles_to_inchi = get_pc_descriptors( { :compounds => compounds, :pc_type => params[:pc_type] } )
- #ambit_result_uri = ["http://apps.ideaconsult.net:8080/ambit2/dataset/987103?" ,"feature_uris[]=http%3A%2F%2Fapps.ideaconsult.net%3A8080%2Fambit2%2Ffeature%2F4276789&", "feature_uris[]=http%3A%2F%2Fapps.ideaconsult.net%3A8080%2Fambit2%2Fmodel%2F16%2Fpredicted"] # for testing
- LOGGER.debug "Ambit result uri for #{params.inspect}: '#{ambit_result_uri.to_yaml}'"
- load_ds_csv(ambit_result_uri, smiles_to_inchi)
+ csvfile = Tempfile.open(['ob_descriptors-','.csv'])
+
+ pc_descriptors = YAML::load_file(@keysfile)
+ ids = pc_descriptors.collect{ |id, info|
+ id if info[:lib] == "openbabel" && params[:pc_type].split(",").include?(info[:pc_type]) && (!params[:descriptor] || id == params[:descriptor])
+ }.compact
+
+ if ids.length > 0
+ csvfile.puts((["SMILES"] + ids).join(","))
+
+ # remember inchis
+ inchis = params[:compounds].collect { |c_uri|
+ URI.encode_www_form_component(OpenTox::Compound.new(c_uri).to_inchi)
+ }
+
+ # Process compounds
+ obmol = OpenBabel::OBMol.new
+ obconversion = OpenBabel::OBConversion.new
+ obconversion.set_in_and_out_formats 'inchi', 'can'
+
+ inchis.each_with_index { |inchi, c_idx|
+ row = [inchis[c_idx]]
+ obconversion.read_string(obmol, URI.decode_www_form_component(inchi))
+ ids.each { |name|
+ if obmol.respond_to?(name.underscore)
+ val = eval("obmol.#{name.underscore}") if obmol.respond_to?(name.underscore)
+ else
+ if name != "nF" && name != "spinMult" && name != "nHal" && name != "logP"
+ val = OpenBabel::OBDescriptor.find_type(name.underscore).predict(obmol)
+ elsif name == "nF"
+ val = OpenBabel::OBDescriptor.find_type("nf").predict(obmol)
+ elsif name == "spinMult" || name == "nHal" || name == "logP"
+ val = OpenBabel::OBDescriptor.find_type(name).predict(obmol)
+ end
+ end
+ if OpenTox::Algorithm.numeric?(val)
+ val = Float(val)
+ val = nil if val.nan?
+ val = nil if (val && val.infinite?)
+ end
+ row << val
+ }
+ LOGGER.debug "Compound #{c_idx+1} (#{inchis.size}), #{row.size} entries"
+ csvfile.puts(row.join(","))
+ csvfile.flush
+ }
+ master = CSV::parse(File.open(csvfile.path, "rb").read)
+ end
+
  rescue Exception => e
  LOGGER.debug "#{e.class}: #{e.message}"
  LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ ensure
+ csvfile.close!
  end
 
+ [ master, ids ]
+
  end
-
- # Calculates PC descriptors via Ambit -- DO NOT OVERLOAD Ambit.
- # @param[Hash] Required keys: :compounds, :pc_type
- # @return[Array] Ambit result uri, piecewise (1st: base, 2nd: SMILES, 3rd+: features
- def self.get_pc_descriptors(params)
 
+
+
+ # Calculate Joelib2 physico-chemical descriptors.
+ # @param[Hash] required: :compounds, :pc_type, :task, optional: :descriptor
+ # @return[Array] CSV, array of field ids, array of field descriptions
+ def self.get_jl_descriptors(params)
+
+ master = nil
+ s = params[:rjb]; raise "No Java environment" unless s
+
+ # Load keys, enter CSV headers
  begin
+ csvfile = Tempfile.open(['jl_descriptors-','.csv'])
 
- ambit_ds_service_uri = "http://apps.ideaconsult.net:8080/ambit2/dataset/"
- ambit_mopac_model_uri = "http://apps.ideaconsult.net:8080/ambit2/model/69632"
- descs = YAML::load_file( File.join(ENV['HOME'], ".opentox", "config", "ambit_descriptors.yaml") )
- descs_uris = []
- params[:pc_type] = "electronic,cpsa" if params[:pc_type].nil? # rescue missing pc_type
- types = params[:pc_type].split(",")
- descs.each { |uri, cat_name|
- if types.include? cat_name[:category]
- descs_uris << uri
- end
- }
- if descs_uris.size == 0
- raise "Error! Empty set of descriptors. Did you supply one of [geometrical, topological, electronic, constitutional, hybrid, cpsa] ?"
+ pc_descriptors = YAML::load_file(@keysfile)
+ ids = pc_descriptors.collect{ |id, info|
+ id if info[:lib] == "joelib" && params[:pc_type].split(",").include?(info[:pc_type]) && (!params[:descriptor] || id == params[:descriptor])
+ }.compact
+
+
+ if ids.length > 0
+ csvfile.puts((["SMILES"] + ids).join(","))
+
+ # remember inchis
+ inchis = params[:compounds].collect { |c_uri|
+ cmpd = OpenTox::Compound.new(c_uri)
+ URI.encode_www_form_component(cmpd.to_inchi)
+ }
+
+ # Process compounds
+ params[:compounds].each_with_index { |c_uri, c_idx|
+ cmpd = OpenTox::Compound.new(c_uri)
+ inchi = cmpd.to_inchi
+ sdf_data = cmpd.to_sdf
+
+ infile = Tempfile.open(['jl_descriptors-in-','.sdf'])
+ outfile_path = infile.path.gsub(/jl_descriptors-in/,"jl_descriptors-out")
+
+ begin
+ infile.puts sdf_data
+ infile.flush
+ s.new(infile.path, outfile_path) # runs joelib
+
+ row = [inchis[c_idx]]
+ ids.each_with_index do |k,i| # Fill row
+ re = Regexp.new(k)
+ open(outfile_path) do |f|
+ f.each do |line|
+ if @prev == k
+ entry = line.chomp
+ val = nil
+ if OpenTox::Algorithm.numeric?(entry)
+ val = Float(entry)
+ val = nil if val.nan?
+ val = nil if (val && val.infinite?)
+ end
+ row << val
+ break
+ end
+ @prev = line.gsub(/^.*types./,"").gsub(/count./,"").gsub(/>/,"").chomp if line =~ re
+ end
+ end
+ end
+ LOGGER.debug "Compound #{c_idx+1} (#{inchis.size}), #{row.size} entries"
+ csvfile.puts(row.join(","))
+ csvfile.flush
+
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ ensure
+ File.delete(infile.path.gsub(/\.sdf/,".numeric.sdf"))
+ File.delete(outfile_path)
+ infile.close!
+ end
+ }
+ master = CSV::parse(File.open(csvfile.path, "rb").read)
  end
- #LOGGER.debug "Ambit descriptor URIs: #{descs_uris.join(", ")}"
 
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ ensure
+ [ csvfile].each { |f| f.close! }
+ end
+
+ [ master, ids ]
+
+ end
+
+ # Calculate CDK physico-chemical descriptors via Ambit -- DO NOT OVERLOAD Ambit.
+ # @param[Hash] required: :compounds, :pc_type, :task, :step, optional: :descriptor
+ # @return[Array] array of Ambit result uri, piecewise (1st: base, 2nd: SMILES, 3rd+: features), hash smiles to inchi, array of field descriptions
+ def self.get_cdk_descriptors(params)
+
+ ambit_result_uri = [] # 1st pos: base uri, then features
+ smiles_to_inchi = {}
+ task_weights = {"electronic"=> 4, "topological"=> 19, "constitutional"=> 12, "geometrical"=> 3, "hybrid"=> 2, "cpsa"=> 1 }
+ task_weights.keys.each { |pc_type| task_weights.delete(pc_type) if (params[:pc_type] && (!params[:pc_type].split(",").include?(pc_type)))}
+ task_sum = Float task_weights.values.sum
+ task_weights.keys.each { |pc_type| task_weights[pc_type] /= task_sum }
+ task_weights.keys.each { |pc_type| task_weights[pc_type] *= params[:step] }
+
+
+ # extract wanted descriptors from config file and parameters
+ pc_descriptors = YAML::load_file(@keysfile)
+
+ ids = pc_descriptors.collect { |id, info|
+ "#{info[:pc_type]}:::#{id}" if info[:lib] == "cdk" && params[:pc_type].split(",").include?(info[:pc_type]) && (!params[:descriptor] || id == params[:descriptor])
+ }.compact
+
+ if ids.size > 0
+ ids.sort!
+ ids.collect! { |id| id.split(":::").last }
+
+ # create dataset at Ambit
  begin
- # Create SMI
- smiles_array = []; smiles_to_inchi = {}
  params[:compounds].each do |n|
  cmpd = OpenTox::Compound.new(n)
  smiles_string = cmpd.to_smiles
  smiles_to_inchi[smiles_string] = URI.encode_www_form_component(cmpd.to_inchi)
- smiles_array << smiles_string
  end
- smi_file = Tempfile.open(['pc_ambit', '.csv'])
- pc_descriptors = nil
-
- # Create Ambit dataset
- smi_file.puts( "SMILES\n" )
- smi_file.puts( smiles_array.join("\n") )
- smi_file.flush
- ambit_ds_uri = OpenTox::RestClientWrapper.post(ambit_ds_service_uri, {:file => File.new(smi_file.path)}, {:content_type => "multipart/form-data", :accept => "text/uri-list"} )
+ smi_file = Tempfile.open(['pc_ambit', '.csv']) ; smi_file.puts( "SMILES\n" + smiles_to_inchi.keys.join("\n") ) ; smi_file.flush
+ ambit_ds_uri = OpenTox::RestClientWrapper.post(@ambit_ds_service_uri, {:file => File.new(smi_file.path)}, {:content_type => "multipart/form-data", :accept => "text/uri-list"} )
+ ambit_result_uri = [ ambit_ds_uri + "?" ] # 1st pos: base uri, then features
  rescue Exception => e
  LOGGER.debug "#{e.class}: #{e.message}"
  LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
  ensure
  smi_file.close! if smi_file
  end
- ambit_smiles_uri = OpenTox::RestClientWrapper.get(ambit_ds_uri + "/features", {:accept=> "text/uri-list"} ).chomp
-
- # Calculate 3D for CPSA
- if types.include? "cpsa"
- ambit_ds_mopac_uri = OpenTox::RestClientWrapper.post(ambit_mopac_model_uri, {:dataset_uri => ambit_ds_uri}, {:accept => "text/uri-list"} )
- LOGGER.debug "MOPAC dataset: #{ambit_ds_mopac_uri }"
- end
-
- # Get Ambit results
- ambit_result_uri = [] # 1st pos: base uri, then features
- ambit_result_uri << ambit_ds_uri + "?"
+ # get SMILES feature URI
+ ambit_smiles_uri = OpenTox::RestClientWrapper.get(
+ ambit_ds_uri + "/features",
+ {:accept=> "text/uri-list"}
+ ).chomp
  ambit_result_uri << ("feature_uris[]=" + URI.encode_www_form_component(ambit_smiles_uri) + "&")
- descs_uris.each_with_index do |uri, i|
- algorithm = Algorithm::Generic.new(uri)
+ # always calculate 3D (http://goo.gl/Tk81j), then get results
+ OpenTox::RestClientWrapper.post(
+ @ambit_mopac_model_uri,
+ {:dataset_uri => ambit_ds_uri},
+ {:accept => "text/uri-list"}
+ )
+ current_cat = ""
+ ids.each_with_index do |id, i|
+ old_cat = current_cat; current_cat = pc_descriptors[id][:pc_type]
+ params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights[old_cat]) if params[:task] && old_cat != current_cat && old_cat != ""
+ algorithm = Algorithm::Generic.new(@ambit_descriptor_algorithm_uri+id)
  result_uri = algorithm.run({:dataset_uri => ambit_ds_uri})
  ambit_result_uri << result_uri.split("?")[1] + "&"
- LOGGER.debug "Ambit (#{descs_uris.size}): #{i+1}"
+ LOGGER.debug "Ambit (#{ids.size}): #{i+1}"
  end
+ params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights[current_cat]) if params[:task]
  #LOGGER.debug "Ambit result: #{ambit_result_uri.join('')}"
- [ ambit_result_uri, smiles_to_inchi ]
-
- rescue Exception => e
- LOGGER.debug "#{e.class}: #{e.message}"
- LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
  end
+
+ [ ambit_result_uri, smiles_to_inchi, ids ]
+
  end
 
 
  # Load dataset via CSV
  # @param[Array] Ambit result uri, piecewise (1st: base, 2nd: SMILES, 3rd+: features)
- # @return[String] dataset uri
- def self.load_ds_csv(ambit_result_uri, smiles_to_inchi, subjectid=nil)
+ # @param[Hash] keys: SMILES, values: InChIs
+ # @param[Array] field descriptions, one for each feature
+ # @return[Array] CSV, array of field ids, array of field descriptions
+ def self.load_ds_csv(ambit_result_uri, smiles_to_inchi, single_ids, subjectid=nil)
 
  master=nil
- (1...ambit_result_uri.size).collect { |idx|
- curr_uri = ambit_result_uri[0] + ambit_result_uri[idx]
- LOGGER.debug "Requesting #{curr_uri}"
- csv_data = CSV.parse( OpenTox::RestClientWrapper.get(curr_uri, {:accept => "text/csv", :subjectid => subjectid}) )
- if csv_data[0] && csv_data[0].size>1
- if master.nil? # This is the smiles entry
- (1...csv_data.size).each{ |idx| csv_data[idx][1] = smiles_to_inchi[csv_data[idx][1]] }
- master = csv_data
- next
- else
- index_uri = csv_data[0].index("SMILES")
- csv_data.map {|i| i.delete_at(index_uri)} if index_uri #Removes additional SMILES information
-
- nr_cols = (csv_data[0].size)-1
- LOGGER.debug "Merging #{nr_cols} new columns"
- master.each {|row| nr_cols.times { row.push(nil) } } # Adds empty columns to all rows
- csv_data.each do |row|
- temp = master.assoc(row[0]) # Finds the appropriate line in master
- ((-1*nr_cols)..-1).collect.each { |idx|
- temp[idx] = row[nr_cols+idx+1] if temp # Updates columns if line is found
- }
+ ids=[]
+ ambit_ids=[]
+
+ if ambit_result_uri.size > 0
+ (1...ambit_result_uri.size).collect { |idx|
+ curr_uri = ambit_result_uri[0] + ambit_result_uri[idx]
+ #LOGGER.debug "Requesting #{curr_uri}"
+ csv_data = CSV.parse( OpenTox::RestClientWrapper.get(curr_uri, {:accept => "text/csv", :subjectid => subjectid}) )
+ if csv_data[0] && csv_data[0].size>1
+ if master.nil? # This is the smiles entry
+ (1...csv_data.size).each{ |idx| csv_data[idx][1] = smiles_to_inchi[csv_data[idx][1]] }
+ master = csv_data
+ next
+ else
+ index_uri = csv_data[0].index("SMILES")
+ csv_data.map {|i| i.delete_at(index_uri)} if index_uri #Removes additional SMILES information
+
+ nr_cols = (csv_data[0].size)-1
+ LOGGER.debug "Merging #{nr_cols} new columns"
+ ids += Array.new(nr_cols, single_ids[idx-2])
+ master.each {|row| nr_cols.times { row.push(nil) } } # Adds empty columns to all rows
+ csv_data.each do |row|
+ temp = master.assoc(row[0]) # Finds the appropriate line in master
+ ((-1*nr_cols)..-1).collect.each { |idx|
+ temp[idx] = row[nr_cols+idx+1] if temp # Updates columns if line is found
+ }
+ end
  end
  end
- end
- }
+ }
 
- index_uri = master[0].index("Compound")
- master.map {|i| i.delete_at(index_uri)}
- master[0].each {|cell| cell.chomp!(" ")}
- master[0][0] = "Compound" #"SMILES"
- index_smi = master[0].index("SMILES")
- master.map {|i| i.delete_at(index_smi)} if index_smi
- #master[0][0] = "SMILES"
+ index_uri = master[0].index("Compound")
+ master.map {|i| i.delete_at(index_uri)}
+ master[0].each {|cell| cell.chomp!(" ")}
+ master[0][0] = "Compound" #"SMILES"
+ index_smi = master[0].index("SMILES")
+ master.map {|i| i.delete_at(index_smi)} if index_smi
+ master[0][0] = "SMILES"
+ ambit_ids=master[0].collect {|header| header.to_s.gsub(/[\/.\\\(\)\{\}\[\]]/,"_")}
+ ambit_ids.shift
+ end
 
  #LOGGER.debug "-------- AM: Writing to dumpfile"
  #File.open("/tmp/test.csv", 'w') {|f| f.write( master.collect {|r| r.join(",")}.join("\n") ) }
 
- parser = OpenTox::Parser::Spreadsheets.new
- ds = OpenTox::Dataset.new(nil,subjectid)
- ds.save(subjectid)
- parser.dataset = ds
- ds = parser.load_csv(master.collect{|r| r.join(",")}.join("\n"))
- ds.save(subjectid)
+ [ master, ids, ambit_ids ]
+
  end
 
 
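One detail of the rewritten `pc_descriptors` worth spelling out: the per-library weights are filtered by `params[:lib]`, normalized, and turned into integer percentage steps for task progress reporting. The same arithmetic standalone (the library selection here is illustrative):

```ruby
# Keep only requested libraries, then scale weights to percentages of the task.
weights = { "joelib" => 20, "openbabel" => 1, "cdk" => 50 }
weights.delete_if { |lib, _| !%w[joelib cdk].include?(lib) }  # params[:lib] filter
weights["load"] = 10
total = weights.values.sum.to_f
weights.each_key { |k| weights[k] = (weights[k] / total * 100).floor }
weights  # => {"joelib"=>25, "cdk"=>62, "load"=>12}; each step advances by its share
```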
@@ -208,8 +467,8 @@ module OpenTox
  end
 
 
- # Effect calculation for classification
- # @param [Array] Array of occurrences per class in the form of Enumerables.
+ # Effect calculation for classification. It is assumed that the elements of the arrays match each other pairwise.
+ # @param [Array] Array of occurrences per class (in the form of Enumerables).
  # @param [Array] Array of database instance counts per class.
  def self.effect(occurrences, db_instances)
  max=0