opentox-ruby 3.0.1 → 3.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/utils.rb ADDED
@@ -0,0 +1,372 @@
1
+ require 'csv'
2
+
3
+
4
+ module OpenTox
5
+
6
+ module Algorithm
7
+
8
+ include OpenTox
9
+
10
# Calculate physico-chemical descriptors for all compounds of a dataset.
#
# Looks up the dataset, has Ambit compute the requested descriptor
# categories (see get_pc_descriptors) and loads the resulting CSV tables
# into a new dataset (see load_ds_csv).
#
# @param [Hash] params Required keys: :dataset_uri, :pc_type
# @return [String, nil] whatever load_ds_csv returns (documented there as the
#   dataset uri); nil if any step failed -- the error is logged, not raised
def self.pc_descriptors(params)
  ds = OpenTox::Dataset.find(params[:dataset_uri])
  ambit_result_uri, smiles_to_inchi = get_pc_descriptors( { :compounds => ds.compounds, :pc_type => params[:pc_type] } )
  LOGGER.debug "Ambit result uri for #{params.inspect}: '#{ambit_result_uri.to_yaml}'"
  load_ds_csv(ambit_result_uri, smiles_to_inchi)
rescue StandardError => e
  # StandardError instead of Exception: interrupts and exit requests must
  # not be swallowed by this best-effort wrapper.
  LOGGER.debug "#{e.class}: #{e.message}"
  LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
  nil # explicit failure value instead of the logger's return value
end
29
+
30
# Calculates PC descriptors via Ambit -- DO NOT OVERLOAD Ambit.
#
# Uploads the compounds as a one-column SMILES table to the Ambit dataset
# service, triggers the MOPAC model when "cpsa" descriptors are requested,
# and runs every descriptor algorithm from ambit_descriptors.yaml whose
# category was requested.
#
# @param [Hash] params Required keys: :compounds, :pc_type
#   (:pc_type is a comma-separated category list; "electronic,cpsa" is used
#   when it is nil). The hash is not modified.
# @return [Array, nil] two elements: the Ambit result uri, piecewise
#   (1st: base, 2nd: SMILES, 3rd+: features), and a Hash mapping each SMILES
#   string to its URL-encoded InChI. nil if anything failed (error is logged).
def self.get_pc_descriptors(params)
  ambit_ds_service_uri = "http://apps.ideaconsult.net:8080/ambit2/dataset/"
  ambit_mopac_model_uri = "http://apps.ideaconsult.net:8080/ambit2/model/69632"
  descs = YAML::load_file( File.join(ENV['HOME'], ".opentox", "config", "ambit_descriptors.yaml") )

  # rescue missing pc_type without writing into the caller's hash
  types = (params[:pc_type] || "electronic,cpsa").split(",")
  descs_uris = []
  descs.each { |uri, cat_name| descs_uris << uri if types.include?(cat_name[:category]) }
  if descs_uris.empty?
    raise "Error! Empty set of descriptors. Did you supply one of [geometrical, topological, electronic, constitutional, hybrid, cpsa] ?"
  end

  # Create SMI
  smiles_array = []
  smiles_to_inchi = {}
  params[:compounds].each do |n|
    cmpd = OpenTox::Compound.new(n)
    smiles_string = cmpd.to_smiles
    smiles_to_inchi[smiles_string] = URI.encode_www_form_component(cmpd.to_inchi)
    smiles_array << smiles_string
  end

  # Create Ambit dataset. A failed upload now aborts right here (and is
  # logged once by the rescue below) instead of continuing with a nil uri.
  smi_file = Tempfile.open(['pc_ambit', '.csv'])
  begin
    smi_file.puts( "SMILES\n" )
    smi_file.puts( smiles_array.join("\n") )
    smi_file.flush
    # Block form closes the upload handle once the request has been sent.
    ambit_ds_uri = File.open(smi_file.path) do |upload|
      OpenTox::RestClientWrapper.post(ambit_ds_service_uri, {:file => upload}, {:content_type => "multipart/form-data", :accept => "text/uri-list"} )
    end
  ensure
    smi_file.close!
  end
  ambit_smiles_uri = OpenTox::RestClientWrapper.get(ambit_ds_uri + "/features", {:accept=> "text/uri-list"} ).chomp

  # Calculate 3D for CPSA
  # NOTE(review): the returned MOPAC dataset uri is only logged; the descriptor
  # runs below still use ambit_ds_uri -- confirm this is intended.
  if types.include? "cpsa"
    ambit_ds_mopac_uri = OpenTox::RestClientWrapper.post(ambit_mopac_model_uri, {:dataset_uri => ambit_ds_uri}, {:accept => "text/uri-list"} )
    LOGGER.debug "MOPAC dataset: #{ambit_ds_mopac_uri }"
  end

  # Get Ambit results
  ambit_result_uri = [] # 1st pos: base uri, then features
  ambit_result_uri << ambit_ds_uri + "?"
  ambit_result_uri << ("feature_uris[]=" + URI.encode_www_form_component(ambit_smiles_uri) + "&")
  descs_uris.each_with_index do |uri, i|
    algorithm = Algorithm::Generic.new(uri)
    result_uri = algorithm.run({:dataset_uri => ambit_ds_uri})
    # keep only the query part (the feature selection) of each result uri
    ambit_result_uri << result_uri.split("?")[1] + "&"
    LOGGER.debug "Ambit (#{descs_uris.size}): #{i+1}"
  end
  [ ambit_result_uri, smiles_to_inchi ]
rescue StandardError => e
  LOGGER.debug "#{e.class}: #{e.message}"
  LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
  nil
end
102
+
103
+
104
# Load dataset via CSV
#
# Requests one CSV table per uri part from Ambit and merges them row-wise on
# the first column (the compound identifier). The first usable table is the
# SMILES table; its SMILES values are replaced by the InChI strings from
# smiles_to_inchi. The merged table is handed to the spreadsheet parser.
#
# @param [Array] ambit_result_uri Ambit result uri, piecewise (1st: base, 2nd: SMILES, 3rd+: features)
# @param [Hash] smiles_to_inchi maps SMILES strings to the InChI written into the table
# @param [String, nil] subjectid passed on to the REST calls and dataset saves
# @return result of the final ds.save(subjectid) (documented upstream as the dataset uri)
# @raise [RuntimeError] if no usable CSV table was received or the table has no "Compound" column
def self.load_ds_csv(ambit_result_uri, smiles_to_inchi, subjectid=nil)
  master = nil
  (1...ambit_result_uri.size).each do |part_idx|
    curr_uri = ambit_result_uri[0] + ambit_result_uri[part_idx]
    LOGGER.debug "Requesting #{curr_uri}"
    csv_data = CSV.parse( OpenTox::RestClientWrapper.get(curr_uri, {:accept => "text/csv", :subjectid => subjectid}) )
    next unless csv_data[0] && csv_data[0].size>1

    if master.nil? # This is the smiles entry
      csv_data[1..-1].each { |row| row[1] = smiles_to_inchi[row[1]] }
      master = csv_data
      next
    end

    smiles_col = csv_data[0].index("SMILES")
    csv_data.each { |row| row.delete_at(smiles_col) } if smiles_col # Removes additional SMILES information

    nr_cols = csv_data[0].size - 1
    LOGGER.debug "Merging #{nr_cols} new columns"
    master.each { |row| nr_cols.times { row.push(nil) } } # Adds empty columns to all rows

    # Index master by its first column once per table (first occurrence wins,
    # like Array#assoc) instead of scanning master for every incoming row.
    rows_by_id = {}
    master.each { |row| rows_by_id[row[0]] = row unless rows_by_id.key?(row[0]) }

    csv_data.each do |row|
      target = rows_by_id[row[0]] # Finds the appropriate line in master
      next unless target
      # Fill the freshly appended trailing columns; the header row matches too.
      nr_cols.times { |k| target[k - nr_cols] = row[k + 1] }
    end
  end

  raise "No CSV data received from Ambit" if master.nil?
  compound_col = master[0].index("Compound")
  raise "No 'Compound' column in Ambit CSV data" if compound_col.nil?

  # Drop the Ambit compound column; the InChI column (still headed "SMILES")
  # moves to the front and becomes the new "Compound" column.
  master.each { |row| row.delete_at(compound_col) }
  master[0].each { |cell| cell.chomp!(" ") }
  master[0][0] = "Compound" #"SMILES"
  leftover_smiles_col = master[0].index("SMILES")
  master.each { |row| row.delete_at(leftover_smiles_col) } if leftover_smiles_col

  parser = OpenTox::Parser::Spreadsheets.new
  ds = OpenTox::Dataset.new(nil,subjectid)
  ds.save(subjectid)
  parser.dataset = ds
  ds = parser.load_csv(master.collect{|r| r.join(",")}.join("\n"))
  ds.save(subjectid)
end
154
+
155
+
156
# Gauss kernel: weights a value by its distance from 1.
# @param [#to_f] x value to weight
# @param [Float] sigma kernel width
# @return [Float] exp(-(1-x)^2 / (2*sigma^2))
def self.gauss(x, sigma = 0.3)
  distance = 1.0 - x.to_f
  exponent = -(distance*distance) / (2*sigma*sigma)
  Math.exp(exponent)
end
162
+
163
+
164
# For symbolic features
# @param [Array] array Array to test, must indicate non-occurrence with 0.
# @return [Boolean] Whether the feature is singular or non-occurring or present everywhere.
def self.isnull_or_singular?(array)
  zeroes = array.count(0)
  uninformative_counts = [
    array.size,     # non-occurring feature
    array.size - 1, # singular feature
    0               # feature present everywhere
  ]
  uninformative_counts.include?(zeroes)
end
173
+
174
+
175
# Numeric value test
# @param [Object] value
# @return [Boolean] Whether value can be converted by Kernel#Float
def self.numeric?(value)
  Float(value) # raises for anything that is not a number
  true
rescue StandardError
  false
end
181
+
182
+
183
# For symbolic features
# @param [Array] array Array to test
# @return [Boolean] Whether the feature has variance zero, i.e. the array holds exactly one distinct value.
def self.zero_variance?(array)
  distinct_values = array.uniq
  distinct_values.length == 1
end
189
+
190
+
191
# Sum of the sizes of the elements of an Array.
# @param [Array] array Array whose elements respond to size
# @return [Integer] Sum of size of values
def self.sum_size(array)
  array.inject(0) { |total, element| total + element.size }
end
199
+
200
+
201
# Minimum Frequency
# @param [Object] training_dataset dataset; only compounds.size is read
# @param [Integer] per_mil per-mil value
# @return [Integer] min-frequency, never below 2
def self.min_frequency(training_dataset,per_mil)
  # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
  scaled = per_mil * training_dataset.compounds.size.to_f / 1000.0
  scaled > 2 ? Integer(scaled) : 2
end
209
+
210
+
211
# Effect calculation for classification
#
# Picks the class in which the feature is most over-represented, i.e. where
# the observed share of occurrences exceeds the share expected from the
# database by the largest relative margin.
#
# @param [Array] occurrences Array of occurrences per class in the form of Enumerables.
# @param [Array] db_instances Array of database instance counts per class.
# @return [Integer] index of that class; 0 if no class is over-represented
def self.effect(occurrences, db_instances)
  best_class = 0
  best_gain = 0
  total_occurrences = self.sum_size(occurrences)
  total_db = db_instances.to_scale.sum

  # fminer outputs occurrences sorted reverse by activity.
  occurrences.each_with_index do |class_occurrences, class_idx|
    observed = class_occurrences.size.to_f/total_occurrences
    expected = db_instances[class_idx].to_f/total_db
    next unless observed > expected

    gain = (observed - expected) / observed
    if gain > best_gain
      best_gain = gain # running maximum ('Schleppzeiger')
      best_class = class_idx
    end
  end
  best_class
end
232
+
233
+
234
+ # neighbors
235
+
236
module Neighbors

  # Get confidence.
  # @param [Hash] params Required keys: :sims, :acts (only :sims is read here)
  # @return [Float] Confidence, the mean of the similarities
  def self.get_confidence(params)
    similarities = params[:sims]
    total = similarities.inject { |acc, sim| acc + sim }
    confidence = total / similarities.size
    LOGGER.debug "Confidence is: '" + confidence.to_s + "'."
    confidence
  end

end
249
+
250
+
251
+ # Similarity calculations
252
+ module Similarity
253
+
254
# Tanimoto similarity
#
# Sum of the pairwise minima divided by the sum of the pairwise maxima.
# Hash fingerprints are compared per key, Array fingerprints per position
# (up to the length of the shorter one).
#
# @param [Hash, Array] fingerprints_a fingerprints of first compound
# @param [Hash, Array] fingerprints_b fingerprints of second compound
# @param weights not used by this implementation
# @param params not used by this implementation
# @return [Float] (Weighted) tanimoto similarity; 0.0 if nothing is shared
#   or the arguments are not both Hashes or both Arrays
def self.tanimoto(fingerprints_a,fingerprints_b,weights=nil,params=nil)
  shared_sum = 0.0
  union_sum = 0.0

  if fingerprints_a.instance_of?(Hash) && fingerprints_b.instance_of?(Hash)
    # fingerprints are hashes
    shared_keys = fingerprints_a.keys & fingerprints_b.keys
    unless shared_keys.empty?
      shared_keys.each do |key|
        shared_sum += [ fingerprints_a[key], fingerprints_b[key] ].min
      end
      (fingerprints_a.keys | fingerprints_b.keys).each do |key|
        # compact, since one fp may be empty at that pos
        union_sum += [ fingerprints_a[key], fingerprints_b[key] ].compact.max
      end
    end
  elsif fingerprints_a.instance_of?(Array) && fingerprints_b.instance_of?(Array)
    # fingerprints are arrays
    limit = [ fingerprints_a.size, fingerprints_b.size ].min
    LOGGER.warn "fingerprints don't have equal size" if fingerprints_a.size != fingerprints_b.size
    limit.times do |pos|
      pair = [ fingerprints_a[pos], fingerprints_b[pos] ]
      shared_sum += pair.min
      union_sum += pair.max
    end
  end

  union_sum > 0.0 ? shared_sum / union_sum : 0.0
end
285
+
286
+
287
# Cosine similarity
#
# Hash fingerprints are reduced to the values of the keys present in both
# (at least two common keys are required); Array fingerprints are used as given.
#
# @param [Hash, Array] fingerprints_a key-value properties (or value array) of first compound
# @param [Hash, Array] fingerprints_b key-value properties (or value array) of second compound
# @param weights not used by this implementation
# @return [Float] cosine of angle enclosed between the induced vectors; 0.0 if
#   there is nothing to compare or the arguments are not both Hashes or both Arrays
def self.cosine(fingerprints_a,fingerprints_b,weights=nil)
  # Start with empty vectors so mismatched or unsupported argument types
  # yield 0.0 (as tanimoto does) instead of failing on nil.
  a = []
  b = []

  # fingerprints are hashes
  if fingerprints_a.class == Hash && fingerprints_b.class == Hash
    common_features = fingerprints_a.keys & fingerprints_b.keys
    if common_features.size > 1
      common_features.each do |p|
        a << fingerprints_a[p]
        b << fingerprints_b[p]
      end
    end

  # fingerprints are arrays
  elsif fingerprints_a.class == Array && fingerprints_b.class == Array
    a = fingerprints_a
    b = fingerprints_b
  end

  (a.size > 0 && b.size > 0) ? self.cosine_num(a.to_gv, b.to_gv) : 0.0
end
313
+
314
+
315
# Cosine similarity
# @param [GSL::Vector] a
# @param [GSL::Vector] b
# @return [Float] cosine of angle enclosed between a and b
def self.cosine_num(a, b)
  # When both vectors have more than 12 entries only the first 12 are compared.
  a, b = a[0..11], b[0..11] if a.size>12 && b.size>12
  numerator = a.dot(b)
  denominator = a.norm * b.norm
  numerator / denominator
end
326
+
327
+
328
# Outlier detection based on Mahalanobis distances
# Multivariate detection on X, univariate detection on y
# Uses an existing Rinruby instance, if possible; an instance created here
# is quit again before returning.
#
# @param [Hash] params Keys query_matrix, data_matrix, acts are required;
#   r (RinRuby instance) and p_outlier (default 0.999) are optional
# @return [Array] indices identifying outliers (may occur several times, this is intended).
#   Errors are logged and whatever was collected up to that point is returned.
def self.outliers(params)
  outlier_array = []
  data_matrix = params[:data_matrix]
  query_matrix = params[:query_matrix]
  acts = params[:acts]
  # Single source for the threshold so the log line reports the value used.
  p_outlier = params[:p_outlier] || 0.999
  r = params[:r]
  owns_r = !r
  begin
    LOGGER.debug "Outliers (p=#{p_outlier})..."
    r ||= RinRuby.new(false,false)
    r.eval "suppressPackageStartupMessages(library(\"robustbase\"))"
    r.eval "outlier_threshold = #{p_outlier}"
    rows = data_matrix.to_a
    nr_cases, nr_features = rows.size, rows[0].size
    r.odx = rows.flatten
    r.q = query_matrix.to_a.flatten
    r.y = acts.to_a.flatten
    r.eval "odx = matrix(odx, #{nr_cases}, #{nr_features}, byrow=T)"
    r.eval 'odx = rbind(q,odx)' # query is nr 0 (1) in ruby (R)
    r.eval 'mah = covMcd(odx)$mah' # run MCD alg
    r.eval "mah = pchisq(mah,#{nr_features})"
    r.eval 'outlier_array = which(mah>outlier_threshold)' # multivariate outliers using robust mahalanobis
    outlier_array = r.outlier_array.to_a.collect{|v| v-2 } # translate to ruby index (-1 for q, -1 due to ruby)
    r.eval 'fqu = matrix(summary(y))[2]' # first quartile of y
    r.eval 'tqu = matrix(summary(y))[5]' # third quartile of y
    r.eval 'outlier_array = which(y>(tqu+1.5*IQR(y)))' # univariate outliers due to Tukey (http://goo.gl/mwzNH)
    outlier_array += r.outlier_array.to_a.collect{|v| v-1 } # translate to ruby index (-1 due to ruby)
    r.eval 'outlier_array = which(y<(fqu-1.5*IQR(y)))'
    outlier_array += r.outlier_array.to_a.collect{|v| v-1 }
  rescue StandardError => e
    # StandardError instead of Exception: do not swallow interrupts/exit.
    LOGGER.debug "#{e.class}: #{e.message}"
    #LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
  ensure
    # Do not leave an R process behind for an instance nobody else can reach.
    r.quit if owns_r && r
  end
  outlier_array
end
365
+
366
+ end
367
+
368
+
369
+ end
370
+
371
+ end
372
+
data/lib/validation.rb CHANGED
@@ -1,3 +1,4 @@
1
require "uri"
require "yaml"
1
2
  module OpenTox
2
3
  class Validation
3
4
  include OpenTox
@@ -66,7 +67,7 @@ module OpenTox
66
67
  # @return [String] report uri
67
68
  def find_or_create_report( subjectid=nil, waiting_task=nil )
68
69
  @report = ValidationReport.find_for_validation(@uri, subjectid) unless @report
69
- @report = ValidationReport.create(@uri, subjectid, waiting_task) unless @report
70
+ @report = ValidationReport.create(@uri, {}, subjectid, waiting_task) unless @report
70
71
  @report.uri
71
72
  end
72
73
 
@@ -107,6 +108,31 @@ module OpenTox
107
108
  end
108
109
  table
109
110
  end
111
+
112
# returns probability-distribution for a given prediction
# it takes all predictions into account that have a confidence value that is >= confidence and that have the same predicted value
# (at least the 12 predictions with the highest confidence are selected, even if their confidence is lower than the given param)
#
# @param [Float] confidence value (between 0 and 1)
# @param [String] predicted value
# @param [String,optional] subjectid
# @return [Hash] see example
#
# Example 1:
# validation.probabilities(0.3,"active")
# -> {:min_confidence=>0.32, :num_predictions=>20, :probs=>{"active"=>0.7, "moderate"=>0.25, "inactive"=>0.05}}
# there have been 20 "active" predictions with confidence >= 0.3, 70 percent of them being correct
#
# Example 2:
# validation.probabilities(0.8,"active")
# -> {:min_confidence=>0.45, :num_predictions=>12, :probs=>{"active"=>0.9, "moderate"=>0.1, "inactive"=>0}}
# the given confidence value was too high (i.e. <12 predictions with confidence value >= 0.8)
# the top 12 "active" predictions have a min_confidence of 0.45, 90 percent of them being correct
#
def probabilities( confidence, prediction, subjectid=nil )
  # Escape both values so that spaces, '&', '=' etc. cannot corrupt the query string.
  query = "prediction=" + URI.encode_www_form_component(prediction.to_s) +
          "&confidence=" + URI.encode_www_form_component(confidence.to_s)
  # NOTE(review): YAML.load on a service response may instantiate arbitrary
  # objects on older Ruby versions -- consider YAML.safe_load if the
  # validation service is not fully trusted.
  YAML.load(OpenTox::RestClientWrapper.get(@uri+"/probabilities?"+query,
    {:subjectid => subjectid, :accept => "application/x-yaml"}))
end
110
136
  end
111
137
 
112
138
  class Crossvalidation
@@ -168,6 +194,13 @@ module OpenTox
168
194
  def statistics( subjectid=nil )
169
195
  Validation.from_cv_statistics( @uri, subjectid )
170
196
  end
197
+
198
# Probability distribution for a given prediction, computed over the
# statistics of this crossvalidation.
# documentation see OpenTox::Validation.probabilities
#
# @param [Float] confidence value (between 0 and 1)
# @param [String] predicted value
# @param [String,optional] subjectid
# @return [Hash] parsed YAML response of the validation service
def probabilities( confidence, prediction, subjectid=nil )
  # Escape both values so that spaces, '&', '=' etc. cannot corrupt the query string.
  query = "prediction=" + URI.encode_www_form_component(prediction.to_s) +
          "&confidence=" + URI.encode_www_form_component(confidence.to_s)
  # NOTE(review): YAML.load on a service response may instantiate arbitrary
  # objects on older Ruby versions -- consider YAML.safe_load if the
  # validation service is not fully trusted.
  YAML.load(OpenTox::RestClientWrapper.get(@uri+"/statistics/probabilities?"+query,
    {:subjectid => subjectid, :accept => "application/x-yaml"}))
end
203
+
171
204
  end
172
205
 
173
206
  class ValidationReport
@@ -196,12 +229,18 @@ module OpenTox
196
229
 
197
230
  # creates a validation report via validation
198
231
  # @param [String] validation uri
232
+ # @param [Hash] params additional params possible
233
+ # (min_confidence, params={}, min_num_predictions, max_num_predictions)
199
234
  # @param [String,optional] subjectid
200
235
  # @param [OpenTox::Task,optional] waiting_task (can be a OpenTox::Subtask as well), progress is updated accordingly
201
236
  # @return [OpenTox::ValidationReport]
202
- def self.create( validation_uri, subjectid=nil, waiting_task=nil )
237
+ def self.create( validation_uri, params={}, subjectid=nil, waiting_task=nil )
238
+ params = {} if params==nil
239
+ raise OpenTox::BadRequestError.new "params is no hash" unless params.is_a?(Hash)
240
+ params[:validation_uris] = validation_uri
241
+ params[:subjectid] = subjectid
203
242
  uri = RestClientWrapper.post(File.join(CONFIG[:services]["opentox-validation"],"/report/validation"),
204
- { :validation_uris => validation_uri, :subjectid => subjectid }, {}, waiting_task )
243
+ params, {}, waiting_task )
205
244
  ValidationReport.new(uri)
206
245
  end
207
246
 
@@ -268,15 +307,17 @@ module OpenTox
268
307
  uris.size==0 ? nil : AlgorithmComparisonReport.new(uris[-1])
269
308
  end
270
309
 
271
- # creates a crossvalidation report via crossvalidation
310
+ # creates an algorithm comparison report via crossvalidation uris
272
311
  # @param [Hash] crossvalidation uri_hash, see example
312
+ # @param [Hash] params additional params possible
313
+ # (ttest_significance, ttest_attributes, min_confidence, min_num_predictions, max_num_predictions)
273
314
  # @param [String,optional] subjectid
274
315
  # @param [OpenTox::Task,optional] waiting_task (can be a OpenTox::Subtask as well), progress is updated accordingly
275
316
  # @return [OpenTox::AlgorithmComparisonReport]
276
317
  # example for hash:
277
318
  # { :lazar-bbrc => [ http://host/validation/crossvalidation/x1, http://host/validation/crossvalidation/x2 ],
278
319
  # :lazar-last => [ http://host/validation/crossvalidation/xy, http://host/validation/crossvalidation/xy ] }
279
- def self.create( crossvalidation_uri_hash, subjectid=nil, waiting_task=nil )
320
+ def self.create( crossvalidation_uri_hash, params={}, subjectid=nil, waiting_task=nil )
280
321
  identifier = []
281
322
  validation_uris = []
282
323
  crossvalidation_uri_hash.each do |id, uris|
@@ -285,8 +326,13 @@ module OpenTox
285
326
  validation_uris << uri
286
327
  end
287
328
  end
329
+ params = {} if params==nil
330
+ raise OpenTox::BadRequestError.new "params is no hash" unless params.is_a?(Hash)
331
+ params[:validation_uris] = validation_uris.join(",")
332
+ params[:identifier] = identifier.join(",")
333
+ params[:subjectid] = subjectid
288
334
  uri = RestClientWrapper.post(File.join(CONFIG[:services]["opentox-validation"],"/report/algorithm_comparison"),
289
- { :validation_uris => validation_uris.join(","), :identifier => identifier.join(","), :subjectid => subjectid }, {}, waiting_task )
335
+ params, {}, waiting_task )
290
336
  AlgorithmComparisonReport.new(uri)
291
337
  end
292
338
  end