opentox-ruby 3.0.1 → 3.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/ChangeLog CHANGED
@@ -1,3 +1,11 @@
1
+ v3.1.0 2012-02-24
2
+ * utils.rb: added for special routines (e.g. descriptor calculation)
3
+ * task.rb: Polling with increasing interval
4
+ * parser.rb: CSV up and download fixed
5
+ * transform.rb: routines to create machine learning data matrices
6
+ * algorithm.rb: SVM parameter grid search, cos similarity as algorithm,
7
+ gauss() removed
8
+
1
9
  v3.0.1 2011-10-19
2
10
  * feature: model registration to ontology service
3
11
  * ontology lib gets endpoints from ontology service
data/Rakefile CHANGED
@@ -16,7 +16,7 @@ begin
16
16
  gem.add_dependency "sinatra-respond_to", "=0.7.0"
17
17
  gem.add_dependency "sinatra-static-assets", "=0.5.0"
18
18
  gem.add_dependency "rest-client", "=1.6.1"
19
- gem.add_dependency "rack", "=1.3.1"
19
+ gem.add_dependency "rack", "=1.3.5"
20
20
  gem.add_dependency "rack-contrib", "=1.1.0"
21
21
  gem.add_dependency "rack-flash", "=0.1.1"
22
22
  gem.add_dependency "nokogiri", "=1.4.4"
@@ -42,10 +42,9 @@ begin
42
42
  gem.add_dependency "dm-migrations", "=1.1.0"
43
43
  gem.add_dependency "dm-validations", "=1.1.0"
44
44
  gem.add_dependency "dm-sqlite-adapter", "=1.1.0"
45
- gem.add_dependency "ruby-plot", "=0.5.0"
45
+ gem.add_dependency "ruby-plot", "=0.6.0"
46
46
  gem.add_dependency "gsl", "=1.14.7"
47
47
  gem.add_dependency "statsample", "=1.1.0"
48
- #gem.add_dependency "statsample-optimization", "=2.1.0"
49
48
 
50
49
  gem.add_development_dependency 'jeweler'
51
50
  gem.files = FileList["[A-Z]*", "{bin,generators,lib,test}/**/*", 'lib/jeweler/templates/.gitignore']
data/VERSION CHANGED
@@ -1 +1 @@
1
- 3.0.1
1
+ 3.1.0
data/lib/algorithm.rb CHANGED
@@ -5,6 +5,8 @@ R = nil
5
5
  require "rinruby"
6
6
  require "statsample"
7
7
  require 'uri'
8
+ require 'transform.rb'
9
+ require 'utils.rb'
8
10
 
9
11
  module OpenTox
10
12
 
@@ -13,7 +15,7 @@ module OpenTox
13
15
 
14
16
  include OpenTox
15
17
 
16
- # Execute algorithm with parameters, please consult the OpenTox API and the webservice documentation for acceptable parameters
18
+ # Execute algorithm with parameters, consult OpenTox API and webservice documentation for acceptable parameters
17
19
  # @param [optional,Hash] params Algorithm parameters
18
20
  # @param [optional,OpenTox::Task] waiting_task (can be a OpenTox::Subtask as well), progress is updated accordingly
19
21
  # @return [String] URI of new resource (dataset, model, ...)
@@ -21,7 +23,7 @@ module OpenTox
21
23
  LOGGER.info "Running algorithm '"+@uri.to_s+"' with params: "+params.inspect
22
24
  RestClientWrapper.post(@uri, params, {:accept => 'text/uri-list'}, waiting_task).to_s
23
25
  end
24
-
26
+
25
27
  # Get OWL-DL representation in RDF/XML format
26
28
  # @return [application/rdf+xml] RDF/XML representation
27
29
  def to_rdfxml
@@ -33,7 +35,7 @@ module OpenTox
33
35
  # Generic Algorithm class, should work with all OpenTox webservices
34
36
  class Generic
35
37
  include Algorithm
36
-
38
+
37
39
  # Find Generic Opentox Algorithm via URI, and loads metadata, could raise NotFound/NotAuthorized error
38
40
  # @param [String] uri Algorithm URI
39
41
  # @return [OpenTox::Algorithm::Generic] Algorithm instance
@@ -44,14 +46,14 @@ module OpenTox
44
46
  raise "cannot load algorithm metadata" if alg.metadata==nil or alg.metadata.size==0
45
47
  alg
46
48
  end
47
-
49
+
48
50
  end
49
51
 
50
52
  # Fminer algorithms (https://github.com/amaunz/fminer2)
51
53
  class Fminer
52
54
  include Algorithm
53
55
  attr_accessor :prediction_feature, :training_dataset, :minfreq, :compounds, :db_class_sizes, :all_activities, :smi
54
-
56
+
55
57
  def check_params(params,per_mil,subjectid=nil)
56
58
  raise OpenTox::NotFoundError.new "Please submit a dataset_uri." unless params[:dataset_uri] and !params[:dataset_uri].nil?
57
59
  raise OpenTox::NotFoundError.new "Please submit a prediction_feature." unless params[:prediction_feature] and !params[:prediction_feature].nil?
@@ -81,7 +83,7 @@ module OpenTox
81
83
  LOGGER.warn "Cannot find smiles for #{compound.to_s}."
82
84
  next
83
85
  end
84
-
86
+
85
87
  value_map=params[:value_map] unless params[:value_map].nil?
86
88
  entry.each do |feature,values|
87
89
  if feature == @prediction_feature.uri
@@ -90,7 +92,7 @@ module OpenTox
90
92
  LOGGER.warn "No #{feature} activity for #{compound.to_s}."
91
93
  else
92
94
  if @prediction_feature.feature_type == "classification"
93
- activity= value_map.invert[value].to_i # activities are mapped to 1..n
95
+ activity= value_map.invert[value.to_s].to_i # activities are mapped to 1..n
94
96
  @db_class_sizes[activity-1].nil? ? @db_class_sizes[activity-1]=1 : @db_class_sizes[activity-1]+=1 # AM effect
95
97
  elsif @prediction_feature.feature_type == "regression"
96
98
  activity= value.to_f
@@ -115,23 +117,23 @@ module OpenTox
115
117
 
116
118
  end
117
119
 
118
- # Backbone Refinement Class mining (http://bbrc.maunz.de/)
119
- class BBRC < Fminer
120
- # Initialize bbrc algorithm
121
- def initialize(subjectid=nil)
122
- super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/bbrc")
123
- load_metadata(subjectid)
124
- end
120
+ # Backbone Refinement Class mining (http://bbrc.maunz.de/)
121
+ class BBRC < Fminer
122
+ # Initialize bbrc algorithm
123
+ def initialize(subjectid=nil)
124
+ super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/bbrc")
125
+ load_metadata(subjectid)
125
126
  end
127
+ end
126
128
 
127
- # LAtent STructure Pattern Mining (http://last-pm.maunz.de)
128
- class LAST < Fminer
129
- # Initialize last algorithm
130
- def initialize(subjectid=nil)
131
- super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/last")
132
- load_metadata(subjectid)
133
- end
129
+ # LAtent STructure Pattern Mining (http://last-pm.maunz.de)
130
+ class LAST < Fminer
131
+ # Initialize last algorithm
132
+ def initialize(subjectid=nil)
133
+ super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/last")
134
+ load_metadata(subjectid)
134
135
  end
136
+ end
135
137
 
136
138
 
137
139
  # Create lazar prediction model
@@ -144,72 +146,6 @@ module OpenTox
144
146
  end
145
147
  end
146
148
 
147
- # Utility methods without dedicated webservices
148
-
149
- # Similarity calculations
150
- module Similarity
151
- include Algorithm
152
-
153
- # Tanimoto similarity
154
- # @param [Array] features_a Features of first compound
155
- # @param [Array] features_b Features of second compound
156
- # @param [optional, Hash] weights Weights for all features
157
- # @param [optional, Hash] params Keys: `:training_compound, :compound, :training_compound_features_hits, :nr_hits, :compound_features_hits` are required
158
- # @return [Float] (Weighted) tanimoto similarity
159
- def self.tanimoto(features_a,features_b,weights=nil,params=nil)
160
- common_features = features_a & features_b
161
- all_features = (features_a + features_b).uniq
162
- #LOGGER.debug "dv --------------- common: #{common_features}, all: #{all_features}"
163
- if common_features.size > 0
164
- if weights
165
- #LOGGER.debug "nr_hits: #{params[:nr_hits]}"
166
- if !params.nil? && params[:nr_hits]
167
- params[:weights] = weights
168
- params[:mode] = "min"
169
- params[:features] = common_features
170
- common_p_sum = Algorithm.p_sum_support(params)
171
- params[:mode] = "max"
172
- params[:features] = all_features
173
- all_p_sum = Algorithm.p_sum_support(params)
174
- else
175
- common_p_sum = 0.0
176
- common_features.each{|f| common_p_sum += Algorithm.gauss(weights[f])}
177
- all_p_sum = 0.0
178
- all_features.each{|f| all_p_sum += Algorithm.gauss(weights[f])}
179
- end
180
- #LOGGER.debug "common_p_sum: #{common_p_sum}, all_p_sum: #{all_p_sum}, c/a: #{common_p_sum/all_p_sum}"
181
- common_p_sum/all_p_sum
182
- else
183
- #LOGGER.debug "common_features : #{common_features}, all_features: #{all_features}, c/a: #{(common_features.size/all_features.size).to_f}"
184
- common_features.size.to_f/all_features.size.to_f
185
- end
186
- else
187
- 0.0
188
- end
189
- end
190
-
191
- # Euclidean similarity
192
- # @param [Hash] properties_a Properties of first compound
193
- # @param [Hash] properties_b Properties of second compound
194
- # @param [optional, Hash] weights Weights for all properties
195
- # @return [Float] (Weighted) euclidean similarity
196
- def self.euclidean(properties_a,properties_b,weights=nil)
197
- common_properties = properties_a.keys & properties_b.keys
198
- if common_properties.size > 1
199
- dist_sum = 0
200
- common_properties.each do |p|
201
- if weights
202
- dist_sum += ( (properties_a[p] - properties_b[p]) * Algorithm.gauss(weights[p]) )**2
203
- else
204
- dist_sum += (properties_a[p] - properties_b[p])**2
205
- end
206
- end
207
- 1/(1+Math.sqrt(dist_sum))
208
- else
209
- 0.0
210
- end
211
- end
212
- end
213
149
 
214
150
  # Structural Graph Clustering by TU Munich
215
151
  # Finds clusters similar to a query structure in a given training dataset
@@ -226,7 +162,7 @@ module OpenTox
226
162
  raise "Invalid URI."
227
163
  end
228
164
  @training_dataset_uri = training_dataset_uri
229
- if !OpenTox::Algorithm.numeric? training_threshold || training_threshold <0 || training_threshold >1
165
+ if !self.numeric? training_threshold || training_threshold <0 || training_threshold >1
230
166
  raise "Training threshold out of bounds."
231
167
  end
232
168
  @training_threshold = training_threshold.to_f
@@ -259,7 +195,7 @@ module OpenTox
259
195
  # @params[Float] Similarity threshold for query to clusters (optional)
260
196
  def get_clusters query_compound_uri, query_threshold = 0.5
261
197
 
262
- if !OpenTox::Algorithm.numeric? query_threshold || query_threshold <0 || query_threshold >1
198
+ if !self.numeric? query_threshold || query_threshold <0 || query_threshold >1
263
199
  raise "Query threshold out of bounds."
264
200
  end
265
201
  @query_threshold = query_threshold.to_f
@@ -285,7 +221,7 @@ module OpenTox
285
221
  metadata[DC.title][pattern]=""
286
222
  feature_clusterid_map[feature_uri] = metadata[DC.title].to_i
287
223
  }
288
-
224
+
289
225
  # Integrity check
290
226
  unless cluster_query_dataset.compounds.size == 1
291
227
  raise "Number of predicted compounds is != 1."
@@ -295,11 +231,11 @@ module OpenTox
295
231
  query_compound_uri = cluster_query_dataset.compounds[0]
296
232
  @target_clusters_array = Array.new
297
233
  cluster_query_dataset.features.keys.each { |cluster_membership_feature|
298
-
234
+
299
235
  # Getting dataset URI for cluster
300
236
  target_cluster = feature_clusterid_map[cluster_membership_feature]
301
237
  dataset = @clusterid_dataset_map[target_cluster]
302
-
238
+
303
239
  # Finally look up presence
304
240
  data_entry = cluster_query_dataset.data_entries[query_compound_uri]
305
241
  present = data_entry[cluster_membership_feature][0]
@@ -311,85 +247,13 @@ module OpenTox
311
247
 
312
248
  end
313
249
 
314
- module Neighbors
315
-
316
- # Local multi-linear regression (MLR) prediction from neighbors.
317
- # Uses propositionalized setting.
318
- # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
319
- # @return [Numeric] A prediction value.
320
- def self.local_mlr_prop(params)
321
-
322
- confidence=0.0
323
- prediction=nil
324
-
325
- if params[:neighbors].size>0
326
- props = params[:prop_kernel] ? get_props(params) : nil
327
- acts = params[:neighbors].collect { |n| act = n[:activity].to_f }
328
- sims = params[:neighbors].collect { |n| Algorithm.gauss(n[:similarity]) }
329
- LOGGER.debug "Local MLR (Propositionalization / GSL)."
330
- prediction = mlr( {:n_prop => props[0], :q_prop => props[1], :sims => sims, :acts => acts} )
331
- transformer = eval("OpenTox::Algorithm::Transform::#{params[:transform]["class"]}.new ([#{prediction}], #{params[:transform]["offset"]})")
332
- prediction = transformer.values[0]
333
- prediction = nil if prediction.infinite? || params[:prediction_min_max][1] < prediction || params[:prediction_min_max][0] > prediction
334
- LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
335
- params[:conf_stdev] = false if params[:conf_stdev].nil?
336
- confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]})
337
- confidence = nil if prediction.nil?
338
- end
339
- {:prediction => prediction, :confidence => confidence}
340
-
341
- end
342
-
343
- # Multi-linear regression weighted by similarity.
344
- # Objective Feature Selection, Principal Components Analysis, Scaling of Axes.
345
- # @param [Hash] params Keys `:n_prop, :q_prop, :sims, :acts` are required
346
- # @return [Numeric] A prediction value.
347
- def self.mlr(params)
348
-
349
- # GSL matrix operations:
350
- # to_a : row-wise conversion to nested array
351
- #
352
- # Statsample operations (build on GSL):
353
- # to_scale: convert into Statsample format
354
-
355
- begin
356
- n_prop = params[:n_prop].collect { |v| v }
357
- q_prop = params[:q_prop].collect { |v| v }
358
- n_prop << q_prop # attach q_prop
359
- nr_cases, nr_features = get_sizes n_prop
360
- data_matrix = GSL::Matrix.alloc(n_prop.flatten, nr_cases, nr_features)
361
-
362
- # Principal Components Analysis
363
- LOGGER.debug "PCA..."
364
- pca = OpenTox::Algorithm::Transform::PCA.new(data_matrix)
365
- data_matrix = pca.data_transformed_matrix
366
-
367
- # Attach intercept column to data
368
- intercept = GSL::Matrix.alloc(Array.new(nr_cases,1.0),nr_cases,1)
369
- data_matrix = data_matrix.horzcat(intercept)
370
- (0..data_matrix.size2-2).each { |i|
371
- autoscaler = OpenTox::Algorithm::Transform::AutoScale.new(data_matrix.col(i))
372
- data_matrix.col(i)[0..data_matrix.size1-1] = autoscaler.scaled_values
373
- }
374
250
 
375
- # Detach query instance
376
- n_prop = data_matrix.to_a
377
- q_prop = n_prop.pop
378
- nr_cases, nr_features = get_sizes n_prop
379
- data_matrix = GSL::Matrix.alloc(n_prop.flatten, nr_cases, nr_features)
380
251
 
381
- # model + support vectors
382
- LOGGER.debug "Creating MLR model ..."
383
- c, cov, chisq, status = GSL::MultiFit::wlinear(data_matrix, params[:sims].to_scale.to_gsl, params[:acts].to_scale.to_gsl)
384
- GSL::MultiFit::linear_est(q_prop.to_scale.to_gsl, c, cov)[0]
385
- rescue Exception => e
386
- LOGGER.debug "#{e.class}: #{e.message}"
387
- end
252
+ module Neighbors
388
253
 
389
- end
390
254
 
391
255
  # Classification with majority vote from neighbors weighted by similarity
392
- # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
256
+ # @param [Hash] params Keys `:acts, :sims, :value_map` are required
393
257
  # @return [Numeric] A prediction value.
394
258
  def self.weighted_majority_vote(params)
395
259
 
@@ -398,12 +262,13 @@ module OpenTox
398
262
  confidence = 0.0
399
263
  prediction = nil
400
264
 
401
- params[:neighbors].each do |neighbor|
402
- neighbor_weight = Algorithm.gauss(neighbor[:similarity]).to_f
403
- neighbor_contribution += neighbor[:activity].to_f * neighbor_weight
265
+ LOGGER.debug "Weighted Majority Vote Classification."
404
266
 
267
+ params[:acts].each_index do |idx|
268
+ neighbor_weight = params[:sims][1][idx]
269
+ neighbor_contribution += params[:acts][idx] * neighbor_weight
405
270
  if params[:value_map].size == 2 # AM: provide compat to binary classification: 1=>false 2=>true
406
- case neighbor[:activity]
271
+ case params[:acts][idx]
407
272
  when 1
408
273
  confidence_sum -= neighbor_weight
409
274
  when 2
@@ -413,294 +278,257 @@ module OpenTox
413
278
  confidence_sum += neighbor_weight
414
279
  end
415
280
  end
416
-
417
281
  if params[:value_map].size == 2
418
282
  if confidence_sum >= 0.0
419
- prediction = 2 unless params[:neighbors].size==0
283
+ prediction = 2 unless params[:acts].size==0
420
284
  elsif confidence_sum < 0.0
421
- prediction = 1 unless params[:neighbors].size==0
285
+ prediction = 1 unless params[:acts].size==0
422
286
  end
423
287
  else
424
- prediction = (neighbor_contribution/confidence_sum).round unless params[:neighbors].size==0 # AM: new multinomial prediction
288
+ prediction = (neighbor_contribution/confidence_sum).round unless params[:acts].size==0 # AM: new multinomial prediction
425
289
  end
290
+
426
291
  LOGGER.debug "Prediction is: '" + prediction.to_s + "'." unless prediction.nil?
427
- confidence = confidence_sum/params[:neighbors].size if params[:neighbors].size > 0
292
+ confidence = (confidence_sum/params[:acts].size).abs if params[:acts].size > 0
428
293
  LOGGER.debug "Confidence is: '" + confidence.to_s + "'." unless prediction.nil?
429
294
  return {:prediction => prediction, :confidence => confidence.abs}
430
295
  end
431
296
 
297
+
298
+
432
299
  # Local support vector regression from neighbors
433
- # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
300
+ # @param [Hash] params Keys `:props, :acts, :sims, :min_train_performance` are required
434
301
  # @return [Numeric] A prediction value.
435
302
  def self.local_svm_regression(params)
436
303
 
437
- confidence = 0.0
438
- prediction = nil
439
- if params[:neighbors].size>0
440
- props = params[:prop_kernel] ? get_props(params) : nil
441
- acts = params[:neighbors].collect{ |n| n[:activity].to_f }
442
- sims = params[:neighbors].collect{ |n| Algorithm.gauss(n[:similarity]) }
443
- prediction = props.nil? ? local_svm(acts, sims, "nu-svr", params) : local_svm_prop(props, acts, "nu-svr")
444
- transformer = eval("OpenTox::Algorithm::Transform::#{params[:transform]["class"]}.new ([#{prediction}], #{params[:transform]["offset"]})")
445
- prediction = transformer.values[0]
446
- prediction = nil if prediction.infinite? || params[:prediction_min_max][1] < prediction || params[:prediction_min_max][0] > prediction
447
- LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
448
- params[:conf_stdev] = false if params[:conf_stdev].nil?
449
- confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]})
450
- confidence = nil if prediction.nil?
304
+ begin
305
+ confidence = 0.0
306
+ prediction = nil
307
+
308
+ LOGGER.debug "Local SVM."
309
+ if params[:acts].size>0
310
+ if params[:props]
311
+ n_prop = params[:props][0].collect
312
+ q_prop = params[:props][1].collect
313
+ props = [ n_prop, q_prop ]
314
+ end
315
+ acts = params[:acts].collect
316
+ prediction = local_svm_prop( props, acts, params[:min_train_performance]) # params[:props].nil? signals non-prop setting
317
+ prediction = nil if (!prediction.nil? && prediction.infinite?)
318
+ LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
319
+ confidence = get_confidence({:sims => params[:sims][1], :acts => params[:acts]})
320
+ confidence = 0.0 if prediction.nil?
321
+ end
322
+ {:prediction => prediction, :confidence => confidence}
323
+ rescue Exception => e
324
+ LOGGER.debug "#{e.class}: #{e.message}"
325
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
451
326
  end
452
- {:prediction => prediction, :confidence => confidence}
453
-
327
+
454
328
  end
455
329
 
456
- # Local support vector classification from neighbors
457
- # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
330
+
331
+ # Local support vector classification from neighbors
332
+ # @param [Hash] params Keys `:props, :acts, :sims, :min_train_performance` are required
458
333
  # @return [Numeric] A prediction value.
459
334
  def self.local_svm_classification(params)
460
335
 
461
- confidence = 0.0
462
- prediction = nil
463
- if params[:neighbors].size>0
464
- props = params[:prop_kernel] ? get_props(params) : nil
465
- acts = params[:neighbors].collect { |n| act = n[:activity] }
466
- sims = params[:neighbors].collect{ |n| Algorithm.gauss(n[:similarity]) } # similarity values btwn q and nbors
467
- prediction = props.nil? ? local_svm(acts, sims, "C-bsvc", params) : local_svm_prop(props, acts, "C-bsvc")
468
- LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
469
- params[:conf_stdev] = false if params[:conf_stdev].nil?
470
- confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]})
336
+ begin
337
+ confidence = 0.0
338
+ prediction = nil
339
+
340
+ LOGGER.debug "Local SVM."
341
+ if params[:acts].size>0
342
+ if params[:props]
343
+ n_prop = params[:props][0].collect
344
+ q_prop = params[:props][1].collect
345
+ props = [ n_prop, q_prop ]
346
+ end
347
+ acts = params[:acts].collect
348
+ acts = acts.collect{|v| "Val" + v.to_s} # Convert to string for R to recognize classification
349
+ prediction = local_svm_prop( props, acts, params[:min_train_performance]) # params[:props].nil? signals non-prop setting
350
+ prediction = prediction.sub(/Val/,"") if prediction # Convert back to Float
351
+ confidence = 0.0 if prediction.nil?
352
+ LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
353
+ confidence = get_confidence({:sims => params[:sims][1], :acts => params[:acts]})
354
+ end
355
+ {:prediction => prediction, :confidence => confidence}
356
+ rescue Exception => e
357
+ LOGGER.debug "#{e.class}: #{e.message}"
358
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
471
359
  end
472
- {:prediction => prediction, :confidence => confidence}
473
-
360
+
474
361
  end
475
362
 
476
363
 
364
+
477
365
  # Local support vector prediction from neighbors.
478
- # Uses pre-defined Kernel Matrix.
366
+ # Uses propositionalized setting.
479
367
  # Not to be called directly (use local_svm_regression or local_svm_classification).
368
+ # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ]
480
369
  # @param [Array] acts, activities for neighbors.
481
- # @param [Array] sims, similarities for neighbors.
482
- # @param [String] type, one of "nu-svr" (regression) or "C-bsvc" (classification).
483
- # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
370
+ # @param [Float] min_train_performance, parameter to control censoring
484
371
  # @return [Numeric] A prediction value.
485
- def self.local_svm(acts, sims, type, params)
486
- LOGGER.debug "Local SVM (Weighted Tanimoto Kernel)."
487
- neighbor_matches = params[:neighbors].collect{ |n| n[:features] } # URIs of matches
488
- gram_matrix = [] # square matrix of similarities between neighbors; implements weighted tanimoto kernel
372
+ def self.local_svm_prop(props, acts, min_train_performance)
373
+
374
+ LOGGER.debug "Local SVM (Propositionalization / Kernlab Kernel)."
375
+ n_prop = props[0] # is a matrix, i.e. two nested Arrays.
376
+ q_prop = props[1] # is an Array.
489
377
 
490
378
  prediction = nil
491
379
  if Algorithm::zero_variance? acts
492
380
  prediction = acts[0]
493
381
  else
494
- # gram matrix
495
- (0..(neighbor_matches.length-1)).each do |i|
496
- neighbor_i_hits = params[:fingerprints][params[:neighbors][i][:compound]]
497
- gram_matrix[i] = [] unless gram_matrix[i]
498
- # upper triangle
499
- ((i+1)..(neighbor_matches.length-1)).each do |j|
500
- neighbor_j_hits= params[:fingerprints][params[:neighbors][j][:compound]]
501
- sim_params = {}
502
- if params[:nr_hits]
503
- sim_params[:nr_hits] = true
504
- sim_params[:compound_features_hits] = neighbor_i_hits
505
- sim_params[:training_compound_features_hits] = neighbor_j_hits
506
- end
507
- sim = eval("#{params[:similarity_algorithm]}(neighbor_matches[i], neighbor_matches[j], params[:p_values], sim_params)")
508
- gram_matrix[i][j] = Algorithm.gauss(sim)
509
- gram_matrix[j] = [] unless gram_matrix[j]
510
- gram_matrix[j][i] = gram_matrix[i][j] # lower triangle
511
- end
512
- gram_matrix[i][i] = 1.0
513
- end
514
-
515
-
516
382
  #LOGGER.debug gram_matrix.to_yaml
517
383
  @r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests
518
- @r.eval "library('kernlab')" # this requires R package "kernlab" to be installed
519
- LOGGER.debug "Setting R data ..."
520
- # set data
521
- @r.gram_matrix = gram_matrix.flatten
522
- @r.n = neighbor_matches.size
523
- @r.y = acts
524
- @r.sims = sims
525
-
384
+ @r.eval "set.seed(1)"
385
+ @r.eval "suppressPackageStartupMessages(library('caret'))" # requires R packages "caret" and "kernlab"
386
+ @r.eval "suppressPackageStartupMessages(library('doMC'))" # requires R package "multicore"
387
+ @r.eval "registerDoMC()" # switch on parallel processing
526
388
  begin
527
- LOGGER.debug "Preparing R data ..."
528
- # prepare data
529
- @r.eval "y<-as.vector(y)"
530
- @r.eval "gram_matrix<-as.kernelMatrix(matrix(gram_matrix,n,n))"
531
- @r.eval "sims<-as.vector(sims)"
532
-
533
- # model + support vectors
534
- LOGGER.debug "Creating SVM model ..."
535
- @r.eval "model<-ksvm(gram_matrix, y, kernel=matrix, type=\"#{type}\", nu=0.5)"
536
- @r.eval "sv<-as.vector(SVindex(model))"
537
- @r.eval "sims<-sims[sv]"
538
- @r.eval "sims<-as.kernelMatrix(matrix(sims,1))"
539
- LOGGER.debug "Predicting ..."
540
- if type == "nu-svr"
541
- @r.eval "p<-predict(model,sims)[1,1]"
542
- elsif type == "C-bsvc"
543
- @r.eval "p<-predict(model,sims)"
544
- end
545
- if type == "nu-svr"
546
- prediction = @r.p
547
- elsif type == "C-bsvc"
548
- #prediction = (@r.p.to_f == 1.0 ? true : false)
549
- prediction = @r.p
550
- end
551
- @r.quit # free R
552
- rescue Exception => e
553
- LOGGER.debug "#{e.class}: #{e.message}"
554
- LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
555
- end
556
-
557
- end
558
- prediction
559
- end
560
-
561
- # Local support vector prediction from neighbors.
562
- # Uses propositionalized setting.
563
- # Not to be called directly (use local_svm_regression or local_svm_classification).
564
- # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ]
565
- # @param [Array] acts, activities for neighbors.
566
- # @param [String] type, one of "nu-svr" (regression) or "C-bsvc" (classification).
567
- # @return [Numeric] A prediction value.
568
- def self.local_svm_prop(props, acts, type)
569
-
570
- LOGGER.debug "Local SVM (Propositionalization / Kernlab Kernel)."
571
- n_prop = props[0] # is a matrix, i.e. two nested Arrays.
572
- q_prop = props[1] # is an Array.
573
389
 
574
- prediction = nil
575
- if Algorithm::zero_variance? acts
576
- prediction = acts[0]
577
- else
578
- #LOGGER.debug gram_matrix.to_yaml
579
- @r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests
580
- @r.eval "library('kernlab')" # this requires R package "kernlab" to be installed
581
- LOGGER.debug "Setting R data ..."
582
390
  # set data
391
+ LOGGER.debug "Setting R data ..."
583
392
  @r.n_prop = n_prop.flatten
584
393
  @r.n_prop_x_size = n_prop.size
585
394
  @r.n_prop_y_size = n_prop[0].size
586
395
  @r.y = acts
587
396
  @r.q_prop = q_prop
397
+ #@r.eval "y = matrix(y)"
398
+ @r.eval "prop_matrix = matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=T)"
399
+ @r.eval "q_prop = matrix(q_prop, 1, n_prop_y_size, byrow=T)"
588
400
 
589
- begin
590
- LOGGER.debug "Preparing R data ..."
591
- # prepare data
592
- @r.eval "y<-matrix(y)"
593
- @r.eval "prop_matrix<-matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=TRUE)"
594
- @r.eval "q_prop<-matrix(q_prop, 1, n_prop_y_size, byrow=TRUE)"
595
-
596
- # model + support vectors
597
- LOGGER.debug "Creating SVM model ..."
598
- @r.eval "model<-ksvm(prop_matrix, y, type=\"#{type}\", nu=0.5)"
599
- LOGGER.debug "Predicting ..."
600
- if type == "nu-svr"
601
- @r.eval "p<-predict(model,q_prop)[1,1]"
602
- elsif type == "C-bsvc"
603
- @r.eval "p<-predict(model,q_prop)"
604
- end
605
- if type == "nu-svr"
606
- prediction = @r.p
607
- elsif type == "C-bsvc"
608
- #prediction = (@r.p.to_f == 1.0 ? true : false)
609
- prediction = @r.p
610
- end
611
- @r.quit # free R
612
- rescue Exception => e
613
- LOGGER.debug "#{e.class}: #{e.message}"
614
- LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
615
- end
616
- end
617
- prediction
618
- end
401
+ # prepare data
402
+ LOGGER.debug "Preparing R data ..."
403
+ @r.eval "if (class(y) == 'character') { y = factor(y); suppressPackageStartupMessages(library('class')) }" # For classification
404
+
405
+ @r.eval <<-EOR
406
+ rem = nearZeroVar(prop_matrix)
407
+ if (length(rem) > 0) {
408
+ prop_matrix = prop_matrix[,-rem,drop=F]
409
+ q_prop = q_prop[,-rem,drop=F]
410
+ }
411
+ rem = findCorrelation(cor(prop_matrix))
412
+ if (length(rem) > 0) {
413
+ prop_matrix = prop_matrix[,-rem,drop=F]
414
+ q_prop = q_prop[,-rem,drop=F]
415
+ }
416
+ EOR
619
417
 
620
- # Get confidence for regression, with standard deviation of neighbor activity if conf_stdev is set.
621
- # @param[Hash] Required keys: :sims, :acts, :neighbors, :conf_stdev
622
- # @return[Float] Confidence
623
- def self.get_confidence(params)
624
- if params[:conf_stdev]
625
- sim_median = params[:sims].to_scale.median
626
- if sim_median.nil?
627
- confidence = nil
628
- else
629
- standard_deviation = params[:acts].to_scale.standard_deviation_sample
630
- confidence = (sim_median*Math.exp(-1*standard_deviation)).abs
631
- if confidence.nan?
632
- confidence = nil
633
- end
634
- end
635
- else
636
- conf = params[:sims].inject{|sum,x| sum + x }
637
- confidence = conf/params[:neighbors].size
638
- end
639
- LOGGER.debug "Confidence is: '" + confidence.to_s + "'."
640
- return confidence
641
- end
418
+ # model + support vectors
419
+ LOGGER.debug "Creating R SVM model ..."
420
+ @r.eval <<-EOR
421
+ model = train(prop_matrix,y,method="svmradial",tuneLength=8,trControl=trainControl(method="LGOCV",number=10),preProcess=c("center", "scale"))
422
+ perf = ifelse ( class(y)!='numeric', max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
423
+ EOR
642
424
 
643
- # Get X and Y size of a nested Array (Matrix)
644
- def self.get_sizes(matrix)
645
- begin
646
- nr_cases = matrix.size
647
- nr_features = matrix[0].size
648
- rescue Exception => e
649
- LOGGER.debug "#{e.class}: #{e.message}"
650
- LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
651
- end
652
- #puts "NRC: #{nr_cases}, NRF: #{nr_features}"
653
- [ nr_cases, nr_features ]
654
- end
655
425
 
656
- # Calculate the propositionalization matrix aka instantiation matrix (0/1 entries for features)
657
- # Same for the vector describing the query compound
658
- # @param[Array] neighbors.
659
- # @param[OpenTox::Compound] query compound.
660
- # @param[Array] Dataset Features.
661
- # @param[Array] Fingerprints of neighbors.
662
- # @param[Float] p-values of Features.
663
- def self.get_props (params)
664
- matrix = Array.new
665
- begin
666
- params[:neighbors].each do |n|
667
- n = n[:compound]
668
- row = []
669
- params[:features].each do |f|
670
- if ! params[:fingerprints][n].nil?
671
- row << (params[:fingerprints][n].include?(f) ? (params[:p_values][f] * params[:fingerprints][n][f]) : 0.0)
672
- else
673
- row << 0.0
674
- end
675
- end
676
- matrix << row
677
- end
678
- row = []
679
- params[:features].each do |f|
680
- if params[:nr_hits]
681
- compound_feature_hits = params[:compound].match_hits([f])
682
- row << (compound_feature_hits.size == 0 ? 0.0 : (params[:p_values][f] * compound_feature_hits[f]))
683
- else
684
- row << (params[:compound].match([f]).size == 0 ? 0.0 : params[:p_values][f])
685
- end
426
+ # prediction
427
+ LOGGER.debug "Predicting ..."
428
+ @r.eval "p = predict(model,q_prop)"
429
+ @r.eval "if (class(y)!='numeric') p = as.character(p)"
430
+ prediction = @r.p
431
+
432
+ # censoring
433
+ prediction = nil if ( @r.perf.nan? || @r.perf < min_train_performance )
434
+ LOGGER.debug "Performance: #{sprintf("%.2f", @r.perf)}"
435
+ rescue Exception => e
436
+ LOGGER.debug "#{e.class}: #{e.message}"
437
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
686
438
  end
687
- rescue Exception => e
688
- LOGGER.debug "get_props failed with '" + $! + "'"
439
+ @r.quit # free R
689
440
  end
690
- [ matrix, row ]
441
+ prediction
691
442
  end
692
443
 
693
444
  end
694
445
 
446
+ module FeatureSelection
447
+ include Algorithm
448
+ # Recursive Feature Elimination using caret
449
+ # @param [Hash] required keys: ds_csv_file, prediction_feature, fds_csv_file (dataset CSV file, prediction feature column name, and feature dataset CSV file), optional: del_missing (delete rows with missing values).
450
+ # @return [String] feature dataset CSV file composed of selected features.
451
+ def self.rfe(params)
452
+ @r=RinRuby.new(false,false)
453
+ @r.ds_csv_file = params[:ds_csv_file].to_s
454
+ @r.prediction_feature = params[:prediction_feature].to_s
455
+ @r.fds_csv_file = params[:fds_csv_file].to_s
456
+ @r.del_missing = params[:del_missing] == true ? 1 : 0
457
+ r_result_file = params[:fds_csv_file].sub("rfe_", "rfe_R_")
458
+ @r.f_fds_r = r_result_file.to_s
459
+
460
+ # need packs 'randomForest', 'RANN'
461
+ @r.eval <<-EOR
462
+ set.seed(1)
463
+ suppressPackageStartupMessages(library('caret'))
464
+ suppressPackageStartupMessages(library('randomForest'))
465
+ suppressPackageStartupMessages(library('RANN'))
466
+ suppressPackageStartupMessages(library('doMC'))
467
+ registerDoMC()
468
+
469
+ acts = read.csv(ds_csv_file, check.names=F)
470
+ feats = read.csv(fds_csv_file, check.names=F)
471
+ ds = merge(acts, feats, by="SMILES") # duplicates features for duplicate SMILES :-)
472
+
473
+ features = ds[,(dim(acts)[2]+1):(dim(ds)[2])]
474
+ y = ds[,which(names(ds) == prediction_feature)]
475
+
476
+ # assumes a data matrix 'features' and a vector 'y' of target values
477
+ row.names(features)=NULL
478
+
479
+ pp = NULL
480
+ if (del_missing) {
481
+ # needed if rows should be removed
482
+ na_ids = apply(features,1,function(x)any(is.na(x)))
483
+ features = features[!na_ids,]
484
+ y = y[!na_ids]
485
+ pp = preProcess(features, method=c("scale", "center"))
486
+ } else {
487
+ # Use imputation if NA's random (only then!)
488
+ pp = preProcess(features, method=c("scale", "center", "knnImpute"))
489
+ }
490
+ features = predict(pp, features)
491
+
492
+ # determine subsets
493
+ subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
494
+ subsets = c(2,3,4,5,7,10,subsets)
495
+ subsets = unique(sort(round(subsets)))
496
+ subsets = subsets[subsets<=dim(features)[2]]
497
+ subsets = subsets[subsets>1]
498
+
499
+ # Recursive feature elimination
500
+ rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=50), sizes=subsets)
501
+
502
+ # read existing dataset and select most useful features
503
+ csv=feats[,c("SMILES", rfProfile$optVariables)]
504
+ write.csv(x=csv,file=f_fds_r, row.names=F, quote=F, na='')
505
+ EOR
506
+ r_result_file
507
+ end
508
+ end
509
+
695
510
  module Substructure
696
511
  include Algorithm
697
512
  # Substructure matching
698
- # @param [OpenTox::Compound] compound Compound
699
- # @param [Array] features Array with Smarts strings
513
+ # @param [Hash] required keys: compound, features
700
514
  # @return [Array] Array with matching Smarts
701
- def self.match(compound,features)
702
- compound.match(features)
515
+ def self.match(params)
516
+ params[:compound].match(params[:features])
703
517
  end
518
+
519
+ # Substructure matching with number of non-unique hits
520
+ # @param [Hash] required keys: compound, features
521
+ # @return [Hash] Hash with matching Smarts and number of hits
522
+ def self.match_hits(params)
523
+ params[:compound].match_hits(params[:features])
524
+ end
525
+
526
+ # Substructure matching with number of non-unique hits
527
+ # @param [Hash] required keys: compound, features, feature_dataset_uri, pc_type
528
+ # @return [Hash] Hash with matching Smarts and number of hits
529
+ def self.lookup(params)
530
+ params[:compound].lookup(params[:features], params[:feature_dataset_uri],params[:pc_type],params[:subjectid])
531
+ end
704
532
  end
705
533
 
706
534
  module Dataset
@@ -709,281 +537,5 @@ module OpenTox
709
537
  def features(dataset_uri,compound_uri)
710
538
  end
711
539
  end
712
-
713
- module Transform
714
- include Algorithm
715
-
716
- # The transformer that inverts values.
717
- # 1/x is used, after values have been moved >= 1.
718
- class Inverter
719
- attr_accessor :offset, :values
720
-
721
- # @params[Array] Values to transform.
722
- # @params[Float] Offset for restore.
723
- def initialize *args
724
- case args.size
725
- when 1
726
- begin
727
- values=args[0]
728
- raise "Cannot transform, values empty." if @values.size==0
729
- @values = values.collect { |v| -1.0 * v }
730
- @offset = 1.0 - @values.minmax[0]
731
- @offset = -1.0 * @offset if @offset>0.0
732
- @values.collect! { |v| v - @offset } # slide >1
733
- @values.collect! { |v| 1 / v } # invert to [0,1]
734
- rescue Exception => e
735
- LOGGER.debug "#{e.class}: #{e.message}"
736
- LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
737
- end
738
- when 2
739
- @offset = args[1].to_f
740
- @values = args[0].collect { |v| 1 / v }
741
- @values.collect! { |v| v + @offset }
742
- @values.collect! { |v| -1.0 * v }
743
- end
744
- end
745
- end
746
-
747
- # The transformer that takes logs.
748
- # Log10 is used, after values have been moved > 0.
749
- class Log10
750
- attr_accessor :offset, :values
751
-
752
- # @params[Array] Values to transform / restore.
753
- # @params[Float] Offset for restore.
754
- def initialize *args
755
- @distance_to_zero = 0.000000001 # 1 / 1 billion
756
- case args.size
757
- when 1
758
- begin
759
- values=args[0]
760
- raise "Cannot transform, values empty." if values.size==0
761
- @offset = values.minmax[0]
762
- @offset = -1.0 * @offset if @offset>0.0
763
- @values = values.collect { |v| v - @offset } # slide > anchor
764
- @values.collect! { |v| v + @distance_to_zero } #
765
- @values.collect! { |v| Math::log10 v } # log10 (can fail)
766
- rescue Exception => e
767
- LOGGER.debug "#{e.class}: #{e.message}"
768
- LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
769
- end
770
- when 2
771
- @offset = args[1].to_f
772
- @values = args[0].collect { |v| 10**v }
773
- @values.collect! { |v| v - @distance_to_zero }
774
- @values.collect! { |v| v + @offset }
775
- end
776
- end
777
- end
778
-
779
- # The transformer that does nothing (No OPeration).
780
- class NOP
781
- attr_accessor :offset, :values
782
-
783
- # @params[Array] Values to transform / restore.
784
- # @params[Float] Offset for restore.
785
- def initialize *args
786
- @offset = 0.0
787
- @distance_to_zero = 0.0
788
- case args.size
789
- when 1
790
- @values = args[0]
791
- when 2
792
- @values = args[0]
793
- end
794
- end
795
- end
796
-
797
-
798
- # Auto-Scaler for Arrays
799
- # Center on mean and divide by standard deviation
800
- class AutoScale
801
- attr_accessor :scaled_values, :mean, :stdev
802
-
803
- # @params[Array] Values to transform.
804
- def initialize values
805
- @scaled_values = values
806
- @mean = @scaled_values.to_scale.mean
807
- @stdev = @scaled_values.to_scale.standard_deviation_sample
808
- @scaled_values = @scaled_values.collect {|vi| vi - @mean }
809
- @scaled_values.collect! {|vi| vi / @stdev } unless @stdev == 0.0
810
- end
811
- end
812
-
813
- # Principal Components Analysis
814
- # Statsample Library (http://ruby-statsample.rubyforge.org/) by C. Bustos
815
- class PCA
816
- attr_accessor :data_matrix, :data_transformed_matrix, :eigenvector_matrix, :eigenvalue_sums, :autoscaler
817
-
818
- # Creates a transformed dataset as GSL::Matrix.
819
- # @param [GSL::Matrix] Data matrix.
820
- # @param [Float] Compression ratio from [0,1].
821
- # @return [GSL::Matrix] Data transformed matrix.
822
- def initialize data_matrix, compression=0.05
823
- begin
824
- @data_matrix = data_matrix
825
- @compression = compression.to_f
826
- @stdev = Array.new
827
- @mean = Array.new
828
-
829
- # Objective Feature Selection
830
- raise "Error! PCA needs at least two dimensions." if data_matrix.size2 < 2
831
- @data_matrix_selected = nil
832
- (0..@data_matrix.size2-1).each { |i|
833
- if !Algorithm::zero_variance?(@data_matrix.col(i).to_a)
834
- if @data_matrix_selected.nil?
835
- @data_matrix_selected = GSL::Matrix.alloc(@data_matrix.size1, 1)
836
- @data_matrix_selected.col(0)[0..@data_matrix.size1-1] = @data_matrix.col(i)
837
- else
838
- @data_matrix_selected = @data_matrix_selected.horzcat(GSL::Matrix.alloc(@data_matrix.col(i).to_a,@data_matrix.size1, 1))
839
- end
840
- end
841
- }
842
- raise "Error! PCA needs at least two dimensions." if (@data_matrix_selected.nil? || @data_matrix_selected.size2 < 2)
843
-
844
- # Scaling of Axes
845
- @data_matrix_scaled = GSL::Matrix.alloc(@data_matrix_selected.size1, @data_matrix_selected.size2)
846
- (0..@data_matrix_selected.size2-1).each { |i|
847
- @autoscaler = OpenTox::Algorithm::Transform::AutoScale.new(@data_matrix_selected.col(i))
848
- @data_matrix_scaled.col(i)[0..@data_matrix.size1-1] = @autoscaler.scaled_values
849
- @stdev << @autoscaler.stdev
850
- @mean << @autoscaler.mean
851
- }
852
-
853
- data_matrix_hash = Hash.new
854
- (0..@data_matrix_scaled.size2-1).each { |i|
855
- column_view = @data_matrix_scaled.col(i)
856
- data_matrix_hash[i] = column_view.to_scale
857
- }
858
- dataset_hash = data_matrix_hash.to_dataset # see http://goo.gl/7XcW9
859
- cor_matrix=Statsample::Bivariate.correlation_matrix(dataset_hash)
860
- pca=Statsample::Factor::PCA.new(cor_matrix)
861
- pca.eigenvalues.each { |ev| raise "PCA failed!" unless !ev.nan? }
862
- @eigenvalue_sums = Array.new
863
- (0..dataset_hash.fields.size-1).each { |i|
864
- @eigenvalue_sums << pca.eigenvalues[0..i].inject{ |sum, ev| sum + ev }
865
- }
866
- eigenvectors_selected = Array.new
867
- pca.eigenvectors.each_with_index { |ev, i|
868
- if (@eigenvalue_sums[i] <= ((1.0-@compression)*dataset_hash.fields.size)) || (eigenvectors_selected.size == 0)
869
- eigenvectors_selected << ev.to_a
870
- end
871
- }
872
- @eigenvector_matrix = GSL::Matrix.alloc(eigenvectors_selected.flatten, eigenvectors_selected.size, dataset_hash.fields.size).transpose
873
- dataset_matrix = dataset_hash.to_gsl.transpose
874
- @data_transformed_matrix = (@eigenvector_matrix.transpose * dataset_matrix).transpose
875
- rescue Exception => e
876
- LOGGER.debug "#{e.class}: #{e.message}"
877
- LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
878
- end
879
- end
880
-
881
- # Restores data in the original feature space (possibly with compression loss).
882
- # @return [GSL::Matrix] Data matrix.
883
- def restore
884
- begin
885
- data_matrix_restored = (@eigenvector_matrix * @data_transformed_matrix.transpose).transpose # reverse pca
886
- # reverse scaling
887
- (0..data_matrix_restored.size2-1).each { |i|
888
- data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] *= @stdev[i] unless @stdev[i] == 0.0
889
- data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] += @mean[i]
890
- }
891
- data_matrix_restored
892
- rescue Exception => e
893
- LOGGER.debug "#{e.class}: #{e.message}"
894
- LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
895
- end
896
- end
897
-
898
- end
899
-
900
- end
901
-
902
- # Gauss kernel
903
- # @return [Float]
904
- def self.gauss(x, sigma = 0.3)
905
- d = 1.0 - x.to_f
906
- Math.exp(-(d*d)/(2*sigma*sigma))
907
- end
908
-
909
- # For symbolic features
910
- # @param [Array] Array to test, must indicate non-occurrence with 0.
911
- # @return [Boolean] Whether the feature is singular or non-occurring or present everywhere.
912
- def self.isnull_or_singular?(array)
913
- nr_zeroes = array.count(0)
914
- return (nr_zeroes == array.size) || # remove non-occurring feature
915
- (nr_zeroes == array.size-1) || # remove singular feature
916
- (nr_zeroes == 0) # also remove feature present everywhere
917
- end
918
-
919
- # Numeric value test
920
- # @param[Object] value
921
- # @return [Boolean] Whether value is a number
922
- def self.numeric?(value)
923
- true if Float(value) rescue false
924
- end
925
-
926
- # For symbolic features
927
- # @param [Array] Array to test, must indicate non-occurrence with 0.
928
- # @return [Boolean] Whether the feature has variance zero.
929
- def self.zero_variance?(array)
930
- return (array.to_scale.variance_population == 0.0)
931
- end
932
-
933
- # Sum of an array for Arrays.
934
- # @param [Array] Array with values
935
- # @return [Integer] Sum of size of values
936
- def self.sum_size(array)
937
- sum=0
938
- array.each { |e| sum += e.size }
939
- return sum
940
- end
941
-
942
- # Minimum Frequency
943
- # @param [Integer] per-mil value
944
- # return [Integer] min-frequency
945
- def self.min_frequency(training_dataset,per_mil)
946
- minfreq = per_mil * training_dataset.compounds.size.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
947
- minfreq = 2 unless minfreq > 2
948
- Integer (minfreq)
949
- end
950
-
951
- # Effect calculation for classification
952
- # @param [Array] Array of occurrences per class in the form of Enumerables.
953
- # @param [Array] Array of database instance counts per class.
954
- def self.effect(occurrences, db_instances)
955
- max=0
956
- max_value=0
957
- nr_o = self.sum_size(occurrences)
958
- nr_db = db_instances.to_scale.sum
959
-
960
- occurrences.each_with_index { |o,i| # fminer outputs occurrences sorted reverse by activity.
961
- actual = o.size.to_f/nr_o
962
- expected = db_instances[i].to_f/nr_db
963
- if actual > expected
964
- if ((actual - expected) / actual) > max_value
965
- max_value = (actual - expected) / actual # 'Schleppzeiger'
966
- max = i
967
- end
968
- end
969
- }
970
- max
971
- end
972
-
973
- # Returns Support value of an fingerprint
974
- # @param [Hash] params Keys: `:compound_features_hits, :weights, :training_compound_features_hits, :features, :nr_hits:, :mode` are required
975
- # return [Numeric] Support value
976
- def self.p_sum_support(params)
977
- p_sum = 0.0
978
- params[:features].each{|f|
979
- compound_hits = params[:compound_features_hits][f]
980
- neighbor_hits = params[:training_compound_features_hits][f]
981
- p_sum += eval("(Algorithm.gauss(params[:weights][f]) * ([compound_hits, neighbor_hits].compact.#{params[:mode]}))")
982
- }
983
- p_sum
984
- end
985
-
986
540
  end
987
541
  end
988
-
989
-