opentox-ruby 3.0.1 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ChangeLog CHANGED
@@ -1,3 +1,11 @@
1
+ v3.1.0 2012-02-24
2
+ * utils.rb: added for special routines (e.g. descriptor calculation)
3
+ * task.rb: Polling with increasing interval
4
+ * parser.rb: CSV upload and download fixed
5
+ * transform.rb: routines to create machine learning data matrices
6
+ * algorithm.rb: SVM parameter grid search, cos similarity as algorithm,
7
+ gauss() removed
8
+
1
9
  v3.0.1 2011-10-19
2
10
  * feature: model registration to ontology service
3
11
  * ontology lib gets endpoints from ontology service
data/Rakefile CHANGED
@@ -16,7 +16,7 @@ begin
16
16
  gem.add_dependency "sinatra-respond_to", "=0.7.0"
17
17
  gem.add_dependency "sinatra-static-assets", "=0.5.0"
18
18
  gem.add_dependency "rest-client", "=1.6.1"
19
- gem.add_dependency "rack", "=1.3.1"
19
+ gem.add_dependency "rack", "=1.3.5"
20
20
  gem.add_dependency "rack-contrib", "=1.1.0"
21
21
  gem.add_dependency "rack-flash", "=0.1.1"
22
22
  gem.add_dependency "nokogiri", "=1.4.4"
@@ -42,10 +42,9 @@ begin
42
42
  gem.add_dependency "dm-migrations", "=1.1.0"
43
43
  gem.add_dependency "dm-validations", "=1.1.0"
44
44
  gem.add_dependency "dm-sqlite-adapter", "=1.1.0"
45
- gem.add_dependency "ruby-plot", "=0.5.0"
45
+ gem.add_dependency "ruby-plot", "=0.6.0"
46
46
  gem.add_dependency "gsl", "=1.14.7"
47
47
  gem.add_dependency "statsample", "=1.1.0"
48
- #gem.add_dependency "statsample-optimization", "=2.1.0"
49
48
 
50
49
  gem.add_development_dependency 'jeweler'
51
50
  gem.files = FileList["[A-Z]*", "{bin,generators,lib,test}/**/*", 'lib/jeweler/templates/.gitignore']
data/VERSION CHANGED
@@ -1 +1 @@
1
- 3.0.1
1
+ 3.1.0
data/lib/algorithm.rb CHANGED
@@ -5,6 +5,8 @@ R = nil
5
5
  require "rinruby"
6
6
  require "statsample"
7
7
  require 'uri'
8
+ require 'transform.rb'
9
+ require 'utils.rb'
8
10
 
9
11
  module OpenTox
10
12
 
@@ -13,7 +15,7 @@ module OpenTox
13
15
 
14
16
  include OpenTox
15
17
 
16
- # Execute algorithm with parameters, please consult the OpenTox API and the webservice documentation for acceptable parameters
18
+ # Execute algorithm with parameters, consult OpenTox API and webservice documentation for acceptable parameters
17
19
  # @param [optional,Hash] params Algorithm parameters
18
20
  # @param [optional,OpenTox::Task] waiting_task (can be a OpenTox::Subtask as well), progress is updated accordingly
19
21
  # @return [String] URI of new resource (dataset, model, ...)
@@ -21,7 +23,7 @@ module OpenTox
21
23
  LOGGER.info "Running algorithm '"+@uri.to_s+"' with params: "+params.inspect
22
24
  RestClientWrapper.post(@uri, params, {:accept => 'text/uri-list'}, waiting_task).to_s
23
25
  end
24
-
26
+
25
27
  # Get OWL-DL representation in RDF/XML format
26
28
  # @return [application/rdf+xml] RDF/XML representation
27
29
  def to_rdfxml
@@ -33,7 +35,7 @@ module OpenTox
33
35
  # Generic Algorithm class, should work with all OpenTox webservices
34
36
  class Generic
35
37
  include Algorithm
36
-
38
+
37
39
  # Find Generic Opentox Algorithm via URI, and loads metadata, could raise NotFound/NotAuthorized error
38
40
  # @param [String] uri Algorithm URI
39
41
  # @return [OpenTox::Algorithm::Generic] Algorithm instance
@@ -44,14 +46,14 @@ module OpenTox
44
46
  raise "cannot load algorithm metadata" if alg.metadata==nil or alg.metadata.size==0
45
47
  alg
46
48
  end
47
-
49
+
48
50
  end
49
51
 
50
52
  # Fminer algorithms (https://github.com/amaunz/fminer2)
51
53
  class Fminer
52
54
  include Algorithm
53
55
  attr_accessor :prediction_feature, :training_dataset, :minfreq, :compounds, :db_class_sizes, :all_activities, :smi
54
-
56
+
55
57
  def check_params(params,per_mil,subjectid=nil)
56
58
  raise OpenTox::NotFoundError.new "Please submit a dataset_uri." unless params[:dataset_uri] and !params[:dataset_uri].nil?
57
59
  raise OpenTox::NotFoundError.new "Please submit a prediction_feature." unless params[:prediction_feature] and !params[:prediction_feature].nil?
@@ -81,7 +83,7 @@ module OpenTox
81
83
  LOGGER.warn "Cannot find smiles for #{compound.to_s}."
82
84
  next
83
85
  end
84
-
86
+
85
87
  value_map=params[:value_map] unless params[:value_map].nil?
86
88
  entry.each do |feature,values|
87
89
  if feature == @prediction_feature.uri
@@ -90,7 +92,7 @@ module OpenTox
90
92
  LOGGER.warn "No #{feature} activity for #{compound.to_s}."
91
93
  else
92
94
  if @prediction_feature.feature_type == "classification"
93
- activity= value_map.invert[value].to_i # activities are mapped to 1..n
95
+ activity= value_map.invert[value.to_s].to_i # activities are mapped to 1..n
94
96
  @db_class_sizes[activity-1].nil? ? @db_class_sizes[activity-1]=1 : @db_class_sizes[activity-1]+=1 # AM effect
95
97
  elsif @prediction_feature.feature_type == "regression"
96
98
  activity= value.to_f
@@ -115,23 +117,23 @@ module OpenTox
115
117
 
116
118
  end
117
119
 
118
- # Backbone Refinement Class mining (http://bbrc.maunz.de/)
119
- class BBRC < Fminer
120
- # Initialize bbrc algorithm
121
- def initialize(subjectid=nil)
122
- super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/bbrc")
123
- load_metadata(subjectid)
124
- end
120
+ # Backbone Refinement Class mining (http://bbrc.maunz.de/)
121
+ class BBRC < Fminer
122
+ # Initialize bbrc algorithm
123
+ def initialize(subjectid=nil)
124
+ super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/bbrc")
125
+ load_metadata(subjectid)
125
126
  end
127
+ end
126
128
 
127
- # LAtent STructure Pattern Mining (http://last-pm.maunz.de)
128
- class LAST < Fminer
129
- # Initialize last algorithm
130
- def initialize(subjectid=nil)
131
- super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/last")
132
- load_metadata(subjectid)
133
- end
129
+ # LAtent STructure Pattern Mining (http://last-pm.maunz.de)
130
+ class LAST < Fminer
131
+ # Initialize last algorithm
132
+ def initialize(subjectid=nil)
133
+ super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/last")
134
+ load_metadata(subjectid)
134
135
  end
136
+ end
135
137
 
136
138
 
137
139
  # Create lazar prediction model
@@ -144,72 +146,6 @@ module OpenTox
144
146
  end
145
147
  end
146
148
 
147
- # Utility methods without dedicated webservices
148
-
149
- # Similarity calculations
150
- module Similarity
151
- include Algorithm
152
-
153
- # Tanimoto similarity
154
- # @param [Array] features_a Features of first compound
155
- # @param [Array] features_b Features of second compound
156
- # @param [optional, Hash] weights Weights for all features
157
- # @param [optional, Hash] params Keys: `:training_compound, :compound, :training_compound_features_hits, :nr_hits, :compound_features_hits` are required
158
- # @return [Float] (Weighted) tanimoto similarity
159
- def self.tanimoto(features_a,features_b,weights=nil,params=nil)
160
- common_features = features_a & features_b
161
- all_features = (features_a + features_b).uniq
162
- #LOGGER.debug "dv --------------- common: #{common_features}, all: #{all_features}"
163
- if common_features.size > 0
164
- if weights
165
- #LOGGER.debug "nr_hits: #{params[:nr_hits]}"
166
- if !params.nil? && params[:nr_hits]
167
- params[:weights] = weights
168
- params[:mode] = "min"
169
- params[:features] = common_features
170
- common_p_sum = Algorithm.p_sum_support(params)
171
- params[:mode] = "max"
172
- params[:features] = all_features
173
- all_p_sum = Algorithm.p_sum_support(params)
174
- else
175
- common_p_sum = 0.0
176
- common_features.each{|f| common_p_sum += Algorithm.gauss(weights[f])}
177
- all_p_sum = 0.0
178
- all_features.each{|f| all_p_sum += Algorithm.gauss(weights[f])}
179
- end
180
- #LOGGER.debug "common_p_sum: #{common_p_sum}, all_p_sum: #{all_p_sum}, c/a: #{common_p_sum/all_p_sum}"
181
- common_p_sum/all_p_sum
182
- else
183
- #LOGGER.debug "common_features : #{common_features}, all_features: #{all_features}, c/a: #{(common_features.size/all_features.size).to_f}"
184
- common_features.size.to_f/all_features.size.to_f
185
- end
186
- else
187
- 0.0
188
- end
189
- end
190
-
191
- # Euclidean similarity
192
- # @param [Hash] properties_a Properties of first compound
193
- # @param [Hash] properties_b Properties of second compound
194
- # @param [optional, Hash] weights Weights for all properties
195
- # @return [Float] (Weighted) euclidean similarity
196
- def self.euclidean(properties_a,properties_b,weights=nil)
197
- common_properties = properties_a.keys & properties_b.keys
198
- if common_properties.size > 1
199
- dist_sum = 0
200
- common_properties.each do |p|
201
- if weights
202
- dist_sum += ( (properties_a[p] - properties_b[p]) * Algorithm.gauss(weights[p]) )**2
203
- else
204
- dist_sum += (properties_a[p] - properties_b[p])**2
205
- end
206
- end
207
- 1/(1+Math.sqrt(dist_sum))
208
- else
209
- 0.0
210
- end
211
- end
212
- end
213
149
 
214
150
  # Structural Graph Clustering by TU Munich
215
151
  # Finds clusters similar to a query structure in a given training dataset
@@ -226,7 +162,7 @@ module OpenTox
226
162
  raise "Invalid URI."
227
163
  end
228
164
  @training_dataset_uri = training_dataset_uri
229
- if !OpenTox::Algorithm.numeric? training_threshold || training_threshold <0 || training_threshold >1
165
+ if !self.numeric? training_threshold || training_threshold <0 || training_threshold >1
230
166
  raise "Training threshold out of bounds."
231
167
  end
232
168
  @training_threshold = training_threshold.to_f
@@ -259,7 +195,7 @@ module OpenTox
259
195
  # @param [Float] query_threshold Similarity threshold for query to clusters (optional)
260
196
  def get_clusters query_compound_uri, query_threshold = 0.5
261
197
 
262
- if !OpenTox::Algorithm.numeric? query_threshold || query_threshold <0 || query_threshold >1
198
+ if !self.numeric? query_threshold || query_threshold <0 || query_threshold >1
263
199
  raise "Query threshold out of bounds."
264
200
  end
265
201
  @query_threshold = query_threshold.to_f
@@ -285,7 +221,7 @@ module OpenTox
285
221
  metadata[DC.title][pattern]=""
286
222
  feature_clusterid_map[feature_uri] = metadata[DC.title].to_i
287
223
  }
288
-
224
+
289
225
  # Integrity check
290
226
  unless cluster_query_dataset.compounds.size == 1
291
227
  raise "Number of predicted compounds is != 1."
@@ -295,11 +231,11 @@ module OpenTox
295
231
  query_compound_uri = cluster_query_dataset.compounds[0]
296
232
  @target_clusters_array = Array.new
297
233
  cluster_query_dataset.features.keys.each { |cluster_membership_feature|
298
-
234
+
299
235
  # Getting dataset URI for cluster
300
236
  target_cluster = feature_clusterid_map[cluster_membership_feature]
301
237
  dataset = @clusterid_dataset_map[target_cluster]
302
-
238
+
303
239
  # Finally look up presence
304
240
  data_entry = cluster_query_dataset.data_entries[query_compound_uri]
305
241
  present = data_entry[cluster_membership_feature][0]
@@ -311,85 +247,13 @@ module OpenTox
311
247
 
312
248
  end
313
249
 
314
- module Neighbors
315
-
316
- # Local multi-linear regression (MLR) prediction from neighbors.
317
- # Uses propositionalized setting.
318
- # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
319
- # @return [Numeric] A prediction value.
320
- def self.local_mlr_prop(params)
321
-
322
- confidence=0.0
323
- prediction=nil
324
-
325
- if params[:neighbors].size>0
326
- props = params[:prop_kernel] ? get_props(params) : nil
327
- acts = params[:neighbors].collect { |n| act = n[:activity].to_f }
328
- sims = params[:neighbors].collect { |n| Algorithm.gauss(n[:similarity]) }
329
- LOGGER.debug "Local MLR (Propositionalization / GSL)."
330
- prediction = mlr( {:n_prop => props[0], :q_prop => props[1], :sims => sims, :acts => acts} )
331
- transformer = eval("OpenTox::Algorithm::Transform::#{params[:transform]["class"]}.new ([#{prediction}], #{params[:transform]["offset"]})")
332
- prediction = transformer.values[0]
333
- prediction = nil if prediction.infinite? || params[:prediction_min_max][1] < prediction || params[:prediction_min_max][0] > prediction
334
- LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
335
- params[:conf_stdev] = false if params[:conf_stdev].nil?
336
- confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]})
337
- confidence = nil if prediction.nil?
338
- end
339
- {:prediction => prediction, :confidence => confidence}
340
-
341
- end
342
-
343
- # Multi-linear regression weighted by similarity.
344
- # Objective Feature Selection, Principal Components Analysis, Scaling of Axes.
345
- # @param [Hash] params Keys `:n_prop, :q_prop, :sims, :acts` are required
346
- # @return [Numeric] A prediction value.
347
- def self.mlr(params)
348
-
349
- # GSL matrix operations:
350
- # to_a : row-wise conversion to nested array
351
- #
352
- # Statsample operations (build on GSL):
353
- # to_scale: convert into Statsample format
354
-
355
- begin
356
- n_prop = params[:n_prop].collect { |v| v }
357
- q_prop = params[:q_prop].collect { |v| v }
358
- n_prop << q_prop # attach q_prop
359
- nr_cases, nr_features = get_sizes n_prop
360
- data_matrix = GSL::Matrix.alloc(n_prop.flatten, nr_cases, nr_features)
361
-
362
- # Principal Components Analysis
363
- LOGGER.debug "PCA..."
364
- pca = OpenTox::Algorithm::Transform::PCA.new(data_matrix)
365
- data_matrix = pca.data_transformed_matrix
366
-
367
- # Attach intercept column to data
368
- intercept = GSL::Matrix.alloc(Array.new(nr_cases,1.0),nr_cases,1)
369
- data_matrix = data_matrix.horzcat(intercept)
370
- (0..data_matrix.size2-2).each { |i|
371
- autoscaler = OpenTox::Algorithm::Transform::AutoScale.new(data_matrix.col(i))
372
- data_matrix.col(i)[0..data_matrix.size1-1] = autoscaler.scaled_values
373
- }
374
250
 
375
- # Detach query instance
376
- n_prop = data_matrix.to_a
377
- q_prop = n_prop.pop
378
- nr_cases, nr_features = get_sizes n_prop
379
- data_matrix = GSL::Matrix.alloc(n_prop.flatten, nr_cases, nr_features)
380
251
 
381
- # model + support vectors
382
- LOGGER.debug "Creating MLR model ..."
383
- c, cov, chisq, status = GSL::MultiFit::wlinear(data_matrix, params[:sims].to_scale.to_gsl, params[:acts].to_scale.to_gsl)
384
- GSL::MultiFit::linear_est(q_prop.to_scale.to_gsl, c, cov)[0]
385
- rescue Exception => e
386
- LOGGER.debug "#{e.class}: #{e.message}"
387
- end
252
+ module Neighbors
388
253
 
389
- end
390
254
 
391
255
  # Classification with majority vote from neighbors weighted by similarity
392
- # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
256
+ # @param [Hash] params Keys `:acts, :sims, :value_map` are required
393
257
  # @return [Numeric] A prediction value.
394
258
  def self.weighted_majority_vote(params)
395
259
 
@@ -398,12 +262,13 @@ module OpenTox
398
262
  confidence = 0.0
399
263
  prediction = nil
400
264
 
401
- params[:neighbors].each do |neighbor|
402
- neighbor_weight = Algorithm.gauss(neighbor[:similarity]).to_f
403
- neighbor_contribution += neighbor[:activity].to_f * neighbor_weight
265
+ LOGGER.debug "Weighted Majority Vote Classification."
404
266
 
267
+ params[:acts].each_index do |idx|
268
+ neighbor_weight = params[:sims][1][idx]
269
+ neighbor_contribution += params[:acts][idx] * neighbor_weight
405
270
  if params[:value_map].size == 2 # AM: provide compat to binary classification: 1=>false 2=>true
406
- case neighbor[:activity]
271
+ case params[:acts][idx]
407
272
  when 1
408
273
  confidence_sum -= neighbor_weight
409
274
  when 2
@@ -413,294 +278,257 @@ module OpenTox
413
278
  confidence_sum += neighbor_weight
414
279
  end
415
280
  end
416
-
417
281
  if params[:value_map].size == 2
418
282
  if confidence_sum >= 0.0
419
- prediction = 2 unless params[:neighbors].size==0
283
+ prediction = 2 unless params[:acts].size==0
420
284
  elsif confidence_sum < 0.0
421
- prediction = 1 unless params[:neighbors].size==0
285
+ prediction = 1 unless params[:acts].size==0
422
286
  end
423
287
  else
424
- prediction = (neighbor_contribution/confidence_sum).round unless params[:neighbors].size==0 # AM: new multinomial prediction
288
+ prediction = (neighbor_contribution/confidence_sum).round unless params[:acts].size==0 # AM: new multinomial prediction
425
289
  end
290
+
426
291
  LOGGER.debug "Prediction is: '" + prediction.to_s + "'." unless prediction.nil?
427
- confidence = confidence_sum/params[:neighbors].size if params[:neighbors].size > 0
292
+ confidence = (confidence_sum/params[:acts].size).abs if params[:acts].size > 0
428
293
  LOGGER.debug "Confidence is: '" + confidence.to_s + "'." unless prediction.nil?
429
294
  return {:prediction => prediction, :confidence => confidence.abs}
430
295
  end
431
296
 
297
+
298
+
432
299
  # Local support vector regression from neighbors
433
- # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
300
+ # @param [Hash] params Keys `:props, :acts, :sims, :min_train_performance` are required
434
301
  # @return [Numeric] A prediction value.
435
302
  def self.local_svm_regression(params)
436
303
 
437
- confidence = 0.0
438
- prediction = nil
439
- if params[:neighbors].size>0
440
- props = params[:prop_kernel] ? get_props(params) : nil
441
- acts = params[:neighbors].collect{ |n| n[:activity].to_f }
442
- sims = params[:neighbors].collect{ |n| Algorithm.gauss(n[:similarity]) }
443
- prediction = props.nil? ? local_svm(acts, sims, "nu-svr", params) : local_svm_prop(props, acts, "nu-svr")
444
- transformer = eval("OpenTox::Algorithm::Transform::#{params[:transform]["class"]}.new ([#{prediction}], #{params[:transform]["offset"]})")
445
- prediction = transformer.values[0]
446
- prediction = nil if prediction.infinite? || params[:prediction_min_max][1] < prediction || params[:prediction_min_max][0] > prediction
447
- LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
448
- params[:conf_stdev] = false if params[:conf_stdev].nil?
449
- confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]})
450
- confidence = nil if prediction.nil?
304
+ begin
305
+ confidence = 0.0
306
+ prediction = nil
307
+
308
+ LOGGER.debug "Local SVM."
309
+ if params[:acts].size>0
310
+ if params[:props]
311
+ n_prop = params[:props][0].collect
312
+ q_prop = params[:props][1].collect
313
+ props = [ n_prop, q_prop ]
314
+ end
315
+ acts = params[:acts].collect
316
+ prediction = local_svm_prop( props, acts, params[:min_train_performance]) # params[:props].nil? signals non-prop setting
317
+ prediction = nil if (!prediction.nil? && prediction.infinite?)
318
+ LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
319
+ confidence = get_confidence({:sims => params[:sims][1], :acts => params[:acts]})
320
+ confidence = 0.0 if prediction.nil?
321
+ end
322
+ {:prediction => prediction, :confidence => confidence}
323
+ rescue Exception => e
324
+ LOGGER.debug "#{e.class}: #{e.message}"
325
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
451
326
  end
452
- {:prediction => prediction, :confidence => confidence}
453
-
327
+
454
328
  end
455
329
 
456
- # Local support vector classification from neighbors
457
- # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
330
+
331
+ # Local support vector classification from neighbors
332
+ # @param [Hash] params Keys `:props, :acts, :sims, :min_train_performance` are required
458
333
  # @return [Numeric] A prediction value.
459
334
  def self.local_svm_classification(params)
460
335
 
461
- confidence = 0.0
462
- prediction = nil
463
- if params[:neighbors].size>0
464
- props = params[:prop_kernel] ? get_props(params) : nil
465
- acts = params[:neighbors].collect { |n| act = n[:activity] }
466
- sims = params[:neighbors].collect{ |n| Algorithm.gauss(n[:similarity]) } # similarity values btwn q and nbors
467
- prediction = props.nil? ? local_svm(acts, sims, "C-bsvc", params) : local_svm_prop(props, acts, "C-bsvc")
468
- LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
469
- params[:conf_stdev] = false if params[:conf_stdev].nil?
470
- confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]})
336
+ begin
337
+ confidence = 0.0
338
+ prediction = nil
339
+
340
+ LOGGER.debug "Local SVM."
341
+ if params[:acts].size>0
342
+ if params[:props]
343
+ n_prop = params[:props][0].collect
344
+ q_prop = params[:props][1].collect
345
+ props = [ n_prop, q_prop ]
346
+ end
347
+ acts = params[:acts].collect
348
+ acts = acts.collect{|v| "Val" + v.to_s} # Convert to string for R to recognize classification
349
+ prediction = local_svm_prop( props, acts, params[:min_train_performance]) # params[:props].nil? signals non-prop setting
350
+ prediction = prediction.sub(/Val/,"") if prediction # Convert back to Float
351
+ confidence = 0.0 if prediction.nil?
352
+ LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
353
+ confidence = get_confidence({:sims => params[:sims][1], :acts => params[:acts]})
354
+ end
355
+ {:prediction => prediction, :confidence => confidence}
356
+ rescue Exception => e
357
+ LOGGER.debug "#{e.class}: #{e.message}"
358
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
471
359
  end
472
- {:prediction => prediction, :confidence => confidence}
473
-
360
+
474
361
  end
475
362
 
476
363
 
364
+
477
365
  # Local support vector prediction from neighbors.
478
- # Uses pre-defined Kernel Matrix.
366
+ # Uses propositionalized setting.
479
367
  # Not to be called directly (use local_svm_regression or local_svm_classification).
368
+ # @param [Array] props, propositionalization of neighbors and query structure e.g. [ two-nested-Arrays_for_n, Array_for_q ]
480
369
  # @param [Array] acts, activities for neighbors.
481
- # @param [Array] sims, similarities for neighbors.
482
- # @param [String] type, one of "nu-svr" (regression) or "C-bsvc" (classification).
483
- # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
370
+ # @param [Float] min_train_performance, parameter to control censoring
484
371
  # @return [Numeric] A prediction value.
485
- def self.local_svm(acts, sims, type, params)
486
- LOGGER.debug "Local SVM (Weighted Tanimoto Kernel)."
487
- neighbor_matches = params[:neighbors].collect{ |n| n[:features] } # URIs of matches
488
- gram_matrix = [] # square matrix of similarities between neighbors; implements weighted tanimoto kernel
372
+ def self.local_svm_prop(props, acts, min_train_performance)
373
+
374
+ LOGGER.debug "Local SVM (Propositionalization / Kernlab Kernel)."
375
+ n_prop = props[0] # is a matrix, i.e. two nested Arrays.
376
+ q_prop = props[1] # is an Array.
489
377
 
490
378
  prediction = nil
491
379
  if Algorithm::zero_variance? acts
492
380
  prediction = acts[0]
493
381
  else
494
- # gram matrix
495
- (0..(neighbor_matches.length-1)).each do |i|
496
- neighbor_i_hits = params[:fingerprints][params[:neighbors][i][:compound]]
497
- gram_matrix[i] = [] unless gram_matrix[i]
498
- # upper triangle
499
- ((i+1)..(neighbor_matches.length-1)).each do |j|
500
- neighbor_j_hits= params[:fingerprints][params[:neighbors][j][:compound]]
501
- sim_params = {}
502
- if params[:nr_hits]
503
- sim_params[:nr_hits] = true
504
- sim_params[:compound_features_hits] = neighbor_i_hits
505
- sim_params[:training_compound_features_hits] = neighbor_j_hits
506
- end
507
- sim = eval("#{params[:similarity_algorithm]}(neighbor_matches[i], neighbor_matches[j], params[:p_values], sim_params)")
508
- gram_matrix[i][j] = Algorithm.gauss(sim)
509
- gram_matrix[j] = [] unless gram_matrix[j]
510
- gram_matrix[j][i] = gram_matrix[i][j] # lower triangle
511
- end
512
- gram_matrix[i][i] = 1.0
513
- end
514
-
515
-
516
382
  #LOGGER.debug gram_matrix.to_yaml
517
383
  @r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests
518
- @r.eval "library('kernlab')" # this requires R package "kernlab" to be installed
519
- LOGGER.debug "Setting R data ..."
520
- # set data
521
- @r.gram_matrix = gram_matrix.flatten
522
- @r.n = neighbor_matches.size
523
- @r.y = acts
524
- @r.sims = sims
525
-
384
+ @r.eval "set.seed(1)"
385
+ @r.eval "suppressPackageStartupMessages(library('caret'))" # requires R packages "caret" and "kernlab"
386
+ @r.eval "suppressPackageStartupMessages(library('doMC'))" # requires R packages "multicore"
387
+ @r.eval "registerDoMC()" # switch on parallel processing
526
388
  begin
527
- LOGGER.debug "Preparing R data ..."
528
- # prepare data
529
- @r.eval "y<-as.vector(y)"
530
- @r.eval "gram_matrix<-as.kernelMatrix(matrix(gram_matrix,n,n))"
531
- @r.eval "sims<-as.vector(sims)"
532
-
533
- # model + support vectors
534
- LOGGER.debug "Creating SVM model ..."
535
- @r.eval "model<-ksvm(gram_matrix, y, kernel=matrix, type=\"#{type}\", nu=0.5)"
536
- @r.eval "sv<-as.vector(SVindex(model))"
537
- @r.eval "sims<-sims[sv]"
538
- @r.eval "sims<-as.kernelMatrix(matrix(sims,1))"
539
- LOGGER.debug "Predicting ..."
540
- if type == "nu-svr"
541
- @r.eval "p<-predict(model,sims)[1,1]"
542
- elsif type == "C-bsvc"
543
- @r.eval "p<-predict(model,sims)"
544
- end
545
- if type == "nu-svr"
546
- prediction = @r.p
547
- elsif type == "C-bsvc"
548
- #prediction = (@r.p.to_f == 1.0 ? true : false)
549
- prediction = @r.p
550
- end
551
- @r.quit # free R
552
- rescue Exception => e
553
- LOGGER.debug "#{e.class}: #{e.message}"
554
- LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
555
- end
556
-
557
- end
558
- prediction
559
- end
560
-
561
- # Local support vector prediction from neighbors.
562
- # Uses propositionalized setting.
563
- # Not to be called directly (use local_svm_regression or local_svm_classification).
564
- # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ]
565
- # @param [Array] acts, activities for neighbors.
566
- # @param [String] type, one of "nu-svr" (regression) or "C-bsvc" (classification).
567
- # @return [Numeric] A prediction value.
568
- def self.local_svm_prop(props, acts, type)
569
-
570
- LOGGER.debug "Local SVM (Propositionalization / Kernlab Kernel)."
571
- n_prop = props[0] # is a matrix, i.e. two nested Arrays.
572
- q_prop = props[1] # is an Array.
573
389
 
574
- prediction = nil
575
- if Algorithm::zero_variance? acts
576
- prediction = acts[0]
577
- else
578
- #LOGGER.debug gram_matrix.to_yaml
579
- @r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests
580
- @r.eval "library('kernlab')" # this requires R package "kernlab" to be installed
581
- LOGGER.debug "Setting R data ..."
582
390
  # set data
391
+ LOGGER.debug "Setting R data ..."
583
392
  @r.n_prop = n_prop.flatten
584
393
  @r.n_prop_x_size = n_prop.size
585
394
  @r.n_prop_y_size = n_prop[0].size
586
395
  @r.y = acts
587
396
  @r.q_prop = q_prop
397
+ #@r.eval "y = matrix(y)"
398
+ @r.eval "prop_matrix = matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=T)"
399
+ @r.eval "q_prop = matrix(q_prop, 1, n_prop_y_size, byrow=T)"
588
400
 
589
- begin
590
- LOGGER.debug "Preparing R data ..."
591
- # prepare data
592
- @r.eval "y<-matrix(y)"
593
- @r.eval "prop_matrix<-matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=TRUE)"
594
- @r.eval "q_prop<-matrix(q_prop, 1, n_prop_y_size, byrow=TRUE)"
595
-
596
- # model + support vectors
597
- LOGGER.debug "Creating SVM model ..."
598
- @r.eval "model<-ksvm(prop_matrix, y, type=\"#{type}\", nu=0.5)"
599
- LOGGER.debug "Predicting ..."
600
- if type == "nu-svr"
601
- @r.eval "p<-predict(model,q_prop)[1,1]"
602
- elsif type == "C-bsvc"
603
- @r.eval "p<-predict(model,q_prop)"
604
- end
605
- if type == "nu-svr"
606
- prediction = @r.p
607
- elsif type == "C-bsvc"
608
- #prediction = (@r.p.to_f == 1.0 ? true : false)
609
- prediction = @r.p
610
- end
611
- @r.quit # free R
612
- rescue Exception => e
613
- LOGGER.debug "#{e.class}: #{e.message}"
614
- LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
615
- end
616
- end
617
- prediction
618
- end
401
+ # prepare data
402
+ LOGGER.debug "Preparing R data ..."
403
+ @r.eval "if (class(y) == 'character') { y = factor(y); suppressPackageStartupMessages(library('class')) }" # For classification
404
+
405
+ @r.eval <<-EOR
406
+ rem = nearZeroVar(prop_matrix)
407
+ if (length(rem) > 0) {
408
+ prop_matrix = prop_matrix[,-rem,drop=F]
409
+ q_prop = q_prop[,-rem,drop=F]
410
+ }
411
+ rem = findCorrelation(cor(prop_matrix))
412
+ if (length(rem) > 0) {
413
+ prop_matrix = prop_matrix[,-rem,drop=F]
414
+ q_prop = q_prop[,-rem,drop=F]
415
+ }
416
+ EOR
619
417
 
620
- # Get confidence for regression, with standard deviation of neighbor activity if conf_stdev is set.
621
- # @param[Hash] Required keys: :sims, :acts, :neighbors, :conf_stdev
622
- # @return[Float] Confidence
623
- def self.get_confidence(params)
624
- if params[:conf_stdev]
625
- sim_median = params[:sims].to_scale.median
626
- if sim_median.nil?
627
- confidence = nil
628
- else
629
- standard_deviation = params[:acts].to_scale.standard_deviation_sample
630
- confidence = (sim_median*Math.exp(-1*standard_deviation)).abs
631
- if confidence.nan?
632
- confidence = nil
633
- end
634
- end
635
- else
636
- conf = params[:sims].inject{|sum,x| sum + x }
637
- confidence = conf/params[:neighbors].size
638
- end
639
- LOGGER.debug "Confidence is: '" + confidence.to_s + "'."
640
- return confidence
641
- end
418
+ # model + support vectors
419
+ LOGGER.debug "Creating R SVM model ..."
420
+ @r.eval <<-EOR
421
+ model = train(prop_matrix,y,method="svmradial",tuneLength=8,trControl=trainControl(method="LGOCV",number=10),preProcess=c("center", "scale"))
422
+ perf = ifelse ( class(y)!='numeric', max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
423
+ EOR
642
424
 
643
- # Get X and Y size of a nested Array (Matrix)
644
- def self.get_sizes(matrix)
645
- begin
646
- nr_cases = matrix.size
647
- nr_features = matrix[0].size
648
- rescue Exception => e
649
- LOGGER.debug "#{e.class}: #{e.message}"
650
- LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
651
- end
652
- #puts "NRC: #{nr_cases}, NRF: #{nr_features}"
653
- [ nr_cases, nr_features ]
654
- end
655
425
 
656
- # Calculate the propositionalization matrix aka instantiation matrix (0/1 entries for features)
657
- # Same for the vector describing the query compound
658
- # @param[Array] neighbors.
659
- # @param[OpenTox::Compound] query compound.
660
- # @param[Array] Dataset Features.
661
- # @param[Array] Fingerprints of neighbors.
662
- # @param[Float] p-values of Features.
663
- def self.get_props (params)
664
- matrix = Array.new
665
- begin
666
- params[:neighbors].each do |n|
667
- n = n[:compound]
668
- row = []
669
- params[:features].each do |f|
670
- if ! params[:fingerprints][n].nil?
671
- row << (params[:fingerprints][n].include?(f) ? (params[:p_values][f] * params[:fingerprints][n][f]) : 0.0)
672
- else
673
- row << 0.0
674
- end
675
- end
676
- matrix << row
677
- end
678
- row = []
679
- params[:features].each do |f|
680
- if params[:nr_hits]
681
- compound_feature_hits = params[:compound].match_hits([f])
682
- row << (compound_feature_hits.size == 0 ? 0.0 : (params[:p_values][f] * compound_feature_hits[f]))
683
- else
684
- row << (params[:compound].match([f]).size == 0 ? 0.0 : params[:p_values][f])
685
- end
426
+ # prediction
427
+ LOGGER.debug "Predicting ..."
428
+ @r.eval "p = predict(model,q_prop)"
429
+ @r.eval "if (class(y)!='numeric') p = as.character(p)"
430
+ prediction = @r.p
431
+
432
+ # censoring
433
+ prediction = nil if ( @r.perf.nan? || @r.perf < min_train_performance )
434
+ LOGGER.debug "Performance: #{sprintf("%.2f", @r.perf)}"
435
+ rescue Exception => e
436
+ LOGGER.debug "#{e.class}: #{e.message}"
437
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
686
438
  end
687
- rescue Exception => e
688
- LOGGER.debug "get_props failed with '" + $! + "'"
439
+ @r.quit # free R
689
440
  end
690
- [ matrix, row ]
441
+ prediction
691
442
  end
692
443
 
693
444
  end
694
445
 
446
+ module FeatureSelection
447
+ include Algorithm
448
+ # Recursive Feature Elimination using caret
449
+ # @param [Hash] required keys: ds_csv_file, prediction_feature, fds_csv_file (dataset CSV file, prediction feature column name, and feature dataset CSV file), optional: del_missing (delete rows with missing values).
450
+ # @return [String] feature dataset CSV file composed of selected features.
451
+ def self.rfe(params)
452
+ @r=RinRuby.new(false,false)
453
+ @r.ds_csv_file = params[:ds_csv_file].to_s
454
+ @r.prediction_feature = params[:prediction_feature].to_s
455
+ @r.fds_csv_file = params[:fds_csv_file].to_s
456
+ @r.del_missing = params[:del_missing] == true ? 1 : 0
457
+ r_result_file = params[:fds_csv_file].sub("rfe_", "rfe_R_")
458
+ @r.f_fds_r = r_result_file.to_s
459
+
460
+ # need packs 'randomForest', 'RANN'
461
+ @r.eval <<-EOR
462
+ set.seed(1)
463
+ suppressPackageStartupMessages(library('caret'))
464
+ suppressPackageStartupMessages(library('randomForest'))
465
+ suppressPackageStartupMessages(library('RANN'))
466
+ suppressPackageStartupMessages(library('doMC'))
467
+ registerDoMC()
468
+
469
+ acts = read.csv(ds_csv_file, check.names=F)
470
+ feats = read.csv(fds_csv_file, check.names=F)
471
+ ds = merge(acts, feats, by="SMILES") # duplicates features for duplicate SMILES :-)
472
+
473
+ features = ds[,(dim(acts)[2]+1):(dim(ds)[2])]
474
+ y = ds[,which(names(ds) == prediction_feature)]
475
+
476
+ # assumes a data matrix 'features' and a vector 'y' of target values
477
+ row.names(features)=NULL
478
+
479
+ pp = NULL
480
+ if (del_missing) {
481
+ # needed if rows should be removed
482
+ na_ids = apply(features,1,function(x)any(is.na(x)))
483
+ features = features[!na_ids,]
484
+ y = y[!na_ids]
485
+ pp = preProcess(features, method=c("scale", "center"))
486
+ } else {
487
+ # Use imputation if NA's random (only then!)
488
+ pp = preProcess(features, method=c("scale", "center", "knnImpute"))
489
+ }
490
+ features = predict(pp, features)
491
+
492
+ # determine subsets
493
+ subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
494
+ subsets = c(2,3,4,5,7,10,subsets)
495
+ subsets = unique(sort(round(subsets)))
496
+ subsets = subsets[subsets<=dim(features)[2]]
497
+ subsets = subsets[subsets>1]
498
+
499
+ # Recursive feature elimination
500
+ rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=50), sizes=subsets)
501
+
502
+ # read existing dataset and select most useful features
503
+ csv=feats[,c("SMILES", rfProfile$optVariables)]
504
+ write.csv(x=csv,file=f_fds_r, row.names=F, quote=F, na='')
505
+ EOR
506
+ r_result_file
507
+ end
508
+ end
509
+
695
510
  module Substructure
696
511
  include Algorithm
697
512
  # Substructure matching
698
- # @param [OpenTox::Compound] compound Compound
699
- # @param [Array] features Array with Smarts strings
513
+ # @param [Hash] required keys: compound, features
700
514
  # @return [Array] Array with matching Smarts
701
- def self.match(compound,features)
702
- compound.match(features)
515
+ def self.match(params)
516
+ params[:compound].match(params[:features])
703
517
  end
518
+
519
+ # Substructure matching with number of non-unique hits
520
+ # @param [Hash] required keys: compound, features
521
+ # @return [Hash] Hash with matching Smarts and number of hits
522
+ def self.match_hits(params)
523
+ params[:compound].match_hits(params[:features])
524
+ end
525
+
526
+ # Substructure matching with number of non-unique hits
527
+ # @param [Hash] required keys: compound, features, feature_dataset_uri, pc_type
528
+ # @return [Hash] Hash with matching Smarts and number of hits
529
+ def self.lookup(params)
530
+ params[:compound].lookup(params[:features], params[:feature_dataset_uri],params[:pc_type],params[:subjectid])
531
+ end
704
532
  end
705
533
 
706
534
  module Dataset
@@ -709,281 +537,5 @@ module OpenTox
709
537
  def features(dataset_uri,compound_uri)
710
538
  end
711
539
  end
712
-
713
- module Transform
714
- include Algorithm
715
-
716
- # The transformer that inverts values.
717
- # 1/x is used, after values have been moved >= 1.
718
- class Inverter
719
- attr_accessor :offset, :values
720
-
721
- # @params[Array] Values to transform.
722
- # @params[Float] Offset for restore.
723
- def initialize *args
724
- case args.size
725
- when 1
726
- begin
727
- values=args[0]
728
- raise "Cannot transform, values empty." if @values.size==0
729
- @values = values.collect { |v| -1.0 * v }
730
- @offset = 1.0 - @values.minmax[0]
731
- @offset = -1.0 * @offset if @offset>0.0
732
- @values.collect! { |v| v - @offset } # slide >1
733
- @values.collect! { |v| 1 / v } # invert to [0,1]
734
- rescue Exception => e
735
- LOGGER.debug "#{e.class}: #{e.message}"
736
- LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
737
- end
738
- when 2
739
- @offset = args[1].to_f
740
- @values = args[0].collect { |v| 1 / v }
741
- @values.collect! { |v| v + @offset }
742
- @values.collect! { |v| -1.0 * v }
743
- end
744
- end
745
- end
746
-
747
- # The transformer that takes logs.
748
- # Log10 is used, after values have been moved > 0.
749
- class Log10
750
- attr_accessor :offset, :values
751
-
752
- # @params[Array] Values to transform / restore.
753
- # @params[Float] Offset for restore.
754
- def initialize *args
755
- @distance_to_zero = 0.000000001 # 1 / 1 billion
756
- case args.size
757
- when 1
758
- begin
759
- values=args[0]
760
- raise "Cannot transform, values empty." if values.size==0
761
- @offset = values.minmax[0]
762
- @offset = -1.0 * @offset if @offset>0.0
763
- @values = values.collect { |v| v - @offset } # slide > anchor
764
- @values.collect! { |v| v + @distance_to_zero } #
765
- @values.collect! { |v| Math::log10 v } # log10 (can fail)
766
- rescue Exception => e
767
- LOGGER.debug "#{e.class}: #{e.message}"
768
- LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
769
- end
770
- when 2
771
- @offset = args[1].to_f
772
- @values = args[0].collect { |v| 10**v }
773
- @values.collect! { |v| v - @distance_to_zero }
774
- @values.collect! { |v| v + @offset }
775
- end
776
- end
777
- end
778
-
779
- # The transformer that does nothing (No OPeration).
780
- class NOP
781
- attr_accessor :offset, :values
782
-
783
- # @params[Array] Values to transform / restore.
784
- # @params[Float] Offset for restore.
785
- def initialize *args
786
- @offset = 0.0
787
- @distance_to_zero = 0.0
788
- case args.size
789
- when 1
790
- @values = args[0]
791
- when 2
792
- @values = args[0]
793
- end
794
- end
795
- end
796
-
797
-
798
- # Auto-Scaler for Arrays
799
- # Center on mean and divide by standard deviation
800
- class AutoScale
801
- attr_accessor :scaled_values, :mean, :stdev
802
-
803
- # @params[Array] Values to transform.
804
- def initialize values
805
- @scaled_values = values
806
- @mean = @scaled_values.to_scale.mean
807
- @stdev = @scaled_values.to_scale.standard_deviation_sample
808
- @scaled_values = @scaled_values.collect {|vi| vi - @mean }
809
- @scaled_values.collect! {|vi| vi / @stdev } unless @stdev == 0.0
810
- end
811
- end
812
-
813
- # Principal Components Analysis
814
- # Statsample Library (http://ruby-statsample.rubyforge.org/) by C. Bustos
815
- class PCA
816
- attr_accessor :data_matrix, :data_transformed_matrix, :eigenvector_matrix, :eigenvalue_sums, :autoscaler
817
-
818
- # Creates a transformed dataset as GSL::Matrix.
819
- # @param [GSL::Matrix] Data matrix.
820
- # @param [Float] Compression ratio from [0,1].
821
- # @return [GSL::Matrix] Data transformed matrix.
822
- def initialize data_matrix, compression=0.05
823
- begin
824
- @data_matrix = data_matrix
825
- @compression = compression.to_f
826
- @stdev = Array.new
827
- @mean = Array.new
828
-
829
- # Objective Feature Selection
830
- raise "Error! PCA needs at least two dimensions." if data_matrix.size2 < 2
831
- @data_matrix_selected = nil
832
- (0..@data_matrix.size2-1).each { |i|
833
- if !Algorithm::zero_variance?(@data_matrix.col(i).to_a)
834
- if @data_matrix_selected.nil?
835
- @data_matrix_selected = GSL::Matrix.alloc(@data_matrix.size1, 1)
836
- @data_matrix_selected.col(0)[0..@data_matrix.size1-1] = @data_matrix.col(i)
837
- else
838
- @data_matrix_selected = @data_matrix_selected.horzcat(GSL::Matrix.alloc(@data_matrix.col(i).to_a,@data_matrix.size1, 1))
839
- end
840
- end
841
- }
842
- raise "Error! PCA needs at least two dimensions." if (@data_matrix_selected.nil? || @data_matrix_selected.size2 < 2)
843
-
844
- # Scaling of Axes
845
- @data_matrix_scaled = GSL::Matrix.alloc(@data_matrix_selected.size1, @data_matrix_selected.size2)
846
- (0..@data_matrix_selected.size2-1).each { |i|
847
- @autoscaler = OpenTox::Algorithm::Transform::AutoScale.new(@data_matrix_selected.col(i))
848
- @data_matrix_scaled.col(i)[0..@data_matrix.size1-1] = @autoscaler.scaled_values
849
- @stdev << @autoscaler.stdev
850
- @mean << @autoscaler.mean
851
- }
852
-
853
- data_matrix_hash = Hash.new
854
- (0..@data_matrix_scaled.size2-1).each { |i|
855
- column_view = @data_matrix_scaled.col(i)
856
- data_matrix_hash[i] = column_view.to_scale
857
- }
858
- dataset_hash = data_matrix_hash.to_dataset # see http://goo.gl/7XcW9
859
- cor_matrix=Statsample::Bivariate.correlation_matrix(dataset_hash)
860
- pca=Statsample::Factor::PCA.new(cor_matrix)
861
- pca.eigenvalues.each { |ev| raise "PCA failed!" unless !ev.nan? }
862
- @eigenvalue_sums = Array.new
863
- (0..dataset_hash.fields.size-1).each { |i|
864
- @eigenvalue_sums << pca.eigenvalues[0..i].inject{ |sum, ev| sum + ev }
865
- }
866
- eigenvectors_selected = Array.new
867
- pca.eigenvectors.each_with_index { |ev, i|
868
- if (@eigenvalue_sums[i] <= ((1.0-@compression)*dataset_hash.fields.size)) || (eigenvectors_selected.size == 0)
869
- eigenvectors_selected << ev.to_a
870
- end
871
- }
872
- @eigenvector_matrix = GSL::Matrix.alloc(eigenvectors_selected.flatten, eigenvectors_selected.size, dataset_hash.fields.size).transpose
873
- dataset_matrix = dataset_hash.to_gsl.transpose
874
- @data_transformed_matrix = (@eigenvector_matrix.transpose * dataset_matrix).transpose
875
- rescue Exception => e
876
- LOGGER.debug "#{e.class}: #{e.message}"
877
- LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
878
- end
879
- end
880
-
881
- # Restores data in the original feature space (possibly with compression loss).
882
- # @return [GSL::Matrix] Data matrix.
883
- def restore
884
- begin
885
- data_matrix_restored = (@eigenvector_matrix * @data_transformed_matrix.transpose).transpose # reverse pca
886
- # reverse scaling
887
- (0..data_matrix_restored.size2-1).each { |i|
888
- data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] *= @stdev[i] unless @stdev[i] == 0.0
889
- data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] += @mean[i]
890
- }
891
- data_matrix_restored
892
- rescue Exception => e
893
- LOGGER.debug "#{e.class}: #{e.message}"
894
- LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
895
- end
896
- end
897
-
898
- end
899
-
900
- end
901
-
902
- # Gauss kernel
903
- # @return [Float]
904
- def self.gauss(x, sigma = 0.3)
905
- d = 1.0 - x.to_f
906
- Math.exp(-(d*d)/(2*sigma*sigma))
907
- end
908
-
909
- # For symbolic features
910
- # @param [Array] Array to test, must indicate non-occurrence with 0.
911
- # @return [Boolean] Whether the feature is singular or non-occurring or present everywhere.
912
- def self.isnull_or_singular?(array)
913
- nr_zeroes = array.count(0)
914
- return (nr_zeroes == array.size) || # remove non-occurring feature
915
- (nr_zeroes == array.size-1) || # remove singular feature
916
- (nr_zeroes == 0) # also remove feature present everywhere
917
- end
918
-
919
- # Numeric value test
920
- # @param[Object] value
921
- # @return [Boolean] Whether value is a number
922
- def self.numeric?(value)
923
- true if Float(value) rescue false
924
- end
925
-
926
- # For symbolic features
927
- # @param [Array] Array to test, must indicate non-occurrence with 0.
928
- # @return [Boolean] Whether the feature has variance zero.
929
- def self.zero_variance?(array)
930
- return (array.to_scale.variance_population == 0.0)
931
- end
932
-
933
- # Sum of an array for Arrays.
934
- # @param [Array] Array with values
935
- # @return [Integer] Sum of size of values
936
- def self.sum_size(array)
937
- sum=0
938
- array.each { |e| sum += e.size }
939
- return sum
940
- end
941
-
942
- # Minimum Frequency
943
- # @param [Integer] per-mil value
944
- # return [Integer] min-frequency
945
- def self.min_frequency(training_dataset,per_mil)
946
- minfreq = per_mil * training_dataset.compounds.size.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
947
- minfreq = 2 unless minfreq > 2
948
- Integer (minfreq)
949
- end
950
-
951
- # Effect calculation for classification
952
- # @param [Array] Array of occurrences per class in the form of Enumerables.
953
- # @param [Array] Array of database instance counts per class.
954
- def self.effect(occurrences, db_instances)
955
- max=0
956
- max_value=0
957
- nr_o = self.sum_size(occurrences)
958
- nr_db = db_instances.to_scale.sum
959
-
960
- occurrences.each_with_index { |o,i| # fminer outputs occurrences sorted reverse by activity.
961
- actual = o.size.to_f/nr_o
962
- expected = db_instances[i].to_f/nr_db
963
- if actual > expected
964
- if ((actual - expected) / actual) > max_value
965
- max_value = (actual - expected) / actual # 'Schleppzeiger'
966
- max = i
967
- end
968
- end
969
- }
970
- max
971
- end
972
-
973
- # Returns Support value of an fingerprint
974
- # @param [Hash] params Keys: `:compound_features_hits, :weights, :training_compound_features_hits, :features, :nr_hits:, :mode` are required
975
- # return [Numeric] Support value
976
- def self.p_sum_support(params)
977
- p_sum = 0.0
978
- params[:features].each{|f|
979
- compound_hits = params[:compound_features_hits][f]
980
- neighbor_hits = params[:training_compound_features_hits][f]
981
- p_sum += eval("(Algorithm.gauss(params[:weights][f]) * ([compound_hits, neighbor_hits].compact.#{params[:mode]}))")
982
- }
983
- p_sum
984
- end
985
-
986
540
  end
987
541
  end
988
-
989
-