opentox-ruby 3.0.1 → 3.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog +8 -0
- data/Rakefile +2 -3
- data/VERSION +1 -1
- data/lib/algorithm.rb +227 -675
- data/lib/authorization.rb +10 -8
- data/lib/compound.rb +47 -11
- data/lib/dataset.rb +50 -2
- data/lib/environment.rb +6 -1
- data/lib/model.rb +37 -72
- data/lib/opentox-ruby.rb +1 -1
- data/lib/parser.rb +115 -57
- data/lib/r-util.rb +354 -0
- data/lib/rest_client_wrapper.rb +1 -1
- data/lib/serializer.rb +47 -30
- data/lib/stratification.R +201 -0
- data/lib/task.rb +5 -1
- data/lib/transform.rb +520 -0
- data/lib/utils.rb +372 -0
- data/lib/validation.rb +52 -6
- metadata +413 -428
data/ChangeLog
CHANGED
@@ -1,3 +1,11 @@
|
|
1
|
+
v3.1.0 2012-02-24
|
2
|
+
* utils.rb: added for special routines (e.g. descriptor calculation)
|
3
|
+
* task.rb: Polling with increasing interval
|
4
|
+
* parser.rb: CSV up and download fixed
|
5
|
+
* transform.rb: routines to create machine learning data matrices
|
6
|
+
* algorithm.rb: SVM parameter grid search, cos similarity as algorithm,
|
7
|
+
gauss() removed
|
8
|
+
|
1
9
|
v3.0.1 2011-10-19
|
2
10
|
* feature: model registration to ontology service
|
3
11
|
* ontology lib gets endpoints from ontology service
|
data/Rakefile
CHANGED
@@ -16,7 +16,7 @@ begin
|
|
16
16
|
gem.add_dependency "sinatra-respond_to", "=0.7.0"
|
17
17
|
gem.add_dependency "sinatra-static-assets", "=0.5.0"
|
18
18
|
gem.add_dependency "rest-client", "=1.6.1"
|
19
|
-
gem.add_dependency "rack", "=1.3.
|
19
|
+
gem.add_dependency "rack", "=1.3.5"
|
20
20
|
gem.add_dependency "rack-contrib", "=1.1.0"
|
21
21
|
gem.add_dependency "rack-flash", "=0.1.1"
|
22
22
|
gem.add_dependency "nokogiri", "=1.4.4"
|
@@ -42,10 +42,9 @@ begin
|
|
42
42
|
gem.add_dependency "dm-migrations", "=1.1.0"
|
43
43
|
gem.add_dependency "dm-validations", "=1.1.0"
|
44
44
|
gem.add_dependency "dm-sqlite-adapter", "=1.1.0"
|
45
|
-
gem.add_dependency "ruby-plot", "=0.
|
45
|
+
gem.add_dependency "ruby-plot", "=0.6.0"
|
46
46
|
gem.add_dependency "gsl", "=1.14.7"
|
47
47
|
gem.add_dependency "statsample", "=1.1.0"
|
48
|
-
#gem.add_dependency "statsample-optimization", "=2.1.0"
|
49
48
|
|
50
49
|
gem.add_development_dependency 'jeweler'
|
51
50
|
gem.files = FileList["[A-Z]*", "{bin,generators,lib,test}/**/*", 'lib/jeweler/templates/.gitignore']
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
3.0.1
|
1
|
+
3.1.0
|
data/lib/algorithm.rb
CHANGED
@@ -5,6 +5,8 @@ R = nil
|
|
5
5
|
require "rinruby"
|
6
6
|
require "statsample"
|
7
7
|
require 'uri'
|
8
|
+
require 'transform.rb'
|
9
|
+
require 'utils.rb'
|
8
10
|
|
9
11
|
module OpenTox
|
10
12
|
|
@@ -13,7 +15,7 @@ module OpenTox
|
|
13
15
|
|
14
16
|
include OpenTox
|
15
17
|
|
16
|
-
# Execute algorithm with parameters,
|
18
|
+
# Execute algorithm with parameters, consult OpenTox API and webservice documentation for acceptable parameters
|
17
19
|
# @param [optional,Hash] params Algorithm parameters
|
18
20
|
# @param [optional,OpenTox::Task] waiting_task (can be a OpenTox::Subtask as well), progress is updated accordingly
|
19
21
|
# @return [String] URI of new resource (dataset, model, ...)
|
@@ -21,7 +23,7 @@ module OpenTox
|
|
21
23
|
LOGGER.info "Running algorithm '"+@uri.to_s+"' with params: "+params.inspect
|
22
24
|
RestClientWrapper.post(@uri, params, {:accept => 'text/uri-list'}, waiting_task).to_s
|
23
25
|
end
|
24
|
-
|
26
|
+
|
25
27
|
# Get OWL-DL representation in RDF/XML format
|
26
28
|
# @return [application/rdf+xml] RDF/XML representation
|
27
29
|
def to_rdfxml
|
@@ -33,7 +35,7 @@ module OpenTox
|
|
33
35
|
# Generic Algorithm class, should work with all OpenTox webservices
|
34
36
|
class Generic
|
35
37
|
include Algorithm
|
36
|
-
|
38
|
+
|
37
39
|
# Find Generic Opentox Algorithm via URI, and loads metadata, could raise NotFound/NotAuthorized error
|
38
40
|
# @param [String] uri Algorithm URI
|
39
41
|
# @return [OpenTox::Algorithm::Generic] Algorithm instance
|
@@ -44,14 +46,14 @@ module OpenTox
|
|
44
46
|
raise "cannot load algorithm metadata" if alg.metadata==nil or alg.metadata.size==0
|
45
47
|
alg
|
46
48
|
end
|
47
|
-
|
49
|
+
|
48
50
|
end
|
49
51
|
|
50
52
|
# Fminer algorithms (https://github.com/amaunz/fminer2)
|
51
53
|
class Fminer
|
52
54
|
include Algorithm
|
53
55
|
attr_accessor :prediction_feature, :training_dataset, :minfreq, :compounds, :db_class_sizes, :all_activities, :smi
|
54
|
-
|
56
|
+
|
55
57
|
def check_params(params,per_mil,subjectid=nil)
|
56
58
|
raise OpenTox::NotFoundError.new "Please submit a dataset_uri." unless params[:dataset_uri] and !params[:dataset_uri].nil?
|
57
59
|
raise OpenTox::NotFoundError.new "Please submit a prediction_feature." unless params[:prediction_feature] and !params[:prediction_feature].nil?
|
@@ -81,7 +83,7 @@ module OpenTox
|
|
81
83
|
LOGGER.warn "Cannot find smiles for #{compound.to_s}."
|
82
84
|
next
|
83
85
|
end
|
84
|
-
|
86
|
+
|
85
87
|
value_map=params[:value_map] unless params[:value_map].nil?
|
86
88
|
entry.each do |feature,values|
|
87
89
|
if feature == @prediction_feature.uri
|
@@ -90,7 +92,7 @@ module OpenTox
|
|
90
92
|
LOGGER.warn "No #{feature} activity for #{compound.to_s}."
|
91
93
|
else
|
92
94
|
if @prediction_feature.feature_type == "classification"
|
93
|
-
activity= value_map.invert[value].to_i # activities are mapped to 1..n
|
95
|
+
activity= value_map.invert[value.to_s].to_i # activities are mapped to 1..n
|
94
96
|
@db_class_sizes[activity-1].nil? ? @db_class_sizes[activity-1]=1 : @db_class_sizes[activity-1]+=1 # AM effect
|
95
97
|
elsif @prediction_feature.feature_type == "regression"
|
96
98
|
activity= value.to_f
|
@@ -115,23 +117,23 @@ module OpenTox
|
|
115
117
|
|
116
118
|
end
|
117
119
|
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
end
|
120
|
+
# Backbone Refinement Class mining (http://bbrc.maunz.de/)
|
121
|
+
class BBRC < Fminer
|
122
|
+
# Initialize bbrc algorithm
|
123
|
+
def initialize(subjectid=nil)
|
124
|
+
super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/bbrc")
|
125
|
+
load_metadata(subjectid)
|
125
126
|
end
|
127
|
+
end
|
126
128
|
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
end
|
129
|
+
# LAtent STructure Pattern Mining (http://last-pm.maunz.de)
|
130
|
+
class LAST < Fminer
|
131
|
+
# Initialize last algorithm
|
132
|
+
def initialize(subjectid=nil)
|
133
|
+
super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/last")
|
134
|
+
load_metadata(subjectid)
|
134
135
|
end
|
136
|
+
end
|
135
137
|
|
136
138
|
|
137
139
|
# Create lazar prediction model
|
@@ -144,72 +146,6 @@ module OpenTox
|
|
144
146
|
end
|
145
147
|
end
|
146
148
|
|
147
|
-
# Utility methods without dedicated webservices
|
148
|
-
|
149
|
-
# Similarity calculations
|
150
|
-
module Similarity
|
151
|
-
include Algorithm
|
152
|
-
|
153
|
-
# Tanimoto similarity
|
154
|
-
# @param [Array] features_a Features of first compound
|
155
|
-
# @param [Array] features_b Features of second compound
|
156
|
-
# @param [optional, Hash] weights Weights for all features
|
157
|
-
# @param [optional, Hash] params Keys: `:training_compound, :compound, :training_compound_features_hits, :nr_hits, :compound_features_hits` are required
|
158
|
-
# @return [Float] (Weighted) tanimoto similarity
|
159
|
-
def self.tanimoto(features_a,features_b,weights=nil,params=nil)
|
160
|
-
common_features = features_a & features_b
|
161
|
-
all_features = (features_a + features_b).uniq
|
162
|
-
#LOGGER.debug "dv --------------- common: #{common_features}, all: #{all_features}"
|
163
|
-
if common_features.size > 0
|
164
|
-
if weights
|
165
|
-
#LOGGER.debug "nr_hits: #{params[:nr_hits]}"
|
166
|
-
if !params.nil? && params[:nr_hits]
|
167
|
-
params[:weights] = weights
|
168
|
-
params[:mode] = "min"
|
169
|
-
params[:features] = common_features
|
170
|
-
common_p_sum = Algorithm.p_sum_support(params)
|
171
|
-
params[:mode] = "max"
|
172
|
-
params[:features] = all_features
|
173
|
-
all_p_sum = Algorithm.p_sum_support(params)
|
174
|
-
else
|
175
|
-
common_p_sum = 0.0
|
176
|
-
common_features.each{|f| common_p_sum += Algorithm.gauss(weights[f])}
|
177
|
-
all_p_sum = 0.0
|
178
|
-
all_features.each{|f| all_p_sum += Algorithm.gauss(weights[f])}
|
179
|
-
end
|
180
|
-
#LOGGER.debug "common_p_sum: #{common_p_sum}, all_p_sum: #{all_p_sum}, c/a: #{common_p_sum/all_p_sum}"
|
181
|
-
common_p_sum/all_p_sum
|
182
|
-
else
|
183
|
-
#LOGGER.debug "common_features : #{common_features}, all_features: #{all_features}, c/a: #{(common_features.size/all_features.size).to_f}"
|
184
|
-
common_features.size.to_f/all_features.size.to_f
|
185
|
-
end
|
186
|
-
else
|
187
|
-
0.0
|
188
|
-
end
|
189
|
-
end
|
190
|
-
|
191
|
-
# Euclidean similarity
|
192
|
-
# @param [Hash] properties_a Properties of first compound
|
193
|
-
# @param [Hash] properties_b Properties of second compound
|
194
|
-
# @param [optional, Hash] weights Weights for all properties
|
195
|
-
# @return [Float] (Weighted) euclidean similarity
|
196
|
-
def self.euclidean(properties_a,properties_b,weights=nil)
|
197
|
-
common_properties = properties_a.keys & properties_b.keys
|
198
|
-
if common_properties.size > 1
|
199
|
-
dist_sum = 0
|
200
|
-
common_properties.each do |p|
|
201
|
-
if weights
|
202
|
-
dist_sum += ( (properties_a[p] - properties_b[p]) * Algorithm.gauss(weights[p]) )**2
|
203
|
-
else
|
204
|
-
dist_sum += (properties_a[p] - properties_b[p])**2
|
205
|
-
end
|
206
|
-
end
|
207
|
-
1/(1+Math.sqrt(dist_sum))
|
208
|
-
else
|
209
|
-
0.0
|
210
|
-
end
|
211
|
-
end
|
212
|
-
end
|
213
149
|
|
214
150
|
# Structural Graph Clustering by TU Munich
|
215
151
|
# Finds clusters similar to a query structure in a given training dataset
|
@@ -226,7 +162,7 @@ module OpenTox
|
|
226
162
|
raise "Invalid URI."
|
227
163
|
end
|
228
164
|
@training_dataset_uri = training_dataset_uri
|
229
|
-
if !
|
165
|
+
if !self.numeric? training_threshold || training_threshold <0 || training_threshold >1
|
230
166
|
raise "Training threshold out of bounds."
|
231
167
|
end
|
232
168
|
@training_threshold = training_threshold.to_f
|
@@ -259,7 +195,7 @@ module OpenTox
|
|
259
195
|
# @params[Float] Similarity threshold for query to clusters (optional)
|
260
196
|
def get_clusters query_compound_uri, query_threshold = 0.5
|
261
197
|
|
262
|
-
if !
|
198
|
+
if !self.numeric? query_threshold || query_threshold <0 || query_threshold >1
|
263
199
|
raise "Query threshold out of bounds."
|
264
200
|
end
|
265
201
|
@query_threshold = query_threshold.to_f
|
@@ -285,7 +221,7 @@ module OpenTox
|
|
285
221
|
metadata[DC.title][pattern]=""
|
286
222
|
feature_clusterid_map[feature_uri] = metadata[DC.title].to_i
|
287
223
|
}
|
288
|
-
|
224
|
+
|
289
225
|
# Integrity check
|
290
226
|
unless cluster_query_dataset.compounds.size == 1
|
291
227
|
raise "Number of predicted compounds is != 1."
|
@@ -295,11 +231,11 @@ module OpenTox
|
|
295
231
|
query_compound_uri = cluster_query_dataset.compounds[0]
|
296
232
|
@target_clusters_array = Array.new
|
297
233
|
cluster_query_dataset.features.keys.each { |cluster_membership_feature|
|
298
|
-
|
234
|
+
|
299
235
|
# Getting dataset URI for cluster
|
300
236
|
target_cluster = feature_clusterid_map[cluster_membership_feature]
|
301
237
|
dataset = @clusterid_dataset_map[target_cluster]
|
302
|
-
|
238
|
+
|
303
239
|
# Finally look up presence
|
304
240
|
data_entry = cluster_query_dataset.data_entries[query_compound_uri]
|
305
241
|
present = data_entry[cluster_membership_feature][0]
|
@@ -311,85 +247,13 @@ module OpenTox
|
|
311
247
|
|
312
248
|
end
|
313
249
|
|
314
|
-
module Neighbors
|
315
|
-
|
316
|
-
# Local multi-linear regression (MLR) prediction from neighbors.
|
317
|
-
# Uses propositionalized setting.
|
318
|
-
# @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
|
319
|
-
# @return [Numeric] A prediction value.
|
320
|
-
def self.local_mlr_prop(params)
|
321
|
-
|
322
|
-
confidence=0.0
|
323
|
-
prediction=nil
|
324
|
-
|
325
|
-
if params[:neighbors].size>0
|
326
|
-
props = params[:prop_kernel] ? get_props(params) : nil
|
327
|
-
acts = params[:neighbors].collect { |n| act = n[:activity].to_f }
|
328
|
-
sims = params[:neighbors].collect { |n| Algorithm.gauss(n[:similarity]) }
|
329
|
-
LOGGER.debug "Local MLR (Propositionalization / GSL)."
|
330
|
-
prediction = mlr( {:n_prop => props[0], :q_prop => props[1], :sims => sims, :acts => acts} )
|
331
|
-
transformer = eval("OpenTox::Algorithm::Transform::#{params[:transform]["class"]}.new ([#{prediction}], #{params[:transform]["offset"]})")
|
332
|
-
prediction = transformer.values[0]
|
333
|
-
prediction = nil if prediction.infinite? || params[:prediction_min_max][1] < prediction || params[:prediction_min_max][0] > prediction
|
334
|
-
LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
|
335
|
-
params[:conf_stdev] = false if params[:conf_stdev].nil?
|
336
|
-
confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]})
|
337
|
-
confidence = nil if prediction.nil?
|
338
|
-
end
|
339
|
-
{:prediction => prediction, :confidence => confidence}
|
340
|
-
|
341
|
-
end
|
342
|
-
|
343
|
-
# Multi-linear regression weighted by similarity.
|
344
|
-
# Objective Feature Selection, Principal Components Analysis, Scaling of Axes.
|
345
|
-
# @param [Hash] params Keys `:n_prop, :q_prop, :sims, :acts` are required
|
346
|
-
# @return [Numeric] A prediction value.
|
347
|
-
def self.mlr(params)
|
348
|
-
|
349
|
-
# GSL matrix operations:
|
350
|
-
# to_a : row-wise conversion to nested array
|
351
|
-
#
|
352
|
-
# Statsample operations (build on GSL):
|
353
|
-
# to_scale: convert into Statsample format
|
354
|
-
|
355
|
-
begin
|
356
|
-
n_prop = params[:n_prop].collect { |v| v }
|
357
|
-
q_prop = params[:q_prop].collect { |v| v }
|
358
|
-
n_prop << q_prop # attach q_prop
|
359
|
-
nr_cases, nr_features = get_sizes n_prop
|
360
|
-
data_matrix = GSL::Matrix.alloc(n_prop.flatten, nr_cases, nr_features)
|
361
|
-
|
362
|
-
# Principal Components Analysis
|
363
|
-
LOGGER.debug "PCA..."
|
364
|
-
pca = OpenTox::Algorithm::Transform::PCA.new(data_matrix)
|
365
|
-
data_matrix = pca.data_transformed_matrix
|
366
|
-
|
367
|
-
# Attach intercept column to data
|
368
|
-
intercept = GSL::Matrix.alloc(Array.new(nr_cases,1.0),nr_cases,1)
|
369
|
-
data_matrix = data_matrix.horzcat(intercept)
|
370
|
-
(0..data_matrix.size2-2).each { |i|
|
371
|
-
autoscaler = OpenTox::Algorithm::Transform::AutoScale.new(data_matrix.col(i))
|
372
|
-
data_matrix.col(i)[0..data_matrix.size1-1] = autoscaler.scaled_values
|
373
|
-
}
|
374
250
|
|
375
|
-
# Detach query instance
|
376
|
-
n_prop = data_matrix.to_a
|
377
|
-
q_prop = n_prop.pop
|
378
|
-
nr_cases, nr_features = get_sizes n_prop
|
379
|
-
data_matrix = GSL::Matrix.alloc(n_prop.flatten, nr_cases, nr_features)
|
380
251
|
|
381
|
-
|
382
|
-
LOGGER.debug "Creating MLR model ..."
|
383
|
-
c, cov, chisq, status = GSL::MultiFit::wlinear(data_matrix, params[:sims].to_scale.to_gsl, params[:acts].to_scale.to_gsl)
|
384
|
-
GSL::MultiFit::linear_est(q_prop.to_scale.to_gsl, c, cov)[0]
|
385
|
-
rescue Exception => e
|
386
|
-
LOGGER.debug "#{e.class}: #{e.message}"
|
387
|
-
end
|
252
|
+
module Neighbors
|
388
253
|
|
389
|
-
end
|
390
254
|
|
391
255
|
# Classification with majority vote from neighbors weighted by similarity
|
392
|
-
# @param [Hash] params Keys `:
|
256
|
+
# @param [Hash] params Keys `:acts, :sims, :value_map` are required
|
393
257
|
# @return [Numeric] A prediction value.
|
394
258
|
def self.weighted_majority_vote(params)
|
395
259
|
|
@@ -398,12 +262,13 @@ module OpenTox
|
|
398
262
|
confidence = 0.0
|
399
263
|
prediction = nil
|
400
264
|
|
401
|
-
|
402
|
-
neighbor_weight = Algorithm.gauss(neighbor[:similarity]).to_f
|
403
|
-
neighbor_contribution += neighbor[:activity].to_f * neighbor_weight
|
265
|
+
LOGGER.debug "Weighted Majority Vote Classification."
|
404
266
|
|
267
|
+
params[:acts].each_index do |idx|
|
268
|
+
neighbor_weight = params[:sims][1][idx]
|
269
|
+
neighbor_contribution += params[:acts][idx] * neighbor_weight
|
405
270
|
if params[:value_map].size == 2 # AM: provide compat to binary classification: 1=>false 2=>true
|
406
|
-
case
|
271
|
+
case params[:acts][idx]
|
407
272
|
when 1
|
408
273
|
confidence_sum -= neighbor_weight
|
409
274
|
when 2
|
@@ -413,294 +278,257 @@ module OpenTox
|
|
413
278
|
confidence_sum += neighbor_weight
|
414
279
|
end
|
415
280
|
end
|
416
|
-
|
417
281
|
if params[:value_map].size == 2
|
418
282
|
if confidence_sum >= 0.0
|
419
|
-
prediction = 2 unless params[:
|
283
|
+
prediction = 2 unless params[:acts].size==0
|
420
284
|
elsif confidence_sum < 0.0
|
421
|
-
prediction = 1 unless params[:
|
285
|
+
prediction = 1 unless params[:acts].size==0
|
422
286
|
end
|
423
287
|
else
|
424
|
-
prediction = (neighbor_contribution/confidence_sum).round unless params[:
|
288
|
+
prediction = (neighbor_contribution/confidence_sum).round unless params[:acts].size==0 # AM: new multinomial prediction
|
425
289
|
end
|
290
|
+
|
426
291
|
LOGGER.debug "Prediction is: '" + prediction.to_s + "'." unless prediction.nil?
|
427
|
-
confidence = confidence_sum/params[:
|
292
|
+
confidence = (confidence_sum/params[:acts].size).abs if params[:acts].size > 0
|
428
293
|
LOGGER.debug "Confidence is: '" + confidence.to_s + "'." unless prediction.nil?
|
429
294
|
return {:prediction => prediction, :confidence => confidence.abs}
|
430
295
|
end
|
431
296
|
|
297
|
+
|
298
|
+
|
432
299
|
# Local support vector regression from neighbors
|
433
|
-
# @param [Hash] params Keys `:
|
300
|
+
# @param [Hash] params Keys `:props, :acts, :sims, :min_train_performance` are required
|
434
301
|
# @return [Numeric] A prediction value.
|
435
302
|
def self.local_svm_regression(params)
|
436
303
|
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
304
|
+
begin
|
305
|
+
confidence = 0.0
|
306
|
+
prediction = nil
|
307
|
+
|
308
|
+
LOGGER.debug "Local SVM."
|
309
|
+
if params[:acts].size>0
|
310
|
+
if params[:props]
|
311
|
+
n_prop = params[:props][0].collect
|
312
|
+
q_prop = params[:props][1].collect
|
313
|
+
props = [ n_prop, q_prop ]
|
314
|
+
end
|
315
|
+
acts = params[:acts].collect
|
316
|
+
prediction = local_svm_prop( props, acts, params[:min_train_performance]) # params[:props].nil? signals non-prop setting
|
317
|
+
prediction = nil if (!prediction.nil? && prediction.infinite?)
|
318
|
+
LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
|
319
|
+
confidence = get_confidence({:sims => params[:sims][1], :acts => params[:acts]})
|
320
|
+
confidence = 0.0 if prediction.nil?
|
321
|
+
end
|
322
|
+
{:prediction => prediction, :confidence => confidence}
|
323
|
+
rescue Exception => e
|
324
|
+
LOGGER.debug "#{e.class}: #{e.message}"
|
325
|
+
LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
|
451
326
|
end
|
452
|
-
|
453
|
-
|
327
|
+
|
454
328
|
end
|
455
329
|
|
456
|
-
|
457
|
-
#
|
330
|
+
|
331
|
+
# Local support vector regression from neighbors
|
332
|
+
# @param [Hash] params Keys `:props, :acts, :sims, :min_train_performance` are required
|
458
333
|
# @return [Numeric] A prediction value.
|
459
334
|
def self.local_svm_classification(params)
|
460
335
|
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
336
|
+
begin
|
337
|
+
confidence = 0.0
|
338
|
+
prediction = nil
|
339
|
+
|
340
|
+
LOGGER.debug "Local SVM."
|
341
|
+
if params[:acts].size>0
|
342
|
+
if params[:props]
|
343
|
+
n_prop = params[:props][0].collect
|
344
|
+
q_prop = params[:props][1].collect
|
345
|
+
props = [ n_prop, q_prop ]
|
346
|
+
end
|
347
|
+
acts = params[:acts].collect
|
348
|
+
acts = acts.collect{|v| "Val" + v.to_s} # Convert to string for R to recognize classification
|
349
|
+
prediction = local_svm_prop( props, acts, params[:min_train_performance]) # params[:props].nil? signals non-prop setting
|
350
|
+
prediction = prediction.sub(/Val/,"") if prediction # Convert back to Float
|
351
|
+
confidence = 0.0 if prediction.nil?
|
352
|
+
LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
|
353
|
+
confidence = get_confidence({:sims => params[:sims][1], :acts => params[:acts]})
|
354
|
+
end
|
355
|
+
{:prediction => prediction, :confidence => confidence}
|
356
|
+
rescue Exception => e
|
357
|
+
LOGGER.debug "#{e.class}: #{e.message}"
|
358
|
+
LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
|
471
359
|
end
|
472
|
-
|
473
|
-
|
360
|
+
|
474
361
|
end
|
475
362
|
|
476
363
|
|
364
|
+
|
477
365
|
# Local support vector prediction from neighbors.
|
478
|
-
# Uses
|
366
|
+
# Uses propositionalized setting.
|
479
367
|
# Not to be called directly (use local_svm_regression or local_svm_classification).
|
368
|
+
# @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ]
|
480
369
|
# @param [Array] acts, activities for neighbors.
|
481
|
-
# @param [
|
482
|
-
# @param [String] type, one of "nu-svr" (regression) or "C-bsvc" (classification).
|
483
|
-
# @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
|
370
|
+
# @param [Float] min_train_performance, parameter to control censoring
|
484
371
|
# @return [Numeric] A prediction value.
|
485
|
-
def self.
|
486
|
-
|
487
|
-
|
488
|
-
|
372
|
+
def self.local_svm_prop(props, acts, min_train_performance)
|
373
|
+
|
374
|
+
LOGGER.debug "Local SVM (Propositionalization / Kernlab Kernel)."
|
375
|
+
n_prop = props[0] # is a matrix, i.e. two nested Arrays.
|
376
|
+
q_prop = props[1] # is an Array.
|
489
377
|
|
490
378
|
prediction = nil
|
491
379
|
if Algorithm::zero_variance? acts
|
492
380
|
prediction = acts[0]
|
493
381
|
else
|
494
|
-
# gram matrix
|
495
|
-
(0..(neighbor_matches.length-1)).each do |i|
|
496
|
-
neighbor_i_hits = params[:fingerprints][params[:neighbors][i][:compound]]
|
497
|
-
gram_matrix[i] = [] unless gram_matrix[i]
|
498
|
-
# upper triangle
|
499
|
-
((i+1)..(neighbor_matches.length-1)).each do |j|
|
500
|
-
neighbor_j_hits= params[:fingerprints][params[:neighbors][j][:compound]]
|
501
|
-
sim_params = {}
|
502
|
-
if params[:nr_hits]
|
503
|
-
sim_params[:nr_hits] = true
|
504
|
-
sim_params[:compound_features_hits] = neighbor_i_hits
|
505
|
-
sim_params[:training_compound_features_hits] = neighbor_j_hits
|
506
|
-
end
|
507
|
-
sim = eval("#{params[:similarity_algorithm]}(neighbor_matches[i], neighbor_matches[j], params[:p_values], sim_params)")
|
508
|
-
gram_matrix[i][j] = Algorithm.gauss(sim)
|
509
|
-
gram_matrix[j] = [] unless gram_matrix[j]
|
510
|
-
gram_matrix[j][i] = gram_matrix[i][j] # lower triangle
|
511
|
-
end
|
512
|
-
gram_matrix[i][i] = 1.0
|
513
|
-
end
|
514
|
-
|
515
|
-
|
516
382
|
#LOGGER.debug gram_matrix.to_yaml
|
517
383
|
@r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests
|
518
|
-
@r.eval "
|
519
|
-
|
520
|
-
#
|
521
|
-
@r.
|
522
|
-
@r.n = neighbor_matches.size
|
523
|
-
@r.y = acts
|
524
|
-
@r.sims = sims
|
525
|
-
|
384
|
+
@r.eval "set.seed(1)"
|
385
|
+
@r.eval "suppressPackageStartupMessages(library('caret'))" # requires R packages "caret" and "kernlab"
|
386
|
+
@r.eval "suppressPackageStartupMessages(library('doMC'))" # requires R packages "multicore"
|
387
|
+
@r.eval "registerDoMC()" # switch on parallel processing
|
526
388
|
begin
|
527
|
-
LOGGER.debug "Preparing R data ..."
|
528
|
-
# prepare data
|
529
|
-
@r.eval "y<-as.vector(y)"
|
530
|
-
@r.eval "gram_matrix<-as.kernelMatrix(matrix(gram_matrix,n,n))"
|
531
|
-
@r.eval "sims<-as.vector(sims)"
|
532
|
-
|
533
|
-
# model + support vectors
|
534
|
-
LOGGER.debug "Creating SVM model ..."
|
535
|
-
@r.eval "model<-ksvm(gram_matrix, y, kernel=matrix, type=\"#{type}\", nu=0.5)"
|
536
|
-
@r.eval "sv<-as.vector(SVindex(model))"
|
537
|
-
@r.eval "sims<-sims[sv]"
|
538
|
-
@r.eval "sims<-as.kernelMatrix(matrix(sims,1))"
|
539
|
-
LOGGER.debug "Predicting ..."
|
540
|
-
if type == "nu-svr"
|
541
|
-
@r.eval "p<-predict(model,sims)[1,1]"
|
542
|
-
elsif type == "C-bsvc"
|
543
|
-
@r.eval "p<-predict(model,sims)"
|
544
|
-
end
|
545
|
-
if type == "nu-svr"
|
546
|
-
prediction = @r.p
|
547
|
-
elsif type == "C-bsvc"
|
548
|
-
#prediction = (@r.p.to_f == 1.0 ? true : false)
|
549
|
-
prediction = @r.p
|
550
|
-
end
|
551
|
-
@r.quit # free R
|
552
|
-
rescue Exception => e
|
553
|
-
LOGGER.debug "#{e.class}: #{e.message}"
|
554
|
-
LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
|
555
|
-
end
|
556
|
-
|
557
|
-
end
|
558
|
-
prediction
|
559
|
-
end
|
560
|
-
|
561
|
-
# Local support vector prediction from neighbors.
|
562
|
-
# Uses propositionalized setting.
|
563
|
-
# Not to be called directly (use local_svm_regression or local_svm_classification).
|
564
|
-
# @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ]
|
565
|
-
# @param [Array] acts, activities for neighbors.
|
566
|
-
# @param [String] type, one of "nu-svr" (regression) or "C-bsvc" (classification).
|
567
|
-
# @return [Numeric] A prediction value.
|
568
|
-
def self.local_svm_prop(props, acts, type)
|
569
|
-
|
570
|
-
LOGGER.debug "Local SVM (Propositionalization / Kernlab Kernel)."
|
571
|
-
n_prop = props[0] # is a matrix, i.e. two nested Arrays.
|
572
|
-
q_prop = props[1] # is an Array.
|
573
389
|
|
574
|
-
prediction = nil
|
575
|
-
if Algorithm::zero_variance? acts
|
576
|
-
prediction = acts[0]
|
577
|
-
else
|
578
|
-
#LOGGER.debug gram_matrix.to_yaml
|
579
|
-
@r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests
|
580
|
-
@r.eval "library('kernlab')" # this requires R package "kernlab" to be installed
|
581
|
-
LOGGER.debug "Setting R data ..."
|
582
390
|
# set data
|
391
|
+
LOGGER.debug "Setting R data ..."
|
583
392
|
@r.n_prop = n_prop.flatten
|
584
393
|
@r.n_prop_x_size = n_prop.size
|
585
394
|
@r.n_prop_y_size = n_prop[0].size
|
586
395
|
@r.y = acts
|
587
396
|
@r.q_prop = q_prop
|
397
|
+
#@r.eval "y = matrix(y)"
|
398
|
+
@r.eval "prop_matrix = matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=T)"
|
399
|
+
@r.eval "q_prop = matrix(q_prop, 1, n_prop_y_size, byrow=T)"
|
588
400
|
|
589
|
-
|
590
|
-
|
591
|
-
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
if
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
if type == "nu-svr"
|
606
|
-
prediction = @r.p
|
607
|
-
elsif type == "C-bsvc"
|
608
|
-
#prediction = (@r.p.to_f == 1.0 ? true : false)
|
609
|
-
prediction = @r.p
|
610
|
-
end
|
611
|
-
@r.quit # free R
|
612
|
-
rescue Exception => e
|
613
|
-
LOGGER.debug "#{e.class}: #{e.message}"
|
614
|
-
LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
|
615
|
-
end
|
616
|
-
end
|
617
|
-
prediction
|
618
|
-
end
|
401
|
+
# prepare data
|
402
|
+
LOGGER.debug "Preparing R data ..."
|
403
|
+
@r.eval "if (class(y) == 'character') { y = factor(y); suppressPackageStartupMessages(library('class')) }" # For classification
|
404
|
+
|
405
|
+
@r.eval <<-EOR
|
406
|
+
rem = nearZeroVar(prop_matrix)
|
407
|
+
if (length(rem) > 0) {
|
408
|
+
prop_matrix = prop_matrix[,-rem,drop=F]
|
409
|
+
q_prop = q_prop[,-rem,drop=F]
|
410
|
+
}
|
411
|
+
rem = findCorrelation(cor(prop_matrix))
|
412
|
+
if (length(rem) > 0) {
|
413
|
+
prop_matrix = prop_matrix[,-rem,drop=F]
|
414
|
+
q_prop = q_prop[,-rem,drop=F]
|
415
|
+
}
|
416
|
+
EOR
|
619
417
|
|
620
|
-
|
621
|
-
|
622
|
-
|
623
|
-
|
624
|
-
|
625
|
-
|
626
|
-
if sim_median.nil?
|
627
|
-
confidence = nil
|
628
|
-
else
|
629
|
-
standard_deviation = params[:acts].to_scale.standard_deviation_sample
|
630
|
-
confidence = (sim_median*Math.exp(-1*standard_deviation)).abs
|
631
|
-
if confidence.nan?
|
632
|
-
confidence = nil
|
633
|
-
end
|
634
|
-
end
|
635
|
-
else
|
636
|
-
conf = params[:sims].inject{|sum,x| sum + x }
|
637
|
-
confidence = conf/params[:neighbors].size
|
638
|
-
end
|
639
|
-
LOGGER.debug "Confidence is: '" + confidence.to_s + "'."
|
640
|
-
return confidence
|
641
|
-
end
|
418
|
+
# model + support vectors
|
419
|
+
LOGGER.debug "Creating R SVM model ..."
|
420
|
+
@r.eval <<-EOR
|
421
|
+
model = train(prop_matrix,y,method="svmradial",tuneLength=8,trControl=trainControl(method="LGOCV",number=10),preProcess=c("center", "scale"))
|
422
|
+
perf = ifelse ( class(y)!='numeric', max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
|
423
|
+
EOR
|
642
424
|
|
643
|
-
# Get X and Y size of a nested Array (Matrix)
|
644
|
-
def self.get_sizes(matrix)
|
645
|
-
begin
|
646
|
-
nr_cases = matrix.size
|
647
|
-
nr_features = matrix[0].size
|
648
|
-
rescue Exception => e
|
649
|
-
LOGGER.debug "#{e.class}: #{e.message}"
|
650
|
-
LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
|
651
|
-
end
|
652
|
-
#puts "NRC: #{nr_cases}, NRF: #{nr_features}"
|
653
|
-
[ nr_cases, nr_features ]
|
654
|
-
end
|
655
425
|
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
667
|
-
|
668
|
-
row = []
|
669
|
-
params[:features].each do |f|
|
670
|
-
if ! params[:fingerprints][n].nil?
|
671
|
-
row << (params[:fingerprints][n].include?(f) ? (params[:p_values][f] * params[:fingerprints][n][f]) : 0.0)
|
672
|
-
else
|
673
|
-
row << 0.0
|
674
|
-
end
|
675
|
-
end
|
676
|
-
matrix << row
|
677
|
-
end
|
678
|
-
row = []
|
679
|
-
params[:features].each do |f|
|
680
|
-
if params[:nr_hits]
|
681
|
-
compound_feature_hits = params[:compound].match_hits([f])
|
682
|
-
row << (compound_feature_hits.size == 0 ? 0.0 : (params[:p_values][f] * compound_feature_hits[f]))
|
683
|
-
else
|
684
|
-
row << (params[:compound].match([f]).size == 0 ? 0.0 : params[:p_values][f])
|
685
|
-
end
|
426
|
+
# prediction
|
427
|
+
LOGGER.debug "Predicting ..."
|
428
|
+
@r.eval "p = predict(model,q_prop)"
|
429
|
+
@r.eval "if (class(y)!='numeric') p = as.character(p)"
|
430
|
+
prediction = @r.p
|
431
|
+
|
432
|
+
# censoring
|
433
|
+
prediction = nil if ( @r.perf.nan? || @r.perf < min_train_performance )
|
434
|
+
LOGGER.debug "Performance: #{sprintf("%.2f", @r.perf)}"
|
435
|
+
rescue Exception => e
|
436
|
+
LOGGER.debug "#{e.class}: #{e.message}"
|
437
|
+
LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
|
686
438
|
end
|
687
|
-
|
688
|
-
LOGGER.debug "get_props failed with '" + $! + "'"
|
439
|
+
@r.quit # free R
|
689
440
|
end
|
690
|
-
|
441
|
+
prediction
|
691
442
|
end
|
692
443
|
|
693
444
|
end
|
694
445
|
|
446
|
+
module FeatureSelection
  include Algorithm

  # Recursive Feature Elimination using the R package 'caret'.
  #
  # Starts an R interpreter via RinRuby, merges the activity CSV with the
  # feature CSV on the "SMILES" column, scales/centers the features (dropping
  # rows with missing values or imputing them, depending on :del_missing),
  # runs caret's rfe() with random-forest functions and writes the selected
  # feature columns to a new CSV file.
  #
  # @param [Hash] params required keys: ds_csv_file, prediction_feature, fds_csv_file
  #   (dataset CSV file, prediction feature column name, and feature dataset CSV file),
  #   optional: del_missing (true deletes rows with missing values; anything else
  #   selects kNN imputation).
  # @return [String] feature dataset CSV file composed of selected features
  #   (the value of :fds_csv_file with the first "rfe_" replaced by "rfe_R_").
  def self.rfe(params)
    # Derive the output path first, so a bad :fds_csv_file fails before R is started.
    r_result_file = params[:fds_csv_file].sub("rfe_", "rfe_R_")

    # Use a local interpreter handle instead of a module-level @r, so calls
    # do not share (and overwrite) one another's R session.
    r = RinRuby.new(false,false)
    begin
      r.ds_csv_file = params[:ds_csv_file].to_s
      r.prediction_feature = params[:prediction_feature].to_s
      r.fds_csv_file = params[:fds_csv_file].to_s
      r.del_missing = params[:del_missing] == true ? 1 : 0
      r.f_fds_r = r_result_file.to_s

      # need packs 'randomForest', 'RANN'
      r.eval <<-EOR
        set.seed(1)
        suppressPackageStartupMessages(library('caret'))
        suppressPackageStartupMessages(library('randomForest'))
        suppressPackageStartupMessages(library('RANN'))
        suppressPackageStartupMessages(library('doMC'))
        registerDoMC()

        acts = read.csv(ds_csv_file, check.names=F)
        feats = read.csv(fds_csv_file, check.names=F)
        ds = merge(acts, feats, by="SMILES") # duplicates features for duplicate SMILES :-)

        features = ds[,(dim(acts)[2]+1):(dim(ds)[2])]
        y = ds[,which(names(ds) == prediction_feature)]

        # assumes a data matrix 'features' and a vector 'y' of target values
        row.names(features)=NULL

        pp = NULL
        if (del_missing) {
          # needed if rows should be removed
          na_ids = apply(features,1,function(x)any(is.na(x)))
          features = features[!na_ids,]
          y = y[!na_ids]
          pp = preProcess(features, method=c("scale", "center"))
        } else {
          # Use imputation if NA's random (only then!)
          pp = preProcess(features, method=c("scale", "center", "knnImpute"))
        }
        features = predict(pp, features)

        # determine subsets
        subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
        subsets = c(2,3,4,5,7,10,subsets)
        subsets = unique(sort(round(subsets)))
        subsets = subsets[subsets<=dim(features)[2]]
        subsets = subsets[subsets>1]

        # Recursive feature elimination
        rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=50), sizes=subsets)

        # read existing dataset and select most useful features
        csv=feats[,c("SMILES", rfProfile$optVariables)]
        write.csv(x=csv,file=f_fds_r, row.names=F, quote=F, na='')
      EOR
    ensure
      r.quit # free R, even when the R code above raised
    end

    r_result_file
  end
end
|
509
|
+
|
695
510
|
module Substructure
  include Algorithm

  # Substructure matching
  # Delegates to the compound's own match method.
  # @param [Hash] required keys: compound, features
  # @return [Array] Array with matching Smarts
  def self.match(params)
    params[:compound].match(params[:features])
  end

  # Substructure matching with number of non-unique hits
  # Delegates to the compound's own match_hits method.
  # @param [Hash] required keys: compound, features
  # @return [Hash] Hash with matching Smarts and number of hits
  def self.match_hits(params)
    params[:compound].match_hits(params[:features])
  end

  # Feature lookup for a compound
  # Delegates to the compound's own lookup method, passing the features, the
  # feature dataset URI, the pc_type and the subjectid through unchanged.
  # @param [Hash] required keys: compound, features, feature_dataset_uri, pc_type;
  #   subjectid is read as well (nil is passed on when the key is absent)
  # @return whatever params[:compound].lookup returns
  #   (NOTE(review): presumably a Hash of features and their values - confirm against the compound class)
  def self.lookup(params)
    params[:compound].lookup(params[:features], params[:feature_dataset_uri],params[:pc_type],params[:subjectid])
  end
end
|
705
533
|
|
706
534
|
module Dataset
|
@@ -709,281 +537,5 @@ module OpenTox
|
|
709
537
|
# Placeholder: accepts a dataset URI and a compound URI but has an empty body, so it returns nil.
def features(dataset_uri,compound_uri)
|
710
538
|
end
|
711
539
|
end
|
712
|
-
|
713
|
-
module Transform
|
714
|
-
include Algorithm
|
715
|
-
|
716
|
-
# The transformer that inverts values.
# 1/x is used, after values have been moved >= 1.
class Inverter
  attr_accessor :offset, :values

  # With one argument the values are transformed (negated, shifted by an
  # offset, then inverted); the offset used is kept in #offset.
  # With two arguments the transformation is reversed using the given offset.
  # Any other argument count leaves #values and #offset unset.
  #
  # Errors raised during the forward transformation (including empty input)
  # are logged at debug level and swallowed; the instance is then only
  # partially set up.
  #
  # @param [Array] Values to transform / restore.
  # @param [Float] Offset for restore (optional).
  def initialize *args
    case args.size
    when 1
      begin
        values=args[0]
        # Check the incoming values: @values is not assigned yet at this point.
        raise "Cannot transform, values empty." if values.size==0
        @values = values.collect { |v| -1.0 * v }
        @offset = 1.0 - @values.minmax[0]
        @offset = -1.0 * @offset if @offset>0.0
        @values.collect! { |v| v - @offset }   # slide >1
        @values.collect! { |v| 1 / v }         # invert to [0,1]
      rescue Exception => e
        LOGGER.debug "#{e.class}: #{e.message}"
        LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
      end
    when 2
      # Reverse the steps of the forward transformation in opposite order.
      @offset = args[1].to_f
      @values = args[0].collect { |v| 1 / v }
      @values.collect! { |v| v + @offset }
      @values.collect! { |v| -1.0 * v }
    end
  end
end
|
746
|
-
|
747
|
-
# Logarithmic transformer.
# Applies log10 after shifting the values by an offset and a tiny constant.
class Log10
  attr_accessor :offset, :values

  # One argument: transform the given values (the offset used is kept in #offset).
  # Two arguments: restore values using the given offset.
  # Errors during the forward transformation (including empty input) are
  # logged at debug level and swallowed.
  #
  # @param [Array] Values to transform / restore.
  # @param [Float] Offset for restore (optional).
  def initialize *args
    @distance_to_zero = 0.000000001 # 1 / 1 billion
    if args.size == 1
      begin
        input = args[0]
        raise "Cannot transform, values empty." if input.size==0
        @offset = input.minmax.first
        @offset = -1.0 * @offset if @offset>0.0
        @values = input.collect { |item| item - @offset }        # slide > anchor
        @values.collect! { |item| item + @distance_to_zero }     # keep strictly away from zero
        @values.collect! { |item| Math.log10(item) }             # log10 (can fail)
      rescue Exception => e
        LOGGER.debug "#{e.class}: #{e.message}"
        LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
      end
    elsif args.size == 2
      # Undo the forward steps in reverse order.
      @offset = args[1].to_f
      @values = args[0].collect { |item| 10**item }
      @values.collect! { |item| item - @distance_to_zero }
      @values.collect! { |item| item + @offset }
    end
  end
end
|
778
|
-
|
779
|
-
# Identity transformer (No OPeration): stores the values unchanged.
class NOP
  attr_accessor :offset, :values

  # Keeps the first argument as #values when called with one or two
  # arguments; the second argument (offset for restore) is ignored and
  # #offset is always 0.0.
  #
  # @param [Array] Values to transform / restore.
  # @param [Float] Offset for restore (ignored).
  def initialize *args
    @offset = 0.0
    @distance_to_zero = 0.0
    @values = args[0] if [1, 2].include?(args.size)
  end
end
|
796
|
-
|
797
|
-
|
798
|
-
# Auto-scaler: centers values on their mean and divides by the sample
# standard deviation (division is skipped when the deviation is 0.0).
class AutoScale
  attr_accessor :scaled_values, :mean, :stdev

  # Computes #mean and #stdev via the values' to_scale vector and stores the
  # centered (and, if possible, scaled) values in #scaled_values.
  #
  # @param [Array] Values to transform.
  def initialize values
    @scaled_values = values
    scale_vector = values.to_scale
    @mean = scale_vector.mean
    @stdev = scale_vector.standard_deviation_sample
    @scaled_values = values.collect { |item| item - @mean }
    @scaled_values.collect! { |item| item / @stdev } unless @stdev == 0.0
  end
end
|
812
|
-
|
813
|
-
# Principal Components Analysis
# Statsample Library (http://ruby-statsample.rubyforge.org/) by C. Bustos
class PCA
  attr_accessor :data_matrix, :data_transformed_matrix, :eigenvector_matrix, :eigenvalue_sums, :autoscaler

  # Creates a transformed dataset as GSL::Matrix.
  # Steps: drop zero-variance columns, auto-scale the remaining columns,
  # run Statsample's PCA on their correlation matrix, keep leading
  # eigenvectors according to the compression ratio and project the data.
  # Any error is logged at debug level and swallowed, leaving the instance
  # partially initialised.
  #
  # @param [GSL::Matrix] Data matrix.
  # @param [Float] Compression ratio from [0,1].
  def initialize data_matrix, compression=0.05
    @data_matrix = data_matrix
    @compression = compression.to_f
    @stdev = []
    @mean = []

    # Objective Feature Selection
    raise "Error! PCA needs at least two dimensions." if data_matrix.size2 < 2
    @data_matrix_selected = nil
    @data_matrix.size2.times do |i|
      next if Algorithm.zero_variance?(@data_matrix.col(i).to_a)
      if @data_matrix_selected.nil?
        # first kept column: allocate a one-column matrix and copy into it
        @data_matrix_selected = GSL::Matrix.alloc(@data_matrix.size1, 1)
        @data_matrix_selected.col(0)[0..@data_matrix.size1-1] = @data_matrix.col(i)
      else
        extra_column = GSL::Matrix.alloc(@data_matrix.col(i).to_a,@data_matrix.size1, 1)
        @data_matrix_selected = @data_matrix_selected.horzcat(extra_column)
      end
    end
    raise "Error! PCA needs at least two dimensions." if (@data_matrix_selected.nil? || @data_matrix_selected.size2 < 2)

    # Scaling of Axes (mean and stdev per column are remembered for #restore)
    @data_matrix_scaled = GSL::Matrix.alloc(@data_matrix_selected.size1, @data_matrix_selected.size2)
    @data_matrix_selected.size2.times do |i|
      @autoscaler = OpenTox::Algorithm::Transform::AutoScale.new(@data_matrix_selected.col(i))
      @data_matrix_scaled.col(i)[0..@data_matrix.size1-1] = @autoscaler.scaled_values
      @stdev << @autoscaler.stdev
      @mean << @autoscaler.mean
    end

    data_matrix_hash = {}
    @data_matrix_scaled.size2.times do |i|
      data_matrix_hash[i] = @data_matrix_scaled.col(i).to_scale
    end
    dataset_hash = data_matrix_hash.to_dataset # see http://goo.gl/7XcW9
    cor_matrix = Statsample::Bivariate.correlation_matrix(dataset_hash)
    pca = Statsample::Factor::PCA.new(cor_matrix)
    pca.eigenvalues.each { |ev| raise "PCA failed!" if ev.nan? }

    # cumulative eigenvalue sums, one entry per field
    field_count = dataset_hash.fields.size
    @eigenvalue_sums = []
    field_count.times do |i|
      @eigenvalue_sums << pca.eigenvalues[0..i].inject { |sum, ev| sum + ev }
    end

    # keep eigenvectors while the cumulative sum stays within the budget;
    # at least one eigenvector is always kept
    eigenvectors_selected = []
    variance_budget = (1.0-@compression)*field_count
    pca.eigenvectors.each_with_index do |ev, i|
      if @eigenvalue_sums[i] <= variance_budget || eigenvectors_selected.empty?
        eigenvectors_selected << ev.to_a
      end
    end
    @eigenvector_matrix = GSL::Matrix.alloc(eigenvectors_selected.flatten, eigenvectors_selected.size, field_count).transpose
    dataset_matrix = dataset_hash.to_gsl.transpose
    @data_transformed_matrix = (@eigenvector_matrix.transpose * dataset_matrix).transpose
  rescue Exception => e
    LOGGER.debug "#{e.class}: #{e.message}"
    LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
  end

  # Restores data in the original feature space (possibly with compression loss).
  # Errors are logged at debug level and swallowed.
  # @return [GSL::Matrix] Data matrix.
  def restore
    restored = (@eigenvector_matrix * @data_transformed_matrix.transpose).transpose # reverse pca
    # reverse scaling
    restored.size2.times do |i|
      restored.col(i)[0..restored.size1-1] *= @stdev[i] unless @stdev[i] == 0.0
      restored.col(i)[0..restored.size1-1] += @mean[i]
    end
    restored
  rescue Exception => e
    LOGGER.debug "#{e.class}: #{e.message}"
    LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
  end

end
|
899
|
-
|
900
|
-
end
|
901
|
-
|
902
|
-
# Gauss kernel, evaluated on the distance of x from 1.0.
# @param x value convertible with to_f
# @param sigma width of the kernel (default 0.3)
# @return [Float]
def self.gauss(x, sigma = 0.3)
  distance = 1.0 - x.to_f
  exponent = -(distance*distance)/(2*sigma*sigma)
  Math.exp(exponent)
end
|
908
|
-
|
909
|
-
# For symbolic features
# @param [Array] Array to test, must indicate non-occurrence with 0.
# @return [Boolean] true when the number of zeroes equals the array size
#   (non-occurring feature), the array size minus one (singular feature)
#   or zero (feature present everywhere).
def self.isnull_or_singular?(array)
  zero_count = array.count(0)
  [array.size, array.size - 1, 0].include?(zero_count)
end
|
918
|
-
|
919
|
-
# Numeric value test
# @param [Object] value
# @return [Boolean] true when Kernel#Float accepts the value, false when it raises
def self.numeric?(value)
  Float(value)
  true
rescue StandardError
  false
end
|
925
|
-
|
926
|
-
# Zero variance test
# @param [Array] Array to test (must respond to to_scale).
# @return [Boolean] Whether the population variance of the values equals 0.0.
def self.zero_variance?(array)
  population_variance = array.to_scale.variance_population
  population_variance == 0.0
end
|
932
|
-
|
933
|
-
# Total size of the elements of an array.
# @param [Array] Array with values (each must respond to size)
# @return [Integer] Sum of size of values
def self.sum_size(array)
  array.inject(0) { |total, element| total + element.size }
end
|
941
|
-
|
942
|
-
# Minimum Frequency
# Scales the per-mil value by the number of training compounds; the result
# is never below 2 and is truncated to an Integer.
# @param training_dataset object whose compounds respond to size
# @param [Integer] per-mil value
# @return [Integer] min-frequency
def self.min_frequency(training_dataset,per_mil)
  # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
  scaled = per_mil * training_dataset.compounds.size.to_f / 1000.0
  Integer(scaled > 2 ? scaled : 2)
end
|
950
|
-
|
951
|
-
# Effect calculation for classification
# Compares, per class, the observed share of occurrences with the share of
# database instances and returns the index of the class with the largest
# relative excess (index 0 when no class exceeds its expected share).
# @param [Array] Array of occurrences per class in the form of Enumerables.
# @param [Array] Array of database instance counts per class.
# @return [Integer] index of the class with the strongest effect
def self.effect(occurrences, db_instances)
  best_index = 0
  best_gain = 0
  total_occurrences = self.sum_size(occurrences)
  total_instances = db_instances.to_scale.sum

  # fminer outputs occurrences sorted reverse by activity.
  occurrences.each_with_index do |hits, idx|
    observed = hits.size.to_f/total_occurrences
    anticipated = db_instances[idx].to_f/total_instances
    next unless observed > anticipated
    gain = (observed - anticipated) / observed
    next unless gain > best_gain
    best_gain = gain # running maximum
    best_index = idx
  end
  best_index
end
|
972
|
-
|
973
|
-
# Returns the support value of a fingerprint.
# For every feature the Gauss-weighted weight is multiplied by the result of
# calling params[:mode] on the non-nil hit counts of query and training
# compound; the products are summed up.
# @param [Hash] params Keys: `:compound_features_hits, :weights, :training_compound_features_hits, :features, :nr_hits:, :mode` are required
# @return [Numeric] Support value
def self.p_sum_support(params)
  params[:features].inject(0.0) do |p_sum, f|
    compound_hits = params[:compound_features_hits][f]
    neighbor_hits = params[:training_compound_features_hits][f]
    # NOTE(review): params[:mode] is interpolated into eval'ed code - it must
    # never come from untrusted input. The evaluated string relies on the
    # local names params, f, compound_hits and neighbor_hits.
    p_sum + eval("(Algorithm.gauss(params[:weights][f]) * ([compound_hits, neighbor_hits].compact.#{params[:mode]}))")
  end
end
|
985
|
-
|
986
540
|
end
|
987
541
|
end
|
988
|
-
|
989
|
-
|