opentox-ruby 3.0.1 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +8 -0
- data/Rakefile +2 -3
- data/VERSION +1 -1
- data/lib/algorithm.rb +227 -675
- data/lib/authorization.rb +10 -8
- data/lib/compound.rb +47 -11
- data/lib/dataset.rb +50 -2
- data/lib/environment.rb +6 -1
- data/lib/model.rb +37 -72
- data/lib/opentox-ruby.rb +1 -1
- data/lib/parser.rb +115 -57
- data/lib/r-util.rb +354 -0
- data/lib/rest_client_wrapper.rb +1 -1
- data/lib/serializer.rb +47 -30
- data/lib/stratification.R +201 -0
- data/lib/task.rb +5 -1
- data/lib/transform.rb +520 -0
- data/lib/utils.rb +372 -0
- data/lib/validation.rb +52 -6
- metadata +413 -428
data/ChangeLog
CHANGED
@@ -1,3 +1,11 @@
|
|
1
|
+
v3.1.0 2012-02-24
|
2
|
+
* utils.rb: added for special routines (e.g. descriptor calculation)
|
3
|
+
* task.rb: Polling with increasing interval
|
4
|
+
* parser.rb: CSV upload and download fixed
|
5
|
+
* transform.rb: routines to create machine learning data matrices
|
6
|
+
* algorithm.rb: SVM parameter grid search, cos similarity as algorithm,
|
7
|
+
gauss() removed
|
8
|
+
|
1
9
|
v3.0.1 2011-10-19
|
2
10
|
* feature: model registration to ontology service
|
3
11
|
* ontology lib gets endpoints from ontology service
|
data/Rakefile
CHANGED
@@ -16,7 +16,7 @@ begin
|
|
16
16
|
gem.add_dependency "sinatra-respond_to", "=0.7.0"
|
17
17
|
gem.add_dependency "sinatra-static-assets", "=0.5.0"
|
18
18
|
gem.add_dependency "rest-client", "=1.6.1"
|
19
|
-
gem.add_dependency "rack", "=1.3.
|
19
|
+
gem.add_dependency "rack", "=1.3.5"
|
20
20
|
gem.add_dependency "rack-contrib", "=1.1.0"
|
21
21
|
gem.add_dependency "rack-flash", "=0.1.1"
|
22
22
|
gem.add_dependency "nokogiri", "=1.4.4"
|
@@ -42,10 +42,9 @@ begin
|
|
42
42
|
gem.add_dependency "dm-migrations", "=1.1.0"
|
43
43
|
gem.add_dependency "dm-validations", "=1.1.0"
|
44
44
|
gem.add_dependency "dm-sqlite-adapter", "=1.1.0"
|
45
|
-
gem.add_dependency "ruby-plot", "=0.
|
45
|
+
gem.add_dependency "ruby-plot", "=0.6.0"
|
46
46
|
gem.add_dependency "gsl", "=1.14.7"
|
47
47
|
gem.add_dependency "statsample", "=1.1.0"
|
48
|
-
#gem.add_dependency "statsample-optimization", "=2.1.0"
|
49
48
|
|
50
49
|
gem.add_development_dependency 'jeweler'
|
51
50
|
gem.files = FileList["[A-Z]*", "{bin,generators,lib,test}/**/*", 'lib/jeweler/templates/.gitignore']
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
3.0
|
1
|
+
3.1.0
|
data/lib/algorithm.rb
CHANGED
@@ -5,6 +5,8 @@ R = nil
|
|
5
5
|
require "rinruby"
|
6
6
|
require "statsample"
|
7
7
|
require 'uri'
|
8
|
+
require 'transform.rb'
|
9
|
+
require 'utils.rb'
|
8
10
|
|
9
11
|
module OpenTox
|
10
12
|
|
@@ -13,7 +15,7 @@ module OpenTox
|
|
13
15
|
|
14
16
|
include OpenTox
|
15
17
|
|
16
|
-
# Execute algorithm with parameters,
|
18
|
+
# Execute algorithm with parameters, consult OpenTox API and webservice documentation for acceptable parameters
|
17
19
|
# @param [optional,Hash] params Algorithm parameters
|
18
20
|
# @param [optional,OpenTox::Task] waiting_task (can be a OpenTox::Subtask as well), progress is updated accordingly
|
19
21
|
# @return [String] URI of new resource (dataset, model, ...)
|
@@ -21,7 +23,7 @@ module OpenTox
|
|
21
23
|
LOGGER.info "Running algorithm '"+@uri.to_s+"' with params: "+params.inspect
|
22
24
|
RestClientWrapper.post(@uri, params, {:accept => 'text/uri-list'}, waiting_task).to_s
|
23
25
|
end
|
24
|
-
|
26
|
+
|
25
27
|
# Get OWL-DL representation in RDF/XML format
|
26
28
|
# @return [application/rdf+xml] RDF/XML representation
|
27
29
|
def to_rdfxml
|
@@ -33,7 +35,7 @@ module OpenTox
|
|
33
35
|
# Generic Algorithm class, should work with all OpenTox webservices
|
34
36
|
class Generic
|
35
37
|
include Algorithm
|
36
|
-
|
38
|
+
|
37
39
|
# Find Generic Opentox Algorithm via URI, and loads metadata, could raise NotFound/NotAuthorized error
|
38
40
|
# @param [String] uri Algorithm URI
|
39
41
|
# @return [OpenTox::Algorithm::Generic] Algorithm instance
|
@@ -44,14 +46,14 @@ module OpenTox
|
|
44
46
|
raise "cannot load algorithm metadata" if alg.metadata==nil or alg.metadata.size==0
|
45
47
|
alg
|
46
48
|
end
|
47
|
-
|
49
|
+
|
48
50
|
end
|
49
51
|
|
50
52
|
# Fminer algorithms (https://github.com/amaunz/fminer2)
|
51
53
|
class Fminer
|
52
54
|
include Algorithm
|
53
55
|
attr_accessor :prediction_feature, :training_dataset, :minfreq, :compounds, :db_class_sizes, :all_activities, :smi
|
54
|
-
|
56
|
+
|
55
57
|
def check_params(params,per_mil,subjectid=nil)
|
56
58
|
raise OpenTox::NotFoundError.new "Please submit a dataset_uri." unless params[:dataset_uri] and !params[:dataset_uri].nil?
|
57
59
|
raise OpenTox::NotFoundError.new "Please submit a prediction_feature." unless params[:prediction_feature] and !params[:prediction_feature].nil?
|
@@ -81,7 +83,7 @@ module OpenTox
|
|
81
83
|
LOGGER.warn "Cannot find smiles for #{compound.to_s}."
|
82
84
|
next
|
83
85
|
end
|
84
|
-
|
86
|
+
|
85
87
|
value_map=params[:value_map] unless params[:value_map].nil?
|
86
88
|
entry.each do |feature,values|
|
87
89
|
if feature == @prediction_feature.uri
|
@@ -90,7 +92,7 @@ module OpenTox
|
|
90
92
|
LOGGER.warn "No #{feature} activity for #{compound.to_s}."
|
91
93
|
else
|
92
94
|
if @prediction_feature.feature_type == "classification"
|
93
|
-
activity= value_map.invert[value].to_i # activities are mapped to 1..n
|
95
|
+
activity= value_map.invert[value.to_s].to_i # activities are mapped to 1..n
|
94
96
|
@db_class_sizes[activity-1].nil? ? @db_class_sizes[activity-1]=1 : @db_class_sizes[activity-1]+=1 # AM effect
|
95
97
|
elsif @prediction_feature.feature_type == "regression"
|
96
98
|
activity= value.to_f
|
@@ -115,23 +117,23 @@ module OpenTox
|
|
115
117
|
|
116
118
|
end
|
117
119
|
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
end
|
120
|
+
# Backbone Refinement Class mining (http://bbrc.maunz.de/)
|
121
|
+
class BBRC < Fminer
|
122
|
+
# Initialize bbrc algorithm
|
123
|
+
def initialize(subjectid=nil)
|
124
|
+
super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/bbrc")
|
125
|
+
load_metadata(subjectid)
|
125
126
|
end
|
127
|
+
end
|
126
128
|
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
end
|
129
|
+
# LAtent STructure Pattern Mining (http://last-pm.maunz.de)
|
130
|
+
class LAST < Fminer
|
131
|
+
# Initialize last algorithm
|
132
|
+
def initialize(subjectid=nil)
|
133
|
+
super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/last")
|
134
|
+
load_metadata(subjectid)
|
134
135
|
end
|
136
|
+
end
|
135
137
|
|
136
138
|
|
137
139
|
# Create lazar prediction model
|
@@ -144,72 +146,6 @@ module OpenTox
|
|
144
146
|
end
|
145
147
|
end
|
146
148
|
|
147
|
-
# Utility methods without dedicated webservices
|
148
|
-
|
149
|
-
# Similarity calculations
|
150
|
-
module Similarity
|
151
|
-
include Algorithm
|
152
|
-
|
153
|
-
# Tanimoto similarity
|
154
|
-
# @param [Array] features_a Features of first compound
|
155
|
-
# @param [Array] features_b Features of second compound
|
156
|
-
# @param [optional, Hash] weights Weights for all features
|
157
|
-
# @param [optional, Hash] params Keys: `:training_compound, :compound, :training_compound_features_hits, :nr_hits, :compound_features_hits` are required
|
158
|
-
# @return [Float] (Weighted) tanimoto similarity
|
159
|
-
def self.tanimoto(features_a,features_b,weights=nil,params=nil)
|
160
|
-
common_features = features_a & features_b
|
161
|
-
all_features = (features_a + features_b).uniq
|
162
|
-
#LOGGER.debug "dv --------------- common: #{common_features}, all: #{all_features}"
|
163
|
-
if common_features.size > 0
|
164
|
-
if weights
|
165
|
-
#LOGGER.debug "nr_hits: #{params[:nr_hits]}"
|
166
|
-
if !params.nil? && params[:nr_hits]
|
167
|
-
params[:weights] = weights
|
168
|
-
params[:mode] = "min"
|
169
|
-
params[:features] = common_features
|
170
|
-
common_p_sum = Algorithm.p_sum_support(params)
|
171
|
-
params[:mode] = "max"
|
172
|
-
params[:features] = all_features
|
173
|
-
all_p_sum = Algorithm.p_sum_support(params)
|
174
|
-
else
|
175
|
-
common_p_sum = 0.0
|
176
|
-
common_features.each{|f| common_p_sum += Algorithm.gauss(weights[f])}
|
177
|
-
all_p_sum = 0.0
|
178
|
-
all_features.each{|f| all_p_sum += Algorithm.gauss(weights[f])}
|
179
|
-
end
|
180
|
-
#LOGGER.debug "common_p_sum: #{common_p_sum}, all_p_sum: #{all_p_sum}, c/a: #{common_p_sum/all_p_sum}"
|
181
|
-
common_p_sum/all_p_sum
|
182
|
-
else
|
183
|
-
#LOGGER.debug "common_features : #{common_features}, all_features: #{all_features}, c/a: #{(common_features.size/all_features.size).to_f}"
|
184
|
-
common_features.size.to_f/all_features.size.to_f
|
185
|
-
end
|
186
|
-
else
|
187
|
-
0.0
|
188
|
-
end
|
189
|
-
end
|
190
|
-
|
191
|
-
# Euclidean similarity
|
192
|
-
# @param [Hash] properties_a Properties of first compound
|
193
|
-
# @param [Hash] properties_b Properties of second compound
|
194
|
-
# @param [optional, Hash] weights Weights for all properties
|
195
|
-
# @return [Float] (Weighted) euclidean similarity
|
196
|
-
def self.euclidean(properties_a,properties_b,weights=nil)
|
197
|
-
common_properties = properties_a.keys & properties_b.keys
|
198
|
-
if common_properties.size > 1
|
199
|
-
dist_sum = 0
|
200
|
-
common_properties.each do |p|
|
201
|
-
if weights
|
202
|
-
dist_sum += ( (properties_a[p] - properties_b[p]) * Algorithm.gauss(weights[p]) )**2
|
203
|
-
else
|
204
|
-
dist_sum += (properties_a[p] - properties_b[p])**2
|
205
|
-
end
|
206
|
-
end
|
207
|
-
1/(1+Math.sqrt(dist_sum))
|
208
|
-
else
|
209
|
-
0.0
|
210
|
-
end
|
211
|
-
end
|
212
|
-
end
|
213
149
|
|
214
150
|
# Structural Graph Clustering by TU Munich
|
215
151
|
# Finds clusters similar to a query structure in a given training dataset
|
@@ -226,7 +162,7 @@ module OpenTox
|
|
226
162
|
raise "Invalid URI."
|
227
163
|
end
|
228
164
|
@training_dataset_uri = training_dataset_uri
|
229
|
-
if !
|
165
|
+
if !self.numeric? training_threshold || training_threshold <0 || training_threshold >1
|
230
166
|
raise "Training threshold out of bounds."
|
231
167
|
end
|
232
168
|
@training_threshold = training_threshold.to_f
|
@@ -259,7 +195,7 @@ module OpenTox
|
|
259
195
|
# @param [Float] query_threshold Similarity threshold for query to clusters (optional, defaults to 0.5)
|
260
196
|
def get_clusters query_compound_uri, query_threshold = 0.5
|
261
197
|
|
262
|
-
if !
|
198
|
+
if !self.numeric? query_threshold || query_threshold <0 || query_threshold >1
|
263
199
|
raise "Query threshold out of bounds."
|
264
200
|
end
|
265
201
|
@query_threshold = query_threshold.to_f
|
@@ -285,7 +221,7 @@ module OpenTox
|
|
285
221
|
metadata[DC.title][pattern]=""
|
286
222
|
feature_clusterid_map[feature_uri] = metadata[DC.title].to_i
|
287
223
|
}
|
288
|
-
|
224
|
+
|
289
225
|
# Integrity check
|
290
226
|
unless cluster_query_dataset.compounds.size == 1
|
291
227
|
raise "Number of predicted compounds is != 1."
|
@@ -295,11 +231,11 @@ module OpenTox
|
|
295
231
|
query_compound_uri = cluster_query_dataset.compounds[0]
|
296
232
|
@target_clusters_array = Array.new
|
297
233
|
cluster_query_dataset.features.keys.each { |cluster_membership_feature|
|
298
|
-
|
234
|
+
|
299
235
|
# Getting dataset URI for cluster
|
300
236
|
target_cluster = feature_clusterid_map[cluster_membership_feature]
|
301
237
|
dataset = @clusterid_dataset_map[target_cluster]
|
302
|
-
|
238
|
+
|
303
239
|
# Finally look up presence
|
304
240
|
data_entry = cluster_query_dataset.data_entries[query_compound_uri]
|
305
241
|
present = data_entry[cluster_membership_feature][0]
|
@@ -311,85 +247,13 @@ module OpenTox
|
|
311
247
|
|
312
248
|
end
|
313
249
|
|
314
|
-
module Neighbors
|
315
|
-
|
316
|
-
# Local multi-linear regression (MLR) prediction from neighbors.
|
317
|
-
# Uses propositionalized setting.
|
318
|
-
# @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
|
319
|
-
# @return [Numeric] A prediction value.
|
320
|
-
def self.local_mlr_prop(params)
|
321
|
-
|
322
|
-
confidence=0.0
|
323
|
-
prediction=nil
|
324
|
-
|
325
|
-
if params[:neighbors].size>0
|
326
|
-
props = params[:prop_kernel] ? get_props(params) : nil
|
327
|
-
acts = params[:neighbors].collect { |n| act = n[:activity].to_f }
|
328
|
-
sims = params[:neighbors].collect { |n| Algorithm.gauss(n[:similarity]) }
|
329
|
-
LOGGER.debug "Local MLR (Propositionalization / GSL)."
|
330
|
-
prediction = mlr( {:n_prop => props[0], :q_prop => props[1], :sims => sims, :acts => acts} )
|
331
|
-
transformer = eval("OpenTox::Algorithm::Transform::#{params[:transform]["class"]}.new ([#{prediction}], #{params[:transform]["offset"]})")
|
332
|
-
prediction = transformer.values[0]
|
333
|
-
prediction = nil if prediction.infinite? || params[:prediction_min_max][1] < prediction || params[:prediction_min_max][0] > prediction
|
334
|
-
LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
|
335
|
-
params[:conf_stdev] = false if params[:conf_stdev].nil?
|
336
|
-
confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]})
|
337
|
-
confidence = nil if prediction.nil?
|
338
|
-
end
|
339
|
-
{:prediction => prediction, :confidence => confidence}
|
340
|
-
|
341
|
-
end
|
342
|
-
|
343
|
-
# Multi-linear regression weighted by similarity.
|
344
|
-
# Objective Feature Selection, Principal Components Analysis, Scaling of Axes.
|
345
|
-
# @param [Hash] params Keys `:n_prop, :q_prop, :sims, :acts` are required
|
346
|
-
# @return [Numeric] A prediction value.
|
347
|
-
def self.mlr(params)
|
348
|
-
|
349
|
-
# GSL matrix operations:
|
350
|
-
# to_a : row-wise conversion to nested array
|
351
|
-
#
|
352
|
-
# Statsample operations (build on GSL):
|
353
|
-
# to_scale: convert into Statsample format
|
354
|
-
|
355
|
-
begin
|
356
|
-
n_prop = params[:n_prop].collect { |v| v }
|
357
|
-
q_prop = params[:q_prop].collect { |v| v }
|
358
|
-
n_prop << q_prop # attach q_prop
|
359
|
-
nr_cases, nr_features = get_sizes n_prop
|
360
|
-
data_matrix = GSL::Matrix.alloc(n_prop.flatten, nr_cases, nr_features)
|
361
|
-
|
362
|
-
# Principal Components Analysis
|
363
|
-
LOGGER.debug "PCA..."
|
364
|
-
pca = OpenTox::Algorithm::Transform::PCA.new(data_matrix)
|
365
|
-
data_matrix = pca.data_transformed_matrix
|
366
|
-
|
367
|
-
# Attach intercept column to data
|
368
|
-
intercept = GSL::Matrix.alloc(Array.new(nr_cases,1.0),nr_cases,1)
|
369
|
-
data_matrix = data_matrix.horzcat(intercept)
|
370
|
-
(0..data_matrix.size2-2).each { |i|
|
371
|
-
autoscaler = OpenTox::Algorithm::Transform::AutoScale.new(data_matrix.col(i))
|
372
|
-
data_matrix.col(i)[0..data_matrix.size1-1] = autoscaler.scaled_values
|
373
|
-
}
|
374
250
|
|
375
|
-
# Detach query instance
|
376
|
-
n_prop = data_matrix.to_a
|
377
|
-
q_prop = n_prop.pop
|
378
|
-
nr_cases, nr_features = get_sizes n_prop
|
379
|
-
data_matrix = GSL::Matrix.alloc(n_prop.flatten, nr_cases, nr_features)
|
380
251
|
|
381
|
-
|
382
|
-
LOGGER.debug "Creating MLR model ..."
|
383
|
-
c, cov, chisq, status = GSL::MultiFit::wlinear(data_matrix, params[:sims].to_scale.to_gsl, params[:acts].to_scale.to_gsl)
|
384
|
-
GSL::MultiFit::linear_est(q_prop.to_scale.to_gsl, c, cov)[0]
|
385
|
-
rescue Exception => e
|
386
|
-
LOGGER.debug "#{e.class}: #{e.message}"
|
387
|
-
end
|
252
|
+
module Neighbors
|
388
253
|
|
389
|
-
end
|
390
254
|
|
391
255
|
# Classification with majority vote from neighbors weighted by similarity
|
392
|
-
# @param [Hash] params Keys `:
|
256
|
+
# @param [Hash] params Keys `:acts, :sims, :value_map` are required
|
393
257
|
# @return [Numeric] A prediction value.
|
394
258
|
def self.weighted_majority_vote(params)
|
395
259
|
|
@@ -398,12 +262,13 @@ module OpenTox
|
|
398
262
|
confidence = 0.0
|
399
263
|
prediction = nil
|
400
264
|
|
401
|
-
|
402
|
-
neighbor_weight = Algorithm.gauss(neighbor[:similarity]).to_f
|
403
|
-
neighbor_contribution += neighbor[:activity].to_f * neighbor_weight
|
265
|
+
LOGGER.debug "Weighted Majority Vote Classification."
|
404
266
|
|
267
|
+
params[:acts].each_index do |idx|
|
268
|
+
neighbor_weight = params[:sims][1][idx]
|
269
|
+
neighbor_contribution += params[:acts][idx] * neighbor_weight
|
405
270
|
if params[:value_map].size == 2 # AM: provide compat to binary classification: 1=>false 2=>true
|
406
|
-
case
|
271
|
+
case params[:acts][idx]
|
407
272
|
when 1
|
408
273
|
confidence_sum -= neighbor_weight
|
409
274
|
when 2
|
@@ -413,294 +278,257 @@ module OpenTox
|
|
413
278
|
confidence_sum += neighbor_weight
|
414
279
|
end
|
415
280
|
end
|
416
|
-
|
417
281
|
if params[:value_map].size == 2
|
418
282
|
if confidence_sum >= 0.0
|
419
|
-
prediction = 2 unless params[:
|
283
|
+
prediction = 2 unless params[:acts].size==0
|
420
284
|
elsif confidence_sum < 0.0
|
421
|
-
prediction = 1 unless params[:
|
285
|
+
prediction = 1 unless params[:acts].size==0
|
422
286
|
end
|
423
287
|
else
|
424
|
-
prediction = (neighbor_contribution/confidence_sum).round unless params[:
|
288
|
+
prediction = (neighbor_contribution/confidence_sum).round unless params[:acts].size==0 # AM: new multinomial prediction
|
425
289
|
end
|
290
|
+
|
426
291
|
LOGGER.debug "Prediction is: '" + prediction.to_s + "'." unless prediction.nil?
|
427
|
-
confidence = confidence_sum/params[:
|
292
|
+
confidence = (confidence_sum/params[:acts].size).abs if params[:acts].size > 0
|
428
293
|
LOGGER.debug "Confidence is: '" + confidence.to_s + "'." unless prediction.nil?
|
429
294
|
return {:prediction => prediction, :confidence => confidence.abs}
|
430
295
|
end
|
431
296
|
|
297
|
+
|
298
|
+
|
432
299
|
# Local support vector regression from neighbors
|
433
|
-
# @param [Hash] params Keys `:
|
300
|
+
# @param [Hash] params Keys `:props, :acts, :sims, :min_train_performance` are required
|
434
301
|
# @return [Numeric] A prediction value.
|
435
302
|
def self.local_svm_regression(params)
|
436
303
|
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
304
|
+
begin
|
305
|
+
confidence = 0.0
|
306
|
+
prediction = nil
|
307
|
+
|
308
|
+
LOGGER.debug "Local SVM."
|
309
|
+
if params[:acts].size>0
|
310
|
+
if params[:props]
|
311
|
+
n_prop = params[:props][0].collect
|
312
|
+
q_prop = params[:props][1].collect
|
313
|
+
props = [ n_prop, q_prop ]
|
314
|
+
end
|
315
|
+
acts = params[:acts].collect
|
316
|
+
prediction = local_svm_prop( props, acts, params[:min_train_performance]) # params[:props].nil? signals non-prop setting
|
317
|
+
prediction = nil if (!prediction.nil? && prediction.infinite?)
|
318
|
+
LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
|
319
|
+
confidence = get_confidence({:sims => params[:sims][1], :acts => params[:acts]})
|
320
|
+
confidence = 0.0 if prediction.nil?
|
321
|
+
end
|
322
|
+
{:prediction => prediction, :confidence => confidence}
|
323
|
+
rescue Exception => e
|
324
|
+
LOGGER.debug "#{e.class}: #{e.message}"
|
325
|
+
LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
|
451
326
|
end
|
452
|
-
|
453
|
-
|
327
|
+
|
454
328
|
end
|
455
329
|
|
456
|
-
|
457
|
-
#
|
330
|
+
|
331
|
+
# Local support vector classification from neighbors
|
332
|
+
# @param [Hash] params Keys `:props, :acts, :sims, :min_train_performance` are required
|
458
333
|
# @return [Numeric] A prediction value.
|
459
334
|
def self.local_svm_classification(params)
|
460
335
|
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
336
|
+
begin
|
337
|
+
confidence = 0.0
|
338
|
+
prediction = nil
|
339
|
+
|
340
|
+
LOGGER.debug "Local SVM."
|
341
|
+
if params[:acts].size>0
|
342
|
+
if params[:props]
|
343
|
+
n_prop = params[:props][0].collect
|
344
|
+
q_prop = params[:props][1].collect
|
345
|
+
props = [ n_prop, q_prop ]
|
346
|
+
end
|
347
|
+
acts = params[:acts].collect
|
348
|
+
acts = acts.collect{|v| "Val" + v.to_s} # Convert to string for R to recognize classification
|
349
|
+
prediction = local_svm_prop( props, acts, params[:min_train_performance]) # params[:props].nil? signals non-prop setting
|
350
|
+
prediction = prediction.sub(/Val/,"") if prediction # Convert back to Float
|
351
|
+
confidence = 0.0 if prediction.nil?
|
352
|
+
LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
|
353
|
+
confidence = get_confidence({:sims => params[:sims][1], :acts => params[:acts]})
|
354
|
+
end
|
355
|
+
{:prediction => prediction, :confidence => confidence}
|
356
|
+
rescue Exception => e
|
357
|
+
LOGGER.debug "#{e.class}: #{e.message}"
|
358
|
+
LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
|
471
359
|
end
|
472
|
-
|
473
|
-
|
360
|
+
|
474
361
|
end
|
475
362
|
|
476
363
|
|
364
|
+
|
477
365
|
# Local support vector prediction from neighbors.
|
478
|
-
# Uses
|
366
|
+
# Uses propositionalized setting.
|
479
367
|
# Not to be called directly (use local_svm_regression or local_svm_classification).
|
368
|
+
# @param [Array] props, propositionalization of neighbors and query structure e.g. [ two-nested-Arrays_for_n, Array_for_q ]
|
480
369
|
# @param [Array] acts, activities for neighbors.
|
481
|
-
# @param [
|
482
|
-
# @param [String] type, one of "nu-svr" (regression) or "C-bsvc" (classification).
|
483
|
-
# @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
|
370
|
+
# @param [Float] min_train_performance, parameter to control censoring
|
484
371
|
# @return [Numeric] A prediction value.
|
485
|
-
def self.
|
486
|
-
|
487
|
-
|
488
|
-
|
372
|
+
def self.local_svm_prop(props, acts, min_train_performance)
|
373
|
+
|
374
|
+
LOGGER.debug "Local SVM (Propositionalization / Kernlab Kernel)."
|
375
|
+
n_prop = props[0] # is a matrix, i.e. two nested Arrays.
|
376
|
+
q_prop = props[1] # is an Array.
|
489
377
|
|
490
378
|
prediction = nil
|
491
379
|
if Algorithm::zero_variance? acts
|
492
380
|
prediction = acts[0]
|
493
381
|
else
|
494
|
-
# gram matrix
|
495
|
-
(0..(neighbor_matches.length-1)).each do |i|
|
496
|
-
neighbor_i_hits = params[:fingerprints][params[:neighbors][i][:compound]]
|
497
|
-
gram_matrix[i] = [] unless gram_matrix[i]
|
498
|
-
# upper triangle
|
499
|
-
((i+1)..(neighbor_matches.length-1)).each do |j|
|
500
|
-
neighbor_j_hits= params[:fingerprints][params[:neighbors][j][:compound]]
|
501
|
-
sim_params = {}
|
502
|
-
if params[:nr_hits]
|
503
|
-
sim_params[:nr_hits] = true
|
504
|
-
sim_params[:compound_features_hits] = neighbor_i_hits
|
505
|
-
sim_params[:training_compound_features_hits] = neighbor_j_hits
|
506
|
-
end
|
507
|
-
sim = eval("#{params[:similarity_algorithm]}(neighbor_matches[i], neighbor_matches[j], params[:p_values], sim_params)")
|
508
|
-
gram_matrix[i][j] = Algorithm.gauss(sim)
|
509
|
-
gram_matrix[j] = [] unless gram_matrix[j]
|
510
|
-
gram_matrix[j][i] = gram_matrix[i][j] # lower triangle
|
511
|
-
end
|
512
|
-
gram_matrix[i][i] = 1.0
|
513
|
-
end
|
514
|
-
|
515
|
-
|
516
382
|
#LOGGER.debug gram_matrix.to_yaml
|
517
383
|
@r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests
|
518
|
-
@r.eval "
|
519
|
-
|
520
|
-
#
|
521
|
-
@r.
|
522
|
-
@r.n = neighbor_matches.size
|
523
|
-
@r.y = acts
|
524
|
-
@r.sims = sims
|
525
|
-
|
384
|
+
@r.eval "set.seed(1)"
|
385
|
+
@r.eval "suppressPackageStartupMessages(library('caret'))" # requires R packages "caret" and "kernlab"
|
386
|
+
@r.eval "suppressPackageStartupMessages(library('doMC'))" # requires R packages "multicore"
|
387
|
+
@r.eval "registerDoMC()" # switch on parallel processing
|
526
388
|
begin
|
527
|
-
LOGGER.debug "Preparing R data ..."
|
528
|
-
# prepare data
|
529
|
-
@r.eval "y<-as.vector(y)"
|
530
|
-
@r.eval "gram_matrix<-as.kernelMatrix(matrix(gram_matrix,n,n))"
|
531
|
-
@r.eval "sims<-as.vector(sims)"
|
532
|
-
|
533
|
-
# model + support vectors
|
534
|
-
LOGGER.debug "Creating SVM model ..."
|
535
|
-
@r.eval "model<-ksvm(gram_matrix, y, kernel=matrix, type=\"#{type}\", nu=0.5)"
|
536
|
-
@r.eval "sv<-as.vector(SVindex(model))"
|
537
|
-
@r.eval "sims<-sims[sv]"
|
538
|
-
@r.eval "sims<-as.kernelMatrix(matrix(sims,1))"
|
539
|
-
LOGGER.debug "Predicting ..."
|
540
|
-
if type == "nu-svr"
|
541
|
-
@r.eval "p<-predict(model,sims)[1,1]"
|
542
|
-
elsif type == "C-bsvc"
|
543
|
-
@r.eval "p<-predict(model,sims)"
|
544
|
-
end
|
545
|
-
if type == "nu-svr"
|
546
|
-
prediction = @r.p
|
547
|
-
elsif type == "C-bsvc"
|
548
|
-
#prediction = (@r.p.to_f == 1.0 ? true : false)
|
549
|
-
prediction = @r.p
|
550
|
-
end
|
551
|
-
@r.quit # free R
|
552
|
-
rescue Exception => e
|
553
|
-
LOGGER.debug "#{e.class}: #{e.message}"
|
554
|
-
LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
|
555
|
-
end
|
556
|
-
|
557
|
-
end
|
558
|
-
prediction
|
559
|
-
end
|
560
|
-
|
561
|
-
# Local support vector prediction from neighbors.
|
562
|
-
# Uses propositionalized setting.
|
563
|
-
# Not to be called directly (use local_svm_regression or local_svm_classification).
|
564
|
-
# @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ]
|
565
|
-
# @param [Array] acts, activities for neighbors.
|
566
|
-
# @param [String] type, one of "nu-svr" (regression) or "C-bsvc" (classification).
|
567
|
-
# @return [Numeric] A prediction value.
|
568
|
-
def self.local_svm_prop(props, acts, type)
|
569
|
-
|
570
|
-
LOGGER.debug "Local SVM (Propositionalization / Kernlab Kernel)."
|
571
|
-
n_prop = props[0] # is a matrix, i.e. two nested Arrays.
|
572
|
-
q_prop = props[1] # is an Array.
|
573
389
|
|
574
|
-
prediction = nil
|
575
|
-
if Algorithm::zero_variance? acts
|
576
|
-
prediction = acts[0]
|
577
|
-
else
|
578
|
-
#LOGGER.debug gram_matrix.to_yaml
|
579
|
-
@r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests
|
580
|
-
@r.eval "library('kernlab')" # this requires R package "kernlab" to be installed
|
581
|
-
LOGGER.debug "Setting R data ..."
|
582
390
|
# set data
|
391
|
+
LOGGER.debug "Setting R data ..."
|
583
392
|
@r.n_prop = n_prop.flatten
|
584
393
|
@r.n_prop_x_size = n_prop.size
|
585
394
|
@r.n_prop_y_size = n_prop[0].size
|
586
395
|
@r.y = acts
|
587
396
|
@r.q_prop = q_prop
|
397
|
+
#@r.eval "y = matrix(y)"
|
398
|
+
@r.eval "prop_matrix = matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=T)"
|
399
|
+
@r.eval "q_prop = matrix(q_prop, 1, n_prop_y_size, byrow=T)"
|
588
400
|
|
589
|
-
|
590
|
-
|
591
|
-
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
if
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
if type == "nu-svr"
|
606
|
-
prediction = @r.p
|
607
|
-
elsif type == "C-bsvc"
|
608
|
-
#prediction = (@r.p.to_f == 1.0 ? true : false)
|
609
|
-
prediction = @r.p
|
610
|
-
end
|
611
|
-
@r.quit # free R
|
612
|
-
rescue Exception => e
|
613
|
-
LOGGER.debug "#{e.class}: #{e.message}"
|
614
|
-
LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
|
615
|
-
end
|
616
|
-
end
|
617
|
-
prediction
|
618
|
-
end
|
401
|
+
# prepare data
|
402
|
+
LOGGER.debug "Preparing R data ..."
|
403
|
+
@r.eval "if (class(y) == 'character') { y = factor(y); suppressPackageStartupMessages(library('class')) }" # For classification
|
404
|
+
|
405
|
+
@r.eval <<-EOR
|
406
|
+
rem = nearZeroVar(prop_matrix)
|
407
|
+
if (length(rem) > 0) {
|
408
|
+
prop_matrix = prop_matrix[,-rem,drop=F]
|
409
|
+
q_prop = q_prop[,-rem,drop=F]
|
410
|
+
}
|
411
|
+
rem = findCorrelation(cor(prop_matrix))
|
412
|
+
if (length(rem) > 0) {
|
413
|
+
prop_matrix = prop_matrix[,-rem,drop=F]
|
414
|
+
q_prop = q_prop[,-rem,drop=F]
|
415
|
+
}
|
416
|
+
EOR
|
619
417
|
|
620
|
-
|
621
|
-
|
622
|
-
|
623
|
-
|
624
|
-
|
625
|
-
|
626
|
-
if sim_median.nil?
|
627
|
-
confidence = nil
|
628
|
-
else
|
629
|
-
standard_deviation = params[:acts].to_scale.standard_deviation_sample
|
630
|
-
confidence = (sim_median*Math.exp(-1*standard_deviation)).abs
|
631
|
-
if confidence.nan?
|
632
|
-
confidence = nil
|
633
|
-
end
|
634
|
-
end
|
635
|
-
else
|
636
|
-
conf = params[:sims].inject{|sum,x| sum + x }
|
637
|
-
confidence = conf/params[:neighbors].size
|
638
|
-
end
|
639
|
-
LOGGER.debug "Confidence is: '" + confidence.to_s + "'."
|
640
|
-
return confidence
|
641
|
-
end
|
418
|
+
# model + support vectors
|
419
|
+
LOGGER.debug "Creating R SVM model ..."
|
420
|
+
@r.eval <<-EOR
|
421
|
+
model = train(prop_matrix,y,method="svmradial",tuneLength=8,trControl=trainControl(method="LGOCV",number=10),preProcess=c("center", "scale"))
|
422
|
+
perf = ifelse ( class(y)!='numeric', max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
|
423
|
+
EOR
|
642
424
|
|
643
|
-
# Get X and Y size of a nested Array (Matrix)
|
644
|
-
def self.get_sizes(matrix)
|
645
|
-
begin
|
646
|
-
nr_cases = matrix.size
|
647
|
-
nr_features = matrix[0].size
|
648
|
-
rescue Exception => e
|
649
|
-
LOGGER.debug "#{e.class}: #{e.message}"
|
650
|
-
LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
|
651
|
-
end
|
652
|
-
#puts "NRC: #{nr_cases}, NRF: #{nr_features}"
|
653
|
-
[ nr_cases, nr_features ]
|
654
|
-
end
|
655
425
|
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
667
|
-
|
668
|
-
row = []
|
669
|
-
params[:features].each do |f|
|
670
|
-
if ! params[:fingerprints][n].nil?
|
671
|
-
row << (params[:fingerprints][n].include?(f) ? (params[:p_values][f] * params[:fingerprints][n][f]) : 0.0)
|
672
|
-
else
|
673
|
-
row << 0.0
|
674
|
-
end
|
675
|
-
end
|
676
|
-
matrix << row
|
677
|
-
end
|
678
|
-
row = []
|
679
|
-
params[:features].each do |f|
|
680
|
-
if params[:nr_hits]
|
681
|
-
compound_feature_hits = params[:compound].match_hits([f])
|
682
|
-
row << (compound_feature_hits.size == 0 ? 0.0 : (params[:p_values][f] * compound_feature_hits[f]))
|
683
|
-
else
|
684
|
-
row << (params[:compound].match([f]).size == 0 ? 0.0 : params[:p_values][f])
|
685
|
-
end
|
426
|
+
# prediction
|
427
|
+
LOGGER.debug "Predicting ..."
|
428
|
+
@r.eval "p = predict(model,q_prop)"
|
429
|
+
@r.eval "if (class(y)!='numeric') p = as.character(p)"
|
430
|
+
prediction = @r.p
|
431
|
+
|
432
|
+
# censoring
|
433
|
+
prediction = nil if ( @r.perf.nan? || @r.perf < min_train_performance )
|
434
|
+
LOGGER.debug "Performance: #{sprintf("%.2f", @r.perf)}"
|
435
|
+
rescue Exception => e
|
436
|
+
LOGGER.debug "#{e.class}: #{e.message}"
|
437
|
+
LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
|
686
438
|
end
|
687
|
-
|
688
|
-
LOGGER.debug "get_props failed with '" + $! + "'"
|
439
|
+
@r.quit # free R
|
689
440
|
end
|
690
|
-
|
441
|
+
prediction
|
691
442
|
end
|
692
443
|
|
693
444
|
end
|
694
445
|
|
446
|
+
module FeatureSelection
|
447
|
+
include Algorithm
|
448
|
+
# Recursive Feature Elimination using caret
|
449
|
+
# @param [Hash] params Required keys: ds_csv_file, prediction_feature, fds_csv_file (dataset CSV file, prediction feature column name, and feature dataset CSV file), optional: del_missing (delete rows with missing values).
|
450
|
+
# @return [String] feature dataset CSV file composed of selected features.
|
451
|
+
def self.rfe(params)
|
452
|
+
@r=RinRuby.new(false,false)
|
453
|
+
@r.ds_csv_file = params[:ds_csv_file].to_s
|
454
|
+
@r.prediction_feature = params[:prediction_feature].to_s
|
455
|
+
@r.fds_csv_file = params[:fds_csv_file].to_s
|
456
|
+
@r.del_missing = params[:del_missing] == true ? 1 : 0
|
457
|
+
r_result_file = params[:fds_csv_file].sub("rfe_", "rfe_R_")
|
458
|
+
@r.f_fds_r = r_result_file.to_s
|
459
|
+
|
460
|
+
# need packs 'randomForest', 'RANN'
|
461
|
+
@r.eval <<-EOR
|
462
|
+
set.seed(1)
|
463
|
+
suppressPackageStartupMessages(library('caret'))
|
464
|
+
suppressPackageStartupMessages(library('randomForest'))
|
465
|
+
suppressPackageStartupMessages(library('RANN'))
|
466
|
+
suppressPackageStartupMessages(library('doMC'))
|
467
|
+
registerDoMC()
|
468
|
+
|
469
|
+
acts = read.csv(ds_csv_file, check.names=F)
|
470
|
+
feats = read.csv(fds_csv_file, check.names=F)
|
471
|
+
ds = merge(acts, feats, by="SMILES") # duplicates features for duplicate SMILES :-)
|
472
|
+
|
473
|
+
features = ds[,(dim(acts)[2]+1):(dim(ds)[2])]
|
474
|
+
y = ds[,which(names(ds) == prediction_feature)]
|
475
|
+
|
476
|
+
# assumes a data matrix 'features' and a vector 'y' of target values
|
477
|
+
row.names(features)=NULL
|
478
|
+
|
479
|
+
pp = NULL
|
480
|
+
if (del_missing) {
|
481
|
+
# needed if rows should be removed
|
482
|
+
na_ids = apply(features,1,function(x)any(is.na(x)))
|
483
|
+
features = features[!na_ids,]
|
484
|
+
y = y[!na_ids]
|
485
|
+
pp = preProcess(features, method=c("scale", "center"))
|
486
|
+
} else {
|
487
|
+
# Use imputation if NA's random (only then!)
|
488
|
+
pp = preProcess(features, method=c("scale", "center", "knnImpute"))
|
489
|
+
}
|
490
|
+
features = predict(pp, features)
|
491
|
+
|
492
|
+
# determine subsets
|
493
|
+
subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
|
494
|
+
subsets = c(2,3,4,5,7,10,subsets)
|
495
|
+
subsets = unique(sort(round(subsets)))
|
496
|
+
subsets = subsets[subsets<=dim(features)[2]]
|
497
|
+
subsets = subsets[subsets>1]
|
498
|
+
|
499
|
+
# Recursive feature elimination
|
500
|
+
rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=50), sizes=subsets)
|
501
|
+
|
502
|
+
# read existing dataset and select most useful features
|
503
|
+
csv=feats[,c("SMILES", rfProfile$optVariables)]
|
504
|
+
write.csv(x=csv,file=f_fds_r, row.names=F, quote=F, na='')
|
505
|
+
EOR
|
506
|
+
r_result_file
|
507
|
+
end
|
508
|
+
end
|
509
|
+
|
695
510
|
module Substructure
|
696
511
|
include Algorithm
|
697
512
|
# Substructure matching
|
698
|
-
# @param [
|
699
|
-
# @param [Array] features Array with Smarts strings
|
513
|
+
# @param [Hash] required keys: compound, features
|
700
514
|
# @return [Array] Array with matching Smarts
|
701
|
-
def self.match(
|
702
|
-
compound.match(features)
|
515
|
+
def self.match(params)
|
516
|
+
params[:compound].match(params[:features])
|
703
517
|
end
|
518
|
+
|
519
|
+
# Substructure matching with number of non-unique hits
|
520
|
+
# @param [Hash] required keys: compound, features
|
521
|
+
# @return [Hash] Hash with matching Smarts and number of hits
|
522
|
+
def self.match_hits(params)
|
523
|
+
params[:compound].match_hits(params[:features])
|
524
|
+
end
|
525
|
+
|
526
|
+
# Substructure matching with number of non-unique hits
|
527
|
+
# @param [Hash] required keys: compound, features, feature_dataset_uri, pc_type
|
528
|
+
# @return [Hash] Hash with matching Smarts and number of hits
|
529
|
+
def self.lookup(params)
|
530
|
+
params[:compound].lookup(params[:features], params[:feature_dataset_uri],params[:pc_type],params[:subjectid])
|
531
|
+
end
|
704
532
|
end
|
705
533
|
|
706
534
|
module Dataset
|
@@ -709,281 +537,5 @@ module OpenTox
|
|
709
537
|
def features(dataset_uri,compound_uri)
|
710
538
|
end
|
711
539
|
end
|
712
|
-
|
713
|
-
module Transform
|
714
|
-
include Algorithm
|
715
|
-
|
716
|
-
# The transformer that inverts values.
|
717
|
-
# 1/x is used, after values have been moved >= 1.
|
718
|
-
class Inverter
|
719
|
-
attr_accessor :offset, :values
|
720
|
-
|
721
|
-
# @params[Array] Values to transform.
|
722
|
-
# @params[Float] Offset for restore.
|
723
|
-
def initialize *args
|
724
|
-
case args.size
|
725
|
-
when 1
|
726
|
-
begin
|
727
|
-
values=args[0]
|
728
|
-
raise "Cannot transform, values empty." if @values.size==0
|
729
|
-
@values = values.collect { |v| -1.0 * v }
|
730
|
-
@offset = 1.0 - @values.minmax[0]
|
731
|
-
@offset = -1.0 * @offset if @offset>0.0
|
732
|
-
@values.collect! { |v| v - @offset } # slide >1
|
733
|
-
@values.collect! { |v| 1 / v } # invert to [0,1]
|
734
|
-
rescue Exception => e
|
735
|
-
LOGGER.debug "#{e.class}: #{e.message}"
|
736
|
-
LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
|
737
|
-
end
|
738
|
-
when 2
|
739
|
-
@offset = args[1].to_f
|
740
|
-
@values = args[0].collect { |v| 1 / v }
|
741
|
-
@values.collect! { |v| v + @offset }
|
742
|
-
@values.collect! { |v| -1.0 * v }
|
743
|
-
end
|
744
|
-
end
|
745
|
-
end
|
746
|
-
|
747
|
-
# The transformer that takes logs.
|
748
|
-
# Log10 is used, after values have been moved > 0.
|
749
|
-
class Log10
|
750
|
-
attr_accessor :offset, :values
|
751
|
-
|
752
|
-
# @params[Array] Values to transform / restore.
|
753
|
-
# @params[Float] Offset for restore.
|
754
|
-
def initialize *args
|
755
|
-
@distance_to_zero = 0.000000001 # 1 / 1 billion
|
756
|
-
case args.size
|
757
|
-
when 1
|
758
|
-
begin
|
759
|
-
values=args[0]
|
760
|
-
raise "Cannot transform, values empty." if values.size==0
|
761
|
-
@offset = values.minmax[0]
|
762
|
-
@offset = -1.0 * @offset if @offset>0.0
|
763
|
-
@values = values.collect { |v| v - @offset } # slide > anchor
|
764
|
-
@values.collect! { |v| v + @distance_to_zero } #
|
765
|
-
@values.collect! { |v| Math::log10 v } # log10 (can fail)
|
766
|
-
rescue Exception => e
|
767
|
-
LOGGER.debug "#{e.class}: #{e.message}"
|
768
|
-
LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
|
769
|
-
end
|
770
|
-
when 2
|
771
|
-
@offset = args[1].to_f
|
772
|
-
@values = args[0].collect { |v| 10**v }
|
773
|
-
@values.collect! { |v| v - @distance_to_zero }
|
774
|
-
@values.collect! { |v| v + @offset }
|
775
|
-
end
|
776
|
-
end
|
777
|
-
end
|
778
|
-
|
779
|
-
# The transformer that does nothing (No OPeration).
|
780
|
-
class NOP
|
781
|
-
attr_accessor :offset, :values
|
782
|
-
|
783
|
-
# @params[Array] Values to transform / restore.
|
784
|
-
# @params[Float] Offset for restore.
|
785
|
-
def initialize *args
|
786
|
-
@offset = 0.0
|
787
|
-
@distance_to_zero = 0.0
|
788
|
-
case args.size
|
789
|
-
when 1
|
790
|
-
@values = args[0]
|
791
|
-
when 2
|
792
|
-
@values = args[0]
|
793
|
-
end
|
794
|
-
end
|
795
|
-
end
|
796
|
-
|
797
|
-
|
798
|
-
# Auto-Scaler for Arrays
|
799
|
-
# Center on mean and divide by standard deviation
|
800
|
-
class AutoScale
|
801
|
-
attr_accessor :scaled_values, :mean, :stdev
|
802
|
-
|
803
|
-
# @params[Array] Values to transform.
|
804
|
-
def initialize values
|
805
|
-
@scaled_values = values
|
806
|
-
@mean = @scaled_values.to_scale.mean
|
807
|
-
@stdev = @scaled_values.to_scale.standard_deviation_sample
|
808
|
-
@scaled_values = @scaled_values.collect {|vi| vi - @mean }
|
809
|
-
@scaled_values.collect! {|vi| vi / @stdev } unless @stdev == 0.0
|
810
|
-
end
|
811
|
-
end
|
812
|
-
|
813
|
-
# Principal Components Analysis
|
814
|
-
# Statsample Library (http://ruby-statsample.rubyforge.org/) by C. Bustos
|
815
|
-
class PCA
|
816
|
-
attr_accessor :data_matrix, :data_transformed_matrix, :eigenvector_matrix, :eigenvalue_sums, :autoscaler
|
817
|
-
|
818
|
-
# Creates a transformed dataset as GSL::Matrix.
|
819
|
-
# @param [GSL::Matrix] Data matrix.
|
820
|
-
# @param [Float] Compression ratio from [0,1].
|
821
|
-
# @return [GSL::Matrix] Data transformed matrix.
|
822
|
-
def initialize data_matrix, compression=0.05
|
823
|
-
begin
|
824
|
-
@data_matrix = data_matrix
|
825
|
-
@compression = compression.to_f
|
826
|
-
@stdev = Array.new
|
827
|
-
@mean = Array.new
|
828
|
-
|
829
|
-
# Objective Feature Selection
|
830
|
-
raise "Error! PCA needs at least two dimensions." if data_matrix.size2 < 2
|
831
|
-
@data_matrix_selected = nil
|
832
|
-
(0..@data_matrix.size2-1).each { |i|
|
833
|
-
if !Algorithm::zero_variance?(@data_matrix.col(i).to_a)
|
834
|
-
if @data_matrix_selected.nil?
|
835
|
-
@data_matrix_selected = GSL::Matrix.alloc(@data_matrix.size1, 1)
|
836
|
-
@data_matrix_selected.col(0)[0..@data_matrix.size1-1] = @data_matrix.col(i)
|
837
|
-
else
|
838
|
-
@data_matrix_selected = @data_matrix_selected.horzcat(GSL::Matrix.alloc(@data_matrix.col(i).to_a,@data_matrix.size1, 1))
|
839
|
-
end
|
840
|
-
end
|
841
|
-
}
|
842
|
-
raise "Error! PCA needs at least two dimensions." if (@data_matrix_selected.nil? || @data_matrix_selected.size2 < 2)
|
843
|
-
|
844
|
-
# Scaling of Axes
|
845
|
-
@data_matrix_scaled = GSL::Matrix.alloc(@data_matrix_selected.size1, @data_matrix_selected.size2)
|
846
|
-
(0..@data_matrix_selected.size2-1).each { |i|
|
847
|
-
@autoscaler = OpenTox::Algorithm::Transform::AutoScale.new(@data_matrix_selected.col(i))
|
848
|
-
@data_matrix_scaled.col(i)[0..@data_matrix.size1-1] = @autoscaler.scaled_values
|
849
|
-
@stdev << @autoscaler.stdev
|
850
|
-
@mean << @autoscaler.mean
|
851
|
-
}
|
852
|
-
|
853
|
-
data_matrix_hash = Hash.new
|
854
|
-
(0..@data_matrix_scaled.size2-1).each { |i|
|
855
|
-
column_view = @data_matrix_scaled.col(i)
|
856
|
-
data_matrix_hash[i] = column_view.to_scale
|
857
|
-
}
|
858
|
-
dataset_hash = data_matrix_hash.to_dataset # see http://goo.gl/7XcW9
|
859
|
-
cor_matrix=Statsample::Bivariate.correlation_matrix(dataset_hash)
|
860
|
-
pca=Statsample::Factor::PCA.new(cor_matrix)
|
861
|
-
pca.eigenvalues.each { |ev| raise "PCA failed!" unless !ev.nan? }
|
862
|
-
@eigenvalue_sums = Array.new
|
863
|
-
(0..dataset_hash.fields.size-1).each { |i|
|
864
|
-
@eigenvalue_sums << pca.eigenvalues[0..i].inject{ |sum, ev| sum + ev }
|
865
|
-
}
|
866
|
-
eigenvectors_selected = Array.new
|
867
|
-
pca.eigenvectors.each_with_index { |ev, i|
|
868
|
-
if (@eigenvalue_sums[i] <= ((1.0-@compression)*dataset_hash.fields.size)) || (eigenvectors_selected.size == 0)
|
869
|
-
eigenvectors_selected << ev.to_a
|
870
|
-
end
|
871
|
-
}
|
872
|
-
@eigenvector_matrix = GSL::Matrix.alloc(eigenvectors_selected.flatten, eigenvectors_selected.size, dataset_hash.fields.size).transpose
|
873
|
-
dataset_matrix = dataset_hash.to_gsl.transpose
|
874
|
-
@data_transformed_matrix = (@eigenvector_matrix.transpose * dataset_matrix).transpose
|
875
|
-
rescue Exception => e
|
876
|
-
LOGGER.debug "#{e.class}: #{e.message}"
|
877
|
-
LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
|
878
|
-
end
|
879
|
-
end
|
880
|
-
|
881
|
-
# Restores data in the original feature space (possibly with compression loss).
|
882
|
-
# @return [GSL::Matrix] Data matrix.
|
883
|
-
def restore
|
884
|
-
begin
|
885
|
-
data_matrix_restored = (@eigenvector_matrix * @data_transformed_matrix.transpose).transpose # reverse pca
|
886
|
-
# reverse scaling
|
887
|
-
(0..data_matrix_restored.size2-1).each { |i|
|
888
|
-
data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] *= @stdev[i] unless @stdev[i] == 0.0
|
889
|
-
data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] += @mean[i]
|
890
|
-
}
|
891
|
-
data_matrix_restored
|
892
|
-
rescue Exception => e
|
893
|
-
LOGGER.debug "#{e.class}: #{e.message}"
|
894
|
-
LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
|
895
|
-
end
|
896
|
-
end
|
897
|
-
|
898
|
-
end
|
899
|
-
|
900
|
-
end
|
901
|
-
|
902
|
-
# Gauss kernel
|
903
|
-
# @return [Float]
|
904
|
-
def self.gauss(x, sigma = 0.3)
|
905
|
-
d = 1.0 - x.to_f
|
906
|
-
Math.exp(-(d*d)/(2*sigma*sigma))
|
907
|
-
end
|
908
|
-
|
909
|
-
# For symbolic features
|
910
|
-
# @param [Array] Array to test, must indicate non-occurrence with 0.
|
911
|
-
# @return [Boolean] Whether the feature is singular or non-occurring or present everywhere.
|
912
|
-
def self.isnull_or_singular?(array)
|
913
|
-
nr_zeroes = array.count(0)
|
914
|
-
return (nr_zeroes == array.size) || # remove non-occurring feature
|
915
|
-
(nr_zeroes == array.size-1) || # remove singular feature
|
916
|
-
(nr_zeroes == 0) # also remove feature present everywhere
|
917
|
-
end
|
918
|
-
|
919
|
-
# Numeric value test
|
920
|
-
# @param[Object] value
|
921
|
-
# @return [Boolean] Whether value is a number
|
922
|
-
def self.numeric?(value)
|
923
|
-
true if Float(value) rescue false
|
924
|
-
end
|
925
|
-
|
926
|
-
# For symbolic features
|
927
|
-
# @param [Array] Array to test, must indicate non-occurrence with 0.
|
928
|
-
# @return [Boolean] Whether the feature has variance zero.
|
929
|
-
def self.zero_variance?(array)
|
930
|
-
return (array.to_scale.variance_population == 0.0)
|
931
|
-
end
|
932
|
-
|
933
|
-
# Sum of an array for Arrays.
|
934
|
-
# @param [Array] Array with values
|
935
|
-
# @return [Integer] Sum of size of values
|
936
|
-
def self.sum_size(array)
|
937
|
-
sum=0
|
938
|
-
array.each { |e| sum += e.size }
|
939
|
-
return sum
|
940
|
-
end
|
941
|
-
|
942
|
-
# Minimum Frequency
|
943
|
-
# @param [Integer] per-mil value
|
944
|
-
# return [Integer] min-frequency
|
945
|
-
def self.min_frequency(training_dataset,per_mil)
|
946
|
-
minfreq = per_mil * training_dataset.compounds.size.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
|
947
|
-
minfreq = 2 unless minfreq > 2
|
948
|
-
Integer (minfreq)
|
949
|
-
end
|
950
|
-
|
951
|
-
# Effect calculation for classification
|
952
|
-
# @param [Array] Array of occurrences per class in the form of Enumerables.
|
953
|
-
# @param [Array] Array of database instance counts per class.
|
954
|
-
def self.effect(occurrences, db_instances)
|
955
|
-
max=0
|
956
|
-
max_value=0
|
957
|
-
nr_o = self.sum_size(occurrences)
|
958
|
-
nr_db = db_instances.to_scale.sum
|
959
|
-
|
960
|
-
occurrences.each_with_index { |o,i| # fminer outputs occurrences sorted reverse by activity.
|
961
|
-
actual = o.size.to_f/nr_o
|
962
|
-
expected = db_instances[i].to_f/nr_db
|
963
|
-
if actual > expected
|
964
|
-
if ((actual - expected) / actual) > max_value
|
965
|
-
max_value = (actual - expected) / actual # 'Schleppzeiger'
|
966
|
-
max = i
|
967
|
-
end
|
968
|
-
end
|
969
|
-
}
|
970
|
-
max
|
971
|
-
end
|
972
|
-
|
973
|
-
# Returns Support value of an fingerprint
|
974
|
-
# @param [Hash] params Keys: `:compound_features_hits, :weights, :training_compound_features_hits, :features, :nr_hits:, :mode` are required
|
975
|
-
# return [Numeric] Support value
|
976
|
-
def self.p_sum_support(params)
|
977
|
-
p_sum = 0.0
|
978
|
-
params[:features].each{|f|
|
979
|
-
compound_hits = params[:compound_features_hits][f]
|
980
|
-
neighbor_hits = params[:training_compound_features_hits][f]
|
981
|
-
p_sum += eval("(Algorithm.gauss(params[:weights][f]) * ([compound_hits, neighbor_hits].compact.#{params[:mode]}))")
|
982
|
-
}
|
983
|
-
p_sum
|
984
|
-
end
|
985
|
-
|
986
540
|
end
|
987
541
|
end
|
988
|
-
|
989
|
-
|