opentox-ruby 2.0.1 → 2.1.0
- data/Rakefile +39 -46
- data/VERSION +1 -1
- data/lib/algorithm.rb +797 -80
- data/lib/compound.rb +40 -0
- data/lib/config/config_ru.rb +2 -0
- data/lib/dataset.rb +57 -18
- data/lib/environment.rb +3 -3
- data/lib/feature.rb +15 -13
- data/lib/helper.rb +1 -2
- data/lib/model.rb +185 -82
- data/lib/opentox-ruby.rb +1 -1
- data/lib/overwrite.rb +2 -1
- data/lib/parser.rb +247 -69
- data/lib/rest_client_wrapper.rb +3 -2
- data/lib/serializer.rb +24 -10
- data/lib/task.rb +10 -3
- data/lib/to-html.rb +66 -41
- data/lib/validation.rb +93 -29
- metadata +206 -117
data/Rakefile
CHANGED
@@ -8,53 +8,46 @@ begin
     gem.summary = %Q{Ruby wrapper for the OpenTox REST API}
     gem.description = %Q{Ruby wrapper for the OpenTox REST API (http://www.opentox.org)}
     gem.email = "helma@in-silico.ch"
-    gem.homepage = "http://github.com/
+    gem.homepage = "http://github.com/opentox/opentox-ruby"
     gem.authors = ["Christoph Helma, Martin Guetlein, Andreas Maunz, Micha Rautenberg, David Vorgrimmler"]
-    # dependencies
-    [old lines 14-50: content not shown in this view (only a stray "=" on line 43 is visible)]
-    ].each {|dep| gem.add_dependency dep, ">= 1" }
-    =end
-    #valiation-gem
-    gem.add_dependency "haml", ">=3"
-    # validation-gems
-    gem.add_dependency "ruby-plot", "~>0.4.0"
-    ['jeweler'].each { |dep| gem.add_development_dependency dep }
+    # dependencies with versions
+    gem.add_dependency "sinatra", "=1.2.6"
+    gem.add_dependency "emk-sinatra-url-for", "=0.2.1"
+    gem.add_dependency "sinatra-respond_to", "=0.7.0"
+    gem.add_dependency "sinatra-static-assets", "=0.5.0"
+    gem.add_dependency "rest-client", "=1.6.1"
+    gem.add_dependency "rack", "=1.3.1"
+    gem.add_dependency "rack-contrib", "=1.1.0"
+    gem.add_dependency "rack-flash", "=0.1.1"
+    gem.add_dependency "nokogiri", "=1.4.4"
+    gem.add_dependency "rubyzip", "=0.9.4"
+    gem.add_dependency "roo", "=1.9.3"
+    gem.add_dependency "spreadsheet", "=0.6.5.4"
+    gem.add_dependency "google-spreadsheet-ruby", "=0.1.5"
+    gem.add_dependency "yajl-ruby", "=0.8.2"
+    #gem.add_dependency "mail", "=2.3.0"
+    gem.add_dependency "rinruby", "=2.0.2"
+    gem.add_dependency "ohm", "=0.1.3"
+    gem.add_dependency "ohm-contrib", "=0.1.1"
+    gem.add_dependency "SystemTimer", "=1.2.3"
+    gem.add_dependency "rjb", "=1.3.4"
+    gem.add_dependency "haml", "=3.1.1"
+    # for headless browser tests
+    gem.add_dependency "akephalos", "=0.2.5"
+    #valiation-gems
+    gem.add_dependency "dm-core", "=1.1.0"
+    gem.add_dependency "dm-serializer", "=1.1.0"
+    gem.add_dependency "dm-timestamps", "=1.1.0"
+    gem.add_dependency "dm-types", "=1.1.0"
+    gem.add_dependency "dm-migrations", "=1.1.0"
+    gem.add_dependency "dm-validations", "=1.1.0"
+    gem.add_dependency "dm-sqlite-adapter", "=1.1.0"
+    gem.add_dependency "ruby-plot", "=0.5.0"
+    gem.add_dependency "gsl", "=1.14.7"
+    gem.add_dependency "statsample", "=1.1.0"
+    #gem.add_dependency "statsample-optimization", "=2.1.0"
+
+    gem.add_development_dependency 'jeweler'
     gem.files = FileList["[A-Z]*", "{bin,generators,lib,test}/**/*", 'lib/jeweler/templates/.gitignore']
   end
   Jeweler::GemcutterTasks.new
data/VERSION
CHANGED
@@ -1 +1 @@
-2.0
+2.1.0
data/lib/algorithm.rb
CHANGED
@@ -3,6 +3,8 @@
 # avoids compiling R with X
 R = nil
 require "rinruby"
+require "statsample"
+require 'uri'

 module OpenTox

@@ -16,6 +18,7 @@ module OpenTox
     # @param [optional,OpenTox::Task] waiting_task (can be a OpenTox::Subtask as well), progress is updated accordingly
     # @return [String] URI of new resource (dataset, model, ...)
     def run(params=nil, waiting_task=nil)
+      LOGGER.info "Running algorithm '"+@uri.to_s+"' with params: "+params.inspect
       RestClientWrapper.post(@uri, params, {:accept => 'text/uri-list'}, waiting_task).to_s
     end

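The only change to Algorithm#run is the new LOGGER.info line, so every algorithm invocation now records its URI and parameters. A minimal usage sketch (not part of the diff), assuming a running opentox-algorithm service and the Generic wrapper unchanged from 2.0.x; all URIs are placeholders:

    require 'opentox-ruby'
    algorithm = OpenTox::Algorithm::Generic.find("http://localhost/algorithm/fminer/bbrc")
    # run() still POSTs the params and returns a URI, but now logs them first
    result_uri = algorithm.run(:dataset_uri => "http://localhost/dataset/1",
                               :prediction_feature => "http://localhost/feature/1")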
@@ -45,12 +48,75 @@ module OpenTox
     end

     # Fminer algorithms (https://github.com/amaunz/fminer2)
-    [old line 48: content not shown in this view]
+    class Fminer
       include Algorithm
+      attr_accessor :prediction_feature, :training_dataset, :minfreq, :compounds, :db_class_sizes, :all_activities, :smi
+
+      def check_params(params,per_mil,subjectid=nil)
+        raise OpenTox::NotFoundError.new "Please submit a dataset_uri." unless params[:dataset_uri] and !params[:dataset_uri].nil?
+        raise OpenTox::NotFoundError.new "Please submit a prediction_feature." unless params[:prediction_feature] and !params[:prediction_feature].nil?
+        @prediction_feature = OpenTox::Feature.find params[:prediction_feature], subjectid
+        @training_dataset = OpenTox::Dataset.find "#{params[:dataset_uri]}", subjectid
+        raise OpenTox::NotFoundError.new "No feature #{params[:prediction_feature]} in dataset #{params[:dataset_uri]}" unless @training_dataset.features and @training_dataset.features.include?(params[:prediction_feature])
+
+        unless params[:min_frequency].nil?
+          @minfreq=params[:min_frequency].to_i
+          raise "Minimum frequency must be a number >0!" unless @minfreq>0
+        else
+          @minfreq=OpenTox::Algorithm.min_frequency(@training_dataset,per_mil) # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
+        end
+      end
+
+      def add_fminer_data(fminer_instance, params, value_map)
+
+        id = 1 # fminer start id is not 0
+        @training_dataset.data_entries.each do |compound,entry|
+          begin
+            smiles = OpenTox::Compound.smiles(compound.to_s)
+          rescue
+            LOGGER.warn "No resource for #{compound.to_s}"
+            next
+          end
+          if smiles == '' or smiles.nil?
+            LOGGER.warn "Cannot find smiles for #{compound.to_s}."
+            next
+          end
+
+          value_map=params[:value_map] unless params[:value_map].nil?
+          entry.each do |feature,values|
+            if feature == @prediction_feature.uri
+              values.each do |value|
+                if value.nil?
+                  LOGGER.warn "No #{feature} activity for #{compound.to_s}."
+                else
+                  if @prediction_feature.feature_type == "classification"
+                    activity= value_map.invert[value].to_i # activities are mapped to 1..n
+                    @db_class_sizes[activity-1].nil? ? @db_class_sizes[activity-1]=1 : @db_class_sizes[activity-1]+=1 # AM effect
+                  elsif @prediction_feature.feature_type == "regression"
+                    activity= value.to_f
+                  end
+                  begin
+                    fminer_instance.AddCompound(smiles,id)
+                    fminer_instance.AddActivity(activity, id)
+                    @all_activities[id]=activity # DV: insert global information
+                    @compounds[id] = compound
+                    @smi[id] = smiles
+                    id += 1
+                  rescue Exception => e
+                    LOGGER.warn "Could not add " + smiles + "\t" + value.to_s + " to fminer"
+                    LOGGER.warn e.backtrace
+                  end
+                end
+              end
+            end
+          end
+        end
+      end
+
+    end

     # Backbone Refinement Class mining (http://bbrc.maunz.de/)
-    class BBRC
-      include Fminer
+    class BBRC < Fminer
       # Initialize bbrc algorithm
       def initialize(subjectid=nil)
         super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/bbrc")
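BBRC and LAST now inherit the shared parameter handling from the new Fminer base class instead of mixing in a module (see this and the following hunks). A sketch of the new check_params path (not part of the diff), assuming a configured CONFIG[:services]["opentox-algorithm"] and existing dataset/feature URIs (placeholders below); the call contacts the dataset and feature services:

    bbrc = OpenTox::Algorithm::BBRC.new
    params = { :dataset_uri => "http://localhost/dataset/1",
               :prediction_feature => "http://localhost/feature/1" }
    bbrc.check_params(params, 8)   # 8 per mil, the range suggested for BBRC in the comments
    bbrc.minfreq                   # minimum frequency derived from the training dataset size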
@@ -59,8 +125,7 @@ module OpenTox
       end

     # LAtent STructure Pattern Mining (http://last-pm.maunz.de)
-    class LAST
-      include Fminer
+    class LAST < Fminer
       # Initialize last algorithm
       def initialize(subjectid=nil)
         super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/last")
@@ -68,7 +133,6 @@ module OpenTox
       end
     end

-    end

   # Create lazar prediction model
   class Lazar
@@ -90,19 +154,34 @@ module OpenTox
       # @param [Array] features_a Features of first compound
       # @param [Array] features_b Features of second compound
       # @param [optional, Hash] weights Weights for all features
+      # @param [optional, Hash] params Keys: `:training_compound, :compound, :training_compound_features_hits, :nr_hits, :compound_features_hits` are required
       # @return [Float] (Weighted) tanimoto similarity
-      def self.tanimoto(features_a,features_b,weights=nil)
+      def self.tanimoto(features_a,features_b,weights=nil,params=nil)
        common_features = features_a & features_b
        all_features = (features_a + features_b).uniq
-        [old line 97: content not shown in this view]
+        #LOGGER.debug "dv --------------- common: #{common_features}, all: #{all_features}"
        if common_features.size > 0
          if weights
-            [old lines 100-102: content not shown in this view]
+            #LOGGER.debug "nr_hits: #{params[:nr_hits]}"
+            if !params.nil? && params[:nr_hits]
+              params[:weights] = weights
+              params[:mode] = "min"
+              params[:features] = common_features
+              common_p_sum = Algorithm.p_sum_support(params)
+              params[:mode] = "max"
+              params[:features] = all_features
+              all_p_sum = Algorithm.p_sum_support(params)
+            else
+              common_p_sum = 0.0
+              common_features.each{|f| common_p_sum += Algorithm.gauss(weights[f])}
+              all_p_sum = 0.0
+              all_features.each{|f| all_p_sum += Algorithm.gauss(weights[f])}
+            end
+            #LOGGER.debug "common_p_sum: #{common_p_sum}, all_p_sum: #{all_p_sum}, c/a: #{common_p_sum/all_p_sum}"
            common_p_sum/all_p_sum
          else
-            common_features.
+            #LOGGER.debug "common_features : #{common_features}, all_features: #{all_features}, c/a: #{(common_features.size/all_features.size).to_f}"
+            common_features.size.to_f/all_features.size.to_f
          end
        else
          0.0
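tanimoto gains an optional params hash: when :nr_hits is set, the weighted similarity is computed from per-feature hit counts via p_sum_support instead of the plain Gauss-weighted p-value sums. A sketch with made-up fragments and weights, assuming the method keeps its 2.0.x home in OpenTox::Algorithm::Similarity:

    features_a = ["C-C", "C=O", "c:c"]          # hypothetical fragment keys
    features_b = ["C-C", "c:c", "C-N"]
    weights    = { "C-C" => 0.8, "C=O" => 0.3, "c:c" => 0.5, "C-N" => 0.2 }
    # unweighted: |A & B| / |A u B|
    OpenTox::Algorithm::Similarity.tanimoto(features_a, features_b)
    # weighted by Gauss-damped p-values (previous behaviour, still the default)
    OpenTox::Algorithm::Similarity.tanimoto(features_a, features_b, weights)
    # new: weight by per-compound hit counts (each hash must cover its feature list)
    OpenTox::Algorithm::Similarity.tanimoto(features_a, features_b, weights,
      { :nr_hits => true,
        :compound_features_hits          => { "C-C" => 2, "C=O" => 1, "c:c" => 1 },
        :training_compound_features_hits => { "C-C" => 1, "c:c" => 3, "C-N" => 2 } })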
@@ -132,65 +211,300 @@ module OpenTox
       end
     end

+    # Structural Graph Clustering by TU Munich
+    # Finds clusters similar to a query structure in a given training dataset
+    # May be queried for cluster membership of an unknown compound
+    class StructuralClustering
+      attr_accessor :training_dataset_uri, :training_threshold, :query_dataset_uri, :query_threshold, :target_clusters_array
+
+      # @params[String] Training dataset_uri
+      # @params[Float] Similarity threshold for training (optional)
+      # @params[String] Cluster service uri (no AA)
+      def initialize training_dataset_uri, training_threshold=0.8, cluster_service_uri = "http://opentox-dev.informatik.tu-muenchen.de:8080/OpenTox/algorithm/StructuralClustering"
+
+        if (training_dataset_uri =~ URI::regexp).nil? || (cluster_service_uri =~ URI::regexp).nil?
+          raise "Invalid URI."
+        end
+        @training_dataset_uri = training_dataset_uri
+        if !OpenTox::Algorithm.numeric? training_threshold || training_threshold <0 || training_threshold >1
+          raise "Training threshold out of bounds."
+        end
+        @training_threshold = training_threshold.to_f
+
+        # Train a cluster model
+        params = {:dataset_uri => @training_dataset_uri, :threshold => @training_threshold }
+        @cluster_model_uri = OpenTox::RestClientWrapper.post cluster_service_uri, params
+        cluster_model_rdf = OpenTox::RestClientWrapper.get @cluster_model_uri
+        @datasets = OpenTox::Parser::Owl.from_rdf cluster_model_rdf, OT.Dataset, true # must extract OT.Datasets from model
+
+        # Process parsed OWL objects
+        @clusterid_dataset_map = Hash.new
+        @datasets.each { |d|
+          begin
+            d.metadata[OT.hasSource]["Structural Clustering cluster "] = "" # must parse in metadata for string (not elegant)
+            @clusterid_dataset_map[d.metadata[OT.hasSource].to_i] = d.uri
+          rescue Exception => e
+            # ignore other entries!
+          end
+        }
+      end
+
+      # Whether a model has been trained
+      def trained?
+        !@cluster_model_uri.nil?
+      end
+
+      # Instance query: clusters for a compound
+      # @params[String] Query compound
+      # @params[Float] Similarity threshold for query to clusters (optional)
+      def get_clusters query_compound_uri, query_threshold = 0.5
+
+        if !OpenTox::Algorithm.numeric? query_threshold || query_threshold <0 || query_threshold >1
+          raise "Query threshold out of bounds."
+        end
+        @query_threshold = query_threshold.to_f
+
+
+        # Preparing a query dataset
+        query_dataset = OpenTox::Dataset.new
+        @query_dataset_uri = query_dataset.save
+        query_dataset = OpenTox::Dataset.find @query_dataset_uri
+        query_dataset.add_compound query_compound_uri
+        @query_dataset_uri = query_dataset.save
+
+        # Obtaining a clustering for query compound
+        params = { :dataset_uri => @query_dataset_uri, :threshold => @query_threshold }
+        cluster_query_dataset_uri = OpenTox::RestClientWrapper.post @cluster_model_uri, params
+        cluster_query_dataset = OpenTox::Dataset.new cluster_query_dataset_uri
+        cluster_query_dataset.load_all
+
+        # Reading cluster ids for features from metadata
+        feature_clusterid_map = Hash.new
+        pattern="Prediction feature for cluster assignment " # must parse for string in metadata (not elegant)
+        cluster_query_dataset.features.each { |feature_uri,metadata|
+          metadata[DC.title][pattern]=""
+          feature_clusterid_map[feature_uri] = metadata[DC.title].to_i
+        }
+
+        # Integrity check
+        unless cluster_query_dataset.compounds.size == 1
+          raise "Number of predicted compounds is != 1."
+        end
+
+        # Process data entry
+        query_compound_uri = cluster_query_dataset.compounds[0]
+        @target_clusters_array = Array.new
+        cluster_query_dataset.features.keys.each { |cluster_membership_feature|
+
+          # Getting dataset URI for cluster
+          target_cluster = feature_clusterid_map[cluster_membership_feature]
+          dataset = @clusterid_dataset_map[target_cluster]
+
+          # Finally look up presence
+          data_entry = cluster_query_dataset.data_entries[query_compound_uri]
+          present = data_entry[cluster_membership_feature][0]
+
+          # Store result
+          @target_clusters_array << dataset if present > 0.5 # 0.0 for absence, 1.0 for presence
+        }
+      end
+
+    end
+
     module Neighbors

+      # Local multi-linear regression (MLR) prediction from neighbors.
+      # Uses propositionalized setting.
+      # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
+      # @return [Numeric] A prediction value.
+      def self.local_mlr_prop(params)
+
+        confidence=0.0
+        prediction=nil
+
+        if params[:neighbors].size>0
+          props = params[:prop_kernel] ? get_props(params) : nil
+          acts = params[:neighbors].collect { |n| act = n[:activity].to_f }
+          sims = params[:neighbors].collect { |n| Algorithm.gauss(n[:similarity]) }
+          LOGGER.debug "Local MLR (Propositionalization / GSL)."
+          prediction = mlr( {:n_prop => props[0], :q_prop => props[1], :sims => sims, :acts => acts} )
+          transformer = eval("OpenTox::Algorithm::Transform::#{params[:transform]["class"]}.new ([#{prediction}], #{params[:transform]["offset"]})")
+          prediction = transformer.values[0]
+          prediction = nil if prediction.infinite? || params[:prediction_min_max][1] < prediction || params[:prediction_min_max][0] > prediction
+          LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
+          params[:conf_stdev] = false if params[:conf_stdev].nil?
+          confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]})
+          confidence = nil if prediction.nil?
+        end
+        {:prediction => prediction, :confidence => confidence}
+
+      end
+
+      # Multi-linear regression weighted by similarity.
+      # Objective Feature Selection, Principal Components Analysis, Scaling of Axes.
+      # @param [Hash] params Keys `:n_prop, :q_prop, :sims, :acts` are required
+      # @return [Numeric] A prediction value.
+      def self.mlr(params)
+
+        # GSL matrix operations:
+        # to_a : row-wise conversion to nested array
+        #
+        # Statsample operations (build on GSL):
+        # to_scale: convert into Statsample format
+
+        begin
+          n_prop = params[:n_prop].collect { |v| v }
+          q_prop = params[:q_prop].collect { |v| v }
+          n_prop << q_prop # attach q_prop
+          nr_cases, nr_features = get_sizes n_prop
+          data_matrix = GSL::Matrix.alloc(n_prop.flatten, nr_cases, nr_features)
+
+          # Principal Components Analysis
+          LOGGER.debug "PCA..."
+          pca = OpenTox::Algorithm::Transform::PCA.new(data_matrix)
+          data_matrix = pca.data_transformed_matrix
+
+          # Attach intercept column to data
+          intercept = GSL::Matrix.alloc(Array.new(nr_cases,1.0),nr_cases,1)
+          data_matrix = data_matrix.horzcat(intercept)
+          (0..data_matrix.size2-2).each { |i|
+            autoscaler = OpenTox::Algorithm::Transform::AutoScale.new(data_matrix.col(i))
+            data_matrix.col(i)[0..data_matrix.size1-1] = autoscaler.scaled_values
+          }
+
+          # Detach query instance
+          n_prop = data_matrix.to_a
+          q_prop = n_prop.pop
+          nr_cases, nr_features = get_sizes n_prop
+          data_matrix = GSL::Matrix.alloc(n_prop.flatten, nr_cases, nr_features)
+
+          # model + support vectors
+          LOGGER.debug "Creating MLR model ..."
+          c, cov, chisq, status = GSL::MultiFit::wlinear(data_matrix, params[:sims].to_scale.to_gsl, params[:acts].to_scale.to_gsl)
+          GSL::MultiFit::linear_est(q_prop.to_scale.to_gsl, c, cov)[0]
+        rescue Exception => e
+          LOGGER.debug "#{e.class}: #{e.message}"
+        end
+
+      end
+
       # Classification with majority vote from neighbors weighted by similarity
-      # @param [
-      # @
-      [old lines 140-142: content not shown in this view]
+      # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
+      # @return [Numeric] A prediction value.
+      def self.weighted_majority_vote(params)
+
+        neighbor_contribution = 0.0
+        confidence_sum = 0.0
         confidence = 0.0
-        [old lines 144-149: content not shown in this view]
+        prediction = nil
+
+        params[:neighbors].each do |neighbor|
+          neighbor_weight = Algorithm.gauss(neighbor[:similarity]).to_f
+          neighbor_contribution += neighbor[:activity].to_f * neighbor_weight
+
+          if params[:value_map].size == 2 # AM: provide compat to binary classification: 1=>false 2=>true
+            case neighbor[:activity]
+            when 1
+              confidence_sum -= neighbor_weight
+            when 2
+              confidence_sum += neighbor_weight
+            end
+          else
+            confidence_sum += neighbor_weight
          end
        end
-        [old lines 152-160: content not shown in this view]
+
+        if params[:value_map].size == 2
+          if confidence_sum >= 0.0
+            prediction = 2 unless params[:neighbors].size==0
+          elsif confidence_sum < 0.0
+            prediction = 1 unless params[:neighbors].size==0
+          end
+        else
+          prediction = (neighbor_contribution/confidence_sum).round unless params[:neighbors].size==0 # AM: new multinomial prediction
+        end
+        LOGGER.debug "Prediction is: '" + prediction.to_s + "'." unless prediction.nil?
+        confidence = confidence_sum/params[:neighbors].size if params[:neighbors].size > 0
+        LOGGER.debug "Confidence is: '" + confidence.to_s + "'." unless prediction.nil?
+        return {:prediction => prediction, :confidence => confidence.abs}
       end

       # Local support vector regression from neighbors
-      # @param [
-      # @
-      [old lines 166-176: content not shown in this view]
+      # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
+      # @return [Numeric] A prediction value.
+      def self.local_svm_regression(params)
+
+        confidence = 0.0
+        prediction = nil
+        if params[:neighbors].size>0
+          props = params[:prop_kernel] ? get_props(params) : nil
+          acts = params[:neighbors].collect{ |n| n[:activity].to_f }
+          sims = params[:neighbors].collect{ |n| Algorithm.gauss(n[:similarity]) }
+          prediction = props.nil? ? local_svm(acts, sims, "nu-svr", params) : local_svm_prop(props, acts, "nu-svr")
+          transformer = eval("OpenTox::Algorithm::Transform::#{params[:transform]["class"]}.new ([#{prediction}], #{params[:transform]["offset"]})")
+          prediction = transformer.values[0]
+          prediction = nil if prediction.infinite? || params[:prediction_min_max][1] < prediction || params[:prediction_min_max][0] > prediction
+          LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
+          params[:conf_stdev] = false if params[:conf_stdev].nil?
+          confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]})
+          confidence = nil if prediction.nil?
        end
-        [old lines 178-181: content not shown in this view]
+        {:prediction => prediction, :confidence => confidence}
+
+      end
+
+      # Local support vector classification from neighbors
+      # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
+      # @return [Numeric] A prediction value.
+      def self.local_svm_classification(params)

-        [old line 183: content not shown in this view]
+        confidence = 0.0
+        prediction = nil
+        if params[:neighbors].size>0
+          props = params[:prop_kernel] ? get_props(params) : nil
+          acts = params[:neighbors].collect { |n| act = n[:activity] }
+          sims = params[:neighbors].collect{ |n| Algorithm.gauss(n[:similarity]) } # similarity values btwn q and nbors
+          prediction = props.nil? ? local_svm(acts, sims, "C-bsvc", params) : local_svm_prop(props, acts, "C-bsvc")
+          LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
+          params[:conf_stdev] = false if params[:conf_stdev].nil?
+          confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]})
+        end
+        {:prediction => prediction, :confidence => confidence}
+
+      end
+
+
+      # Local support vector prediction from neighbors.
+      # Uses pre-defined Kernel Matrix.
+      # Not to be called directly (use local_svm_regression or local_svm_classification).
+      # @param [Array] acts, activities for neighbors.
+      # @param [Array] sims, similarities for neighbors.
+      # @param [String] type, one of "nu-svr" (regression) or "C-bsvc" (classification).
+      # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
+      # @return [Numeric] A prediction value.
+      def self.local_svm(acts, sims, type, params)
+        LOGGER.debug "Local SVM (Weighted Tanimoto Kernel)."
+        neighbor_matches = params[:neighbors].collect{ |n| n[:features] } # URIs of matches
         gram_matrix = [] # square matrix of similarities between neighbors; implements weighted tanimoto kernel
-        [old lines 185-186: content not shown in this view]
+
+        prediction = nil
+        if Algorithm::zero_variance? acts
+          prediction = acts[0]
        else
          # gram matrix
          (0..(neighbor_matches.length-1)).each do |i|
+            neighbor_i_hits = params[:fingerprints][params[:neighbors][i][:compound]]
            gram_matrix[i] = [] unless gram_matrix[i]
            # upper triangle
            ((i+1)..(neighbor_matches.length-1)).each do |j|
-              [old line 193: content not shown in this view]
+              neighbor_j_hits= params[:fingerprints][params[:neighbors][j][:compound]]
+              sim_params = {}
+              if params[:nr_hits]
+                sim_params[:nr_hits] = true
+                sim_params[:compound_features_hits] = neighbor_i_hits
+                sim_params[:training_compound_features_hits] = neighbor_j_hits
+              end
+              sim = eval("#{params[:similarity_algorithm]}(neighbor_matches[i], neighbor_matches[j], params[:p_values], sim_params)")
              gram_matrix[i][j] = Algorithm.gauss(sim)
              gram_matrix[j] = [] unless gram_matrix[j]
              gram_matrix[j][i] = gram_matrix[i][j] # lower triangle
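weighted_majority_vote is self-contained enough to exercise directly: activities must already be mapped to the integer codes of the value_map, and similarities are damped with Algorithm.gauss. A sketch for the binary case (value_map of size 2), outside the diff:

    neighbors = [ { :activity => 2, :similarity => 0.9 },
                  { :activity => 1, :similarity => 0.4 },
                  { :activity => 2, :similarity => 0.7 } ]
    result = OpenTox::Algorithm::Neighbors.weighted_majority_vote(
      :neighbors => neighbors,
      :value_map => { 1 => "inactive", 2 => "active" })
    result[:prediction]   # => 2, i.e. "active" (positive Gauss-weighted balance)
    result[:confidence]   # mean of the signed neighbor weights, absolute value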
@@ -198,6 +512,7 @@ module OpenTox
             gram_matrix[i][i] = 1.0
           end

+
           #LOGGER.debug gram_matrix.to_yaml
           @r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests
           @r.eval "library('kernlab')" # this requires R package "kernlab" to be installed
@@ -208,27 +523,171 @@ module OpenTox
           @r.y = acts
           @r.sims = sims

-          [old lines 211-227: content not shown in this view]
+          begin
+            LOGGER.debug "Preparing R data ..."
+            # prepare data
+            @r.eval "y<-as.vector(y)"
+            @r.eval "gram_matrix<-as.kernelMatrix(matrix(gram_matrix,n,n))"
+            @r.eval "sims<-as.vector(sims)"
+
+            # model + support vectors
+            LOGGER.debug "Creating SVM model ..."
+            @r.eval "model<-ksvm(gram_matrix, y, kernel=matrix, type=\"#{type}\", nu=0.5)"
+            @r.eval "sv<-as.vector(SVindex(model))"
+            @r.eval "sims<-sims[sv]"
+            @r.eval "sims<-as.kernelMatrix(matrix(sims,1))"
+            LOGGER.debug "Predicting ..."
+            if type == "nu-svr"
+              @r.eval "p<-predict(model,sims)[1,1]"
+            elsif type == "C-bsvc"
+              @r.eval "p<-predict(model,sims)"
+            end
+            if type == "nu-svr"
+              prediction = @r.p
+            elsif type == "C-bsvc"
+              #prediction = (@r.p.to_f == 1.0 ? true : false)
+              prediction = @r.p
+            end
+            @r.quit # free R
+          rescue Exception => e
+            LOGGER.debug "#{e.class}: #{e.message}"
+            LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+          end
+
        end
-        [old lines 229-231: content not shown in this view]
+        prediction
+      end
+
+      # Local support vector prediction from neighbors.
+      # Uses propositionalized setting.
+      # Not to be called directly (use local_svm_regression or local_svm_classification).
+      # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ]
+      # @param [Array] acts, activities for neighbors.
+      # @param [String] type, one of "nu-svr" (regression) or "C-bsvc" (classification).
+      # @return [Numeric] A prediction value.
+      def self.local_svm_prop(props, acts, type)
+
+        LOGGER.debug "Local SVM (Propositionalization / Kernlab Kernel)."
+        n_prop = props[0] # is a matrix, i.e. two nested Arrays.
+        q_prop = props[1] # is an Array.
+
+        prediction = nil
+        if Algorithm::zero_variance? acts
+          prediction = acts[0]
+        else
+          #LOGGER.debug gram_matrix.to_yaml
+          @r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests
+          @r.eval "library('kernlab')" # this requires R package "kernlab" to be installed
+          LOGGER.debug "Setting R data ..."
+          # set data
+          @r.n_prop = n_prop.flatten
+          @r.n_prop_x_size = n_prop.size
+          @r.n_prop_y_size = n_prop[0].size
+          @r.y = acts
+          @r.q_prop = q_prop
+
+          begin
+            LOGGER.debug "Preparing R data ..."
+            # prepare data
+            @r.eval "y<-matrix(y)"
+            @r.eval "prop_matrix<-matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=TRUE)"
+            @r.eval "q_prop<-matrix(q_prop, 1, n_prop_y_size, byrow=TRUE)"
+
+            # model + support vectors
+            LOGGER.debug "Creating SVM model ..."
+            @r.eval "model<-ksvm(prop_matrix, y, type=\"#{type}\", nu=0.5)"
+            LOGGER.debug "Predicting ..."
+            if type == "nu-svr"
+              @r.eval "p<-predict(model,q_prop)[1,1]"
+            elsif type == "C-bsvc"
+              @r.eval "p<-predict(model,q_prop)"
+            end
+            if type == "nu-svr"
+              prediction = @r.p
+            elsif type == "C-bsvc"
+              #prediction = (@r.p.to_f == 1.0 ? true : false)
+              prediction = @r.p
+            end
+            @r.quit # free R
+          rescue Exception => e
+            LOGGER.debug "#{e.class}: #{e.message}"
+            LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+          end
+        end
+        prediction
+      end
+
+      # Get confidence for regression, with standard deviation of neighbor activity if conf_stdev is set.
+      # @param[Hash] Required keys: :sims, :acts, :neighbors, :conf_stdev
+      # @return[Float] Confidence
+      def self.get_confidence(params)
+        if params[:conf_stdev]
+          sim_median = params[:sims].to_scale.median
+          if sim_median.nil?
+            confidence = nil
+          else
+            standard_deviation = params[:acts].to_scale.standard_deviation_sample
+            confidence = (sim_median*Math.exp(-1*standard_deviation)).abs
+            if confidence.nan?
+              confidence = nil
+            end
+          end
+        else
+          conf = params[:sims].inject{|sum,x| sum + x }
+          confidence = conf/params[:neighbors].size
+        end
+        LOGGER.debug "Confidence is: '" + confidence.to_s + "'."
+        return confidence
+      end
+
+      # Get X and Y size of a nested Array (Matrix)
+      def self.get_sizes(matrix)
+        begin
+          nr_cases = matrix.size
+          nr_features = matrix[0].size
+        rescue Exception => e
+          LOGGER.debug "#{e.class}: #{e.message}"
+          LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+        end
+        #puts "NRC: #{nr_cases}, NRF: #{nr_features}"
+        [ nr_cases, nr_features ]
+      end
+
+      # Calculate the propositionalization matrix aka instantiation matrix (0/1 entries for features)
+      # Same for the vector describing the query compound
+      # @param[Array] neighbors.
+      # @param[OpenTox::Compound] query compound.
+      # @param[Array] Dataset Features.
+      # @param[Array] Fingerprints of neighbors.
+      # @param[Float] p-values of Features.
+      def self.get_props (params)
+        matrix = Array.new
+        begin
+          params[:neighbors].each do |n|
+            n = n[:compound]
+            row = []
+            params[:features].each do |f|
+              if ! params[:fingerprints][n].nil?
+                row << (params[:fingerprints][n].include?(f) ? (params[:p_values][f] * params[:fingerprints][n][f]) : 0.0)
+              else
+                row << 0.0
+              end
+            end
+            matrix << row
+          end
+          row = []
+          params[:features].each do |f|
+            if params[:nr_hits]
+              compound_feature_hits = params[:compound].match_hits([f])
+              row << (compound_feature_hits.size == 0 ? 0.0 : (params[:p_values][f] * compound_feature_hits[f]))
+            else
+              row << (params[:compound].match([f]).size == 0 ? 0.0 : params[:p_values][f])
+            end
+          end
+        rescue Exception => e
+          LOGGER.debug "get_props failed with '" + $! + "'"
+        end
+        [ matrix, row ]
      end

    end
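get_confidence supports two modes: by default it returns the mean of the (already Gauss-damped) neighbor similarities; with :conf_stdev it returns the median similarity scaled down by exp(-stdev) of the neighbor activities. A sketch outside the diff, assuming the opentox-ruby environment (LOGGER, statsample) is loaded:

    sims = [0.9, 0.7, 0.5]                       # Gauss-damped similarities
    acts = [1.2, 1.4, 1.1]
    neighbors = sims.zip(acts).collect { |s, a| { :similarity => s, :activity => a } }
    OpenTox::Algorithm::Neighbors.get_confidence(:sims => sims, :acts => acts,
      :neighbors => neighbors, :conf_stdev => false)   # sum(sims)/3
    OpenTox::Algorithm::Neighbors.get_confidence(:sims => sims, :acts => acts,
      :neighbors => neighbors, :conf_stdev => true)    # |median(sims) * exp(-stdev(acts))|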
@@ -250,6 +709,195 @@ module OpenTox
       def features(dataset_uri,compound_uri)
       end
     end
+
+    module Transform
+      include Algorithm
+
+      # The transformer that inverts values.
+      # 1/x is used, after values have been moved >= 1.
+      class Inverter
+        attr_accessor :offset, :values
+
+        # @params[Array] Values to transform.
+        # @params[Float] Offset for restore.
+        def initialize *args
+          case args.size
+          when 1
+            begin
+              values=args[0]
+              raise "Cannot transform, values empty." if @values.size==0
+              @values = values.collect { |v| -1.0 * v }
+              @offset = 1.0 - @values.minmax[0]
+              @offset = -1.0 * @offset if @offset>0.0
+              @values.collect! { |v| v - @offset } # slide >1
+              @values.collect! { |v| 1 / v } # invert to [0,1]
+            rescue Exception => e
+              LOGGER.debug "#{e.class}: #{e.message}"
+              LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+            end
+          when 2
+            @offset = args[1].to_f
+            @values = args[0].collect { |v| 1 / v }
+            @values.collect! { |v| v + @offset }
+            @values.collect! { |v| -1.0 * v }
+          end
+        end
+      end
+
+      # The transformer that takes logs.
+      # Log10 is used, after values have been moved > 0.
+      class Log10
+        attr_accessor :offset, :values
+
+        # @params[Array] Values to transform / restore.
+        # @params[Float] Offset for restore.
+        def initialize *args
+          @distance_to_zero = 0.000000001 # 1 / 1 billion
+          case args.size
+          when 1
+            begin
+              values=args[0]
+              raise "Cannot transform, values empty." if values.size==0
+              @offset = values.minmax[0]
+              @offset = -1.0 * @offset if @offset>0.0
+              @values = values.collect { |v| v - @offset } # slide > anchor
+              @values.collect! { |v| v + @distance_to_zero } #
+              @values.collect! { |v| Math::log10 v } # log10 (can fail)
+            rescue Exception => e
+              LOGGER.debug "#{e.class}: #{e.message}"
+              LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+            end
+          when 2
+            @offset = args[1].to_f
+            @values = args[0].collect { |v| 10**v }
+            @values.collect! { |v| v - @distance_to_zero }
+            @values.collect! { |v| v + @offset }
+          end
+        end
+      end
+
+      # The transformer that does nothing (No OPeration).
+      class NOP
+        attr_accessor :offset, :values
+
+        # @params[Array] Values to transform / restore.
+        # @params[Float] Offset for restore.
+        def initialize *args
+          @offset = 0.0
+          @distance_to_zero = 0.0
+          case args.size
+          when 1
+            @values = args[0]
+          when 2
+            @values = args[0]
+          end
+        end
+      end
+
+
+      # Auto-Scaler for Arrays
+      # Center on mean and divide by standard deviation
+      class AutoScale
+        attr_accessor :scaled_values, :mean, :stdev
+
+        # @params[Array] Values to transform.
+        def initialize values
+          @scaled_values = values
+          @mean = @scaled_values.to_scale.mean
+          @stdev = @scaled_values.to_scale.standard_deviation_sample
+          @scaled_values = @scaled_values.collect {|vi| vi - @mean }
+          @scaled_values.collect! {|vi| vi / @stdev } unless @stdev == 0.0
+        end
+      end
+
+      # Principal Components Analysis
+      # Statsample Library (http://ruby-statsample.rubyforge.org/) by C. Bustos
+      class PCA
+        attr_accessor :data_matrix, :data_transformed_matrix, :eigenvector_matrix, :eigenvalue_sums, :autoscaler
+
+        # Creates a transformed dataset as GSL::Matrix.
+        # @param [GSL::Matrix] Data matrix.
+        # @param [Float] Compression ratio from [0,1].
+        # @return [GSL::Matrix] Data transformed matrix.
+        def initialize data_matrix, compression=0.05
+          begin
+            @data_matrix = data_matrix
+            @compression = compression.to_f
+            @stdev = Array.new
+            @mean = Array.new
+
+            # Objective Feature Selection
+            raise "Error! PCA needs at least two dimensions." if data_matrix.size2 < 2
+            @data_matrix_selected = nil
+            (0..@data_matrix.size2-1).each { |i|
+              if !Algorithm::zero_variance?(@data_matrix.col(i).to_a)
+                if @data_matrix_selected.nil?
+                  @data_matrix_selected = GSL::Matrix.alloc(@data_matrix.size1, 1)
+                  @data_matrix_selected.col(0)[0..@data_matrix.size1-1] = @data_matrix.col(i)
+                else
+                  @data_matrix_selected = @data_matrix_selected.horzcat(GSL::Matrix.alloc(@data_matrix.col(i).to_a,@data_matrix.size1, 1))
+                end
+              end
+            }
+            raise "Error! PCA needs at least two dimensions." if (@data_matrix_selected.nil? || @data_matrix_selected.size2 < 2)
+
+            # Scaling of Axes
+            @data_matrix_scaled = GSL::Matrix.alloc(@data_matrix_selected.size1, @data_matrix_selected.size2)
+            (0..@data_matrix_selected.size2-1).each { |i|
+              @autoscaler = OpenTox::Algorithm::Transform::AutoScale.new(@data_matrix_selected.col(i))
+              @data_matrix_scaled.col(i)[0..@data_matrix.size1-1] = @autoscaler.scaled_values
+              @stdev << @autoscaler.stdev
+              @mean << @autoscaler.mean
+            }
+
+            data_matrix_hash = Hash.new
+            (0..@data_matrix_scaled.size2-1).each { |i|
+              column_view = @data_matrix_scaled.col(i)
+              data_matrix_hash[i] = column_view.to_scale
+            }
+            dataset_hash = data_matrix_hash.to_dataset # see http://goo.gl/7XcW9
+            cor_matrix=Statsample::Bivariate.correlation_matrix(dataset_hash)
+            pca=Statsample::Factor::PCA.new(cor_matrix)
+            pca.eigenvalues.each { |ev| raise "PCA failed!" unless !ev.nan? }
+            @eigenvalue_sums = Array.new
+            (0..dataset_hash.fields.size-1).each { |i|
+              @eigenvalue_sums << pca.eigenvalues[0..i].inject{ |sum, ev| sum + ev }
+            }
+            eigenvectors_selected = Array.new
+            pca.eigenvectors.each_with_index { |ev, i|
+              if (@eigenvalue_sums[i] <= ((1.0-@compression)*dataset_hash.fields.size)) || (eigenvectors_selected.size == 0)
+                eigenvectors_selected << ev.to_a
+              end
+            }
+            @eigenvector_matrix = GSL::Matrix.alloc(eigenvectors_selected.flatten, eigenvectors_selected.size, dataset_hash.fields.size).transpose
+            dataset_matrix = dataset_hash.to_gsl.transpose
+            @data_transformed_matrix = (@eigenvector_matrix.transpose * dataset_matrix).transpose
+          rescue Exception => e
+            LOGGER.debug "#{e.class}: #{e.message}"
+            LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+          end
+        end
+
+        # Restores data in the original feature space (possibly with compression loss).
+        # @return [GSL::Matrix] Data matrix.
+        def restore
+          begin
+            data_matrix_restored = (@eigenvector_matrix * @data_transformed_matrix.transpose).transpose # reverse pca
+            # reverse scaling
+            (0..data_matrix_restored.size2-1).each { |i|
+              data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] *= @stdev[i] unless @stdev[i] == 0.0
+              data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] += @mean[i]
+            }
+            data_matrix_restored
+          rescue Exception => e
+            LOGGER.debug "#{e.class}: #{e.message}"
+            LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+          end
+        end
+
+      end
+
+    end

     # Gauss kernel
     # @return [Float]
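The Transform classes implement invertible value transformations for regression endpoints: the one-argument constructor transforms, the two-argument constructor (values, offset) restores. A sketch of the Log10 round trip and of AutoScale, outside the diff:

    values = [0.5, 5.0, 50.0]
    log = OpenTox::Algorithm::Transform::Log10.new(values)               # shift > 0, then log10
    back = OpenTox::Algorithm::Transform::Log10.new(log.values, log.offset).values
    # back is approximately [0.5, 5.0, 50.0] (up to the 1e-9 guard added before log10)

    scaler = OpenTox::Algorithm::Transform::AutoScale.new([1.0, 2.0, 3.0])
    scaler.scaled_values   # => [-1.0, 0.0, 1.0]  (centered on the mean, divided by sample stdev)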
@@ -257,16 +905,85 @@ module OpenTox
       d = 1.0 - x.to_f
       Math.exp(-(d*d)/(2*sigma*sigma))
     end
+
+    # For symbolic features
+    # @param [Array] Array to test, must indicate non-occurrence with 0.
+    # @return [Boolean] Whether the feature is singular or non-occurring or present everywhere.
+    def self.isnull_or_singular?(array)
+      nr_zeroes = array.count(0)
+      return (nr_zeroes == array.size) || # remove non-occurring feature
+             (nr_zeroes == array.size-1) || # remove singular feature
+             (nr_zeroes == 0) # also remove feature present everywhere
+    end
+
+    # Numeric value test
+    # @param[Object] value
+    # @return [Boolean] Whether value is a number
+    def self.numeric?(value)
+      true if Float(value) rescue false
+    end
+
+    # For symbolic features
+    # @param [Array] Array to test, must indicate non-occurrence with 0.
+    # @return [Boolean] Whether the feature has variance zero.
+    def self.zero_variance?(array)
+      return (array.to_scale.variance_sample == 0.0)
+    end

-    #
+    # Sum of an array for Arrays.
     # @param [Array] Array with values
-    # @return [
-    def self.
-    [old line 265: content not shown in this view]
-    array.
-    [old lines 267-268: content not shown in this view]
+    # @return [Integer] Sum of size of values
+    def self.sum_size(array)
+      sum=0
+      array.each { |e| sum += e.size }
+      return sum
+    end
+
+    # Minimum Frequency
+    # @param [Integer] per-mil value
+    # return [Integer] min-frequency
+    def self.min_frequency(training_dataset,per_mil)
+      minfreq = per_mil * training_dataset.compounds.size.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
+      minfreq = 2 unless minfreq > 2
+      Integer (minfreq)
     end

+    # Effect calculation for classification
+    # @param [Array] Array of occurrences per class in the form of Enumerables.
+    # @param [Array] Array of database instance counts per class.
+    def self.effect(occurrences, db_instances)
+      max=0
+      max_value=0
+      nr_o = self.sum_size(occurrences)
+      nr_db = db_instances.to_scale.sum
+
+      occurrences.each_with_index { |o,i| # fminer outputs occurrences sorted reverse by activity.
+        actual = o.size.to_f/nr_o
+        expected = db_instances[i].to_f/nr_db
+        if actual > expected
+          if ((actual - expected) / actual) > max_value
+            max_value = (actual - expected) / actual # 'Schleppzeiger'
+            max = i
+          end
+        end
+      }
+      max
+    end
+
+    # Returns Support value of an fingerprint
+    # @param [Hash] params Keys: `:compound_features_hits, :weights, :training_compound_features_hits, :features, :nr_hits:, :mode` are required
+    # return [Numeric] Support value
+    def self.p_sum_support(params)
+      p_sum = 0.0
+      params[:features].each{|f|
+        compound_hits = params[:compound_features_hits][f]
+        neighbor_hits = params[:training_compound_features_hits][f]
+        p_sum += eval("(Algorithm.gauss(params[:weights][f]) * ([compound_hits, neighbor_hits].compact.#{params[:mode]}))")
+      }
+      p_sum
+    end
+
   end
 end
+
+
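The new module-level helpers operate on plain arrays and back the fminer and lazar changes above (feature filtering, minimum frequency, class effects). A sketch with toy data, outside the diff:

    OpenTox::Algorithm.isnull_or_singular?([0, 0, 1, 0])   # => true (singular feature)
    OpenTox::Algorithm.zero_variance?([0.5, 0.5, 0.5])     # => true
    OpenTox::Algorithm.numeric?("1.5e-3")                  # => true

    # effect: index of the class with the largest relative over-representation
    occurrences  = [[1, 5, 9], [2, 7]]    # feature occurrences per class (fminer ordering)
    db_instances = [100, 10]              # database instances per class
    OpenTox::Algorithm.effect(occurrences, db_instances)   # => 1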