opentox-ruby 2.0.1 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +39 -46
- data/VERSION +1 -1
- data/lib/algorithm.rb +797 -80
- data/lib/compound.rb +40 -0
- data/lib/config/config_ru.rb +2 -0
- data/lib/dataset.rb +57 -18
- data/lib/environment.rb +3 -3
- data/lib/feature.rb +15 -13
- data/lib/helper.rb +1 -2
- data/lib/model.rb +185 -82
- data/lib/opentox-ruby.rb +1 -1
- data/lib/overwrite.rb +2 -1
- data/lib/parser.rb +247 -69
- data/lib/rest_client_wrapper.rb +3 -2
- data/lib/serializer.rb +24 -10
- data/lib/task.rb +10 -3
- data/lib/to-html.rb +66 -41
- data/lib/validation.rb +93 -29
- metadata +206 -117
data/Rakefile
CHANGED
@@ -8,53 +8,46 @@ begin
|
|
8
8
|
gem.summary = %Q{Ruby wrapper for the OpenTox REST API}
|
9
9
|
gem.description = %Q{Ruby wrapper for the OpenTox REST API (http://www.opentox.org)}
|
10
10
|
gem.email = "helma@in-silico.ch"
|
11
|
-
gem.homepage = "http://github.com/
|
11
|
+
gem.homepage = "http://github.com/opentox/opentox-ruby"
|
12
12
|
gem.authors = ["Christoph Helma, Martin Guetlein, Andreas Maunz, Micha Rautenberg, David Vorgrimmler"]
|
13
|
-
# dependencies
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
=
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
].each {|dep| gem.add_dependency dep, ">= 1" }
|
52
|
-
=end
|
53
|
-
#validation-gem
|
54
|
-
gem.add_dependency "haml", ">=3"
|
55
|
-
# validation-gems
|
56
|
-
gem.add_dependency "ruby-plot", "~>0.4.0"
|
57
|
-
['jeweler'].each { |dep| gem.add_development_dependency dep }
|
13
|
+
# dependencies with versions
|
14
|
+
gem.add_dependency "sinatra", "=1.2.6"
|
15
|
+
gem.add_dependency "emk-sinatra-url-for", "=0.2.1"
|
16
|
+
gem.add_dependency "sinatra-respond_to", "=0.7.0"
|
17
|
+
gem.add_dependency "sinatra-static-assets", "=0.5.0"
|
18
|
+
gem.add_dependency "rest-client", "=1.6.1"
|
19
|
+
gem.add_dependency "rack", "=1.3.1"
|
20
|
+
gem.add_dependency "rack-contrib", "=1.1.0"
|
21
|
+
gem.add_dependency "rack-flash", "=0.1.1"
|
22
|
+
gem.add_dependency "nokogiri", "=1.4.4"
|
23
|
+
gem.add_dependency "rubyzip", "=0.9.4"
|
24
|
+
gem.add_dependency "roo", "=1.9.3"
|
25
|
+
gem.add_dependency "spreadsheet", "=0.6.5.4"
|
26
|
+
gem.add_dependency "google-spreadsheet-ruby", "=0.1.5"
|
27
|
+
gem.add_dependency "yajl-ruby", "=0.8.2"
|
28
|
+
#gem.add_dependency "mail", "=2.3.0"
|
29
|
+
gem.add_dependency "rinruby", "=2.0.2"
|
30
|
+
gem.add_dependency "ohm", "=0.1.3"
|
31
|
+
gem.add_dependency "ohm-contrib", "=0.1.1"
|
32
|
+
gem.add_dependency "SystemTimer", "=1.2.3"
|
33
|
+
gem.add_dependency "rjb", "=1.3.4"
|
34
|
+
gem.add_dependency "haml", "=3.1.1"
|
35
|
+
# for headless browser tests
|
36
|
+
gem.add_dependency "akephalos", "=0.2.5"
|
37
|
+
#validation-gems
|
38
|
+
gem.add_dependency "dm-core", "=1.1.0"
|
39
|
+
gem.add_dependency "dm-serializer", "=1.1.0"
|
40
|
+
gem.add_dependency "dm-timestamps", "=1.1.0"
|
41
|
+
gem.add_dependency "dm-types", "=1.1.0"
|
42
|
+
gem.add_dependency "dm-migrations", "=1.1.0"
|
43
|
+
gem.add_dependency "dm-validations", "=1.1.0"
|
44
|
+
gem.add_dependency "dm-sqlite-adapter", "=1.1.0"
|
45
|
+
gem.add_dependency "ruby-plot", "=0.5.0"
|
46
|
+
gem.add_dependency "gsl", "=1.14.7"
|
47
|
+
gem.add_dependency "statsample", "=1.1.0"
|
48
|
+
#gem.add_dependency "statsample-optimization", "=2.1.0"
|
49
|
+
|
50
|
+
gem.add_development_dependency 'jeweler'
|
58
51
|
gem.files = FileList["[A-Z]*", "{bin,generators,lib,test}/**/*", 'lib/jeweler/templates/.gitignore']
|
59
52
|
end
|
60
53
|
Jeweler::GemcutterTasks.new
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.0
|
1
|
+
2.1.0
|
data/lib/algorithm.rb
CHANGED
@@ -3,6 +3,8 @@
|
|
3
3
|
# avoids compiling R with X
|
4
4
|
R = nil
|
5
5
|
require "rinruby"
|
6
|
+
require "statsample"
|
7
|
+
require 'uri'
|
6
8
|
|
7
9
|
module OpenTox
|
8
10
|
|
@@ -16,6 +18,7 @@ module OpenTox
|
|
16
18
|
# @param [optional,OpenTox::Task] waiting_task (can be a OpenTox::Subtask as well), progress is updated accordingly
|
17
19
|
# @return [String] URI of new resource (dataset, model, ...)
|
18
20
|
def run(params=nil, waiting_task=nil)
|
21
|
+
LOGGER.info "Running algorithm '"+@uri.to_s+"' with params: "+params.inspect
|
19
22
|
RestClientWrapper.post(@uri, params, {:accept => 'text/uri-list'}, waiting_task).to_s
|
20
23
|
end
|
21
24
|
|
@@ -45,12 +48,75 @@ module OpenTox
|
|
45
48
|
end
|
46
49
|
|
47
50
|
# Fminer algorithms (https://github.com/amaunz/fminer2)
|
48
|
-
|
51
|
+
class Fminer
|
49
52
|
include Algorithm
|
53
|
+
attr_accessor :prediction_feature, :training_dataset, :minfreq, :compounds, :db_class_sizes, :all_activities, :smi
|
54
|
+
|
55
|
+
def check_params(params,per_mil,subjectid=nil)
|
56
|
+
raise OpenTox::NotFoundError.new "Please submit a dataset_uri." unless params[:dataset_uri] and !params[:dataset_uri].nil?
|
57
|
+
raise OpenTox::NotFoundError.new "Please submit a prediction_feature." unless params[:prediction_feature] and !params[:prediction_feature].nil?
|
58
|
+
@prediction_feature = OpenTox::Feature.find params[:prediction_feature], subjectid
|
59
|
+
@training_dataset = OpenTox::Dataset.find "#{params[:dataset_uri]}", subjectid
|
60
|
+
raise OpenTox::NotFoundError.new "No feature #{params[:prediction_feature]} in dataset #{params[:dataset_uri]}" unless @training_dataset.features and @training_dataset.features.include?(params[:prediction_feature])
|
61
|
+
|
62
|
+
unless params[:min_frequency].nil?
|
63
|
+
@minfreq=params[:min_frequency].to_i
|
64
|
+
raise "Minimum frequency must be a number >0!" unless @minfreq>0
|
65
|
+
else
|
66
|
+
@minfreq=OpenTox::Algorithm.min_frequency(@training_dataset,per_mil) # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
def add_fminer_data(fminer_instance, params, value_map)
|
71
|
+
|
72
|
+
id = 1 # fminer start id is not 0
|
73
|
+
@training_dataset.data_entries.each do |compound,entry|
|
74
|
+
begin
|
75
|
+
smiles = OpenTox::Compound.smiles(compound.to_s)
|
76
|
+
rescue
|
77
|
+
LOGGER.warn "No resource for #{compound.to_s}"
|
78
|
+
next
|
79
|
+
end
|
80
|
+
if smiles == '' or smiles.nil?
|
81
|
+
LOGGER.warn "Cannot find smiles for #{compound.to_s}."
|
82
|
+
next
|
83
|
+
end
|
84
|
+
|
85
|
+
value_map=params[:value_map] unless params[:value_map].nil?
|
86
|
+
entry.each do |feature,values|
|
87
|
+
if feature == @prediction_feature.uri
|
88
|
+
values.each do |value|
|
89
|
+
if value.nil?
|
90
|
+
LOGGER.warn "No #{feature} activity for #{compound.to_s}."
|
91
|
+
else
|
92
|
+
if @prediction_feature.feature_type == "classification"
|
93
|
+
activity= value_map.invert[value].to_i # activities are mapped to 1..n
|
94
|
+
@db_class_sizes[activity-1].nil? ? @db_class_sizes[activity-1]=1 : @db_class_sizes[activity-1]+=1 # AM effect
|
95
|
+
elsif @prediction_feature.feature_type == "regression"
|
96
|
+
activity= value.to_f
|
97
|
+
end
|
98
|
+
begin
|
99
|
+
fminer_instance.AddCompound(smiles,id)
|
100
|
+
fminer_instance.AddActivity(activity, id)
|
101
|
+
@all_activities[id]=activity # DV: insert global information
|
102
|
+
@compounds[id] = compound
|
103
|
+
@smi[id] = smiles
|
104
|
+
id += 1
|
105
|
+
rescue Exception => e
|
106
|
+
LOGGER.warn "Could not add " + smiles + "\t" + value.to_s + " to fminer"
|
107
|
+
LOGGER.warn e.backtrace
|
108
|
+
end
|
109
|
+
end
|
110
|
+
end
|
111
|
+
end
|
112
|
+
end
|
113
|
+
end
|
114
|
+
end
|
115
|
+
|
116
|
+
end
|
50
117
|
|
51
118
|
# Backbone Refinement Class mining (http://bbrc.maunz.de/)
|
52
|
-
class BBRC
|
53
|
-
include Fminer
|
119
|
+
class BBRC < Fminer
|
54
120
|
# Initialize bbrc algorithm
|
55
121
|
def initialize(subjectid=nil)
|
56
122
|
super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/bbrc")
|
@@ -59,8 +125,7 @@ module OpenTox
|
|
59
125
|
end
|
60
126
|
|
61
127
|
# LAtent STructure Pattern Mining (http://last-pm.maunz.de)
|
62
|
-
class LAST
|
63
|
-
include Fminer
|
128
|
+
class LAST < Fminer
|
64
129
|
# Initialize last algorithm
|
65
130
|
def initialize(subjectid=nil)
|
66
131
|
super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/last")
|
@@ -68,7 +133,6 @@ module OpenTox
|
|
68
133
|
end
|
69
134
|
end
|
70
135
|
|
71
|
-
end
|
72
136
|
|
73
137
|
# Create lazar prediction model
|
74
138
|
class Lazar
|
@@ -90,19 +154,34 @@ module OpenTox
|
|
90
154
|
# @param [Array] features_a Features of first compound
|
91
155
|
# @param [Array] features_b Features of second compound
|
92
156
|
# @param [optional, Hash] weights Weights for all features
|
157
|
+
# @param [optional, Hash] params Keys: `:training_compound, :compound, :training_compound_features_hits, :nr_hits, :compound_features_hits` are required
|
93
158
|
# @return [Float] (Weighted) tanimoto similarity
|
94
|
-
def self.tanimoto(features_a,features_b,weights=nil)
|
159
|
+
def self.tanimoto(features_a,features_b,weights=nil,params=nil)
|
95
160
|
common_features = features_a & features_b
|
96
161
|
all_features = (features_a + features_b).uniq
|
97
|
-
|
162
|
+
#LOGGER.debug "dv --------------- common: #{common_features}, all: #{all_features}"
|
98
163
|
if common_features.size > 0
|
99
164
|
if weights
|
100
|
-
|
101
|
-
|
102
|
-
|
165
|
+
#LOGGER.debug "nr_hits: #{params[:nr_hits]}"
|
166
|
+
if !params.nil? && params[:nr_hits]
|
167
|
+
params[:weights] = weights
|
168
|
+
params[:mode] = "min"
|
169
|
+
params[:features] = common_features
|
170
|
+
common_p_sum = Algorithm.p_sum_support(params)
|
171
|
+
params[:mode] = "max"
|
172
|
+
params[:features] = all_features
|
173
|
+
all_p_sum = Algorithm.p_sum_support(params)
|
174
|
+
else
|
175
|
+
common_p_sum = 0.0
|
176
|
+
common_features.each{|f| common_p_sum += Algorithm.gauss(weights[f])}
|
177
|
+
all_p_sum = 0.0
|
178
|
+
all_features.each{|f| all_p_sum += Algorithm.gauss(weights[f])}
|
179
|
+
end
|
180
|
+
#LOGGER.debug "common_p_sum: #{common_p_sum}, all_p_sum: #{all_p_sum}, c/a: #{common_p_sum/all_p_sum}"
|
103
181
|
common_p_sum/all_p_sum
|
104
182
|
else
|
105
|
-
common_features.
|
183
|
+
#LOGGER.debug "common_features : #{common_features}, all_features: #{all_features}, c/a: #{(common_features.size/all_features.size).to_f}"
|
184
|
+
common_features.size.to_f/all_features.size.to_f
|
106
185
|
end
|
107
186
|
else
|
108
187
|
0.0
|
@@ -132,65 +211,300 @@ module OpenTox
|
|
132
211
|
end
|
133
212
|
end
|
134
213
|
|
214
|
+
# Structural Graph Clustering by TU Munich
|
215
|
+
# Finds clusters similar to a query structure in a given training dataset
|
216
|
+
# May be queried for cluster membership of an unknown compound
|
217
|
+
class StructuralClustering
|
218
|
+
attr_accessor :training_dataset_uri, :training_threshold, :query_dataset_uri, :query_threshold, :target_clusters_array
|
219
|
+
|
220
|
+
# @params[String] Training dataset_uri
|
221
|
+
# @params[Float] Similarity threshold for training (optional)
|
222
|
+
# @params[String] Cluster service uri (no AA)
|
223
|
+
def initialize training_dataset_uri, training_threshold=0.8, cluster_service_uri = "http://opentox-dev.informatik.tu-muenchen.de:8080/OpenTox/algorithm/StructuralClustering"
|
224
|
+
|
225
|
+
if (training_dataset_uri =~ URI::regexp).nil? || (cluster_service_uri =~ URI::regexp).nil?
|
226
|
+
raise "Invalid URI."
|
227
|
+
end
|
228
|
+
@training_dataset_uri = training_dataset_uri
|
229
|
+
if !OpenTox::Algorithm.numeric? training_threshold || training_threshold <0 || training_threshold >1
|
230
|
+
raise "Training threshold out of bounds."
|
231
|
+
end
|
232
|
+
@training_threshold = training_threshold.to_f
|
233
|
+
|
234
|
+
# Train a cluster model
|
235
|
+
params = {:dataset_uri => @training_dataset_uri, :threshold => @training_threshold }
|
236
|
+
@cluster_model_uri = OpenTox::RestClientWrapper.post cluster_service_uri, params
|
237
|
+
cluster_model_rdf = OpenTox::RestClientWrapper.get @cluster_model_uri
|
238
|
+
@datasets = OpenTox::Parser::Owl.from_rdf cluster_model_rdf, OT.Dataset, true # must extract OT.Datasets from model
|
239
|
+
|
240
|
+
# Process parsed OWL objects
|
241
|
+
@clusterid_dataset_map = Hash.new
|
242
|
+
@datasets.each { |d|
|
243
|
+
begin
|
244
|
+
d.metadata[OT.hasSource]["Structural Clustering cluster "] = "" # must parse in metadata for string (not elegant)
|
245
|
+
@clusterid_dataset_map[d.metadata[OT.hasSource].to_i] = d.uri
|
246
|
+
rescue Exception => e
|
247
|
+
# ignore other entries!
|
248
|
+
end
|
249
|
+
}
|
250
|
+
end
|
251
|
+
|
252
|
+
# Whether a model has been trained
|
253
|
+
def trained?
|
254
|
+
!@cluster_model_uri.nil?
|
255
|
+
end
|
256
|
+
|
257
|
+
# Instance query: clusters for a compound
|
258
|
+
# @params[String] Query compound
|
259
|
+
# @params[Float] Similarity threshold for query to clusters (optional)
|
260
|
+
def get_clusters query_compound_uri, query_threshold = 0.5
|
261
|
+
|
262
|
+
if !OpenTox::Algorithm.numeric? query_threshold || query_threshold <0 || query_threshold >1
|
263
|
+
raise "Query threshold out of bounds."
|
264
|
+
end
|
265
|
+
@query_threshold = query_threshold.to_f
|
266
|
+
|
267
|
+
|
268
|
+
# Preparing a query dataset
|
269
|
+
query_dataset = OpenTox::Dataset.new
|
270
|
+
@query_dataset_uri = query_dataset.save
|
271
|
+
query_dataset = OpenTox::Dataset.find @query_dataset_uri
|
272
|
+
query_dataset.add_compound query_compound_uri
|
273
|
+
@query_dataset_uri = query_dataset.save
|
274
|
+
|
275
|
+
# Obtaining a clustering for query compound
|
276
|
+
params = { :dataset_uri => @query_dataset_uri, :threshold => @query_threshold }
|
277
|
+
cluster_query_dataset_uri = OpenTox::RestClientWrapper.post @cluster_model_uri, params
|
278
|
+
cluster_query_dataset = OpenTox::Dataset.new cluster_query_dataset_uri
|
279
|
+
cluster_query_dataset.load_all
|
280
|
+
|
281
|
+
# Reading cluster ids for features from metadata
|
282
|
+
feature_clusterid_map = Hash.new
|
283
|
+
pattern="Prediction feature for cluster assignment " # must parse for string in metadata (not elegant)
|
284
|
+
cluster_query_dataset.features.each { |feature_uri,metadata|
|
285
|
+
metadata[DC.title][pattern]=""
|
286
|
+
feature_clusterid_map[feature_uri] = metadata[DC.title].to_i
|
287
|
+
}
|
288
|
+
|
289
|
+
# Integrity check
|
290
|
+
unless cluster_query_dataset.compounds.size == 1
|
291
|
+
raise "Number of predicted compounds is != 1."
|
292
|
+
end
|
293
|
+
|
294
|
+
# Process data entry
|
295
|
+
query_compound_uri = cluster_query_dataset.compounds[0]
|
296
|
+
@target_clusters_array = Array.new
|
297
|
+
cluster_query_dataset.features.keys.each { |cluster_membership_feature|
|
298
|
+
|
299
|
+
# Getting dataset URI for cluster
|
300
|
+
target_cluster = feature_clusterid_map[cluster_membership_feature]
|
301
|
+
dataset = @clusterid_dataset_map[target_cluster]
|
302
|
+
|
303
|
+
# Finally look up presence
|
304
|
+
data_entry = cluster_query_dataset.data_entries[query_compound_uri]
|
305
|
+
present = data_entry[cluster_membership_feature][0]
|
306
|
+
|
307
|
+
# Store result
|
308
|
+
@target_clusters_array << dataset if present > 0.5 # 0.0 for absence, 1.0 for presence
|
309
|
+
}
|
310
|
+
end
|
311
|
+
|
312
|
+
end
|
313
|
+
|
135
314
|
module Neighbors
|
136
315
|
|
316
|
+
# Local multi-linear regression (MLR) prediction from neighbors.
|
317
|
+
# Uses propositionalized setting.
|
318
|
+
# @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
|
319
|
+
# @return [Hash] Keys `:prediction` and `:confidence` (confidence is nil when no prediction could be made).
|
320
|
+
def self.local_mlr_prop(params)
|
321
|
+
|
322
|
+
confidence=0.0
|
323
|
+
prediction=nil
|
324
|
+
|
325
|
+
if params[:neighbors].size>0
|
326
|
+
props = params[:prop_kernel] ? get_props(params) : nil
|
327
|
+
acts = params[:neighbors].collect { |n| act = n[:activity].to_f }
|
328
|
+
sims = params[:neighbors].collect { |n| Algorithm.gauss(n[:similarity]) }
|
329
|
+
LOGGER.debug "Local MLR (Propositionalization / GSL)."
|
330
|
+
prediction = mlr( {:n_prop => props[0], :q_prop => props[1], :sims => sims, :acts => acts} )
|
331
|
+
transformer = eval("OpenTox::Algorithm::Transform::#{params[:transform]["class"]}.new ([#{prediction}], #{params[:transform]["offset"]})")
|
332
|
+
prediction = transformer.values[0]
|
333
|
+
prediction = nil if prediction.infinite? || params[:prediction_min_max][1] < prediction || params[:prediction_min_max][0] > prediction
|
334
|
+
LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
|
335
|
+
params[:conf_stdev] = false if params[:conf_stdev].nil?
|
336
|
+
confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]})
|
337
|
+
confidence = nil if prediction.nil?
|
338
|
+
end
|
339
|
+
{:prediction => prediction, :confidence => confidence}
|
340
|
+
|
341
|
+
end
|
342
|
+
|
343
|
+
# Multi-linear regression weighted by similarity.
|
344
|
+
# Objective Feature Selection, Principal Components Analysis, Scaling of Axes.
|
345
|
+
# @param [Hash] params Keys `:n_prop, :q_prop, :sims, :acts` are required
|
346
|
+
# @return [Numeric] A prediction value.
|
347
|
+
def self.mlr(params)
|
348
|
+
|
349
|
+
# GSL matrix operations:
|
350
|
+
# to_a : row-wise conversion to nested array
|
351
|
+
#
|
352
|
+
# Statsample operations (build on GSL):
|
353
|
+
# to_scale: convert into Statsample format
|
354
|
+
|
355
|
+
begin
|
356
|
+
n_prop = params[:n_prop].collect { |v| v }
|
357
|
+
q_prop = params[:q_prop].collect { |v| v }
|
358
|
+
n_prop << q_prop # attach q_prop
|
359
|
+
nr_cases, nr_features = get_sizes n_prop
|
360
|
+
data_matrix = GSL::Matrix.alloc(n_prop.flatten, nr_cases, nr_features)
|
361
|
+
|
362
|
+
# Principal Components Analysis
|
363
|
+
LOGGER.debug "PCA..."
|
364
|
+
pca = OpenTox::Algorithm::Transform::PCA.new(data_matrix)
|
365
|
+
data_matrix = pca.data_transformed_matrix
|
366
|
+
|
367
|
+
# Attach intercept column to data
|
368
|
+
intercept = GSL::Matrix.alloc(Array.new(nr_cases,1.0),nr_cases,1)
|
369
|
+
data_matrix = data_matrix.horzcat(intercept)
|
370
|
+
(0..data_matrix.size2-2).each { |i|
|
371
|
+
autoscaler = OpenTox::Algorithm::Transform::AutoScale.new(data_matrix.col(i))
|
372
|
+
data_matrix.col(i)[0..data_matrix.size1-1] = autoscaler.scaled_values
|
373
|
+
}
|
374
|
+
|
375
|
+
# Detach query instance
|
376
|
+
n_prop = data_matrix.to_a
|
377
|
+
q_prop = n_prop.pop
|
378
|
+
nr_cases, nr_features = get_sizes n_prop
|
379
|
+
data_matrix = GSL::Matrix.alloc(n_prop.flatten, nr_cases, nr_features)
|
380
|
+
|
381
|
+
# model + support vectors
|
382
|
+
LOGGER.debug "Creating MLR model ..."
|
383
|
+
c, cov, chisq, status = GSL::MultiFit::wlinear(data_matrix, params[:sims].to_scale.to_gsl, params[:acts].to_scale.to_gsl)
|
384
|
+
GSL::MultiFit::linear_est(q_prop.to_scale.to_gsl, c, cov)[0]
|
385
|
+
rescue Exception => e
|
386
|
+
LOGGER.debug "#{e.class}: #{e.message}"
|
387
|
+
end
|
388
|
+
|
389
|
+
end
|
390
|
+
|
137
391
|
# Classification with majority vote from neighbors weighted by similarity
|
138
|
-
# @param [
|
139
|
-
# @
|
140
|
-
|
141
|
-
|
142
|
-
|
392
|
+
# @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
|
393
|
+
# @return [Hash] Keys `:prediction` (nil when there are no neighbors) and `:confidence` (absolute value).
|
394
|
+
def self.weighted_majority_vote(params)
|
395
|
+
|
396
|
+
neighbor_contribution = 0.0
|
397
|
+
confidence_sum = 0.0
|
143
398
|
confidence = 0.0
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
399
|
+
prediction = nil
|
400
|
+
|
401
|
+
params[:neighbors].each do |neighbor|
|
402
|
+
neighbor_weight = Algorithm.gauss(neighbor[:similarity]).to_f
|
403
|
+
neighbor_contribution += neighbor[:activity].to_f * neighbor_weight
|
404
|
+
|
405
|
+
if params[:value_map].size == 2 # AM: provide compat to binary classification: 1=>false 2=>true
|
406
|
+
case neighbor[:activity]
|
407
|
+
when 1
|
408
|
+
confidence_sum -= neighbor_weight
|
409
|
+
when 2
|
410
|
+
confidence_sum += neighbor_weight
|
411
|
+
end
|
412
|
+
else
|
413
|
+
confidence_sum += neighbor_weight
|
150
414
|
end
|
151
415
|
end
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
416
|
+
|
417
|
+
if params[:value_map].size == 2
|
418
|
+
if confidence_sum >= 0.0
|
419
|
+
prediction = 2 unless params[:neighbors].size==0
|
420
|
+
elsif confidence_sum < 0.0
|
421
|
+
prediction = 1 unless params[:neighbors].size==0
|
422
|
+
end
|
423
|
+
else
|
424
|
+
prediction = (neighbor_contribution/confidence_sum).round unless params[:neighbors].size==0 # AM: new multinomial prediction
|
425
|
+
end
|
426
|
+
LOGGER.debug "Prediction is: '" + prediction.to_s + "'." unless prediction.nil?
|
427
|
+
confidence = confidence_sum/params[:neighbors].size if params[:neighbors].size > 0
|
428
|
+
LOGGER.debug "Confidence is: '" + confidence.to_s + "'." unless prediction.nil?
|
429
|
+
return {:prediction => prediction, :confidence => confidence.abs}
|
161
430
|
end
|
162
431
|
|
163
432
|
# Local support vector regression from neighbors
|
164
|
-
# @param [
|
165
|
-
# @
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
433
|
+
# @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
|
434
|
+
# @return [Hash] Keys `:prediction` and `:confidence` (confidence is nil when no prediction could be made).
|
435
|
+
def self.local_svm_regression(params)
|
436
|
+
|
437
|
+
confidence = 0.0
|
438
|
+
prediction = nil
|
439
|
+
if params[:neighbors].size>0
|
440
|
+
props = params[:prop_kernel] ? get_props(params) : nil
|
441
|
+
acts = params[:neighbors].collect{ |n| n[:activity].to_f }
|
442
|
+
sims = params[:neighbors].collect{ |n| Algorithm.gauss(n[:similarity]) }
|
443
|
+
prediction = props.nil? ? local_svm(acts, sims, "nu-svr", params) : local_svm_prop(props, acts, "nu-svr")
|
444
|
+
transformer = eval("OpenTox::Algorithm::Transform::#{params[:transform]["class"]}.new ([#{prediction}], #{params[:transform]["offset"]})")
|
445
|
+
prediction = transformer.values[0]
|
446
|
+
prediction = nil if prediction.infinite? || params[:prediction_min_max][1] < prediction || params[:prediction_min_max][0] > prediction
|
447
|
+
LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
|
448
|
+
params[:conf_stdev] = false if params[:conf_stdev].nil?
|
449
|
+
confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]})
|
450
|
+
confidence = nil if prediction.nil?
|
177
451
|
end
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
452
|
+
{:prediction => prediction, :confidence => confidence}
|
453
|
+
|
454
|
+
end
|
455
|
+
|
456
|
+
# Local support vector classification from neighbors
|
457
|
+
# @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
|
458
|
+
# @return [Hash] Keys `:prediction` and `:confidence`.
|
459
|
+
def self.local_svm_classification(params)
|
182
460
|
|
183
|
-
|
461
|
+
confidence = 0.0
|
462
|
+
prediction = nil
|
463
|
+
if params[:neighbors].size>0
|
464
|
+
props = params[:prop_kernel] ? get_props(params) : nil
|
465
|
+
acts = params[:neighbors].collect { |n| act = n[:activity] }
|
466
|
+
sims = params[:neighbors].collect{ |n| Algorithm.gauss(n[:similarity]) } # similarity values btwn q and nbors
|
467
|
+
prediction = props.nil? ? local_svm(acts, sims, "C-bsvc", params) : local_svm_prop(props, acts, "C-bsvc")
|
468
|
+
LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
|
469
|
+
params[:conf_stdev] = false if params[:conf_stdev].nil?
|
470
|
+
confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]})
|
471
|
+
end
|
472
|
+
{:prediction => prediction, :confidence => confidence}
|
473
|
+
|
474
|
+
end
|
475
|
+
|
476
|
+
|
477
|
+
# Local support vector prediction from neighbors.
|
478
|
+
# Uses pre-defined Kernel Matrix.
|
479
|
+
# Not to be called directly (use local_svm_regression or local_svm_classification).
|
480
|
+
# @param [Array] acts, activities for neighbors.
|
481
|
+
# @param [Array] sims, similarities for neighbors.
|
482
|
+
# @param [String] type, one of "nu-svr" (regression) or "C-bsvc" (classification).
|
483
|
+
# @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
|
484
|
+
# @return [Numeric] A prediction value.
|
485
|
+
def self.local_svm(acts, sims, type, params)
|
486
|
+
LOGGER.debug "Local SVM (Weighted Tanimoto Kernel)."
|
487
|
+
neighbor_matches = params[:neighbors].collect{ |n| n[:features] } # URIs of matches
|
184
488
|
gram_matrix = [] # square matrix of similarities between neighbors; implements weighted tanimoto kernel
|
185
|
-
|
186
|
-
|
489
|
+
|
490
|
+
prediction = nil
|
491
|
+
if Algorithm::zero_variance? acts
|
492
|
+
prediction = acts[0]
|
187
493
|
else
|
188
494
|
# gram matrix
|
189
495
|
(0..(neighbor_matches.length-1)).each do |i|
|
496
|
+
neighbor_i_hits = params[:fingerprints][params[:neighbors][i][:compound]]
|
190
497
|
gram_matrix[i] = [] unless gram_matrix[i]
|
191
498
|
# upper triangle
|
192
499
|
((i+1)..(neighbor_matches.length-1)).each do |j|
|
193
|
-
|
500
|
+
neighbor_j_hits= params[:fingerprints][params[:neighbors][j][:compound]]
|
501
|
+
sim_params = {}
|
502
|
+
if params[:nr_hits]
|
503
|
+
sim_params[:nr_hits] = true
|
504
|
+
sim_params[:compound_features_hits] = neighbor_i_hits
|
505
|
+
sim_params[:training_compound_features_hits] = neighbor_j_hits
|
506
|
+
end
|
507
|
+
sim = eval("#{params[:similarity_algorithm]}(neighbor_matches[i], neighbor_matches[j], params[:p_values], sim_params)")
|
194
508
|
gram_matrix[i][j] = Algorithm.gauss(sim)
|
195
509
|
gram_matrix[j] = [] unless gram_matrix[j]
|
196
510
|
gram_matrix[j][i] = gram_matrix[i][j] # lower triangle
|
@@ -198,6 +512,7 @@ module OpenTox
|
|
198
512
|
gram_matrix[i][i] = 1.0
|
199
513
|
end
|
200
514
|
|
515
|
+
|
201
516
|
#LOGGER.debug gram_matrix.to_yaml
|
202
517
|
@r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests
|
203
518
|
@r.eval "library('kernlab')" # this requires R package "kernlab" to be installed
|
@@ -208,27 +523,171 @@ module OpenTox
|
|
208
523
|
@r.y = acts
|
209
524
|
@r.sims = sims
|
210
525
|
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
526
|
+
begin
|
527
|
+
LOGGER.debug "Preparing R data ..."
|
528
|
+
# prepare data
|
529
|
+
@r.eval "y<-as.vector(y)"
|
530
|
+
@r.eval "gram_matrix<-as.kernelMatrix(matrix(gram_matrix,n,n))"
|
531
|
+
@r.eval "sims<-as.vector(sims)"
|
532
|
+
|
533
|
+
# model + support vectors
|
534
|
+
LOGGER.debug "Creating SVM model ..."
|
535
|
+
@r.eval "model<-ksvm(gram_matrix, y, kernel=matrix, type=\"#{type}\", nu=0.5)"
|
536
|
+
@r.eval "sv<-as.vector(SVindex(model))"
|
537
|
+
@r.eval "sims<-sims[sv]"
|
538
|
+
@r.eval "sims<-as.kernelMatrix(matrix(sims,1))"
|
539
|
+
LOGGER.debug "Predicting ..."
|
540
|
+
if type == "nu-svr"
|
541
|
+
@r.eval "p<-predict(model,sims)[1,1]"
|
542
|
+
elsif type == "C-bsvc"
|
543
|
+
@r.eval "p<-predict(model,sims)"
|
544
|
+
end
|
545
|
+
if type == "nu-svr"
|
546
|
+
prediction = @r.p
|
547
|
+
elsif type == "C-bsvc"
|
548
|
+
#prediction = (@r.p.to_f == 1.0 ? true : false)
|
549
|
+
prediction = @r.p
|
550
|
+
end
|
551
|
+
@r.quit # free R
|
552
|
+
rescue Exception => e
|
553
|
+
LOGGER.debug "#{e.class}: #{e.message}"
|
554
|
+
LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
|
555
|
+
end
|
556
|
+
|
228
557
|
end
|
229
|
-
|
230
|
-
|
231
|
-
|
558
|
+
prediction
|
559
|
+
end
|
560
|
+
|
561
|
+
# Local support vector prediction from neighbors.
|
562
|
+
# Uses propositionalized setting.
|
563
|
+
# Not to be called directly (use local_svm_regression or local_svm_classification).
|
564
|
+
# @param [Array] props, propositionalization of neighbors and query structure e.g. [ two-nested-Arrays_for_n, Array_for_q ]
|
565
|
+
# @param [Array] acts, activities for neighbors.
|
566
|
+
# @param [String] type, one of "nu-svr" (regression) or "C-bsvc" (classification).
|
567
|
+
# @return [Numeric] A prediction value.
|
568
|
+
def self.local_svm_prop(props, acts, type)
|
569
|
+
|
570
|
+
LOGGER.debug "Local SVM (Propositionalization / Kernlab Kernel)."
|
571
|
+
n_prop = props[0] # is a matrix, i.e. two nested Arrays.
|
572
|
+
q_prop = props[1] # is an Array.
|
573
|
+
|
574
|
+
prediction = nil
|
575
|
+
if Algorithm::zero_variance? acts
|
576
|
+
prediction = acts[0]
|
577
|
+
else
|
578
|
+
#LOGGER.debug gram_matrix.to_yaml
|
579
|
+
@r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests
|
580
|
+
@r.eval "library('kernlab')" # this requires R package "kernlab" to be installed
|
581
|
+
LOGGER.debug "Setting R data ..."
|
582
|
+
# set data
|
583
|
+
@r.n_prop = n_prop.flatten
|
584
|
+
@r.n_prop_x_size = n_prop.size
|
585
|
+
@r.n_prop_y_size = n_prop[0].size
|
586
|
+
@r.y = acts
|
587
|
+
@r.q_prop = q_prop
|
588
|
+
|
589
|
+
begin
|
590
|
+
LOGGER.debug "Preparing R data ..."
|
591
|
+
# prepare data
|
592
|
+
@r.eval "y<-matrix(y)"
|
593
|
+
@r.eval "prop_matrix<-matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=TRUE)"
|
594
|
+
@r.eval "q_prop<-matrix(q_prop, 1, n_prop_y_size, byrow=TRUE)"
|
595
|
+
|
596
|
+
# model + support vectors
|
597
|
+
LOGGER.debug "Creating SVM model ..."
|
598
|
+
@r.eval "model<-ksvm(prop_matrix, y, type=\"#{type}\", nu=0.5)"
|
599
|
+
LOGGER.debug "Predicting ..."
|
600
|
+
if type == "nu-svr"
|
601
|
+
@r.eval "p<-predict(model,q_prop)[1,1]"
|
602
|
+
elsif type == "C-bsvc"
|
603
|
+
@r.eval "p<-predict(model,q_prop)"
|
604
|
+
end
|
605
|
+
if type == "nu-svr"
|
606
|
+
prediction = @r.p
|
607
|
+
elsif type == "C-bsvc"
|
608
|
+
#prediction = (@r.p.to_f == 1.0 ? true : false)
|
609
|
+
prediction = @r.p
|
610
|
+
end
|
611
|
+
@r.quit # free R
|
612
|
+
rescue Exception => e
|
613
|
+
LOGGER.debug "#{e.class}: #{e.message}"
|
614
|
+
LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
|
615
|
+
end
|
616
|
+
end
|
617
|
+
prediction
|
618
|
+
end
|
619
|
+
|
620
|
+
# Get confidence for regression, with standard deviation of neighbor activity if conf_stdev is set.
|
621
|
+
# @param[Hash] Required keys: :sims, :acts, :neighbors, :conf_stdev
|
622
|
+
# @return[Float] Confidence
|
623
|
+
def self.get_confidence(params)
|
624
|
+
if params[:conf_stdev]
|
625
|
+
sim_median = params[:sims].to_scale.median
|
626
|
+
if sim_median.nil?
|
627
|
+
confidence = nil
|
628
|
+
else
|
629
|
+
standard_deviation = params[:acts].to_scale.standard_deviation_sample
|
630
|
+
confidence = (sim_median*Math.exp(-1*standard_deviation)).abs
|
631
|
+
if confidence.nan?
|
632
|
+
confidence = nil
|
633
|
+
end
|
634
|
+
end
|
635
|
+
else
|
636
|
+
conf = params[:sims].inject{|sum,x| sum + x }
|
637
|
+
confidence = conf/params[:neighbors].size
|
638
|
+
end
|
639
|
+
LOGGER.debug "Confidence is: '" + confidence.to_s + "'."
|
640
|
+
return confidence
|
641
|
+
end
|
642
|
+
|
643
|
+
# Get X and Y size of a nested Array (Matrix).
# The number of cases is the size of the outer Array, the number of features
# is the size of its first row.
# @param[Array] Nested Array (array of rows).
# @return[Array] [ nr_cases, nr_features ]; an entry is nil if it could not be
#   determined (e.g. nr_features of an empty matrix). Such failures are logged
#   at debug level and not raised.
def self.get_sizes(matrix)
  nr_cases = nil
  nr_features = nil
  begin
    nr_cases = matrix.size
    nr_features = matrix[0].size
  rescue StandardError => e
    # Only ordinary errors are swallowed; Interrupt, SystemExit etc. propagate.
    LOGGER.debug "#{e.class}: #{e.message}"
    LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
  end
  #puts "NRC: #{nr_cases}, NRF: #{nr_features}"
  [ nr_cases, nr_features ]
end
|
655
|
+
|
656
|
+
# Calculate the propositionalization matrix aka instantiation matrix
# (one row per neighbor, one column per feature; 0.0 for absent features,
# otherwise p-value times the fingerprint entry of that feature).
# Same for the vector describing the query compound.
# @param[Hash] Keys read:
#   :neighbors    [Array] neighbors, each providing [:compound].
#   :compound     [OpenTox::Compound] query compound (must respond to match / match_hits).
#   :features     [Array] Dataset Features.
#   :fingerprints Fingerprints of neighbors, indexed by compound, then by feature.
#   :p_values     p-values of Features, indexed by feature.
#   :nr_hits      If set, the query entries are weighted by the number of hits.
# @return[Array] [ matrix, row ]. Errors are logged at debug level and not
#   raised; matrix may then be incomplete and row may be nil or incomplete.
def self.get_props(params)
  matrix = []
  row = nil
  begin
    params[:neighbors].each do |neighbor|
      compound = neighbor[:compound]
      fingerprint = params[:fingerprints][compound]
      neighbor_row = params[:features].collect do |f|
        if !fingerprint.nil? && fingerprint.include?(f)
          params[:p_values][f] * fingerprint[f]
        else
          0.0
        end
      end
      matrix << neighbor_row
    end
    row = []
    params[:features].each do |f|
      if params[:nr_hits]
        compound_feature_hits = params[:compound].match_hits([f])
        row << (compound_feature_hits.size == 0 ? 0.0 : (params[:p_values][f] * compound_feature_hits[f]))
      else
        row << (params[:compound].match([f]).size == 0 ? 0.0 : params[:p_values][f])
      end
    end
  rescue StandardError => e
    # Interpolate the exception: concatenating it to a String with + raises
    # TypeError, which would escape from this handler.
    LOGGER.debug "get_props failed with '#{e}'"
  end
  [ matrix, row ]
end
|
233
692
|
|
234
693
|
end
|
@@ -250,6 +709,195 @@ module OpenTox
|
|
250
709
|
# Stub without behavior: accepts a dataset URI and a compound URI, returns nil.
def features(dataset_uri, compound_uri)
  nil
end
|
252
711
|
end
|
712
|
+
|
713
|
+
module Transform
|
714
|
+
include Algorithm
|
715
|
+
|
716
|
+
# The transformer that inverts values.
# 1/x is used, after values have been moved >= 1.
class Inverter
  attr_accessor :offset, :values

  # With one argument the values are transformed (negate, slide, invert) and
  # the applied offset is stored in #offset. With two arguments previously
  # transformed values are restored using the given offset.
  # Errors during the transformation (e.g. empty values) are logged at debug
  # level and not raised; #values and #offset may then be unset.
  # @params[Array] Values to transform / restore.
  # @params[Float] Offset for restore.
  def initialize *args
    case args.size
    when 1
      begin
        values = args[0]
        # Check the argument, not @values: @values is still nil at this point.
        raise "Cannot transform, values empty." if values.size == 0
        @values = values.collect { |v| -1.0 * v }
        @offset = 1.0 - @values.minmax[0]
        @offset = -1.0 * @offset if @offset > 0.0
        @values.collect! { |v| v - @offset } # slide >1
        @values.collect! { |v| 1 / v }       # invert to [0,1]
      rescue StandardError => e
        LOGGER.debug "#{e.class}: #{e.message}"
        LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
      end
    when 2
      # Reverse order of the steps above: invert, slide back, negate.
      @offset = args[1].to_f
      @values = args[0].collect { |v| 1 / v }
      @values.collect! { |v| v + @offset }
      @values.collect! { |v| -1.0 * v }
    end
  end
end
|
746
|
+
|
747
|
+
# The transformer that takes logs.
# Log10 is used, after values have been moved > 0.
class Log10
  attr_accessor :offset, :values

  # With one argument the values are transformed (slide by the offset, add a
  # small distance to zero, take log10) and the applied offset is stored in
  # #offset. With two arguments previously transformed values are restored
  # using the given offset.
  # Errors during the transformation (e.g. empty values) are logged at debug
  # level and not raised; #values and #offset may then be unset or incomplete.
  # @params[Array] Values to transform / restore.
  # @params[Float] Offset for restore.
  def initialize *args
    @distance_to_zero = 0.000000001 # 1 / 1 billion
    case args.size
    when 1
      begin
        values = args[0]
        raise "Cannot transform, values empty." if values.size == 0
        @offset = values.minmax[0]
        @offset = -1.0 * @offset if @offset > 0.0
        @values = values.collect { |v| v - @offset }   # slide > anchor
        @values.collect! { |v| v + @distance_to_zero } # keep strictly above zero
        @values.collect! { |v| Math::log10 v }         # log10 (can fail)
      rescue StandardError => e
        # Only ordinary errors are swallowed; Interrupt, SystemExit etc. propagate.
        LOGGER.debug "#{e.class}: #{e.message}"
        LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
      end
    when 2
      # Reverse order of the steps above.
      @offset = args[1].to_f
      @values = args[0].collect { |v| 10**v }
      @values.collect! { |v| v - @distance_to_zero }
      @values.collect! { |v| v + @offset }
    end
  end
end
|
778
|
+
|
779
|
+
# The transformer that does nothing (No OPeration).
class NOP
  attr_accessor :offset, :values

  # Stores the given values unchanged; the offset is always 0.0.
  # @params[Array] Values to transform / restore (kept as they are).
  # @params[Float] Offset for restore (accepted, but not used).
  def initialize *args
    @distance_to_zero = 0.0
    @offset = 0.0
    # Values are only taken over for the one- and two-argument forms.
    @values = args[0] if [1, 2].include?(args.size)
  end
end
|
796
|
+
|
797
|
+
|
798
|
+
# Auto-Scaler for Arrays
# Centers the values on their mean and divides them by their sample standard
# deviation (the division is skipped if the standard deviation is 0.0).
class AutoScale
  attr_accessor :scaled_values, :mean, :stdev

  # Computes #mean and #stdev of the values and stores the scaled copy in
  # #scaled_values.
  # @params[Array] Values to transform.
  def initialize values
    @mean = values.to_scale.mean
    @stdev = values.to_scale.standard_deviation_sample
    centered = values.collect { |vi| vi - @mean }
    centered.collect! { |vi| vi / @stdev } if @stdev != 0.0
    @scaled_values = centered
  end
end
|
812
|
+
|
813
|
+
# Principal Components Analysis
# Statsample Library (http://ruby-statsample.rubyforge.org/) by C. Bustos
class PCA
  attr_accessor :data_matrix, :data_transformed_matrix, :eigenvector_matrix, :eigenvalue_sums, :autoscaler

  # Creates a transformed dataset as GSL::Matrix (see #data_transformed_matrix).
  # Steps: drop zero-variance columns, autoscale the remaining columns, run a
  # PCA on their correlation matrix, keep the leading eigenvectors and project
  # the scaled data onto them.
  # Errors (e.g. fewer than two usable dimensions) are logged at debug level
  # and not raised; the result attributes may then be unset.
  # @param [GSL::Matrix] Data matrix.
  # @param [Float] Compression ratio from [0,1].
  def initialize data_matrix, compression=0.05
    begin
      @data_matrix = data_matrix
      @compression = compression.to_f
      @stdev = Array.new
      @mean = Array.new

      # Objective Feature Selection
      raise "Error! PCA needs at least two dimensions." if data_matrix.size2 < 2
      @data_matrix_selected = nil
      (0..@data_matrix.size2-1).each { |i|
        if !Algorithm::zero_variance?(@data_matrix.col(i).to_a)
          if @data_matrix_selected.nil?
            @data_matrix_selected = GSL::Matrix.alloc(@data_matrix.size1, 1)
            @data_matrix_selected.col(0)[0..@data_matrix.size1-1] = @data_matrix.col(i)
          else
            @data_matrix_selected = @data_matrix_selected.horzcat(GSL::Matrix.alloc(@data_matrix.col(i).to_a,@data_matrix.size1, 1))
          end
        end
      }
      raise "Error! PCA needs at least two dimensions." if (@data_matrix_selected.nil? || @data_matrix_selected.size2 < 2)

      # Scaling of Axes
      # Mean and stdev of every selected column are remembered for #restore.
      @data_matrix_scaled = GSL::Matrix.alloc(@data_matrix_selected.size1, @data_matrix_selected.size2)
      (0..@data_matrix_selected.size2-1).each { |i|
        @autoscaler = OpenTox::Algorithm::Transform::AutoScale.new(@data_matrix_selected.col(i))
        @data_matrix_scaled.col(i)[0..@data_matrix.size1-1] = @autoscaler.scaled_values
        @stdev << @autoscaler.stdev
        @mean << @autoscaler.mean
      }

      data_matrix_hash = Hash.new
      (0..@data_matrix_scaled.size2-1).each { |i|
        column_view = @data_matrix_scaled.col(i)
        data_matrix_hash[i] = column_view.to_scale
      }
      dataset_hash = data_matrix_hash.to_dataset # see http://goo.gl/7XcW9
      cor_matrix=Statsample::Bivariate.correlation_matrix(dataset_hash)
      pca=Statsample::Factor::PCA.new(cor_matrix)
      pca.eigenvalues.each { |ev| raise "PCA failed!" if ev.nan? }
      # Cumulative sums of the eigenvalues, one entry per field.
      @eigenvalue_sums = Array.new
      (0..dataset_hash.fields.size-1).each { |i|
        @eigenvalue_sums << pca.eigenvalues[0..i].inject{ |sum, ev| sum + ev }
      }
      # Keep eigenvectors while the cumulative eigenvalue sum stays within
      # (1 - compression) * number of fields; always keep at least one.
      eigenvectors_selected = Array.new
      pca.eigenvectors.each_with_index { |ev, i|
        if (@eigenvalue_sums[i] <= ((1.0-@compression)*dataset_hash.fields.size)) || (eigenvectors_selected.size == 0)
          eigenvectors_selected << ev.to_a
        end
      }
      @eigenvector_matrix = GSL::Matrix.alloc(eigenvectors_selected.flatten, eigenvectors_selected.size, dataset_hash.fields.size).transpose
      dataset_matrix = dataset_hash.to_gsl.transpose
      @data_transformed_matrix = (@eigenvector_matrix.transpose * dataset_matrix).transpose
    rescue StandardError => e
      # Only ordinary errors are swallowed; Interrupt, SystemExit etc. propagate.
      LOGGER.debug "#{e.class}: #{e.message}"
      LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
    end
  end

  # Restores data in the original feature space (possibly with compression loss).
  # The projection is reversed first, then the autoscaling of every column.
  # Errors are logged at debug level and not raised.
  # @return [GSL::Matrix] Data matrix.
  def restore
    begin
      data_matrix_restored = (@eigenvector_matrix * @data_transformed_matrix.transpose).transpose # reverse pca
      # reverse scaling
      (0..data_matrix_restored.size2-1).each { |i|
        data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] *= @stdev[i] unless @stdev[i] == 0.0
        data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] += @mean[i]
      }
      data_matrix_restored
    rescue StandardError => e
      # Only ordinary errors are swallowed; Interrupt, SystemExit etc. propagate.
      LOGGER.debug "#{e.class}: #{e.message}"
      LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
    end
  end

end
|
899
|
+
|
900
|
+
end
|
253
901
|
|
254
902
|
# Gauss kernel
|
255
903
|
# @return [Float]
|
@@ -257,16 +905,85 @@ module OpenTox
|
|
257
905
|
d = 1.0 - x.to_f
|
258
906
|
Math.exp(-(d*d)/(2*sigma*sigma))
|
259
907
|
end
|
908
|
+
|
909
|
+
# For symbolic features
# @param [Array] Array to test, must indicate non-occurrence with 0.
# @return [Boolean] Whether the feature is singular or non-occurring or present everywhere.
def self.isnull_or_singular?(array)
  zero_count = array.count(0)
  total = array.size
  # total zeroes     -> non-occurring feature
  # total - 1 zeroes -> singular feature
  # no zeroes        -> feature present everywhere
  [total, total - 1, 0].include?(zero_count)
end
|
918
|
+
|
919
|
+
# Numeric value test
# A value counts as numeric if Kernel#Float can convert it.
# @param[Object] value
# @return [Boolean] Whether value is a number
def self.numeric?(value)
  Float(value)
  true
rescue ArgumentError, TypeError
  # Float() signals unparsable strings with ArgumentError and unconvertible
  # objects (e.g. nil) with TypeError; other errors are not hidden.
  false
end
|
925
|
+
|
926
|
+
# Variance test
# @param [Array] Array of values to test.
# @return [Boolean] Whether the sample variance of the values is exactly 0.0.
def self.zero_variance?(array)
  sample_variance = array.to_scale.variance_sample
  sample_variance == 0.0
end
|
260
932
|
|
261
|
-
#
|
933
|
+
# Sum of the sizes of the elements of an Array.
# @param [Array] Array with values (each responding to size)
# @return [Integer] Sum of size of values
def self.sum_size(array)
  array.inject(0) { |total, element| total + element.size }
end
|
941
|
+
|
942
|
+
# Minimum Frequency
# Scales the per-mil value by the number of compounds of the training dataset.
# @param training_dataset Dataset providing the compounds to count
# @param [Integer] per-mil value
# @return [Integer] min-frequency, at least 2
def self.min_frequency(training_dataset, per_mil)
  nr_compounds = training_dataset.compounds.size.to_f
  scaled = per_mil * nr_compounds / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
  scaled > 2 ? Integer(scaled) : 2
end
|
270
950
|
|
951
|
+
# Effect calculation for classification
# Compares, per class, the share of occurrences with the share of database
# instances and picks the class with the largest relative surplus.
# @param [Array] Array of occurrences per class in the form of Enumerables.
# @param [Array] Array of database instance counts per class.
# @return [Integer] Index of the class with the largest surplus (0 if no class has one).
def self.effect(occurrences, db_instances)
  best_index = 0
  best_surplus = 0
  total_occurrences = self.sum_size(occurrences)
  total_db = db_instances.to_scale.sum

  occurrences.each_with_index do |members, idx| # fminer outputs occurrences sorted reverse by activity.
    observed = members.size.to_f / total_occurrences
    anticipated = db_instances[idx].to_f / total_db
    next unless observed > anticipated
    surplus = (observed - anticipated) / observed
    if surplus > best_surplus # 'Schleppzeiger' (drag pointer): remember the maximum seen so far
      best_surplus = surplus
      best_index = idx
    end
  end
  best_index
end
|
972
|
+
|
973
|
+
# Returns Support value of a fingerprint
# For every feature, the gauss-weighted feature weight is multiplied with the
# hits of the query compound and of the training compound, combined by the
# method named in :mode (e.g. "min" or "max"); missing (nil) hits are ignored.
# @param [Hash] params Keys: `:compound_features_hits, :weights, :training_compound_features_hits, :features, :mode` are read
# @return [Numeric] Support value
def self.p_sum_support(params)
  p_sum = 0.0
  params[:features].each{|f|
    compound_hits = params[:compound_features_hits][f]
    neighbor_hits = params[:training_compound_features_hits][f]
    # Dispatch on the method name instead of eval'ing a string built from
    # params[:mode], so that :mode can only select a method, not run code.
    combined_hits = [compound_hits, neighbor_hits].compact.send(params[:mode])
    p_sum += Algorithm.gauss(params[:weights][f]) * combined_hits
  }
  p_sum
end
|
985
|
+
|
271
986
|
end
|
272
987
|
end
|
988
|
+
|
989
|
+
|