opentox-ruby 2.0.1 → 2.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile CHANGED
@@ -8,53 +8,46 @@ begin
8
8
  gem.summary = %Q{Ruby wrapper for the OpenTox REST API}
9
9
  gem.description = %Q{Ruby wrapper for the OpenTox REST API (http://www.opentox.org)}
10
10
  gem.email = "helma@in-silico.ch"
11
- gem.homepage = "http://github.com/helma/opentox-ruby"
11
+ gem.homepage = "http://github.com/opentox/opentox-ruby"
12
12
  gem.authors = ["Christoph Helma, Martin Guetlein, Andreas Maunz, Micha Rautenberg, David Vorgrimmler"]
13
- # dependencies
14
- [ "sinatra",
15
- "emk-sinatra-url-for",
16
- "sinatra-respond_to",
17
- "sinatra-static-assets",
18
- "rest-client",
19
- "rack",
20
- "rack-contrib",
21
- "rack-flash",
22
- "nokogiri",
23
- "rubyzip",
24
- "roo",
25
- "spreadsheet",
26
- "google-spreadsheet-ruby",
27
- "yajl-ruby",
28
- "tmail",
29
- "rinruby",
30
- "ohm",
31
- "ohm-contrib",
32
- "SystemTimer",
33
- "rjb",
34
- #valiation-gems
35
- "dm-core",
36
- "dm-serializer",
37
- "dm-timestamps",
38
- "dm-types",
39
- "dm-migrations",
40
- "dm-validations",
41
- "dm-sqlite-adapter"
42
- ].each { |dep| gem.add_dependency dep }
43
- =begin
44
- [ "dm-core",
45
- 'dm-serializer',
46
- 'dm-timestamps',
47
- 'dm-types',
48
- 'dm-migrations',
49
- "dm-mysql-adapter",
50
- "dm-validations",
51
- ].each {|dep| gem.add_dependency dep, ">= 1" }
52
- =end
53
- #valiation-gem
54
- gem.add_dependency "haml", ">=3"
55
- # validation-gems
56
- gem.add_dependency "ruby-plot", "~>0.4.0"
57
- ['jeweler'].each { |dep| gem.add_development_dependency dep }
13
+ # dependencies with versions
14
+ gem.add_dependency "sinatra", "=1.2.6"
15
+ gem.add_dependency "emk-sinatra-url-for", "=0.2.1"
16
+ gem.add_dependency "sinatra-respond_to", "=0.7.0"
17
+ gem.add_dependency "sinatra-static-assets", "=0.5.0"
18
+ gem.add_dependency "rest-client", "=1.6.1"
19
+ gem.add_dependency "rack", "=1.3.1"
20
+ gem.add_dependency "rack-contrib", "=1.1.0"
21
+ gem.add_dependency "rack-flash", "=0.1.1"
22
+ gem.add_dependency "nokogiri", "=1.4.4"
23
+ gem.add_dependency "rubyzip", "=0.9.4"
24
+ gem.add_dependency "roo", "=1.9.3"
25
+ gem.add_dependency "spreadsheet", "=0.6.5.4"
26
+ gem.add_dependency "google-spreadsheet-ruby", "=0.1.5"
27
+ gem.add_dependency "yajl-ruby", "=0.8.2"
28
+ #gem.add_dependency "mail", "=2.3.0"
29
+ gem.add_dependency "rinruby", "=2.0.2"
30
+ gem.add_dependency "ohm", "=0.1.3"
31
+ gem.add_dependency "ohm-contrib", "=0.1.1"
32
+ gem.add_dependency "SystemTimer", "=1.2.3"
33
+ gem.add_dependency "rjb", "=1.3.4"
34
+ gem.add_dependency "haml", "=3.1.1"
35
+ # for headless browser tests
36
+ gem.add_dependency "akephalos", "=0.2.5"
37
+ # validation-gems
38
+ gem.add_dependency "dm-core", "=1.1.0"
39
+ gem.add_dependency "dm-serializer", "=1.1.0"
40
+ gem.add_dependency "dm-timestamps", "=1.1.0"
41
+ gem.add_dependency "dm-types", "=1.1.0"
42
+ gem.add_dependency "dm-migrations", "=1.1.0"
43
+ gem.add_dependency "dm-validations", "=1.1.0"
44
+ gem.add_dependency "dm-sqlite-adapter", "=1.1.0"
45
+ gem.add_dependency "ruby-plot", "=0.5.0"
46
+ gem.add_dependency "gsl", "=1.14.7"
47
+ gem.add_dependency "statsample", "=1.1.0"
48
+ #gem.add_dependency "statsample-optimization", "=2.1.0"
49
+
50
+ gem.add_development_dependency 'jeweler'
58
51
  gem.files = FileList["[A-Z]*", "{bin,generators,lib,test}/**/*", 'lib/jeweler/templates/.gitignore']
59
52
  end
60
53
  Jeweler::GemcutterTasks.new
data/VERSION CHANGED
@@ -1 +1 @@
1
- 2.0.1
1
+ 2.1.0
@@ -3,6 +3,8 @@
3
3
  # avoids compiling R with X
4
4
  R = nil
5
5
  require "rinruby"
6
+ require "statsample"
7
+ require 'uri'
6
8
 
7
9
  module OpenTox
8
10
 
@@ -16,6 +18,7 @@ module OpenTox
16
18
  # @param [optional,OpenTox::Task] waiting_task (can be an OpenTox::Subtask as well), progress is updated accordingly
17
19
  # @return [String] URI of new resource (dataset, model, ...)
18
20
  def run(params=nil, waiting_task=nil)
21
+ LOGGER.info "Running algorithm '"+@uri.to_s+"' with params: "+params.inspect
19
22
  RestClientWrapper.post(@uri, params, {:accept => 'text/uri-list'}, waiting_task).to_s
20
23
  end
21
24
 
@@ -45,12 +48,75 @@ module OpenTox
45
48
  end
46
49
 
47
50
  # Fminer algorithms (https://github.com/amaunz/fminer2)
48
- module Fminer
51
+ class Fminer
49
52
  include Algorithm
53
+ attr_accessor :prediction_feature, :training_dataset, :minfreq, :compounds, :db_class_sizes, :all_activities, :smi
54
+
55
# Validate the fminer parameters and load the objects they refer to.
# Sets @prediction_feature, @training_dataset and @minfreq.
# @param [Hash] params Keys `:dataset_uri, :prediction_feature` are required, `:min_frequency` is optional
# @param [Numeric] per_mil Handed to OpenTox::Algorithm.min_frequency when no `:min_frequency` is given
# @param [optional,String] subjectid Passed through to the Feature and Dataset lookups
def check_params(params,per_mil,subjectid=nil)
  dataset_uri = params[:dataset_uri]
  feature_uri = params[:prediction_feature]
  raise OpenTox::NotFoundError.new("Please submit a dataset_uri.") unless dataset_uri
  raise OpenTox::NotFoundError.new("Please submit a prediction_feature.") unless feature_uri

  @prediction_feature = OpenTox::Feature.find feature_uri, subjectid
  @training_dataset = OpenTox::Dataset.find dataset_uri.to_s, subjectid

  # The prediction feature has to be one of the features of the training dataset.
  dataset_features = @training_dataset.features
  unless dataset_features and dataset_features.include?(feature_uri)
    raise OpenTox::NotFoundError.new("No feature #{feature_uri} in dataset #{dataset_uri}")
  end

  if params[:min_frequency].nil?
    # No explicit value: derive it from the dataset.
    # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
    @minfreq = OpenTox::Algorithm.min_frequency(@training_dataset,per_mil)
  else
    @minfreq = params[:min_frequency].to_i
    raise "Minimum frequency must be a number >0!" unless @minfreq>0
  end
end
69
+
70
# Feed every usable compound/activity pair of @training_dataset into fminer.
# Compounds without a resolvable SMILES and nil activities are skipped with a warning.
# Successfully added pairs are recorded in @all_activities, @compounds and @smi under
# the id that was handed to fminer; classification activities also update @db_class_sizes.
# @param [Object] fminer_instance Object responding to AddCompound and AddActivity
# @param [Hash] params An optional `:value_map` entry overrides the value_map argument
# @param [Hash] value_map Its inverse is used to look up the class index of an activity value
def add_fminer_data(fminer_instance, params, value_map)

  next_id = 1 # fminer start id is not 0
  @training_dataset.data_entries.each do |compound,entry|
    compound_uri = compound.to_s
    begin
      smiles = OpenTox::Compound.smiles(compound_uri)
    rescue
      LOGGER.warn "No resource for #{compound_uri}"
      next
    end
    if smiles.nil? or smiles == ''
      LOGGER.warn "Cannot find smiles for #{compound_uri}."
      next
    end

    value_map = params[:value_map] unless params[:value_map].nil?
    entry.each do |feature,values|
      # Only the values of the prediction feature are activities.
      next unless feature == @prediction_feature.uri

      values.each do |value|
        if value.nil?
          LOGGER.warn "No #{feature} activity for #{compound_uri}."
          next
        end

        case @prediction_feature.feature_type
        when "classification"
          activity = value_map.invert[value].to_i # activities are mapped to 1..n
          slot = activity-1
          # AM effect: count the members of each class
          if @db_class_sizes[slot].nil?
            @db_class_sizes[slot] = 1
          else
            @db_class_sizes[slot] += 1
          end
        when "regression"
          activity = value.to_f
        end

        begin
          fminer_instance.AddCompound(smiles,next_id)
          fminer_instance.AddActivity(activity, next_id)
          @all_activities[next_id] = activity # DV: insert global information
          @compounds[next_id] = compound
          @smi[next_id] = smiles
          next_id += 1
        rescue Exception => e
          LOGGER.warn "Could not add " + smiles + "\t" + value.to_s + " to fminer"
          LOGGER.warn e.backtrace
        end
      end
    end
  end
end
115
+
116
+ end
50
117
 
51
118
  # Backbone Refinement Class mining (http://bbrc.maunz.de/)
52
- class BBRC
53
- include Fminer
119
+ class BBRC < Fminer
54
120
  # Initialize bbrc algorithm
55
121
  def initialize(subjectid=nil)
56
122
  super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/bbrc")
@@ -59,8 +125,7 @@ module OpenTox
59
125
  end
60
126
 
61
127
  # LAtent STructure Pattern Mining (http://last-pm.maunz.de)
62
- class LAST
63
- include Fminer
128
+ class LAST < Fminer
64
129
  # Initialize last algorithm
65
130
  def initialize(subjectid=nil)
66
131
  super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/last")
@@ -68,7 +133,6 @@ module OpenTox
68
133
  end
69
134
  end
70
135
 
71
- end
72
136
 
73
137
  # Create lazar prediction model
74
138
  class Lazar
@@ -90,19 +154,34 @@ module OpenTox
90
154
  # @param [Array] features_a Features of first compound
91
155
  # @param [Array] features_b Features of second compound
92
156
  # @param [optional, Hash] weights Weights for all features
157
+ # @param [optional, Hash] params Keys: `:training_compound, :compound, :training_compound_features_hits, :nr_hits, :compound_features_hits` are required
93
158
  # @return [Float] (Weighted) tanimoto similarity
94
- def self.tanimoto(features_a,features_b,weights=nil)
159
+ def self.tanimoto(features_a,features_b,weights=nil,params=nil)
95
160
  common_features = features_a & features_b
96
161
  all_features = (features_a + features_b).uniq
97
- common_p_sum = 0.0
162
+ #LOGGER.debug "dv --------------- common: #{common_features}, all: #{all_features}"
98
163
  if common_features.size > 0
99
164
  if weights
100
- common_features.each{|f| common_p_sum += Algorithm.gauss(weights[f])}
101
- all_p_sum = 0.0
102
- all_features.each{|f| all_p_sum += Algorithm.gauss(weights[f])}
165
+ #LOGGER.debug "nr_hits: #{params[:nr_hits]}"
166
+ if !params.nil? && params[:nr_hits]
167
+ params[:weights] = weights
168
+ params[:mode] = "min"
169
+ params[:features] = common_features
170
+ common_p_sum = Algorithm.p_sum_support(params)
171
+ params[:mode] = "max"
172
+ params[:features] = all_features
173
+ all_p_sum = Algorithm.p_sum_support(params)
174
+ else
175
+ common_p_sum = 0.0
176
+ common_features.each{|f| common_p_sum += Algorithm.gauss(weights[f])}
177
+ all_p_sum = 0.0
178
+ all_features.each{|f| all_p_sum += Algorithm.gauss(weights[f])}
179
+ end
180
+ #LOGGER.debug "common_p_sum: #{common_p_sum}, all_p_sum: #{all_p_sum}, c/a: #{common_p_sum/all_p_sum}"
103
181
  common_p_sum/all_p_sum
104
182
  else
105
- common_features.to_f/all_features
183
+ #LOGGER.debug "common_features : #{common_features}, all_features: #{all_features}, c/a: #{(common_features.size/all_features.size).to_f}"
184
+ common_features.size.to_f/all_features.size.to_f
106
185
  end
107
186
  else
108
187
  0.0
@@ -132,65 +211,300 @@ module OpenTox
132
211
  end
133
212
  end
134
213
 
214
+ # Structural Graph Clustering by TU Munich
215
+ # Finds clusters similar to a query structure in a given training dataset
216
+ # May be queried for cluster membership of an unknown compound
217
# Client for the structural graph clustering service.
# Trains a cluster model on a dataset at construction time and can then be
# queried for the clusters a single compound belongs to.
class StructuralClustering
  attr_accessor :training_dataset_uri, :training_threshold, :query_dataset_uri, :query_threshold, :target_clusters_array

  # Train a cluster model for the given dataset.
  # @param [String] training_dataset_uri Training dataset URI
  # @param [optional,Float] training_threshold Similarity threshold for training, must be within 0..1
  # @param [optional,String] cluster_service_uri Cluster service URI (no AA)
  # @raise [RuntimeError] if a URI is malformed or the threshold is not a number within 0..1
  def initialize training_dataset_uri, training_threshold=0.8, cluster_service_uri = "http://opentox-dev.informatik.tu-muenchen.de:8080/OpenTox/algorithm/StructuralClustering"

    if (training_dataset_uri =~ URI::regexp).nil? || (cluster_service_uri =~ URI::regexp).nil?
      raise "Invalid URI."
    end
    @training_dataset_uri = training_dataset_uri
    raise "Training threshold out of bounds." unless threshold_in_bounds?(training_threshold)
    @training_threshold = training_threshold.to_f

    # Train a cluster model
    params = {:dataset_uri => @training_dataset_uri, :threshold => @training_threshold }
    @cluster_model_uri = OpenTox::RestClientWrapper.post cluster_service_uri, params
    cluster_model_rdf = OpenTox::RestClientWrapper.get @cluster_model_uri
    @datasets = OpenTox::Parser::Owl.from_rdf cluster_model_rdf, OT.Dataset, true # must extract OT.Datasets from model

    # Process parsed OWL objects: map cluster id => dataset URI
    @clusterid_dataset_map = Hash.new
    @datasets.each { |d|
      begin
        # Strip the textual prefix so that only the cluster id remains (not elegant).
        # String#[]= raises when the prefix is absent, which is how unrelated entries are skipped.
        d.metadata[OT.hasSource]["Structural Clustering cluster "] = ""
        @clusterid_dataset_map[d.metadata[OT.hasSource].to_i] = d.uri
      rescue StandardError
        # ignore other entries!
      end
    }
  end

  # Whether a model has been trained
  def trained?
    !@cluster_model_uri.nil?
  end

  # Instance query: clusters for a compound.
  # The matching cluster dataset URIs are stored in target_clusters_array.
  # NOTE(review): the return value is the result of the final `each`, not
  # target_clusters_array; kept unchanged for compatibility, confirm against callers.
  # @param [String] query_compound_uri Query compound
  # @param [optional,Float] query_threshold Similarity threshold for query to clusters, must be within 0..1
  # @raise [RuntimeError] if the threshold is invalid or the service does not return exactly one compound
  def get_clusters query_compound_uri, query_threshold = 0.5

    raise "Query threshold out of bounds." unless threshold_in_bounds?(query_threshold)
    @query_threshold = query_threshold.to_f

    # Preparing a query dataset
    query_dataset = OpenTox::Dataset.new
    @query_dataset_uri = query_dataset.save
    query_dataset = OpenTox::Dataset.find @query_dataset_uri
    query_dataset.add_compound query_compound_uri
    @query_dataset_uri = query_dataset.save

    # Obtaining a clustering for query compound
    params = { :dataset_uri => @query_dataset_uri, :threshold => @query_threshold }
    cluster_query_dataset_uri = OpenTox::RestClientWrapper.post @cluster_model_uri, params
    cluster_query_dataset = OpenTox::Dataset.new cluster_query_dataset_uri
    cluster_query_dataset.load_all

    # Reading cluster ids for features from metadata
    feature_clusterid_map = Hash.new
    pattern="Prediction feature for cluster assignment " # must parse for string in metadata (not elegant)
    cluster_query_dataset.features.each { |feature_uri,metadata|
      metadata[DC.title][pattern]=""
      feature_clusterid_map[feature_uri] = metadata[DC.title].to_i
    }

    # Integrity check
    unless cluster_query_dataset.compounds.size == 1
      raise "Number of predicted compounds is != 1."
    end

    # Process data entry
    query_compound_uri = cluster_query_dataset.compounds[0]
    @target_clusters_array = Array.new
    cluster_query_dataset.features.keys.each { |cluster_membership_feature|

      # Getting dataset URI for cluster
      target_cluster = feature_clusterid_map[cluster_membership_feature]
      dataset = @clusterid_dataset_map[target_cluster]

      # Finally look up presence
      data_entry = cluster_query_dataset.data_entries[query_compound_uri]
      present = data_entry[cluster_membership_feature][0]

      # Store result
      @target_clusters_array << dataset if present > 0.5 # 0.0 for absence, 1.0 for presence
    }
  end

  private

  # True when the threshold is numeric and lies within 0..1.
  # The explicit parentheses matter: without them the range comparisons were
  # swallowed into the argument of numeric? and never evaluated.
  def threshold_in_bounds?(threshold)
    return false unless OpenTox::Algorithm.numeric?(threshold)
    value = threshold.to_f
    value >= 0 && value <= 1
  end

end
313
+
135
314
  module Neighbors
136
315
 
316
+ # Local multi-linear regression (MLR) prediction from neighbors.
317
+ # Uses propositionalized setting.
318
+ # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
319
+ # @return [Numeric] A prediction value.
320
# Local multi-linear regression (MLR) prediction from neighbors, using the
# propositionalized feature matrix.
# @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:transform,:prediction_min_max` are used here,
#   plus whatever get_props reads; `:conf_stdev` is optional and defaults to false
# @return [Hash] Keys `:prediction, :confidence`; both are nil when no valid prediction could be made
def self.local_mlr_prop(params)

  confidence = 0.0
  prediction = nil

  if params[:neighbors].size > 0
    # MLR only works on the propositionalized matrix, so build it regardless of
    # :prop_kernel (previously an unset flag left props nil and crashed on props[0]).
    props = get_props(params)
    acts = params[:neighbors].collect { |n| n[:activity].to_f }
    sims = params[:neighbors].collect { |n| Algorithm.gauss(n[:similarity]) }
    LOGGER.debug "Local MLR (Propositionalization / GSL)."
    raw_prediction = mlr( {:n_prop => props[0], :q_prop => props[1], :sims => sims, :acts => acts} )

    # mlr reports failure with a non-numeric result; only back-transform real numbers.
    if raw_prediction.is_a?(Numeric)
      # SECURITY NOTE(review): eval on params[:transform]; safe only if the transform
      # description never comes from untrusted input. Consider Transform.const_get instead.
      # No space before "(" so the generated call also parses on Ruby >= 1.9.
      transformer = eval("OpenTox::Algorithm::Transform::#{params[:transform]["class"]}.new([#{raw_prediction}], #{params[:transform]["offset"]})")
      prediction = transformer.values[0] unless transformer.values.nil?
    end

    # Discard predictions that are infinite or outside the allowed range.
    unless prediction.nil?
      lower, upper = params[:prediction_min_max]
      prediction = nil if prediction.infinite? || upper < prediction || lower > prediction
    end
    LOGGER.debug "Prediction is: '" + prediction.to_s + "'."

    params[:conf_stdev] = false if params[:conf_stdev].nil?
    confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]})
    confidence = nil if prediction.nil?
  end
  {:prediction => prediction, :confidence => confidence}

end
342
+
343
+ # Multi-linear regression weighted by similarity.
344
+ # Objective Feature Selection, Principal Components Analysis, Scaling of Axes.
345
+ # @param [Hash] params Keys `:n_prop, :q_prop, :sims, :acts` are required
346
+ # @return [Numeric] A prediction value.
347
# Multi-linear regression weighted by similarity.
# Pipeline: append the query row to the neighbor matrix, apply PCA, add an
# intercept column, autoscale the non-intercept columns, detach the query row
# again, fit on the neighbors and estimate the value for the query row.
# @param [Hash] params Keys `:n_prop, :q_prop, :sims, :acts` are required
# @return [Numeric, nil] A prediction value, or nil when any step failed (the error is logged)
def self.mlr(params)

  # GSL matrix operations:
  # to_a : row-wise conversion to nested array
  #
  # Statsample operations (build on GSL):
  # to_scale: convert into Statsample format

  # Work on copies so the caller's arrays are not modified by the append below.
  n_prop = params[:n_prop].collect { |v| v }
  q_prop = params[:q_prop].collect { |v| v }
  n_prop << q_prop # attach q_prop so query and neighbors are transformed together
  nr_cases, nr_features = get_sizes n_prop
  data_matrix = GSL::Matrix.alloc(n_prop.flatten, nr_cases, nr_features)

  # Principal Components Analysis
  LOGGER.debug "PCA..."
  pca = OpenTox::Algorithm::Transform::PCA.new(data_matrix)
  data_matrix = pca.data_transformed_matrix

  # Attach intercept column to data
  intercept = GSL::Matrix.alloc(Array.new(nr_cases,1.0),nr_cases,1)
  data_matrix = data_matrix.horzcat(intercept)
  # Autoscale every column except the last one (the intercept).
  (0..data_matrix.size2-2).each { |i|
    autoscaler = OpenTox::Algorithm::Transform::AutoScale.new(data_matrix.col(i))
    data_matrix.col(i)[0..data_matrix.size1-1] = autoscaler.scaled_values
  }

  # Detach query instance (it was appended as the last row)
  n_prop = data_matrix.to_a
  q_prop = n_prop.pop
  nr_cases, nr_features = get_sizes n_prop
  data_matrix = GSL::Matrix.alloc(n_prop.flatten, nr_cases, nr_features)

  # Fit the model on the neighbors (similarities are passed in the weights
  # position of wlinear) and estimate the query value.
  LOGGER.debug "Creating MLR model ..."
  c, cov, _chisq, _status = GSL::MultiFit::wlinear(data_matrix, params[:sims].to_scale.to_gsl, params[:acts].to_scale.to_gsl)
  GSL::MultiFit::linear_est(q_prop.to_scale.to_gsl, c, cov)[0]
rescue StandardError => e
  LOGGER.debug "#{e.class}: #{e.message}"
  nil # explicit failure value; previously the logger's return value leaked to the caller

end
390
+
137
391
  # Classification with majority vote from neighbors weighted by similarity
138
- # @param [Array] neighbors, each neighbor is a hash with keys `:similarity, :activity`
139
- # @param [optional] params Ignored (only for compatibility with local_svm_regression)
140
- # @return [Hash] Hash with keys `:prediction, :confidence`
141
- def self.weighted_majority_vote(neighbors,params={})
142
- conf = 0.0
392
+ # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
393
+ # @return [Numeric] A prediction value.
394
+ def self.weighted_majority_vote(params)
395
+
396
+ neighbor_contribution = 0.0
397
+ confidence_sum = 0.0
143
398
  confidence = 0.0
144
- neighbors.each do |neighbor|
145
- case neighbor[:activity].to_s
146
- when 'true'
147
- conf += Algorithm.gauss(neighbor[:similarity])
148
- when 'false'
149
- conf -= Algorithm.gauss(neighbor[:similarity])
399
+ prediction = nil
400
+
401
+ params[:neighbors].each do |neighbor|
402
+ neighbor_weight = Algorithm.gauss(neighbor[:similarity]).to_f
403
+ neighbor_contribution += neighbor[:activity].to_f * neighbor_weight
404
+
405
+ if params[:value_map].size == 2 # AM: provide compat to binary classification: 1=>false 2=>true
406
+ case neighbor[:activity]
407
+ when 1
408
+ confidence_sum -= neighbor_weight
409
+ when 2
410
+ confidence_sum += neighbor_weight
411
+ end
412
+ else
413
+ confidence_sum += neighbor_weight
150
414
  end
151
415
  end
152
- if conf > 0.0
153
- prediction = true
154
- elsif conf < 0.0
155
- prediction = false
156
- else
157
- prediction = nil
158
- end
159
- confidence = conf/neighbors.size if neighbors.size > 0
160
- {:prediction => prediction, :confidence => confidence.abs}
416
+
417
+ if params[:value_map].size == 2
418
+ if confidence_sum >= 0.0
419
+ prediction = 2 unless params[:neighbors].size==0
420
+ elsif confidence_sum < 0.0
421
+ prediction = 1 unless params[:neighbors].size==0
422
+ end
423
+ else
424
+ prediction = (neighbor_contribution/confidence_sum).round unless params[:neighbors].size==0 # AM: new multinomial prediction
425
+ end
426
+ LOGGER.debug "Prediction is: '" + prediction.to_s + "'." unless prediction.nil?
427
+ confidence = confidence_sum/params[:neighbors].size if params[:neighbors].size > 0
428
+ LOGGER.debug "Confidence is: '" + confidence.to_s + "'." unless prediction.nil?
429
+ return {:prediction => prediction, :confidence => confidence.abs}
161
430
  end
162
431
 
163
432
  # Local support vector regression from neighbors
164
- # @param [Array] neighbors, each neighbor is a hash with keys `:similarity, :activity, :features`
165
- # @param [Hash] params Keys `:similarity_algorithm,:p_values` are required
166
- # @return [Hash] Hash with keys `:prediction, :confidence`
167
- def self.local_svm_regression(neighbors,params )
168
- sims = neighbors.collect{ |n| Algorithm.gauss(n[:similarity]) } # similarity values between query and neighbors
169
- conf = sims.inject{|sum,x| sum + x }
170
-
171
- # AM: Control log taking
172
- take_logs=true
173
- neighbors.each do |n|
174
- if (! n[:activity].nil?) && (n[:activity].to_f < 0.0)
175
- take_logs = false
176
- end
433
+ # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
434
+ # @return [Numeric] A prediction value.
435
+ def self.local_svm_regression(params)
436
+
437
+ confidence = 0.0
438
+ prediction = nil
439
+ if params[:neighbors].size>0
440
+ props = params[:prop_kernel] ? get_props(params) : nil
441
+ acts = params[:neighbors].collect{ |n| n[:activity].to_f }
442
+ sims = params[:neighbors].collect{ |n| Algorithm.gauss(n[:similarity]) }
443
+ prediction = props.nil? ? local_svm(acts, sims, "nu-svr", params) : local_svm_prop(props, acts, "nu-svr")
444
+ transformer = eval("OpenTox::Algorithm::Transform::#{params[:transform]["class"]}.new ([#{prediction}], #{params[:transform]["offset"]})")
445
+ prediction = transformer.values[0]
446
+ prediction = nil if prediction.infinite? || params[:prediction_min_max][1] < prediction || params[:prediction_min_max][0] > prediction
447
+ LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
448
+ params[:conf_stdev] = false if params[:conf_stdev].nil?
449
+ confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]})
450
+ confidence = nil if prediction.nil?
177
451
  end
178
- acts = neighbors.collect do |n|
179
- act = n[:activity]
180
- take_logs ? Math.log10(act.to_f) : act.to_f
181
- end # activities of neighbors for supervised learning
452
+ {:prediction => prediction, :confidence => confidence}
453
+
454
+ end
455
+
456
+ # Local support vector classification from neighbors
457
+ # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
458
+ # @return [Numeric] A prediction value.
459
+ def self.local_svm_classification(params)
182
460
 
183
- neighbor_matches = neighbors.collect{ |n| n[:features] } # as in classification: URIs of matches
461
+ confidence = 0.0
462
+ prediction = nil
463
+ if params[:neighbors].size>0
464
+ props = params[:prop_kernel] ? get_props(params) : nil
465
+ acts = params[:neighbors].collect { |n| act = n[:activity] }
466
+ sims = params[:neighbors].collect{ |n| Algorithm.gauss(n[:similarity]) } # similarity values btwn q and nbors
467
+ prediction = props.nil? ? local_svm(acts, sims, "C-bsvc", params) : local_svm_prop(props, acts, "C-bsvc")
468
+ LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
469
+ params[:conf_stdev] = false if params[:conf_stdev].nil?
470
+ confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]})
471
+ end
472
+ {:prediction => prediction, :confidence => confidence}
473
+
474
+ end
475
+
476
+
477
+ # Local support vector prediction from neighbors.
478
+ # Uses pre-defined Kernel Matrix.
479
+ # Not to be called directly (use local_svm_regression or local_svm_classification).
480
+ # @param [Array] acts, activities for neighbors.
481
+ # @param [Array] sims, similarities for neighbors.
482
+ # @param [String] type, one of "nu-svr" (regression) or "C-bsvc" (classification).
483
+ # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
484
+ # @return [Numeric] A prediction value.
485
+ def self.local_svm(acts, sims, type, params)
486
+ LOGGER.debug "Local SVM (Weighted Tanimoto Kernel)."
487
+ neighbor_matches = params[:neighbors].collect{ |n| n[:features] } # URIs of matches
184
488
  gram_matrix = [] # square matrix of similarities between neighbors; implements weighted tanimoto kernel
185
- if neighbor_matches.size == 0
186
- raise "No neighbors found"
489
+
490
+ prediction = nil
491
+ if Algorithm::zero_variance? acts
492
+ prediction = acts[0]
187
493
  else
188
494
  # gram matrix
189
495
  (0..(neighbor_matches.length-1)).each do |i|
496
+ neighbor_i_hits = params[:fingerprints][params[:neighbors][i][:compound]]
190
497
  gram_matrix[i] = [] unless gram_matrix[i]
191
498
  # upper triangle
192
499
  ((i+1)..(neighbor_matches.length-1)).each do |j|
193
- sim = eval("#{params[:similarity_algorithm]}(neighbor_matches[i], neighbor_matches[j], params[:p_values])")
500
+ neighbor_j_hits= params[:fingerprints][params[:neighbors][j][:compound]]
501
+ sim_params = {}
502
+ if params[:nr_hits]
503
+ sim_params[:nr_hits] = true
504
+ sim_params[:compound_features_hits] = neighbor_i_hits
505
+ sim_params[:training_compound_features_hits] = neighbor_j_hits
506
+ end
507
+ sim = eval("#{params[:similarity_algorithm]}(neighbor_matches[i], neighbor_matches[j], params[:p_values], sim_params)")
194
508
  gram_matrix[i][j] = Algorithm.gauss(sim)
195
509
  gram_matrix[j] = [] unless gram_matrix[j]
196
510
  gram_matrix[j][i] = gram_matrix[i][j] # lower triangle
@@ -198,6 +512,7 @@ module OpenTox
198
512
  gram_matrix[i][i] = 1.0
199
513
  end
200
514
 
515
+
201
516
  #LOGGER.debug gram_matrix.to_yaml
202
517
  @r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests
203
518
  @r.eval "library('kernlab')" # this requires R package "kernlab" to be installed
@@ -208,27 +523,171 @@ module OpenTox
208
523
  @r.y = acts
209
524
  @r.sims = sims
210
525
 
211
- LOGGER.debug "Preparing R data ..."
212
- # prepare data
213
- @r.eval "y<-as.vector(y)"
214
- @r.eval "gram_matrix<-as.kernelMatrix(matrix(gram_matrix,n,n))"
215
- @r.eval "sims<-as.vector(sims)"
216
-
217
- # model + support vectors
218
- LOGGER.debug "Creating SVM model ..."
219
- @r.eval "model<-ksvm(gram_matrix, y, kernel=matrix, type=\"nu-svr\", nu=0.8)"
220
- @r.eval "sv<-as.vector(SVindex(model))"
221
- @r.eval "sims<-sims[sv]"
222
- @r.eval "sims<-as.kernelMatrix(matrix(sims,1))"
223
- LOGGER.debug "Predicting ..."
224
- @r.eval "p<-predict(model,sims)[1,1]"
225
- prediction = 10**(@r.p.to_f) if take_logs
226
- LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
227
- @r.quit # free R
526
+ begin
527
+ LOGGER.debug "Preparing R data ..."
528
+ # prepare data
529
+ @r.eval "y<-as.vector(y)"
530
+ @r.eval "gram_matrix<-as.kernelMatrix(matrix(gram_matrix,n,n))"
531
+ @r.eval "sims<-as.vector(sims)"
532
+
533
+ # model + support vectors
534
+ LOGGER.debug "Creating SVM model ..."
535
+ @r.eval "model<-ksvm(gram_matrix, y, kernel=matrix, type=\"#{type}\", nu=0.5)"
536
+ @r.eval "sv<-as.vector(SVindex(model))"
537
+ @r.eval "sims<-sims[sv]"
538
+ @r.eval "sims<-as.kernelMatrix(matrix(sims,1))"
539
+ LOGGER.debug "Predicting ..."
540
+ if type == "nu-svr"
541
+ @r.eval "p<-predict(model,sims)[1,1]"
542
+ elsif type == "C-bsvc"
543
+ @r.eval "p<-predict(model,sims)"
544
+ end
545
+ if type == "nu-svr"
546
+ prediction = @r.p
547
+ elsif type == "C-bsvc"
548
+ #prediction = (@r.p.to_f == 1.0 ? true : false)
549
+ prediction = @r.p
550
+ end
551
+ @r.quit # free R
552
+ rescue Exception => e
553
+ LOGGER.debug "#{e.class}: #{e.message}"
554
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
555
+ end
556
+
228
557
  end
229
- confidence = conf/neighbors.size if neighbors.size > 0
230
- {:prediction => prediction, :confidence => confidence}
231
-
558
+ prediction
559
+ end
560
+
561
+ # Local support vector prediction from neighbors.
562
+ # Uses propositionalized setting.
563
+ # Not to be called directly (use local_svm_regression or local_svm_classification).
564
+ # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ]
565
+ # @param [Array] acts, activities for neighbors.
566
+ # @param [String] type, one of "nu-svr" (regression) or "C-bsvc" (classification).
567
+ # @return [Numeric] A prediction value.
568
# SVM prediction on the propositionalized data, computed by kernlab in R.
# Helper for local_svm_regression / local_svm_classification.
# @param [Array] props Pair: props[0] is the neighbor matrix (two nested Arrays), props[1] the query Array
# @param [Array] acts Activities of the neighbors
# @param [String] type Either "nu-svr" (regression) or "C-bsvc" (classification)
# @return [Numeric] A prediction value; nil when R failed or type is neither of the two
def self.local_svm_prop(props, acts, type)

  LOGGER.debug "Local SVM (Propositionalization / Kernlab Kernel)."
  n_prop = props[0] # is a matrix, i.e. two nested Arrays.
  q_prop = props[1] # is an Array.

  # Identical activities: nothing to learn, answer with that activity.
  return acts[0] if Algorithm::zero_variance? acts

  prediction = nil
  @r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests
  @r.eval "library('kernlab')" # this requires R package "kernlab" to be installed
  LOGGER.debug "Setting R data ..."
  # set data; the matrix is shipped flattened together with its dimensions
  @r.n_prop = n_prop.flatten
  @r.n_prop_x_size = n_prop.size
  @r.n_prop_y_size = n_prop[0].size
  @r.y = acts
  @r.q_prop = q_prop

  begin
    LOGGER.debug "Preparing R data ..."
    # prepare data: rebuild the matrices on the R side
    [
      "y<-matrix(y)",
      "prop_matrix<-matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=TRUE)",
      "q_prop<-matrix(q_prop, 1, n_prop_y_size, byrow=TRUE)"
    ].each { |command| @r.eval command }

    # model + support vectors
    LOGGER.debug "Creating SVM model ..."
    @r.eval "model<-ksvm(prop_matrix, y, type=\"#{type}\", nu=0.5)"
    LOGGER.debug "Predicting ..."
    # The R expression depends on the SVM type; an unknown type predicts nothing.
    predict_command = {
      "nu-svr" => "p<-predict(model,q_prop)[1,1]",
      "C-bsvc" => "p<-predict(model,q_prop)"
    }[type]
    if predict_command
      @r.eval predict_command
      prediction = @r.p
    end
    @r.quit # free R
  rescue Exception => e
    LOGGER.debug "#{e.class}: #{e.message}"
    LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
  end
  prediction
end
619
+
620
+ # Get confidence for regression, with standard deviation of neighbor activity if conf_stdev is set.
621
+ # @param[Hash] Required keys: :sims, :acts, :neighbors, :conf_stdev
622
+ # @return[Float] Confidence
623
# Confidence of a prediction.
# With :conf_stdev set: median similarity damped by exp(-stdev of the activities);
# nil when there is no median or the result is NaN.
# Otherwise: sum of the similarities divided by the number of neighbors.
# @param [Hash] params Required keys: :sims, :acts, :neighbors, :conf_stdev
# @return [Float] Confidence
def self.get_confidence(params)
  confidence =
    if params[:conf_stdev]
      sim_median = params[:sims].to_scale.median
      if sim_median.nil?
        nil
      else
        standard_deviation = params[:acts].to_scale.standard_deviation_sample
        damped = (sim_median*Math.exp(-1*standard_deviation)).abs
        damped.nan? ? nil : damped
      end
    else
      similarity_sum = params[:sims].inject{|sum,x| sum + x }
      similarity_sum/params[:neighbors].size
    end
  LOGGER.debug "Confidence is: '" + confidence.to_s + "'."
  confidence
end
642
+
643
+ # Get X and Y size of a nested Array (Matrix)
644
# Dimensions of a nested Array (matrix): number of rows and length of the first row.
# Errors are logged, not raised; a dimension that could not be determined stays nil.
# @param [Array] matrix Nested Array
# @return [Array] [ nr_cases, nr_features ]
def self.get_sizes(matrix)
  dimensions = [ nil, nil ]
  begin
    dimensions[0] = matrix.size
    dimensions[1] = matrix[0].size
  rescue Exception => error
    LOGGER.debug "#{error.class}: #{error.message}"
    LOGGER.debug "Backtrace:\n\t#{error.backtrace.join("\n\t")}"
  end
  dimensions
end
655
+
656
+ # Calculate the propositionalization matrix aka instantiation matrix (0/1 entries for features)
657
+ # Same for the vector describing the query compound
658
+ # @param[Array] neighbors.
659
+ # @param[OpenTox::Compound] query compound.
660
+ # @param[Array] Dataset Features.
661
+ # @param[Array] Fingerprints of neighbors.
662
+ # @param[Float] p-values of Features.
663
# Calculate the propositionalization matrix aka instantiation matrix for the
# neighbors, and the corresponding vector for the query compound.
# A neighbor entry is p_value * fingerprint value of the feature, or 0.0 when the
# neighbor has no fingerprint or lacks the feature. A query entry is the p_value
# (multiplied by the hit count when :nr_hits is set), or 0.0 without a match.
# Errors are logged, not raised; whatever was built up to that point is returned.
# @param [Hash] params Keys `:neighbors, :compound, :features, :fingerprints, :p_values` are required, `:nr_hits` is optional
# @return [Array] [ matrix, row ]: neighbor matrix (nested Arrays) and query vector (nil if the failure happened before it was started)
def self.get_props (params)
  matrix = Array.new
  begin
    params[:neighbors].each do |neighbor|
      fingerprint = params[:fingerprints][neighbor[:compound]]
      neighbor_row = params[:features].collect do |f|
        if fingerprint.nil? || !fingerprint.include?(f)
          0.0
        else
          params[:p_values][f] * fingerprint[f]
        end
      end
      matrix << neighbor_row
    end
    row = []
    params[:features].each do |f|
      if params[:nr_hits]
        compound_feature_hits = params[:compound].match_hits([f])
        row << (compound_feature_hits.size == 0 ? 0.0 : (params[:p_values][f] * compound_feature_hits[f]))
      else
        row << (params[:compound].match([f]).size == 0 ? 0.0 : params[:p_values][f])
      end
    end
  rescue StandardError => e
    # Interpolate the message: concatenating the exception object onto a String
    # raised a TypeError from inside this handler.
    LOGGER.debug "get_props failed with '#{e.message}'"
  end
  [ matrix, row ]
end
233
692
 
234
693
  end
@@ -250,6 +709,195 @@ module OpenTox
250
709
  def features(dataset_uri,compound_uri)
251
710
  end
252
711
  end
712
+
713
+ module Transform
714
+ include Algorithm
715
+
716
+ # The transformer that inverts values.
717
+ # 1/x is used, after values have been moved >= 1.
718
class Inverter
  attr_accessor :offset, :values

  # One argument  (values): transform. Values are negated, shifted by
  #   @offset and replaced by their reciprocal; @offset is kept for restore.
  # Two arguments (values, offset): restore. Reciprocal, add offset, negate.
  # Any other argument count leaves @values and @offset unset.
  # @params[Array] Values to transform / restore.
  # @params[Float] Offset for restore.
  def initialize *args
    case args.size
    when 1
      begin
        values = args[0]
        # Check the argument: @values is still nil at this point.
        raise "Cannot transform, values empty." if values.size == 0
        @values = values.collect { |v| -1.0 * v }
        @offset = 1.0 - @values.minmax[0]
        @offset = -1.0 * @offset if @offset > 0.0
        @values.collect! { |v| v - @offset }   # shift
        @values.collect! { |v| 1.0 / v }       # invert
      rescue Exception => e
        # NOTE(review): rescuing Exception also swallows Interrupt/SystemExit;
        # kept to match the sibling transformers, consider StandardError.
        LOGGER.debug "#{e.class}: #{e.message}"
        LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
      end
    when 2
      @offset = args[1].to_f
      # 1.0 / v so that Integer input is not truncated by integer division
      @values = args[0].collect { |v| 1.0 / v }
      @values.collect! { |v| v + @offset }
      @values.collect! { |v| -1.0 * v }
    end
  end
end
746
+
747
+ # The transformer that takes logs.
748
+ # Log10 is used, after values have been moved > 0.
749
class Log10
  attr_accessor :offset, :values

  # One argument  (values): transform (shift, add a tiny epsilon, log10).
  # Two arguments (values, offset): restore (10**v, remove epsilon, add offset).
  # Any other argument count only sets the epsilon.
  def initialize(*args)
    @distance_to_zero = 0.000000001 # 1 / 1 billion
    if args.size == 1
      log_transform(args[0])
    elsif args.size == 2
      log_restore(args[0], args[1])
    end
  end

  private

  # Forward direction; failures are logged at debug level and swallowed.
  def log_transform(raw)
    raise "Cannot transform, values empty." if raw.size == 0
    @offset = raw.minmax[0]
    @offset = -1.0 * @offset if @offset > 0.0
    @values = raw.collect { |x| x - @offset }
    @values.map! { |x| x + @distance_to_zero }
    @values.map! { |x| Math.log10(x) } # can fail
  rescue Exception => err
    LOGGER.debug "#{err.class}: #{err.message}"
    LOGGER.debug "Backtrace:\n\t#{err.backtrace.join("\n\t")}"
  end

  # Backward direction; errors propagate to the caller.
  def log_restore(transformed, offset)
    @offset = offset.to_f
    @values = transformed.collect { |x| 10**x }
    @values.map! { |x| x - @distance_to_zero }
    @values.map! { |x| x + @offset }
  end
end
778
+
779
+ # The transformer that does nothing (No OPeration).
780
class NOP
  attr_accessor :offset, :values

  # Stores the first argument unchanged when called with one or two
  # arguments; the offset is always 0.0 and a second argument is ignored.
  def initialize(*args)
    @offset = 0.0
    @distance_to_zero = 0.0
    @values = args.first if [1, 2].include?(args.size)
  end
end
796
+
797
+
798
+ # Auto-Scaler for Arrays
799
+ # Center on mean and divide by standard deviation
800
class AutoScale
  attr_accessor :scaled_values, :mean, :stdev

  # Centers the values on their mean and divides by the sample standard
  # deviation; the division is skipped when the deviation is 0.0.
  # Mean and deviation are kept for later use.
  def initialize(values)
    @mean  = values.to_scale.mean
    @stdev = values.to_scale.standard_deviation_sample
    centered = values.collect { |x| x - @mean }
    @scaled_values = (@stdev == 0.0) ? centered : centered.collect { |x| x / @stdev }
  end
end
812
+
813
+ # Principal Components Analysis
814
+ # Statsample Library (http://ruby-statsample.rubyforge.org/) by C. Bustos
815
class PCA
  attr_accessor :data_matrix, :data_transformed_matrix, :eigenvector_matrix, :eigenvalue_sums, :autoscaler

  # Builds the transformed dataset: drops zero-variance columns, auto-scales
  # the remaining ones, runs a PCA on their correlation matrix and keeps
  # leading eigenvectors according to the compression ratio.
  # Failures are logged at debug level and swallowed, leaving the object
  # partially initialized.
  def initialize(data_matrix, compression=0.05)
    @data_matrix = data_matrix
    @compression = compression.to_f
    @stdev = Array.new
    @mean = Array.new

    # Objective feature selection: keep only columns with non-zero variance
    raise "Error! PCA needs at least two dimensions." if data_matrix.size2 < 2
    @data_matrix_selected = nil
    (0...@data_matrix.size2).each do |col|
      next if Algorithm::zero_variance?(@data_matrix.col(col).to_a)
      if @data_matrix_selected.nil?
        @data_matrix_selected = GSL::Matrix.alloc(@data_matrix.size1, 1)
        @data_matrix_selected.col(0)[0..@data_matrix.size1-1] = @data_matrix.col(col)
      else
        @data_matrix_selected = @data_matrix_selected.horzcat(GSL::Matrix.alloc(@data_matrix.col(col).to_a,@data_matrix.size1, 1))
      end
    end
    raise "Error! PCA needs at least two dimensions." if (@data_matrix_selected.nil? || @data_matrix_selected.size2 < 2)

    # Scaling of axes; mean and stdev per column are remembered for restore
    @data_matrix_scaled = GSL::Matrix.alloc(@data_matrix_selected.size1, @data_matrix_selected.size2)
    (0...@data_matrix_selected.size2).each do |col|
      @autoscaler = OpenTox::Algorithm::Transform::AutoScale.new(@data_matrix_selected.col(col))
      @data_matrix_scaled.col(col)[0..@data_matrix.size1-1] = @autoscaler.scaled_values
      @stdev << @autoscaler.stdev
      @mean << @autoscaler.mean
    end

    # Statsample dataset keyed by column index
    data_matrix_hash = Hash.new
    (0...@data_matrix_scaled.size2).each do |col|
      data_matrix_hash[col] = @data_matrix_scaled.col(col).to_scale
    end
    dataset_hash = data_matrix_hash.to_dataset # see http://goo.gl/7XcW9
    cor_matrix = Statsample::Bivariate.correlation_matrix(dataset_hash)
    pca = Statsample::Factor::PCA.new(cor_matrix)
    pca.eigenvalues.each { |ev| raise "PCA failed!" if ev.nan? }

    # Cumulative eigenvalue sums
    nr_fields = dataset_hash.fields.size
    @eigenvalue_sums = Array.new
    (0...nr_fields).each do |i|
      @eigenvalue_sums << pca.eigenvalues[0..i].inject { |sum, ev| sum + ev }
    end

    # Keep eigenvectors while the cumulative sum stays within the
    # (1 - compression) share; the first one is always kept
    threshold = (1.0-@compression)*nr_fields
    eigenvectors_selected = Array.new
    pca.eigenvectors.each_with_index do |ev, i|
      if (@eigenvalue_sums[i] <= threshold) || eigenvectors_selected.empty?
        eigenvectors_selected << ev.to_a
      end
    end
    @eigenvector_matrix = GSL::Matrix.alloc(eigenvectors_selected.flatten, eigenvectors_selected.size, nr_fields).transpose
    dataset_matrix = dataset_hash.to_gsl.transpose
    @data_transformed_matrix = (@eigenvector_matrix.transpose * dataset_matrix).transpose
  rescue Exception => err
    LOGGER.debug "#{err.class}: #{err.message}"
    LOGGER.debug "Backtrace:\n\t#{err.backtrace.join("\n\t")}"
  end

  # Maps the transformed data back through the eigenvectors and undoes the
  # auto-scaling (possibly with compression loss).
  # Failures are logged at debug level and swallowed.
  # @return [GSL::Matrix] Data matrix.
  def restore
    data_matrix_restored = (@eigenvector_matrix * @data_transformed_matrix.transpose).transpose # reverse pca
    # reverse scaling
    (0...data_matrix_restored.size2).each do |col|
      all_rows = 0..data_matrix_restored.size1-1
      data_matrix_restored.col(col)[all_rows] *= @stdev[col] unless @stdev[col] == 0.0
      data_matrix_restored.col(col)[all_rows] += @mean[col]
    end
    data_matrix_restored
  rescue Exception => err
    LOGGER.debug "#{err.class}: #{err.message}"
    LOGGER.debug "Backtrace:\n\t#{err.backtrace.join("\n\t")}"
  end

end
899
+
900
+ end
253
901
 
254
902
  # Gauss kernel
255
903
  # @return [Float]
@@ -257,16 +905,85 @@ module OpenTox
257
905
  d = 1.0 - x.to_f
258
906
  Math.exp(-(d*d)/(2*sigma*sigma))
259
907
  end
908
+
909
+ # For symbolic features
910
+ # @param [Array] Array to test, must indicate non-occurrence with 0.
911
+ # @return [Boolean] Whether the feature is singular or non-occurring or present everywhere.
912
def self.isnull_or_singular?(array)
  # True when the number of zero entries is "all" (non-occurring feature),
  # "all but one" (singular feature) or "none" (feature present everywhere).
  zero_count = array.count(0)
  [array.size, array.size - 1, 0].include?(zero_count)
end
918
+
919
+ # Numeric value test
920
+ # @param[Object] value
921
+ # @return [Boolean] Whether value is a number
922
def self.numeric?(value)
  # Float() raises for anything it cannot convert; that is reported as false.
  Float(value)
  true
rescue StandardError
  false
end
925
+
926
+ # Zero-variance test
927
+ # @param [Array] Array of values to test.
928
+ # @return [Boolean] Whether the feature has variance zero.
929
def self.zero_variance?(array)
  # Compares the sample variance of the array with 0.0.
  sample_variance = array.to_scale.variance_sample
  sample_variance == 0.0
end
260
932
 
261
- # Median of an array
933
+ # Sum of the sizes of the elements of an array.
262
934
  # @param [Array] Array with values
263
- # @return [Float] Median
264
- def self.median(array)
265
- return nil if array.empty?
266
- array.sort!
267
- m_pos = array.size / 2
268
- return array.size % 2 == 1 ? array[m_pos] : (array[m_pos-1] + array[m_pos])/2
935
+ # @return [Integer] Sum of size of values
936
def self.sum_size(array)
  # Adds up the #size of every element; 0 for an empty array.
  array.inject(0) { |total, element| total + element.size }
end
941
+
942
+ # Minimum Frequency
943
+ # @param [Integer] per-mil value
944
+ # @return [Integer] min-frequency
945
def self.min_frequency(training_dataset, per_mil)
  # per_mil share of the number of training compounds, truncated to an
  # Integer and never below 2.
  # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
  scaled = per_mil * training_dataset.compounds.size.to_f / 1000.0
  scaled > 2 ? Integer(scaled) : 2
end
270
950
 
951
+ # Effect calculation for classification
952
+ # @param [Array] Array of occurrences per class in the form of Enumerables.
953
+ # @param [Array] Array of database instance counts per class.
954
def self.effect(occurrences, db_instances)
  # Returns the index of the class whose share among the occurrences exceeds
  # its share among the database instances by the largest relative margin;
  # 0 when no class is over-represented.
  total_occurrences = self.sum_size(occurrences)
  total_instances = db_instances.to_scale.sum
  best_index = 0
  best_gain = 0

  occurrences.each_with_index do |occ, idx| # fminer outputs occurrences sorted reverse by activity.
    observed = occ.size.to_f/total_occurrences
    baseline = db_instances[idx].to_f/total_instances
    next unless observed > baseline
    gain = (observed - baseline) / observed
    if gain > best_gain
      best_gain = gain # running maximum
      best_index = idx
    end
  end
  best_index
end
972
+
973
+ # Returns Support value of a fingerprint
974
+ # @param [Hash] params Keys: `:compound_features_hits, :weights, :training_compound_features_hits, :features, :nr_hits, :mode` are required
975
+ # @return [Numeric] Support value
976
def self.p_sum_support(params)
  # Sums, over all features, the gauss-weighted value obtained by calling
  # the method named in params[:mode] on the non-nil hit counts of the
  # query compound and the training compound.
  # NOTE(review): params[:mode] is interpolated into an eval string; this
  # executes arbitrary code if :mode can come from untrusted input.
  # The local names compound_hits, neighbor_hits, f and params are referenced
  # from inside the eval string and must not be renamed.
  params[:features].inject(0.0) do |p_sum, f|
    compound_hits = params[:compound_features_hits][f]
    neighbor_hits = params[:training_compound_features_hits][f]
    p_sum + eval("(Algorithm.gauss(params[:weights][f]) * ([compound_hits, neighbor_hits].compact.#{params[:mode]}))")
  end
end
985
+
271
986
  end
272
987
  end
988
+
989
+