opentox-ruby 2.0.1 → 2.1.0

data/Rakefile CHANGED
@@ -8,53 +8,46 @@ begin
  gem.summary = %Q{Ruby wrapper for the OpenTox REST API}
  gem.description = %Q{Ruby wrapper for the OpenTox REST API (http://www.opentox.org)}
  gem.email = "helma@in-silico.ch"
- gem.homepage = "http://github.com/helma/opentox-ruby"
+ gem.homepage = "http://github.com/opentox/opentox-ruby"
  gem.authors = ["Christoph Helma, Martin Guetlein, Andreas Maunz, Micha Rautenberg, David Vorgrimmler"]
- # dependencies
- [ "sinatra",
- "emk-sinatra-url-for",
- "sinatra-respond_to",
- "sinatra-static-assets",
- "rest-client",
- "rack",
- "rack-contrib",
- "rack-flash",
- "nokogiri",
- "rubyzip",
- "roo",
- "spreadsheet",
- "google-spreadsheet-ruby",
- "yajl-ruby",
- "tmail",
- "rinruby",
- "ohm",
- "ohm-contrib",
- "SystemTimer",
- "rjb",
- #valiation-gems
- "dm-core",
- "dm-serializer",
- "dm-timestamps",
- "dm-types",
- "dm-migrations",
- "dm-validations",
- "dm-sqlite-adapter"
- ].each { |dep| gem.add_dependency dep }
- =begin
- [ "dm-core",
- 'dm-serializer',
- 'dm-timestamps',
- 'dm-types',
- 'dm-migrations',
- "dm-mysql-adapter",
- "dm-validations",
- ].each {|dep| gem.add_dependency dep, ">= 1" }
- =end
- #valiation-gem
- gem.add_dependency "haml", ">=3"
- # validation-gems
- gem.add_dependency "ruby-plot", "~>0.4.0"
- ['jeweler'].each { |dep| gem.add_development_dependency dep }
+ # dependencies with versions
+ gem.add_dependency "sinatra", "=1.2.6"
+ gem.add_dependency "emk-sinatra-url-for", "=0.2.1"
+ gem.add_dependency "sinatra-respond_to", "=0.7.0"
+ gem.add_dependency "sinatra-static-assets", "=0.5.0"
+ gem.add_dependency "rest-client", "=1.6.1"
+ gem.add_dependency "rack", "=1.3.1"
+ gem.add_dependency "rack-contrib", "=1.1.0"
+ gem.add_dependency "rack-flash", "=0.1.1"
+ gem.add_dependency "nokogiri", "=1.4.4"
+ gem.add_dependency "rubyzip", "=0.9.4"
+ gem.add_dependency "roo", "=1.9.3"
+ gem.add_dependency "spreadsheet", "=0.6.5.4"
+ gem.add_dependency "google-spreadsheet-ruby", "=0.1.5"
+ gem.add_dependency "yajl-ruby", "=0.8.2"
+ #gem.add_dependency "mail", "=2.3.0"
+ gem.add_dependency "rinruby", "=2.0.2"
+ gem.add_dependency "ohm", "=0.1.3"
+ gem.add_dependency "ohm-contrib", "=0.1.1"
+ gem.add_dependency "SystemTimer", "=1.2.3"
+ gem.add_dependency "rjb", "=1.3.4"
+ gem.add_dependency "haml", "=3.1.1"
+ # for headless browser tests
+ gem.add_dependency "akephalos", "=0.2.5"
+ #valiation-gems
+ gem.add_dependency "dm-core", "=1.1.0"
+ gem.add_dependency "dm-serializer", "=1.1.0"
+ gem.add_dependency "dm-timestamps", "=1.1.0"
+ gem.add_dependency "dm-types", "=1.1.0"
+ gem.add_dependency "dm-migrations", "=1.1.0"
+ gem.add_dependency "dm-validations", "=1.1.0"
+ gem.add_dependency "dm-sqlite-adapter", "=1.1.0"
+ gem.add_dependency "ruby-plot", "=0.5.0"
+ gem.add_dependency "gsl", "=1.14.7"
+ gem.add_dependency "statsample", "=1.1.0"
+ #gem.add_dependency "statsample-optimization", "=2.1.0"
+
+ gem.add_development_dependency 'jeweler'
  gem.files = FileList["[A-Z]*", "{bin,generators,lib,test}/**/*", 'lib/jeweler/templates/.gitignore']
  end
  Jeweler::GemcutterTasks.new
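
Note: the gemspec now pins every dependency to an exact release with the `=` operator instead of open-ended constraints. A minimal sketch of what such a pin means to RubyGems, using the plain RubyGems API (independent of this package; the gem name is taken from the list above):

# Illustrative sketch: an exact pin such as "=1.2.6" matches only that single release.
require 'rubygems'

dep = Gem::Dependency.new("sinatra", "=1.2.6")
puts dep.match?("sinatra", "1.2.6")   # => true
puts dep.match?("sinatra", "1.2.7")   # => false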
data/VERSION CHANGED
@@ -1 +1 @@
- 2.0.1
+ 2.1.0
@@ -3,6 +3,8 @@
  # avoids compiling R with X
  R = nil
  require "rinruby"
+ require "statsample"
+ require 'uri'

  module OpenTox

@@ -16,6 +18,7 @@ module OpenTox
  # @param [optional,OpenTox::Task] waiting_task (can be a OpenTox::Subtask as well), progress is updated accordingly
  # @return [String] URI of new resource (dataset, model, ...)
  def run(params=nil, waiting_task=nil)
+ LOGGER.info "Running algorithm '"+@uri.to_s+"' with params: "+params.inspect
  RestClientWrapper.post(@uri, params, {:accept => 'text/uri-list'}, waiting_task).to_s
  end

@@ -45,12 +48,75 @@ module OpenTox
  end

  # Fminer algorithms (https://github.com/amaunz/fminer2)
- module Fminer
+ class Fminer
  include Algorithm
+ attr_accessor :prediction_feature, :training_dataset, :minfreq, :compounds, :db_class_sizes, :all_activities, :smi
+
+ def check_params(params,per_mil,subjectid=nil)
+ raise OpenTox::NotFoundError.new "Please submit a dataset_uri." unless params[:dataset_uri] and !params[:dataset_uri].nil?
+ raise OpenTox::NotFoundError.new "Please submit a prediction_feature." unless params[:prediction_feature] and !params[:prediction_feature].nil?
+ @prediction_feature = OpenTox::Feature.find params[:prediction_feature], subjectid
+ @training_dataset = OpenTox::Dataset.find "#{params[:dataset_uri]}", subjectid
+ raise OpenTox::NotFoundError.new "No feature #{params[:prediction_feature]} in dataset #{params[:dataset_uri]}" unless @training_dataset.features and @training_dataset.features.include?(params[:prediction_feature])
+
+ unless params[:min_frequency].nil?
+ @minfreq=params[:min_frequency].to_i
+ raise "Minimum frequency must be a number >0!" unless @minfreq>0
+ else
+ @minfreq=OpenTox::Algorithm.min_frequency(@training_dataset,per_mil) # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
+ end
+ end
+
+ def add_fminer_data(fminer_instance, params, value_map)
+
+ id = 1 # fminer start id is not 0
+ @training_dataset.data_entries.each do |compound,entry|
+ begin
+ smiles = OpenTox::Compound.smiles(compound.to_s)
+ rescue
+ LOGGER.warn "No resource for #{compound.to_s}"
+ next
+ end
+ if smiles == '' or smiles.nil?
+ LOGGER.warn "Cannot find smiles for #{compound.to_s}."
+ next
+ end
+
+ value_map=params[:value_map] unless params[:value_map].nil?
+ entry.each do |feature,values|
+ if feature == @prediction_feature.uri
+ values.each do |value|
+ if value.nil?
+ LOGGER.warn "No #{feature} activity for #{compound.to_s}."
+ else
+ if @prediction_feature.feature_type == "classification"
+ activity= value_map.invert[value].to_i # activities are mapped to 1..n
+ @db_class_sizes[activity-1].nil? ? @db_class_sizes[activity-1]=1 : @db_class_sizes[activity-1]+=1 # AM effect
+ elsif @prediction_feature.feature_type == "regression"
+ activity= value.to_f
+ end
+ begin
+ fminer_instance.AddCompound(smiles,id)
+ fminer_instance.AddActivity(activity, id)
+ @all_activities[id]=activity # DV: insert global information
+ @compounds[id] = compound
+ @smi[id] = smiles
+ id += 1
+ rescue Exception => e
+ LOGGER.warn "Could not add " + smiles + "\t" + value.to_s + " to fminer"
+ LOGGER.warn e.backtrace
+ end
+ end
+ end
+ end
+ end
+ end
+ end
+ end
+
+ end

  # Backbone Refinement Class mining (http://bbrc.maunz.de/)
- class BBRC
- include Fminer
+ class BBRC < Fminer
  # Initialize bbrc algorithm
  def initialize(subjectid=nil)
  super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/bbrc")
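
When no min_frequency parameter is supplied, check_params above falls back to OpenTox::Algorithm.min_frequency (added near the end of this diff). A standalone sketch of that per-mil calculation, taking the compound count directly so no OpenTox classes are needed:

# Standalone sketch of the per-mil fallback used by check_params; mirrors
# OpenTox::Algorithm.min_frequency further down, but takes a compound count directly.
def min_frequency(nr_compounds, per_mil)
  minfreq = per_mil * nr_compounds.to_f / 1000.0  # 8-10 per mil suggested for BBRC, 50 per mil for LAST
  minfreq = 2 unless minfreq > 2                  # never require fewer than 2 occurrences
  Integer(minfreq)
end

puts min_frequency(1500, 8)   # => 12 (8 per mil of 1500 compounds)
puts min_frequency(100, 8)    # => 2  (the floor of 2 applies)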
@@ -59,8 +125,7 @@ module OpenTox
  end

  # LAtent STructure Pattern Mining (http://last-pm.maunz.de)
- class LAST
- include Fminer
+ class LAST < Fminer
  # Initialize last algorithm
  def initialize(subjectid=nil)
  super File.join(CONFIG[:services]["opentox-algorithm"], "fminer/last")
@@ -68,7 +133,6 @@ module OpenTox
  end
  end

- end

  # Create lazar prediction model
  class Lazar
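
For reference, the BBRC and LAST algorithms above are invoked through Algorithm#run (see the hunk near the top of this file): parameters are posted to the fminer service and the URI of the resulting feature dataset is returned. An illustrative sketch only; the dataset and feature URIs are placeholders, and a configured opentox-algorithm service (CONFIG[:services]) is assumed:

# Illustrative only: running BBRC feature mining via the run() method shown above.
require 'opentox-ruby'

bbrc = OpenTox::Algorithm::BBRC.new
feature_dataset_uri = bbrc.run(
  :dataset_uri        => "http://example.org/dataset/1",          # placeholder
  :prediction_feature => "http://example.org/feature/activity",   # placeholder
  :min_frequency      => 5
)
puts feature_dataset_uri   # URI of the newly created feature dataset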
@@ -90,19 +154,34 @@ module OpenTox
  # @param [Array] features_a Features of first compound
  # @param [Array] features_b Features of second compound
  # @param [optional, Hash] weights Weights for all features
+ # @param [optional, Hash] params Keys: `:training_compound, :compound, :training_compound_features_hits, :nr_hits, :compound_features_hits` are required
  # @return [Float] (Weighted) tanimoto similarity
- def self.tanimoto(features_a,features_b,weights=nil)
+ def self.tanimoto(features_a,features_b,weights=nil,params=nil)
  common_features = features_a & features_b
  all_features = (features_a + features_b).uniq
- common_p_sum = 0.0
+ #LOGGER.debug "dv --------------- common: #{common_features}, all: #{all_features}"
  if common_features.size > 0
  if weights
- common_features.each{|f| common_p_sum += Algorithm.gauss(weights[f])}
- all_p_sum = 0.0
- all_features.each{|f| all_p_sum += Algorithm.gauss(weights[f])}
+ #LOGGER.debug "nr_hits: #{params[:nr_hits]}"
+ if !params.nil? && params[:nr_hits]
+ params[:weights] = weights
+ params[:mode] = "min"
+ params[:features] = common_features
+ common_p_sum = Algorithm.p_sum_support(params)
+ params[:mode] = "max"
+ params[:features] = all_features
+ all_p_sum = Algorithm.p_sum_support(params)
+ else
+ common_p_sum = 0.0
+ common_features.each{|f| common_p_sum += Algorithm.gauss(weights[f])}
+ all_p_sum = 0.0
+ all_features.each{|f| all_p_sum += Algorithm.gauss(weights[f])}
+ end
+ #LOGGER.debug "common_p_sum: #{common_p_sum}, all_p_sum: #{all_p_sum}, c/a: #{common_p_sum/all_p_sum}"
  common_p_sum/all_p_sum
  else
- common_features.to_f/all_features
+ #LOGGER.debug "common_features : #{common_features}, all_features: #{all_features}, c/a: #{(common_features.size/all_features.size).to_f}"
+ common_features.size.to_f/all_features.size.to_f
  end
  else
  0.0
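
Two things change in tanimoto(): the unweighted branch now divides the set sizes (the old `common_features.to_f/all_features` attempted to divide two Arrays), and the weighted branch can delegate to p_sum_support when hit counts (`nr_hits`) are available. A standalone sketch of the unweighted and gauss-weighted cases; the fragment names and p_values below are made up, and the gauss sigma default is assumed:

# Standalone sketch of tanimoto(): |A ∩ B| / |A ∪ B|, optionally gauss-weighted.
def gauss(x, sigma = 0.3)   # sigma default assumed
  d = 1.0 - x.to_f
  Math.exp(-(d * d) / (2 * sigma * sigma))
end

features_a = ["c1ccccc1", "C=O", "N"]                                       # made-up fragments
features_b = ["c1ccccc1", "N", "Cl"]
weights    = { "c1ccccc1" => 0.9, "C=O" => 0.2, "N" => 0.7, "Cl" => 0.4 }   # made-up p_values

common_features = features_a & features_b
all_features    = (features_a + features_b).uniq

puts common_features.size.to_f / all_features.size.to_f   # unweighted: => 0.5 (2 of 4 shared)

common_p_sum = common_features.inject(0.0) { |s, f| s + gauss(weights[f]) }
all_p_sum    = all_features.inject(0.0)    { |s, f| s + gauss(weights[f]) }
puts common_p_sum / all_p_sum                              # gauss-weighted variant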
@@ -132,65 +211,300 @@ module OpenTox
  end
  end

+ # Structural Graph Clustering by TU Munich
+ # Finds clusters similar to a query structure in a given training dataset
+ # May be queried for cluster membership of an unknown compound
+ class StructuralClustering
+ attr_accessor :training_dataset_uri, :training_threshold, :query_dataset_uri, :query_threshold, :target_clusters_array
+
+ # @params[String] Training dataset_uri
+ # @params[Float] Similarity threshold for training (optional)
+ # @params[String] Cluster service uri (no AA)
+ def initialize training_dataset_uri, training_threshold=0.8, cluster_service_uri = "http://opentox-dev.informatik.tu-muenchen.de:8080/OpenTox/algorithm/StructuralClustering"
+
+ if (training_dataset_uri =~ URI::regexp).nil? || (cluster_service_uri =~ URI::regexp).nil?
+ raise "Invalid URI."
+ end
+ @training_dataset_uri = training_dataset_uri
+ if !OpenTox::Algorithm.numeric? training_threshold || training_threshold <0 || training_threshold >1
+ raise "Training threshold out of bounds."
+ end
+ @training_threshold = training_threshold.to_f
+
+ # Train a cluster model
+ params = {:dataset_uri => @training_dataset_uri, :threshold => @training_threshold }
+ @cluster_model_uri = OpenTox::RestClientWrapper.post cluster_service_uri, params
+ cluster_model_rdf = OpenTox::RestClientWrapper.get @cluster_model_uri
+ @datasets = OpenTox::Parser::Owl.from_rdf cluster_model_rdf, OT.Dataset, true # must extract OT.Datasets from model
+
+ # Process parsed OWL objects
+ @clusterid_dataset_map = Hash.new
+ @datasets.each { |d|
+ begin
+ d.metadata[OT.hasSource]["Structural Clustering cluster "] = "" # must parse in metadata for string (not elegant)
+ @clusterid_dataset_map[d.metadata[OT.hasSource].to_i] = d.uri
+ rescue Exception => e
+ # ignore other entries!
+ end
+ }
+ end
+
+ # Whether a model has been trained
+ def trained?
+ !@cluster_model_uri.nil?
+ end
+
+ # Instance query: clusters for a compound
+ # @params[String] Query compound
+ # @params[Float] Similarity threshold for query to clusters (optional)
+ def get_clusters query_compound_uri, query_threshold = 0.5
+
+ if !OpenTox::Algorithm.numeric? query_threshold || query_threshold <0 || query_threshold >1
+ raise "Query threshold out of bounds."
+ end
+ @query_threshold = query_threshold.to_f
+
+
+ # Preparing a query dataset
+ query_dataset = OpenTox::Dataset.new
+ @query_dataset_uri = query_dataset.save
+ query_dataset = OpenTox::Dataset.find @query_dataset_uri
+ query_dataset.add_compound query_compound_uri
+ @query_dataset_uri = query_dataset.save
+
+ # Obtaining a clustering for query compound
+ params = { :dataset_uri => @query_dataset_uri, :threshold => @query_threshold }
+ cluster_query_dataset_uri = OpenTox::RestClientWrapper.post @cluster_model_uri, params
+ cluster_query_dataset = OpenTox::Dataset.new cluster_query_dataset_uri
+ cluster_query_dataset.load_all
+
+ # Reading cluster ids for features from metadata
+ feature_clusterid_map = Hash.new
+ pattern="Prediction feature for cluster assignment " # must parse for string in metadata (not elegant)
+ cluster_query_dataset.features.each { |feature_uri,metadata|
+ metadata[DC.title][pattern]=""
+ feature_clusterid_map[feature_uri] = metadata[DC.title].to_i
+ }
+
+ # Integrity check
+ unless cluster_query_dataset.compounds.size == 1
+ raise "Number of predicted compounds is != 1."
+ end
+
+ # Process data entry
+ query_compound_uri = cluster_query_dataset.compounds[0]
+ @target_clusters_array = Array.new
+ cluster_query_dataset.features.keys.each { |cluster_membership_feature|
+
+ # Getting dataset URI for cluster
+ target_cluster = feature_clusterid_map[cluster_membership_feature]
+ dataset = @clusterid_dataset_map[target_cluster]
+
+ # Finally look up presence
+ data_entry = cluster_query_dataset.data_entries[query_compound_uri]
+ present = data_entry[cluster_membership_feature][0]
+
+ # Store result
+ @target_clusters_array << dataset if present > 0.5 # 0.0 for absence, 1.0 for presence
+ }
+ end
+
+ end
+
  module Neighbors

+ # Local multi-linear regression (MLR) prediction from neighbors.
+ # Uses propositionalized setting.
+ # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
+ # @return [Numeric] A prediction value.
+ def self.local_mlr_prop(params)
+
+ confidence=0.0
+ prediction=nil
+
+ if params[:neighbors].size>0
+ props = params[:prop_kernel] ? get_props(params) : nil
+ acts = params[:neighbors].collect { |n| act = n[:activity].to_f }
+ sims = params[:neighbors].collect { |n| Algorithm.gauss(n[:similarity]) }
+ LOGGER.debug "Local MLR (Propositionalization / GSL)."
+ prediction = mlr( {:n_prop => props[0], :q_prop => props[1], :sims => sims, :acts => acts} )
+ transformer = eval("OpenTox::Algorithm::Transform::#{params[:transform]["class"]}.new ([#{prediction}], #{params[:transform]["offset"]})")
+ prediction = transformer.values[0]
+ prediction = nil if prediction.infinite? || params[:prediction_min_max][1] < prediction || params[:prediction_min_max][0] > prediction
+ LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
+ params[:conf_stdev] = false if params[:conf_stdev].nil?
+ confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]})
+ confidence = nil if prediction.nil?
+ end
+ {:prediction => prediction, :confidence => confidence}
+
+ end
+
+ # Multi-linear regression weighted by similarity.
+ # Objective Feature Selection, Principal Components Analysis, Scaling of Axes.
+ # @param [Hash] params Keys `:n_prop, :q_prop, :sims, :acts` are required
+ # @return [Numeric] A prediction value.
+ def self.mlr(params)
+
+ # GSL matrix operations:
+ # to_a : row-wise conversion to nested array
+ #
+ # Statsample operations (build on GSL):
+ # to_scale: convert into Statsample format
+
+ begin
+ n_prop = params[:n_prop].collect { |v| v }
+ q_prop = params[:q_prop].collect { |v| v }
+ n_prop << q_prop # attach q_prop
+ nr_cases, nr_features = get_sizes n_prop
+ data_matrix = GSL::Matrix.alloc(n_prop.flatten, nr_cases, nr_features)
+
+ # Principal Components Analysis
+ LOGGER.debug "PCA..."
+ pca = OpenTox::Algorithm::Transform::PCA.new(data_matrix)
+ data_matrix = pca.data_transformed_matrix
+
+ # Attach intercept column to data
+ intercept = GSL::Matrix.alloc(Array.new(nr_cases,1.0),nr_cases,1)
+ data_matrix = data_matrix.horzcat(intercept)
+ (0..data_matrix.size2-2).each { |i|
+ autoscaler = OpenTox::Algorithm::Transform::AutoScale.new(data_matrix.col(i))
+ data_matrix.col(i)[0..data_matrix.size1-1] = autoscaler.scaled_values
+ }
+
+ # Detach query instance
+ n_prop = data_matrix.to_a
+ q_prop = n_prop.pop
+ nr_cases, nr_features = get_sizes n_prop
+ data_matrix = GSL::Matrix.alloc(n_prop.flatten, nr_cases, nr_features)
+
+ # model + support vectors
+ LOGGER.debug "Creating MLR model ..."
+ c, cov, chisq, status = GSL::MultiFit::wlinear(data_matrix, params[:sims].to_scale.to_gsl, params[:acts].to_scale.to_gsl)
+ GSL::MultiFit::linear_est(q_prop.to_scale.to_gsl, c, cov)[0]
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ end
+
+ end
+
  # Classification with majority vote from neighbors weighted by similarity
- # @param [Array] neighbors, each neighbor is a hash with keys `:similarity, :activity`
- # @param [optional] params Ignored (only for compatibility with local_svm_regression)
- # @return [Hash] Hash with keys `:prediction, :confidence`
- def self.weighted_majority_vote(neighbors,params={})
- conf = 0.0
+ # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
+ # @return [Numeric] A prediction value.
+ def self.weighted_majority_vote(params)
+
+ neighbor_contribution = 0.0
+ confidence_sum = 0.0
  confidence = 0.0
- neighbors.each do |neighbor|
- case neighbor[:activity].to_s
- when 'true'
- conf += Algorithm.gauss(neighbor[:similarity])
- when 'false'
- conf -= Algorithm.gauss(neighbor[:similarity])
+ prediction = nil
+
+ params[:neighbors].each do |neighbor|
+ neighbor_weight = Algorithm.gauss(neighbor[:similarity]).to_f
+ neighbor_contribution += neighbor[:activity].to_f * neighbor_weight
+
+ if params[:value_map].size == 2 # AM: provide compat to binary classification: 1=>false 2=>true
+ case neighbor[:activity]
+ when 1
+ confidence_sum -= neighbor_weight
+ when 2
+ confidence_sum += neighbor_weight
+ end
+ else
+ confidence_sum += neighbor_weight
  end
  end
- if conf > 0.0
- prediction = true
- elsif conf < 0.0
- prediction = false
- else
- prediction = nil
- end
- confidence = conf/neighbors.size if neighbors.size > 0
- {:prediction => prediction, :confidence => confidence.abs}
+
+ if params[:value_map].size == 2
+ if confidence_sum >= 0.0
+ prediction = 2 unless params[:neighbors].size==0
+ elsif confidence_sum < 0.0
+ prediction = 1 unless params[:neighbors].size==0
+ end
+ else
+ prediction = (neighbor_contribution/confidence_sum).round unless params[:neighbors].size==0 # AM: new multinomial prediction
+ end
+ LOGGER.debug "Prediction is: '" + prediction.to_s + "'." unless prediction.nil?
+ confidence = confidence_sum/params[:neighbors].size if params[:neighbors].size > 0
+ LOGGER.debug "Confidence is: '" + confidence.to_s + "'." unless prediction.nil?
+ return {:prediction => prediction, :confidence => confidence.abs}
  end

  # Local support vector regression from neighbors
- # @param [Array] neighbors, each neighbor is a hash with keys `:similarity, :activity, :features`
- # @param [Hash] params Keys `:similarity_algorithm,:p_values` are required
- # @return [Hash] Hash with keys `:prediction, :confidence`
- def self.local_svm_regression(neighbors,params )
- sims = neighbors.collect{ |n| Algorithm.gauss(n[:similarity]) } # similarity values between query and neighbors
- conf = sims.inject{|sum,x| sum + x }
-
- # AM: Control log taking
- take_logs=true
- neighbors.each do |n|
- if (! n[:activity].nil?) && (n[:activity].to_f < 0.0)
- take_logs = false
- end
+ # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
+ # @return [Numeric] A prediction value.
+ def self.local_svm_regression(params)
+
+ confidence = 0.0
+ prediction = nil
+ if params[:neighbors].size>0
+ props = params[:prop_kernel] ? get_props(params) : nil
+ acts = params[:neighbors].collect{ |n| n[:activity].to_f }
+ sims = params[:neighbors].collect{ |n| Algorithm.gauss(n[:similarity]) }
+ prediction = props.nil? ? local_svm(acts, sims, "nu-svr", params) : local_svm_prop(props, acts, "nu-svr")
+ transformer = eval("OpenTox::Algorithm::Transform::#{params[:transform]["class"]}.new ([#{prediction}], #{params[:transform]["offset"]})")
+ prediction = transformer.values[0]
+ prediction = nil if prediction.infinite? || params[:prediction_min_max][1] < prediction || params[:prediction_min_max][0] > prediction
+ LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
+ params[:conf_stdev] = false if params[:conf_stdev].nil?
+ confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]})
+ confidence = nil if prediction.nil?
  end
- acts = neighbors.collect do |n|
- act = n[:activity]
- take_logs ? Math.log10(act.to_f) : act.to_f
- end # activities of neighbors for supervised learning
+ {:prediction => prediction, :confidence => confidence}
+
+ end
+
+ # Local support vector classification from neighbors
+ # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
+ # @return [Numeric] A prediction value.
+ def self.local_svm_classification(params)

- neighbor_matches = neighbors.collect{ |n| n[:features] } # as in classification: URIs of matches
+ confidence = 0.0
+ prediction = nil
+ if params[:neighbors].size>0
+ props = params[:prop_kernel] ? get_props(params) : nil
+ acts = params[:neighbors].collect { |n| act = n[:activity] }
+ sims = params[:neighbors].collect{ |n| Algorithm.gauss(n[:similarity]) } # similarity values btwn q and nbors
+ prediction = props.nil? ? local_svm(acts, sims, "C-bsvc", params) : local_svm_prop(props, acts, "C-bsvc")
+ LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
+ params[:conf_stdev] = false if params[:conf_stdev].nil?
+ confidence = get_confidence({:sims => sims, :acts => acts, :neighbors => params[:neighbors], :conf_stdev => params[:conf_stdev]})
+ end
+ {:prediction => prediction, :confidence => confidence}
+
+ end
+
+
+ # Local support vector prediction from neighbors.
+ # Uses pre-defined Kernel Matrix.
+ # Not to be called directly (use local_svm_regression or local_svm_classification).
+ # @param [Array] acts, activities for neighbors.
+ # @param [Array] sims, similarities for neighbors.
+ # @param [String] type, one of "nu-svr" (regression) or "C-bsvc" (classification).
+ # @param [Hash] params Keys `:neighbors,:compound,:features,:p_values,:similarity_algorithm,:prop_kernel,:value_map,:transform` are required
+ # @return [Numeric] A prediction value.
+ def self.local_svm(acts, sims, type, params)
+ LOGGER.debug "Local SVM (Weighted Tanimoto Kernel)."
+ neighbor_matches = params[:neighbors].collect{ |n| n[:features] } # URIs of matches
  gram_matrix = [] # square matrix of similarities between neighbors; implements weighted tanimoto kernel
- if neighbor_matches.size == 0
- raise "No neighbors found"
+
+ prediction = nil
+ if Algorithm::zero_variance? acts
+ prediction = acts[0]
  else
  # gram matrix
  (0..(neighbor_matches.length-1)).each do |i|
+ neighbor_i_hits = params[:fingerprints][params[:neighbors][i][:compound]]
  gram_matrix[i] = [] unless gram_matrix[i]
  # upper triangle
  ((i+1)..(neighbor_matches.length-1)).each do |j|
- sim = eval("#{params[:similarity_algorithm]}(neighbor_matches[i], neighbor_matches[j], params[:p_values])")
+ neighbor_j_hits= params[:fingerprints][params[:neighbors][j][:compound]]
+ sim_params = {}
+ if params[:nr_hits]
+ sim_params[:nr_hits] = true
+ sim_params[:compound_features_hits] = neighbor_i_hits
+ sim_params[:training_compound_features_hits] = neighbor_j_hits
+ end
+ sim = eval("#{params[:similarity_algorithm]}(neighbor_matches[i], neighbor_matches[j], params[:p_values], sim_params)")
  gram_matrix[i][j] = Algorithm.gauss(sim)
  gram_matrix[j] = [] unless gram_matrix[j]
  gram_matrix[j][i] = gram_matrix[i][j] # lower triangle
@@ -198,6 +512,7 @@ module OpenTox
  gram_matrix[i][i] = 1.0
  end

+
  #LOGGER.debug gram_matrix.to_yaml
  @r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests
  @r.eval "library('kernlab')" # this requires R package "kernlab" to be installed
@@ -208,27 +523,171 @@ module OpenTox
  @r.y = acts
  @r.sims = sims

- LOGGER.debug "Preparing R data ..."
- # prepare data
- @r.eval "y<-as.vector(y)"
- @r.eval "gram_matrix<-as.kernelMatrix(matrix(gram_matrix,n,n))"
- @r.eval "sims<-as.vector(sims)"
-
- # model + support vectors
- LOGGER.debug "Creating SVM model ..."
- @r.eval "model<-ksvm(gram_matrix, y, kernel=matrix, type=\"nu-svr\", nu=0.8)"
- @r.eval "sv<-as.vector(SVindex(model))"
- @r.eval "sims<-sims[sv]"
- @r.eval "sims<-as.kernelMatrix(matrix(sims,1))"
- LOGGER.debug "Predicting ..."
- @r.eval "p<-predict(model,sims)[1,1]"
- prediction = 10**(@r.p.to_f) if take_logs
- LOGGER.debug "Prediction is: '" + prediction.to_s + "'."
- @r.quit # free R
+ begin
+ LOGGER.debug "Preparing R data ..."
+ # prepare data
+ @r.eval "y<-as.vector(y)"
+ @r.eval "gram_matrix<-as.kernelMatrix(matrix(gram_matrix,n,n))"
+ @r.eval "sims<-as.vector(sims)"
+
+ # model + support vectors
+ LOGGER.debug "Creating SVM model ..."
+ @r.eval "model<-ksvm(gram_matrix, y, kernel=matrix, type=\"#{type}\", nu=0.5)"
+ @r.eval "sv<-as.vector(SVindex(model))"
+ @r.eval "sims<-sims[sv]"
+ @r.eval "sims<-as.kernelMatrix(matrix(sims,1))"
+ LOGGER.debug "Predicting ..."
+ if type == "nu-svr"
+ @r.eval "p<-predict(model,sims)[1,1]"
+ elsif type == "C-bsvc"
+ @r.eval "p<-predict(model,sims)"
+ end
+ if type == "nu-svr"
+ prediction = @r.p
+ elsif type == "C-bsvc"
+ #prediction = (@r.p.to_f == 1.0 ? true : false)
+ prediction = @r.p
+ end
+ @r.quit # free R
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+
  end
- confidence = conf/neighbors.size if neighbors.size > 0
- {:prediction => prediction, :confidence => confidence}
-
+ prediction
+ end
+
+ # Local support vector prediction from neighbors.
+ # Uses propositionalized setting.
+ # Not to be called directly (use local_svm_regression or local_svm_classification).
+ # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ]
+ # @param [Array] acts, activities for neighbors.
+ # @param [String] type, one of "nu-svr" (regression) or "C-bsvc" (classification).
+ # @return [Numeric] A prediction value.
+ def self.local_svm_prop(props, acts, type)
+
+ LOGGER.debug "Local SVM (Propositionalization / Kernlab Kernel)."
+ n_prop = props[0] # is a matrix, i.e. two nested Arrays.
+ q_prop = props[1] # is an Array.
+
+ prediction = nil
+ if Algorithm::zero_variance? acts
+ prediction = acts[0]
+ else
+ #LOGGER.debug gram_matrix.to_yaml
+ @r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests
+ @r.eval "library('kernlab')" # this requires R package "kernlab" to be installed
+ LOGGER.debug "Setting R data ..."
+ # set data
+ @r.n_prop = n_prop.flatten
+ @r.n_prop_x_size = n_prop.size
+ @r.n_prop_y_size = n_prop[0].size
+ @r.y = acts
+ @r.q_prop = q_prop
+
+ begin
+ LOGGER.debug "Preparing R data ..."
+ # prepare data
+ @r.eval "y<-matrix(y)"
+ @r.eval "prop_matrix<-matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=TRUE)"
+ @r.eval "q_prop<-matrix(q_prop, 1, n_prop_y_size, byrow=TRUE)"
+
+ # model + support vectors
+ LOGGER.debug "Creating SVM model ..."
+ @r.eval "model<-ksvm(prop_matrix, y, type=\"#{type}\", nu=0.5)"
+ LOGGER.debug "Predicting ..."
+ if type == "nu-svr"
+ @r.eval "p<-predict(model,q_prop)[1,1]"
+ elsif type == "C-bsvc"
+ @r.eval "p<-predict(model,q_prop)"
+ end
+ if type == "nu-svr"
+ prediction = @r.p
+ elsif type == "C-bsvc"
+ #prediction = (@r.p.to_f == 1.0 ? true : false)
+ prediction = @r.p
+ end
+ @r.quit # free R
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+ prediction
+ end
+
+ # Get confidence for regression, with standard deviation of neighbor activity if conf_stdev is set.
+ # @param[Hash] Required keys: :sims, :acts, :neighbors, :conf_stdev
+ # @return[Float] Confidence
+ def self.get_confidence(params)
+ if params[:conf_stdev]
+ sim_median = params[:sims].to_scale.median
+ if sim_median.nil?
+ confidence = nil
+ else
+ standard_deviation = params[:acts].to_scale.standard_deviation_sample
+ confidence = (sim_median*Math.exp(-1*standard_deviation)).abs
+ if confidence.nan?
+ confidence = nil
+ end
+ end
+ else
+ conf = params[:sims].inject{|sum,x| sum + x }
+ confidence = conf/params[:neighbors].size
+ end
+ LOGGER.debug "Confidence is: '" + confidence.to_s + "'."
+ return confidence
+ end
+
+ # Get X and Y size of a nested Array (Matrix)
+ def self.get_sizes(matrix)
+ begin
+ nr_cases = matrix.size
+ nr_features = matrix[0].size
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ #puts "NRC: #{nr_cases}, NRF: #{nr_features}"
+ [ nr_cases, nr_features ]
+ end
+
+ # Calculate the propositionalization matrix aka instantiation matrix (0/1 entries for features)
+ # Same for the vector describing the query compound
+ # @param[Array] neighbors.
+ # @param[OpenTox::Compound] query compound.
+ # @param[Array] Dataset Features.
+ # @param[Array] Fingerprints of neighbors.
+ # @param[Float] p-values of Features.
+ def self.get_props (params)
+ matrix = Array.new
+ begin
+ params[:neighbors].each do |n|
+ n = n[:compound]
+ row = []
+ params[:features].each do |f|
+ if ! params[:fingerprints][n].nil?
+ row << (params[:fingerprints][n].include?(f) ? (params[:p_values][f] * params[:fingerprints][n][f]) : 0.0)
+ else
+ row << 0.0
+ end
+ end
+ matrix << row
+ end
+ row = []
+ params[:features].each do |f|
+ if params[:nr_hits]
+ compound_feature_hits = params[:compound].match_hits([f])
+ row << (compound_feature_hits.size == 0 ? 0.0 : (params[:p_values][f] * compound_feature_hits[f]))
+ else
+ row << (params[:compound].match([f]).size == 0 ? 0.0 : params[:p_values][f])
+ end
+ end
+ rescue Exception => e
+ LOGGER.debug "get_props failed with '" + $! + "'"
+ end
+ [ matrix, row ]
  end

  end
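
The rewritten weighted_majority_vote above votes on value_map codes (1..n) instead of true/false, and get_confidence can optionally damp the similarity median by the standard deviation of the neighbor activities. A standalone sketch of the binary-classification branch; the neighbor data below is made up and the gauss sigma default is assumed:

# Standalone sketch of the binary branch of weighted_majority_vote above
# (value_map of size 2, activities coded 1 => inactive, 2 => active).
def gauss(x, sigma = 0.3)   # sigma default assumed
  d = 1.0 - x.to_f
  Math.exp(-(d * d) / (2 * sigma * sigma))
end

neighbors = [                                     # made-up neighbors: activity code + similarity
  { :activity => 2, :similarity => 0.9 },
  { :activity => 2, :similarity => 0.7 },
  { :activity => 1, :similarity => 0.4 },
]

confidence_sum = 0.0
neighbors.each do |n|
  w = gauss(n[:similarity])
  confidence_sum += (n[:activity] == 2 ? w : -w)  # active neighbors push up, inactive push down
end

prediction = confidence_sum >= 0.0 ? 2 : 1
confidence = (confidence_sum / neighbors.size).abs
puts "prediction=#{prediction} confidence=#{confidence.round(3)}"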
@@ -250,6 +709,195 @@ module OpenTox
  def features(dataset_uri,compound_uri)
  end
  end
+
+ module Transform
+ include Algorithm
+
+ # The transformer that inverts values.
+ # 1/x is used, after values have been moved >= 1.
+ class Inverter
+ attr_accessor :offset, :values
+
+ # @params[Array] Values to transform.
+ # @params[Float] Offset for restore.
+ def initialize *args
+ case args.size
+ when 1
+ begin
+ values=args[0]
+ raise "Cannot transform, values empty." if @values.size==0
+ @values = values.collect { |v| -1.0 * v }
+ @offset = 1.0 - @values.minmax[0]
+ @offset = -1.0 * @offset if @offset>0.0
+ @values.collect! { |v| v - @offset } # slide >1
+ @values.collect! { |v| 1 / v } # invert to [0,1]
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ when 2
+ @offset = args[1].to_f
+ @values = args[0].collect { |v| 1 / v }
+ @values.collect! { |v| v + @offset }
+ @values.collect! { |v| -1.0 * v }
+ end
+ end
+ end
+
+ # The transformer that takes logs.
+ # Log10 is used, after values have been moved > 0.
+ class Log10
+ attr_accessor :offset, :values
+
+ # @params[Array] Values to transform / restore.
+ # @params[Float] Offset for restore.
+ def initialize *args
+ @distance_to_zero = 0.000000001 # 1 / 1 billion
+ case args.size
+ when 1
+ begin
+ values=args[0]
+ raise "Cannot transform, values empty." if values.size==0
+ @offset = values.minmax[0]
+ @offset = -1.0 * @offset if @offset>0.0
+ @values = values.collect { |v| v - @offset } # slide > anchor
+ @values.collect! { |v| v + @distance_to_zero } #
+ @values.collect! { |v| Math::log10 v } # log10 (can fail)
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ when 2
+ @offset = args[1].to_f
+ @values = args[0].collect { |v| 10**v }
+ @values.collect! { |v| v - @distance_to_zero }
+ @values.collect! { |v| v + @offset }
+ end
+ end
+ end
+
+ # The transformer that does nothing (No OPeration).
+ class NOP
+ attr_accessor :offset, :values
+
+ # @params[Array] Values to transform / restore.
+ # @params[Float] Offset for restore.
+ def initialize *args
+ @offset = 0.0
+ @distance_to_zero = 0.0
+ case args.size
+ when 1
+ @values = args[0]
+ when 2
+ @values = args[0]
+ end
+ end
+ end
+
+
+ # Auto-Scaler for Arrays
+ # Center on mean and divide by standard deviation
+ class AutoScale
+ attr_accessor :scaled_values, :mean, :stdev
+
+ # @params[Array] Values to transform.
+ def initialize values
+ @scaled_values = values
+ @mean = @scaled_values.to_scale.mean
+ @stdev = @scaled_values.to_scale.standard_deviation_sample
+ @scaled_values = @scaled_values.collect {|vi| vi - @mean }
+ @scaled_values.collect! {|vi| vi / @stdev } unless @stdev == 0.0
+ end
+ end
+
+ # Principal Components Analysis
+ # Statsample Library (http://ruby-statsample.rubyforge.org/) by C. Bustos
+ class PCA
+ attr_accessor :data_matrix, :data_transformed_matrix, :eigenvector_matrix, :eigenvalue_sums, :autoscaler
+
+ # Creates a transformed dataset as GSL::Matrix.
+ # @param [GSL::Matrix] Data matrix.
+ # @param [Float] Compression ratio from [0,1].
+ # @return [GSL::Matrix] Data transformed matrix.
+ def initialize data_matrix, compression=0.05
+ begin
+ @data_matrix = data_matrix
+ @compression = compression.to_f
+ @stdev = Array.new
+ @mean = Array.new
+
+ # Objective Feature Selection
+ raise "Error! PCA needs at least two dimensions." if data_matrix.size2 < 2
+ @data_matrix_selected = nil
+ (0..@data_matrix.size2-1).each { |i|
+ if !Algorithm::zero_variance?(@data_matrix.col(i).to_a)
+ if @data_matrix_selected.nil?
+ @data_matrix_selected = GSL::Matrix.alloc(@data_matrix.size1, 1)
+ @data_matrix_selected.col(0)[0..@data_matrix.size1-1] = @data_matrix.col(i)
+ else
+ @data_matrix_selected = @data_matrix_selected.horzcat(GSL::Matrix.alloc(@data_matrix.col(i).to_a,@data_matrix.size1, 1))
+ end
+ end
+ }
+ raise "Error! PCA needs at least two dimensions." if (@data_matrix_selected.nil? || @data_matrix_selected.size2 < 2)
+
+ # Scaling of Axes
+ @data_matrix_scaled = GSL::Matrix.alloc(@data_matrix_selected.size1, @data_matrix_selected.size2)
+ (0..@data_matrix_selected.size2-1).each { |i|
+ @autoscaler = OpenTox::Algorithm::Transform::AutoScale.new(@data_matrix_selected.col(i))
+ @data_matrix_scaled.col(i)[0..@data_matrix.size1-1] = @autoscaler.scaled_values
+ @stdev << @autoscaler.stdev
+ @mean << @autoscaler.mean
+ }
+
+ data_matrix_hash = Hash.new
+ (0..@data_matrix_scaled.size2-1).each { |i|
+ column_view = @data_matrix_scaled.col(i)
+ data_matrix_hash[i] = column_view.to_scale
+ }
+ dataset_hash = data_matrix_hash.to_dataset # see http://goo.gl/7XcW9
+ cor_matrix=Statsample::Bivariate.correlation_matrix(dataset_hash)
+ pca=Statsample::Factor::PCA.new(cor_matrix)
+ pca.eigenvalues.each { |ev| raise "PCA failed!" unless !ev.nan? }
+ @eigenvalue_sums = Array.new
+ (0..dataset_hash.fields.size-1).each { |i|
+ @eigenvalue_sums << pca.eigenvalues[0..i].inject{ |sum, ev| sum + ev }
+ }
+ eigenvectors_selected = Array.new
+ pca.eigenvectors.each_with_index { |ev, i|
+ if (@eigenvalue_sums[i] <= ((1.0-@compression)*dataset_hash.fields.size)) || (eigenvectors_selected.size == 0)
+ eigenvectors_selected << ev.to_a
+ end
+ }
+ @eigenvector_matrix = GSL::Matrix.alloc(eigenvectors_selected.flatten, eigenvectors_selected.size, dataset_hash.fields.size).transpose
+ dataset_matrix = dataset_hash.to_gsl.transpose
+ @data_transformed_matrix = (@eigenvector_matrix.transpose * dataset_matrix).transpose
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+ # Restores data in the original feature space (possibly with compression loss).
+ # @return [GSL::Matrix] Data matrix.
+ def restore
+ begin
+ data_matrix_restored = (@eigenvector_matrix * @data_transformed_matrix.transpose).transpose # reverse pca
+ # reverse scaling
+ (0..data_matrix_restored.size2-1).each { |i|
+ data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] *= @stdev[i] unless @stdev[i] == 0.0
+ data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] += @mean[i]
+ }
+ data_matrix_restored
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ end
+
+ end
+
+ end

  # Gauss kernel
  # @return [Float]
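
The Log10 transformer in the Transform module above slides values so the minimum becomes the anchor, adds a tiny epsilon, and takes log10; constructing it with a second argument (the stored offset) reverses the operation. A standalone round-trip sketch of that arithmetic, without the class wrapper:

# Standalone round-trip sketch of the Log10 transformer above
# (forward: slide by offset, add epsilon, log10; reverse: 10**v, undo epsilon and offset).
values  = [0.5, 2.0, 30.0]
epsilon = 0.000000001

offset = values.min
offset = -1.0 * offset if offset > 0.0                 # as in the class: anchor at the minimum
logged = values.map { |v| Math.log10(v - offset + epsilon) }

restored = logged.map { |v| 10**v - epsilon + offset }
puts restored.map { |v| v.round(6) }.inspect            # => [0.5, 2.0, 30.0]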
@@ -257,16 +905,85 @@ module OpenTox
  d = 1.0 - x.to_f
  Math.exp(-(d*d)/(2*sigma*sigma))
  end
+
+ # For symbolic features
+ # @param [Array] Array to test, must indicate non-occurrence with 0.
+ # @return [Boolean] Whether the feature is singular or non-occurring or present everywhere.
+ def self.isnull_or_singular?(array)
+ nr_zeroes = array.count(0)
+ return (nr_zeroes == array.size) || # remove non-occurring feature
+ (nr_zeroes == array.size-1) || # remove singular feature
+ (nr_zeroes == 0) # also remove feature present everywhere
+ end
+
+ # Numeric value test
+ # @param[Object] value
+ # @return [Boolean] Whether value is a number
+ def self.numeric?(value)
+ true if Float(value) rescue false
+ end
+
+ # For symbolic features
+ # @param [Array] Array to test, must indicate non-occurrence with 0.
+ # @return [Boolean] Whether the feature has variance zero.
+ def self.zero_variance?(array)
+ return (array.to_scale.variance_sample == 0.0)
+ end

- # Median of an array
+ # Sum of an array for Arrays.
  # @param [Array] Array with values
- # @return [Float] Median
- def self.median(array)
- return nil if array.empty?
- array.sort!
- m_pos = array.size / 2
- return array.size % 2 == 1 ? array[m_pos] : (array[m_pos-1] + array[m_pos])/2
+ # @return [Integer] Sum of size of values
+ def self.sum_size(array)
+ sum=0
+ array.each { |e| sum += e.size }
+ return sum
+ end
+
+ # Minimum Frequency
+ # @param [Integer] per-mil value
+ # return [Integer] min-frequency
+ def self.min_frequency(training_dataset,per_mil)
+ minfreq = per_mil * training_dataset.compounds.size.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
+ minfreq = 2 unless minfreq > 2
+ Integer (minfreq)
  end

+ # Effect calculation for classification
+ # @param [Array] Array of occurrences per class in the form of Enumerables.
+ # @param [Array] Array of database instance counts per class.
+ def self.effect(occurrences, db_instances)
+ max=0
+ max_value=0
+ nr_o = self.sum_size(occurrences)
+ nr_db = db_instances.to_scale.sum
+
+ occurrences.each_with_index { |o,i| # fminer outputs occurrences sorted reverse by activity.
+ actual = o.size.to_f/nr_o
+ expected = db_instances[i].to_f/nr_db
+ if actual > expected
+ if ((actual - expected) / actual) > max_value
+ max_value = (actual - expected) / actual # 'Schleppzeiger'
+ max = i
+ end
+ end
+ }
+ max
+ end
+
+ # Returns Support value of an fingerprint
+ # @param [Hash] params Keys: `:compound_features_hits, :weights, :training_compound_features_hits, :features, :nr_hits:, :mode` are required
+ # return [Numeric] Support value
+ def self.p_sum_support(params)
+ p_sum = 0.0
+ params[:features].each{|f|
+ compound_hits = params[:compound_features_hits][f]
+ neighbor_hits = params[:training_compound_features_hits][f]
+ p_sum += eval("(Algorithm.gauss(params[:weights][f]) * ([compound_hits, neighbor_hits].compact.#{params[:mode]}))")
+ }
+ p_sum
+ end
+
  end
  end
+
+
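
p_sum_support above is what the nr_hits branch of tanimoto() calls: for each feature it multiplies the gauss-weighted p_value by the minimum (for the common features) or maximum (for all features) of the two compounds' hit counts. A standalone sketch of the same arithmetic, written without eval; fragment names, hit counts and p_values are made up, and the gauss sigma default is assumed:

# Standalone sketch of p_sum_support and the hit-count weighted similarity it enables.
def gauss(x, sigma = 0.3)   # sigma default assumed
  d = 1.0 - x.to_f
  Math.exp(-(d * d) / (2 * sigma * sigma))
end

def p_sum_support(features, weights, hits_a, hits_b, mode)
  features.inject(0.0) do |sum, f|
    counts = [hits_a[f], hits_b[f]].compact        # drop nil when a compound lacks the feature
    sum + gauss(weights[f]) * counts.send(mode)    # mode is :min or :max
  end
end

# Made-up fragment hit counts and p_values for two compounds:
weights = { "c1ccccc1" => 0.9, "C=O" => 0.3 }
hits_a  = { "c1ccccc1" => 2, "C=O" => 1 }
hits_b  = { "c1ccccc1" => 1 }

common = ["c1ccccc1"]                  # features present in both compounds
all    = ["c1ccccc1", "C=O"]           # union of features
sim = p_sum_support(common, weights, hits_a, hits_b, :min) /
      p_sum_support(all,    weights, hits_a, hits_b, :max)
puts sim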