lazar 1.0.0 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/lib/model.rb CHANGED
@@ -9,6 +9,8 @@ module OpenTox
9
9
  include Mongoid::Timestamps
10
10
  store_in collection: "models"
11
11
 
12
+ attr_writer :independent_variables # store in GridFS to avoid Mongo database size limit problems
13
+
12
14
  field :name, type: String
13
15
  field :creator, type: String, default: __FILE__
14
16
  field :algorithms, type: Hash, default:{}
@@ -17,7 +19,7 @@ module OpenTox
17
19
  field :prediction_feature_id, type: BSON::ObjectId
18
20
  field :dependent_variables, type: Array, default:[]
19
21
  field :descriptor_ids, type:Array, default:[]
20
- field :independent_variables, type: Array, default:[]
22
+ field :independent_variables_id, type: BSON::ObjectId
21
23
  field :fingerprints, type: Array, default:[]
22
24
  field :descriptor_weights, type: Array, default:[]
23
25
  field :descriptor_means, type: Array, default:[]
@@ -25,7 +27,15 @@ module OpenTox
25
27
  field :scaled_variables, type: Array, default:[]
26
28
  field :version, type: Hash, default:{}
27
29
 
28
- def self.create prediction_feature:nil, training_dataset:nil, algorithms:{}
30
+ # Create a lazar model
31
+ # @param [OpenTox::Dataset] training_dataset
32
+ # @param [OpenTox::Feature, nil] prediction_feature
33
+ # By default the first feature of the training dataset will be predicted, specify a prediction_feature if you want to predict another feature
34
+ # @param [Hash, nil] algorithms
35
+ # Default algorithms will be used, if no algorithms parameter is provided. The algorithms hash has the following keys: :descriptors (specifies the descriptors to be used for similarity calculations and local QSAR models), :similarity (similarity algorithm and threshold), :feature_selection (feature selection algorithm), :prediction (local QSAR algorithm). Default parameters are used for unspecified keys.
36
+ #
37
+ # @return [OpenTox::Model::Lazar]
38
+ def self.create prediction_feature:nil, training_dataset:, algorithms:{}
29
39
  bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset
30
40
  prediction_feature = training_dataset.features.first unless prediction_feature
31
41
  # TODO: prediction_feature without training_dataset: use all available data
@@ -119,6 +129,7 @@ module OpenTox
119
129
  end
120
130
 
121
131
  descriptor_method = model.algorithms[:descriptors][:method]
132
+ model.independent_variables = []
122
133
  case descriptor_method
123
134
  # parse fingerprints
124
135
  when "fingerprint"
@@ -177,8 +188,12 @@ module OpenTox
177
188
  model
178
189
  end
179
190
 
191
+ # Predict a substance (compound or nanoparticle)
192
+ # @param [OpenTox::Substance]
193
+ # @return [Hash]
180
194
  def predict_substance substance
181
195
 
196
+ @independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
182
197
  case algorithms[:similarity][:method]
183
198
  when /tanimoto/ # binary features
184
199
  similarity_descriptors = substance.fingerprint algorithms[:descriptors][:type]
@@ -234,7 +249,7 @@ module OpenTox
234
249
  neighbor_dependent_variables << dependent_variables[i]
235
250
  independent_variables.each_with_index do |c,j|
236
251
  neighbor_independent_variables[j] ||= []
237
- neighbor_independent_variables[j] << independent_variables[j][i]
252
+ neighbor_independent_variables[j] << @independent_variables[j][i]
238
253
  end
239
254
  end
240
255
  end
@@ -256,6 +271,9 @@ module OpenTox
256
271
  prediction
257
272
  end
258
273
 
274
+ # Predict a substance (compound or nanoparticle), an array of substances or a dataset
275
+ # @param [OpenTox::Compound, OpenTox::Nanoparticle, Array<OpenTox::Substance>, OpenTox::Dataset]
276
+ # @return [Hash, Array<Hash>, OpenTox::Dataset]
259
277
  def predict object
260
278
 
261
279
  training_dataset = Dataset.find training_dataset_id
@@ -302,34 +320,62 @@ module OpenTox
302
320
 
303
321
  end
304
322
 
323
+ # Save the model
324
+ # Stores independent_variables in GridFS to avoid Mongo database size limit problems
325
+ def save
326
+ file = Mongo::Grid::File.new(Marshal.dump(@independent_variables), :filename => "#{id}.independent_variables")
327
+ self.independent_variables_id = $gridfs.insert_one(file)
328
+ super
329
+ end
330
+
331
+ # Get independent variables
332
+ # @return [Array<Array>]
333
+ def independent_variables
334
+ @independent_variables ||= Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
335
+ @independent_variables
336
+ end
337
+
338
+ # Get training dataset
339
+ # @return [OpenTox::Dataset]
305
340
  def training_dataset
306
341
  Dataset.find(training_dataset_id)
307
342
  end
308
343
 
344
+ # Get prediction feature
345
+ # @return [OpenTox::Feature]
309
346
  def prediction_feature
310
347
  Feature.find(prediction_feature_id)
311
348
  end
312
349
 
350
+ # Get training descriptors
351
+ # @return [Array<OpenTox::Feature>]
313
352
  def descriptors
314
353
  descriptor_ids.collect{|id| Feature.find(id)}
315
354
  end
316
355
 
356
+ # Get training substances
357
+ # @return [Array<OpenTox::Substance>]
317
358
  def substances
318
359
  substance_ids.collect{|id| Substance.find(id)}
319
360
  end
320
361
 
362
+ # Are fingerprints used as descriptors
363
+ # @return [TrueClass, FalseClass]
321
364
  def fingerprints?
322
365
  algorithms[:descriptors][:method] == "fingerprint" ? true : false
323
366
  end
324
367
 
325
368
  end
326
369
 
370
+ # Classification model
327
371
  class LazarClassification < Lazar
328
372
  end
329
373
 
374
+ # Regression model
330
375
  class LazarRegression < Lazar
331
376
  end
332
377
 
378
+ # Convenience class for generating and validating lazar models in a single step and predicting substances (compounds and nanoparticles), arrays of substances and datasets
333
379
  class Validation
334
380
 
335
381
  include OpenTox
@@ -343,42 +389,64 @@ module OpenTox
343
389
  field :model_id, type: BSON::ObjectId
344
390
  field :repeated_crossvalidation_id, type: BSON::ObjectId
345
391
 
392
+ # Predict a substance (compound or nanoparticle), an array of substances or a dataset
393
+ # @param [OpenTox::Compound, OpenTox::Nanoparticle, Array<OpenTox::Substance>, OpenTox::Dataset]
394
+ # @return [Hash, Array<Hash>, OpenTox::Dataset]
346
395
  def predict object
347
396
  model.predict object
348
397
  end
349
398
 
399
+ # Get training dataset
400
+ # @return [OpenTox::Dataset]
350
401
  def training_dataset
351
402
  model.training_dataset
352
403
  end
353
404
 
405
+ # Get lazar model
406
+ # @return [OpenTox::Model::Lazar]
354
407
  def model
355
408
  Lazar.find model_id
356
409
  end
357
410
 
411
+ # Get algorithms
412
+ # @return [Hash]
358
413
  def algorithms
359
414
  model.algorithms
360
415
  end
361
416
 
417
+ # Get prediction feature
418
+ # @return [OpenTox::Feature]
362
419
  def prediction_feature
363
420
  model.prediction_feature
364
421
  end
365
422
 
423
+ # Get repeated crossvalidations
424
+ # @return [OpenTox::Validation::RepeatedCrossValidation]
366
425
  def repeated_crossvalidation
367
426
  OpenTox::Validation::RepeatedCrossValidation.find repeated_crossvalidation_id # full class name required
368
427
  end
369
428
 
429
+ # Get crossvalidations
430
+ # @return [Array<OpenTox::CrossValidation>]
370
431
  def crossvalidations
371
432
  repeated_crossvalidation.crossvalidations
372
433
  end
373
434
 
435
+ # Is it a regression model
436
+ # @return [TrueClass, FalseClass]
374
437
  def regression?
375
438
  model.is_a? LazarRegression
376
439
  end
377
440
 
441
+ # Is it a classification model
442
+ # @return [TrueClass, FalseClass]
378
443
  def classification?
379
444
  model.is_a? LazarClassification
380
445
  end
381
446
 
447
+ # Create and validate a lazar model from a csv file with training data and a json file with metadata
448
+ # @param [File] CSV file with two columns. The first line should contain either SMILES or InChI (first column) and the endpoint (second column). The first column should contain either the SMILES or InChI of the training compounds, the second column the training compounds' toxic activities (qualitative or quantitative). Use -log10 transformed values for regression datasets. Add metadata to a JSON file with the same basename containing the fields "species", "endpoint", "source" and "unit" (regression only). You can find example training data at https://github.com/opentox/lazar-public-data.
449
+ # @return [OpenTox::Model::Validation] lazar model with three independent 10-fold crossvalidations
382
450
  def self.from_csv_file file
383
451
  metadata_file = file.sub(/csv$/,"json")
384
452
  bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file
@@ -391,6 +459,12 @@ module OpenTox
391
459
  model_validation
392
460
  end
393
461
 
462
+ # Create and validate a nano-lazar model, import data from eNanoMapper if necessary
463
+ # nano-lazar methods are described in detail in https://github.com/enanomapper/nano-lazar-paper/blob/master/nano-lazar.pdf
464
+ # @param [OpenTox::Dataset, nil] training_dataset
465
+ # @param [OpenTox::Feature, nil] prediction_feature
466
+ # @param [Hash, nil] algorithms
467
+ # @return [OpenTox::Model::Validation] lazar model with five independent 10-fold crossvalidations
394
468
  def self.from_enanomapper training_dataset: nil, prediction_feature:nil, algorithms: nil
395
469
 
396
470
  # find/import training_dataset
data/lib/nanoparticle.rb CHANGED
@@ -1,25 +1,36 @@
1
1
  module OpenTox
2
2
 
3
+ # Nanoparticles
3
4
  class Nanoparticle < Substance
4
5
  include OpenTox
5
6
 
6
7
  field :core_id, type: String, default: nil
7
8
  field :coating_ids, type: Array, default: []
8
9
 
10
+ # Get core compound
11
+ # @return [OpenTox::Compound]
9
12
  def core
10
13
  Compound.find core_id
11
14
  end
12
15
 
16
+ # Get coatings
17
+ # @return [Array<OpenTox::Compound>]
13
18
  def coating
14
19
  coating_ids.collect{|i| Compound.find i }
15
20
  end
16
21
 
22
+ # Get nanoparticle fingerprint (union of core and coating fingerprints)
23
+ # @param [String] fingerprint type
24
+ # @return [Array<String>]
17
25
  def fingerprint type=DEFAULT_FINGERPRINT
18
26
  core_fp = core.fingerprint type
19
27
  coating_fp = coating.collect{|c| c.fingerprint type}.flatten.uniq.compact
20
28
  (core_fp.empty? or coating_fp.empty?) ? [] : (core_fp+coating_fp).uniq.compact
21
29
  end
22
30
 
31
+ # Calculate physchem properties
32
+ # @param [Array<Hash>] list of descriptors
33
+ # @return [Array<Float>]
23
34
  def calculate_properties descriptors=PhysChem::OPENBABEL
24
35
  if core.smiles and !coating.collect{|c| c.smiles}.compact.empty?
25
36
  core_prop = core.calculate_properties descriptors
@@ -28,6 +39,10 @@ module OpenTox
28
39
  end
29
40
  end
30
41
 
42
+ # Add (measured) feature values
43
+ # @param [OpenTox::Feature]
44
+ # @param [TrueClass,FalseClass,Float]
45
+ # @param [OpenTox::Dataset]
31
46
  def add_feature feature, value, dataset
32
47
  unless feature.name == "ATOMIC COMPOSITION" or feature.name == "FUNCTIONAL GROUP" # redundant
33
48
  case feature.category
@@ -55,6 +70,10 @@ module OpenTox
55
70
  end
56
71
  end
57
72
 
73
+ # Parse values from Ambit database
74
+ # @param [OpenTox::Feature]
75
+ # @param [TrueClass,FalseClass,Float]
76
+ # @param [OpenTox::Dataset]
58
77
  def parse_ambit_value feature, v, dataset
59
78
  # TODO add study id to warnings
60
79
  v.delete "unit"
data/lib/overwrite.rb CHANGED
@@ -2,41 +2,51 @@ require "base64"
2
2
  class Object
3
3
  # An object is blank if it's false, empty, or a whitespace string.
4
4
  # For example, "", " ", +nil+, [], and {} are all blank.
5
+ # @return [TrueClass,FalseClass]
5
6
  def blank?
6
7
  respond_to?(:empty?) ? empty? : !self
7
8
  end
8
9
 
10
+ # Is it a numeric object
11
+ # @return [TrueClass,FalseClass]
9
12
  def numeric?
10
13
  true if Float(self) rescue false
11
14
  end
12
15
 
13
16
  # Returns dimension of nested arrays
17
+ # @return [Fixnum]
14
18
  def dimension
15
19
  self.class == Array ? 1 + self[0].dimension : 0
16
20
  end
17
21
  end
18
22
 
19
23
  class Numeric
24
+ # Convert number to percent
25
+ # @return [Float]
20
26
  def percent_of(n)
21
27
  self.to_f / n.to_f * 100.0
22
28
  end
23
29
  end
24
30
 
25
31
  class Float
26
- # round to n significant digits
27
- # http://stackoverflow.com/questions/8382619/how-to-round-a-float-to-a-specified-number-of-significant-digits-in-ruby
32
+ # Round to n significant digits
33
+ # http://stackoverflow.com/questions/8382619/how-to-round-a-float-to-a-specified-number-of-significant-digits-in-ruby
34
+ # @param [Fixnum]
35
+ # @return [Float]
28
36
  def signif(n)
29
37
  Float("%.#{n}g" % self)
30
38
  end
31
39
 
32
- # converts -10 logarithmized values back
40
+ # Convert -10 log values to original values
41
+ # @return [Float]
33
42
  def delog10
34
43
  10**(-1*self)
35
44
  end
36
45
  end
37
46
 
38
47
  module Enumerable
39
- # @return [Array] only the duplicates of an enumerable
48
+ # Get duplicates
49
+ # @return [Array]
40
50
  def duplicates
41
51
  inject({}) {|h,v| h[v]=h[v].to_i+1; h}.reject{|k,v| v==1}.keys
42
52
  end
@@ -51,7 +61,10 @@ module Enumerable
51
61
  end
52
62
 
53
63
  class String
54
- # @return [String] converts camel-case to underscore-case (OpenTox::SuperModel -> open_tox/super_model)
64
+ # Convert camel-case to underscore-case
65
+ # @example
66
+ # OpenTox::SuperModel -> open_tox/super_model
67
+ # @return [String]
55
68
  def underscore
56
69
  self.gsub(/::/, '/').
57
70
  gsub(/([A-Z]+)([A-Z][a-z])/,'\1_\2').
@@ -60,7 +73,7 @@ class String
60
73
  downcase
61
74
  end
62
75
 
63
- # convert strings to boolean values
76
+ # Convert strings to boolean values
64
77
  # @return [TrueClass,FalseClass] true or false
65
78
  def to_boolean
66
79
  return true if self == true || self =~ (/(true|t|yes|y|1)$/i)
@@ -71,7 +84,8 @@ class String
71
84
  end
72
85
 
73
86
  class File
74
- # @return [String] mime_type including charset using linux cmd command
87
+ # Get mime_type including charset using linux file command
88
+ # @return [String]
75
89
  def mime_type
76
90
  `file -ib '#{self.path}'`.chomp
77
91
  end
@@ -79,7 +93,7 @@ end
79
93
 
80
94
  class Array
81
95
 
82
- # Sum up the size of single arrays in an array of arrays
96
+ # Sum the size of single arrays in an array of arrays
83
97
  # @param [Array] Array of arrays
84
98
  # @return [Integer] Sum of size of array elements
85
99
  def sum_size
@@ -92,33 +106,43 @@ class Array
92
106
  }
93
107
  end
94
108
 
95
- # For symbolic features
109
+ # Check if the array has just one unique value.
96
110
  # @param [Array] Array to test.
97
- # @return [Boolean] Whether the array has just one unique value.
111
+ # @return [TrueClass,FalseClass]
98
112
  def zero_variance?
99
113
  return self.uniq.size == 1
100
114
  end
101
115
 
116
+ # Get the median of an array
117
+ # @return [Numeric]
102
118
  def median
103
119
  sorted = self.sort
104
120
  len = sorted.length
105
121
  (sorted[(len - 1) / 2] + sorted[len / 2]) / 2.0
106
122
  end
107
123
 
124
+ # Get the mean of an array
125
+ # @return [Numeric]
108
126
  def mean
109
127
  self.compact.inject{ |sum, el| sum + el }.to_f / self.compact.size
110
128
  end
111
129
 
130
+ # Get the variance of an array
131
+ # @return [Numeric]
112
132
  def sample_variance
113
133
  m = self.mean
114
134
  sum = self.compact.inject(0){|accum, i| accum +(i-m)**2 }
115
135
  sum/(self.compact.length - 1).to_f
116
136
  end
117
137
 
138
+ # Get the standard deviation of an array
139
+ # @return [Numeric]
118
140
  def standard_deviation
119
141
  Math.sqrt(self.sample_variance)
120
142
  end
121
143
 
144
+ # Convert array values for R
145
+ # @return [Array]
122
146
  def for_R
123
147
  if self.first.is_a?(String)
124
148
  #"\"#{self.collect{|v| v.sub('[','').sub(']','')}.join(" ")}\"" # quote and remove square brackets
@@ -128,6 +152,8 @@ class Array
128
152
  end
129
153
  end
130
154
 
155
+ # Collect array with index
156
+ # in analogy to each_with_index
131
157
  def collect_with_index
132
158
  result = []
133
159
  self.each_with_index do |elt, idx|
@@ -139,11 +165,15 @@ end
139
165
 
140
166
  module URI
141
167
 
168
+ # Is it a https connection
169
+ # @param [String]
170
+ # @return [TrueClass,FalseClass]
142
171
  def self.ssl? uri
143
172
  URI.parse(uri).instance_of? URI::HTTPS
144
173
  end
145
174
 
146
- # @return [Boolean] checks if resource exists by making a HEAD-request
175
+ # Check if a http resource exists by making a HEAD-request
176
+ # @return [TrueClass,FalseClass]
147
177
  def self.accessible?(uri)
148
178
  parsed_uri = URI.parse(uri + (OpenTox::RestClientWrapper.subjectid ? "?subjectid=#{CGI.escape OpenTox::RestClientWrapper.subjectid}" : ""))
149
179
  http_code = URI.task?(uri) ? 600 : 400
@@ -163,6 +193,9 @@ module URI
163
193
  false
164
194
  end
165
195
 
196
+ # Is the URI valid
197
+ # @param [String]
198
+ # @return [TrueClass,FalseClass]
166
199
  def self.valid? uri
167
200
  u = URI.parse(uri)
168
201
  u.scheme!=nil and u.host!=nil
@@ -170,6 +203,8 @@ module URI
170
203
  false
171
204
  end
172
205
 
206
+ # Is the URI a task URI
207
+ # @param [String]
173
208
  def self.task? uri
174
209
  uri =~ /task/ and URI.valid? uri
175
210
  end
data/lib/physchem.rb CHANGED
@@ -39,6 +39,9 @@ module OpenTox
39
39
 
40
40
  require_relative "unique_descriptors.rb"
41
41
 
42
+ # Get descriptor features
43
+ # @param [Hash]
44
+ # @return [Array<OpenTox::PhysChem>]
42
45
  def self.descriptors desc=DESCRIPTORS
43
46
  desc.collect do |name,description|
44
47
  lib,desc = name.split('.',2)
@@ -46,6 +49,8 @@ module OpenTox
46
49
  end
47
50
  end
48
51
 
52
+ # Get unique descriptor features
53
+ # @return [Array<OpenTox::PhysChem>]
49
54
  def self.unique_descriptors
50
55
  udesc = []
51
56
  UNIQUEDESCRIPTORS.each do |name|
@@ -64,23 +69,28 @@ module OpenTox
64
69
  udesc
65
70
  end
66
71
 
72
+ # Get OpenBabel descriptor features
73
+ # @return [Array<OpenTox::PhysChem>]
67
74
  def self.openbabel_descriptors
68
75
  descriptors OPENBABEL
69
76
  end
70
77
 
78
+ # Get CDK descriptor features
79
+ # @return [Array<OpenTox::PhysChem>]
71
80
  def self.cdk_descriptors
72
81
  descriptors CDK
73
82
  end
74
83
 
84
+ # Get JOELIB descriptor features
85
+ # @return [Array<OpenTox::PhysChem>]
75
86
  def self.joelib_descriptors
76
87
  descriptors JOELIB
77
88
  end
78
89
 
79
- def calculate compound
80
- result = send library.downcase,descriptor,compound
81
- result[self.name]
82
- end
83
-
90
+ # Calculate OpenBabel descriptors
91
+ # @param [String] descriptor type
92
+ # @param [OpenTox::Compound]
93
+ # @return [Hash]
84
94
  def openbabel descriptor, compound
85
95
  obdescriptor = OpenBabel::OBDescriptor.find_type descriptor
86
96
  obmol = OpenBabel::OBMol.new
@@ -90,10 +100,18 @@ module OpenTox
90
100
  {"#{library.capitalize}.#{descriptor}" => fix_value(obdescriptor.predict(obmol))}
91
101
  end
92
102
 
103
+ # Calculate CDK descriptors
104
+ # @param [String] descriptor type
105
+ # @param [OpenTox::Compound]
106
+ # @return [Hash]
93
107
  def cdk descriptor, compound
94
108
  java_descriptor "cdk", descriptor, compound
95
109
  end
96
110
 
111
+ # Calculate JOELIB descriptors
112
+ # @param [String] descriptor type
113
+ # @param [OpenTox::Compound]
114
+ # @return [Hash]
97
115
  def joelib descriptor, compound
98
116
  java_descriptor "joelib", descriptor, compound
99
117
  end
data/lib/regression.rb CHANGED
@@ -1,8 +1,13 @@
1
1
  module OpenTox
2
2
  module Algorithm
3
3
 
4
+ # Regression algorithms
4
5
  class Regression
5
6
 
7
+ # Weighted average
8
+ # @param [Array<TrueClass,FalseClass>] dependent_variables
9
+ # @param [Array<Float>] weights
10
+ # @return [Hash]
6
11
  def self.weighted_average dependent_variables:, independent_variables:nil, weights:, query_variables:nil
7
12
  # TODO: prediction_interval
8
13
  weighted_sum = 0.0
@@ -1,5 +1,6 @@
1
1
  module OpenTox
2
2
 
3
+ # Adjustments to the rest-client gem for OpenTox
3
4
  class RestClientWrapper
4
5
 
5
6
  attr_accessor :request, :response
data/lib/similarity.rb CHANGED
@@ -2,6 +2,10 @@ module OpenTox
2
2
  module Algorithm
3
3
 
4
4
  class Vector
5
+ # Get dot product
6
+ # @param [Vector]
7
+ # @param [Vector]
8
+ # @return [Numeric]
5
9
  def self.dot_product(a, b)
6
10
  products = a.zip(b).map{|a, b| a * b}
7
11
  products.inject(0) {|s,p| s + p}
@@ -15,6 +19,9 @@ module OpenTox
15
19
 
16
20
  class Similarity
17
21
 
22
+ # Get Tanimoto similarity
23
+ # @param [Array<Array<Float>>]
24
+ # @return [Float]
18
25
  def self.tanimoto fingerprints
19
26
  ( fingerprints[0] & fingerprints[1]).size/(fingerprints[0]|fingerprints[1]).size.to_f
20
27
  end
@@ -23,18 +30,28 @@ module OpenTox
23
30
  #( fingerprints[0] & fingerprints[1]).size/(fingerprints[0]|fingerprints[1]).size.to_f
24
31
  #end
25
32
 
33
+ # Get Euclidean distance
34
+ # @param [Array<Array<Float>>]
35
+ # @return [Float]
26
36
  def self.euclid scaled_properties
27
37
  sq = scaled_properties[0].zip(scaled_properties[1]).map{|a,b| (a - b) ** 2}
28
38
  Math.sqrt(sq.inject(0) {|s,c| s + c})
29
39
  end
30
40
 
31
- # http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity
41
+ # Get cosine similarity
42
+ # http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity
43
+ # @param [Array<Array<Float>>]
44
+ # @return [Float]
32
45
  def self.cosine scaled_properties
33
46
  scaled_properties = remove_nils scaled_properties
34
47
  Algorithm::Vector.dot_product(scaled_properties[0], scaled_properties[1]) / (Algorithm::Vector.magnitude(scaled_properties[0]) * Algorithm::Vector.magnitude(scaled_properties[1]))
35
48
  end
36
49
 
37
- def self.weighted_cosine scaled_properties # [a,b,weights]
50
+ # Get weighted cosine similarity
51
+ # http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity
52
+ # @param [Array<Array<Float>>] [a,b,weights]
53
+ # @return [Float]
54
+ def self.weighted_cosine scaled_properties
38
55
  a,b,w = remove_nils scaled_properties
39
56
  return cosine(scaled_properties) if w.uniq.size == 1
40
57
  dot_product = 0
@@ -48,6 +65,9 @@ module OpenTox
48
65
  dot_product/(Math.sqrt(magnitude_a)*Math.sqrt(magnitude_b))
49
66
  end
50
67
 
68
+ # Remove nil values
69
+ # @param [Array<Array<Float>>] [a,b,weights]
70
+ # @return [Array<Array<Float>>] [a,b,weights]
51
71
  def self.remove_nils scaled_properties
52
72
  a =[]; b = []; w = []
53
73
  (0..scaled_properties.first.size-1).each do |i|
data/lib/substance.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  module OpenTox
2
2
 
3
+ # Base class for substances (e.g. compounds, nanoparticles)
3
4
  class Substance
4
5
  field :properties, type: Hash, default: {}
5
6
  field :dataset_ids, type: Array, default: []
@@ -2,11 +2,17 @@ module OpenTox
2
2
 
3
3
  module Validation
4
4
 
5
+ # Training test set validation
5
6
  class TrainTest < Validation
6
7
 
7
8
  field :training_dataset_id, type: BSON::ObjectId
8
9
  field :test_dataset_id, type: BSON::ObjectId
9
10
 
11
+ # Create a training test set validation
12
+ # @param [OpenTox::Model::Lazar]
13
+ # @param [OpenTox::Dataset] training dataset
14
+ # @param [OpenTox::Dataset] test dataset
15
+ # @return [OpenTox::Validation::TrainTest]
10
16
  def self.create model, training_set, test_set
11
17
 
12
18
  validation_model = model.class.create prediction_feature: model.prediction_feature, training_dataset: training_set, algorithms: model.algorithms
@@ -32,16 +38,21 @@ module OpenTox
32
38
  validation
33
39
  end
34
40
 
41
+ # Get test dataset
42
+ # @return [OpenTox::Dataset]
35
43
  def test_dataset
36
44
  Dataset.find test_dataset_id
37
45
  end
38
46
 
47
+ # Get training dataset
48
+ # @return [OpenTox::Dataset]
39
49
  def training_dataset
40
50
  Dataset.find training_dataset_id
41
51
  end
42
52
 
43
53
  end
44
54
 
55
+ # Training test set validation for classification models
45
56
  class ClassificationTrainTest < TrainTest
46
57
  include ClassificationStatistics
47
58
  field :accept_values, type: Array
@@ -54,6 +65,7 @@ module OpenTox
54
65
  field :probability_plot_id, type: BSON::ObjectId
55
66
  end
56
67
 
68
+ # Training test set validation for regression models
57
69
  class RegressionTrainTest < TrainTest
58
70
  include RegressionStatistics
59
71
  field :rmse, type: Float, default:0