lazar 1.0.0 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +64 -1
- data/VERSION +1 -1
- data/lib/algorithm.rb +1 -0
- data/lib/caret.rb +11 -2
- data/lib/classification.rb +6 -1
- data/lib/compound.rb +32 -23
- data/lib/crossvalidation.rb +22 -0
- data/lib/dataset.rb +30 -3
- data/lib/feature.rb +7 -0
- data/lib/feature_selection.rb +4 -1
- data/lib/import.rb +5 -1
- data/lib/leave-one-out-validation.rb +6 -0
- data/lib/model.rb +77 -3
- data/lib/nanoparticle.rb +19 -0
- data/lib/overwrite.rb +46 -11
- data/lib/physchem.rb +23 -5
- data/lib/regression.rb +5 -0
- data/lib/rest-client-wrapper.rb +1 -0
- data/lib/similarity.rb +22 -2
- data/lib/substance.rb +1 -0
- data/lib/train-test-validation.rb +12 -0
- data/lib/validation-statistics.rb +19 -0
- data/lib/validation.rb +3 -0
- data/test/feature.rb +2 -2
- data/test/model-nanoparticle.rb +7 -0
- data/test/nanomaterial-model-validation.rb +2 -3
- data/test/setup.rb +1 -5
- data/test/validation-regression.rb +2 -3
- metadata +50 -5
- data/lib/experiment.rb +0 -99
data/lib/model.rb
CHANGED
@@ -9,6 +9,8 @@ module OpenTox
|
|
9
9
|
include Mongoid::Timestamps
|
10
10
|
store_in collection: "models"
|
11
11
|
|
12
|
+
attr_writer :independent_variables # store in GridFS to avoid Mongo database size limit problems
|
13
|
+
|
12
14
|
field :name, type: String
|
13
15
|
field :creator, type: String, default: __FILE__
|
14
16
|
field :algorithms, type: Hash, default:{}
|
@@ -17,7 +19,7 @@ module OpenTox
|
|
17
19
|
field :prediction_feature_id, type: BSON::ObjectId
|
18
20
|
field :dependent_variables, type: Array, default:[]
|
19
21
|
field :descriptor_ids, type:Array, default:[]
|
20
|
-
field :
|
22
|
+
field :independent_variables_id, type: BSON::ObjectId
|
21
23
|
field :fingerprints, type: Array, default:[]
|
22
24
|
field :descriptor_weights, type: Array, default:[]
|
23
25
|
field :descriptor_means, type: Array, default:[]
|
@@ -25,7 +27,15 @@ module OpenTox
|
|
25
27
|
field :scaled_variables, type: Array, default:[]
|
26
28
|
field :version, type: Hash, default:{}
|
27
29
|
|
28
|
-
|
30
|
+
# Create a lazar model
|
31
|
+
# @param [OpenTox::Dataset] training_dataset
|
32
|
+
# @param [OpenTox::Feature, nil] prediction_feature
|
33
|
+
# By default the first feature of the training dataset will be predicted; specify a prediction_feature if you want to predict another feature.
|
34
|
+
# @param [Hash, nil] algorithms
|
35
|
+
# Default algorithms will be used, if no algorithms parameter is provided. The algorithms hash has the following keys: :descriptors (specifies the descriptors to be used for similarity calculations and local QSAR models), :similarity (similarity algorithm and threshold), :feature_selection (feature selection algorithm), :prediction (local QSAR algorithm). Default parameters are used for unspecified keys.
|
36
|
+
#
|
37
|
+
# @return [OpenTox::Model::Lazar]
|
38
|
+
def self.create prediction_feature:nil, training_dataset:, algorithms:{}
|
29
39
|
bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset
|
30
40
|
prediction_feature = training_dataset.features.first unless prediction_feature
|
31
41
|
# TODO: prediction_feature without training_dataset: use all available data
|
@@ -119,6 +129,7 @@ module OpenTox
|
|
119
129
|
end
|
120
130
|
|
121
131
|
descriptor_method = model.algorithms[:descriptors][:method]
|
132
|
+
model.independent_variables = []
|
122
133
|
case descriptor_method
|
123
134
|
# parse fingerprints
|
124
135
|
when "fingerprint"
|
@@ -177,8 +188,12 @@ module OpenTox
|
|
177
188
|
model
|
178
189
|
end
|
179
190
|
|
191
|
+
# Predict a substance (compound or nanoparticle)
|
192
|
+
# @param [OpenTox::Substance]
|
193
|
+
# @return [Hash]
|
180
194
|
def predict_substance substance
|
181
195
|
|
196
|
+
@independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
|
182
197
|
case algorithms[:similarity][:method]
|
183
198
|
when /tanimoto/ # binary features
|
184
199
|
similarity_descriptors = substance.fingerprint algorithms[:descriptors][:type]
|
@@ -234,7 +249,7 @@ module OpenTox
|
|
234
249
|
neighbor_dependent_variables << dependent_variables[i]
|
235
250
|
independent_variables.each_with_index do |c,j|
|
236
251
|
neighbor_independent_variables[j] ||= []
|
237
|
-
neighbor_independent_variables[j] << independent_variables[j][i]
|
252
|
+
neighbor_independent_variables[j] << @independent_variables[j][i]
|
238
253
|
end
|
239
254
|
end
|
240
255
|
end
|
@@ -256,6 +271,9 @@ module OpenTox
|
|
256
271
|
prediction
|
257
272
|
end
|
258
273
|
|
274
|
+
# Predict a substance (compound or nanoparticle), an array of substances or a dataset
|
275
|
+
# @param [OpenTox::Compound, OpenTox::Nanoparticle, Array<OpenTox::Substance>, OpenTox::Dataset]
|
276
|
+
# @return [Hash, Array<Hash>, OpenTox::Dataset]
|
259
277
|
def predict object
|
260
278
|
|
261
279
|
training_dataset = Dataset.find training_dataset_id
|
@@ -302,34 +320,62 @@ module OpenTox
|
|
302
320
|
|
303
321
|
end
|
304
322
|
|
323
|
+
# Save the model
|
324
|
+
# Stores independent_variables in GridFS to avoid Mongo database size limit problems
|
325
|
+
def save
|
326
|
+
file = Mongo::Grid::File.new(Marshal.dump(@independent_variables), :filename => "#{id}.independent_variables")
|
327
|
+
self.independent_variables_id = $gridfs.insert_one(file)
|
328
|
+
super
|
329
|
+
end
|
330
|
+
|
331
|
+
# Get independent variables
|
332
|
+
# @return [Array<Array>]
|
333
|
+
def independent_variables
|
334
|
+
@independent_variables ||= Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
|
335
|
+
@independent_variables
|
336
|
+
end
|
337
|
+
|
338
|
+
# Get training dataset
|
339
|
+
# @return [OpenTox::Dataset]
|
305
340
|
def training_dataset
|
306
341
|
Dataset.find(training_dataset_id)
|
307
342
|
end
|
308
343
|
|
344
|
+
# Get prediction feature
|
345
|
+
# @return [OpenTox::Feature]
|
309
346
|
def prediction_feature
|
310
347
|
Feature.find(prediction_feature_id)
|
311
348
|
end
|
312
349
|
|
350
|
+
# Get training descriptors
|
351
|
+
# @return [Array<OpenTox::Feature>]
|
313
352
|
def descriptors
|
314
353
|
descriptor_ids.collect{|id| Feature.find(id)}
|
315
354
|
end
|
316
355
|
|
356
|
+
# Get training substances
|
357
|
+
# @return [Array<OpenTox::Substance>]
|
317
358
|
def substances
|
318
359
|
substance_ids.collect{|id| Substance.find(id)}
|
319
360
|
end
|
320
361
|
|
362
|
+
# Are fingerprints used as descriptors
|
363
|
+
# @return [TrueClass, FalseClass]
|
321
364
|
def fingerprints?
|
322
365
|
algorithms[:descriptors][:method] == "fingerprint" ? true : false
|
323
366
|
end
|
324
367
|
|
325
368
|
end
|
326
369
|
|
370
|
+
# Classification model
|
327
371
|
class LazarClassification < Lazar
|
328
372
|
end
|
329
373
|
|
374
|
+
# Regression model
|
330
375
|
class LazarRegression < Lazar
|
331
376
|
end
|
332
377
|
|
378
|
+
# Convenience class for generating and validating lazar models in a single step and predicting substances (compounds and nanoparticles), arrays of substances and datasets
|
333
379
|
class Validation
|
334
380
|
|
335
381
|
include OpenTox
|
@@ -343,42 +389,64 @@ module OpenTox
|
|
343
389
|
field :model_id, type: BSON::ObjectId
|
344
390
|
field :repeated_crossvalidation_id, type: BSON::ObjectId
|
345
391
|
|
392
|
+
# Predict a substance (compound or nanoparticle), an array of substances or a dataset
|
393
|
+
# @param [OpenTox::Compound, OpenTox::Nanoparticle, Array<OpenTox::Substance>, OpenTox::Dataset]
|
394
|
+
# @return [Hash, Array<Hash>, OpenTox::Dataset]
|
346
395
|
def predict object
|
347
396
|
model.predict object
|
348
397
|
end
|
349
398
|
|
399
|
+
# Get training dataset
|
400
|
+
# @return [OpenTox::Dataset]
|
350
401
|
def training_dataset
|
351
402
|
model.training_dataset
|
352
403
|
end
|
353
404
|
|
405
|
+
# Get lazar model
|
406
|
+
# @return [OpenTox::Model::Lazar]
|
354
407
|
def model
|
355
408
|
Lazar.find model_id
|
356
409
|
end
|
357
410
|
|
411
|
+
# Get algorithms
|
412
|
+
# @return [Hash]
|
358
413
|
def algorithms
|
359
414
|
model.algorithms
|
360
415
|
end
|
361
416
|
|
417
|
+
# Get prediction feature
|
418
|
+
# @return [OpenTox::Feature]
|
362
419
|
def prediction_feature
|
363
420
|
model.prediction_feature
|
364
421
|
end
|
365
422
|
|
423
|
+
# Get repeated crossvalidations
|
424
|
+
# @return [OpenTox::Validation::RepeatedCrossValidation]
|
366
425
|
def repeated_crossvalidation
|
367
426
|
OpenTox::Validation::RepeatedCrossValidation.find repeated_crossvalidation_id # full class name required
|
368
427
|
end
|
369
428
|
|
429
|
+
# Get crossvalidations
|
430
|
+
# @return [Array<OpenTox::CrossValidation>]
|
370
431
|
def crossvalidations
|
371
432
|
repeated_crossvalidation.crossvalidations
|
372
433
|
end
|
373
434
|
|
435
|
+
# Is it a regression model
|
436
|
+
# @return [TrueClass, FalseClass]
|
374
437
|
def regression?
|
375
438
|
model.is_a? LazarRegression
|
376
439
|
end
|
377
440
|
|
441
|
+
# Is it a classification model
|
442
|
+
# @return [TrueClass, FalseClass]
|
378
443
|
def classification?
|
379
444
|
model.is_a? LazarClassification
|
380
445
|
end
|
381
446
|
|
447
|
+
# Create and validate a lazar model from a csv file with training data and a json file with metadata
|
448
|
+
# @param [File] CSV file with two columns. The first line should contain either SMILES or InChI (first column) and the endpoint (second column). The first column should contain either the SMILES or InChI of the training compounds, the second column the training compounds' toxic activities (qualitative or quantitative). Use -log10 transformed values for regression datasets. Add metadata to a JSON file with the same basename containing the fields "species", "endpoint", "source" and "unit" (regression only). You can find example training data at https://github.com/opentox/lazar-public-data.
|
449
|
+
# @return [OpenTox::Model::Validation] lazar model with three independent 10-fold crossvalidations
|
382
450
|
def self.from_csv_file file
|
383
451
|
metadata_file = file.sub(/csv$/,"json")
|
384
452
|
bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file
|
@@ -391,6 +459,12 @@ module OpenTox
|
|
391
459
|
model_validation
|
392
460
|
end
|
393
461
|
|
462
|
+
# Create and validate a nano-lazar model, import data from eNanoMapper if necessary
|
463
|
+
# nano-lazar methods are described in detail in https://github.com/enanomapper/nano-lazar-paper/blob/master/nano-lazar.pdf
|
464
|
+
# @param [OpenTox::Dataset, nil] training_dataset
|
465
|
+
# @param [OpenTox::Feature, nil] prediction_feature
|
466
|
+
# @param [Hash, nil] algorithms
|
467
|
+
# @return [OpenTox::Model::Validation] lazar model with five independent 10-fold crossvalidations
|
394
468
|
def self.from_enanomapper training_dataset: nil, prediction_feature:nil, algorithms: nil
|
395
469
|
|
396
470
|
# find/import training_dataset
|
data/lib/nanoparticle.rb
CHANGED
@@ -1,25 +1,36 @@
|
|
1
1
|
module OpenTox
|
2
2
|
|
3
|
+
# Nanoparticles
|
3
4
|
class Nanoparticle < Substance
|
4
5
|
include OpenTox
|
5
6
|
|
6
7
|
field :core_id, type: String, default: nil
|
7
8
|
field :coating_ids, type: Array, default: []
|
8
9
|
|
10
|
+
# Get core compound
|
11
|
+
# @return [OpenTox::Compound]
|
9
12
|
def core
|
10
13
|
Compound.find core_id
|
11
14
|
end
|
12
15
|
|
16
|
+
# Get coatings
|
17
|
+
# @return [Array<OpenTox::Compound>]
|
13
18
|
def coating
|
14
19
|
coating_ids.collect{|i| Compound.find i }
|
15
20
|
end
|
16
21
|
|
22
|
+
# Get nanoparticle fingerprint (union of core and coating fingerprints)
|
23
|
+
# @param [String] fingerprint type
|
24
|
+
# @return [Array<String>]
|
17
25
|
def fingerprint type=DEFAULT_FINGERPRINT
|
18
26
|
core_fp = core.fingerprint type
|
19
27
|
coating_fp = coating.collect{|c| c.fingerprint type}.flatten.uniq.compact
|
20
28
|
(core_fp.empty? or coating_fp.empty?) ? [] : (core_fp+coating_fp).uniq.compact
|
21
29
|
end
|
22
30
|
|
31
|
+
# Calculate physchem properties
|
32
|
+
# @param [Array<Hash>] list of descriptors
|
33
|
+
# @return [Array<Float>]
|
23
34
|
def calculate_properties descriptors=PhysChem::OPENBABEL
|
24
35
|
if core.smiles and !coating.collect{|c| c.smiles}.compact.empty?
|
25
36
|
core_prop = core.calculate_properties descriptors
|
@@ -28,6 +39,10 @@ module OpenTox
|
|
28
39
|
end
|
29
40
|
end
|
30
41
|
|
42
|
+
# Add (measured) feature values
|
43
|
+
# @param [OpenTox::Feature]
|
44
|
+
# @param [TrueClass,FalseClass,Float]
|
45
|
+
# @param [OpenTox::Dataset]
|
31
46
|
def add_feature feature, value, dataset
|
32
47
|
unless feature.name == "ATOMIC COMPOSITION" or feature.name == "FUNCTIONAL GROUP" # redundand
|
33
48
|
case feature.category
|
@@ -55,6 +70,10 @@ module OpenTox
|
|
55
70
|
end
|
56
71
|
end
|
57
72
|
|
73
|
+
# Parse values from Ambit database
|
74
|
+
# @param [OpenTox::Feature]
|
75
|
+
# @param [TrueClass,FalseClass,Float]
|
76
|
+
# @param [OpenTox::Dataset]
|
58
77
|
def parse_ambit_value feature, v, dataset
|
59
78
|
# TODO add study id to warnings
|
60
79
|
v.delete "unit"
|
data/lib/overwrite.rb
CHANGED
@@ -2,41 +2,51 @@ require "base64"
|
|
2
2
|
class Object
|
3
3
|
# An object is blank if it's false, empty, or a whitespace string.
|
4
4
|
# For example, "", " ", +nil+, [], and {} are all blank.
|
5
|
+
# @return [TrueClass,FalseClass]
|
5
6
|
def blank?
|
6
7
|
respond_to?(:empty?) ? empty? : !self
|
7
8
|
end
|
8
9
|
|
10
|
+
# Is it a numeric object
|
11
|
+
# @return [TrueClass,FalseClass]
|
9
12
|
def numeric?
|
10
13
|
true if Float(self) rescue false
|
11
14
|
end
|
12
15
|
|
13
16
|
# Returns dimension of nested arrays
|
17
|
+
# @return [Fixnum]
|
14
18
|
def dimension
|
15
19
|
self.class == Array ? 1 + self[0].dimension : 0
|
16
20
|
end
|
17
21
|
end
|
18
22
|
|
19
23
|
class Numeric
|
24
|
+
# Convert number to percent
|
25
|
+
# @return [Float]
|
20
26
|
def percent_of(n)
|
21
27
|
self.to_f / n.to_f * 100.0
|
22
28
|
end
|
23
29
|
end
|
24
30
|
|
25
31
|
class Float
|
26
|
-
#
|
27
|
-
#
|
32
|
+
# Round to n significant digits
|
33
|
+
# http://stackoverflow.com/questions/8382619/how-to-round-a-float-to-a-specified-number-of-significant-digits-in-ruby
|
34
|
+
# @param [Fixnum]
|
35
|
+
# @return [Float]
|
28
36
|
def signif(n)
|
29
37
|
Float("%.#{n}g" % self)
|
30
38
|
end
|
31
39
|
|
32
|
-
#
|
40
|
+
# Convert -log10 transformed values back to the original values
|
41
|
+
# @return [Float]
|
33
42
|
def delog10
|
34
43
|
10**(-1*self)
|
35
44
|
end
|
36
45
|
end
|
37
46
|
|
38
47
|
module Enumerable
|
39
|
-
#
|
48
|
+
# Get duplicates
|
49
|
+
# @return [Array]
|
40
50
|
def duplicates
|
41
51
|
inject({}) {|h,v| h[v]=h[v].to_i+1; h}.reject{|k,v| v==1}.keys
|
42
52
|
end
|
@@ -51,7 +61,10 @@ module Enumerable
|
|
51
61
|
end
|
52
62
|
|
53
63
|
class String
|
54
|
-
#
|
64
|
+
# Convert camel-case to underscore-case
|
65
|
+
# @example
|
66
|
+
# OpenTox::SuperModel -> open_tox/super_model
|
67
|
+
# @return [String]
|
55
68
|
def underscore
|
56
69
|
self.gsub(/::/, '/').
|
57
70
|
gsub(/([A-Z]+)([A-Z][a-z])/,'\1_\2').
|
@@ -60,7 +73,7 @@ class String
|
|
60
73
|
downcase
|
61
74
|
end
|
62
75
|
|
63
|
-
#
|
76
|
+
# Convert strings to boolean values
|
64
77
|
# @return [TrueClass,FalseClass] true or false
|
65
78
|
def to_boolean
|
66
79
|
return true if self == true || self =~ (/(true|t|yes|y|1)$/i)
|
@@ -71,7 +84,8 @@ class String
|
|
71
84
|
end
|
72
85
|
|
73
86
|
class File
|
74
|
-
#
|
87
|
+
# Get mime_type including charset using linux file command
|
88
|
+
# @return [String]
|
75
89
|
def mime_type
|
76
90
|
`file -ib '#{self.path}'`.chomp
|
77
91
|
end
|
@@ -79,7 +93,7 @@ end
|
|
79
93
|
|
80
94
|
class Array
|
81
95
|
|
82
|
-
# Sum
|
96
|
+
# Sum the size of single arrays in an array of arrays
|
83
97
|
# @param [Array] Array of arrays
|
84
98
|
# @return [Integer] Sum of size of array elements
|
85
99
|
def sum_size
|
@@ -92,33 +106,43 @@ class Array
|
|
92
106
|
}
|
93
107
|
end
|
94
108
|
|
95
|
-
#
|
109
|
+
# Check if the array has just one unique value.
|
96
110
|
# @param [Array] Array to test.
|
97
|
-
# @return [
|
111
|
+
# @return [TrueClass,FalseClass]
|
98
112
|
def zero_variance?
|
99
113
|
return self.uniq.size == 1
|
100
114
|
end
|
101
115
|
|
116
|
+
# Get the median of an array
|
117
|
+
# @return [Numeric]
|
102
118
|
def median
|
103
119
|
sorted = self.sort
|
104
120
|
len = sorted.length
|
105
121
|
(sorted[(len - 1) / 2] + sorted[len / 2]) / 2.0
|
106
122
|
end
|
107
123
|
|
124
|
+
# Get the mean of an array
|
125
|
+
# @return [Numeric]
|
108
126
|
def mean
|
109
127
|
self.compact.inject{ |sum, el| sum + el }.to_f / self.compact.size
|
110
128
|
end
|
111
129
|
|
130
|
+
# Get the variance of an array
|
131
|
+
# @return [Numeric]
|
112
132
|
def sample_variance
|
113
133
|
m = self.mean
|
114
134
|
sum = self.compact.inject(0){|accum, i| accum +(i-m)**2 }
|
115
135
|
sum/(self.compact.length - 1).to_f
|
116
136
|
end
|
117
137
|
|
138
|
+
# Get the standard deviation of an array
|
139
|
+
# @return [Numeric]
|
118
140
|
def standard_deviation
|
119
141
|
Math.sqrt(self.sample_variance)
|
120
142
|
end
|
121
143
|
|
144
|
+
# Convert array values for R
|
145
|
+
# @return [Array]
|
122
146
|
def for_R
|
123
147
|
if self.first.is_a?(String)
|
124
148
|
#"\"#{self.collect{|v| v.sub('[','').sub(']','')}.join(" ")}\"" # quote and remove square brackets
|
@@ -128,6 +152,8 @@ class Array
|
|
128
152
|
end
|
129
153
|
end
|
130
154
|
|
155
|
+
# Collect array with index
|
156
|
+
# in analogy to each_with_index
|
131
157
|
def collect_with_index
|
132
158
|
result = []
|
133
159
|
self.each_with_index do |elt, idx|
|
@@ -139,11 +165,15 @@ end
|
|
139
165
|
|
140
166
|
module URI
|
141
167
|
|
168
|
+
# Is it a https connection
|
169
|
+
# @param [String]
|
170
|
+
# @return [TrueClass,FalseClass]
|
142
171
|
def self.ssl? uri
|
143
172
|
URI.parse(uri).instance_of? URI::HTTPS
|
144
173
|
end
|
145
174
|
|
146
|
-
#
|
175
|
+
# Check if a http resource exists by making a HEAD-request
|
176
|
+
# @return [TrueClass,FalseClass]
|
147
177
|
def self.accessible?(uri)
|
148
178
|
parsed_uri = URI.parse(uri + (OpenTox::RestClientWrapper.subjectid ? "?subjectid=#{CGI.escape OpenTox::RestClientWrapper.subjectid}" : ""))
|
149
179
|
http_code = URI.task?(uri) ? 600 : 400
|
@@ -163,6 +193,9 @@ module URI
|
|
163
193
|
false
|
164
194
|
end
|
165
195
|
|
196
|
+
# Is the URI valid
|
197
|
+
# @param [String]
|
198
|
+
# @return [TrueClass,FalseClass]
|
166
199
|
def self.valid? uri
|
167
200
|
u = URI.parse(uri)
|
168
201
|
u.scheme!=nil and u.host!=nil
|
@@ -170,6 +203,8 @@ module URI
|
|
170
203
|
false
|
171
204
|
end
|
172
205
|
|
206
|
+
# Is the URI a task URI
|
207
|
+
# @param [String]
|
173
208
|
def self.task? uri
|
174
209
|
uri =~ /task/ and URI.valid? uri
|
175
210
|
end
|
data/lib/physchem.rb
CHANGED
@@ -39,6 +39,9 @@ module OpenTox
|
|
39
39
|
|
40
40
|
require_relative "unique_descriptors.rb"
|
41
41
|
|
42
|
+
# Get descriptor features
|
43
|
+
# @param [Hash]
|
44
|
+
# @return [Array<OpenTox::PhysChem>]
|
42
45
|
def self.descriptors desc=DESCRIPTORS
|
43
46
|
desc.collect do |name,description|
|
44
47
|
lib,desc = name.split('.',2)
|
@@ -46,6 +49,8 @@ module OpenTox
|
|
46
49
|
end
|
47
50
|
end
|
48
51
|
|
52
|
+
# Get unique descriptor features
|
53
|
+
# @return [Array<OpenTox::PhysChem>]
|
49
54
|
def self.unique_descriptors
|
50
55
|
udesc = []
|
51
56
|
UNIQUEDESCRIPTORS.each do |name|
|
@@ -64,23 +69,28 @@ module OpenTox
|
|
64
69
|
udesc
|
65
70
|
end
|
66
71
|
|
72
|
+
# Get OpenBabel descriptor features
|
73
|
+
# @return [Array<OpenTox::PhysChem>]
|
67
74
|
def self.openbabel_descriptors
|
68
75
|
descriptors OPENBABEL
|
69
76
|
end
|
70
77
|
|
78
|
+
# Get CDK descriptor features
|
79
|
+
# @return [Array<OpenTox::PhysChem>]
|
71
80
|
def self.cdk_descriptors
|
72
81
|
descriptors CDK
|
73
82
|
end
|
74
83
|
|
84
|
+
# Get JOELIB descriptor features
|
85
|
+
# @return [Array<OpenTox::PhysChem>]
|
75
86
|
def self.joelib_descriptors
|
76
87
|
descriptors JOELIB
|
77
88
|
end
|
78
89
|
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
90
|
+
# Calculate OpenBabel descriptors
|
91
|
+
# @param [String] descriptor type
|
92
|
+
# @param [OpenTox::Compound]
|
93
|
+
# @return [Hash]
|
84
94
|
def openbabel descriptor, compound
|
85
95
|
obdescriptor = OpenBabel::OBDescriptor.find_type descriptor
|
86
96
|
obmol = OpenBabel::OBMol.new
|
@@ -90,10 +100,18 @@ module OpenTox
|
|
90
100
|
{"#{library.capitalize}.#{descriptor}" => fix_value(obdescriptor.predict(obmol))}
|
91
101
|
end
|
92
102
|
|
103
|
+
# Calculate CDK descriptors
|
104
|
+
# @param [String] descriptor type
|
105
|
+
# @param [OpenTox::Compound]
|
106
|
+
# @return [Hash]
|
93
107
|
def cdk descriptor, compound
|
94
108
|
java_descriptor "cdk", descriptor, compound
|
95
109
|
end
|
96
110
|
|
111
|
+
# Calculate JOELIB descriptors
|
112
|
+
# @param [String] descriptor type
|
113
|
+
# @param [OpenTox::Compound]
|
114
|
+
# @return [Hash]
|
97
115
|
def joelib descriptor, compound
|
98
116
|
java_descriptor "joelib", descriptor, compound
|
99
117
|
end
|
data/lib/regression.rb
CHANGED
@@ -1,8 +1,13 @@
|
|
1
1
|
module OpenTox
|
2
2
|
module Algorithm
|
3
3
|
|
4
|
+
# Regression algorithms
|
4
5
|
class Regression
|
5
6
|
|
7
|
+
# Weighted average
|
8
|
+
# @param [Array<TrueClass,FalseClass>] dependent_variables
|
9
|
+
# @param [Array<Float>] weights
|
10
|
+
# @return [Hash]
|
6
11
|
def self.weighted_average dependent_variables:, independent_variables:nil, weights:, query_variables:nil
|
7
12
|
# TODO: prediction_interval
|
8
13
|
weighted_sum = 0.0
|
data/lib/rest-client-wrapper.rb
CHANGED
data/lib/similarity.rb
CHANGED
@@ -2,6 +2,10 @@ module OpenTox
|
|
2
2
|
module Algorithm
|
3
3
|
|
4
4
|
class Vector
|
5
|
+
# Get dot product
|
6
|
+
# @param [Vector]
|
7
|
+
# @param [Vector]
|
8
|
+
# @return [Numeric]
|
5
9
|
def self.dot_product(a, b)
|
6
10
|
products = a.zip(b).map{|a, b| a * b}
|
7
11
|
products.inject(0) {|s,p| s + p}
|
@@ -15,6 +19,9 @@ module OpenTox
|
|
15
19
|
|
16
20
|
class Similarity
|
17
21
|
|
22
|
+
# Get Tanimoto similarity
|
23
|
+
# @param [Array<Array<Float>>]
|
24
|
+
# @return [Float]
|
18
25
|
def self.tanimoto fingerprints
|
19
26
|
( fingerprints[0] & fingerprints[1]).size/(fingerprints[0]|fingerprints[1]).size.to_f
|
20
27
|
end
|
@@ -23,18 +30,28 @@ module OpenTox
|
|
23
30
|
#( fingerprints[0] & fingerprints[1]).size/(fingerprints[0]|fingerprints[1]).size.to_f
|
24
31
|
#end
|
25
32
|
|
33
|
+
# Get Euclidean distance
|
34
|
+
# @param [Array<Array<Float>>]
|
35
|
+
# @return [Float]
|
26
36
|
def self.euclid scaled_properties
|
27
37
|
sq = scaled_properties[0].zip(scaled_properties[1]).map{|a,b| (a - b) ** 2}
|
28
38
|
Math.sqrt(sq.inject(0) {|s,c| s + c})
|
29
39
|
end
|
30
40
|
|
31
|
-
#
|
41
|
+
# Get cosine similarity
|
42
|
+
# http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity
|
43
|
+
# @param [Array<Array<Float>>]
|
44
|
+
# @return [Float]
|
32
45
|
def self.cosine scaled_properties
|
33
46
|
scaled_properties = remove_nils scaled_properties
|
34
47
|
Algorithm::Vector.dot_product(scaled_properties[0], scaled_properties[1]) / (Algorithm::Vector.magnitude(scaled_properties[0]) * Algorithm::Vector.magnitude(scaled_properties[1]))
|
35
48
|
end
|
36
49
|
|
37
|
-
|
50
|
+
# Get weighted cosine similarity
|
51
|
+
# http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity
|
52
|
+
# @param [Array<Array<Float>>] [a,b,weights]
|
53
|
+
# @return [Float]
|
54
|
+
def self.weighted_cosine scaled_properties
|
38
55
|
a,b,w = remove_nils scaled_properties
|
39
56
|
return cosine(scaled_properties) if w.uniq.size == 1
|
40
57
|
dot_product = 0
|
@@ -48,6 +65,9 @@ module OpenTox
|
|
48
65
|
dot_product/(Math.sqrt(magnitude_a)*Math.sqrt(magnitude_b))
|
49
66
|
end
|
50
67
|
|
68
|
+
# Remove nil values
|
69
|
+
# @param [Array<Array<Float>>] [a,b,weights]
|
70
|
+
# @return [Array<Array<Float>>] [a,b,weights]
|
51
71
|
def self.remove_nils scaled_properties
|
52
72
|
a =[]; b = []; w = []
|
53
73
|
(0..scaled_properties.first.size-1).each do |i|
|
data/lib/substance.rb
CHANGED
@@ -2,11 +2,17 @@ module OpenTox
|
|
2
2
|
|
3
3
|
module Validation
|
4
4
|
|
5
|
+
# Training test set validation
|
5
6
|
class TrainTest < Validation
|
6
7
|
|
7
8
|
field :training_dataset_id, type: BSON::ObjectId
|
8
9
|
field :test_dataset_id, type: BSON::ObjectId
|
9
10
|
|
11
|
+
# Create a training test set validation
|
12
|
+
# @param [OpenTox::Model::Lazar]
|
13
|
+
# @param [OpenTox::Dataset] training dataset
|
14
|
+
# @param [OpenTox::Dataset] test dataset
|
15
|
+
# @return [OpenTox::Validation::TrainTest]
|
10
16
|
def self.create model, training_set, test_set
|
11
17
|
|
12
18
|
validation_model = model.class.create prediction_feature: model.prediction_feature, training_dataset: training_set, algorithms: model.algorithms
|
@@ -32,16 +38,21 @@ module OpenTox
|
|
32
38
|
validation
|
33
39
|
end
|
34
40
|
|
41
|
+
# Get test dataset
|
42
|
+
# @return [OpenTox::Dataset]
|
35
43
|
def test_dataset
|
36
44
|
Dataset.find test_dataset_id
|
37
45
|
end
|
38
46
|
|
47
|
+
# Get training dataset
|
48
|
+
# @return [OpenTox::Dataset]
|
39
49
|
def training_dataset
|
40
50
|
Dataset.find training_dataset_id
|
41
51
|
end
|
42
52
|
|
43
53
|
end
|
44
54
|
|
55
|
+
# Training test set validation for classification models
|
45
56
|
class ClassificationTrainTest < TrainTest
|
46
57
|
include ClassificationStatistics
|
47
58
|
field :accept_values, type: Array
|
@@ -54,6 +65,7 @@ module OpenTox
|
|
54
65
|
field :probability_plot_id, type: BSON::ObjectId
|
55
66
|
end
|
56
67
|
|
68
|
+
# Training test set validation for regression models
|
57
69
|
class RegressionTrainTest < TrainTest
|
58
70
|
include RegressionStatistics
|
59
71
|
field :rmse, type: Float, default:0
|