lazar 1.0.1 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +98 -2
- data/VERSION +1 -1
- data/lib/caret.rb +5 -6
- data/lib/classification.rb +5 -0
- data/lib/crossvalidation.rb +1 -0
- data/lib/dataset.rb +1 -1
- data/lib/lazar.rb +5 -2
- data/lib/leave-one-out-validation.rb +1 -0
- data/lib/model.rb +30 -18
- data/lib/regression.rb +1 -1
- data/lib/train-test-validation.rb +2 -0
- data/lib/unique_descriptors.rb +2 -1
- data/lib/validation-statistics.rb +24 -46
- data/test/dataset.rb +1 -1
- data/test/feature.rb +5 -5
- data/test/model-classification.rb +5 -3
- data/test/model-regression.rb +14 -14
- data/test/model-validation.rb +1 -1
- data/test/setup.rb +2 -0
- data/test/validation-classification.rb +1 -1
- data/test/validation-regression.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 698fd96821077269f4c31fdfca6bead6beab36f0
|
4
|
+
data.tar.gz: 2fd49abf99e8f5367b83764735c5c2e49caad4d2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ef5d243c765f5d1d6bb4c2dbd53bf2cdcca380df532cef8c48447b907ff01086f1b50c261e1f494827a30e52e7d8e62e2c077334cdbe30370546680ffd018886
|
7
|
+
data.tar.gz: d9bb905832388dcb44bb3211fb6976c4e0894d75eb02fdf60c17fe19f25dec73ba24b5585fea8cb85e77b6793b46518462cf1d9b2488c465332b9db3af132028
|
data/README.md
CHANGED
@@ -59,7 +59,75 @@ Execute the following commands either from an interactive Ruby shell or a Ruby s
|
|
59
59
|
|
60
60
|
#### Experiment with other algorithms
|
61
61
|
|
62
|
-
You can pass
|
62
|
+
You can pass algorithm specifications as parameters to the `Model::Validation.create_from_csv_file` and `Model::Lazar.create` commands. Algorithms for descriptors, similarity calculations, feature selection and local models are specified in the `algorithms` parameter. Unspecified algorithms and parameters are substituted by default values. The example below selects
|
63
|
+
|
64
|
+
- MP2D fingerprint descriptors
|
65
|
+
- Tanimoto similarity with a threshold of 0.1
|
66
|
+
- no feature selection
|
67
|
+
- weighted majority vote predictions
|
68
|
+
|
69
|
+
```
|
70
|
+
algorithms = {
|
71
|
+
:descriptors => { # descriptor algorithm
|
72
|
+
:method => "fingerprint", # fingerprint descriptors
|
73
|
+
:type => "MP2D" # fingerprint type, e.g. FP4, MACCS
|
74
|
+
},
|
75
|
+
:similarity => { # similarity algorithm
|
76
|
+
:method => "Algorithm::Similarity.tanimoto",
|
77
|
+
:min => 0.1 # similarity threshold for neighbors
|
78
|
+
},
|
79
|
+
:feature_selection => nil, # no feature selection
|
80
|
+
:prediction => { # local modelling algorithm
|
81
|
+
:method => "Algorithm::Classification.weighted_majority_vote",
|
82
|
+
},
|
83
|
+
}
|
84
|
+
|
85
|
+
training_dataset = Dataset.from_csv_file "hamster_carcinogenicity.csv"
|
86
|
+
model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
|
87
|
+
```
|
88
|
+
|
89
|
+
The next example creates a regression model with
|
90
|
+
|
91
|
+
- calculated descriptors from OpenBabel libraries
|
92
|
+
- weighted cosine similarity and a threshold of 0.5
|
93
|
+
- descriptors that are correlated with the endpoint
|
94
|
+
- local partial least squares models from the R caret package
|
95
|
+
|
96
|
+
```
|
97
|
+
algorithms = {
|
98
|
+
:descriptors => { # descriptor algorithm
|
99
|
+
:method => "calculate_properties",
|
100
|
+
:features => PhysChem.openbabel_descriptors,
|
101
|
+
},
|
102
|
+
:similarity => { # similarity algorithm
|
103
|
+
:method => "Algorithm::Similarity.weighted_cosine",
|
104
|
+
:min => 0.5
|
105
|
+
},
|
106
|
+
:feature_selection => { # feature selection algorithm
|
107
|
+
:method => "Algorithm::FeatureSelection.correlation_filter",
|
108
|
+
},
|
109
|
+
:prediction => { # local modelling algorithm
|
110
|
+
:method => "Algorithm::Caret.pls",
|
111
|
+
},
|
112
|
+
}
|
113
|
+
training_dataset = Dataset.from_csv_file "EPAFHM_log10.csv"
|
114
|
+
model = Model::Lazar.create(training_dataset:training_dataset, algorithms:algorithms)
|
115
|
+
```
|
116
|
+
|
117
|
+
Please consult the [API documentation](http://rdoc.info/gems/lazar) and [source code](https://github.com/opentox/lazar) for up-to-date information about implemented algorithms:
|
118
|
+
|
119
|
+
- Descriptor algorithms
|
120
|
+
- [Compounds](http://www.rubydoc.info/gems/lazar/OpenTox/Compound)
|
121
|
+
- [Nanoparticles](http://www.rubydoc.info/gems/lazar/OpenTox/Nanoparticle)
|
122
|
+
- [Similarity algorithms](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/Similarity)
|
123
|
+
- [Feature selection algorithms](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/FeatureSelection)
|
124
|
+
- Local models
|
125
|
+
- [Classification](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/Classification)
|
126
|
+
- [Regression](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/Regression)
|
127
|
+
- [R caret](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/Caret)
|
128
|
+
|
129
|
+
|
130
|
+
You can find more working examples in the `lazar` `model-*.rb` and `validation-*.rb` [tests](https://github.com/opentox/lazar/tree/master/test).
|
63
131
|
|
64
132
|
### Create and use `lazar` nanoparticle models
|
65
133
|
|
@@ -87,7 +155,35 @@ Execute the following commands either from an interactive Ruby shell or a Ruby s
|
|
87
155
|
|
88
156
|
#### Experiment with other datasets, endpoints and algorithms
|
89
157
|
|
90
|
-
You can pass training_dataset, prediction_feature and algorithms parameters to the `Model::Validation.create_from_enanomapper` command.
|
158
|
+
You can pass training_dataset, prediction_feature and algorithms parameters to the `Model::Validation.create_from_enanomapper` command. Procedure and options are the same as for compounds. The following commands create and validate a `nano-lazar` model with
|
159
|
+
|
160
|
+
- measured P-CHEM properties as descriptors
|
161
|
+
- descriptors selected with correlation filter
|
162
|
+
- weighted cosine similarity with a threshold of 0.5
|
163
|
+
- Caret random forests
|
164
|
+
|
165
|
+
```
|
166
|
+
algorithms = {
|
167
|
+
:descriptors => {
|
168
|
+
:method => "properties",
|
169
|
+
:categories => ["P-CHEM"],
|
170
|
+
},
|
171
|
+
:similarity => {
|
172
|
+
:method => "Algorithm::Similarity.weighted_cosine",
|
173
|
+
:min => 0.5
|
174
|
+
},
|
175
|
+
:feature_selection => {
|
176
|
+
:method => "Algorithm::FeatureSelection.correlation_filter",
|
177
|
+
},
|
178
|
+
:prediction => {
|
179
|
+
:method => "Algorithm::Caret.rf",
|
180
|
+
},
|
181
|
+
}
|
182
|
+
validation_model = Model::Validation.from_enanomapper algorithms: algorithms
|
183
|
+
```
|
184
|
+
|
185
|
+
|
186
|
+
Detailed documentation and validation results for nanoparticle models can be found in this [publication](https://github.com/enanomapper/nano-lazar-paper/blob/master/nano-lazar.pdf).
|
91
187
|
|
92
188
|
Documentation
|
93
189
|
-------------
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
1.0
|
1
|
+
1.1.0
|
data/lib/caret.rb
CHANGED
@@ -22,12 +22,11 @@ module OpenTox
|
|
22
22
|
end
|
23
23
|
if independent_variables.flatten.uniq == ["NA"] or independent_variables.flatten.uniq == []
|
24
24
|
prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
|
25
|
-
prediction[:
|
25
|
+
prediction[:warnings] << "No variables for regression model. Using weighted average of similar substances."
|
26
26
|
elsif
|
27
27
|
dependent_variables.size < 3
|
28
28
|
prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
|
29
|
-
prediction[:
|
30
|
-
|
29
|
+
prediction[:warnings] << "Insufficient number of neighbors (#{dependent_variables.size}) for regression model. Using weighted average of similar substances."
|
31
30
|
else
|
32
31
|
dependent_variables.each_with_index do |v,i|
|
33
32
|
dependent_variables[i] = to_r(v)
|
@@ -52,7 +51,7 @@ module OpenTox
|
|
52
51
|
$logger.debug dependent_variables
|
53
52
|
$logger.debug independent_variables
|
54
53
|
prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
|
55
|
-
prediction[:
|
54
|
+
prediction[:warnings] << "R caret model creation error. Using weighted average of similar substances."
|
56
55
|
return prediction
|
57
56
|
end
|
58
57
|
begin
|
@@ -73,12 +72,12 @@ module OpenTox
|
|
73
72
|
$logger.debug "R caret prediction error for:"
|
74
73
|
$logger.debug self.inspect
|
75
74
|
prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
|
76
|
-
prediction[:
|
75
|
+
prediction[:warnings] << "R caret prediction error. Using weighted average of similar substances"
|
77
76
|
return prediction
|
78
77
|
end
|
79
78
|
if prediction.nil? or prediction[:value].nil?
|
80
79
|
prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
|
81
|
-
prediction[:
|
80
|
+
prediction[:warnings] << "Empty R caret prediction. Using weighted average of similar substances."
|
82
81
|
end
|
83
82
|
end
|
84
83
|
prediction
|
data/lib/classification.rb
CHANGED
@@ -18,6 +18,11 @@ module OpenTox
|
|
18
18
|
class_weights.each do |a,w|
|
19
19
|
probabilities[a] = w.sum/weights.sum
|
20
20
|
end
|
21
|
+
# DG: hack to ensure always two probability values
|
22
|
+
if probabilities.keys.uniq.size == 1
|
23
|
+
missing_key = probabilities.keys.uniq[0].match(/^non/) ? probabilities.keys.uniq[0].sub(/non-/,"") : "non-"+probabilities.keys.uniq[0]
|
24
|
+
probabilities[missing_key] = 0.0
|
25
|
+
end
|
21
26
|
probabilities = probabilities.collect{|a,p| [a,weights.max*p]}.to_h
|
22
27
|
p_max = probabilities.collect{|a,p| p}.max
|
23
28
|
prediction = probabilities.key(p_max)
|
data/lib/crossvalidation.rb
CHANGED
@@ -90,6 +90,7 @@ module OpenTox
|
|
90
90
|
field :within_prediction_interval, type: Integer, default:0
|
91
91
|
field :out_of_prediction_interval, type: Integer, default:0
|
92
92
|
field :correlation_plot_id, type: BSON::ObjectId
|
93
|
+
field :warnings, type: Array
|
93
94
|
end
|
94
95
|
|
95
96
|
# Independent repeated crossvalidations
|
data/lib/dataset.rb
CHANGED
data/lib/lazar.rb
CHANGED
@@ -16,16 +16,19 @@ raise "Incorrect lazar environment variable LAZAR_ENV '#{ENV["LAZAR_ENV"]}', ple
|
|
16
16
|
|
17
17
|
ENV["MONGOID_ENV"] = ENV["LAZAR_ENV"]
|
18
18
|
ENV["RACK_ENV"] = ENV["LAZAR_ENV"] # should set sinatra environment
|
19
|
+
# search for a central mongo database in use
|
20
|
+
# http://opentox.github.io/installation/2017/03/07/use-central-mongodb-in-docker-environment
|
21
|
+
CENTRAL_MONGO_IP = `grep -oP '^\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}(?=.*mongodb)' /etc/hosts`.chomp
|
19
22
|
Mongoid.load_configuration({
|
20
23
|
:clients => {
|
21
24
|
:default => {
|
22
25
|
:database => ENV["LAZAR_ENV"],
|
23
|
-
:hosts => ["localhost:27017"],
|
26
|
+
:hosts => (CENTRAL_MONGO_IP.blank? ? ["localhost:27017"] : ["#{CENTRAL_MONGO_IP}:27017"]),
|
24
27
|
}
|
25
28
|
}
|
26
29
|
})
|
27
30
|
Mongoid.raise_not_found_error = false # return nil if no document is found
|
28
|
-
$mongo = Mongo::Client.new("mongodb
|
31
|
+
$mongo = Mongo::Client.new("mongodb://#{(CENTRAL_MONGO_IP.blank? ? "127.0.0.1" : CENTRAL_MONGO_IP)}:27017/#{ENV['LAZAR_ENV']}")
|
29
32
|
$gridfs = $mongo.database.fs
|
30
33
|
|
31
34
|
# Logger setup
|
data/lib/model.rb
CHANGED
@@ -57,7 +57,7 @@ module OpenTox
|
|
57
57
|
model.version = {:warning => "git is not installed"}
|
58
58
|
end
|
59
59
|
|
60
|
-
# set defaults
|
60
|
+
# set defaults#
|
61
61
|
substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq
|
62
62
|
bad_request_error "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1
|
63
63
|
|
@@ -68,10 +68,6 @@ module OpenTox
|
|
68
68
|
:method => "fingerprint",
|
69
69
|
:type => "MP2D",
|
70
70
|
},
|
71
|
-
:similarity => {
|
72
|
-
:method => "Algorithm::Similarity.tanimoto",
|
73
|
-
:min => 0.1
|
74
|
-
},
|
75
71
|
:feature_selection => nil
|
76
72
|
}
|
77
73
|
|
@@ -79,9 +75,17 @@ module OpenTox
|
|
79
75
|
model.algorithms[:prediction] = {
|
80
76
|
:method => "Algorithm::Classification.weighted_majority_vote",
|
81
77
|
}
|
78
|
+
model.algorithms[:similarity] = {
|
79
|
+
:method => "Algorithm::Similarity.tanimoto",
|
80
|
+
:min => 0.1,
|
81
|
+
}
|
82
82
|
elsif model.class == LazarRegression
|
83
83
|
model.algorithms[:prediction] = {
|
84
|
-
:method => "Algorithm::Caret.
|
84
|
+
:method => "Algorithm::Caret.rf",
|
85
|
+
}
|
86
|
+
model.algorithms[:similarity] = {
|
87
|
+
:method => "Algorithm::Similarity.tanimoto",
|
88
|
+
:min => 0.5,
|
85
89
|
}
|
86
90
|
end
|
87
91
|
|
@@ -93,7 +97,7 @@ module OpenTox
|
|
93
97
|
},
|
94
98
|
:similarity => {
|
95
99
|
:method => "Algorithm::Similarity.weighted_cosine",
|
96
|
-
:min => 0.5
|
100
|
+
:min => 0.5,
|
97
101
|
},
|
98
102
|
:prediction => {
|
99
103
|
:method => "Algorithm::Caret.rf",
|
@@ -141,7 +145,6 @@ module OpenTox
|
|
141
145
|
end
|
142
146
|
model.descriptor_ids = model.fingerprints.flatten.uniq
|
143
147
|
model.descriptor_ids.each do |d|
|
144
|
-
# resulting model may break BSON size limit (e.g. f Kazius dataset)
|
145
148
|
model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d} if model.algorithms[:prediction][:method].match /Caret/
|
146
149
|
end
|
147
150
|
# calculate physchem properties
|
@@ -191,7 +194,7 @@ module OpenTox
|
|
191
194
|
# Predict a substance (compound or nanoparticle)
|
192
195
|
# @param [OpenTox::Substance]
|
193
196
|
# @return [Hash]
|
194
|
-
def predict_substance substance
|
197
|
+
def predict_substance substance, threshold = self.algorithms[:similarity][:min]
|
195
198
|
|
196
199
|
@independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
|
197
200
|
case algorithms[:similarity][:method]
|
@@ -221,20 +224,19 @@ module OpenTox
|
|
221
224
|
bad_request_error "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'."
|
222
225
|
end
|
223
226
|
|
224
|
-
prediction = {}
|
227
|
+
prediction = {:warnings => [], :measurements => []}
|
228
|
+
prediction[:warnings] << "Similarity threshold #{threshold} < #{algorithms[:similarity][:min]}, prediction may be out of applicability domain." if threshold < algorithms[:similarity][:min]
|
225
229
|
neighbor_ids = []
|
226
230
|
neighbor_similarities = []
|
227
231
|
neighbor_dependent_variables = []
|
228
232
|
neighbor_independent_variables = []
|
229
233
|
|
230
|
-
prediction = {}
|
231
234
|
# find neighbors
|
232
235
|
substance_ids.each_with_index do |s,i|
|
233
236
|
# handle query substance
|
234
237
|
if substance.id.to_s == s
|
235
|
-
prediction[:measurements] ||= []
|
236
238
|
prediction[:measurements] << dependent_variables[i]
|
237
|
-
prediction[:
|
239
|
+
prediction[:info] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance."
|
238
240
|
else
|
239
241
|
if fingerprints?
|
240
242
|
neighbor_descriptors = fingerprints[i]
|
@@ -243,7 +245,7 @@ module OpenTox
|
|
243
245
|
neighbor_descriptors = scaled_variables.collect{|v| v[i]}
|
244
246
|
end
|
245
247
|
sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights]
|
246
|
-
if sim >=
|
248
|
+
if sim >= threshold
|
247
249
|
neighbor_ids << s
|
248
250
|
neighbor_similarities << sim
|
249
251
|
neighbor_dependent_variables << dependent_variables[i]
|
@@ -258,17 +260,27 @@ module OpenTox
|
|
258
260
|
measurements = nil
|
259
261
|
|
260
262
|
if neighbor_similarities.empty?
|
261
|
-
prediction
|
263
|
+
prediction[:value] = nil
|
264
|
+
prediction[:warnings] << "Could not find similar substances with experimental data in the training dataset."
|
262
265
|
elsif neighbor_similarities.size == 1
|
263
|
-
prediction
|
266
|
+
prediction[:value] = nil
|
267
|
+
prediction[:warnings] << "Cannot create prediction: Only one similar compound in the training set."
|
268
|
+
prediction[:neighbors] = [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]
|
264
269
|
else
|
265
270
|
query_descriptors.collect!{|d| d ? 1 : 0} if algorithms[:feature_selection] and algorithms[:descriptors][:method] == "fingerprint"
|
266
271
|
# call prediction algorithm
|
267
272
|
result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors
|
268
273
|
prediction.merge! result
|
269
274
|
prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}}
|
275
|
+
#if neighbor_similarities.max < algorithms[:similarity][:warn_min]
|
276
|
+
#prediction[:warnings] << "Closest neighbor has similarity < #{algorithms[:similarity][:warn_min]}. Prediction may be out of applicability domain."
|
277
|
+
#end
|
278
|
+
end
|
279
|
+
if prediction[:warnings].empty? or threshold < algorithms[:similarity][:min] or threshold <= 0.2
|
280
|
+
prediction
|
281
|
+
else # try again with a lower threshold
|
282
|
+
predict_substance substance, 0.2
|
270
283
|
end
|
271
|
-
prediction
|
272
284
|
end
|
273
285
|
|
274
286
|
# Predict a substance (compound or nanoparticle), an array of substances or a dataset
|
@@ -300,7 +312,7 @@ module OpenTox
|
|
300
312
|
# serialize result
|
301
313
|
if object.is_a? Substance
|
302
314
|
prediction = predictions[substances.first.id.to_s]
|
303
|
-
prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} # sort according to similarity
|
315
|
+
prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} if prediction[:neighbors]# sort according to similarity
|
304
316
|
return prediction
|
305
317
|
elsif object.is_a? Array
|
306
318
|
return predictions
|
data/lib/regression.rb
CHANGED
@@ -17,7 +17,7 @@ module OpenTox
|
|
17
17
|
sim_sum += weights[i]
|
18
18
|
end if dependent_variables
|
19
19
|
sim_sum == 0 ? prediction = nil : prediction = weighted_sum/sim_sum
|
20
|
-
{:value => prediction}
|
20
|
+
{:value => prediction, :warnings => ["Weighted average prediction, no prediction interval available."]}
|
21
21
|
end
|
22
22
|
|
23
23
|
end
|
@@ -27,6 +27,8 @@ module OpenTox
|
|
27
27
|
end
|
28
28
|
end
|
29
29
|
predictions.select!{|cid,p| p[:value] and p[:measurements]}
|
30
|
+
# hack to avoid mongos file size limit error on large datasets
|
31
|
+
#predictions.each{|cid,p| p[:neighbors] = []} if model.training_dataset.name.match(/mutagenicity/i)
|
30
32
|
validation = self.new(
|
31
33
|
:model_id => validation_model.id,
|
32
34
|
:test_dataset_id => test_set.id,
|
data/lib/unique_descriptors.rb
CHANGED
@@ -48,7 +48,8 @@ UNIQUEDESCRIPTORS = [
|
|
48
48
|
#"Cdk.HBondAcceptorCount", #Descriptor that calculates the number of hydrogen bond acceptors.
|
49
49
|
#"Cdk.HBondDonorCount", #Descriptor that calculates the number of hydrogen bond donors.
|
50
50
|
"Cdk.HybridizationRatio", #Characterizes molecular complexity in terms of carbon hybridization states.
|
51
|
-
|
51
|
+
# TODO check why the next descriptor is not present in the CDK_DESCRIPTIONS variable.
|
52
|
+
#"Cdk.IPMolecularLearning", #Descriptor that evaluates the ionization potential.
|
52
53
|
"Cdk.KappaShapeIndices", #Descriptor that calculates Kier and Hall kappa molecular shape indices.
|
53
54
|
"Cdk.KierHallSmarts", #Counts the number of occurrences of the E-state fragments
|
54
55
|
"Cdk.LargestChain", #Returns the number of atoms in the largest chain
|
@@ -111,6 +111,7 @@ module OpenTox
|
|
111
111
|
# Get statistics
|
112
112
|
# @return [Hash]
|
113
113
|
def statistics
|
114
|
+
self.warnings = []
|
114
115
|
self.rmse = 0
|
115
116
|
self.mae = 0
|
116
117
|
self.within_prediction_interval = 0
|
@@ -132,8 +133,10 @@ module OpenTox
|
|
132
133
|
end
|
133
134
|
end
|
134
135
|
else
|
135
|
-
|
136
|
-
|
136
|
+
trd_id = model.training_dataset_id
|
137
|
+
smiles = Compound.find(cid).smiles
|
138
|
+
self.warnings << "No training activities for #{smiles} in training dataset #{trd_id}."
|
139
|
+
$logger.debug "No training activities for #{smiles} in training dataset #{trd_id}."
|
137
140
|
end
|
138
141
|
end
|
139
142
|
R.assign "measurement", x
|
@@ -146,6 +149,7 @@ module OpenTox
|
|
146
149
|
$logger.debug "RMSE #{rmse}"
|
147
150
|
$logger.debug "MAE #{mae}"
|
148
151
|
$logger.debug "#{percent_within_prediction_interval.round(2)}% of measurements within prediction interval"
|
152
|
+
$logger.debug "#{warnings}"
|
149
153
|
save
|
150
154
|
{
|
151
155
|
:mae => mae,
|
@@ -179,8 +183,12 @@ module OpenTox
|
|
179
183
|
R.assign "prediction", y
|
180
184
|
R.eval "all = c(measurement,prediction)"
|
181
185
|
R.eval "range = c(min(all), max(all))"
|
182
|
-
|
183
|
-
|
186
|
+
if feature.name.match /Net cell association/ # ad hoc fix for awkward units
|
187
|
+
title = "log2(Net cell association [mL/ug(Mg)])"
|
188
|
+
else
|
189
|
+
title = feature.name
|
190
|
+
title += " [#{feature.unit}]" if feature.unit and !feature.unit.blank?
|
191
|
+
end
|
184
192
|
R.eval "image = qplot(prediction,measurement,main='#{title}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)"
|
185
193
|
R.eval "image = image + geom_abline(intercept=0, slope=1)"
|
186
194
|
R.eval "ggsave(file='#{tmpfile}', plot=image)"
|
@@ -191,51 +199,21 @@ module OpenTox
|
|
191
199
|
$gridfs.find_one(_id: correlation_plot_id).data
|
192
200
|
end
|
193
201
|
|
194
|
-
# Get predictions with
|
195
|
-
# @params [Fixnum] number of predictions
|
196
|
-
# @params [TrueClass,FalseClass,nil] include neighbors
|
197
|
-
# @params [TrueClass,FalseClass,nil] show common descriptors
|
202
|
+
# Get predictions with measurements outside of the prediction interval
|
198
203
|
# @return [Hash]
|
199
|
-
def worst_predictions
|
200
|
-
worst_predictions = predictions.
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
f=Feature.find(d)
|
210
|
-
{
|
211
|
-
:id => f.id.to_s,
|
212
|
-
:name => "#{f.name} (#{f.conditions})",
|
213
|
-
:p_value => d[:p_value],
|
214
|
-
:r_squared => d[:r_squared],
|
215
|
-
}
|
216
|
-
end
|
217
|
-
else
|
218
|
-
common_descriptors = n["common_descriptors"].size
|
219
|
-
end
|
220
|
-
{
|
221
|
-
:name => Substance.find(n["_id"]).name,
|
222
|
-
:id => n["_id"].to_s,
|
223
|
-
:common_descriptors => common_descriptors
|
224
|
-
}
|
225
|
-
end
|
226
|
-
else
|
227
|
-
neighbors = prediction["neighbors"].size
|
204
|
+
def worst_predictions
|
205
|
+
worst_predictions = predictions.select do |sid,p|
|
206
|
+
p["prediction_interval"] and p["value"] and (p["measurements"].max < p["prediction_interval"][0] or p["measurements"].min > p["prediction_interval"][1])
|
207
|
+
end.compact.to_h
|
208
|
+
worst_predictions.each do |sid,p|
|
209
|
+
p["error"] = (p["value"] - p["measurements"].median).abs
|
210
|
+
if p["measurements"].max < p["prediction_interval"][0]
|
211
|
+
p["distance_prediction_interval"] = (p["measurements"].max - p["prediction_interval"][0]).abs
|
212
|
+
elsif p["measurements"].min > p["prediction_interval"][1]
|
213
|
+
p["distance_prediction_interval"] = (p["measurements"].min - p["prediction_interval"][1]).abs
|
228
214
|
end
|
229
|
-
{
|
230
|
-
:id => substance.id.to_s,
|
231
|
-
:name => substance.name,
|
232
|
-
:feature => Feature.find(prediction["prediction_feature_id"]).name,
|
233
|
-
:error => (prediction["value"] - prediction["measurements"].median).abs,
|
234
|
-
:prediction => prediction["value"],
|
235
|
-
:measurements => prediction["measurements"],
|
236
|
-
:neighbors => neighbors
|
237
|
-
}
|
238
215
|
end
|
216
|
+
worst_predictions.sort_by{|sid,p| p["distance_prediction_interval"] }.to_h
|
239
217
|
end
|
240
218
|
end
|
241
219
|
end
|
data/test/dataset.rb
CHANGED
data/test/feature.rb
CHANGED
@@ -57,20 +57,20 @@ class FeatureTest < MiniTest::Test
|
|
57
57
|
def test_physchem_description
|
58
58
|
assert_equal 346, PhysChem.descriptors.size
|
59
59
|
assert_equal 15, PhysChem.openbabel_descriptors.size
|
60
|
-
assert_equal
|
60
|
+
assert_equal 286, PhysChem.cdk_descriptors.size
|
61
61
|
assert_equal 45, PhysChem.joelib_descriptors.size
|
62
|
-
assert_equal
|
62
|
+
assert_equal 309, PhysChem.unique_descriptors.size
|
63
63
|
end
|
64
64
|
|
65
65
|
def test_physchem
|
66
66
|
assert_equal 346, PhysChem.descriptors.size
|
67
67
|
c = Compound.from_smiles "CC(=O)CC(C)C"
|
68
68
|
logP = PhysChem.find_or_create_by :name => "Openbabel.logP"
|
69
|
-
assert_equal 1.6215,
|
69
|
+
assert_equal 1.6215, c.calculate_properties([logP]).first
|
70
70
|
jlogP = PhysChem.find_or_create_by :name => "Joelib.LogP"
|
71
|
-
assert_equal 3.5951,
|
71
|
+
assert_equal 3.5951, c.calculate_properties([jlogP]).first
|
72
72
|
alogP = PhysChem.find_or_create_by :name => "Cdk.ALOGP.ALogP"
|
73
|
-
assert_equal 0.35380000000000034,
|
73
|
+
assert_equal 0.35380000000000034, c.calculate_properties([alogP]).first
|
74
74
|
end
|
75
75
|
|
76
76
|
end
|
@@ -46,12 +46,14 @@ class LazarClassificationTest < MiniTest::Test
|
|
46
46
|
assert_equal compound_dataset.compounds, prediction_dataset.compounds
|
47
47
|
|
48
48
|
cid = prediction_dataset.compounds[7].id.to_s
|
49
|
-
assert_equal "Could not find similar substances with experimental data in the training dataset.", prediction_dataset.predictions[cid][:
|
49
|
+
assert_equal "Could not find similar substances with experimental data in the training dataset.", prediction_dataset.predictions[cid][:warnings][0]
|
50
|
+
expectations = ["Cannot create prediction: Only one similar compound in the training set.",
|
51
|
+
"Could not find similar substances with experimental data in the training dataset."]
|
50
52
|
prediction_dataset.predictions.each do |cid,pred|
|
51
|
-
|
53
|
+
assert_includes expectations, pred[:warnings][0] if pred[:value].nil?
|
52
54
|
end
|
53
55
|
cid = Compound.from_smiles("CCOC(=O)N").id.to_s
|
54
|
-
assert_match "excluded", prediction_dataset.predictions[cid][:
|
56
|
+
assert_match "excluded", prediction_dataset.predictions[cid][:info]
|
55
57
|
# cleanup
|
56
58
|
[training_dataset,model,compound_dataset,prediction_dataset].each{|o| o.delete}
|
57
59
|
end
|
data/test/model-regression.rb
CHANGED
@@ -10,21 +10,21 @@ class LazarRegressionTest < MiniTest::Test
|
|
10
10
|
},
|
11
11
|
:similarity => {
|
12
12
|
:method => "Algorithm::Similarity.tanimoto",
|
13
|
-
:min => 0.
|
13
|
+
:min => 0.5
|
14
14
|
},
|
15
15
|
:prediction => {
|
16
|
-
:method => "Algorithm::Caret.
|
16
|
+
:method => "Algorithm::Caret.rf",
|
17
17
|
},
|
18
18
|
:feature_selection => nil,
|
19
19
|
}
|
20
|
-
training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"
|
20
|
+
training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM_log10.csv")
|
21
21
|
model = Model::Lazar.create training_dataset: training_dataset
|
22
22
|
assert_kind_of Model::LazarRegression, model
|
23
23
|
assert_equal algorithms, model.algorithms
|
24
|
-
substance = training_dataset.substances[
|
24
|
+
substance = training_dataset.substances[145]
|
25
25
|
prediction = model.predict substance
|
26
26
|
assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
|
27
|
-
substance = Compound.from_smiles "
|
27
|
+
substance = Compound.from_smiles "c1ccc(cc1)Oc1ccccc1"
|
28
28
|
prediction = model.predict substance
|
29
29
|
refute_nil prediction[:value]
|
30
30
|
refute_nil prediction[:prediction_interval]
|
@@ -59,8 +59,8 @@ class LazarRegressionTest < MiniTest::Test
|
|
59
59
|
model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
|
60
60
|
compound = Compound.from_smiles "CCCSCCSCC"
|
61
61
|
prediction = model.predict compound
|
62
|
-
assert_equal
|
63
|
-
|
62
|
+
assert_equal 3, prediction[:neighbors].size
|
63
|
+
assert prediction[:value].round(2) > 1.37, "Prediction value (#{prediction[:value].round(2)}) should be larger than 1.37."
|
64
64
|
end
|
65
65
|
|
66
66
|
def test_local_physchem_regression
|
@@ -112,12 +112,12 @@ class LazarRegressionTest < MiniTest::Test
|
|
112
112
|
:method => "Algorithm::Similarity.cosine",
|
113
113
|
}
|
114
114
|
}
|
115
|
-
training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.
|
115
|
+
training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi_log10.csv")
|
116
116
|
model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
|
117
117
|
assert_kind_of Model::LazarRegression, model
|
118
|
-
assert_equal "Algorithm::Caret.
|
118
|
+
assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method]
|
119
119
|
assert_equal "Algorithm::Similarity.cosine", model.algorithms[:similarity][:method]
|
120
|
-
assert_equal 0.
|
120
|
+
assert_equal 0.5, model.algorithms[:similarity][:min]
|
121
121
|
algorithms[:descriptors].delete :features
|
122
122
|
assert_equal algorithms[:descriptors], model.algorithms[:descriptors]
|
123
123
|
prediction = model.predict training_dataset.substances[10]
|
@@ -130,14 +130,14 @@ class LazarRegressionTest < MiniTest::Test
|
|
130
130
|
:method => "Algorithm::FeatureSelection.correlation_filter",
|
131
131
|
},
|
132
132
|
}
|
133
|
-
training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"
|
133
|
+
training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM_log10.csv")
|
134
134
|
model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
|
135
135
|
assert_kind_of Model::LazarRegression, model
|
136
|
-
assert_equal "Algorithm::Caret.
|
136
|
+
assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method]
|
137
137
|
assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method]
|
138
|
-
assert_equal 0.
|
138
|
+
assert_equal 0.5, model.algorithms[:similarity][:min]
|
139
139
|
assert_equal algorithms[:feature_selection][:method], model.algorithms[:feature_selection][:method]
|
140
|
-
prediction = model.predict training_dataset.substances[
|
140
|
+
prediction = model.predict training_dataset.substances[145]
|
141
141
|
refute_nil prediction[:value]
|
142
142
|
end
|
143
143
|
|
data/test/model-validation.rb
CHANGED
@@ -12,7 +12,7 @@ class ValidationModelTest < MiniTest::Test
|
|
12
12
|
m.crossvalidations.each do |cv|
|
13
13
|
assert cv.accuracy > 0.74, "Crossvalidation accuracy (#{cv.accuracy}) should be larger than 0.75. This may happen due to an unfavorable training/test set split."
|
14
14
|
end
|
15
|
-
prediction = m.predict Compound.from_smiles("
|
15
|
+
prediction = m.predict Compound.from_smiles("OCC(CN(CC(O)C)N=O)O")
|
16
16
|
assert_equal "true", prediction[:value]
|
17
17
|
m.delete
|
18
18
|
end
|
data/test/setup.rb
CHANGED
@@ -3,6 +3,8 @@ require 'minitest/autorun'
|
|
3
3
|
require_relative '../lib/lazar.rb'
|
4
4
|
#require 'lazar'
|
5
5
|
include OpenTox
|
6
|
+
#$mongo.database.drop
|
7
|
+
#$gridfs = $mongo.database.fs # recreate GridFS indexes
|
6
8
|
TEST_DIR ||= File.expand_path(File.dirname(__FILE__))
|
7
9
|
DATA_DIR ||= File.join(TEST_DIR,"data")
|
8
10
|
training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
|
@@ -47,7 +47,7 @@ class ValidationClassificationTest < MiniTest::Test
|
|
47
47
|
dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
|
48
48
|
model = Model::Lazar.create training_dataset: dataset
|
49
49
|
loo = ClassificationLeaveOneOut.create model
|
50
|
-
assert_equal
|
50
|
+
assert_equal 24, loo.nr_unpredicted
|
51
51
|
refute_empty loo.confusion_matrix
|
52
52
|
assert loo.accuracy > 0.77
|
53
53
|
assert loo.weighted_accuracy > loo.accuracy, "Weighted accuracy (#{loo.weighted_accuracy}) should be larger than accuracy (#{loo.accuracy})."
|
@@ -84,7 +84,7 @@ class ValidationRegressionTest < MiniTest::Test
|
|
84
84
|
repeated_cv = RepeatedCrossValidation.create model
|
85
85
|
repeated_cv.crossvalidations.each do |cv|
|
86
86
|
assert cv.r_squared > 0.34, "R^2 (#{cv.r_squared}) should be larger than 0.034"
|
87
|
-
|
87
|
+
assert cv.rmse < 0.5, "RMSE (#{cv.rmse}) should be smaller than 0.5"
|
88
88
|
end
|
89
89
|
end
|
90
90
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lazar
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Christoph Helma, Martin Guetlein, Andreas Maunz, Micha Rautenberg, David Vorgrimmler,
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2017-
|
12
|
+
date: 2017-05-26 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|