lazar 1.0.1 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +98 -2
- data/VERSION +1 -1
- data/lib/caret.rb +5 -6
- data/lib/classification.rb +5 -0
- data/lib/crossvalidation.rb +1 -0
- data/lib/dataset.rb +1 -1
- data/lib/lazar.rb +5 -2
- data/lib/leave-one-out-validation.rb +1 -0
- data/lib/model.rb +30 -18
- data/lib/regression.rb +1 -1
- data/lib/train-test-validation.rb +2 -0
- data/lib/unique_descriptors.rb +2 -1
- data/lib/validation-statistics.rb +24 -46
- data/test/dataset.rb +1 -1
- data/test/feature.rb +5 -5
- data/test/model-classification.rb +5 -3
- data/test/model-regression.rb +14 -14
- data/test/model-validation.rb +1 -1
- data/test/setup.rb +2 -0
- data/test/validation-classification.rb +1 -1
- data/test/validation-regression.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 698fd96821077269f4c31fdfca6bead6beab36f0
|
4
|
+
data.tar.gz: 2fd49abf99e8f5367b83764735c5c2e49caad4d2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ef5d243c765f5d1d6bb4c2dbd53bf2cdcca380df532cef8c48447b907ff01086f1b50c261e1f494827a30e52e7d8e62e2c077334cdbe30370546680ffd018886
|
7
|
+
data.tar.gz: d9bb905832388dcb44bb3211fb6976c4e0894d75eb02fdf60c17fe19f25dec73ba24b5585fea8cb85e77b6793b46518462cf1d9b2488c465332b9db3af132028
|
data/README.md
CHANGED
@@ -59,7 +59,75 @@ Execute the following commands either from an interactive Ruby shell or a Ruby s
|
|
59
59
|
|
60
60
|
#### Experiment with other algorithms
|
61
61
|
|
62
|
-
You can pass
|
62
|
+
You can pass algorithm specifications as parameters to the `Model::Validation.create_from_csv_file` and `Model::Lazar.create` commands. Algorithms for descriptors, similarity calculations, feature selection and local models are specified in the `algorithms` parameter. Unspecified algorithms and parameters are substituted by default values. The example below selects
|
63
|
+
|
64
|
+
- MP2D fingerprint descriptors
|
65
|
+
- Tanimoto similarity with a threshold of 0.1
|
66
|
+
- no feature selection
|
67
|
+
- weighted majority vote predictions
|
68
|
+
|
69
|
+
```
|
70
|
+
algorithms = {
|
71
|
+
:descriptors => { # descriptor algorithm
|
72
|
+
:method => "fingerprint", # fingerprint descriptors
|
73
|
+
:type => "MP2D" # fingerprint type, e.g. FP4, MACCS
|
74
|
+
},
|
75
|
+
:similarity => { # similarity algorithm
|
76
|
+
:method => "Algorithm::Similarity.tanimoto",
|
77
|
+
:min => 0.1 # similarity threshold for neighbors
|
78
|
+
},
|
79
|
+
:feature_selection => nil, # no feature selection
|
80
|
+
:prediction => { # local modelling algorithm
|
81
|
+
:method => "Algorithm::Classification.weighted_majority_vote",
|
82
|
+
},
|
83
|
+
}
|
84
|
+
|
85
|
+
training_dataset = Dataset.from_csv_file "hamster_carcinogenicity.csv"
|
86
|
+
model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
|
87
|
+
```
|
88
|
+
|
89
|
+
The next example creates a regression model with
|
90
|
+
|
91
|
+
- calculated descriptors from OpenBabel libraries
|
92
|
+
- weighted cosine similarity and a threshold of 0.5
|
93
|
+
- descriptors that are correlated with the endpoint
|
94
|
+
- local partial least squares models from the R caret package
|
95
|
+
|
96
|
+
```
|
97
|
+
algorithms = {
|
98
|
+
:descriptors => { # descriptor algorithm
|
99
|
+
:method => "calculate_properties",
|
100
|
+
:features => PhysChem.openbabel_descriptors,
|
101
|
+
},
|
102
|
+
:similarity => { # similarity algorithm
|
103
|
+
:method => "Algorithm::Similarity.weighted_cosine",
|
104
|
+
:min => 0.5
|
105
|
+
},
|
106
|
+
:feature_selection => { # feature selection algorithm
|
107
|
+
:method => "Algorithm::FeatureSelection.correlation_filter",
|
108
|
+
},
|
109
|
+
:prediction => { # local modelling algorithm
|
110
|
+
:method => "Algorithm::Caret.pls",
|
111
|
+
},
|
112
|
+
}
|
113
|
+
training_dataset = Dataset.from_csv_file "EPAFHM_log10.csv"
|
114
|
+
model = Model::Lazar.create(training_dataset:training_dataset, algorithms:algorithms)
|
115
|
+
```
|
116
|
+
|
117
|
+
Please consult the [API documentation](http://rdoc.info/gems/lazar) and [source code](https://github.com/opentox/lazar) for up-to-date information about implemented algorithms:
|
118
|
+
|
119
|
+
- Descriptor algorithms
|
120
|
+
- [Compounds](http://www.rubydoc.info/gems/lazar/OpenTox/Compound)
|
121
|
+
- [Nanoparticles](http://www.rubydoc.info/gems/lazar/OpenTox/Nanoparticle)
|
122
|
+
- [Similarity algorithms](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/Similarity)
|
123
|
+
- [Feature selection algorithms](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/FeatureSelection)
|
124
|
+
- Local models
|
125
|
+
- [Classification](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/Classification)
|
126
|
+
- [Regression](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/Regression)
|
127
|
+
- [R caret](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/Caret)
|
128
|
+
|
129
|
+
|
130
|
+
You can find more working examples in the `lazar` `model-*.rb` and `validation-*.rb` [tests](https://github.com/opentox/lazar/tree/master/test).
|
63
131
|
|
64
132
|
### Create and use `lazar` nanoparticle models
|
65
133
|
|
@@ -87,7 +155,35 @@ Execute the following commands either from an interactive Ruby shell or a Ruby s
|
|
87
155
|
|
88
156
|
#### Experiment with other datasets, endpoints and algorithms
|
89
157
|
|
90
|
-
You can pass training_dataset, prediction_feature and algorithms parameters to the `Model::Validation.create_from_enanomapper` command.
|
158
|
+
You can pass training_dataset, prediction_feature and algorithms parameters to the `Model::Validation.create_from_enanomapper` command. Procedure and options are the same as for compounds. The following commands create and validate a `nano-lazar` model with
|
159
|
+
|
160
|
+
- measured P-CHEM properties as descriptors
|
161
|
+
- descriptors selected with correlation filter
|
162
|
+
- weighted cosine similarity with a threshold of 0.5
|
163
|
+
- Caret random forests
|
164
|
+
|
165
|
+
```
|
166
|
+
algorithms = {
|
167
|
+
:descriptors => {
|
168
|
+
:method => "properties",
|
169
|
+
:categories => ["P-CHEM"],
|
170
|
+
},
|
171
|
+
:similarity => {
|
172
|
+
:method => "Algorithm::Similarity.weighted_cosine",
|
173
|
+
:min => 0.5
|
174
|
+
},
|
175
|
+
:feature_selection => {
|
176
|
+
:method => "Algorithm::FeatureSelection.correlation_filter",
|
177
|
+
},
|
178
|
+
:prediction => {
|
179
|
+
:method => "Algorithm::Caret.rf",
|
180
|
+
},
|
181
|
+
}
|
182
|
+
validation_model = Model::Validation.from_enanomapper algorithms: algorithms
|
183
|
+
```
|
184
|
+
|
185
|
+
|
186
|
+
Detailed documentation and validation results for nanoparticle models can be found in this [publication](https://github.com/enanomapper/nano-lazar-paper/blob/master/nano-lazar.pdf).
|
91
187
|
|
92
188
|
Documentation
|
93
189
|
-------------
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
1.0
|
1
|
+
1.1.0
|
data/lib/caret.rb
CHANGED
@@ -22,12 +22,11 @@ module OpenTox
|
|
22
22
|
end
|
23
23
|
if independent_variables.flatten.uniq == ["NA"] or independent_variables.flatten.uniq == []
|
24
24
|
prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
|
25
|
-
prediction[:
|
25
|
+
prediction[:warnings] << "No variables for regression model. Using weighted average of similar substances."
|
26
26
|
elsif
|
27
27
|
dependent_variables.size < 3
|
28
28
|
prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
|
29
|
-
prediction[:
|
30
|
-
|
29
|
+
prediction[:warnings] << "Insufficient number of neighbors (#{dependent_variables.size}) for regression model. Using weighted average of similar substances."
|
31
30
|
else
|
32
31
|
dependent_variables.each_with_index do |v,i|
|
33
32
|
dependent_variables[i] = to_r(v)
|
@@ -52,7 +51,7 @@ module OpenTox
|
|
52
51
|
$logger.debug dependent_variables
|
53
52
|
$logger.debug independent_variables
|
54
53
|
prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
|
55
|
-
prediction[:
|
54
|
+
prediction[:warnings] << "R caret model creation error. Using weighted average of similar substances."
|
56
55
|
return prediction
|
57
56
|
end
|
58
57
|
begin
|
@@ -73,12 +72,12 @@ module OpenTox
|
|
73
72
|
$logger.debug "R caret prediction error for:"
|
74
73
|
$logger.debug self.inspect
|
75
74
|
prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
|
76
|
-
prediction[:
|
75
|
+
prediction[:warnings] << "R caret prediction error. Using weighted average of similar substances"
|
77
76
|
return prediction
|
78
77
|
end
|
79
78
|
if prediction.nil? or prediction[:value].nil?
|
80
79
|
prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
|
81
|
-
prediction[:
|
80
|
+
prediction[:warnings] << "Empty R caret prediction. Using weighted average of similar substances."
|
82
81
|
end
|
83
82
|
end
|
84
83
|
prediction
|
data/lib/classification.rb
CHANGED
@@ -18,6 +18,11 @@ module OpenTox
|
|
18
18
|
class_weights.each do |a,w|
|
19
19
|
probabilities[a] = w.sum/weights.sum
|
20
20
|
end
|
21
|
+
# DG: hack to ensure always two probability values
|
22
|
+
if probabilities.keys.uniq.size == 1
|
23
|
+
missing_key = probabilities.keys.uniq[0].match(/^non/) ? probabilities.keys.uniq[0].sub(/non-/,"") : "non-"+probabilities.keys.uniq[0]
|
24
|
+
probabilities[missing_key] = 0.0
|
25
|
+
end
|
21
26
|
probabilities = probabilities.collect{|a,p| [a,weights.max*p]}.to_h
|
22
27
|
p_max = probabilities.collect{|a,p| p}.max
|
23
28
|
prediction = probabilities.key(p_max)
|
data/lib/crossvalidation.rb
CHANGED
@@ -90,6 +90,7 @@ module OpenTox
|
|
90
90
|
field :within_prediction_interval, type: Integer, default:0
|
91
91
|
field :out_of_prediction_interval, type: Integer, default:0
|
92
92
|
field :correlation_plot_id, type: BSON::ObjectId
|
93
|
+
field :warnings, type: Array
|
93
94
|
end
|
94
95
|
|
95
96
|
# Independent repeated crossvalidations
|
data/lib/dataset.rb
CHANGED
data/lib/lazar.rb
CHANGED
@@ -16,16 +16,19 @@ raise "Incorrect lazar environment variable LAZAR_ENV '#{ENV["LAZAR_ENV"]}', ple
|
|
16
16
|
|
17
17
|
ENV["MONGOID_ENV"] = ENV["LAZAR_ENV"]
|
18
18
|
ENV["RACK_ENV"] = ENV["LAZAR_ENV"] # should set sinatra environment
|
19
|
+
# search for a central mongo database in use
|
20
|
+
# http://opentox.github.io/installation/2017/03/07/use-central-mongodb-in-docker-environment
|
21
|
+
CENTRAL_MONGO_IP = `grep -oP '^\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}(?=.*mongodb)' /etc/hosts`.chomp
|
19
22
|
Mongoid.load_configuration({
|
20
23
|
:clients => {
|
21
24
|
:default => {
|
22
25
|
:database => ENV["LAZAR_ENV"],
|
23
|
-
:hosts => ["localhost:27017"],
|
26
|
+
:hosts => (CENTRAL_MONGO_IP.blank? ? ["localhost:27017"] : ["#{CENTRAL_MONGO_IP}:27017"]),
|
24
27
|
}
|
25
28
|
}
|
26
29
|
})
|
27
30
|
Mongoid.raise_not_found_error = false # return nil if no document is found
|
28
|
-
$mongo = Mongo::Client.new("mongodb
|
31
|
+
$mongo = Mongo::Client.new("mongodb://#{(CENTRAL_MONGO_IP.blank? ? "127.0.0.1" : CENTRAL_MONGO_IP)}:27017/#{ENV['LAZAR_ENV']}")
|
29
32
|
$gridfs = $mongo.database.fs
|
30
33
|
|
31
34
|
# Logger setup
|
data/lib/model.rb
CHANGED
@@ -57,7 +57,7 @@ module OpenTox
|
|
57
57
|
model.version = {:warning => "git is not installed"}
|
58
58
|
end
|
59
59
|
|
60
|
-
# set defaults
|
60
|
+
# set defaults#
|
61
61
|
substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq
|
62
62
|
bad_request_error "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1
|
63
63
|
|
@@ -68,10 +68,6 @@ module OpenTox
|
|
68
68
|
:method => "fingerprint",
|
69
69
|
:type => "MP2D",
|
70
70
|
},
|
71
|
-
:similarity => {
|
72
|
-
:method => "Algorithm::Similarity.tanimoto",
|
73
|
-
:min => 0.1
|
74
|
-
},
|
75
71
|
:feature_selection => nil
|
76
72
|
}
|
77
73
|
|
@@ -79,9 +75,17 @@ module OpenTox
|
|
79
75
|
model.algorithms[:prediction] = {
|
80
76
|
:method => "Algorithm::Classification.weighted_majority_vote",
|
81
77
|
}
|
78
|
+
model.algorithms[:similarity] = {
|
79
|
+
:method => "Algorithm::Similarity.tanimoto",
|
80
|
+
:min => 0.1,
|
81
|
+
}
|
82
82
|
elsif model.class == LazarRegression
|
83
83
|
model.algorithms[:prediction] = {
|
84
|
-
:method => "Algorithm::Caret.
|
84
|
+
:method => "Algorithm::Caret.rf",
|
85
|
+
}
|
86
|
+
model.algorithms[:similarity] = {
|
87
|
+
:method => "Algorithm::Similarity.tanimoto",
|
88
|
+
:min => 0.5,
|
85
89
|
}
|
86
90
|
end
|
87
91
|
|
@@ -93,7 +97,7 @@ module OpenTox
|
|
93
97
|
},
|
94
98
|
:similarity => {
|
95
99
|
:method => "Algorithm::Similarity.weighted_cosine",
|
96
|
-
:min => 0.5
|
100
|
+
:min => 0.5,
|
97
101
|
},
|
98
102
|
:prediction => {
|
99
103
|
:method => "Algorithm::Caret.rf",
|
@@ -141,7 +145,6 @@ module OpenTox
|
|
141
145
|
end
|
142
146
|
model.descriptor_ids = model.fingerprints.flatten.uniq
|
143
147
|
model.descriptor_ids.each do |d|
|
144
|
-
# resulting model may break BSON size limit (e.g. f Kazius dataset)
|
145
148
|
model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d} if model.algorithms[:prediction][:method].match /Caret/
|
146
149
|
end
|
147
150
|
# calculate physchem properties
|
@@ -191,7 +194,7 @@ module OpenTox
|
|
191
194
|
# Predict a substance (compound or nanoparticle)
|
192
195
|
# @param [OpenTox::Substance]
|
193
196
|
# @return [Hash]
|
194
|
-
def predict_substance substance
|
197
|
+
def predict_substance substance, threshold = self.algorithms[:similarity][:min]
|
195
198
|
|
196
199
|
@independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
|
197
200
|
case algorithms[:similarity][:method]
|
@@ -221,20 +224,19 @@ module OpenTox
|
|
221
224
|
bad_request_error "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'."
|
222
225
|
end
|
223
226
|
|
224
|
-
prediction = {}
|
227
|
+
prediction = {:warnings => [], :measurements => []}
|
228
|
+
prediction[:warnings] << "Similarity threshold #{threshold} < #{algorithms[:similarity][:min]}, prediction may be out of applicability domain." if threshold < algorithms[:similarity][:min]
|
225
229
|
neighbor_ids = []
|
226
230
|
neighbor_similarities = []
|
227
231
|
neighbor_dependent_variables = []
|
228
232
|
neighbor_independent_variables = []
|
229
233
|
|
230
|
-
prediction = {}
|
231
234
|
# find neighbors
|
232
235
|
substance_ids.each_with_index do |s,i|
|
233
236
|
# handle query substance
|
234
237
|
if substance.id.to_s == s
|
235
|
-
prediction[:measurements] ||= []
|
236
238
|
prediction[:measurements] << dependent_variables[i]
|
237
|
-
prediction[:
|
239
|
+
prediction[:info] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance."
|
238
240
|
else
|
239
241
|
if fingerprints?
|
240
242
|
neighbor_descriptors = fingerprints[i]
|
@@ -243,7 +245,7 @@ module OpenTox
|
|
243
245
|
neighbor_descriptors = scaled_variables.collect{|v| v[i]}
|
244
246
|
end
|
245
247
|
sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights]
|
246
|
-
if sim >=
|
248
|
+
if sim >= threshold
|
247
249
|
neighbor_ids << s
|
248
250
|
neighbor_similarities << sim
|
249
251
|
neighbor_dependent_variables << dependent_variables[i]
|
@@ -258,17 +260,27 @@ module OpenTox
|
|
258
260
|
measurements = nil
|
259
261
|
|
260
262
|
if neighbor_similarities.empty?
|
261
|
-
prediction
|
263
|
+
prediction[:value] = nil
|
264
|
+
prediction[:warnings] << "Could not find similar substances with experimental data in the training dataset."
|
262
265
|
elsif neighbor_similarities.size == 1
|
263
|
-
prediction
|
266
|
+
prediction[:value] = nil
|
267
|
+
prediction[:warnings] << "Cannot create prediction: Only one similar compound in the training set."
|
268
|
+
prediction[:neighbors] = [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]
|
264
269
|
else
|
265
270
|
query_descriptors.collect!{|d| d ? 1 : 0} if algorithms[:feature_selection] and algorithms[:descriptors][:method] == "fingerprint"
|
266
271
|
# call prediction algorithm
|
267
272
|
result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors
|
268
273
|
prediction.merge! result
|
269
274
|
prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}}
|
275
|
+
#if neighbor_similarities.max < algorithms[:similarity][:warn_min]
|
276
|
+
#prediction[:warnings] << "Closest neighbor has similarity < #{algorithms[:similarity][:warn_min]}. Prediction may be out of applicability domain."
|
277
|
+
#end
|
278
|
+
end
|
279
|
+
if prediction[:warnings].empty? or threshold < algorithms[:similarity][:min] or threshold <= 0.2
|
280
|
+
prediction
|
281
|
+
else # try again with a lower threshold
|
282
|
+
predict_substance substance, 0.2
|
270
283
|
end
|
271
|
-
prediction
|
272
284
|
end
|
273
285
|
|
274
286
|
# Predict a substance (compound or nanoparticle), an array of substances or a dataset
|
@@ -300,7 +312,7 @@ module OpenTox
|
|
300
312
|
# serialize result
|
301
313
|
if object.is_a? Substance
|
302
314
|
prediction = predictions[substances.first.id.to_s]
|
303
|
-
prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} # sort according to similarity
|
315
|
+
prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} if prediction[:neighbors]# sort according to similarity
|
304
316
|
return prediction
|
305
317
|
elsif object.is_a? Array
|
306
318
|
return predictions
|
data/lib/regression.rb
CHANGED
@@ -17,7 +17,7 @@ module OpenTox
|
|
17
17
|
sim_sum += weights[i]
|
18
18
|
end if dependent_variables
|
19
19
|
sim_sum == 0 ? prediction = nil : prediction = weighted_sum/sim_sum
|
20
|
-
{:value => prediction}
|
20
|
+
{:value => prediction, :warnings => ["Weighted average prediction, no prediction interval available."]}
|
21
21
|
end
|
22
22
|
|
23
23
|
end
|
@@ -27,6 +27,8 @@ module OpenTox
|
|
27
27
|
end
|
28
28
|
end
|
29
29
|
predictions.select!{|cid,p| p[:value] and p[:measurements]}
|
30
|
+
# hack to avoid mongos file size limit error on large datasets
|
31
|
+
#predictions.each{|cid,p| p[:neighbors] = []} if model.training_dataset.name.match(/mutagenicity/i)
|
30
32
|
validation = self.new(
|
31
33
|
:model_id => validation_model.id,
|
32
34
|
:test_dataset_id => test_set.id,
|
data/lib/unique_descriptors.rb
CHANGED
@@ -48,7 +48,8 @@ UNIQUEDESCRIPTORS = [
|
|
48
48
|
#"Cdk.HBondAcceptorCount", #Descriptor that calculates the number of hydrogen bond acceptors.
|
49
49
|
#"Cdk.HBondDonorCount", #Descriptor that calculates the number of hydrogen bond donors.
|
50
50
|
"Cdk.HybridizationRatio", #Characterizes molecular complexity in terms of carbon hybridization states.
|
51
|
-
|
51
|
+
# TODO check why the next descriptor is not present in the CDK_DESCRIPTIONS variable.
|
52
|
+
#"Cdk.IPMolecularLearning", #Descriptor that evaluates the ionization potential.
|
52
53
|
"Cdk.KappaShapeIndices", #Descriptor that calculates Kier and Hall kappa molecular shape indices.
|
53
54
|
"Cdk.KierHallSmarts", #Counts the number of occurrences of the E-state fragments
|
54
55
|
"Cdk.LargestChain", #Returns the number of atoms in the largest chain
|
@@ -111,6 +111,7 @@ module OpenTox
|
|
111
111
|
# Get statistics
|
112
112
|
# @return [Hash]
|
113
113
|
def statistics
|
114
|
+
self.warnings = []
|
114
115
|
self.rmse = 0
|
115
116
|
self.mae = 0
|
116
117
|
self.within_prediction_interval = 0
|
@@ -132,8 +133,10 @@ module OpenTox
|
|
132
133
|
end
|
133
134
|
end
|
134
135
|
else
|
135
|
-
|
136
|
-
|
136
|
+
trd_id = model.training_dataset_id
|
137
|
+
smiles = Compound.find(cid).smiles
|
138
|
+
self.warnings << "No training activities for #{smiles} in training dataset #{trd_id}."
|
139
|
+
$logger.debug "No training activities for #{smiles} in training dataset #{trd_id}."
|
137
140
|
end
|
138
141
|
end
|
139
142
|
R.assign "measurement", x
|
@@ -146,6 +149,7 @@ module OpenTox
|
|
146
149
|
$logger.debug "RMSE #{rmse}"
|
147
150
|
$logger.debug "MAE #{mae}"
|
148
151
|
$logger.debug "#{percent_within_prediction_interval.round(2)}% of measurements within prediction interval"
|
152
|
+
$logger.debug "#{warnings}"
|
149
153
|
save
|
150
154
|
{
|
151
155
|
:mae => mae,
|
@@ -179,8 +183,12 @@ module OpenTox
|
|
179
183
|
R.assign "prediction", y
|
180
184
|
R.eval "all = c(measurement,prediction)"
|
181
185
|
R.eval "range = c(min(all), max(all))"
|
182
|
-
|
183
|
-
|
186
|
+
if feature.name.match /Net cell association/ # ad hoc fix for awkward units
|
187
|
+
title = "log2(Net cell association [mL/ug(Mg)])"
|
188
|
+
else
|
189
|
+
title = feature.name
|
190
|
+
title += " [#{feature.unit}]" if feature.unit and !feature.unit.blank?
|
191
|
+
end
|
184
192
|
R.eval "image = qplot(prediction,measurement,main='#{title}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)"
|
185
193
|
R.eval "image = image + geom_abline(intercept=0, slope=1)"
|
186
194
|
R.eval "ggsave(file='#{tmpfile}', plot=image)"
|
@@ -191,51 +199,21 @@ module OpenTox
|
|
191
199
|
$gridfs.find_one(_id: correlation_plot_id).data
|
192
200
|
end
|
193
201
|
|
194
|
-
# Get predictions with
|
195
|
-
# @params [Fixnum] number of predictions
|
196
|
-
# @params [TrueClass,FalseClass,nil] include neighbors
|
197
|
-
# @params [TrueClass,FalseClass,nil] show common descriptors
|
202
|
+
# Get predictions with measurements outside of the prediction interval
|
198
203
|
# @return [Hash]
|
199
|
-
def worst_predictions
|
200
|
-
worst_predictions = predictions.
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
f=Feature.find(d)
|
210
|
-
{
|
211
|
-
:id => f.id.to_s,
|
212
|
-
:name => "#{f.name} (#{f.conditions})",
|
213
|
-
:p_value => d[:p_value],
|
214
|
-
:r_squared => d[:r_squared],
|
215
|
-
}
|
216
|
-
end
|
217
|
-
else
|
218
|
-
common_descriptors = n["common_descriptors"].size
|
219
|
-
end
|
220
|
-
{
|
221
|
-
:name => Substance.find(n["_id"]).name,
|
222
|
-
:id => n["_id"].to_s,
|
223
|
-
:common_descriptors => common_descriptors
|
224
|
-
}
|
225
|
-
end
|
226
|
-
else
|
227
|
-
neighbors = prediction["neighbors"].size
|
204
|
+
def worst_predictions
|
205
|
+
worst_predictions = predictions.select do |sid,p|
|
206
|
+
p["prediction_interval"] and p["value"] and (p["measurements"].max < p["prediction_interval"][0] or p["measurements"].min > p["prediction_interval"][1])
|
207
|
+
end.compact.to_h
|
208
|
+
worst_predictions.each do |sid,p|
|
209
|
+
p["error"] = (p["value"] - p["measurements"].median).abs
|
210
|
+
if p["measurements"].max < p["prediction_interval"][0]
|
211
|
+
p["distance_prediction_interval"] = (p["measurements"].max - p["prediction_interval"][0]).abs
|
212
|
+
elsif p["measurements"].min > p["prediction_interval"][1]
|
213
|
+
p["distance_prediction_interval"] = (p["measurements"].min - p["prediction_interval"][1]).abs
|
228
214
|
end
|
229
|
-
{
|
230
|
-
:id => substance.id.to_s,
|
231
|
-
:name => substance.name,
|
232
|
-
:feature => Feature.find(prediction["prediction_feature_id"]).name,
|
233
|
-
:error => (prediction["value"] - prediction["measurements"].median).abs,
|
234
|
-
:prediction => prediction["value"],
|
235
|
-
:measurements => prediction["measurements"],
|
236
|
-
:neighbors => neighbors
|
237
|
-
}
|
238
215
|
end
|
216
|
+
worst_predictions.sort_by{|sid,p| p["distance_prediction_interval"] }.to_h
|
239
217
|
end
|
240
218
|
end
|
241
219
|
end
|
data/test/dataset.rb
CHANGED
data/test/feature.rb
CHANGED
@@ -57,20 +57,20 @@ class FeatureTest < MiniTest::Test
|
|
57
57
|
def test_physchem_description
|
58
58
|
assert_equal 346, PhysChem.descriptors.size
|
59
59
|
assert_equal 15, PhysChem.openbabel_descriptors.size
|
60
|
-
assert_equal
|
60
|
+
assert_equal 286, PhysChem.cdk_descriptors.size
|
61
61
|
assert_equal 45, PhysChem.joelib_descriptors.size
|
62
|
-
assert_equal
|
62
|
+
assert_equal 309, PhysChem.unique_descriptors.size
|
63
63
|
end
|
64
64
|
|
65
65
|
def test_physchem
|
66
66
|
assert_equal 346, PhysChem.descriptors.size
|
67
67
|
c = Compound.from_smiles "CC(=O)CC(C)C"
|
68
68
|
logP = PhysChem.find_or_create_by :name => "Openbabel.logP"
|
69
|
-
assert_equal 1.6215,
|
69
|
+
assert_equal 1.6215, c.calculate_properties([logP]).first
|
70
70
|
jlogP = PhysChem.find_or_create_by :name => "Joelib.LogP"
|
71
|
-
assert_equal 3.5951,
|
71
|
+
assert_equal 3.5951, c.calculate_properties([jlogP]).first
|
72
72
|
alogP = PhysChem.find_or_create_by :name => "Cdk.ALOGP.ALogP"
|
73
|
-
assert_equal 0.35380000000000034,
|
73
|
+
assert_equal 0.35380000000000034, c.calculate_properties([alogP]).first
|
74
74
|
end
|
75
75
|
|
76
76
|
end
|
@@ -46,12 +46,14 @@ class LazarClassificationTest < MiniTest::Test
|
|
46
46
|
assert_equal compound_dataset.compounds, prediction_dataset.compounds
|
47
47
|
|
48
48
|
cid = prediction_dataset.compounds[7].id.to_s
|
49
|
-
assert_equal "Could not find similar substances with experimental data in the training dataset.", prediction_dataset.predictions[cid][:
|
49
|
+
assert_equal "Could not find similar substances with experimental data in the training dataset.", prediction_dataset.predictions[cid][:warnings][0]
|
50
|
+
expectations = ["Cannot create prediction: Only one similar compound in the training set.",
|
51
|
+
"Could not find similar substances with experimental data in the training dataset."]
|
50
52
|
prediction_dataset.predictions.each do |cid,pred|
|
51
|
-
|
53
|
+
assert_includes expectations, pred[:warnings][0] if pred[:value].nil?
|
52
54
|
end
|
53
55
|
cid = Compound.from_smiles("CCOC(=O)N").id.to_s
|
54
|
-
assert_match "excluded", prediction_dataset.predictions[cid][:
|
56
|
+
assert_match "excluded", prediction_dataset.predictions[cid][:info]
|
55
57
|
# cleanup
|
56
58
|
[training_dataset,model,compound_dataset,prediction_dataset].each{|o| o.delete}
|
57
59
|
end
|
data/test/model-regression.rb
CHANGED
@@ -10,21 +10,21 @@ class LazarRegressionTest < MiniTest::Test
|
|
10
10
|
},
|
11
11
|
:similarity => {
|
12
12
|
:method => "Algorithm::Similarity.tanimoto",
|
13
|
-
:min => 0.
|
13
|
+
:min => 0.5
|
14
14
|
},
|
15
15
|
:prediction => {
|
16
|
-
:method => "Algorithm::Caret.
|
16
|
+
:method => "Algorithm::Caret.rf",
|
17
17
|
},
|
18
18
|
:feature_selection => nil,
|
19
19
|
}
|
20
|
-
training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"
|
20
|
+
training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM_log10.csv")
|
21
21
|
model = Model::Lazar.create training_dataset: training_dataset
|
22
22
|
assert_kind_of Model::LazarRegression, model
|
23
23
|
assert_equal algorithms, model.algorithms
|
24
|
-
substance = training_dataset.substances[
|
24
|
+
substance = training_dataset.substances[145]
|
25
25
|
prediction = model.predict substance
|
26
26
|
assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
|
27
|
-
substance = Compound.from_smiles "
|
27
|
+
substance = Compound.from_smiles "c1ccc(cc1)Oc1ccccc1"
|
28
28
|
prediction = model.predict substance
|
29
29
|
refute_nil prediction[:value]
|
30
30
|
refute_nil prediction[:prediction_interval]
|
@@ -59,8 +59,8 @@ class LazarRegressionTest < MiniTest::Test
|
|
59
59
|
model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
|
60
60
|
compound = Compound.from_smiles "CCCSCCSCC"
|
61
61
|
prediction = model.predict compound
|
62
|
-
assert_equal
|
63
|
-
|
62
|
+
assert_equal 3, prediction[:neighbors].size
|
63
|
+
assert prediction[:value].round(2) > 1.37, "Prediction value (#{prediction[:value].round(2)}) should be larger than 1.37."
|
64
64
|
end
|
65
65
|
|
66
66
|
def test_local_physchem_regression
|
@@ -112,12 +112,12 @@ class LazarRegressionTest < MiniTest::Test
|
|
112
112
|
:method => "Algorithm::Similarity.cosine",
|
113
113
|
}
|
114
114
|
}
|
115
|
-
training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.
|
115
|
+
training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi_log10.csv")
|
116
116
|
model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
|
117
117
|
assert_kind_of Model::LazarRegression, model
|
118
|
-
assert_equal "Algorithm::Caret.
|
118
|
+
assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method]
|
119
119
|
assert_equal "Algorithm::Similarity.cosine", model.algorithms[:similarity][:method]
|
120
|
-
assert_equal 0.
|
120
|
+
assert_equal 0.5, model.algorithms[:similarity][:min]
|
121
121
|
algorithms[:descriptors].delete :features
|
122
122
|
assert_equal algorithms[:descriptors], model.algorithms[:descriptors]
|
123
123
|
prediction = model.predict training_dataset.substances[10]
|
@@ -130,14 +130,14 @@ class LazarRegressionTest < MiniTest::Test
|
|
130
130
|
:method => "Algorithm::FeatureSelection.correlation_filter",
|
131
131
|
},
|
132
132
|
}
|
133
|
-
training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"
|
133
|
+
training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM_log10.csv")
|
134
134
|
model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
|
135
135
|
assert_kind_of Model::LazarRegression, model
|
136
|
-
assert_equal "Algorithm::Caret.
|
136
|
+
assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method]
|
137
137
|
assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method]
|
138
|
-
assert_equal 0.
|
138
|
+
assert_equal 0.5, model.algorithms[:similarity][:min]
|
139
139
|
assert_equal algorithms[:feature_selection][:method], model.algorithms[:feature_selection][:method]
|
140
|
-
prediction = model.predict training_dataset.substances[
|
140
|
+
prediction = model.predict training_dataset.substances[145]
|
141
141
|
refute_nil prediction[:value]
|
142
142
|
end
|
143
143
|
|
data/test/model-validation.rb
CHANGED
@@ -12,7 +12,7 @@ class ValidationModelTest < MiniTest::Test
|
|
12
12
|
m.crossvalidations.each do |cv|
|
13
13
|
assert cv.accuracy > 0.74, "Crossvalidation accuracy (#{cv.accuracy}) should be larger than 0.75. This may happen due to an unfavorable training/test set split."
|
14
14
|
end
|
15
|
-
prediction = m.predict Compound.from_smiles("
|
15
|
+
prediction = m.predict Compound.from_smiles("OCC(CN(CC(O)C)N=O)O")
|
16
16
|
assert_equal "true", prediction[:value]
|
17
17
|
m.delete
|
18
18
|
end
|
data/test/setup.rb
CHANGED
@@ -3,6 +3,8 @@ require 'minitest/autorun'
|
|
3
3
|
require_relative '../lib/lazar.rb'
|
4
4
|
#require 'lazar'
|
5
5
|
include OpenTox
|
6
|
+
#$mongo.database.drop
|
7
|
+
#$gridfs = $mongo.database.fs # recreate GridFS indexes
|
6
8
|
TEST_DIR ||= File.expand_path(File.dirname(__FILE__))
|
7
9
|
DATA_DIR ||= File.join(TEST_DIR,"data")
|
8
10
|
training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
|
@@ -47,7 +47,7 @@ class ValidationClassificationTest < MiniTest::Test
|
|
47
47
|
dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
|
48
48
|
model = Model::Lazar.create training_dataset: dataset
|
49
49
|
loo = ClassificationLeaveOneOut.create model
|
50
|
-
assert_equal
|
50
|
+
assert_equal 24, loo.nr_unpredicted
|
51
51
|
refute_empty loo.confusion_matrix
|
52
52
|
assert loo.accuracy > 0.77
|
53
53
|
assert loo.weighted_accuracy > loo.accuracy, "Weighted accuracy (#{loo.weighted_accuracy}) should be larger than accuracy (#{loo.accuracy})."
|
@@ -84,7 +84,7 @@ class ValidationRegressionTest < MiniTest::Test
|
|
84
84
|
repeated_cv = RepeatedCrossValidation.create model
|
85
85
|
repeated_cv.crossvalidations.each do |cv|
|
86
86
|
assert cv.r_squared > 0.34, "R^2 (#{cv.r_squared}) should be larger than 0.034"
|
87
|
-
|
87
|
+
assert cv.rmse < 0.5, "RMSE (#{cv.rmse}) should be smaller than 0.5"
|
88
88
|
end
|
89
89
|
end
|
90
90
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lazar
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Christoph Helma, Martin Guetlein, Andreas Maunz, Micha Rautenberg, David Vorgrimmler,
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2017-
|
12
|
+
date: 2017-05-26 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|