lazar 0.0.7 → 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/README.md +2 -1
- data/VERSION +1 -1
- data/ext/lazar/extconf.rb +15 -76
- data/ext/lazar/rinstall.R +9 -0
- data/lazar.gemspec +7 -7
- data/lib/classification.rb +5 -78
- data/lib/compound.rb +201 -44
- data/lib/crossvalidation.rb +224 -121
- data/lib/dataset.rb +83 -93
- data/lib/error.rb +1 -1
- data/lib/experiment.rb +99 -0
- data/lib/feature.rb +2 -54
- data/lib/lazar.rb +47 -34
- data/lib/leave-one-out-validation.rb +205 -0
- data/lib/model.rb +131 -76
- data/lib/opentox.rb +2 -2
- data/lib/overwrite.rb +37 -0
- data/lib/physchem.rb +133 -0
- data/lib/regression.rb +117 -189
- data/lib/rest-client-wrapper.rb +4 -5
- data/lib/unique_descriptors.rb +6 -7
- data/lib/validation.rb +63 -69
- data/test/all.rb +2 -2
- data/test/classification.rb +41 -0
- data/test/compound.rb +116 -7
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +567 -567
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +566 -566
- data/test/data/LOAEL_mmol_corrected_smiles.csv +568 -0
- data/test/data/batch_prediction.csv +25 -0
- data/test/data/batch_prediction_inchi_small.csv +4 -0
- data/test/data/batch_prediction_smiles_small.csv +4 -0
- data/test/data/hamster_carcinogenicity.json +3 -0
- data/test/data/loael.csv +568 -0
- data/test/dataset-long.rb +5 -8
- data/test/dataset.rb +31 -11
- data/test/default_environment.rb +11 -0
- data/test/descriptor.rb +26 -41
- data/test/error.rb +1 -3
- data/test/experiment.rb +301 -0
- data/test/feature.rb +22 -10
- data/test/lazar-long.rb +43 -23
- data/test/lazar-physchem-short.rb +19 -16
- data/test/prediction_models.rb +20 -0
- data/test/regression.rb +43 -0
- data/test/setup.rb +3 -1
- data/test/test_environment.rb +10 -0
- data/test/validation.rb +92 -26
- metadata +64 -38
- data/lib/SMARTS_InteLigand.txt +0 -983
- data/lib/bbrc.rb +0 -165
- data/lib/descriptor.rb +0 -247
- data/lib/neighbor.rb +0 -25
- data/lib/similarity.rb +0 -58
- data/mongoid.yml +0 -8
- data/test/descriptor-long.rb +0 -26
- data/test/fminer-long.rb +0 -38
- data/test/fminer.rb +0 -52
- data/test/lazar-fminer.rb +0 -50
- data/test/lazar-regression.rb +0 -27
data/lib/crossvalidation.rb
CHANGED
@@ -6,12 +6,58 @@ module OpenTox
|
|
6
6
|
field :folds, type: Integer
|
7
7
|
field :nr_instances, type: Integer
|
8
8
|
field :nr_unpredicted, type: Integer
|
9
|
-
field :predictions, type: Array
|
9
|
+
field :predictions, type: Array, default: []
|
10
10
|
field :finished_at, type: Time
|
11
11
|
|
12
12
|
# Duration of the crossvalidation.
#
# @return [Float, nil] difference between finished_at and created_at, or nil
#   while one of the two timestamps has not been set yet (previously this
#   raised NoMethodError on nil for an unfinished crossvalidation)
def time
  return nil unless finished_at && created_at
  finished_at - created_at
end
|
15
|
+
|
16
|
+
# Load the validations that belong to this crossvalidation.
#
# @return [Array] the result of Validation.find for every entry of
#   validation_ids, in the same order
def validations
  validation_ids.map { |validation_id| Validation.find(validation_id) }
end
|
19
|
+
|
20
|
+
# Fetch the model this crossvalidation refers to.
#
# @return the result of Model::Lazar.find for the stored model_id
def model
  Model::Lazar.find(model_id)
end
|
23
|
+
|
24
|
+
# Run an n-fold crossvalidation for a model and store the aggregated results.
#
# ClassificationCrossValidation is used when the first feature of the model's
# training dataset is nominal, RegressionCrossValidation otherwise. One
# Validation is created per fold; their instance counts and predictions are
# summed up, written to the crossvalidation and #statistics is called.
#
# @param model the model to validate; name, id, training_dataset and
#   training_dataset_id are read from it
# @param n [Integer] number of folds
# @return the crossvalidation object after #statistics has been called
def self.create model, n=10
  prediction_feature = model.training_dataset.features.first
  klass = prediction_feature.nominal? ? ClassificationCrossValidation : RegressionCrossValidation
  # The message used to interpolate an undefined local `dataset`.
  # With the selection above klass is always set, so this guard cannot fire at present.
  bad_request_error "#{prediction_feature} is neither nominal nor numeric." unless klass
  cv = klass.new(
    name: model.name,
    model_id: model.id,
    folds: n
  )
  cv.save # set created_at
  training_dataset = Dataset.find model.training_dataset_id
  # Folds are validated sequentially (a forked, parallel variant was disabled in the original).
  training_dataset.folds(n).each_with_index do |fold,fold_nr|
    $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started"
    t = Time.now
    Validation.create(model, fold[0], fold[1],cv)
    $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds"
  end
  cv.validation_ids = Validation.where(:crossvalidation_id => cv.id).distinct(:_id)
  # Load the validations once and aggregate over them.
  validations = cv.validations
  nr_instances = validations.collect{|v| v.nr_instances}.reduce(0, :+)
  nr_unpredicted = validations.collect{|v| v.nr_unpredicted}.reduce(0, :+)
  cv.update_attributes(
    nr_instances: nr_instances,
    nr_unpredicted: nr_unpredicted,
    predictions: validations.flat_map{|v| v.predictions} # stored unsorted
  )
  $logger.debug "Nr unpredicted: #{nr_unpredicted}"
  cv.statistics
  cv
end
|
15
61
|
end
|
16
62
|
|
17
63
|
class ClassificationCrossValidation < CrossValidation
|
@@ -23,39 +69,41 @@ module OpenTox
|
|
23
69
|
field :weighted_accuracy, type: Float
|
24
70
|
field :true_rate, type: Hash
|
25
71
|
field :predictivity, type: Hash
|
72
|
+
field :confidence_plot_id, type: BSON::ObjectId
|
26
73
|
# TODO auc, f-measure (usability??)
|
27
74
|
|
28
|
-
def
|
29
|
-
cv = self.new
|
30
|
-
cv.save # set created_at
|
31
|
-
validation_ids = []
|
32
|
-
nr_instances = 0
|
33
|
-
nr_unpredicted = 0
|
34
|
-
predictions = []
|
35
|
-
validation_class = Object.const_get(self.to_s.sub(/Cross/,''))
|
75
|
+
def statistics
|
36
76
|
accept_values = Feature.find(model.prediction_feature_id).accept_values
|
37
77
|
confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
|
38
78
|
weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
|
39
79
|
true_rate = {}
|
40
80
|
predictivity = {}
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
81
|
+
predictions.each do |pred|
|
82
|
+
compound_id,activities,prediction,confidence = pred
|
83
|
+
if activities and prediction #and confidence.numeric?
|
84
|
+
if activities.uniq.size == 1
|
85
|
+
activity = activities.uniq.first
|
86
|
+
if prediction == activity
|
87
|
+
if prediction == accept_values[0]
|
88
|
+
confusion_matrix[0][0] += 1
|
89
|
+
#weighted_confusion_matrix[0][0] += confidence
|
90
|
+
elsif prediction == accept_values[1]
|
91
|
+
confusion_matrix[1][1] += 1
|
92
|
+
#weighted_confusion_matrix[1][1] += confidence
|
93
|
+
end
|
94
|
+
elsif prediction != activity
|
95
|
+
if prediction == accept_values[0]
|
96
|
+
confusion_matrix[0][1] += 1
|
97
|
+
#weighted_confusion_matrix[0][1] += confidence
|
98
|
+
elsif prediction == accept_values[1]
|
99
|
+
confusion_matrix[1][0] += 1
|
100
|
+
#weighted_confusion_matrix[1][0] += confidence
|
101
|
+
end
|
102
|
+
end
|
55
103
|
end
|
104
|
+
else
|
105
|
+
nr_unpredicted += 1 if prediction.nil?
|
56
106
|
end
|
57
|
-
$logger.debug "Fold #{fold_nr}: #{Time.now-t} seconds"
|
58
|
-
fold_nr +=1
|
59
107
|
end
|
60
108
|
true_rate = {}
|
61
109
|
predictivity = {}
|
@@ -64,30 +112,48 @@ module OpenTox
|
|
64
112
|
predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
|
65
113
|
end
|
66
114
|
confidence_sum = 0
|
67
|
-
weighted_confusion_matrix.each do |r|
|
68
|
-
r.each do |c|
|
69
|
-
confidence_sum += c
|
70
|
-
end
|
71
|
-
end
|
72
|
-
|
73
|
-
name: model.name,
|
74
|
-
model_id: model.id,
|
75
|
-
folds: n,
|
76
|
-
validation_ids: validation_ids,
|
77
|
-
nr_instances: nr_instances,
|
78
|
-
nr_unpredicted: nr_unpredicted,
|
115
|
+
#weighted_confusion_matrix.each do |r|
|
116
|
+
#r.each do |c|
|
117
|
+
#confidence_sum += c
|
118
|
+
#end
|
119
|
+
#end
|
120
|
+
update_attributes(
|
79
121
|
accept_values: accept_values,
|
80
122
|
confusion_matrix: confusion_matrix,
|
81
|
-
weighted_confusion_matrix: weighted_confusion_matrix,
|
123
|
+
#weighted_confusion_matrix: weighted_confusion_matrix,
|
82
124
|
accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
|
83
|
-
weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
|
125
|
+
#weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
|
84
126
|
true_rate: true_rate,
|
85
127
|
predictivity: predictivity,
|
86
|
-
predictions: predictions.sort{|a,b| b[3] <=> a[3]}, # sort according to confidence
|
87
128
|
finished_at: Time.now
|
88
129
|
)
|
89
|
-
|
90
|
-
|
130
|
+
$logger.debug "Accuracy #{accuracy}"
|
131
|
+
end
|
132
|
+
|
133
|
+
# Return the data of the confidence plot, rendering and storing it on first use.
#
# While no confidence_plot_id is stored, the accumulated accuracy of the
# predictions is plotted against their confidence through R, the resulting
# PNG is inserted into $gridfs and its id is written to confidence_plot_id.
#
# @return the data of the $gridfs file referenced by confidence_plot_id
def confidence_plot
  unless confidence_plot_id
    png_path = "/tmp/#{id.to_s}_confidence.png"
    hits = 0
    misses = 0
    accumulated_accuracy = []
    confidence_values = []
    # NOTE(review): accuracy accumulates in the stored order of predictions;
    # the reversed confidence axis suggests a descending-confidence order was
    # intended - confirm that predictions are sorted before this is called.
    predictions.each do |entry|
      next unless entry[1] && entry[2]
      if entry[1] == entry[2]
        hits += 1
      else
        misses += 1
      end
      accumulated_accuracy << hits/(hits+misses).to_f
      confidence_values << entry[3]
    end
    R.assign "accuracy", accumulated_accuracy
    R.assign "confidence", confidence_values
    R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()"
    R.eval "ggsave(file='#{png_path}', plot=image)"
    png = Mongo::Grid::File.new(File.read(png_path), :filename => "#{self.id.to_s}_confidence_plot.png")
    update(:confidence_plot_id => $gridfs.insert_one(png))
  end
  $gridfs.find_one(_id: confidence_plot_id).data
end
|
92
158
|
|
93
159
|
#Average area under roc 0.646
|
@@ -99,98 +165,135 @@ module OpenTox
|
|
99
165
|
|
100
166
|
field :rmse, type: Float
|
101
167
|
field :mae, type: Float
|
102
|
-
field :
|
103
|
-
field :
|
168
|
+
field :r_squared, type: Float
|
169
|
+
field :correlation_plot_id, type: BSON::ObjectId
|
104
170
|
|
105
|
-
def
|
106
|
-
cv = self.new
|
107
|
-
cv.save # set created_at
|
108
|
-
validation_ids = []
|
109
|
-
nr_instances = 0
|
110
|
-
nr_unpredicted = 0
|
111
|
-
predictions = []
|
112
|
-
validation_class = Object.const_get(self.to_s.sub(/Cross/,''))
|
113
|
-
fold_nr = 1
|
114
|
-
training_dataset = Dataset.find model.training_dataset_id
|
115
|
-
training_dataset.folds(n).each do |fold|
|
116
|
-
t = Time.now
|
117
|
-
$logger.debug "Predicting fold #{fold_nr}"
|
118
|
-
|
119
|
-
validation = validation_class.create(model, fold[0], fold[1])
|
120
|
-
validation_ids << validation.id
|
121
|
-
nr_instances += validation.nr_instances
|
122
|
-
nr_unpredicted += validation.nr_unpredicted
|
123
|
-
predictions += validation.predictions
|
124
|
-
$logger.debug "Fold #{fold_nr}: #{Time.now-t} seconds"
|
125
|
-
fold_nr +=1
|
126
|
-
end
|
171
|
+
# Compute MAE, RMSE and r^2 over the stored predictions and persist them.
#
# The error of an entry is log10(prediction) - log10(median of its measured
# activities). Entries that lack an activity or a prediction add a message to
# warnings and to the debug log. mae, rmse, r_squared and finished_at are
# written with update_attributes. The weighted variants are disabled.
def statistics
  squared_error_sum = 0
  absolute_error_sum = 0
  measured_values = []
  predicted_values = []
  predictions.each do |compound_id, activity, prediction, _confidence|
    unless activity && prediction
      warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
      $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
      next
    end
    next if activity == [nil]
    log_measured = Math.log10(activity.median)
    log_predicted = Math.log10(prediction)
    measured_values << -log_measured
    predicted_values << -log_predicted
    error = log_predicted - log_measured
    squared_error_sum += error**2
    absolute_error_sum += error.abs
  end
  # NOTE(review): the values handed to R are already -log10 transformed and
  # the R expression applies -log once more - confirm this is intended.
  R.assign "measurement", measured_values
  R.assign "prediction", predicted_values
  R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
  r_squared = R.eval("r").to_ruby**2

  # NOTE(review): both sums are divided by predictions.size, which also
  # counts the entries skipped above - confirm.
  mae = absolute_error_sum/predictions.size
  rmse = Math.sqrt(squared_error_sum/predictions.size)
  update_attributes(
    mae: mae,
    rmse: rmse,
    r_squared: r_squared,
    finished_at: Time.now
  )
  $logger.debug "R^2 #{r_squared}"
  $logger.debug "RMSE #{rmse}"
  $logger.debug "MAE #{mae}"
end
|
215
|
+
|
216
|
+
# List the predictions with the largest relative error together with their
# neighbors in the training dataset.
#
# Predictions that contain a nil element are ignored. The remaining ones are
# ranked by relative error first; compounds and neighbors are then looked up
# for the n selected predictions only (previously the neighbor search ran
# for every prediction before the list was cut down to n entries).
#
# @param n [Integer, nil] number of entries to return; nil selects 10
# @return [Array<Hash>] hashes with the keys :smiles, :measured, :predicted,
#   :log_error, :relative_error, :confidence and :neighbors, ordered by
#   descending :relative_error
def misclassifications n=nil
  n ||= 10
  model = Model::Lazar.find(self.model_id)
  training_dataset = Dataset.find(model.training_dataset_id)
  prediction_feature = training_dataset.features.first
  ranked = predictions.reject{|p| p.include? nil}.collect do |p|
    [p, (p[1]-p[2]).abs/p[1]]
  end
  worst = ranked.sort{|a,b| b[1] <=> a[1]}[0..n-1]
  worst.collect do |p,relative_error|
    compound = Compound.find(p[0])
    # non-destructive collect: the array returned by the neighbor method is left untouched
    neighbors = compound.send(model.neighbor_algorithm,model.neighbor_algorithm_parameters).collect do |nb|
      neighbor = Compound.find(nb[0])
      values = training_dataset.values(neighbor,prediction_feature)
      { :smiles => neighbor.smiles, :similarity => nb[1], :measurements => values}
    end
    {
      :smiles => compound.smiles,
      :measured => p[1],
      :predicted => p[2],
      :log_error => (Math.log10(p[1])-Math.log10(p[2])).abs,
      :relative_error => relative_error,
      :confidence => p[3],
      :neighbors => neighbors
    }
  end
end
|
245
|
+
|
246
|
+
# Render a plot of the absolute log10 prediction error against confidence,
# store it in $gridfs and return its data.
#
# Unlike correlation_plot, the image is rendered again on every call.
# Predictions without a measured or predicted value are left out.
#
# @return the data of the $gridfs file referenced by confidence_plot_id
def confidence_plot
  png_path = "/tmp/#{id.to_s}_confidence.png"
  error_confidence_pairs = []
  predictions.each do |p|
    next unless p[1] && p[2]
    error_confidence_pairs << [(Math.log10(p[1])-Math.log10(p[2])).abs, p[3]]
  end
  R.assign "error", error_confidence_pairs.collect(&:first)
  R.assign "confidence", error_confidence_pairs.collect(&:last)
  # TODO fix axis names
  R.eval "image = qplot(confidence,error)"
  R.eval "image = image + stat_smooth(method='lm', se=FALSE)"
  R.eval "ggsave(file='#{png_path}', plot=image)"
  png = Mongo::Grid::File.new(File.read(png_path), :filename => "#{self.id.to_s}_confidence_plot.png")
  update(:confidence_plot_id => $gridfs.insert_one(png))
  $gridfs.find_one(_id: confidence_plot_id).data
end
|
260
|
+
|
261
|
+
# Return the data of the measurement/prediction correlation plot, rendering
# and storing it on first use.
#
# While no correlation_plot_id is stored, the second and third element of
# every prediction are handed to R as measurement and prediction, plotted on
# -log axes with a diagonal reference line, and the PNG is inserted into
# $gridfs; its id is written to correlation_plot_id.
#
# The former lookup of the model's attributes was removed: its result was
# never used, it cost a database query and could raise for no reason.
#
# @return the data of the $gridfs file referenced by correlation_plot_id
def correlation_plot
  unless correlation_plot_id
    tmpfile = "/tmp/#{id.to_s}_correlation.png"
    x = predictions.collect{|p| p[1]}
    y = predictions.collect{|p| p[2]}
    R.assign "measurement", x
    R.assign "prediction", y
    R.eval "all = c(-log(measurement),-log(prediction))"
    R.eval "range = c(min(all), max(all))"
    R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)"
    R.eval "image = image + geom_abline(intercept=0, slope=1)"
    R.eval "ggsave(file='#{tmpfile}', plot=image)"
    file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.png")
    plot_id = $gridfs.insert_one(file)
    update(:correlation_plot_id => plot_id)
  end
  $gridfs.find_one(_id: correlation_plot_id).data
end
|
282
|
+
end
|
172
283
|
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
R.eval "svg(filename='/tmp/#{id.to_s}.svg')"
|
187
|
-
R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', sub=paste('RMSE: ',rmse, 'MAE :',mae, 'r^2: ',r^2),asp=1)"
|
188
|
-
#R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', sub=paste('RMSE: ',rmse, 'MAE :',mae, 'r^2: '),asp=1)"
|
189
|
-
#R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', ,asp=1)"
|
190
|
-
R.eval "abline(0,1,col='blue')"
|
191
|
-
#R.eval "abline(fitline,col='red')"
|
192
|
-
R.eval "dev.off()"
|
193
|
-
"/tmp/#{id.to_s}.svg"
|
284
|
+
# Runs the crossvalidation of a model several times and keeps the ids of the
# individual crossvalidations.
class RepeatedCrossValidation
  field :crossvalidation_ids, type: Array, default: []

  # Create `repeats` crossvalidations for a model.
  #
  # @param model model handed on to CrossValidation.create
  # @param folds [Integer] number of folds of every crossvalidation
  # @param repeats [Integer] number of crossvalidations to run
  # @return [RepeatedCrossValidation] the new object, after save was called on it
  def self.create model, folds=10, repeats=3
    repetition = new
    1.upto(repeats) do |run|
      $logger.debug "Crossvalidation #{run} for #{model.name}"
      crossvalidation = CrossValidation.create(model, folds)
      repetition.crossvalidation_ids << crossvalidation.id
    end
    repetition.save
    repetition
  end

  # @return [Array] the result of CrossValidation.find for every stored id
  def crossvalidations
    crossvalidation_ids.map { |crossvalidation_id| CrossValidation.find(crossvalidation_id) }
  end
end
|
196
299
|
|