lazar 0.0.7 → 0.0.9

Sign up to get free protection for your applications and to get access to all the features.
Files changed (61) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +3 -0
  3. data/README.md +2 -1
  4. data/VERSION +1 -1
  5. data/ext/lazar/extconf.rb +15 -76
  6. data/ext/lazar/rinstall.R +9 -0
  7. data/lazar.gemspec +7 -7
  8. data/lib/classification.rb +5 -78
  9. data/lib/compound.rb +201 -44
  10. data/lib/crossvalidation.rb +224 -121
  11. data/lib/dataset.rb +83 -93
  12. data/lib/error.rb +1 -1
  13. data/lib/experiment.rb +99 -0
  14. data/lib/feature.rb +2 -54
  15. data/lib/lazar.rb +47 -34
  16. data/lib/leave-one-out-validation.rb +205 -0
  17. data/lib/model.rb +131 -76
  18. data/lib/opentox.rb +2 -2
  19. data/lib/overwrite.rb +37 -0
  20. data/lib/physchem.rb +133 -0
  21. data/lib/regression.rb +117 -189
  22. data/lib/rest-client-wrapper.rb +4 -5
  23. data/lib/unique_descriptors.rb +6 -7
  24. data/lib/validation.rb +63 -69
  25. data/test/all.rb +2 -2
  26. data/test/classification.rb +41 -0
  27. data/test/compound.rb +116 -7
  28. data/test/data/LOAEL_log_mg_corrected_smiles.csv +567 -567
  29. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +566 -566
  30. data/test/data/LOAEL_mmol_corrected_smiles.csv +568 -0
  31. data/test/data/batch_prediction.csv +25 -0
  32. data/test/data/batch_prediction_inchi_small.csv +4 -0
  33. data/test/data/batch_prediction_smiles_small.csv +4 -0
  34. data/test/data/hamster_carcinogenicity.json +3 -0
  35. data/test/data/loael.csv +568 -0
  36. data/test/dataset-long.rb +5 -8
  37. data/test/dataset.rb +31 -11
  38. data/test/default_environment.rb +11 -0
  39. data/test/descriptor.rb +26 -41
  40. data/test/error.rb +1 -3
  41. data/test/experiment.rb +301 -0
  42. data/test/feature.rb +22 -10
  43. data/test/lazar-long.rb +43 -23
  44. data/test/lazar-physchem-short.rb +19 -16
  45. data/test/prediction_models.rb +20 -0
  46. data/test/regression.rb +43 -0
  47. data/test/setup.rb +3 -1
  48. data/test/test_environment.rb +10 -0
  49. data/test/validation.rb +92 -26
  50. metadata +64 -38
  51. data/lib/SMARTS_InteLigand.txt +0 -983
  52. data/lib/bbrc.rb +0 -165
  53. data/lib/descriptor.rb +0 -247
  54. data/lib/neighbor.rb +0 -25
  55. data/lib/similarity.rb +0 -58
  56. data/mongoid.yml +0 -8
  57. data/test/descriptor-long.rb +0 -26
  58. data/test/fminer-long.rb +0 -38
  59. data/test/fminer.rb +0 -52
  60. data/test/lazar-fminer.rb +0 -50
  61. data/test/lazar-regression.rb +0 -27
@@ -6,12 +6,58 @@ module OpenTox
6
6
  field :folds, type: Integer
7
7
  field :nr_instances, type: Integer
8
8
  field :nr_unpredicted, type: Integer
9
- field :predictions, type: Array
9
+ field :predictions, type: Array, default: []
10
10
  field :finished_at, type: Time
11
11
 
12
12
  def time
13
13
  finished_at - created_at
14
14
  end
15
+
16
+ def validations
17
+ validation_ids.collect{|vid| Validation.find vid}
18
+ end
19
+
20
+ def model
21
+ Model::Lazar.find model_id
22
+ end
23
+
24
+ def self.create model, n=10
25
+ model.training_dataset.features.first.nominal? ? klass = ClassificationCrossValidation : klass = RegressionCrossValidation
26
+ bad_request_error "#{dataset.features.first} is neither nominal nor numeric." unless klass
27
+ cv = klass.new(
28
+ name: model.name,
29
+ model_id: model.id,
30
+ folds: n
31
+ )
32
+ cv.save # set created_at
33
+ nr_instances = 0
34
+ nr_unpredicted = 0
35
+ predictions = []
36
+ training_dataset = Dataset.find model.training_dataset_id
37
+ training_dataset.folds(n).each_with_index do |fold,fold_nr|
38
+ #fork do # parallel execution of validations
39
+ $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started"
40
+ t = Time.now
41
+ validation = Validation.create(model, fold[0], fold[1],cv)
42
+ $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds"
43
+ #end
44
+ end
45
+ #Process.waitall
46
+ cv.validation_ids = Validation.where(:crossvalidation_id => cv.id).distinct(:_id)
47
+ cv.validations.each do |validation|
48
+ nr_instances += validation.nr_instances
49
+ nr_unpredicted += validation.nr_unpredicted
50
+ predictions += validation.predictions
51
+ end
52
+ cv.update_attributes(
53
+ nr_instances: nr_instances,
54
+ nr_unpredicted: nr_unpredicted,
55
+ predictions: predictions#.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
56
+ )
57
+ $logger.debug "Nr unpredicted: #{nr_unpredicted}"
58
+ cv.statistics
59
+ cv
60
+ end
15
61
  end
16
62
 
17
63
  class ClassificationCrossValidation < CrossValidation
@@ -23,39 +69,41 @@ module OpenTox
23
69
  field :weighted_accuracy, type: Float
24
70
  field :true_rate, type: Hash
25
71
  field :predictivity, type: Hash
72
+ field :confidence_plot_id, type: BSON::ObjectId
26
73
  # TODO auc, f-measure (usability??)
27
74
 
28
- def self.create model, n=10
29
- cv = self.new
30
- cv.save # set created_at
31
- validation_ids = []
32
- nr_instances = 0
33
- nr_unpredicted = 0
34
- predictions = []
35
- validation_class = Object.const_get(self.to_s.sub(/Cross/,''))
75
+ def statistics
36
76
  accept_values = Feature.find(model.prediction_feature_id).accept_values
37
77
  confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
38
78
  weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
39
79
  true_rate = {}
40
80
  predictivity = {}
41
- fold_nr = 1
42
- training_dataset = Dataset.find model.training_dataset_id
43
- training_dataset.folds(n).each do |fold|
44
- t = Time.now
45
- $logger.debug "Fold #{fold_nr}"
46
- validation = validation_class.create(model, fold[0], fold[1])
47
- validation_ids << validation.id
48
- nr_instances += validation.nr_instances
49
- nr_unpredicted += validation.nr_unpredicted
50
- predictions += validation.predictions
51
- validation.confusion_matrix.each_with_index do |r,i|
52
- r.each_with_index do |c,j|
53
- confusion_matrix[i][j] += c
54
- weighted_confusion_matrix[i][j] += validation.weighted_confusion_matrix[i][j]
81
+ predictions.each do |pred|
82
+ compound_id,activities,prediction,confidence = pred
83
+ if activities and prediction #and confidence.numeric?
84
+ if activities.uniq.size == 1
85
+ activity = activities.uniq.first
86
+ if prediction == activity
87
+ if prediction == accept_values[0]
88
+ confusion_matrix[0][0] += 1
89
+ #weighted_confusion_matrix[0][0] += confidence
90
+ elsif prediction == accept_values[1]
91
+ confusion_matrix[1][1] += 1
92
+ #weighted_confusion_matrix[1][1] += confidence
93
+ end
94
+ elsif prediction != activity
95
+ if prediction == accept_values[0]
96
+ confusion_matrix[0][1] += 1
97
+ #weighted_confusion_matrix[0][1] += confidence
98
+ elsif prediction == accept_values[1]
99
+ confusion_matrix[1][0] += 1
100
+ #weighted_confusion_matrix[1][0] += confidence
101
+ end
102
+ end
55
103
  end
104
+ else
105
+ nr_unpredicted += 1 if prediction.nil?
56
106
  end
57
- $logger.debug "Fold #{fold_nr}: #{Time.now-t} seconds"
58
- fold_nr +=1
59
107
  end
60
108
  true_rate = {}
61
109
  predictivity = {}
@@ -64,30 +112,48 @@ module OpenTox
64
112
  predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
65
113
  end
66
114
  confidence_sum = 0
67
- weighted_confusion_matrix.each do |r|
68
- r.each do |c|
69
- confidence_sum += c
70
- end
71
- end
72
- cv.update_attributes(
73
- name: model.name,
74
- model_id: model.id,
75
- folds: n,
76
- validation_ids: validation_ids,
77
- nr_instances: nr_instances,
78
- nr_unpredicted: nr_unpredicted,
115
+ #weighted_confusion_matrix.each do |r|
116
+ #r.each do |c|
117
+ #confidence_sum += c
118
+ #end
119
+ #end
120
+ update_attributes(
79
121
  accept_values: accept_values,
80
122
  confusion_matrix: confusion_matrix,
81
- weighted_confusion_matrix: weighted_confusion_matrix,
123
+ #weighted_confusion_matrix: weighted_confusion_matrix,
82
124
  accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
83
- weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
125
+ #weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
84
126
  true_rate: true_rate,
85
127
  predictivity: predictivity,
86
- predictions: predictions.sort{|a,b| b[3] <=> a[3]}, # sort according to confidence
87
128
  finished_at: Time.now
88
129
  )
89
- cv.save
90
- cv
130
+ $logger.debug "Accuracy #{accuracy}"
131
+ end
132
+
133
+ def confidence_plot
134
+ unless confidence_plot_id
135
+ tmpfile = "/tmp/#{id.to_s}_confidence.png"
136
+ accuracies = []
137
+ confidences = []
138
+ correct_predictions = 0
139
+ incorrect_predictions = 0
140
+ predictions.each do |p|
141
+ if p[1] and p[2]
142
+ p[1] == p[2] ? correct_predictions += 1 : incorrect_predictions += 1
143
+ accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
144
+ confidences << p[3]
145
+
146
+ end
147
+ end
148
+ R.assign "accuracy", accuracies
149
+ R.assign "confidence", confidences
150
+ R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()"
151
+ R.eval "ggsave(file='#{tmpfile}', plot=image)"
152
+ file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.png")
153
+ plot_id = $gridfs.insert_one(file)
154
+ update(:confidence_plot_id => plot_id)
155
+ end
156
+ $gridfs.find_one(_id: confidence_plot_id).data
91
157
  end
92
158
 
93
159
  #Average area under roc 0.646
@@ -99,98 +165,135 @@ module OpenTox
99
165
 
100
166
  field :rmse, type: Float
101
167
  field :mae, type: Float
102
- field :weighted_rmse, type: Float
103
- field :weighted_mae, type: Float
168
+ field :r_squared, type: Float
169
+ field :correlation_plot_id, type: BSON::ObjectId
104
170
 
105
- def self.create model, n=10
106
- cv = self.new
107
- cv.save # set created_at
108
- validation_ids = []
109
- nr_instances = 0
110
- nr_unpredicted = 0
111
- predictions = []
112
- validation_class = Object.const_get(self.to_s.sub(/Cross/,''))
113
- fold_nr = 1
114
- training_dataset = Dataset.find model.training_dataset_id
115
- training_dataset.folds(n).each do |fold|
116
- t = Time.now
117
- $logger.debug "Predicting fold #{fold_nr}"
118
-
119
- validation = validation_class.create(model, fold[0], fold[1])
120
- validation_ids << validation.id
121
- nr_instances += validation.nr_instances
122
- nr_unpredicted += validation.nr_unpredicted
123
- predictions += validation.predictions
124
- $logger.debug "Fold #{fold_nr}: #{Time.now-t} seconds"
125
- fold_nr +=1
126
- end
171
+ def statistics
127
172
  rmse = 0
128
- weighted_rmse = 0
129
- rse = 0
130
- weighted_rse = 0
131
173
  mae = 0
132
- weighted_mae = 0
133
- rae = 0
134
- weighted_rae = 0
135
- n = 0
136
- confidence_sum = 0
174
+ x = []
175
+ y = []
137
176
  predictions.each do |pred|
138
177
  compound_id,activity,prediction,confidence = pred
139
- if activity and prediction
140
- error = prediction-activity
141
- rmse += error**2
142
- weighted_rmse += confidence*error**2
143
- mae += error.abs
144
- weighted_mae += confidence*error.abs
145
- n += 1
146
- confidence_sum += confidence
178
+ if activity and prediction
179
+ unless activity == [nil]
180
+ x << -Math.log10(activity.median)
181
+ y << -Math.log10(prediction)
182
+ error = Math.log10(prediction)-Math.log10(activity.median)
183
+ rmse += error**2
184
+ #weighted_rmse += confidence*error**2
185
+ mae += error.abs
186
+ #weighted_mae += confidence*error.abs
187
+ #confidence_sum += confidence
188
+ end
147
189
  else
148
- # TODO: create warnings
149
- p pred
190
+ warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
191
+ $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
150
192
  end
151
193
  end
152
- mae = mae/n
153
- weighted_mae = weighted_mae/confidence_sum
154
- rmse = Math.sqrt(rmse/n)
155
- weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
156
- cv.update_attributes(
157
- name: model.name,
158
- model_id: model.id,
159
- folds: n,
160
- validation_ids: validation_ids,
161
- nr_instances: nr_instances,
162
- nr_unpredicted: nr_unpredicted,
163
- predictions: predictions.sort{|a,b| b[3] <=> a[3]},
194
+ R.assign "measurement", x
195
+ R.assign "prediction", y
196
+ R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
197
+ r = R.eval("r").to_ruby
198
+
199
+ mae = mae/predictions.size
200
+ #weighted_mae = weighted_mae/confidence_sum
201
+ rmse = Math.sqrt(rmse/predictions.size)
202
+ #weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
203
+ update_attributes(
164
204
  mae: mae,
165
205
  rmse: rmse,
166
- weighted_mae: weighted_mae,
167
- weighted_rmse: weighted_rmse
206
+ #weighted_mae: weighted_mae,
207
+ #weighted_rmse: weighted_rmse,
208
+ r_squared: r**2,
209
+ finished_at: Time.now
168
210
  )
169
- cv.save
170
- cv
211
+ $logger.debug "R^2 #{r**2}"
212
+ $logger.debug "RMSE #{rmse}"
213
+ $logger.debug "MAE #{mae}"
214
+ end
215
+
216
+ def misclassifications n=nil
217
+ #n = predictions.size unless n
218
+ n ||= 10
219
+ model = Model::Lazar.find(self.model_id)
220
+ training_dataset = Dataset.find(model.training_dataset_id)
221
+ prediction_feature = training_dataset.features.first
222
+ predictions.collect do |p|
223
+ unless p.include? nil
224
+ compound = Compound.find(p[0])
225
+ neighbors = compound.send(model.neighbor_algorithm,model.neighbor_algorithm_parameters)
226
+ neighbors.collect! do |n|
227
+ neighbor = Compound.find(n[0])
228
+ values = training_dataset.values(neighbor,prediction_feature)
229
+ { :smiles => neighbor.smiles, :similarity => n[1], :measurements => values}
230
+ end
231
+ {
232
+ :smiles => compound.smiles,
233
+ #:fingerprint => compound.fp4.collect{|id| Smarts.find(id).name},
234
+ :measured => p[1],
235
+ :predicted => p[2],
236
+ #:relative_error => (Math.log10(p[1])-Math.log10(p[2])).abs/Math.log10(p[1]).to_f.abs,
237
+ :log_error => (Math.log10(p[1])-Math.log10(p[2])).abs,
238
+ :relative_error => (p[1]-p[2]).abs/p[1],
239
+ :confidence => p[3],
240
+ :neighbors => neighbors
241
+ }
242
+ end
243
+ end.compact.sort{|a,b| b[:relative_error] <=> a[:relative_error]}[0..n-1]
244
+ end
245
+
246
+ def confidence_plot
247
+ tmpfile = "/tmp/#{id.to_s}_confidence.png"
248
+ sorted_predictions = predictions.collect{|p| [(Math.log10(p[1])-Math.log10(p[2])).abs,p[3]] if p[1] and p[2]}.compact
249
+ R.assign "error", sorted_predictions.collect{|p| p[0]}
250
+ R.assign "confidence", sorted_predictions.collect{|p| p[1]}
251
+ # TODO fix axis names
252
+ R.eval "image = qplot(confidence,error)"
253
+ R.eval "image = image + stat_smooth(method='lm', se=FALSE)"
254
+ R.eval "ggsave(file='#{tmpfile}', plot=image)"
255
+ file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.png")
256
+ plot_id = $gridfs.insert_one(file)
257
+ update(:confidence_plot_id => plot_id)
258
+ $gridfs.find_one(_id: confidence_plot_id).data
259
+ end
260
+
261
+ def correlation_plot
262
+ unless correlation_plot_id
263
+ tmpfile = "/tmp/#{id.to_s}_correlation.png"
264
+ x = predictions.collect{|p| p[1]}
265
+ y = predictions.collect{|p| p[2]}
266
+ attributes = Model::Lazar.find(self.model_id).attributes
267
+ attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key}
268
+ attributes = attributes.values.collect{|v| v.is_a?(String) ? v.sub(/OpenTox::/,'') : v}.join("\n")
269
+ R.assign "measurement", x
270
+ R.assign "prediction", y
271
+ R.eval "all = c(-log(measurement),-log(prediction))"
272
+ R.eval "range = c(min(all), max(all))"
273
+ R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)"
274
+ R.eval "image = image + geom_abline(intercept=0, slope=1)"
275
+ R.eval "ggsave(file='#{tmpfile}', plot=image)"
276
+ file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.png")
277
+ plot_id = $gridfs.insert_one(file)
278
+ update(:correlation_plot_id => plot_id)
279
+ end
280
+ $gridfs.find_one(_id: correlation_plot_id).data
171
281
  end
282
+ end
172
283
 
173
- def plot
174
- # RMSE
175
- x = predictions.collect{|p| p[1]}
176
- y = predictions.collect{|p| p[2]}
177
- R.assign "Measurement", x
178
- R.assign "Prediction", y
179
- R.eval "par(pty='s')" # sets the plot type to be square
180
- #R.eval "fitline <- lm(log(Prediction) ~ log(Measurement))"
181
- #R.eval "error <- log(Measurement)-log(Prediction)"
182
- R.eval "error <- Measurement-Prediction"
183
- R.eval "rmse <- sqrt(mean(error^2,na.rm=T))"
184
- R.eval "mae <- mean( abs(error), na.rm = TRUE)"
185
- R.eval "r <- cor(log(Prediction),log(Measurement))"
186
- R.eval "svg(filename='/tmp/#{id.to_s}.svg')"
187
- R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', sub=paste('RMSE: ',rmse, 'MAE :',mae, 'r^2: ',r^2),asp=1)"
188
- #R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', sub=paste('RMSE: ',rmse, 'MAE :',mae, 'r^2: '),asp=1)"
189
- #R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', ,asp=1)"
190
- R.eval "abline(0,1,col='blue')"
191
- #R.eval "abline(fitline,col='red')"
192
- R.eval "dev.off()"
193
- "/tmp/#{id.to_s}.svg"
284
+ class RepeatedCrossValidation
285
+ field :crossvalidation_ids, type: Array, default: []
286
+ def self.create model, folds=10, repeats=3
287
+ repeated_cross_validation = self.new
288
+ repeats.times do |n|
289
+ $logger.debug "Crossvalidation #{n+1} for #{model.name}"
290
+ repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id
291
+ end
292
+ repeated_cross_validation.save
293
+ repeated_cross_validation
294
+ end
295
+ def crossvalidations
296
+ crossvalidation_ids.collect{|id| CrossValidation.find(id)}
194
297
  end
195
298
  end
196
299