lazar 0.0.7 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +3 -0
  3. data/README.md +2 -1
  4. data/VERSION +1 -1
  5. data/ext/lazar/extconf.rb +15 -76
  6. data/ext/lazar/rinstall.R +9 -0
  7. data/lazar.gemspec +7 -7
  8. data/lib/classification.rb +5 -78
  9. data/lib/compound.rb +201 -44
  10. data/lib/crossvalidation.rb +224 -121
  11. data/lib/dataset.rb +83 -93
  12. data/lib/error.rb +1 -1
  13. data/lib/experiment.rb +99 -0
  14. data/lib/feature.rb +2 -54
  15. data/lib/lazar.rb +47 -34
  16. data/lib/leave-one-out-validation.rb +205 -0
  17. data/lib/model.rb +131 -76
  18. data/lib/opentox.rb +2 -2
  19. data/lib/overwrite.rb +37 -0
  20. data/lib/physchem.rb +133 -0
  21. data/lib/regression.rb +117 -189
  22. data/lib/rest-client-wrapper.rb +4 -5
  23. data/lib/unique_descriptors.rb +6 -7
  24. data/lib/validation.rb +63 -69
  25. data/test/all.rb +2 -2
  26. data/test/classification.rb +41 -0
  27. data/test/compound.rb +116 -7
  28. data/test/data/LOAEL_log_mg_corrected_smiles.csv +567 -567
  29. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +566 -566
  30. data/test/data/LOAEL_mmol_corrected_smiles.csv +568 -0
  31. data/test/data/batch_prediction.csv +25 -0
  32. data/test/data/batch_prediction_inchi_small.csv +4 -0
  33. data/test/data/batch_prediction_smiles_small.csv +4 -0
  34. data/test/data/hamster_carcinogenicity.json +3 -0
  35. data/test/data/loael.csv +568 -0
  36. data/test/dataset-long.rb +5 -8
  37. data/test/dataset.rb +31 -11
  38. data/test/default_environment.rb +11 -0
  39. data/test/descriptor.rb +26 -41
  40. data/test/error.rb +1 -3
  41. data/test/experiment.rb +301 -0
  42. data/test/feature.rb +22 -10
  43. data/test/lazar-long.rb +43 -23
  44. data/test/lazar-physchem-short.rb +19 -16
  45. data/test/prediction_models.rb +20 -0
  46. data/test/regression.rb +43 -0
  47. data/test/setup.rb +3 -1
  48. data/test/test_environment.rb +10 -0
  49. data/test/validation.rb +92 -26
  50. metadata +64 -38
  51. data/lib/SMARTS_InteLigand.txt +0 -983
  52. data/lib/bbrc.rb +0 -165
  53. data/lib/descriptor.rb +0 -247
  54. data/lib/neighbor.rb +0 -25
  55. data/lib/similarity.rb +0 -58
  56. data/mongoid.yml +0 -8
  57. data/test/descriptor-long.rb +0 -26
  58. data/test/fminer-long.rb +0 -38
  59. data/test/fminer.rb +0 -52
  60. data/test/lazar-fminer.rb +0 -50
  61. data/test/lazar-regression.rb +0 -27
@@ -6,12 +6,58 @@ module OpenTox
6
6
  field :folds, type: Integer
7
7
  field :nr_instances, type: Integer
8
8
  field :nr_unpredicted, type: Integer
9
- field :predictions, type: Array
9
+ field :predictions, type: Array, default: []
10
10
  field :finished_at, type: Time
11
11
 
12
12
  def time
13
13
  finished_at - created_at
14
14
  end
15
+
16
+ def validations
17
+ validation_ids.collect{|vid| Validation.find vid}
18
+ end
19
+
20
+ def model
21
+ Model::Lazar.find model_id
22
+ end
23
+
24
+ def self.create model, n=10
25
+ model.training_dataset.features.first.nominal? ? klass = ClassificationCrossValidation : klass = RegressionCrossValidation
26
+ bad_request_error "#{dataset.features.first} is neither nominal nor numeric." unless klass
27
+ cv = klass.new(
28
+ name: model.name,
29
+ model_id: model.id,
30
+ folds: n
31
+ )
32
+ cv.save # set created_at
33
+ nr_instances = 0
34
+ nr_unpredicted = 0
35
+ predictions = []
36
+ training_dataset = Dataset.find model.training_dataset_id
37
+ training_dataset.folds(n).each_with_index do |fold,fold_nr|
38
+ #fork do # parallel execution of validations
39
+ $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started"
40
+ t = Time.now
41
+ validation = Validation.create(model, fold[0], fold[1],cv)
42
+ $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds"
43
+ #end
44
+ end
45
+ #Process.waitall
46
+ cv.validation_ids = Validation.where(:crossvalidation_id => cv.id).distinct(:_id)
47
+ cv.validations.each do |validation|
48
+ nr_instances += validation.nr_instances
49
+ nr_unpredicted += validation.nr_unpredicted
50
+ predictions += validation.predictions
51
+ end
52
+ cv.update_attributes(
53
+ nr_instances: nr_instances,
54
+ nr_unpredicted: nr_unpredicted,
55
+ predictions: predictions#.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
56
+ )
57
+ $logger.debug "Nr unpredicted: #{nr_unpredicted}"
58
+ cv.statistics
59
+ cv
60
+ end
15
61
  end
16
62
 
17
63
  class ClassificationCrossValidation < CrossValidation
@@ -23,39 +69,41 @@ module OpenTox
23
69
  field :weighted_accuracy, type: Float
24
70
  field :true_rate, type: Hash
25
71
  field :predictivity, type: Hash
72
+ field :confidence_plot_id, type: BSON::ObjectId
26
73
  # TODO auc, f-measure (usability??)
27
74
 
28
- def self.create model, n=10
29
- cv = self.new
30
- cv.save # set created_at
31
- validation_ids = []
32
- nr_instances = 0
33
- nr_unpredicted = 0
34
- predictions = []
35
- validation_class = Object.const_get(self.to_s.sub(/Cross/,''))
75
+ def statistics
36
76
  accept_values = Feature.find(model.prediction_feature_id).accept_values
37
77
  confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
38
78
  weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
39
79
  true_rate = {}
40
80
  predictivity = {}
41
- fold_nr = 1
42
- training_dataset = Dataset.find model.training_dataset_id
43
- training_dataset.folds(n).each do |fold|
44
- t = Time.now
45
- $logger.debug "Fold #{fold_nr}"
46
- validation = validation_class.create(model, fold[0], fold[1])
47
- validation_ids << validation.id
48
- nr_instances += validation.nr_instances
49
- nr_unpredicted += validation.nr_unpredicted
50
- predictions += validation.predictions
51
- validation.confusion_matrix.each_with_index do |r,i|
52
- r.each_with_index do |c,j|
53
- confusion_matrix[i][j] += c
54
- weighted_confusion_matrix[i][j] += validation.weighted_confusion_matrix[i][j]
81
+ predictions.each do |pred|
82
+ compound_id,activities,prediction,confidence = pred
83
+ if activities and prediction #and confidence.numeric?
84
+ if activities.uniq.size == 1
85
+ activity = activities.uniq.first
86
+ if prediction == activity
87
+ if prediction == accept_values[0]
88
+ confusion_matrix[0][0] += 1
89
+ #weighted_confusion_matrix[0][0] += confidence
90
+ elsif prediction == accept_values[1]
91
+ confusion_matrix[1][1] += 1
92
+ #weighted_confusion_matrix[1][1] += confidence
93
+ end
94
+ elsif prediction != activity
95
+ if prediction == accept_values[0]
96
+ confusion_matrix[0][1] += 1
97
+ #weighted_confusion_matrix[0][1] += confidence
98
+ elsif prediction == accept_values[1]
99
+ confusion_matrix[1][0] += 1
100
+ #weighted_confusion_matrix[1][0] += confidence
101
+ end
102
+ end
55
103
  end
104
+ else
105
+ nr_unpredicted += 1 if prediction.nil?
56
106
  end
57
- $logger.debug "Fold #{fold_nr}: #{Time.now-t} seconds"
58
- fold_nr +=1
59
107
  end
60
108
  true_rate = {}
61
109
  predictivity = {}
@@ -64,30 +112,48 @@ module OpenTox
64
112
  predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
65
113
  end
66
114
  confidence_sum = 0
67
- weighted_confusion_matrix.each do |r|
68
- r.each do |c|
69
- confidence_sum += c
70
- end
71
- end
72
- cv.update_attributes(
73
- name: model.name,
74
- model_id: model.id,
75
- folds: n,
76
- validation_ids: validation_ids,
77
- nr_instances: nr_instances,
78
- nr_unpredicted: nr_unpredicted,
115
+ #weighted_confusion_matrix.each do |r|
116
+ #r.each do |c|
117
+ #confidence_sum += c
118
+ #end
119
+ #end
120
+ update_attributes(
79
121
  accept_values: accept_values,
80
122
  confusion_matrix: confusion_matrix,
81
- weighted_confusion_matrix: weighted_confusion_matrix,
123
+ #weighted_confusion_matrix: weighted_confusion_matrix,
82
124
  accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
83
- weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
125
+ #weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
84
126
  true_rate: true_rate,
85
127
  predictivity: predictivity,
86
- predictions: predictions.sort{|a,b| b[3] <=> a[3]}, # sort according to confidence
87
128
  finished_at: Time.now
88
129
  )
89
- cv.save
90
- cv
130
+ $logger.debug "Accuracy #{accuracy}"
131
+ end
132
+
133
+ def confidence_plot
134
+ unless confidence_plot_id
135
+ tmpfile = "/tmp/#{id.to_s}_confidence.png"
136
+ accuracies = []
137
+ confidences = []
138
+ correct_predictions = 0
139
+ incorrect_predictions = 0
140
+ predictions.each do |p|
141
+ if p[1] and p[2]
142
+ p[1] == p[2] ? correct_predictions += 1 : incorrect_predictions += 1
143
+ accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
144
+ confidences << p[3]
145
+
146
+ end
147
+ end
148
+ R.assign "accuracy", accuracies
149
+ R.assign "confidence", confidences
150
+ R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()"
151
+ R.eval "ggsave(file='#{tmpfile}', plot=image)"
152
+ file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.png")
153
+ plot_id = $gridfs.insert_one(file)
154
+ update(:confidence_plot_id => plot_id)
155
+ end
156
+ $gridfs.find_one(_id: confidence_plot_id).data
91
157
  end
92
158
 
93
159
  #Average area under roc 0.646
@@ -99,98 +165,135 @@ module OpenTox
99
165
 
100
166
  field :rmse, type: Float
101
167
  field :mae, type: Float
102
- field :weighted_rmse, type: Float
103
- field :weighted_mae, type: Float
168
+ field :r_squared, type: Float
169
+ field :correlation_plot_id, type: BSON::ObjectId
104
170
 
105
- def self.create model, n=10
106
- cv = self.new
107
- cv.save # set created_at
108
- validation_ids = []
109
- nr_instances = 0
110
- nr_unpredicted = 0
111
- predictions = []
112
- validation_class = Object.const_get(self.to_s.sub(/Cross/,''))
113
- fold_nr = 1
114
- training_dataset = Dataset.find model.training_dataset_id
115
- training_dataset.folds(n).each do |fold|
116
- t = Time.now
117
- $logger.debug "Predicting fold #{fold_nr}"
118
-
119
- validation = validation_class.create(model, fold[0], fold[1])
120
- validation_ids << validation.id
121
- nr_instances += validation.nr_instances
122
- nr_unpredicted += validation.nr_unpredicted
123
- predictions += validation.predictions
124
- $logger.debug "Fold #{fold_nr}: #{Time.now-t} seconds"
125
- fold_nr +=1
126
- end
171
+ def statistics
127
172
  rmse = 0
128
- weighted_rmse = 0
129
- rse = 0
130
- weighted_rse = 0
131
173
  mae = 0
132
- weighted_mae = 0
133
- rae = 0
134
- weighted_rae = 0
135
- n = 0
136
- confidence_sum = 0
174
+ x = []
175
+ y = []
137
176
  predictions.each do |pred|
138
177
  compound_id,activity,prediction,confidence = pred
139
- if activity and prediction
140
- error = prediction-activity
141
- rmse += error**2
142
- weighted_rmse += confidence*error**2
143
- mae += error.abs
144
- weighted_mae += confidence*error.abs
145
- n += 1
146
- confidence_sum += confidence
178
+ if activity and prediction
179
+ unless activity == [nil]
180
+ x << -Math.log10(activity.median)
181
+ y << -Math.log10(prediction)
182
+ error = Math.log10(prediction)-Math.log10(activity.median)
183
+ rmse += error**2
184
+ #weighted_rmse += confidence*error**2
185
+ mae += error.abs
186
+ #weighted_mae += confidence*error.abs
187
+ #confidence_sum += confidence
188
+ end
147
189
  else
148
- # TODO: create warnings
149
- p pred
190
+ warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
191
+ $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
150
192
  end
151
193
  end
152
- mae = mae/n
153
- weighted_mae = weighted_mae/confidence_sum
154
- rmse = Math.sqrt(rmse/n)
155
- weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
156
- cv.update_attributes(
157
- name: model.name,
158
- model_id: model.id,
159
- folds: n,
160
- validation_ids: validation_ids,
161
- nr_instances: nr_instances,
162
- nr_unpredicted: nr_unpredicted,
163
- predictions: predictions.sort{|a,b| b[3] <=> a[3]},
194
+ R.assign "measurement", x
195
+ R.assign "prediction", y
196
+ R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
197
+ r = R.eval("r").to_ruby
198
+
199
+ mae = mae/predictions.size
200
+ #weighted_mae = weighted_mae/confidence_sum
201
+ rmse = Math.sqrt(rmse/predictions.size)
202
+ #weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
203
+ update_attributes(
164
204
  mae: mae,
165
205
  rmse: rmse,
166
- weighted_mae: weighted_mae,
167
- weighted_rmse: weighted_rmse
206
+ #weighted_mae: weighted_mae,
207
+ #weighted_rmse: weighted_rmse,
208
+ r_squared: r**2,
209
+ finished_at: Time.now
168
210
  )
169
- cv.save
170
- cv
211
+ $logger.debug "R^2 #{r**2}"
212
+ $logger.debug "RMSE #{rmse}"
213
+ $logger.debug "MAE #{mae}"
214
+ end
215
+
216
+ def misclassifications n=nil
217
+ #n = predictions.size unless n
218
+ n ||= 10
219
+ model = Model::Lazar.find(self.model_id)
220
+ training_dataset = Dataset.find(model.training_dataset_id)
221
+ prediction_feature = training_dataset.features.first
222
+ predictions.collect do |p|
223
+ unless p.include? nil
224
+ compound = Compound.find(p[0])
225
+ neighbors = compound.send(model.neighbor_algorithm,model.neighbor_algorithm_parameters)
226
+ neighbors.collect! do |n|
227
+ neighbor = Compound.find(n[0])
228
+ values = training_dataset.values(neighbor,prediction_feature)
229
+ { :smiles => neighbor.smiles, :similarity => n[1], :measurements => values}
230
+ end
231
+ {
232
+ :smiles => compound.smiles,
233
+ #:fingerprint => compound.fp4.collect{|id| Smarts.find(id).name},
234
+ :measured => p[1],
235
+ :predicted => p[2],
236
+ #:relative_error => (Math.log10(p[1])-Math.log10(p[2])).abs/Math.log10(p[1]).to_f.abs,
237
+ :log_error => (Math.log10(p[1])-Math.log10(p[2])).abs,
238
+ :relative_error => (p[1]-p[2]).abs/p[1],
239
+ :confidence => p[3],
240
+ :neighbors => neighbors
241
+ }
242
+ end
243
+ end.compact.sort{|a,b| b[:relative_error] <=> a[:relative_error]}[0..n-1]
244
+ end
245
+
246
+ def confidence_plot
247
+ tmpfile = "/tmp/#{id.to_s}_confidence.png"
248
+ sorted_predictions = predictions.collect{|p| [(Math.log10(p[1])-Math.log10(p[2])).abs,p[3]] if p[1] and p[2]}.compact
249
+ R.assign "error", sorted_predictions.collect{|p| p[0]}
250
+ R.assign "confidence", sorted_predictions.collect{|p| p[1]}
251
+ # TODO fix axis names
252
+ R.eval "image = qplot(confidence,error)"
253
+ R.eval "image = image + stat_smooth(method='lm', se=FALSE)"
254
+ R.eval "ggsave(file='#{tmpfile}', plot=image)"
255
+ file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.png")
256
+ plot_id = $gridfs.insert_one(file)
257
+ update(:confidence_plot_id => plot_id)
258
+ $gridfs.find_one(_id: confidence_plot_id).data
259
+ end
260
+
261
+ def correlation_plot
262
+ unless correlation_plot_id
263
+ tmpfile = "/tmp/#{id.to_s}_correlation.png"
264
+ x = predictions.collect{|p| p[1]}
265
+ y = predictions.collect{|p| p[2]}
266
+ attributes = Model::Lazar.find(self.model_id).attributes
267
+ attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key}
268
+ attributes = attributes.values.collect{|v| v.is_a?(String) ? v.sub(/OpenTox::/,'') : v}.join("\n")
269
+ R.assign "measurement", x
270
+ R.assign "prediction", y
271
+ R.eval "all = c(-log(measurement),-log(prediction))"
272
+ R.eval "range = c(min(all), max(all))"
273
+ R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)"
274
+ R.eval "image = image + geom_abline(intercept=0, slope=1)"
275
+ R.eval "ggsave(file='#{tmpfile}', plot=image)"
276
+ file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.png")
277
+ plot_id = $gridfs.insert_one(file)
278
+ update(:correlation_plot_id => plot_id)
279
+ end
280
+ $gridfs.find_one(_id: correlation_plot_id).data
171
281
  end
282
+ end
172
283
 
173
- def plot
174
- # RMSE
175
- x = predictions.collect{|p| p[1]}
176
- y = predictions.collect{|p| p[2]}
177
- R.assign "Measurement", x
178
- R.assign "Prediction", y
179
- R.eval "par(pty='s')" # sets the plot type to be square
180
- #R.eval "fitline <- lm(log(Prediction) ~ log(Measurement))"
181
- #R.eval "error <- log(Measurement)-log(Prediction)"
182
- R.eval "error <- Measurement-Prediction"
183
- R.eval "rmse <- sqrt(mean(error^2,na.rm=T))"
184
- R.eval "mae <- mean( abs(error), na.rm = TRUE)"
185
- R.eval "r <- cor(log(Prediction),log(Measurement))"
186
- R.eval "svg(filename='/tmp/#{id.to_s}.svg')"
187
- R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', sub=paste('RMSE: ',rmse, 'MAE :',mae, 'r^2: ',r^2),asp=1)"
188
- #R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', sub=paste('RMSE: ',rmse, 'MAE :',mae, 'r^2: '),asp=1)"
189
- #R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', ,asp=1)"
190
- R.eval "abline(0,1,col='blue')"
191
- #R.eval "abline(fitline,col='red')"
192
- R.eval "dev.off()"
193
- "/tmp/#{id.to_s}.svg"
284
+ class RepeatedCrossValidation
285
+ field :crossvalidation_ids, type: Array, default: []
286
+ def self.create model, folds=10, repeats=3
287
+ repeated_cross_validation = self.new
288
+ repeats.times do |n|
289
+ $logger.debug "Crossvalidation #{n+1} for #{model.name}"
290
+ repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id
291
+ end
292
+ repeated_cross_validation.save
293
+ repeated_cross_validation
294
+ end
295
+ def crossvalidations
296
+ crossvalidation_ids.collect{|id| CrossValidation.find(id)}
194
297
  end
195
298
  end
196
299