lazar 0.9.3 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (88) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -4
  3. data/README.md +5 -15
  4. data/VERSION +1 -1
  5. data/ext/lazar/extconf.rb +1 -1
  6. data/ext/lazar/rinstall.R +9 -7
  7. data/java/CdkDescriptorInfo.class +0 -0
  8. data/java/CdkDescriptorInfo.java +3 -2
  9. data/java/CdkDescriptors.class +0 -0
  10. data/java/CdkDescriptors.java +28 -28
  11. data/java/Rakefile +3 -3
  12. data/java/{cdk-1.4.19.jar → cdk-2.0-SNAPSHOT.jar} +0 -0
  13. data/lazar.gemspec +6 -7
  14. data/lib/algorithm.rb +2 -11
  15. data/lib/caret.rb +96 -0
  16. data/lib/classification.rb +14 -22
  17. data/lib/compound.rb +21 -87
  18. data/lib/crossvalidation.rb +80 -279
  19. data/lib/dataset.rb +105 -174
  20. data/lib/feature.rb +11 -18
  21. data/lib/feature_selection.rb +42 -0
  22. data/lib/import.rb +122 -0
  23. data/lib/lazar.rb +14 -4
  24. data/lib/leave-one-out-validation.rb +46 -192
  25. data/lib/model.rb +319 -128
  26. data/lib/nanoparticle.rb +98 -0
  27. data/lib/opentox.rb +7 -4
  28. data/lib/overwrite.rb +24 -3
  29. data/lib/physchem.rb +11 -10
  30. data/lib/regression.rb +7 -137
  31. data/lib/rest-client-wrapper.rb +0 -6
  32. data/lib/similarity.rb +65 -0
  33. data/lib/substance.rb +8 -0
  34. data/lib/train-test-validation.rb +69 -0
  35. data/lib/validation-statistics.rb +223 -0
  36. data/lib/validation.rb +17 -100
  37. data/scripts/mg2mmol.rb +17 -0
  38. data/scripts/mirror-enm2test.rb +4 -0
  39. data/scripts/mmol2-log10.rb +32 -0
  40. data/test/compound.rb +4 -94
  41. data/test/data/EPAFHM.medi_log10.csv +92 -0
  42. data/test/data/EPAFHM.mini_log10.csv +16 -0
  43. data/test/data/EPAFHM_log10.csv +581 -0
  44. data/test/data/loael_log10.csv +568 -0
  45. data/test/dataset.rb +195 -133
  46. data/test/descriptor.rb +27 -18
  47. data/test/error.rb +2 -2
  48. data/test/experiment.rb +4 -4
  49. data/test/feature.rb +2 -3
  50. data/test/gridfs.rb +10 -0
  51. data/test/model-classification.rb +106 -0
  52. data/test/model-nanoparticle.rb +128 -0
  53. data/test/model-regression.rb +171 -0
  54. data/test/model-validation.rb +19 -0
  55. data/test/nanomaterial-model-validation.rb +55 -0
  56. data/test/setup.rb +8 -4
  57. data/test/validation-classification.rb +67 -0
  58. data/test/validation-nanoparticle.rb +133 -0
  59. data/test/validation-regression.rb +92 -0
  60. metadata +50 -121
  61. data/test/classification.rb +0 -41
  62. data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +0 -13553
  63. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +0 -436
  64. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +0 -568
  65. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +0 -87
  66. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +0 -978
  67. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +0 -1120
  68. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +0 -1113
  69. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +0 -850
  70. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +0 -829
  71. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +0 -1198
  72. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +0 -1505
  73. data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +0 -581
  74. data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +0 -1217
  75. data/test/data/LOAEL_log_mg_corrected_smiles.csv +0 -568
  76. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +0 -568
  77. data/test/data/boiling_points.ext.sdf +0 -11460
  78. data/test/data/cpdb_100.csv +0 -101
  79. data/test/data/hamster_carcinogenicity.ntriples +0 -618
  80. data/test/data/hamster_carcinogenicity.sdf +0 -2805
  81. data/test/data/hamster_carcinogenicity.xls +0 -0
  82. data/test/data/hamster_carcinogenicity.yaml +0 -352
  83. data/test/dataset-long.rb +0 -114
  84. data/test/lazar-long.rb +0 -92
  85. data/test/lazar-physchem-short.rb +0 -31
  86. data/test/prediction_models.rb +0 -20
  87. data/test/regression.rb +0 -43
  88. data/test/validation.rb +0 -108
@@ -1,301 +1,102 @@
1
1
  module OpenTox
2
2
 
3
- class CrossValidation
4
- field :validation_ids, type: Array, default: []
5
- field :model_id, type: BSON::ObjectId
6
- field :folds, type: Integer
7
- field :nr_instances, type: Integer
8
- field :nr_unpredicted, type: Integer
9
- field :predictions, type: Array, default: []
10
- field :finished_at, type: Time
11
-
12
- def time
13
- finished_at - created_at
14
- end
15
-
16
- def validations
17
- validation_ids.collect{|vid| Validation.find vid}
18
- end
19
-
20
- def model
21
- Model::Lazar.find model_id
22
- end
23
-
24
- def self.create model, n=10
25
- model.training_dataset.features.first.nominal? ? klass = ClassificationCrossValidation : klass = RegressionCrossValidation
26
- bad_request_error "#{dataset.features.first} is neither nominal nor numeric." unless klass
27
- cv = klass.new(
28
- name: model.name,
29
- model_id: model.id,
30
- folds: n
31
- )
32
- cv.save # set created_at
33
- nr_instances = 0
34
- nr_unpredicted = 0
35
- predictions = []
36
- training_dataset = Dataset.find model.training_dataset_id
37
- training_dataset.folds(n).each_with_index do |fold,fold_nr|
38
- #fork do # parallel execution of validations
39
- $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started"
40
- t = Time.now
41
- validation = Validation.create(model, fold[0], fold[1],cv)
42
- $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds"
43
- #end
44
- end
45
- #Process.waitall
46
- cv.validation_ids = Validation.where(:crossvalidation_id => cv.id).distinct(:_id)
47
- cv.validations.each do |validation|
48
- nr_instances += validation.nr_instances
49
- nr_unpredicted += validation.nr_unpredicted
50
- predictions += validation.predictions
51
- end
52
- cv.update_attributes(
53
- nr_instances: nr_instances,
54
- nr_unpredicted: nr_unpredicted,
55
- predictions: predictions#.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
56
- )
57
- $logger.debug "Nr unpredicted: #{nr_unpredicted}"
58
- cv.statistics
59
- cv
60
- end
61
- end
62
-
63
- class ClassificationCrossValidation < CrossValidation
64
-
65
- field :accept_values, type: Array
66
- field :confusion_matrix, type: Array
67
- field :weighted_confusion_matrix, type: Array
68
- field :accuracy, type: Float
69
- field :weighted_accuracy, type: Float
70
- field :true_rate, type: Hash
71
- field :predictivity, type: Hash
72
- field :confidence_plot_id, type: BSON::ObjectId
73
- # TODO auc, f-measure (usability??)
74
-
75
- def statistics
76
- accept_values = Feature.find(model.prediction_feature_id).accept_values
77
- confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
78
- weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
79
- true_rate = {}
80
- predictivity = {}
81
- predictions.each do |pred|
82
- compound_id,activities,prediction,confidence = pred
83
- if activities and prediction #and confidence.numeric?
84
- if activities.uniq.size == 1
85
- activity = activities.uniq.first
86
- if prediction == activity
87
- if prediction == accept_values[0]
88
- confusion_matrix[0][0] += 1
89
- #weighted_confusion_matrix[0][0] += confidence
90
- elsif prediction == accept_values[1]
91
- confusion_matrix[1][1] += 1
92
- #weighted_confusion_matrix[1][1] += confidence
93
- end
94
- elsif prediction != activity
95
- if prediction == accept_values[0]
96
- confusion_matrix[0][1] += 1
97
- #weighted_confusion_matrix[0][1] += confidence
98
- elsif prediction == accept_values[1]
99
- confusion_matrix[1][0] += 1
100
- #weighted_confusion_matrix[1][0] += confidence
101
- end
102
- end
103
- end
104
- else
105
- nr_unpredicted += 1 if prediction.nil?
3
+ module Validation
4
+ class CrossValidation < Validation
5
+ field :validation_ids, type: Array, default: []
6
+ field :folds, type: Integer, default: 10
7
+
8
+ def self.create model, n=10
9
+ $logger.debug model.algorithms
10
+ klass = ClassificationCrossValidation if model.is_a? Model::LazarClassification
11
+ klass = RegressionCrossValidation if model.is_a? Model::LazarRegression
12
+ bad_request_error "Unknown model class #{model.class}." unless klass
13
+
14
+ cv = klass.new(
15
+ name: model.name,
16
+ model_id: model.id,
17
+ folds: n
18
+ )
19
+ cv.save # set created_at
20
+
21
+ nr_instances = 0
22
+ nr_unpredicted = 0
23
+ training_dataset = model.training_dataset
24
+ training_dataset.folds(n).each_with_index do |fold,fold_nr|
25
+ #fork do # parallel execution of validations can lead to Rserve and memory problems
26
+ $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started"
27
+ t = Time.now
28
+ validation = TrainTest.create(model, fold[0], fold[1])
29
+ cv.validation_ids << validation.id
30
+ cv.nr_instances += validation.nr_instances
31
+ cv.nr_unpredicted += validation.nr_unpredicted
32
+ #cv.predictions.merge! validation.predictions
33
+ $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds"
34
+ #end
106
35
  end
36
+ #Process.waitall
37
+ cv.save
38
+ $logger.debug "Nr unpredicted: #{nr_unpredicted}"
39
+ cv.statistics
40
+ cv.update_attributes(finished_at: Time.now)
41
+ cv
107
42
  end
108
- true_rate = {}
109
- predictivity = {}
110
- accept_values.each_with_index do |v,i|
111
- true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
112
- predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
113
- end
114
- confidence_sum = 0
115
- #weighted_confusion_matrix.each do |r|
116
- #r.each do |c|
117
- #confidence_sum += c
118
- #end
119
- #end
120
- update_attributes(
121
- accept_values: accept_values,
122
- confusion_matrix: confusion_matrix,
123
- #weighted_confusion_matrix: weighted_confusion_matrix,
124
- accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
125
- #weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
126
- true_rate: true_rate,
127
- predictivity: predictivity,
128
- finished_at: Time.now
129
- )
130
- $logger.debug "Accuracy #{accuracy}"
131
- end
132
-
133
- def confidence_plot
134
- unless confidence_plot_id
135
- tmpfile = "/tmp/#{id.to_s}_confidence.png"
136
- accuracies = []
137
- confidences = []
138
- correct_predictions = 0
139
- incorrect_predictions = 0
140
- predictions.each do |p|
141
- if p[1] and p[2]
142
- p[1] == p[2] ? correct_predictions += 1 : incorrect_predictions += 1
143
- accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
144
- confidences << p[3]
145
43
 
146
- end
147
- end
148
- R.assign "accuracy", accuracies
149
- R.assign "confidence", confidences
150
- R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()"
151
- R.eval "ggsave(file='#{tmpfile}', plot=image)"
152
- file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.png")
153
- plot_id = $gridfs.insert_one(file)
154
- update(:confidence_plot_id => plot_id)
44
+ def time
45
+ finished_at - created_at
155
46
  end
156
- $gridfs.find_one(_id: confidence_plot_id).data
157
- end
158
-
159
- #Average area under roc 0.646
160
- #Area under roc 0.646
161
- #F measure carcinogen: 0.769, noncarcinogen: 0.348
162
- end
163
47
 
164
- class RegressionCrossValidation < CrossValidation
165
-
166
- field :rmse, type: Float
167
- field :mae, type: Float
168
- field :r_squared, type: Float
169
- field :correlation_plot_id, type: BSON::ObjectId
170
-
171
- def statistics
172
- rmse = 0
173
- mae = 0
174
- x = []
175
- y = []
176
- predictions.each do |pred|
177
- compound_id,activity,prediction,confidence = pred
178
- if activity and prediction
179
- unless activity == [nil]
180
- x << -Math.log10(activity.median)
181
- y << -Math.log10(prediction)
182
- error = Math.log10(prediction)-Math.log10(activity.median)
183
- rmse += error**2
184
- #weighted_rmse += confidence*error**2
185
- mae += error.abs
186
- #weighted_mae += confidence*error.abs
187
- #confidence_sum += confidence
188
- end
189
- else
190
- warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
191
- $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
192
- end
48
+ def validations
49
+ validation_ids.collect{|vid| TrainTest.find vid}
193
50
  end
194
- R.assign "measurement", x
195
- R.assign "prediction", y
196
- R.eval "r <- cor(measurement,prediction,use='complete')"
197
- r = R.eval("r").to_ruby
198
51
 
199
- mae = mae/predictions.size
200
- #weighted_mae = weighted_mae/confidence_sum
201
- rmse = Math.sqrt(rmse/predictions.size)
202
- #weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
203
- update_attributes(
204
- mae: mae,
205
- rmse: rmse,
206
- #weighted_mae: weighted_mae,
207
- #weighted_rmse: weighted_rmse,
208
- r_squared: r**2,
209
- finished_at: Time.now
210
- )
211
- $logger.debug "R^2 #{r**2}"
212
- $logger.debug "RMSE #{rmse}"
213
- $logger.debug "MAE #{mae}"
52
+ def predictions
53
+ predictions = {}
54
+ validations.each{|v| predictions.merge!(v.predictions)}
55
+ predictions
56
+ end
214
57
  end
215
58
 
216
- def misclassifications n=nil
217
- #n = predictions.size unless n
218
- n ||= 10
219
- model = Model::Lazar.find(self.model_id)
220
- training_dataset = Dataset.find(model.training_dataset_id)
221
- prediction_feature = training_dataset.features.first
222
- predictions.collect do |p|
223
- unless p.include? nil
224
- compound = Compound.find(p[0])
225
- neighbors = compound.send(model.neighbor_algorithm,model.neighbor_algorithm_parameters)
226
- neighbors.collect! do |n|
227
- neighbor = Compound.find(n[0])
228
- values = training_dataset.values(neighbor,prediction_feature)
229
- { :smiles => neighbor.smiles, :similarity => n[1], :measurements => values}
230
- end
231
- {
232
- :smiles => compound.smiles,
233
- #:fingerprint => compound.fp4.collect{|id| Smarts.find(id).name},
234
- :measured => p[1],
235
- :predicted => p[2],
236
- #:relative_error => (Math.log10(p[1])-Math.log10(p[2])).abs/Math.log10(p[1]).to_f.abs,
237
- :log_error => (Math.log10(p[1])-Math.log10(p[2])).abs,
238
- :relative_error => (p[1]-p[2]).abs/p[1],
239
- :confidence => p[3],
240
- :neighbors => neighbors
241
- }
242
- end
243
- end.compact.sort{|a,b| b[:relative_error] <=> a[:relative_error]}[0..n-1]
59
+ class ClassificationCrossValidation < CrossValidation
60
+ include ClassificationStatistics
61
+ field :accept_values, type: Array
62
+ field :confusion_matrix, type: Array
63
+ field :weighted_confusion_matrix, type: Array
64
+ field :accuracy, type: Float
65
+ field :weighted_accuracy, type: Float
66
+ field :true_rate, type: Hash
67
+ field :predictivity, type: Hash
68
+ field :probability_plot_id, type: BSON::ObjectId
244
69
  end
245
70
 
246
- def confidence_plot
247
- tmpfile = "/tmp/#{id.to_s}_confidence.png"
248
- sorted_predictions = predictions.collect{|p| [(Math.log10(p[1])-Math.log10(p[2])).abs,p[3]] if p[1] and p[2]}.compact
249
- R.assign "error", sorted_predictions.collect{|p| p[0]}
250
- R.assign "confidence", sorted_predictions.collect{|p| p[1]}
251
- # TODO fix axis names
252
- R.eval "image = qplot(confidence,error)"
253
- R.eval "image = image + stat_smooth(method='lm', se=FALSE)"
254
- R.eval "ggsave(file='#{tmpfile}', plot=image)"
255
- file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.png")
256
- plot_id = $gridfs.insert_one(file)
257
- update(:confidence_plot_id => plot_id)
258
- $gridfs.find_one(_id: confidence_plot_id).data
71
+ class RegressionCrossValidation < CrossValidation
72
+ include RegressionStatistics
73
+ field :rmse, type: Float, default:0
74
+ field :mae, type: Float, default:0
75
+ field :r_squared, type: Float
76
+ field :within_prediction_interval, type: Integer, default:0
77
+ field :out_of_prediction_interval, type: Integer, default:0
78
+ field :correlation_plot_id, type: BSON::ObjectId
259
79
  end
260
80
 
261
- def correlation_plot
262
- unless correlation_plot_id
263
- tmpfile = "/tmp/#{id.to_s}_correlation.png"
264
- x = predictions.collect{|p| p[1]}
265
- y = predictions.collect{|p| p[2]}
266
- attributes = Model::Lazar.find(self.model_id).attributes
267
- attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key}
268
- attributes = attributes.values.collect{|v| v.is_a?(String) ? v.sub(/OpenTox::/,'') : v}.join("\n")
269
- R.assign "measurement", x
270
- R.assign "prediction", y
271
- R.eval "all = c(-log(measurement),-log(prediction))"
272
- R.eval "range = c(min(all), max(all))"
273
- R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)"
274
- R.eval "image = image + geom_abline(intercept=0, slope=1)"
275
- R.eval "ggsave(file='#{tmpfile}', plot=image)"
276
- file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.png")
277
- plot_id = $gridfs.insert_one(file)
278
- update(:correlation_plot_id => plot_id)
81
+ class RepeatedCrossValidation < Validation
82
+ field :crossvalidation_ids, type: Array, default: []
83
+ field :correlation_plot_id, type: BSON::ObjectId
84
+
85
+ def self.create model, folds=10, repeats=3
86
+ repeated_cross_validation = self.new
87
+ repeats.times do |n|
88
+ $logger.debug "Crossvalidation #{n+1} for #{model.name}"
89
+ repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id
90
+ end
91
+ repeated_cross_validation.save
92
+ repeated_cross_validation
279
93
  end
280
- $gridfs.find_one(_id: correlation_plot_id).data
281
- end
282
- end
283
94
 
284
- class RepeatedCrossValidation
285
- field :crossvalidation_ids, type: Array, default: []
286
- def self.create model, folds=10, repeats=3
287
- repeated_cross_validation = self.new
288
- repeats.times do |n|
289
- $logger.debug "Crossvalidation #{n+1} for #{model.name}"
290
- repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id
95
+ def crossvalidations
96
+ crossvalidation_ids.collect{|id| CrossValidation.find(id)}
291
97
  end
292
- repeated_cross_validation.save
293
- repeated_cross_validation
294
- end
295
- def crossvalidations
296
- crossvalidation_ids.collect{|id| CrossValidation.find(id)}
98
+
297
99
  end
298
100
  end
299
101
 
300
-
301
102
  end