lazar 0.9.3 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -4
  3. data/README.md +5 -15
  4. data/VERSION +1 -1
  5. data/ext/lazar/extconf.rb +1 -1
  6. data/ext/lazar/rinstall.R +9 -7
  7. data/java/CdkDescriptorInfo.class +0 -0
  8. data/java/CdkDescriptorInfo.java +3 -2
  9. data/java/CdkDescriptors.class +0 -0
  10. data/java/CdkDescriptors.java +28 -28
  11. data/java/Rakefile +3 -3
  12. data/java/{cdk-1.4.19.jar → cdk-2.0-SNAPSHOT.jar} +0 -0
  13. data/lazar.gemspec +6 -7
  14. data/lib/algorithm.rb +2 -11
  15. data/lib/caret.rb +96 -0
  16. data/lib/classification.rb +14 -22
  17. data/lib/compound.rb +21 -87
  18. data/lib/crossvalidation.rb +80 -279
  19. data/lib/dataset.rb +105 -174
  20. data/lib/feature.rb +11 -18
  21. data/lib/feature_selection.rb +42 -0
  22. data/lib/import.rb +122 -0
  23. data/lib/lazar.rb +14 -4
  24. data/lib/leave-one-out-validation.rb +46 -192
  25. data/lib/model.rb +319 -128
  26. data/lib/nanoparticle.rb +98 -0
  27. data/lib/opentox.rb +7 -4
  28. data/lib/overwrite.rb +24 -3
  29. data/lib/physchem.rb +11 -10
  30. data/lib/regression.rb +7 -137
  31. data/lib/rest-client-wrapper.rb +0 -6
  32. data/lib/similarity.rb +65 -0
  33. data/lib/substance.rb +8 -0
  34. data/lib/train-test-validation.rb +69 -0
  35. data/lib/validation-statistics.rb +223 -0
  36. data/lib/validation.rb +17 -100
  37. data/scripts/mg2mmol.rb +17 -0
  38. data/scripts/mirror-enm2test.rb +4 -0
  39. data/scripts/mmol2-log10.rb +32 -0
  40. data/test/compound.rb +4 -94
  41. data/test/data/EPAFHM.medi_log10.csv +92 -0
  42. data/test/data/EPAFHM.mini_log10.csv +16 -0
  43. data/test/data/EPAFHM_log10.csv +581 -0
  44. data/test/data/loael_log10.csv +568 -0
  45. data/test/dataset.rb +195 -133
  46. data/test/descriptor.rb +27 -18
  47. data/test/error.rb +2 -2
  48. data/test/experiment.rb +4 -4
  49. data/test/feature.rb +2 -3
  50. data/test/gridfs.rb +10 -0
  51. data/test/model-classification.rb +106 -0
  52. data/test/model-nanoparticle.rb +128 -0
  53. data/test/model-regression.rb +171 -0
  54. data/test/model-validation.rb +19 -0
  55. data/test/nanomaterial-model-validation.rb +55 -0
  56. data/test/setup.rb +8 -4
  57. data/test/validation-classification.rb +67 -0
  58. data/test/validation-nanoparticle.rb +133 -0
  59. data/test/validation-regression.rb +92 -0
  60. metadata +50 -121
  61. data/test/classification.rb +0 -41
  62. data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +0 -13553
  63. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +0 -436
  64. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +0 -568
  65. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +0 -87
  66. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +0 -978
  67. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +0 -1120
  68. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +0 -1113
  69. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +0 -850
  70. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +0 -829
  71. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +0 -1198
  72. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +0 -1505
  73. data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +0 -581
  74. data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +0 -1217
  75. data/test/data/LOAEL_log_mg_corrected_smiles.csv +0 -568
  76. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +0 -568
  77. data/test/data/boiling_points.ext.sdf +0 -11460
  78. data/test/data/cpdb_100.csv +0 -101
  79. data/test/data/hamster_carcinogenicity.ntriples +0 -618
  80. data/test/data/hamster_carcinogenicity.sdf +0 -2805
  81. data/test/data/hamster_carcinogenicity.xls +0 -0
  82. data/test/data/hamster_carcinogenicity.yaml +0 -352
  83. data/test/dataset-long.rb +0 -114
  84. data/test/lazar-long.rb +0 -92
  85. data/test/lazar-physchem-short.rb +0 -31
  86. data/test/prediction_models.rb +0 -20
  87. data/test/regression.rb +0 -43
  88. data/test/validation.rb +0 -108
@@ -1,301 +1,102 @@
1
1
  module OpenTox
2
2
 
3
- class CrossValidation
4
- field :validation_ids, type: Array, default: []
5
- field :model_id, type: BSON::ObjectId
6
- field :folds, type: Integer
7
- field :nr_instances, type: Integer
8
- field :nr_unpredicted, type: Integer
9
- field :predictions, type: Array, default: []
10
- field :finished_at, type: Time
11
-
12
- def time
13
- finished_at - created_at
14
- end
15
-
16
- def validations
17
- validation_ids.collect{|vid| Validation.find vid}
18
- end
19
-
20
- def model
21
- Model::Lazar.find model_id
22
- end
23
-
24
- def self.create model, n=10
25
- model.training_dataset.features.first.nominal? ? klass = ClassificationCrossValidation : klass = RegressionCrossValidation
26
- bad_request_error "#{dataset.features.first} is neither nominal nor numeric." unless klass
27
- cv = klass.new(
28
- name: model.name,
29
- model_id: model.id,
30
- folds: n
31
- )
32
- cv.save # set created_at
33
- nr_instances = 0
34
- nr_unpredicted = 0
35
- predictions = []
36
- training_dataset = Dataset.find model.training_dataset_id
37
- training_dataset.folds(n).each_with_index do |fold,fold_nr|
38
- #fork do # parallel execution of validations
39
- $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started"
40
- t = Time.now
41
- validation = Validation.create(model, fold[0], fold[1],cv)
42
- $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds"
43
- #end
44
- end
45
- #Process.waitall
46
- cv.validation_ids = Validation.where(:crossvalidation_id => cv.id).distinct(:_id)
47
- cv.validations.each do |validation|
48
- nr_instances += validation.nr_instances
49
- nr_unpredicted += validation.nr_unpredicted
50
- predictions += validation.predictions
51
- end
52
- cv.update_attributes(
53
- nr_instances: nr_instances,
54
- nr_unpredicted: nr_unpredicted,
55
- predictions: predictions#.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
56
- )
57
- $logger.debug "Nr unpredicted: #{nr_unpredicted}"
58
- cv.statistics
59
- cv
60
- end
61
- end
62
-
63
- class ClassificationCrossValidation < CrossValidation
64
-
65
- field :accept_values, type: Array
66
- field :confusion_matrix, type: Array
67
- field :weighted_confusion_matrix, type: Array
68
- field :accuracy, type: Float
69
- field :weighted_accuracy, type: Float
70
- field :true_rate, type: Hash
71
- field :predictivity, type: Hash
72
- field :confidence_plot_id, type: BSON::ObjectId
73
- # TODO auc, f-measure (usability??)
74
-
75
- def statistics
76
- accept_values = Feature.find(model.prediction_feature_id).accept_values
77
- confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
78
- weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
79
- true_rate = {}
80
- predictivity = {}
81
- predictions.each do |pred|
82
- compound_id,activities,prediction,confidence = pred
83
- if activities and prediction #and confidence.numeric?
84
- if activities.uniq.size == 1
85
- activity = activities.uniq.first
86
- if prediction == activity
87
- if prediction == accept_values[0]
88
- confusion_matrix[0][0] += 1
89
- #weighted_confusion_matrix[0][0] += confidence
90
- elsif prediction == accept_values[1]
91
- confusion_matrix[1][1] += 1
92
- #weighted_confusion_matrix[1][1] += confidence
93
- end
94
- elsif prediction != activity
95
- if prediction == accept_values[0]
96
- confusion_matrix[0][1] += 1
97
- #weighted_confusion_matrix[0][1] += confidence
98
- elsif prediction == accept_values[1]
99
- confusion_matrix[1][0] += 1
100
- #weighted_confusion_matrix[1][0] += confidence
101
- end
102
- end
103
- end
104
- else
105
- nr_unpredicted += 1 if prediction.nil?
3
+ module Validation
4
+ class CrossValidation < Validation
5
+ field :validation_ids, type: Array, default: []
6
+ field :folds, type: Integer, default: 10
7
+
8
+ def self.create model, n=10
9
+ $logger.debug model.algorithms
10
+ klass = ClassificationCrossValidation if model.is_a? Model::LazarClassification
11
+ klass = RegressionCrossValidation if model.is_a? Model::LazarRegression
12
+ bad_request_error "Unknown model class #{model.class}." unless klass
13
+
14
+ cv = klass.new(
15
+ name: model.name,
16
+ model_id: model.id,
17
+ folds: n
18
+ )
19
+ cv.save # set created_at
20
+
21
+ nr_instances = 0
22
+ nr_unpredicted = 0
23
+ training_dataset = model.training_dataset
24
+ training_dataset.folds(n).each_with_index do |fold,fold_nr|
25
+ #fork do # parallel execution of validations can lead to Rserve and memory problems
26
+ $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started"
27
+ t = Time.now
28
+ validation = TrainTest.create(model, fold[0], fold[1])
29
+ cv.validation_ids << validation.id
30
+ cv.nr_instances += validation.nr_instances
31
+ cv.nr_unpredicted += validation.nr_unpredicted
32
+ #cv.predictions.merge! validation.predictions
33
+ $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds"
34
+ #end
106
35
  end
36
+ #Process.waitall
37
+ cv.save
38
+ $logger.debug "Nr unpredicted: #{nr_unpredicted}"
39
+ cv.statistics
40
+ cv.update_attributes(finished_at: Time.now)
41
+ cv
107
42
  end
108
- true_rate = {}
109
- predictivity = {}
110
- accept_values.each_with_index do |v,i|
111
- true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
112
- predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
113
- end
114
- confidence_sum = 0
115
- #weighted_confusion_matrix.each do |r|
116
- #r.each do |c|
117
- #confidence_sum += c
118
- #end
119
- #end
120
- update_attributes(
121
- accept_values: accept_values,
122
- confusion_matrix: confusion_matrix,
123
- #weighted_confusion_matrix: weighted_confusion_matrix,
124
- accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
125
- #weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
126
- true_rate: true_rate,
127
- predictivity: predictivity,
128
- finished_at: Time.now
129
- )
130
- $logger.debug "Accuracy #{accuracy}"
131
- end
132
-
133
- def confidence_plot
134
- unless confidence_plot_id
135
- tmpfile = "/tmp/#{id.to_s}_confidence.png"
136
- accuracies = []
137
- confidences = []
138
- correct_predictions = 0
139
- incorrect_predictions = 0
140
- predictions.each do |p|
141
- if p[1] and p[2]
142
- p[1] == p[2] ? correct_predictions += 1 : incorrect_predictions += 1
143
- accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
144
- confidences << p[3]
145
43
 
146
- end
147
- end
148
- R.assign "accuracy", accuracies
149
- R.assign "confidence", confidences
150
- R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()"
151
- R.eval "ggsave(file='#{tmpfile}', plot=image)"
152
- file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.png")
153
- plot_id = $gridfs.insert_one(file)
154
- update(:confidence_plot_id => plot_id)
44
+ def time
45
+ finished_at - created_at
155
46
  end
156
- $gridfs.find_one(_id: confidence_plot_id).data
157
- end
158
-
159
- #Average area under roc 0.646
160
- #Area under roc 0.646
161
- #F measure carcinogen: 0.769, noncarcinogen: 0.348
162
- end
163
47
 
164
- class RegressionCrossValidation < CrossValidation
165
-
166
- field :rmse, type: Float
167
- field :mae, type: Float
168
- field :r_squared, type: Float
169
- field :correlation_plot_id, type: BSON::ObjectId
170
-
171
- def statistics
172
- rmse = 0
173
- mae = 0
174
- x = []
175
- y = []
176
- predictions.each do |pred|
177
- compound_id,activity,prediction,confidence = pred
178
- if activity and prediction
179
- unless activity == [nil]
180
- x << -Math.log10(activity.median)
181
- y << -Math.log10(prediction)
182
- error = Math.log10(prediction)-Math.log10(activity.median)
183
- rmse += error**2
184
- #weighted_rmse += confidence*error**2
185
- mae += error.abs
186
- #weighted_mae += confidence*error.abs
187
- #confidence_sum += confidence
188
- end
189
- else
190
- warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
191
- $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
192
- end
48
+ def validations
49
+ validation_ids.collect{|vid| TrainTest.find vid}
193
50
  end
194
- R.assign "measurement", x
195
- R.assign "prediction", y
196
- R.eval "r <- cor(measurement,prediction,use='complete')"
197
- r = R.eval("r").to_ruby
198
51
 
199
- mae = mae/predictions.size
200
- #weighted_mae = weighted_mae/confidence_sum
201
- rmse = Math.sqrt(rmse/predictions.size)
202
- #weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
203
- update_attributes(
204
- mae: mae,
205
- rmse: rmse,
206
- #weighted_mae: weighted_mae,
207
- #weighted_rmse: weighted_rmse,
208
- r_squared: r**2,
209
- finished_at: Time.now
210
- )
211
- $logger.debug "R^2 #{r**2}"
212
- $logger.debug "RMSE #{rmse}"
213
- $logger.debug "MAE #{mae}"
52
+ def predictions
53
+ predictions = {}
54
+ validations.each{|v| predictions.merge!(v.predictions)}
55
+ predictions
56
+ end
214
57
  end
215
58
 
216
- def misclassifications n=nil
217
- #n = predictions.size unless n
218
- n ||= 10
219
- model = Model::Lazar.find(self.model_id)
220
- training_dataset = Dataset.find(model.training_dataset_id)
221
- prediction_feature = training_dataset.features.first
222
- predictions.collect do |p|
223
- unless p.include? nil
224
- compound = Compound.find(p[0])
225
- neighbors = compound.send(model.neighbor_algorithm,model.neighbor_algorithm_parameters)
226
- neighbors.collect! do |n|
227
- neighbor = Compound.find(n[0])
228
- values = training_dataset.values(neighbor,prediction_feature)
229
- { :smiles => neighbor.smiles, :similarity => n[1], :measurements => values}
230
- end
231
- {
232
- :smiles => compound.smiles,
233
- #:fingerprint => compound.fp4.collect{|id| Smarts.find(id).name},
234
- :measured => p[1],
235
- :predicted => p[2],
236
- #:relative_error => (Math.log10(p[1])-Math.log10(p[2])).abs/Math.log10(p[1]).to_f.abs,
237
- :log_error => (Math.log10(p[1])-Math.log10(p[2])).abs,
238
- :relative_error => (p[1]-p[2]).abs/p[1],
239
- :confidence => p[3],
240
- :neighbors => neighbors
241
- }
242
- end
243
- end.compact.sort{|a,b| b[:relative_error] <=> a[:relative_error]}[0..n-1]
59
+ class ClassificationCrossValidation < CrossValidation
60
+ include ClassificationStatistics
61
+ field :accept_values, type: Array
62
+ field :confusion_matrix, type: Array
63
+ field :weighted_confusion_matrix, type: Array
64
+ field :accuracy, type: Float
65
+ field :weighted_accuracy, type: Float
66
+ field :true_rate, type: Hash
67
+ field :predictivity, type: Hash
68
+ field :probability_plot_id, type: BSON::ObjectId
244
69
  end
245
70
 
246
- def confidence_plot
247
- tmpfile = "/tmp/#{id.to_s}_confidence.png"
248
- sorted_predictions = predictions.collect{|p| [(Math.log10(p[1])-Math.log10(p[2])).abs,p[3]] if p[1] and p[2]}.compact
249
- R.assign "error", sorted_predictions.collect{|p| p[0]}
250
- R.assign "confidence", sorted_predictions.collect{|p| p[1]}
251
- # TODO fix axis names
252
- R.eval "image = qplot(confidence,error)"
253
- R.eval "image = image + stat_smooth(method='lm', se=FALSE)"
254
- R.eval "ggsave(file='#{tmpfile}', plot=image)"
255
- file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.png")
256
- plot_id = $gridfs.insert_one(file)
257
- update(:confidence_plot_id => plot_id)
258
- $gridfs.find_one(_id: confidence_plot_id).data
71
+ class RegressionCrossValidation < CrossValidation
72
+ include RegressionStatistics
73
+ field :rmse, type: Float, default:0
74
+ field :mae, type: Float, default:0
75
+ field :r_squared, type: Float
76
+ field :within_prediction_interval, type: Integer, default:0
77
+ field :out_of_prediction_interval, type: Integer, default:0
78
+ field :correlation_plot_id, type: BSON::ObjectId
259
79
  end
260
80
 
261
- def correlation_plot
262
- unless correlation_plot_id
263
- tmpfile = "/tmp/#{id.to_s}_correlation.png"
264
- x = predictions.collect{|p| p[1]}
265
- y = predictions.collect{|p| p[2]}
266
- attributes = Model::Lazar.find(self.model_id).attributes
267
- attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key}
268
- attributes = attributes.values.collect{|v| v.is_a?(String) ? v.sub(/OpenTox::/,'') : v}.join("\n")
269
- R.assign "measurement", x
270
- R.assign "prediction", y
271
- R.eval "all = c(-log(measurement),-log(prediction))"
272
- R.eval "range = c(min(all), max(all))"
273
- R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)"
274
- R.eval "image = image + geom_abline(intercept=0, slope=1)"
275
- R.eval "ggsave(file='#{tmpfile}', plot=image)"
276
- file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.png")
277
- plot_id = $gridfs.insert_one(file)
278
- update(:correlation_plot_id => plot_id)
81
+ class RepeatedCrossValidation < Validation
82
+ field :crossvalidation_ids, type: Array, default: []
83
+ field :correlation_plot_id, type: BSON::ObjectId
84
+
85
+ def self.create model, folds=10, repeats=3
86
+ repeated_cross_validation = self.new
87
+ repeats.times do |n|
88
+ $logger.debug "Crossvalidation #{n+1} for #{model.name}"
89
+ repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id
90
+ end
91
+ repeated_cross_validation.save
92
+ repeated_cross_validation
279
93
  end
280
- $gridfs.find_one(_id: correlation_plot_id).data
281
- end
282
- end
283
94
 
284
- class RepeatedCrossValidation
285
- field :crossvalidation_ids, type: Array, default: []
286
- def self.create model, folds=10, repeats=3
287
- repeated_cross_validation = self.new
288
- repeats.times do |n|
289
- $logger.debug "Crossvalidation #{n+1} for #{model.name}"
290
- repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id
95
+ def crossvalidations
96
+ crossvalidation_ids.collect{|id| CrossValidation.find(id)}
291
97
  end
292
- repeated_cross_validation.save
293
- repeated_cross_validation
294
- end
295
- def crossvalidations
296
- crossvalidation_ids.collect{|id| CrossValidation.find(id)}
98
+
297
99
  end
298
100
  end
299
101
 
300
-
301
102
  end