lazar 0.9.3 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (88) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -4
  3. data/README.md +5 -15
  4. data/VERSION +1 -1
  5. data/ext/lazar/extconf.rb +1 -1
  6. data/ext/lazar/rinstall.R +9 -7
  7. data/java/CdkDescriptorInfo.class +0 -0
  8. data/java/CdkDescriptorInfo.java +3 -2
  9. data/java/CdkDescriptors.class +0 -0
  10. data/java/CdkDescriptors.java +28 -28
  11. data/java/Rakefile +3 -3
  12. data/java/{cdk-1.4.19.jar → cdk-2.0-SNAPSHOT.jar} +0 -0
  13. data/lazar.gemspec +6 -7
  14. data/lib/algorithm.rb +2 -11
  15. data/lib/caret.rb +96 -0
  16. data/lib/classification.rb +14 -22
  17. data/lib/compound.rb +21 -87
  18. data/lib/crossvalidation.rb +80 -279
  19. data/lib/dataset.rb +105 -174
  20. data/lib/feature.rb +11 -18
  21. data/lib/feature_selection.rb +42 -0
  22. data/lib/import.rb +122 -0
  23. data/lib/lazar.rb +14 -4
  24. data/lib/leave-one-out-validation.rb +46 -192
  25. data/lib/model.rb +319 -128
  26. data/lib/nanoparticle.rb +98 -0
  27. data/lib/opentox.rb +7 -4
  28. data/lib/overwrite.rb +24 -3
  29. data/lib/physchem.rb +11 -10
  30. data/lib/regression.rb +7 -137
  31. data/lib/rest-client-wrapper.rb +0 -6
  32. data/lib/similarity.rb +65 -0
  33. data/lib/substance.rb +8 -0
  34. data/lib/train-test-validation.rb +69 -0
  35. data/lib/validation-statistics.rb +223 -0
  36. data/lib/validation.rb +17 -100
  37. data/scripts/mg2mmol.rb +17 -0
  38. data/scripts/mirror-enm2test.rb +4 -0
  39. data/scripts/mmol2-log10.rb +32 -0
  40. data/test/compound.rb +4 -94
  41. data/test/data/EPAFHM.medi_log10.csv +92 -0
  42. data/test/data/EPAFHM.mini_log10.csv +16 -0
  43. data/test/data/EPAFHM_log10.csv +581 -0
  44. data/test/data/loael_log10.csv +568 -0
  45. data/test/dataset.rb +195 -133
  46. data/test/descriptor.rb +27 -18
  47. data/test/error.rb +2 -2
  48. data/test/experiment.rb +4 -4
  49. data/test/feature.rb +2 -3
  50. data/test/gridfs.rb +10 -0
  51. data/test/model-classification.rb +106 -0
  52. data/test/model-nanoparticle.rb +128 -0
  53. data/test/model-regression.rb +171 -0
  54. data/test/model-validation.rb +19 -0
  55. data/test/nanomaterial-model-validation.rb +55 -0
  56. data/test/setup.rb +8 -4
  57. data/test/validation-classification.rb +67 -0
  58. data/test/validation-nanoparticle.rb +133 -0
  59. data/test/validation-regression.rb +92 -0
  60. metadata +50 -121
  61. data/test/classification.rb +0 -41
  62. data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +0 -13553
  63. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +0 -436
  64. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +0 -568
  65. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +0 -87
  66. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +0 -978
  67. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +0 -1120
  68. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +0 -1113
  69. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +0 -850
  70. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +0 -829
  71. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +0 -1198
  72. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +0 -1505
  73. data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +0 -581
  74. data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +0 -1217
  75. data/test/data/LOAEL_log_mg_corrected_smiles.csv +0 -568
  76. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +0 -568
  77. data/test/data/boiling_points.ext.sdf +0 -11460
  78. data/test/data/cpdb_100.csv +0 -101
  79. data/test/data/hamster_carcinogenicity.ntriples +0 -618
  80. data/test/data/hamster_carcinogenicity.sdf +0 -2805
  81. data/test/data/hamster_carcinogenicity.xls +0 -0
  82. data/test/data/hamster_carcinogenicity.yaml +0 -352
  83. data/test/dataset-long.rb +0 -114
  84. data/test/lazar-long.rb +0 -92
  85. data/test/lazar-physchem-short.rb +0 -31
  86. data/test/prediction_models.rb +0 -20
  87. data/test/regression.rb +0 -43
  88. data/test/validation.rb +0 -108
@@ -1,205 +1,59 @@
1
1
  module OpenTox
2
2
 
3
- class LeaveOneOutValidation
4
-
5
- field :model_id, type: BSON::ObjectId
6
- field :dataset_id, type: BSON::ObjectId
7
- field :nr_instances, type: Integer
8
- field :nr_unpredicted, type: Integer
9
- field :predictions, type: Array
10
- field :finished_at, type: Time
11
-
12
- def self.create model
13
- model.training_dataset.features.first.nominal? ? klass = ClassificationLeaveOneOutValidation : klass = RegressionLeaveOneOutValidation
14
- loo = klass.new :model_id => model.id, :dataset_id => model.training_dataset_id
15
- compound_ids = model.training_dataset.compound_ids
16
- predictions = model.predict model.training_dataset.compounds
17
- predictions = predictions.each_with_index {|p,i| p[:compound_id] = compound_ids[i]}
18
- predictions.select!{|p| p[:database_activities] and !p[:database_activities].empty?}
19
- loo.nr_instances = predictions.size
20
- predictions.select!{|p| p[:value]} # remove unpredicted
21
- loo.predictions = predictions#.sort{|a,b| b[:confidence] <=> a[:confidence]}
22
- loo.nr_unpredicted = loo.nr_instances - loo.predictions.size
23
- loo.statistics
24
- loo.save
25
- loo
26
- end
27
-
28
- def model
29
- Model::Lazar.find model_id
30
- end
31
- end
32
-
33
- class ClassificationLeaveOneOutValidation < LeaveOneOutValidation
34
-
35
- field :accept_values, type: Array
36
- field :confusion_matrix, type: Array, default: []
37
- field :weighted_confusion_matrix, type: Array, default: []
38
- field :accuracy, type: Float
39
- field :weighted_accuracy, type: Float
40
- field :true_rate, type: Hash, default: {}
41
- field :predictivity, type: Hash, default: {}
42
- field :confidence_plot_id, type: BSON::ObjectId
43
-
44
- def statistics
45
- accept_values = Feature.find(model.prediction_feature_id).accept_values
46
- confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
47
- weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
48
- predictions.each do |pred|
49
- pred[:database_activities].each do |db_act|
50
- if pred[:value]
51
- if pred[:value] == db_act
52
- if pred[:value] == accept_values[0]
53
- confusion_matrix[0][0] += 1
54
- weighted_confusion_matrix[0][0] += pred[:confidence]
55
- elsif pred[:value] == accept_values[1]
56
- confusion_matrix[1][1] += 1
57
- weighted_confusion_matrix[1][1] += pred[:confidence]
58
- end
59
- else
60
- if pred[:value] == accept_values[0]
61
- confusion_matrix[0][1] += 1
62
- weighted_confusion_matrix[0][1] += pred[:confidence]
63
- elsif pred[:value] == accept_values[1]
64
- confusion_matrix[1][0] += 1
65
- weighted_confusion_matrix[1][0] += pred[:confidence]
66
- end
67
- end
3
+ module Validation
4
+
5
+ class LeaveOneOut < Validation
6
+
7
+ def self.create model
8
+ bad_request_error "Cannot create leave one out validation for models with supervised feature selection. Please use crossvalidation instead." if model.algorithms[:feature_selection]
9
+ $logger.debug "#{model.name}: LOO validation started"
10
+ t = Time.now
11
+ model.training_dataset.features.first.nominal? ? klass = ClassificationLeaveOneOut : klass = RegressionLeaveOneOut
12
+ loo = klass.new :model_id => model.id
13
+ predictions = model.predict model.training_dataset.substances
14
+ predictions.each{|cid,p| p.delete(:neighbors)}
15
+ nr_unpredicted = 0
16
+ predictions.each do |cid,prediction|
17
+ if prediction[:value]
18
+ prediction[:measurements] = model.training_dataset.values(cid, prediction[:prediction_feature_id])
19
+ else
20
+ nr_unpredicted += 1
68
21
  end
22
+ predictions.delete(cid) unless prediction[:value] and prediction[:measurements]
69
23
  end
24
+ predictions.select!{|cid,p| p[:value] and p[:measurements]}
25
+ loo.nr_instances = predictions.size
26
+ loo.nr_unpredicted = nr_unpredicted
27
+ loo.predictions = predictions
28
+ loo.statistics
29
+ $logger.debug "#{model.name}, LOO validation: #{Time.now-t} seconds"
30
+ loo
70
31
  end
71
- accept_values.each_with_index do |v,i|
72
- true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
73
- predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
74
- end
75
- confidence_sum = 0
76
- weighted_confusion_matrix.each do |r|
77
- r.each do |c|
78
- confidence_sum += c
79
- end
80
- end
81
- update_attributes(
82
- accept_values: accept_values,
83
- confusion_matrix: confusion_matrix,
84
- weighted_confusion_matrix: weighted_confusion_matrix,
85
- accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
86
- weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
87
- true_rate: true_rate,
88
- predictivity: predictivity,
89
- finished_at: Time.now
90
- )
91
- $logger.debug "Accuracy #{accuracy}"
92
- end
93
-
94
- def confidence_plot
95
- unless confidence_plot_id
96
- tmpfile = "/tmp/#{id.to_s}_confidence.svg"
97
- accuracies = []
98
- confidences = []
99
- correct_predictions = 0
100
- incorrect_predictions = 0
101
- predictions.each do |p|
102
- p[:database_activities].each do |db_act|
103
- if p[:value]
104
- p[:value] == db_act ? correct_predictions += 1 : incorrect_predictions += 1
105
- accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
106
- confidences << p[:confidence]
107
32
 
108
- end
109
- end
110
- end
111
- R.assign "accuracy", accuracies
112
- R.assign "confidence", confidences
113
- R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()"
114
- R.eval "ggsave(file='#{tmpfile}', plot=image)"
115
- file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg")
116
- plot_id = $gridfs.insert_one(file)
117
- update(:confidence_plot_id => plot_id)
118
- end
119
- $gridfs.find_one(_id: confidence_plot_id).data
120
33
  end
121
- end
122
-
123
-
124
- class RegressionLeaveOneOutValidation < LeaveOneOutValidation
125
-
126
34
 
127
- field :rmse, type: Float, default: 0.0
128
- field :mae, type: Float, default: 0
129
- #field :weighted_rmse, type: Float, default: 0
130
- #field :weighted_mae, type: Float, default: 0
131
- field :r_squared, type: Float
132
- field :correlation_plot_id, type: BSON::ObjectId
133
- field :confidence_plot_id, type: BSON::ObjectId
134
-
135
- def statistics
136
- confidence_sum = 0
137
- predicted_values = []
138
- measured_values = []
139
- predictions.each do |pred|
140
- pred[:database_activities].each do |activity|
141
- if pred[:value]
142
- predicted_values << pred[:value]
143
- measured_values << activity
144
- error = Math.log10(pred[:value])-Math.log10(activity)
145
- self.rmse += error**2
146
- #self.weighted_rmse += pred[:confidence]*error**2
147
- self.mae += error.abs
148
- #self.weighted_mae += pred[:confidence]*error.abs
149
- #confidence_sum += pred[:confidence]
150
- end
151
- end
152
- if pred[:database_activities].empty?
153
- warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
154
- $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
155
- end
156
- end
157
- R.assign "measurement", measured_values
158
- R.assign "prediction", predicted_values
159
- R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
160
- r = R.eval("r").to_ruby
161
-
162
- self.mae = self.mae/predictions.size
163
- #self.weighted_mae = self.weighted_mae/confidence_sum
164
- self.rmse = Math.sqrt(self.rmse/predictions.size)
165
- #self.weighted_rmse = Math.sqrt(self.weighted_rmse/confidence_sum)
166
- self.r_squared = r**2
167
- self.finished_at = Time.now
168
- save
169
- $logger.debug "R^2 #{r**2}"
170
- $logger.debug "RMSE #{rmse}"
171
- $logger.debug "MAE #{mae}"
35
+ class ClassificationLeaveOneOut < LeaveOneOut
36
+ include ClassificationStatistics
37
+ field :accept_values, type: Array
38
+ field :confusion_matrix, type: Array, default: []
39
+ field :weighted_confusion_matrix, type: Array, default: []
40
+ field :accuracy, type: Float
41
+ field :weighted_accuracy, type: Float
42
+ field :true_rate, type: Hash, default: {}
43
+ field :predictivity, type: Hash, default: {}
44
+ field :confidence_plot_id, type: BSON::ObjectId
172
45
  end
173
-
174
- def correlation_plot
175
- unless correlation_plot_id
176
- tmpfile = "/tmp/#{id.to_s}_correlation.svg"
177
- predicted_values = []
178
- measured_values = []
179
- predictions.each do |pred|
180
- pred[:database_activities].each do |activity|
181
- if pred[:value]
182
- predicted_values << pred[:value]
183
- measured_values << activity
184
- end
185
- end
186
- end
187
- attributes = Model::Lazar.find(self.model_id).attributes
188
- attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key}
189
- attributes = attributes.values.collect{|v| v.is_a?(String) ? v.sub(/OpenTox::/,'') : v}.join("\n")
190
- R.assign "measurement", measured_values
191
- R.assign "prediction", predicted_values
192
- R.eval "all = c(-log(measurement),-log(prediction))"
193
- R.eval "range = c(min(all), max(all))"
194
- R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)"
195
- R.eval "image = image + geom_abline(intercept=0, slope=1)"
196
- R.eval "ggsave(file='#{tmpfile}', plot=image)"
197
- file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.svg")
198
- plot_id = $gridfs.insert_one(file)
199
- update(:correlation_plot_id => plot_id)
200
- end
201
- $gridfs.find_one(_id: correlation_plot_id).data
46
+
47
+ class RegressionLeaveOneOut < LeaveOneOut
48
+ include RegressionStatistics
49
+ field :rmse, type: Float, default: 0
50
+ field :mae, type: Float, default: 0
51
+ field :r_squared, type: Float
52
+ field :within_prediction_interval, type: Integer, default:0
53
+ field :out_of_prediction_interval, type: Integer, default:0
54
+ field :correlation_plot_id, type: BSON::ObjectId
202
55
  end
56
+
203
57
  end
204
58
 
205
59
  end