lazar 0.9.3 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -4
  3. data/README.md +5 -15
  4. data/VERSION +1 -1
  5. data/ext/lazar/extconf.rb +1 -1
  6. data/ext/lazar/rinstall.R +9 -7
  7. data/java/CdkDescriptorInfo.class +0 -0
  8. data/java/CdkDescriptorInfo.java +3 -2
  9. data/java/CdkDescriptors.class +0 -0
  10. data/java/CdkDescriptors.java +28 -28
  11. data/java/Rakefile +3 -3
  12. data/java/{cdk-1.4.19.jar → cdk-2.0-SNAPSHOT.jar} +0 -0
  13. data/lazar.gemspec +6 -7
  14. data/lib/algorithm.rb +2 -11
  15. data/lib/caret.rb +96 -0
  16. data/lib/classification.rb +14 -22
  17. data/lib/compound.rb +21 -87
  18. data/lib/crossvalidation.rb +80 -279
  19. data/lib/dataset.rb +105 -174
  20. data/lib/feature.rb +11 -18
  21. data/lib/feature_selection.rb +42 -0
  22. data/lib/import.rb +122 -0
  23. data/lib/lazar.rb +14 -4
  24. data/lib/leave-one-out-validation.rb +46 -192
  25. data/lib/model.rb +319 -128
  26. data/lib/nanoparticle.rb +98 -0
  27. data/lib/opentox.rb +7 -4
  28. data/lib/overwrite.rb +24 -3
  29. data/lib/physchem.rb +11 -10
  30. data/lib/regression.rb +7 -137
  31. data/lib/rest-client-wrapper.rb +0 -6
  32. data/lib/similarity.rb +65 -0
  33. data/lib/substance.rb +8 -0
  34. data/lib/train-test-validation.rb +69 -0
  35. data/lib/validation-statistics.rb +223 -0
  36. data/lib/validation.rb +17 -100
  37. data/scripts/mg2mmol.rb +17 -0
  38. data/scripts/mirror-enm2test.rb +4 -0
  39. data/scripts/mmol2-log10.rb +32 -0
  40. data/test/compound.rb +4 -94
  41. data/test/data/EPAFHM.medi_log10.csv +92 -0
  42. data/test/data/EPAFHM.mini_log10.csv +16 -0
  43. data/test/data/EPAFHM_log10.csv +581 -0
  44. data/test/data/loael_log10.csv +568 -0
  45. data/test/dataset.rb +195 -133
  46. data/test/descriptor.rb +27 -18
  47. data/test/error.rb +2 -2
  48. data/test/experiment.rb +4 -4
  49. data/test/feature.rb +2 -3
  50. data/test/gridfs.rb +10 -0
  51. data/test/model-classification.rb +106 -0
  52. data/test/model-nanoparticle.rb +128 -0
  53. data/test/model-regression.rb +171 -0
  54. data/test/model-validation.rb +19 -0
  55. data/test/nanomaterial-model-validation.rb +55 -0
  56. data/test/setup.rb +8 -4
  57. data/test/validation-classification.rb +67 -0
  58. data/test/validation-nanoparticle.rb +133 -0
  59. data/test/validation-regression.rb +92 -0
  60. metadata +50 -121
  61. data/test/classification.rb +0 -41
  62. data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +0 -13553
  63. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +0 -436
  64. data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +0 -568
  65. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +0 -87
  66. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +0 -978
  67. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +0 -1120
  68. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +0 -1113
  69. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +0 -850
  70. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +0 -829
  71. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +0 -1198
  72. data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +0 -1505
  73. data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +0 -581
  74. data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +0 -1217
  75. data/test/data/LOAEL_log_mg_corrected_smiles.csv +0 -568
  76. data/test/data/LOAEL_log_mmol_corrected_smiles.csv +0 -568
  77. data/test/data/boiling_points.ext.sdf +0 -11460
  78. data/test/data/cpdb_100.csv +0 -101
  79. data/test/data/hamster_carcinogenicity.ntriples +0 -618
  80. data/test/data/hamster_carcinogenicity.sdf +0 -2805
  81. data/test/data/hamster_carcinogenicity.xls +0 -0
  82. data/test/data/hamster_carcinogenicity.yaml +0 -352
  83. data/test/dataset-long.rb +0 -114
  84. data/test/lazar-long.rb +0 -92
  85. data/test/lazar-physchem-short.rb +0 -31
  86. data/test/prediction_models.rb +0 -20
  87. data/test/regression.rb +0 -43
  88. data/test/validation.rb +0 -108
@@ -1,205 +1,59 @@
1
1
  module OpenTox
2
2
 
3
- class LeaveOneOutValidation
4
-
5
- field :model_id, type: BSON::ObjectId
6
- field :dataset_id, type: BSON::ObjectId
7
- field :nr_instances, type: Integer
8
- field :nr_unpredicted, type: Integer
9
- field :predictions, type: Array
10
- field :finished_at, type: Time
11
-
12
- def self.create model
13
- model.training_dataset.features.first.nominal? ? klass = ClassificationLeaveOneOutValidation : klass = RegressionLeaveOneOutValidation
14
- loo = klass.new :model_id => model.id, :dataset_id => model.training_dataset_id
15
- compound_ids = model.training_dataset.compound_ids
16
- predictions = model.predict model.training_dataset.compounds
17
- predictions = predictions.each_with_index {|p,i| p[:compound_id] = compound_ids[i]}
18
- predictions.select!{|p| p[:database_activities] and !p[:database_activities].empty?}
19
- loo.nr_instances = predictions.size
20
- predictions.select!{|p| p[:value]} # remove unpredicted
21
- loo.predictions = predictions#.sort{|a,b| b[:confidence] <=> a[:confidence]}
22
- loo.nr_unpredicted = loo.nr_instances - loo.predictions.size
23
- loo.statistics
24
- loo.save
25
- loo
26
- end
27
-
28
- def model
29
- Model::Lazar.find model_id
30
- end
31
- end
32
-
33
- class ClassificationLeaveOneOutValidation < LeaveOneOutValidation
34
-
35
- field :accept_values, type: Array
36
- field :confusion_matrix, type: Array, default: []
37
- field :weighted_confusion_matrix, type: Array, default: []
38
- field :accuracy, type: Float
39
- field :weighted_accuracy, type: Float
40
- field :true_rate, type: Hash, default: {}
41
- field :predictivity, type: Hash, default: {}
42
- field :confidence_plot_id, type: BSON::ObjectId
43
-
44
- def statistics
45
- accept_values = Feature.find(model.prediction_feature_id).accept_values
46
- confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
47
- weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
48
- predictions.each do |pred|
49
- pred[:database_activities].each do |db_act|
50
- if pred[:value]
51
- if pred[:value] == db_act
52
- if pred[:value] == accept_values[0]
53
- confusion_matrix[0][0] += 1
54
- weighted_confusion_matrix[0][0] += pred[:confidence]
55
- elsif pred[:value] == accept_values[1]
56
- confusion_matrix[1][1] += 1
57
- weighted_confusion_matrix[1][1] += pred[:confidence]
58
- end
59
- else
60
- if pred[:value] == accept_values[0]
61
- confusion_matrix[0][1] += 1
62
- weighted_confusion_matrix[0][1] += pred[:confidence]
63
- elsif pred[:value] == accept_values[1]
64
- confusion_matrix[1][0] += 1
65
- weighted_confusion_matrix[1][0] += pred[:confidence]
66
- end
67
- end
3
+ module Validation
4
+
5
+ class LeaveOneOut < Validation
6
+
7
+ def self.create model
8
+ bad_request_error "Cannot create leave one out validation for models with supervised feature selection. Please use crossvalidation instead." if model.algorithms[:feature_selection]
9
+ $logger.debug "#{model.name}: LOO validation started"
10
+ t = Time.now
11
+ model.training_dataset.features.first.nominal? ? klass = ClassificationLeaveOneOut : klass = RegressionLeaveOneOut
12
+ loo = klass.new :model_id => model.id
13
+ predictions = model.predict model.training_dataset.substances
14
+ predictions.each{|cid,p| p.delete(:neighbors)}
15
+ nr_unpredicted = 0
16
+ predictions.each do |cid,prediction|
17
+ if prediction[:value]
18
+ prediction[:measurements] = model.training_dataset.values(cid, prediction[:prediction_feature_id])
19
+ else
20
+ nr_unpredicted += 1
68
21
  end
22
+ predictions.delete(cid) unless prediction[:value] and prediction[:measurements]
69
23
  end
24
+ predictions.select!{|cid,p| p[:value] and p[:measurements]}
25
+ loo.nr_instances = predictions.size
26
+ loo.nr_unpredicted = nr_unpredicted
27
+ loo.predictions = predictions
28
+ loo.statistics
29
+ $logger.debug "#{model.name}, LOO validation: #{Time.now-t} seconds"
30
+ loo
70
31
  end
71
- accept_values.each_with_index do |v,i|
72
- true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
73
- predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
74
- end
75
- confidence_sum = 0
76
- weighted_confusion_matrix.each do |r|
77
- r.each do |c|
78
- confidence_sum += c
79
- end
80
- end
81
- update_attributes(
82
- accept_values: accept_values,
83
- confusion_matrix: confusion_matrix,
84
- weighted_confusion_matrix: weighted_confusion_matrix,
85
- accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
86
- weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
87
- true_rate: true_rate,
88
- predictivity: predictivity,
89
- finished_at: Time.now
90
- )
91
- $logger.debug "Accuracy #{accuracy}"
92
- end
93
-
94
- def confidence_plot
95
- unless confidence_plot_id
96
- tmpfile = "/tmp/#{id.to_s}_confidence.svg"
97
- accuracies = []
98
- confidences = []
99
- correct_predictions = 0
100
- incorrect_predictions = 0
101
- predictions.each do |p|
102
- p[:database_activities].each do |db_act|
103
- if p[:value]
104
- p[:value] == db_act ? correct_predictions += 1 : incorrect_predictions += 1
105
- accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
106
- confidences << p[:confidence]
107
32
 
108
- end
109
- end
110
- end
111
- R.assign "accuracy", accuracies
112
- R.assign "confidence", confidences
113
- R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()"
114
- R.eval "ggsave(file='#{tmpfile}', plot=image)"
115
- file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg")
116
- plot_id = $gridfs.insert_one(file)
117
- update(:confidence_plot_id => plot_id)
118
- end
119
- $gridfs.find_one(_id: confidence_plot_id).data
120
33
  end
121
- end
122
-
123
-
124
- class RegressionLeaveOneOutValidation < LeaveOneOutValidation
125
-
126
34
 
127
- field :rmse, type: Float, default: 0.0
128
- field :mae, type: Float, default: 0
129
- #field :weighted_rmse, type: Float, default: 0
130
- #field :weighted_mae, type: Float, default: 0
131
- field :r_squared, type: Float
132
- field :correlation_plot_id, type: BSON::ObjectId
133
- field :confidence_plot_id, type: BSON::ObjectId
134
-
135
- def statistics
136
- confidence_sum = 0
137
- predicted_values = []
138
- measured_values = []
139
- predictions.each do |pred|
140
- pred[:database_activities].each do |activity|
141
- if pred[:value]
142
- predicted_values << pred[:value]
143
- measured_values << activity
144
- error = Math.log10(pred[:value])-Math.log10(activity)
145
- self.rmse += error**2
146
- #self.weighted_rmse += pred[:confidence]*error**2
147
- self.mae += error.abs
148
- #self.weighted_mae += pred[:confidence]*error.abs
149
- #confidence_sum += pred[:confidence]
150
- end
151
- end
152
- if pred[:database_activities].empty?
153
- warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
154
- $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
155
- end
156
- end
157
- R.assign "measurement", measured_values
158
- R.assign "prediction", predicted_values
159
- R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
160
- r = R.eval("r").to_ruby
161
-
162
- self.mae = self.mae/predictions.size
163
- #self.weighted_mae = self.weighted_mae/confidence_sum
164
- self.rmse = Math.sqrt(self.rmse/predictions.size)
165
- #self.weighted_rmse = Math.sqrt(self.weighted_rmse/confidence_sum)
166
- self.r_squared = r**2
167
- self.finished_at = Time.now
168
- save
169
- $logger.debug "R^2 #{r**2}"
170
- $logger.debug "RMSE #{rmse}"
171
- $logger.debug "MAE #{mae}"
35
+ class ClassificationLeaveOneOut < LeaveOneOut
36
+ include ClassificationStatistics
37
+ field :accept_values, type: Array
38
+ field :confusion_matrix, type: Array, default: []
39
+ field :weighted_confusion_matrix, type: Array, default: []
40
+ field :accuracy, type: Float
41
+ field :weighted_accuracy, type: Float
42
+ field :true_rate, type: Hash, default: {}
43
+ field :predictivity, type: Hash, default: {}
44
+ field :confidence_plot_id, type: BSON::ObjectId
172
45
  end
173
-
174
- def correlation_plot
175
- unless correlation_plot_id
176
- tmpfile = "/tmp/#{id.to_s}_correlation.svg"
177
- predicted_values = []
178
- measured_values = []
179
- predictions.each do |pred|
180
- pred[:database_activities].each do |activity|
181
- if pred[:value]
182
- predicted_values << pred[:value]
183
- measured_values << activity
184
- end
185
- end
186
- end
187
- attributes = Model::Lazar.find(self.model_id).attributes
188
- attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key}
189
- attributes = attributes.values.collect{|v| v.is_a?(String) ? v.sub(/OpenTox::/,'') : v}.join("\n")
190
- R.assign "measurement", measured_values
191
- R.assign "prediction", predicted_values
192
- R.eval "all = c(-log(measurement),-log(prediction))"
193
- R.eval "range = c(min(all), max(all))"
194
- R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)"
195
- R.eval "image = image + geom_abline(intercept=0, slope=1)"
196
- R.eval "ggsave(file='#{tmpfile}', plot=image)"
197
- file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.svg")
198
- plot_id = $gridfs.insert_one(file)
199
- update(:correlation_plot_id => plot_id)
200
- end
201
- $gridfs.find_one(_id: correlation_plot_id).data
46
+
47
+ class RegressionLeaveOneOut < LeaveOneOut
48
+ include RegressionStatistics
49
+ field :rmse, type: Float, default: 0
50
+ field :mae, type: Float, default: 0
51
+ field :r_squared, type: Float
52
+ field :within_prediction_interval, type: Integer, default:0
53
+ field :out_of_prediction_interval, type: Integer, default:0
54
+ field :correlation_plot_id, type: BSON::ObjectId
202
55
  end
56
+
203
57
  end
204
58
 
205
59
  end