lazar 0.9.3 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -4
- data/README.md +5 -15
- data/VERSION +1 -1
- data/ext/lazar/extconf.rb +1 -1
- data/ext/lazar/rinstall.R +9 -7
- data/java/CdkDescriptorInfo.class +0 -0
- data/java/CdkDescriptorInfo.java +3 -2
- data/java/CdkDescriptors.class +0 -0
- data/java/CdkDescriptors.java +28 -28
- data/java/Rakefile +3 -3
- data/java/{cdk-1.4.19.jar → cdk-2.0-SNAPSHOT.jar} +0 -0
- data/lazar.gemspec +6 -7
- data/lib/algorithm.rb +2 -11
- data/lib/caret.rb +96 -0
- data/lib/classification.rb +14 -22
- data/lib/compound.rb +21 -87
- data/lib/crossvalidation.rb +80 -279
- data/lib/dataset.rb +105 -174
- data/lib/feature.rb +11 -18
- data/lib/feature_selection.rb +42 -0
- data/lib/import.rb +122 -0
- data/lib/lazar.rb +14 -4
- data/lib/leave-one-out-validation.rb +46 -192
- data/lib/model.rb +319 -128
- data/lib/nanoparticle.rb +98 -0
- data/lib/opentox.rb +7 -4
- data/lib/overwrite.rb +24 -3
- data/lib/physchem.rb +11 -10
- data/lib/regression.rb +7 -137
- data/lib/rest-client-wrapper.rb +0 -6
- data/lib/similarity.rb +65 -0
- data/lib/substance.rb +8 -0
- data/lib/train-test-validation.rb +69 -0
- data/lib/validation-statistics.rb +223 -0
- data/lib/validation.rb +17 -100
- data/scripts/mg2mmol.rb +17 -0
- data/scripts/mirror-enm2test.rb +4 -0
- data/scripts/mmol2-log10.rb +32 -0
- data/test/compound.rb +4 -94
- data/test/data/EPAFHM.medi_log10.csv +92 -0
- data/test/data/EPAFHM.mini_log10.csv +16 -0
- data/test/data/EPAFHM_log10.csv +581 -0
- data/test/data/loael_log10.csv +568 -0
- data/test/dataset.rb +195 -133
- data/test/descriptor.rb +27 -18
- data/test/error.rb +2 -2
- data/test/experiment.rb +4 -4
- data/test/feature.rb +2 -3
- data/test/gridfs.rb +10 -0
- data/test/model-classification.rb +106 -0
- data/test/model-nanoparticle.rb +128 -0
- data/test/model-regression.rb +171 -0
- data/test/model-validation.rb +19 -0
- data/test/nanomaterial-model-validation.rb +55 -0
- data/test/setup.rb +8 -4
- data/test/validation-classification.rb +67 -0
- data/test/validation-nanoparticle.rb +133 -0
- data/test/validation-regression.rb +92 -0
- metadata +50 -121
- data/test/classification.rb +0 -41
- data/test/data/CPDBAS_v5c_1547_29Apr2008part.sdf +0 -13553
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_mouse_TD50.csv +0 -436
- data/test/data/CPDBAS_v5d_cleaned/CPDBAS_v5d_20Nov2008_rat_TD50.csv +0 -568
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Hamster.csv +0 -87
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mouse.csv +0 -978
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall.csv +0 -1120
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_MultiCellCall_no_duplicates.csv +0 -1113
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity.csv +0 -850
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Mutagenicity_no_duplicates.csv +0 -829
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_Rat.csv +0 -1198
- data/test/data/CPDBAS_v5d_cleaned/DSSTox_Carcinogenic_Potency_DBS_SingleCellCall.csv +0 -1505
- data/test/data/EPA_v4b_Fathead_Minnow_Acute_Toxicity_LC50_mmol.csv +0 -581
- data/test/data/FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv +0 -1217
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +0 -568
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +0 -568
- data/test/data/boiling_points.ext.sdf +0 -11460
- data/test/data/cpdb_100.csv +0 -101
- data/test/data/hamster_carcinogenicity.ntriples +0 -618
- data/test/data/hamster_carcinogenicity.sdf +0 -2805
- data/test/data/hamster_carcinogenicity.xls +0 -0
- data/test/data/hamster_carcinogenicity.yaml +0 -352
- data/test/dataset-long.rb +0 -114
- data/test/lazar-long.rb +0 -92
- data/test/lazar-physchem-short.rb +0 -31
- data/test/prediction_models.rb +0 -20
- data/test/regression.rb +0 -43
- data/test/validation.rb +0 -108
@@ -1,205 +1,59 @@
|
|
1
1
|
module OpenTox
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
loo.predictions = predictions#.sort{|a,b| b[:confidence] <=> a[:confidence]}
|
22
|
-
loo.nr_unpredicted = loo.nr_instances - loo.predictions.size
|
23
|
-
loo.statistics
|
24
|
-
loo.save
|
25
|
-
loo
|
26
|
-
end
|
27
|
-
|
28
|
-
def model
|
29
|
-
Model::Lazar.find model_id
|
30
|
-
end
|
31
|
-
end
|
32
|
-
|
33
|
-
class ClassificationLeaveOneOutValidation < LeaveOneOutValidation
|
34
|
-
|
35
|
-
field :accept_values, type: Array
|
36
|
-
field :confusion_matrix, type: Array, default: []
|
37
|
-
field :weighted_confusion_matrix, type: Array, default: []
|
38
|
-
field :accuracy, type: Float
|
39
|
-
field :weighted_accuracy, type: Float
|
40
|
-
field :true_rate, type: Hash, default: {}
|
41
|
-
field :predictivity, type: Hash, default: {}
|
42
|
-
field :confidence_plot_id, type: BSON::ObjectId
|
43
|
-
|
44
|
-
def statistics
|
45
|
-
accept_values = Feature.find(model.prediction_feature_id).accept_values
|
46
|
-
confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
|
47
|
-
weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
|
48
|
-
predictions.each do |pred|
|
49
|
-
pred[:database_activities].each do |db_act|
|
50
|
-
if pred[:value]
|
51
|
-
if pred[:value] == db_act
|
52
|
-
if pred[:value] == accept_values[0]
|
53
|
-
confusion_matrix[0][0] += 1
|
54
|
-
weighted_confusion_matrix[0][0] += pred[:confidence]
|
55
|
-
elsif pred[:value] == accept_values[1]
|
56
|
-
confusion_matrix[1][1] += 1
|
57
|
-
weighted_confusion_matrix[1][1] += pred[:confidence]
|
58
|
-
end
|
59
|
-
else
|
60
|
-
if pred[:value] == accept_values[0]
|
61
|
-
confusion_matrix[0][1] += 1
|
62
|
-
weighted_confusion_matrix[0][1] += pred[:confidence]
|
63
|
-
elsif pred[:value] == accept_values[1]
|
64
|
-
confusion_matrix[1][0] += 1
|
65
|
-
weighted_confusion_matrix[1][0] += pred[:confidence]
|
66
|
-
end
|
67
|
-
end
|
3
|
+
module Validation
|
4
|
+
|
5
|
+
class LeaveOneOut < Validation
|
6
|
+
|
7
|
+
def self.create model
|
8
|
+
bad_request_error "Cannot create leave one out validation for models with supervised feature selection. Please use crossvalidation instead." if model.algorithms[:feature_selection]
|
9
|
+
$logger.debug "#{model.name}: LOO validation started"
|
10
|
+
t = Time.now
|
11
|
+
model.training_dataset.features.first.nominal? ? klass = ClassificationLeaveOneOut : klass = RegressionLeaveOneOut
|
12
|
+
loo = klass.new :model_id => model.id
|
13
|
+
predictions = model.predict model.training_dataset.substances
|
14
|
+
predictions.each{|cid,p| p.delete(:neighbors)}
|
15
|
+
nr_unpredicted = 0
|
16
|
+
predictions.each do |cid,prediction|
|
17
|
+
if prediction[:value]
|
18
|
+
prediction[:measurements] = model.training_dataset.values(cid, prediction[:prediction_feature_id])
|
19
|
+
else
|
20
|
+
nr_unpredicted += 1
|
68
21
|
end
|
22
|
+
predictions.delete(cid) unless prediction[:value] and prediction[:measurements]
|
69
23
|
end
|
24
|
+
predictions.select!{|cid,p| p[:value] and p[:measurements]}
|
25
|
+
loo.nr_instances = predictions.size
|
26
|
+
loo.nr_unpredicted = nr_unpredicted
|
27
|
+
loo.predictions = predictions
|
28
|
+
loo.statistics
|
29
|
+
$logger.debug "#{model.name}, LOO validation: #{Time.now-t} seconds"
|
30
|
+
loo
|
70
31
|
end
|
71
|
-
accept_values.each_with_index do |v,i|
|
72
|
-
true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
|
73
|
-
predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
|
74
|
-
end
|
75
|
-
confidence_sum = 0
|
76
|
-
weighted_confusion_matrix.each do |r|
|
77
|
-
r.each do |c|
|
78
|
-
confidence_sum += c
|
79
|
-
end
|
80
|
-
end
|
81
|
-
update_attributes(
|
82
|
-
accept_values: accept_values,
|
83
|
-
confusion_matrix: confusion_matrix,
|
84
|
-
weighted_confusion_matrix: weighted_confusion_matrix,
|
85
|
-
accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
|
86
|
-
weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
|
87
|
-
true_rate: true_rate,
|
88
|
-
predictivity: predictivity,
|
89
|
-
finished_at: Time.now
|
90
|
-
)
|
91
|
-
$logger.debug "Accuracy #{accuracy}"
|
92
|
-
end
|
93
|
-
|
94
|
-
def confidence_plot
|
95
|
-
unless confidence_plot_id
|
96
|
-
tmpfile = "/tmp/#{id.to_s}_confidence.svg"
|
97
|
-
accuracies = []
|
98
|
-
confidences = []
|
99
|
-
correct_predictions = 0
|
100
|
-
incorrect_predictions = 0
|
101
|
-
predictions.each do |p|
|
102
|
-
p[:database_activities].each do |db_act|
|
103
|
-
if p[:value]
|
104
|
-
p[:value] == db_act ? correct_predictions += 1 : incorrect_predictions += 1
|
105
|
-
accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
|
106
|
-
confidences << p[:confidence]
|
107
32
|
|
108
|
-
end
|
109
|
-
end
|
110
|
-
end
|
111
|
-
R.assign "accuracy", accuracies
|
112
|
-
R.assign "confidence", confidences
|
113
|
-
R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()"
|
114
|
-
R.eval "ggsave(file='#{tmpfile}', plot=image)"
|
115
|
-
file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg")
|
116
|
-
plot_id = $gridfs.insert_one(file)
|
117
|
-
update(:confidence_plot_id => plot_id)
|
118
|
-
end
|
119
|
-
$gridfs.find_one(_id: confidence_plot_id).data
|
120
33
|
end
|
121
|
-
end
|
122
|
-
|
123
|
-
|
124
|
-
class RegressionLeaveOneOutValidation < LeaveOneOutValidation
|
125
|
-
|
126
34
|
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
predicted_values = []
|
138
|
-
measured_values = []
|
139
|
-
predictions.each do |pred|
|
140
|
-
pred[:database_activities].each do |activity|
|
141
|
-
if pred[:value]
|
142
|
-
predicted_values << pred[:value]
|
143
|
-
measured_values << activity
|
144
|
-
error = Math.log10(pred[:value])-Math.log10(activity)
|
145
|
-
self.rmse += error**2
|
146
|
-
#self.weighted_rmse += pred[:confidence]*error**2
|
147
|
-
self.mae += error.abs
|
148
|
-
#self.weighted_mae += pred[:confidence]*error.abs
|
149
|
-
#confidence_sum += pred[:confidence]
|
150
|
-
end
|
151
|
-
end
|
152
|
-
if pred[:database_activities].empty?
|
153
|
-
warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
|
154
|
-
$logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
|
155
|
-
end
|
156
|
-
end
|
157
|
-
R.assign "measurement", measured_values
|
158
|
-
R.assign "prediction", predicted_values
|
159
|
-
R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
|
160
|
-
r = R.eval("r").to_ruby
|
161
|
-
|
162
|
-
self.mae = self.mae/predictions.size
|
163
|
-
#self.weighted_mae = self.weighted_mae/confidence_sum
|
164
|
-
self.rmse = Math.sqrt(self.rmse/predictions.size)
|
165
|
-
#self.weighted_rmse = Math.sqrt(self.weighted_rmse/confidence_sum)
|
166
|
-
self.r_squared = r**2
|
167
|
-
self.finished_at = Time.now
|
168
|
-
save
|
169
|
-
$logger.debug "R^2 #{r**2}"
|
170
|
-
$logger.debug "RMSE #{rmse}"
|
171
|
-
$logger.debug "MAE #{mae}"
|
35
|
+
class ClassificationLeaveOneOut < LeaveOneOut
|
36
|
+
include ClassificationStatistics
|
37
|
+
field :accept_values, type: Array
|
38
|
+
field :confusion_matrix, type: Array, default: []
|
39
|
+
field :weighted_confusion_matrix, type: Array, default: []
|
40
|
+
field :accuracy, type: Float
|
41
|
+
field :weighted_accuracy, type: Float
|
42
|
+
field :true_rate, type: Hash, default: {}
|
43
|
+
field :predictivity, type: Hash, default: {}
|
44
|
+
field :confidence_plot_id, type: BSON::ObjectId
|
172
45
|
end
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
predicted_values << pred[:value]
|
183
|
-
measured_values << activity
|
184
|
-
end
|
185
|
-
end
|
186
|
-
end
|
187
|
-
attributes = Model::Lazar.find(self.model_id).attributes
|
188
|
-
attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key}
|
189
|
-
attributes = attributes.values.collect{|v| v.is_a?(String) ? v.sub(/OpenTox::/,'') : v}.join("\n")
|
190
|
-
R.assign "measurement", measured_values
|
191
|
-
R.assign "prediction", predicted_values
|
192
|
-
R.eval "all = c(-log(measurement),-log(prediction))"
|
193
|
-
R.eval "range = c(min(all), max(all))"
|
194
|
-
R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)"
|
195
|
-
R.eval "image = image + geom_abline(intercept=0, slope=1)"
|
196
|
-
R.eval "ggsave(file='#{tmpfile}', plot=image)"
|
197
|
-
file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.svg")
|
198
|
-
plot_id = $gridfs.insert_one(file)
|
199
|
-
update(:correlation_plot_id => plot_id)
|
200
|
-
end
|
201
|
-
$gridfs.find_one(_id: correlation_plot_id).data
|
46
|
+
|
47
|
+
class RegressionLeaveOneOut < LeaveOneOut
|
48
|
+
include RegressionStatistics
|
49
|
+
field :rmse, type: Float, default: 0
|
50
|
+
field :mae, type: Float, default: 0
|
51
|
+
field :r_squared, type: Float
|
52
|
+
field :within_prediction_interval, type: Integer, default:0
|
53
|
+
field :out_of_prediction_interval, type: Integer, default:0
|
54
|
+
field :correlation_plot_id, type: BSON::ObjectId
|
202
55
|
end
|
56
|
+
|
203
57
|
end
|
204
58
|
|
205
59
|
end
|