lazar 0.0.7 → 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/README.md +2 -1
- data/VERSION +1 -1
- data/ext/lazar/extconf.rb +15 -76
- data/ext/lazar/rinstall.R +9 -0
- data/lazar.gemspec +7 -7
- data/lib/classification.rb +5 -78
- data/lib/compound.rb +201 -44
- data/lib/crossvalidation.rb +224 -121
- data/lib/dataset.rb +83 -93
- data/lib/error.rb +1 -1
- data/lib/experiment.rb +99 -0
- data/lib/feature.rb +2 -54
- data/lib/lazar.rb +47 -34
- data/lib/leave-one-out-validation.rb +205 -0
- data/lib/model.rb +131 -76
- data/lib/opentox.rb +2 -2
- data/lib/overwrite.rb +37 -0
- data/lib/physchem.rb +133 -0
- data/lib/regression.rb +117 -189
- data/lib/rest-client-wrapper.rb +4 -5
- data/lib/unique_descriptors.rb +6 -7
- data/lib/validation.rb +63 -69
- data/test/all.rb +2 -2
- data/test/classification.rb +41 -0
- data/test/compound.rb +116 -7
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +567 -567
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +566 -566
- data/test/data/LOAEL_mmol_corrected_smiles.csv +568 -0
- data/test/data/batch_prediction.csv +25 -0
- data/test/data/batch_prediction_inchi_small.csv +4 -0
- data/test/data/batch_prediction_smiles_small.csv +4 -0
- data/test/data/hamster_carcinogenicity.json +3 -0
- data/test/data/loael.csv +568 -0
- data/test/dataset-long.rb +5 -8
- data/test/dataset.rb +31 -11
- data/test/default_environment.rb +11 -0
- data/test/descriptor.rb +26 -41
- data/test/error.rb +1 -3
- data/test/experiment.rb +301 -0
- data/test/feature.rb +22 -10
- data/test/lazar-long.rb +43 -23
- data/test/lazar-physchem-short.rb +19 -16
- data/test/prediction_models.rb +20 -0
- data/test/regression.rb +43 -0
- data/test/setup.rb +3 -1
- data/test/test_environment.rb +10 -0
- data/test/validation.rb +92 -26
- metadata +64 -38
- data/lib/SMARTS_InteLigand.txt +0 -983
- data/lib/bbrc.rb +0 -165
- data/lib/descriptor.rb +0 -247
- data/lib/neighbor.rb +0 -25
- data/lib/similarity.rb +0 -58
- data/mongoid.yml +0 -8
- data/test/descriptor-long.rb +0 -26
- data/test/fminer-long.rb +0 -38
- data/test/fminer.rb +0 -52
- data/test/lazar-fminer.rb +0 -50
- data/test/lazar-regression.rb +0 -27
@@ -0,0 +1,205 @@
|
|
1
|
+
module OpenTox

  # Leave-one-out (LOO) validation of a lazar model: every compound of the
  # training dataset is predicted and compared against its measured
  # ("database") activities.
  #
  # NOTE(review): the `field` DSL calls assume Mongoid document behavior is
  # mixed in elsewhere (see the OpenTox const machinery) — confirm before
  # instantiating this class standalone.
  class LeaveOneOutValidation

    field :model_id, type: BSON::ObjectId
    field :dataset_id, type: BSON::ObjectId
    field :nr_instances, type: Integer
    field :nr_unpredicted, type: Integer
    field :predictions, type: Array
    field :finished_at, type: Time

    # Run a leave-one-out validation for a model.
    # @param model [OpenTox::Model::Lazar]
    # @return [LeaveOneOutValidation] classification or regression subclass,
    #   saved with statistics already computed
    def self.create model
      # Pick the subclass from the type of the prediction feature.
      # (was: `cond ? klass = A : klass = B` — assign the ternary result instead)
      klass = model.training_dataset.features.first.nominal? ? ClassificationLeaveOneOutValidation : RegressionLeaveOneOutValidation
      loo = klass.new :model_id => model.id, :dataset_id => model.training_dataset_id
      compound_ids = model.training_dataset.compound_ids
      predictions = model.predict model.training_dataset.compounds
      # tag each prediction with the compound it belongs to
      # (each_with_index returns the receiver, no reassignment needed)
      predictions.each_with_index {|p,i| p[:compound_id] = compound_ids[i]}
      # keep only predictions that have measured activities to compare against
      predictions.select!{|p| p[:database_activities] and !p[:database_activities].empty?}
      loo.nr_instances = predictions.size
      predictions.select!{|p| p[:value]} # remove unpredicted
      loo.predictions = predictions#.sort{|a,b| b[:confidence] <=> a[:confidence]}
      loo.nr_unpredicted = loo.nr_instances - loo.predictions.size
      loo.statistics
      loo.save
      loo
    end

    # @return [OpenTox::Model::Lazar] the validated model
    def model
      Model::Lazar.find model_id
    end
  end

  # LOO validation for binary classification models. Computes (weighted)
  # confusion matrices, accuracy, true rates and predictivity.
  # NOTE(review): the confusion-matrix logic only fills cells for the first
  # two accept_values — effectively binary classification; confirm that all
  # classification endpoints are binary before relying on these statistics.
  class ClassificationLeaveOneOutValidation < LeaveOneOutValidation

    field :accept_values, type: Array
    field :confusion_matrix, type: Array, default: []
    field :weighted_confusion_matrix, type: Array, default: []
    field :accuracy, type: Float
    field :weighted_accuracy, type: Float
    field :true_rate, type: Hash, default: {}
    field :predictivity, type: Hash, default: {}
    field :confidence_plot_id, type: BSON::ObjectId

    # Compute and persist classification statistics from self.predictions.
    def statistics
      accept_values = Feature.find(model.prediction_feature_id).accept_values
      # FIX: Array.new(size, 0) { block } passes both a default value and a
      # block; the block supersedes the default and Ruby emits a warning.
      # Use the block form only.
      confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
      weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
      predictions.each do |pred|
        pred[:database_activities].each do |db_act|
          if pred[:value]
            if pred[:value] == db_act
              # correct prediction: diagonal cell for the predicted class
              if pred[:value] == accept_values[0]
                confusion_matrix[0][0] += 1
                weighted_confusion_matrix[0][0] += pred[:confidence]
              elsif pred[:value] == accept_values[1]
                confusion_matrix[1][1] += 1
                weighted_confusion_matrix[1][1] += pred[:confidence]
              end
            else
              # misclassification: off-diagonal cell for the predicted class
              if pred[:value] == accept_values[0]
                confusion_matrix[0][1] += 1
                weighted_confusion_matrix[0][1] += pred[:confidence]
              elsif pred[:value] == accept_values[1]
                confusion_matrix[1][0] += 1
                weighted_confusion_matrix[1][0] += pred[:confidence]
              end
            end
          end
        end
      end
      accept_values.each_with_index do |v,i|
        true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
        predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
      end
      # total confidence mass, denominator of the weighted accuracy
      confidence_sum = 0
      weighted_confusion_matrix.each do |r|
        r.each do |c|
          confidence_sum += c
        end
      end
      update_attributes(
        accept_values: accept_values,
        confusion_matrix: confusion_matrix,
        weighted_confusion_matrix: weighted_confusion_matrix,
        accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
        weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
        true_rate: true_rate,
        predictivity: predictivity,
        finished_at: Time.now
      )
      $logger.debug "Accuracy #{accuracy}"
    end

    # Accumulated accuracy vs. confidence plot (SVG), rendered with R/ggplot2
    # and cached in GridFS. Returns the raw SVG data.
    def confidence_plot
      unless confidence_plot_id
        tmpfile = "/tmp/#{id.to_s}_confidence.svg"
        accuracies = []
        confidences = []
        correct_predictions = 0
        incorrect_predictions = 0
        predictions.each do |p|
          p[:database_activities].each do |db_act|
            if p[:value]
              p[:value] == db_act ? correct_predictions += 1 : incorrect_predictions += 1
              accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
              confidences << p[:confidence]
            end
          end
        end
        R.assign "accuracy", accuracies
        R.assign "confidence", confidences
        R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()"
        R.eval "ggsave(file='#{tmpfile}', plot=image)"
        file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg")
        plot_id = $gridfs.insert_one(file)
        update(:confidence_plot_id => plot_id)
      end
      $gridfs.find_one(_id: confidence_plot_id).data
    end
  end


  # LOO validation for regression models. Computes RMSE, MAE and R^2 on
  # log10-transformed values.
  class RegressionLeaveOneOutValidation < LeaveOneOutValidation


    field :rmse, type: Float, default: 0.0
    field :mae, type: Float, default: 0
    #field :weighted_rmse, type: Float, default: 0
    #field :weighted_mae, type: Float, default: 0
    field :r_squared, type: Float
    field :correlation_plot_id, type: BSON::ObjectId
    field :confidence_plot_id, type: BSON::ObjectId

    # Compute and persist regression statistics from self.predictions.
    def statistics
      confidence_sum = 0
      predicted_values = []
      measured_values = []
      predictions.each do |pred|
        pred[:database_activities].each do |activity|
          if pred[:value]
            predicted_values << pred[:value]
            measured_values << activity
            # errors are computed in log10 space
            error = Math.log10(pred[:value])-Math.log10(activity)
            self.rmse += error**2
            #self.weighted_rmse += pred[:confidence]*error**2
            self.mae += error.abs
            #self.weighted_mae += pred[:confidence]*error.abs
            #confidence_sum += pred[:confidence]
          end
        end
        if pred[:database_activities].empty?
          # FIX: original referenced an undefined local `compound_id`; the
          # compound id is stored in the prediction hash.
          warnings << "No training activities for #{Compound.find(pred[:compound_id]).smiles} in training dataset #{model.training_dataset_id}."
          $logger.debug "No training activities for #{Compound.find(pred[:compound_id]).smiles} in training dataset #{model.training_dataset_id}."
        end
      end
      R.assign "measurement", measured_values
      R.assign "prediction", predicted_values
      R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
      r = R.eval("r").to_ruby

      self.mae = self.mae/predictions.size
      #self.weighted_mae = self.weighted_mae/confidence_sum
      self.rmse = Math.sqrt(self.rmse/predictions.size)
      #self.weighted_rmse = Math.sqrt(self.weighted_rmse/confidence_sum)
      self.r_squared = r**2
      self.finished_at = Time.now
      save
      $logger.debug "R^2 #{r**2}"
      $logger.debug "RMSE #{rmse}"
      $logger.debug "MAE #{mae}"
    end

    # Measured-vs-predicted scatter plot (SVG) with the identity line,
    # rendered with R/ggplot2 and cached in GridFS. Returns the raw SVG data.
    def correlation_plot
      unless correlation_plot_id
        tmpfile = "/tmp/#{id.to_s}_correlation.svg"
        predicted_values = []
        measured_values = []
        predictions.each do |pred|
          pred[:database_activities].each do |activity|
            if pred[:value]
              predicted_values << pred[:value]
              measured_values << activity
            end
          end
        end
        # NOTE(review): `attributes` is computed but never used in the plot —
        # presumably intended as an annotation; verify against later versions.
        attributes = Model::Lazar.find(self.model_id).attributes
        attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key}
        attributes = attributes.values.collect{|v| v.is_a?(String) ? v.sub(/OpenTox::/,'') : v}.join("\n")
        R.assign "measurement", measured_values
        R.assign "prediction", predicted_values
        R.eval "all = c(-log(measurement),-log(prediction))"
        R.eval "range = c(min(all), max(all))"
        R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)"
        R.eval "image = image + geom_abline(intercept=0, slope=1)"
        R.eval "ggsave(file='#{tmpfile}', plot=image)"
        file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.svg")
        plot_id = $gridfs.insert_one(file)
        update(:correlation_plot_id => plot_id)
      end
      $gridfs.find_one(_id: correlation_plot_id).data
    end
  end

end
|
data/lib/model.rb
CHANGED
@@ -2,51 +2,79 @@ module OpenTox
|
|
2
2
|
|
3
3
|
module Model
|
4
4
|
|
5
|
-
class
|
5
|
+
    # Base class for all persisted models. Stores the link to the training
    # dataset and the names of the prediction algorithm and feature.
    class Model
      include OpenTox
      include Mongoid::Document
      include Mongoid::Timestamps
      store_in collection: "models"

      field :name, type: String
      field :creator, type: String, default: __FILE__
      # datasets
      field :training_dataset_id, type: BSON::ObjectId
      # algorithms
      field :prediction_algorithm, type: String
      # prediction feature
      field :prediction_feature_id, type: BSON::ObjectId

      # @return [OpenTox::Dataset] the dataset this model was trained on
      def training_dataset
        Dataset.find(training_dataset_id)
      end
    end
|
24
|
+
|
25
|
+
class Lazar < Model
|
26
|
+
|
27
|
+
# algorithms
|
28
|
+
field :neighbor_algorithm, type: String
|
29
|
+
field :neighbor_algorithm_parameters, type: Hash, default: {}
|
24
30
|
|
25
31
|
# Create a lazar model from a training_dataset and a feature_dataset
|
26
32
|
# @param [OpenTox::Dataset] training_dataset
|
27
33
|
# @return [OpenTox::Model::Lazar] Regression or classification model
|
28
|
-
def
|
34
|
+
def initialize training_dataset, params={}
|
29
35
|
|
30
|
-
|
36
|
+
super params
|
31
37
|
|
32
38
|
# TODO document convention
|
33
39
|
prediction_feature = training_dataset.features.first
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
40
|
+
# set defaults for empty parameters
|
41
|
+
self.prediction_feature_id ||= prediction_feature.id
|
42
|
+
self.training_dataset_id ||= training_dataset.id
|
43
|
+
self.name ||= "#{training_dataset.name} #{prediction_feature.name}"
|
44
|
+
self.neighbor_algorithm_parameters ||= {}
|
45
|
+
self.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id
|
46
|
+
save
|
47
|
+
self
|
48
|
+
end
|
38
49
|
|
39
|
-
|
40
|
-
|
50
|
+
def predict_compound compound
|
51
|
+
prediction_feature = Feature.find prediction_feature_id
|
52
|
+
neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters)
|
53
|
+
# remove neighbors without prediction_feature
|
54
|
+
# check for database activities (neighbors may include query compound)
|
55
|
+
database_activities = nil
|
56
|
+
prediction = {}
|
57
|
+
if neighbors.collect{|n| n["_id"]}.include? compound.id
|
58
|
+
|
59
|
+
database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["features"][prediction_feature.id.to_s].uniq
|
60
|
+
prediction[:database_activities] = database_activities
|
61
|
+
prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound."
|
62
|
+
neighbors.delete_if{|n| n["_id"] == compound.id}
|
63
|
+
end
|
64
|
+
neighbors.delete_if{|n| n['features'].empty? or n['features'][prediction_feature.id.to_s] == [nil] }
|
65
|
+
if neighbors.empty?
|
66
|
+
prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset.",:neighbors => []})
|
67
|
+
else
|
68
|
+
prediction.merge!(Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_id=> training_dataset_id,:prediction_feature_id => prediction_feature.id}))
|
69
|
+
prediction[:neighbors] = neighbors
|
70
|
+
prediction[:neighbors] ||= []
|
71
|
+
end
|
72
|
+
prediction
|
41
73
|
end
|
42
74
|
|
43
75
|
def predict object
|
44
76
|
|
45
|
-
t = Time.now
|
46
|
-
at = Time.now
|
47
|
-
|
48
77
|
training_dataset = Dataset.find training_dataset_id
|
49
|
-
prediction_feature = Feature.find prediction_feature_id
|
50
78
|
|
51
79
|
# parse data
|
52
80
|
compounds = []
|
@@ -63,50 +91,33 @@ module OpenTox
|
|
63
91
|
|
64
92
|
# make predictions
|
65
93
|
predictions = []
|
66
|
-
|
67
|
-
compounds.each_with_index do |compound,c|
|
68
|
-
t = Time.new
|
69
|
-
database_activities = training_dataset.values(compound,prediction_feature)
|
70
|
-
if database_activities and !database_activities.empty?
|
71
|
-
database_activities = database_activities.first if database_activities.size == 1
|
72
|
-
predictions << {:compound => compound, :value => database_activities, :confidence => "measured", :warning => "Compound #{compound.smiles} occurs in training dataset with activity '#{database_activities}'."}
|
73
|
-
next
|
74
|
-
end
|
75
|
-
neighbors = Algorithm.run(neighbor_algorithm, compound, neighbor_algorithm_parameters)
|
76
|
-
# add activities
|
77
|
-
# TODO: improve efficiency, takes 3 times longer than previous version
|
78
|
-
neighbors.collect! do |n|
|
79
|
-
rows = training_dataset.compound_ids.each_index.select{|i| training_dataset.compound_ids[i] == n.first}
|
80
|
-
acts = rows.collect{|row| training_dataset.data_entries[row][0]}.compact
|
81
|
-
acts.empty? ? nil : n << acts
|
82
|
-
end
|
83
|
-
neighbors.compact! # remove neighbors without training activities
|
84
|
-
predictions << Algorithm.run(prediction_algorithm, neighbors)
|
85
|
-
end
|
94
|
+
predictions = compounds.collect{|c| predict_compound c}
|
86
95
|
|
87
96
|
# serialize result
|
88
97
|
case object.class.to_s
|
89
98
|
when "OpenTox::Compound"
|
90
99
|
prediction = predictions.first
|
91
|
-
prediction[:neighbors]
|
100
|
+
prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} # sort according to similarity
|
92
101
|
return prediction
|
93
102
|
when "Array"
|
94
103
|
return predictions
|
95
104
|
when "OpenTox::Dataset"
|
96
105
|
# prepare prediction dataset
|
106
|
+
measurement_feature = Feature.find prediction_feature_id
|
107
|
+
|
108
|
+
prediction_feature = OpenTox::NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" )
|
97
109
|
prediction_dataset = LazarPrediction.new(
|
98
|
-
:
|
110
|
+
:name => "Lazar prediction for #{prediction_feature.name}",
|
99
111
|
:creator => __FILE__,
|
100
112
|
:prediction_feature_id => prediction_feature.id
|
101
113
|
|
102
114
|
)
|
103
|
-
confidence_feature = OpenTox::NumericFeature.find_or_create_by( "
|
104
|
-
|
105
|
-
|
106
|
-
prediction_dataset.features = [ prediction_feature, confidence_feature, warning_feature ]
|
115
|
+
confidence_feature = OpenTox::NumericFeature.find_or_create_by( "name" => "Model RMSE" )
|
116
|
+
warning_feature = OpenTox::NominalFeature.find_or_create_by("name" => "Warnings")
|
117
|
+
prediction_dataset.features = [ prediction_feature, confidence_feature, measurement_feature, warning_feature ]
|
107
118
|
prediction_dataset.compounds = compounds
|
108
|
-
prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:
|
109
|
-
prediction_dataset.
|
119
|
+
prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:rmse] , p[:dataset_activities].to_s, p[:warning]]}
|
120
|
+
prediction_dataset.save
|
110
121
|
return prediction_dataset
|
111
122
|
end
|
112
123
|
|
@@ -120,26 +131,19 @@ module OpenTox
|
|
120
131
|
end
|
121
132
|
|
122
133
|
class LazarClassification < Lazar
|
123
|
-
|
124
|
-
|
125
|
-
self.
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
model = self.find model.id # adjust class
|
137
|
-
model.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fminer_similarity"
|
138
|
-
model.neighbor_algorithm_parameters = {
|
139
|
-
:feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.smarts_match",
|
140
|
-
:feature_dataset_id => Algorithm::Fminer.bbrc(training_dataset).id,
|
141
|
-
:min_sim => 0.3
|
142
|
-
}
|
134
|
+
|
135
|
+
def self.create training_dataset, params={}
|
136
|
+
model = self.new training_dataset, params
|
137
|
+
model.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" unless model.prediction_algorithm
|
138
|
+
model.neighbor_algorithm ||= "fingerprint_neighbors"
|
139
|
+
model.neighbor_algorithm_parameters ||= {}
|
140
|
+
{
|
141
|
+
:type => "MP2D",
|
142
|
+
:training_dataset_id => training_dataset.id,
|
143
|
+
:min_sim => 0.1
|
144
|
+
}.each do |key,value|
|
145
|
+
model.neighbor_algorithm_parameters[key] ||= value
|
146
|
+
end
|
143
147
|
model.save
|
144
148
|
model
|
145
149
|
end
|
@@ -147,20 +151,27 @@ module OpenTox
|
|
147
151
|
|
148
152
|
class LazarRegression < Lazar
|
149
153
|
|
150
|
-
def
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
154
|
+
def self.create training_dataset, params={}
|
155
|
+
model = self.new training_dataset, params
|
156
|
+
model.neighbor_algorithm ||= "fingerprint_neighbors"
|
157
|
+
model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.local_fingerprint_regression"
|
158
|
+
model.neighbor_algorithm_parameters ||= {}
|
159
|
+
{
|
160
|
+
:type => "MP2D",
|
161
|
+
:training_dataset_id => training_dataset.id,
|
162
|
+
:min_sim => 0.1
|
163
|
+
}.each do |key,value|
|
164
|
+
model.neighbor_algorithm_parameters[key] ||= value
|
165
|
+
end
|
166
|
+
model.save
|
167
|
+
model
|
155
168
|
end
|
156
|
-
|
157
169
|
end
|
158
170
|
|
159
|
-
class
|
171
|
+
class Prediction
|
160
172
|
include OpenTox
|
161
173
|
include Mongoid::Document
|
162
174
|
include Mongoid::Timestamps
|
163
|
-
store_in collection: "models"
|
164
175
|
|
165
176
|
# TODO field Validations
|
166
177
|
field :endpoint, type: String
|
@@ -168,10 +179,54 @@ module OpenTox
|
|
168
179
|
field :source, type: String
|
169
180
|
field :unit, type: String
|
170
181
|
field :model_id, type: BSON::ObjectId
|
171
|
-
field :
|
182
|
+
field :repeated_crossvalidation_id, type: BSON::ObjectId
|
183
|
+
|
184
|
+
      # Delegate prediction to the wrapped lazar model.
      # @param object [Object] compound, array of compounds, or dataset
      def predict object
        Lazar.find(model_id).predict object
      end
|
187
|
+
|
188
|
+
      # @return [OpenTox::Dataset] training dataset of the wrapped model
      def training_dataset
        model.training_dataset
      end
|
191
|
+
|
192
|
+
      # @return [OpenTox::Model::Lazar] the wrapped lazar model
      def model
        Lazar.find model_id
      end
|
195
|
+
|
196
|
+
      # @return [RepeatedCrossValidation] validation record for this model
      def repeated_crossvalidation
        RepeatedCrossValidation.find repeated_crossvalidation_id
      end
|
199
|
+
|
200
|
+
      # @return [Array] individual crossvalidations of the repeated CV
      def crossvalidations
        repeated_crossvalidation.crossvalidations
      end
|
203
|
+
|
204
|
+
      # True when the prediction feature (first training feature) is numeric.
      def regression?
        training_dataset.features.first.numeric?
      end
|
207
|
+
|
208
|
+
      # True when the prediction feature (first training feature) is nominal.
      def classification?
        training_dataset.features.first.nominal?
      end
|
211
|
+
|
212
|
+
def self.from_csv_file file
|
213
|
+
metadata_file = file.sub(/csv$/,"json")
|
214
|
+
bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file
|
215
|
+
prediction_model = self.new JSON.parse(File.read(metadata_file))
|
216
|
+
training_dataset = Dataset.from_csv_file file
|
217
|
+
model = nil
|
218
|
+
if training_dataset.features.first.nominal?
|
219
|
+
model = LazarClassification.create training_dataset
|
220
|
+
elsif training_dataset.features.first.numeric?
|
221
|
+
model = LazarRegression.create training_dataset
|
222
|
+
end
|
223
|
+
prediction_model[:model_id] = model.id
|
224
|
+
prediction_model[:repeated_crossvalidation_id] = RepeatedCrossValidation.create(model).id
|
225
|
+
prediction_model.save
|
226
|
+
prediction_model
|
227
|
+
end
|
172
228
|
end
|
173
229
|
|
174
230
|
end
|
175
231
|
|
176
232
|
end
|
177
|
-
|
data/lib/opentox.rb
CHANGED
@@ -12,8 +12,8 @@ module OpenTox
|
|
12
12
|
include Mongoid::Document
|
13
13
|
include Mongoid::Timestamps
|
14
14
|
store_in collection: klass.downcase.pluralize
|
15
|
-
field :
|
16
|
-
|
15
|
+
field :name, type: String
|
16
|
+
field :warnings, type: Array, default: []
|
17
17
|
end
|
18
18
|
OpenTox.const_set klass,c
|
19
19
|
end
|
data/lib/overwrite.rb
CHANGED
@@ -9,6 +9,11 @@ class Object
|
|
9
9
|
def numeric?
|
10
10
|
true if Float(self) rescue false
|
11
11
|
end
|
12
|
+
|
13
|
+
# Returns dimension of nested arrays
|
14
|
+
def dimension
|
15
|
+
self.class == Array ? 1 + self[0].dimension : 0
|
16
|
+
end
|
12
17
|
end
|
13
18
|
|
14
19
|
class Numeric
|
@@ -17,6 +22,14 @@ class Numeric
|
|
17
22
|
end
|
18
23
|
end
|
19
24
|
|
25
|
+
class Float
  # Round to +n+ significant digits, e.g. 123.456.signif(2) #=> 120.0.
  # Formats with "%g" at the requested precision and parses the result back.
  # http://stackoverflow.com/questions/8382619/how-to-round-a-float-to-a-specified-number-of-significant-digits-in-ruby
  def signif(n)
    rounded = format("%.#{n}g", self)
    Float(rounded)
  end
end
|
32
|
+
|
20
33
|
module Enumerable
|
21
34
|
# @return [Array] only the duplicates of an enumerable
|
22
35
|
def duplicates
|
@@ -81,6 +94,26 @@ class Array
|
|
81
94
|
return self.uniq.size == 1
|
82
95
|
end
|
83
96
|
|
97
|
+
def median
|
98
|
+
sorted = self.sort
|
99
|
+
len = sorted.length
|
100
|
+
(sorted[(len - 1) / 2] + sorted[len / 2]) / 2.0
|
101
|
+
end
|
102
|
+
|
103
|
+
def mean
|
104
|
+
self.inject{ |sum, el| sum + el }.to_f / self.size
|
105
|
+
end
|
106
|
+
|
107
|
+
def sample_variance
|
108
|
+
m = self.mean
|
109
|
+
sum = self.inject(0){|accum, i| accum +(i-m)**2 }
|
110
|
+
sum/(self.length - 1).to_f
|
111
|
+
end
|
112
|
+
|
113
|
+
def standard_deviation
|
114
|
+
Math.sqrt(self.sample_variance)
|
115
|
+
end
|
116
|
+
|
84
117
|
end
|
85
118
|
|
86
119
|
module URI
|
@@ -116,4 +149,8 @@ module URI
|
|
116
149
|
false
|
117
150
|
end
|
118
151
|
|
152
|
+
def self.task? uri
|
153
|
+
uri =~ /task/ and URI.valid? uri
|
154
|
+
end
|
155
|
+
|
119
156
|
end
|