lazar 0.0.7 → 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/README.md +2 -1
- data/VERSION +1 -1
- data/ext/lazar/extconf.rb +15 -76
- data/ext/lazar/rinstall.R +9 -0
- data/lazar.gemspec +7 -7
- data/lib/classification.rb +5 -78
- data/lib/compound.rb +201 -44
- data/lib/crossvalidation.rb +224 -121
- data/lib/dataset.rb +83 -93
- data/lib/error.rb +1 -1
- data/lib/experiment.rb +99 -0
- data/lib/feature.rb +2 -54
- data/lib/lazar.rb +47 -34
- data/lib/leave-one-out-validation.rb +205 -0
- data/lib/model.rb +131 -76
- data/lib/opentox.rb +2 -2
- data/lib/overwrite.rb +37 -0
- data/lib/physchem.rb +133 -0
- data/lib/regression.rb +117 -189
- data/lib/rest-client-wrapper.rb +4 -5
- data/lib/unique_descriptors.rb +6 -7
- data/lib/validation.rb +63 -69
- data/test/all.rb +2 -2
- data/test/classification.rb +41 -0
- data/test/compound.rb +116 -7
- data/test/data/LOAEL_log_mg_corrected_smiles.csv +567 -567
- data/test/data/LOAEL_log_mmol_corrected_smiles.csv +566 -566
- data/test/data/LOAEL_mmol_corrected_smiles.csv +568 -0
- data/test/data/batch_prediction.csv +25 -0
- data/test/data/batch_prediction_inchi_small.csv +4 -0
- data/test/data/batch_prediction_smiles_small.csv +4 -0
- data/test/data/hamster_carcinogenicity.json +3 -0
- data/test/data/loael.csv +568 -0
- data/test/dataset-long.rb +5 -8
- data/test/dataset.rb +31 -11
- data/test/default_environment.rb +11 -0
- data/test/descriptor.rb +26 -41
- data/test/error.rb +1 -3
- data/test/experiment.rb +301 -0
- data/test/feature.rb +22 -10
- data/test/lazar-long.rb +43 -23
- data/test/lazar-physchem-short.rb +19 -16
- data/test/prediction_models.rb +20 -0
- data/test/regression.rb +43 -0
- data/test/setup.rb +3 -1
- data/test/test_environment.rb +10 -0
- data/test/validation.rb +92 -26
- metadata +64 -38
- data/lib/SMARTS_InteLigand.txt +0 -983
- data/lib/bbrc.rb +0 -165
- data/lib/descriptor.rb +0 -247
- data/lib/neighbor.rb +0 -25
- data/lib/similarity.rb +0 -58
- data/mongoid.yml +0 -8
- data/test/descriptor-long.rb +0 -26
- data/test/fminer-long.rb +0 -38
- data/test/fminer.rb +0 -52
- data/test/lazar-fminer.rb +0 -50
- data/test/lazar-regression.rb +0 -27
@@ -0,0 +1,205 @@
|
|
1
|
+
module OpenTox
|
2
|
+
|
3
|
+
class LeaveOneOutValidation

  field :model_id, type: BSON::ObjectId
  field :dataset_id, type: BSON::ObjectId
  field :nr_instances, type: Integer
  field :nr_unpredicted, type: Integer
  field :predictions, type: Array
  field :finished_at, type: Time

  # Run a leave-one-out validation for a lazar model.
  # Picks the classification or regression subclass from the type of the
  # model's first training feature, predicts every training compound,
  # keeps only predictions that have measured (database) activities,
  # computes the statistics and persists the validation.
  # @param model [Model::Lazar]
  # @return [LeaveOneOutValidation]
  def self.create model
    klass = model.training_dataset.features.first.nominal? ? ClassificationLeaveOneOutValidation : RegressionLeaveOneOutValidation
    validation = klass.new :model_id => model.id, :dataset_id => model.training_dataset_id
    compound_ids = model.training_dataset.compound_ids
    predictions = model.predict model.training_dataset.compounds
    # attach the compound id to each prediction (predictions are positional)
    predictions.each_with_index { |prediction, i| prediction[:compound_id] = compound_ids[i] }
    # drop compounds without measured activities
    predictions.select! { |prediction| prediction[:database_activities] and !prediction[:database_activities].empty? }
    validation.nr_instances = predictions.size
    predictions.select! { |prediction| prediction[:value] } # remove unpredicted
    validation.predictions = predictions
    validation.nr_unpredicted = validation.nr_instances - validation.predictions.size
    validation.statistics
    validation.save
    validation
  end

  # @return [Model::Lazar] the validated model
  def model
    Model::Lazar.find model_id
  end
end
|
32
|
+
|
33
|
+
class ClassificationLeaveOneOutValidation < LeaveOneOutValidation

  field :accept_values, type: Array
  field :confusion_matrix, type: Array, default: []
  field :weighted_confusion_matrix, type: Array, default: []
  field :accuracy, type: Float
  field :weighted_accuracy, type: Float
  field :true_rate, type: Hash, default: {}
  field :predictivity, type: Hash, default: {}
  field :confidence_plot_id, type: BSON::ObjectId

  # Compute confusion matrix, confidence-weighted confusion matrix,
  # accuracy, weighted accuracy, per-class true rate and predictivity
  # from the stored predictions and persist them.
  # NOTE(review): the matrix updates and the accuracy formula only address
  # accept_values[0] and accept_values[1], i.e. binary classification —
  # confirm that multi-class endpoints are out of scope.
  def statistics
    accept_values = Feature.find(model.prediction_feature_id).accept_values
    # BUGFIX: Array.new(size, 0) { ... } passed both a default value and a
    # block; the block supersedes the default value (Ruby emits a warning
    # and the 0 is dead). Use the block form only.
    confusion_matrix = Array.new(accept_values.size) { Array.new(accept_values.size, 0) }
    weighted_confusion_matrix = Array.new(accept_values.size) { Array.new(accept_values.size, 0) }
    predictions.each do |pred|
      pred[:database_activities].each do |db_act|
        if pred[:value]
          if pred[:value] == db_act
            # correct prediction: diagonal entry
            if pred[:value] == accept_values[0]
              confusion_matrix[0][0] += 1
              weighted_confusion_matrix[0][0] += pred[:confidence]
            elsif pred[:value] == accept_values[1]
              confusion_matrix[1][1] += 1
              weighted_confusion_matrix[1][1] += pred[:confidence]
            end
          else
            # misclassification: off-diagonal entry
            if pred[:value] == accept_values[0]
              confusion_matrix[0][1] += 1
              weighted_confusion_matrix[0][1] += pred[:confidence]
            elsif pred[:value] == accept_values[1]
              confusion_matrix[1][0] += 1
              weighted_confusion_matrix[1][0] += pred[:confidence]
            end
          end
        end
      end
    end
    accept_values.each_with_index do |v,i|
      true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
      predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
    end
    # total confidence mass, denominator of the weighted accuracy
    confidence_sum = 0
    weighted_confusion_matrix.each do |r|
      r.each do |c|
        confidence_sum += c
      end
    end
    update_attributes(
      accept_values: accept_values,
      confusion_matrix: confusion_matrix,
      weighted_confusion_matrix: weighted_confusion_matrix,
      accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
      weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
      true_rate: true_rate,
      predictivity: predictivity,
      finished_at: Time.now
    )
    $logger.debug "Accuracy #{accuracy}"
  end

  # Plot accumulated accuracy against prediction confidence (confidence
  # axis reversed), cache the SVG in GridFS and return the raw SVG data.
  def confidence_plot
    unless confidence_plot_id
      tmpfile = "/tmp/#{id.to_s}_confidence.svg"
      accuracies = []
      confidences = []
      correct_predictions = 0
      incorrect_predictions = 0
      predictions.each do |p|
        p[:database_activities].each do |db_act|
          if p[:value]
            p[:value] == db_act ? correct_predictions += 1 : incorrect_predictions += 1
            accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
            confidences << p[:confidence]

          end
        end
      end
      R.assign "accuracy", accuracies
      R.assign "confidence", confidences
      R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()"
      R.eval "ggsave(file='#{tmpfile}', plot=image)"
      file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg")
      plot_id = $gridfs.insert_one(file)
      update(:confidence_plot_id => plot_id)
    end
    $gridfs.find_one(_id: confidence_plot_id).data
  end
end
|
122
|
+
|
123
|
+
|
124
|
+
class RegressionLeaveOneOutValidation < LeaveOneOutValidation


  field :rmse, type: Float, default: 0.0
  field :mae, type: Float, default: 0
  #field :weighted_rmse, type: Float, default: 0
  #field :weighted_mae, type: Float, default: 0
  field :r_squared, type: Float
  field :correlation_plot_id, type: BSON::ObjectId
  field :confidence_plot_id, type: BSON::ObjectId

  # Accumulate squared/absolute errors of log10-transformed values over all
  # measured activities, compute RMSE, MAE and R^2 (squared Pearson
  # correlation of the -log values, calculated in R) and persist the result.
  # NOTE(review): rmse/mae are divided by predictions.size although errors
  # are accumulated once per database activity (a prediction may carry
  # several) — confirm this averaging is intended.
  def statistics
    confidence_sum = 0
    predicted_values = []
    measured_values = []
    predictions.each do |pred|
      pred[:database_activities].each do |activity|
        if pred[:value]
          predicted_values << pred[:value]
          measured_values << activity
          error = Math.log10(pred[:value])-Math.log10(activity)
          self.rmse += error**2
          #self.weighted_rmse += pred[:confidence]*error**2
          self.mae += error.abs
          #self.weighted_mae += pred[:confidence]*error.abs
          #confidence_sum += pred[:confidence]
        end
      end
      if pred[:database_activities].empty?
        # BUGFIX: the interpolation referenced a bare `compound_id`, which is
        # undefined in this scope and raised a NameError whenever this branch
        # executed; the compound id is stored in the prediction hash.
        warnings << "No training activities for #{Compound.find(pred[:compound_id]).smiles} in training dataset #{model.training_dataset_id}."
        $logger.debug "No training activities for #{Compound.find(pred[:compound_id]).smiles} in training dataset #{model.training_dataset_id}."
      end
    end
    R.assign "measurement", measured_values
    R.assign "prediction", predicted_values
    R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
    r = R.eval("r").to_ruby

    self.mae = self.mae/predictions.size
    #self.weighted_mae = self.weighted_mae/confidence_sum
    self.rmse = Math.sqrt(self.rmse/predictions.size)
    #self.weighted_rmse = Math.sqrt(self.weighted_rmse/confidence_sum)
    self.r_squared = r**2
    self.finished_at = Time.now
    save
    $logger.debug "R^2 #{r**2}"
    $logger.debug "RMSE #{rmse}"
    $logger.debug "MAE #{mae}"
  end

  # Scatter plot of -log(prediction) vs -log(measurement) with the identity
  # line, cached as SVG in GridFS. Returns the raw SVG data.
  def correlation_plot
    unless correlation_plot_id
      tmpfile = "/tmp/#{id.to_s}_correlation.svg"
      predicted_values = []
      measured_values = []
      predictions.each do |pred|
        pred[:database_activities].each do |activity|
          if pred[:value]
            predicted_values << pred[:value]
            measured_values << activity
          end
        end
      end
      # NOTE(review): `attributes` is assembled below but never used in the
      # plot commands — possibly meant for a plot annotation; verify intent.
      attributes = Model::Lazar.find(self.model_id).attributes
      attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key}
      attributes = attributes.values.collect{|v| v.is_a?(String) ? v.sub(/OpenTox::/,'') : v}.join("\n")
      R.assign "measurement", measured_values
      R.assign "prediction", predicted_values
      R.eval "all = c(-log(measurement),-log(prediction))"
      R.eval "range = c(min(all), max(all))"
      R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)"
      R.eval "image = image + geom_abline(intercept=0, slope=1)"
      R.eval "ggsave(file='#{tmpfile}', plot=image)"
      file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.svg")
      plot_id = $gridfs.insert_one(file)
      update(:correlation_plot_id => plot_id)
    end
    $gridfs.find_one(_id: correlation_plot_id).data
  end
end
|
204
|
+
|
205
|
+
end
|
data/lib/model.rb
CHANGED
@@ -2,51 +2,79 @@ module OpenTox
|
|
2
2
|
|
3
3
|
module Model
|
4
4
|
|
5
|
-
class
|
5
|
+
class Model
|
6
6
|
include OpenTox
|
7
7
|
include Mongoid::Document
|
8
8
|
include Mongoid::Timestamps
|
9
9
|
store_in collection: "models"
|
10
10
|
|
11
|
-
field :
|
11
|
+
field :name, type: String
|
12
12
|
field :creator, type: String, default: __FILE__
|
13
13
|
# datasets
|
14
14
|
field :training_dataset_id, type: BSON::ObjectId
|
15
15
|
# algorithms
|
16
16
|
field :prediction_algorithm, type: String
|
17
|
-
field :neighbor_algorithm, type: String
|
18
|
-
field :neighbor_algorithm_parameters, type: Hash
|
19
17
|
# prediction feature
|
20
18
|
field :prediction_feature_id, type: BSON::ObjectId
|
21
19
|
|
22
|
-
|
23
|
-
|
20
|
+
def training_dataset
|
21
|
+
Dataset.find(training_dataset_id)
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
class Lazar < Model
|
26
|
+
|
27
|
+
# algorithms
|
28
|
+
field :neighbor_algorithm, type: String
|
29
|
+
field :neighbor_algorithm_parameters, type: Hash, default: {}
|
24
30
|
|
25
31
|
# Create a lazar model from a training_dataset and a feature_dataset
|
26
32
|
# @param [OpenTox::Dataset] training_dataset
|
27
33
|
# @return [OpenTox::Model::Lazar] Regression or classification model
|
28
|
-
def
|
34
|
+
def initialize training_dataset, params={}
|
29
35
|
|
30
|
-
|
36
|
+
super params
|
31
37
|
|
32
38
|
# TODO document convention
|
33
39
|
prediction_feature = training_dataset.features.first
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
40
|
+
# set defaults for empty parameters
|
41
|
+
self.prediction_feature_id ||= prediction_feature.id
|
42
|
+
self.training_dataset_id ||= training_dataset.id
|
43
|
+
self.name ||= "#{training_dataset.name} #{prediction_feature.name}"
|
44
|
+
self.neighbor_algorithm_parameters ||= {}
|
45
|
+
self.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id
|
46
|
+
save
|
47
|
+
self
|
48
|
+
end
|
38
49
|
|
39
|
-
|
40
|
-
|
50
|
+
def predict_compound compound
|
51
|
+
prediction_feature = Feature.find prediction_feature_id
|
52
|
+
neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters)
|
53
|
+
# remove neighbors without prediction_feature
|
54
|
+
# check for database activities (neighbors may include query compound)
|
55
|
+
database_activities = nil
|
56
|
+
prediction = {}
|
57
|
+
if neighbors.collect{|n| n["_id"]}.include? compound.id
|
58
|
+
|
59
|
+
database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["features"][prediction_feature.id.to_s].uniq
|
60
|
+
prediction[:database_activities] = database_activities
|
61
|
+
prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound."
|
62
|
+
neighbors.delete_if{|n| n["_id"] == compound.id}
|
63
|
+
end
|
64
|
+
neighbors.delete_if{|n| n['features'].empty? or n['features'][prediction_feature.id.to_s] == [nil] }
|
65
|
+
if neighbors.empty?
|
66
|
+
prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset.",:neighbors => []})
|
67
|
+
else
|
68
|
+
prediction.merge!(Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_id=> training_dataset_id,:prediction_feature_id => prediction_feature.id}))
|
69
|
+
prediction[:neighbors] = neighbors
|
70
|
+
prediction[:neighbors] ||= []
|
71
|
+
end
|
72
|
+
prediction
|
41
73
|
end
|
42
74
|
|
43
75
|
def predict object
|
44
76
|
|
45
|
-
t = Time.now
|
46
|
-
at = Time.now
|
47
|
-
|
48
77
|
training_dataset = Dataset.find training_dataset_id
|
49
|
-
prediction_feature = Feature.find prediction_feature_id
|
50
78
|
|
51
79
|
# parse data
|
52
80
|
compounds = []
|
@@ -63,50 +91,33 @@ module OpenTox
|
|
63
91
|
|
64
92
|
# make predictions
|
65
93
|
predictions = []
|
66
|
-
|
67
|
-
compounds.each_with_index do |compound,c|
|
68
|
-
t = Time.new
|
69
|
-
database_activities = training_dataset.values(compound,prediction_feature)
|
70
|
-
if database_activities and !database_activities.empty?
|
71
|
-
database_activities = database_activities.first if database_activities.size == 1
|
72
|
-
predictions << {:compound => compound, :value => database_activities, :confidence => "measured", :warning => "Compound #{compound.smiles} occurs in training dataset with activity '#{database_activities}'."}
|
73
|
-
next
|
74
|
-
end
|
75
|
-
neighbors = Algorithm.run(neighbor_algorithm, compound, neighbor_algorithm_parameters)
|
76
|
-
# add activities
|
77
|
-
# TODO: improve efficiency, takes 3 times longer than previous version
|
78
|
-
neighbors.collect! do |n|
|
79
|
-
rows = training_dataset.compound_ids.each_index.select{|i| training_dataset.compound_ids[i] == n.first}
|
80
|
-
acts = rows.collect{|row| training_dataset.data_entries[row][0]}.compact
|
81
|
-
acts.empty? ? nil : n << acts
|
82
|
-
end
|
83
|
-
neighbors.compact! # remove neighbors without training activities
|
84
|
-
predictions << Algorithm.run(prediction_algorithm, neighbors)
|
85
|
-
end
|
94
|
+
predictions = compounds.collect{|c| predict_compound c}
|
86
95
|
|
87
96
|
# serialize result
|
88
97
|
case object.class.to_s
|
89
98
|
when "OpenTox::Compound"
|
90
99
|
prediction = predictions.first
|
91
|
-
prediction[:neighbors]
|
100
|
+
prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} # sort according to similarity
|
92
101
|
return prediction
|
93
102
|
when "Array"
|
94
103
|
return predictions
|
95
104
|
when "OpenTox::Dataset"
|
96
105
|
# prepare prediction dataset
|
106
|
+
measurement_feature = Feature.find prediction_feature_id
|
107
|
+
|
108
|
+
prediction_feature = OpenTox::NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" )
|
97
109
|
prediction_dataset = LazarPrediction.new(
|
98
|
-
:
|
110
|
+
:name => "Lazar prediction for #{prediction_feature.name}",
|
99
111
|
:creator => __FILE__,
|
100
112
|
:prediction_feature_id => prediction_feature.id
|
101
113
|
|
102
114
|
)
|
103
|
-
confidence_feature = OpenTox::NumericFeature.find_or_create_by( "
|
104
|
-
|
105
|
-
|
106
|
-
prediction_dataset.features = [ prediction_feature, confidence_feature, warning_feature ]
|
115
|
+
confidence_feature = OpenTox::NumericFeature.find_or_create_by( "name" => "Model RMSE" )
|
116
|
+
warning_feature = OpenTox::NominalFeature.find_or_create_by("name" => "Warnings")
|
117
|
+
prediction_dataset.features = [ prediction_feature, confidence_feature, measurement_feature, warning_feature ]
|
107
118
|
prediction_dataset.compounds = compounds
|
108
|
-
prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:
|
109
|
-
prediction_dataset.
|
119
|
+
prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:rmse] , p[:dataset_activities].to_s, p[:warning]]}
|
120
|
+
prediction_dataset.save
|
110
121
|
return prediction_dataset
|
111
122
|
end
|
112
123
|
|
@@ -120,26 +131,19 @@ module OpenTox
|
|
120
131
|
end
|
121
132
|
|
122
133
|
class LazarClassification < Lazar
|
123
|
-
|
124
|
-
|
125
|
-
self.
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
model = self.find model.id # adjust class
|
137
|
-
model.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fminer_similarity"
|
138
|
-
model.neighbor_algorithm_parameters = {
|
139
|
-
:feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.smarts_match",
|
140
|
-
:feature_dataset_id => Algorithm::Fminer.bbrc(training_dataset).id,
|
141
|
-
:min_sim => 0.3
|
142
|
-
}
|
134
|
+
|
135
|
+
def self.create training_dataset, params={}
|
136
|
+
model = self.new training_dataset, params
|
137
|
+
model.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" unless model.prediction_algorithm
|
138
|
+
model.neighbor_algorithm ||= "fingerprint_neighbors"
|
139
|
+
model.neighbor_algorithm_parameters ||= {}
|
140
|
+
{
|
141
|
+
:type => "MP2D",
|
142
|
+
:training_dataset_id => training_dataset.id,
|
143
|
+
:min_sim => 0.1
|
144
|
+
}.each do |key,value|
|
145
|
+
model.neighbor_algorithm_parameters[key] ||= value
|
146
|
+
end
|
143
147
|
model.save
|
144
148
|
model
|
145
149
|
end
|
@@ -147,20 +151,27 @@ module OpenTox
|
|
147
151
|
|
148
152
|
class LazarRegression < Lazar
|
149
153
|
|
150
|
-
def
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
154
|
+
def self.create training_dataset, params={}
|
155
|
+
model = self.new training_dataset, params
|
156
|
+
model.neighbor_algorithm ||= "fingerprint_neighbors"
|
157
|
+
model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.local_fingerprint_regression"
|
158
|
+
model.neighbor_algorithm_parameters ||= {}
|
159
|
+
{
|
160
|
+
:type => "MP2D",
|
161
|
+
:training_dataset_id => training_dataset.id,
|
162
|
+
:min_sim => 0.1
|
163
|
+
}.each do |key,value|
|
164
|
+
model.neighbor_algorithm_parameters[key] ||= value
|
165
|
+
end
|
166
|
+
model.save
|
167
|
+
model
|
155
168
|
end
|
156
|
-
|
157
169
|
end
|
158
170
|
|
159
|
-
class
|
171
|
+
class Prediction
|
160
172
|
include OpenTox
|
161
173
|
include Mongoid::Document
|
162
174
|
include Mongoid::Timestamps
|
163
|
-
store_in collection: "models"
|
164
175
|
|
165
176
|
# TODO field Validations
|
166
177
|
field :endpoint, type: String
|
@@ -168,10 +179,54 @@ module OpenTox
|
|
168
179
|
field :source, type: String
|
169
180
|
field :unit, type: String
|
170
181
|
field :model_id, type: BSON::ObjectId
|
171
|
-
field :
|
182
|
+
field :repeated_crossvalidation_id, type: BSON::ObjectId
|
183
|
+
|
184
|
+
def predict object
|
185
|
+
Lazar.find(model_id).predict object
|
186
|
+
end
|
187
|
+
|
188
|
+
def training_dataset
|
189
|
+
model.training_dataset
|
190
|
+
end
|
191
|
+
|
192
|
+
def model
|
193
|
+
Lazar.find model_id
|
194
|
+
end
|
195
|
+
|
196
|
+
def repeated_crossvalidation
|
197
|
+
RepeatedCrossValidation.find repeated_crossvalidation_id
|
198
|
+
end
|
199
|
+
|
200
|
+
def crossvalidations
|
201
|
+
repeated_crossvalidation.crossvalidations
|
202
|
+
end
|
203
|
+
|
204
|
+
def regression?
|
205
|
+
training_dataset.features.first.numeric?
|
206
|
+
end
|
207
|
+
|
208
|
+
def classification?
|
209
|
+
training_dataset.features.first.nominal?
|
210
|
+
end
|
211
|
+
|
212
|
+
def self.from_csv_file file
|
213
|
+
metadata_file = file.sub(/csv$/,"json")
|
214
|
+
bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file
|
215
|
+
prediction_model = self.new JSON.parse(File.read(metadata_file))
|
216
|
+
training_dataset = Dataset.from_csv_file file
|
217
|
+
model = nil
|
218
|
+
if training_dataset.features.first.nominal?
|
219
|
+
model = LazarClassification.create training_dataset
|
220
|
+
elsif training_dataset.features.first.numeric?
|
221
|
+
model = LazarRegression.create training_dataset
|
222
|
+
end
|
223
|
+
prediction_model[:model_id] = model.id
|
224
|
+
prediction_model[:repeated_crossvalidation_id] = RepeatedCrossValidation.create(model).id
|
225
|
+
prediction_model.save
|
226
|
+
prediction_model
|
227
|
+
end
|
172
228
|
end
|
173
229
|
|
174
230
|
end
|
175
231
|
|
176
232
|
end
|
177
|
-
|
data/lib/opentox.rb
CHANGED
@@ -12,8 +12,8 @@ module OpenTox
|
|
12
12
|
include Mongoid::Document
|
13
13
|
include Mongoid::Timestamps
|
14
14
|
store_in collection: klass.downcase.pluralize
|
15
|
-
field :
|
16
|
-
|
15
|
+
field :name, type: String
|
16
|
+
field :warnings, type: Array, default: []
|
17
17
|
end
|
18
18
|
OpenTox.const_set klass,c
|
19
19
|
end
|
data/lib/overwrite.rb
CHANGED
@@ -9,6 +9,11 @@ class Object
|
|
9
9
|
def numeric?
|
10
10
|
true if Float(self) rescue false
|
11
11
|
end
|
12
|
+
|
13
|
+
# Returns dimension of nested arrays
|
14
|
+
def dimension
|
15
|
+
self.class == Array ? 1 + self[0].dimension : 0
|
16
|
+
end
|
12
17
|
end
|
13
18
|
|
14
19
|
class Numeric
|
@@ -17,6 +22,14 @@ class Numeric
|
|
17
22
|
end
|
18
23
|
end
|
19
24
|
|
25
|
+
class Float
  # Round self to +n+ significant digits (not decimal places).
  # Implemented via printf's %g conversion, see
  # http://stackoverflow.com/questions/8382619/how-to-round-a-float-to-a-specified-number-of-significant-digits-in-ruby
  def signif(n)
    Float(format("%.#{n}g", self))
  end
end
|
32
|
+
|
20
33
|
module Enumerable
|
21
34
|
# @return [Array] only the duplicates of an enumerable
|
22
35
|
def duplicates
|
@@ -81,6 +94,26 @@ class Array
|
|
81
94
|
return self.uniq.size == 1
|
82
95
|
end
|
83
96
|
|
97
|
+
class Array

  # Median of the (numeric) elements; for even-sized arrays the two middle
  # values are averaged. Always returns a Float-producing expression.
  def median
    ordered = sort
    n = ordered.length
    (ordered[(n - 1) / 2] + ordered[n / 2]) / 2.0
  end

  # Arithmetic mean of the elements as a Float.
  def mean
    reduce { |total, value| total + value }.to_f / length
  end

  # Unbiased sample variance (sum of squared deviations divided by n - 1).
  def sample_variance
    average = mean
    squared_deviations = reduce(0) { |total, value| total + (value - average)**2 }
    squared_deviations / (length - 1).to_f
  end

  # Sample standard deviation (square root of the sample variance).
  def standard_deviation
    Math.sqrt(sample_variance)
  end

end
|
116
|
+
|
84
117
|
end
|
85
118
|
|
86
119
|
module URI
|
@@ -116,4 +149,8 @@ module URI
|
|
116
149
|
false
|
117
150
|
end
|
118
151
|
|
152
|
+
module URI

  # Truthy when +uri+ contains "task" and passes URI.valid?; returns nil
  # when "task" does not occur in +uri+.
  def self.task? uri
    (uri =~ /task/) && URI.valid?(uri)
  end

end
|
155
|
+
|
119
156
|
end
|